[0-9a-zA-Z]+://[-_\.a-zA-Z0-9]+[^ \t<>]*
这是我的一段正则表达式
是专门匹配网页中的URL地址的
我发现有点问题,抓取网页中的URL时,那些地址后面都带了个单引号(”)
下面是我从网页中抓取的URL地址:
http://www.sina.com.cn/contactus.html"
http://www.lenovo.com.cn/Public/public_bottom/contact.shtml"
http://www.tom.com/about/about_contact_1.htm"
怎么才能把后面的单引号去掉呢?深入一层:
就是我要把在Google中搜索到的记录,只抓取标题中的10条记录
就是每页只显示的10条记录的那10个URL地址
下面还有很多分页,都要抓取出来,我现在怎么把那个页面多余的URL全部过滤掉呢?
还有怎么把其他分页的URL也抓取出来,但是只抓取每个页面的那主要的10条URL地址
其他的全部过滤掉
各位大虾给个详细的讲解,最好有代码,正则表达式等等

解决方案 »

  1.   

    http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?
      

  2.   

    或者 "(http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?)" ,match.Groups[1].Value(第一个捕获组,也就是不含两侧双引号的URL)就是你要的。
      

  3.   

    http://images.google.cn/images?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wi"
    http://video.google.cn/videosearch?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wv"
    http://ditu.google.cn/maps?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wl"
    http://news.google.cn/news?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wn"
    http://www.google.cn/music/search?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wU"
    http://www.google.cn/finance?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=we"
    http://www.google.cn/intl/zh-CN/options/"
    http://blogsearch.google.cn/blogsearch?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wb"
    http://translate.google.cn/translate_t?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wT"
    http://shenghuo.google.cn/shenghuo/?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=w8"
    http://www.google.cn/rebang/search?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=w9"
    http://www.265.com/?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wA"
    http://www.google.com/calendar/render?hl=zh-CN&tab=wc"
    http://picasaweb.google.com/lh/view?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wq"
    http://docs.google.com/?hl=zh-CN&tab=wo"
    http://sites.google.com/?hl=zh-CN&tab=w3"
    http://tools.google.com/pinyin/?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wI"
    http://toolbar.google.com/?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wO"
    http://pack.google.cn/?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wP"
    http://www.google.cn/intl/zh-CN/options/"
    https://www.google.com/accounts/Login?hl=zh-CN&continue=http://www.google.cn/search%3Fq%3D%2522%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC%2522%2540%26hl%3Dzh-CN%26newwindow%3D1%26start%3D0%26sa%3DN"
    http://www.google.cn/webhp?hl=zh-CN"
    http://images.google.cn/images?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;hl=zh-CN&amp;newwindow=1&amp;um=1&amp;ie=UTF-8&amp;ei=Sg5kSt3jBoGdkAWQr_HtDw&amp;sa=X&amp;oi=image_result_group&amp;ct=title&amp;resnum=1"
    http://ship1.hist.edu.cn/Lists/List12/Attachments/3/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC.jpg&amp;imgrefurl=http://ship1.hist.edu.cn/Lists/List12/DispForm.aspx%3FID%3D3%26Source%3Dhttp%253A%252F%252Fship1.hist.edu.cn%252FLists%252FList12%252FAllItems.aspx&amp;h=303&amp;w=569&amp;sz=70&amp;tbnid=gOexNWyfFm1KIM:&amp;tbnh=71&amp;tbnw=134&amp;prev=/images%3Fq%3D%2522%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC%2522%2540&amp;hl=zh-CN&amp;usg=__aQppp5U-VVj-O50BIiuzcgHfKFw=&amp;ei=Sg5kSt3jBoGdkAWQr_HtDw&amp;sa=X&amp;oi=image_result&amp;resnum=1&amp;ct=image"
    http://ship1.hist.edu.cn/Lists/List12/DispForm.aspx?ID=3&amp;Source=http%3A%2F%2Fship1.hist.edu.cn%2FLists%2FList12%2FAllItems.aspx"
    http://ship1.hist.edu.cn/Lists/List12/DispForm.aspx?ID=3&amp;Source=http%3A%2F%2Fship1.hist.edu.cn%2FLists%2FList12%2FAllItems.aspx"
    http://pigimg.zhongso.com/space/gallery/%E7%8E%89%E7%9F%B3%E5%BA%8A%E5%9E%AB/455918-%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC.jpg&amp;imgrefurl=http://www.zhongsou.net/%25E7%258E%2589%25E7%259F%25B3%25E5%25BA%258A%25E5%259E%25AB/detail/article/215893&amp;h=542&amp;w=595&amp;sz=102&amp;tbnid=3IzmkzYzNlc7eM:&amp;tbnh=123&amp;tbnw=135&amp;prev=/images%3Fq%3D%2522%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC%2522%2540&amp;hl=zh-CN&amp;usg=__NS0C70k2hDEwnrGBZitfnwzGWhA=&amp;ei=Sg5kSt3jBoGdkAWQr_HtDw&amp;sa=X&amp;oi=image_result&amp;resnum=2&amp;ct=image"
    http://www.zhongsou.net/%E7%8E%89%E7%9F%B3%E5%BA%8A%E5%9E%AB/detail/article/215893"
    http://www.zhongsou.net/%E7%8E%89%E7%9F%B3%E5%BA%8A%E5%9E%AB/detail/article/215893"
    http://www.gxmuseum.com/image/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC.jpg&amp;imgrefurl=http://www.gxmuseum.com/%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC.html&amp;h=520&amp;w=650&amp;sz=39&amp;tbnid=ZEi7PQEByGEXFM:&amp;tbnh=110&amp;tbnw=137&amp;prev=/images%3Fq%3D%2522%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC%2522%2540&amp;hl=zh-CN&amp;usg=__fGou4S46NLnWYIMg0kwM073Cl9I=&amp;ei=Sg5kSt3jBoGdkAWQr_HtDw&amp;sa=X&amp;oi=image_result&amp;resnum=3&amp;ct=image"
    http://www.gxmuseum.com/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC.html"
    http://www.gxmuseum.com/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC.html"
    http://www.cspn.cn/CSPN/upload/fckeditor/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC-%E5%B9%B3%E5%8F%B0.jpg&amp;imgrefurl=http://www.cspn.cn/templates/CN_About/index.aspx%3Fnodeid%3D39&amp;h=325&amp;w=442&amp;sz=19&amp;tbnid=Qx3MiSiRuoscjM:&amp;tbnh=93&amp;tbnw=127&amp;prev=/images%3Fq%3D%2522%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC%2522%2540&amp;hl=zh-CN&amp;usg=__XzVvpPY0AcklqF_UYv_Z8P-VWE0=&amp;ei=Sg5kSt3jBoGdkAWQr_HtDw&amp;sa=X&amp;oi=image_result&amp;resnum=4&amp;ct=image"
    http://www.cspn.cn/templates/CN_About/index.aspx?nodeid=39"
    http://www.cspn.cn/templates/CN_About/index.aspx?nodeid=39"
    http://www.sina.com.cn/contactus.html"
    http://203.208.37.132/search?q=cache:TWZYbdLoytsJ:www.sina.com.cn/contactus.html+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=5&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2-vSUyK3Sr4fDv9x8r4F6ptGm0NpQ"
    http://www.lenovo.com.cn/Public/public_bottom/contact.shtml"
    http://203.208.37.132/search?q=cache:silaDFZkQTcJ:www.lenovo.com.cn/Public/public_bottom/contact.shtml+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=6&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy28Wmsva_T202ABtZl_P7CcZb9oCUg"
    http://www.hurray.com.cn/chinese/contact/contact.htm"
    http://203.208.37.132/search?q=cache:lAsyFIEvOXAJ:www.hurray.com.cn/chinese/contact/contact.htm+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=7&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy29vZq_suGUY2dGdIuGjAjeJmjnrrg"
    http://www.chsi.com.cn/about/contact.shtml"
    http://203.208.37.132/search?q=cache:DkxmYYYLkOgJ:www.chsi.com.cn/about/contact.shtml+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=8&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2_fFb_Kdl7VAbNLLaEtbx5LsJQbEw"
    http://www.nokia.com.cn/about-nokia/contacts"
    http://203.208.37.132/search?q=cache:ysYlek8cmagJ:www.nokia.com.cn/about-nokia/contacts+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=9&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2_UyAQS1EnbGKe-rtQARYX_P3Wdzg"
    http://www.peopledaily.com.cn/GB/other7018/7026/index.html"
    http://203.208.37.132/search?q=cache:WeVjf9Yb1isJ:www.peopledaily.com.cn/GB/other7018/7026/index.html+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=10&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2_CI6gTK8Vbr7Ow4psZ4WaGEWy_yw"
    http://my.taobao.com/mytaobao/misc/contact.jhtml"
    http://203.208.37.132/search?q=cache:LET-FY8mHXMJ:my.taobao.com/mytaobao/misc/contact.jhtml+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=11&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2_XL_sr5KVpvj6Af50YRbqApaEvAQ"
    http://www.huawei.com/cn/about/officeList.do"
    http://203.208.37.132/search?q=cache:AgbEzZn053kJ:www.huawei.com/cn/about/officeList.do+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=12&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2868eMeUsk4bOPf0EIObHw4zAOt8w"
    http://www.tom.com/about/about_contact_1.htm"
    http://203.208.37.132/search?q=cache:hPgcVks7yIYJ:www.tom.com/about/about_contact_1.htm+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=13&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2_hTp0ugRiDqonvRf4sAJ_vTmfjew"
    http://www.gree.com.cn/index01.html"
    http://203.208.37.132/search?q=cache:yDDnn8zwzm8J:www.gree.com.cn/index01.html+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=14&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy293Bg5hjQWrxcvtiVU5KckByvIqVw"
    这些是我抓到的原始数据
      

  4.   


    //首先你的结果中不是单引号,而是双引号,这两个总不至于分不清吧
    "[0-9a-zA-Z]+://[-_\.a-zA-Z0-9]+[^ \t<>]*" 
    //  因为你最后的 [^ \t<>] 并没有排除单引号和双引号,所以匹配结果里会把引号也带上;不想要的话,就把引号也加进排除字符集里
    // LZ是不是在源码中取阿,用这个
    @"(?<=<a\s+.*?href\s*=\s*[""']?)[^'""\s]+"
      

  5.   

    能说清楚点么
    group[1]里是什么东东阿
      

  6.   

    http://images.google.cn/images?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wi" 
    http://video.google.cn/videosearch?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wv" 
    http://ditu.google.cn/maps?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wl" 
    http://news.google.cn/news?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wn" 
    http://www.google.cn/music/search?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wU" 
    http://www.google.cn/finance?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=we" 
    http://www.google.cn/intl/zh-CN/options/" 
    http://blogsearch.google.cn/blogsearch?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wb" 
    http://translate.google.cn/translate_t?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wT" 
    http://shenghuo.google.cn/shenghuo/?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=w8" 
    http://www.google.cn/rebang/search?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=w9" 
    http://www.265.com/?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wA" 
    http://www.google.com/calendar/render?hl=zh-CN&tab=wc" 
    http://picasaweb.google.com/lh/view?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wq" 
    http://docs.google.com/?hl=zh-CN&tab=wo" 
    http://sites.google.com/?hl=zh-CN&tab=w3" 
    http://tools.google.com/pinyin/?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wI" 
    http://toolbar.google.com/?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wO" 
    http://pack.google.cn/?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&hl=zh-CN&newwindow=1&um=1&ie=UTF-8&sa=N&tab=wP" 
    http://www.google.cn/intl/zh-CN/options/" 
    https://www.google.com/accounts/Login?hl=zh-CN&continue=http://www.google.cn/search%3Fq%3D%2522%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC%2522%2540%26hl%3Dzh-CN%26newwindow%3D1%26start%3D0%26sa%3DN" 
    http://www.google.cn/webhp?hl=zh-CN" 
    http://images.google.cn/images?q=%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;hl=zh-CN&amp;newwindow=1&amp;um=1&amp;ie=UTF-8&amp;ei=Sg5kSt3jBoGdkAWQr_HtDw&amp;sa=X&amp;oi=image_result_group&amp;ct=title&amp;resnum=1" 
    http://ship1.hist.edu.cn/Lists/List12/Attachments/3/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC.jpg&amp;imgrefurl=http://ship1.hist.edu.cn/Lists/List12/DispForm.aspx%3FID%3D3%26Source%3Dhttp%253A%252F%252Fship1.hist.edu.cn%252FLists%252FList12%252FAllItems.aspx&amp;h=303&amp;w=569&amp;sz=70&amp;tbnid=gOexNWyfFm1KIM:&amp;tbnh=71&amp;tbnw=134&amp;prev=/images%3Fq%3D%2522%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC%2522%2540&amp;hl=zh-CN&amp;usg=__aQppp5U-VVj-O50BIiuzcgHfKFw=&amp;ei=Sg5kSt3jBoGdkAWQr_HtDw&amp;sa=X&amp;oi=image_result&amp;resnum=1&amp;ct=image" 
    http://ship1.hist.edu.cn/Lists/List12/DispForm.aspx?ID=3&amp;Source=http%3A%2F%2Fship1.hist.edu.cn%2FLists%2FList12%2FAllItems.aspx" 
    http://ship1.hist.edu.cn/Lists/List12/DispForm.aspx?ID=3&amp;Source=http%3A%2F%2Fship1.hist.edu.cn%2FLists%2FList12%2FAllItems.aspx" 
    http://pigimg.zhongso.com/space/gallery/%E7%8E%89%E7%9F%B3%E5%BA%8A%E5%9E%AB/455918-%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC.jpg&amp;imgrefurl=http://www.zhongsou.net/%25E7%258E%2589%25E7%259F%25B3%25E5%25BA%258A%25E5%259E%25AB/detail/article/215893&amp;h=542&amp;w=595&amp;sz=102&amp;tbnid=3IzmkzYzNlc7eM:&amp;tbnh=123&amp;tbnw=135&amp;prev=/images%3Fq%3D%2522%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC%2522%2540&amp;hl=zh-CN&amp;usg=__NS0C70k2hDEwnrGBZitfnwzGWhA=&amp;ei=Sg5kSt3jBoGdkAWQr_HtDw&amp;sa=X&amp;oi=image_result&amp;resnum=2&amp;ct=image" 
    http://www.zhongsou.net/%E7%8E%89%E7%9F%B3%E5%BA%8A%E5%9E%AB/detail/article/215893" 
    http://www.zhongsou.net/%E7%8E%89%E7%9F%B3%E5%BA%8A%E5%9E%AB/detail/article/215893" 
    http://www.gxmuseum.com/image/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC.jpg&amp;imgrefurl=http://www.gxmuseum.com/%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC.html&amp;h=520&amp;w=650&amp;sz=39&amp;tbnid=ZEi7PQEByGEXFM:&amp;tbnh=110&amp;tbnw=137&amp;prev=/images%3Fq%3D%2522%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC%2522%2540&amp;hl=zh-CN&amp;usg=__fGou4S46NLnWYIMg0kwM073Cl9I=&amp;ei=Sg5kSt3jBoGdkAWQr_HtDw&amp;sa=X&amp;oi=image_result&amp;resnum=3&amp;ct=image" 
    http://www.gxmuseum.com/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC.html" 
    http://www.gxmuseum.com/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC.html" 
    http://www.cspn.cn/CSPN/upload/fckeditor/%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC-%E5%B9%B3%E5%8F%B0.jpg&amp;imgrefurl=http://www.cspn.cn/templates/CN_About/index.aspx%3Fnodeid%3D39&amp;h=325&amp;w=442&amp;sz=19&amp;tbnid=Qx3MiSiRuoscjM:&amp;tbnh=93&amp;tbnw=127&amp;prev=/images%3Fq%3D%2522%25E8%2581%2594%25E7%25B3%25BB%25E6%2588%2591%25E4%25BB%25AC%2522%2540&amp;hl=zh-CN&amp;usg=__XzVvpPY0AcklqF_UYv_Z8P-VWE0=&amp;ei=Sg5kSt3jBoGdkAWQr_HtDw&amp;sa=X&amp;oi=image_result&amp;resnum=4&amp;ct=image" 
    http://www.cspn.cn/templates/CN_About/index.aspx?nodeid=39" 
    http://www.cspn.cn/templates/CN_About/index.aspx?nodeid=39" 
    http://www.sina.com.cn/contactus.html" 
    http://203.208.37.132/search?q=cache:TWZYbdLoytsJ:www.sina.com.cn/contactus.html+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=5&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2-vSUyK3Sr4fDv9x8r4F6ptGm0NpQ" 
    http://www.lenovo.com.cn/Public/public_bottom/contact.shtml" 
    http://203.208.37.132/search?q=cache:silaDFZkQTcJ:www.lenovo.com.cn/Public/public_bottom/contact.shtml+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=6&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy28Wmsva_T202ABtZl_P7CcZb9oCUg" 
    http://www.hurray.com.cn/chinese/contact/contact.htm" 
    http://203.208.37.132/search?q=cache:lAsyFIEvOXAJ:www.hurray.com.cn/chinese/contact/contact.htm+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=7&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy29vZq_suGUY2dGdIuGjAjeJmjnrrg" 
    http://www.chsi.com.cn/about/contact.shtml" 
    http://203.208.37.132/search?q=cache:DkxmYYYLkOgJ:www.chsi.com.cn/about/contact.shtml+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=8&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2_fFb_Kdl7VAbNLLaEtbx5LsJQbEw" 
    http://www.nokia.com.cn/about-nokia/contacts" 
    http://203.208.37.132/search?q=cache:ysYlek8cmagJ:www.nokia.com.cn/about-nokia/contacts+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=9&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2_UyAQS1EnbGKe-rtQARYX_P3Wdzg" 
    http://www.peopledaily.com.cn/GB/other7018/7026/index.html" 
    http://203.208.37.132/search?q=cache:WeVjf9Yb1isJ:www.peopledaily.com.cn/GB/other7018/7026/index.html+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=10&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2_CI6gTK8Vbr7Ow4psZ4WaGEWy_yw" 
    http://my.taobao.com/mytaobao/misc/contact.jhtml" 
    http://203.208.37.132/search?q=cache:LET-FY8mHXMJ:my.taobao.com/mytaobao/misc/contact.jhtml+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=11&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2_XL_sr5KVpvj6Af50YRbqApaEvAQ" 
    http://www.huawei.com/cn/about/officeList.do" 
    http://203.208.37.132/search?q=cache:AgbEzZn053kJ:www.huawei.com/cn/about/officeList.do+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=12&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2868eMeUsk4bOPf0EIObHw4zAOt8w" 
    http://www.tom.com/about/about_contact_1.htm" 
    http://203.208.37.132/search?q=cache:hPgcVks7yIYJ:www.tom.com/about/about_contact_1.htm+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=13&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy2_hTp0ugRiDqonvRf4sAJ_vTmfjew" 
    http://www.gree.com.cn/index01.html" 
    http://203.208.37.132/search?q=cache:yDDnn8zwzm8J:www.gree.com.cn/index01.html+%22%E8%81%94%E7%B3%BB%E6%88%91%E4%BB%AC%22%40&amp;cd=14&amp;hl=zh-CN&amp;ct=clnk&amp;gl=cn&amp;ie=UTF-8&amp;st_usg=ALhdy293Bg5hjQWrxcvtiVU5KckByvIqVw" 
    这些是我抓到的原始数据
      

  7.   

    把你那个正则改为
    [0-9a-zA-Z]+://[-_\.a-zA-Z0-9]+[^ \t <>"]*
    就行了
      

  8.   


    我是先获取页面的源码,然后取源码中的URL
    我刚试啦一下,好像不行饿
      

  9.   

    改成这样
    [0-9a-zA-Z]+://[-_\.a-zA-Z0-9]+[^ \t <>"]*
      

  10.   

    这样是ok啦
    我现在要过滤其他的一些URL链接应该要怎么做呢?
    或者更好的正则表达式怎么写呢
    就是在Google上搜索记录,比如我要搜“联系我们”,然后只抓取那10条主标题的URL
    其次还要根据分页把其他页面的URL也要抓取出来,每页都取那主要的10条主标题URL
      

  11.   

    Regex UrlRegex = new Regex(@"([0-9a-zA-Z]+://[-_\.a-zA-Z0-9]+[^ \t <>"]*)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    有错啊
      

  12.   

    Regex UrlRegex = new Regex(@"([0-9a-zA-Z]+://[-_\.a-zA-Z0-9]+[^ \t <>""]*)", RegexOptions.IgnoreCase | RegexOptions.Compiled);
    在 @ 逐字字符串里,双引号 " 要转义,写成两个("")就行了
      

  13.   

    你那个可以的
    我现在要过滤其他的一些URL链接应该要怎么做呢? 
    或者更好的正则表达式怎么写呢 
    就是在Google上搜索记录,比如我要搜“联系我们”,然后只抓取那10条主标题的URL 
    其次还要根据分页把其他页面的URL也要抓取出来,每页都取那主要的10条主标题URL
      

  14.   

    你那个可以的
    我现在要过滤其他的一些URL链接应该要怎么做呢? 
    或者更好的正则表达式怎么写呢 
    就是在Google上搜索记录,比如我要搜“联系我们”,然后只抓取那10条主标题的URL 
    其次还要根据分页把其他页面的URL也要抓取出来,每页都取那主要的10条主标题URL
      

  15.   

    我现在要过滤其他的一些URL链接应该要怎么做呢? 
    或者更好的正则表达式怎么写呢 
    就是在Google上搜索记录,比如我要搜“联系我们”,然后只抓取那10条主标题的URL 
    其次还要根据分页把其他页面的URL也要抓取出来,每页都取那主要的10条主标题URL
      

  16.   

    <a\shref\s*="(?<URL>[^"]*).*?>(?<title>[^<]*)</a>
    用这个正则表达式
    string content="网页内容";
     Regex r = new Regex(@"<a\shref\s*=""(?<URL>[^""]*).*?>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase|RegexOptions.Singleline);
     MatchCollection ms = r.Matches(content);
      foreach (Match m in ms)
                { 
        if(m.Success)
                    {
                      string title=m.Groups["title"].Value;
                      string url = m.Groups["URL"].Value;
                    if(title=="联系我们")
                    {
                     //把url加入列表
                   }
    }
    }
      

  17.   

    搜索到的记录有的是
    华友世纪-联系我们
    TOM在线-联系方式
    像这种的话那就抓不了啊
    还有很多关于该记录的是分页显示的,那怎么把其他分页里面的主标题URL也得到呢
      

  18.   

    搜索到的记录有的是
    华友世纪-联系我们
    TOM在线-联系方式
    像这种的话那就抓不了啊
    还有很多关于该记录的是分页显示的,那怎么把其他分页里面的主标题URL也得到呢
      

  19.   


    @"(?<=<a\s+.*?href\s*=\s*[""']?)[^'""\s]+(?![^>]+>\s*(网页快照|类似结果))"
    // 这个 
      

  20.   


    @"(?<=<h3[^>]*>\s*<a\s+[^<>]*href\s*=\s*[""']?)[^'""\s]+"
    // 这个也行
      

  21.   

    我现在要过滤其他的一些URL链接应该要怎么做呢? 
    或者更好的正则表达式怎么写呢 
    就是在Google上搜索记录,比如我要搜“联系我们”,然后只抓取那10条主标题的URL 
    其次还要根据分页把其他页面的URL也要抓取出来,每页都取那主要的10条主标题URL
      

  22.   


    不明白? 什么抓不了,什么分页?  什么什么最好的正则?--其次还要根据分页把其他页面的URL也要抓取出来,每页都取那主要的10条主标题URL 这不是正则干的事吧
      

  23.   

    这是我的代码:
     Regex UrlRegex = new Regex(@"(<a\shref\s*=""(? <URL>[^""]*).*?>(? <title>[^ <]*) </a>)", RegexOptions.IgnoreCase | RegexOptions.Singleline); 
                MatchCollection matches = UrlRegex.Matches(ContentHtml);
                foreach (Match match in matches)
                {
                    if (match.Success)
                    {
                        string title = match.Groups["title"].Value;
                        string htmlurl = match.Groups["URL"].Value;
                        if (title == "联系我们")
                        {
                            this.listurl.Items.Add(htmlurl);
                            strwriterobj.WriteLine(htmlurl);
                        }
                    }
                 //this.listurl.Items.Add(match.Value.ToString());
                 //strwriterobj.WriteLine(match.Value.ToString());
                }
                MessageBox.Show("抓取完毕!!!");
                strwriterobj.Close();
                sr.Close();
                stream.Close();
                rs.Close();
    然后我运行就报错啦:System.ArgumentException: 正在分析“(<a\shref\s*="(? <URL>[^"]*).*?>(? <title>[^ <]*) </a>)”- 无法识别的分组构造。
    那个分页是不用正则的,我想问下那个怎么解决啊
      

  24.   

    Regex UrlRegex = new Regex(@"(<a\shref\s*=""(?<URL>[^""]*).*?>(?<title>[^ <]*)</a>)", 
    可能是 (? 和后面的 <URL>、<title> 之间多了空格:把 (? <URL> 和 (? <title> 里的空格去掉,写成 (?<URL> 和 (?<title> 就能识别了
      

  25.   

    正则式没问题啦
    可是抓不了东西啊
    Regex UrlRegex = new Regex(@"(<a\shref\s*=""(?<URL>[^""]*).*?>(?<title>[^ <]*)</a>)", RegexOptions.IgnoreCase | RegexOptions.Singleline); 
                MatchCollection matches = UrlRegex.Matches(ContentHtml);
                foreach (Match match in matches)
                {
                    if (match.Success)
                    {
                        string title = match.Groups["title"].Value;
                        string htmlurl = match.Groups["URL"].Value;
                        if (title == "联系我们")
                        {
                            this.listurl.Items.Add(htmlurl);
                            strwriterobj.WriteLine(htmlurl);
                        }
                    }
                 //this.listurl.Items.Add(match.Value.ToString());
                 //strwriterobj.WriteLine(match.Value.ToString());
                }
    代码没有问题啊