<div class="tabContentDiv selected">
        <table width="100%" cellspacing="0" cellpadding="0" border="0">
            <tbody>
                <tr>
                    <td class="border2 borderB pad5 padRL" align="left" colspan="2">
                        <strong>Product</strong>
                    </td>
                    <td align="center" class="border2 borderB">
                        <strong>Rate</strong>
                    </td>
                    <td align="center" class="border2 borderB">
                        <strong>Last week</strong>
                    </td>
                </tr>
                <tr>
                    <td height="18" align="left" class="bgListBlue pad5 padRL">
                        <a href="http://www.bankrate.com/funnel/mortgages/?prods=1" class="oarates_box_data">
                            10</a>
                    </td>
                    <td align="center" class="bgListBlue pad5 padRL">
                        <a href="http://www.bankrate.com/funnel/graph/?&cat=2&state=zz&d=1825&t=Line&ids=1">
                            <div class="graph-oa">
                            </div>
                        </a>
                    </td>
                    <td align="center" class="bgListBlue pad5 padRL">
                        我要抓取的数据1
                    </td>
                    <td width="15" align="center" class="bgListBlue pad5 padRL">
                        <img width="13" height="10" src="http://www.bankrate.com/images_MRA/news_advice_averages/arrow-dwn-gold-13x10.gif">
                    </td>
                    <td align="center" class="bgListBlue pad5 padRL">
                        3.96%
                    </td>
                </tr>
                <tr>
                    <td height="18" align="left" class="pad5 padRL">
                        <a href="http://www.bankrate.com/funnel/mortgages/?prods=2" class="oarates_box_data">
                            15 yr fixed mtg</a>
                    </td>
                    <td align="center" class="pad5 padRL">
                        <a href="http://www.bankrate.com/funnel/graph/?&cat=2&state=zz&d=1825&t=Line&ids=10">
                            <div class="graph-oa">
                            </div>
                        </a>
                    </td>
                    <td align="center" class="pad5 padRL">
                       我要抓取的数据2
                    </td>
                    <td width="15" align="center" class="pad5 padRL">
                        <img width="13" height="10" src="http://www.bankrate.com/images_MRA/news_advice_averages/arrow-dwn-gold-13x10.gif">
                    </td>
                    <td align="center" class="pad5 padRL">
                        3.30%
                    </td>
                </tr>
                <tr>
                    <td height="18" align="left" class="bgListBlue pad5 padRL">
                        <a href="http://www.bankrate.com/funnel/mortgages/?prods=1" class="oarates_box_data">
                            aa</a>
                    </td>
                    <td align="center" class="bgListBlue pad5 padRL">
                        <a href="http://www.bankrate.com/funnel/graph/?&cat=2&state=zz&d=1825&t=Line&ids=1">
                            <div class="graph-oa">
                            </div>
                        </a>
                    </td>
                    <td align="center" class="bgListBlue pad5 padRL">
                       我要抓取的数据3
                    </td>
                    <td width="15" align="center" class="bgListBlue pad5 padRL">
                        <img width="13" height="10" src="http://www.bankrate.com/images_MRA/news_advice_averages/arrow-dwn-gold-13x10.gif">
                    </td>
                     <td align="center" class="pad5 padRL">
                        3.30%
                    </td>
                </tr>
            </tbody>
        </table>
    </div>
            WebRequest webr = WebRequest.Create(url);//一个url请求
            Stream rc = webr.GetResponse().GetResponseStream();//响应请求并返回数据
            StreamReader read = new StreamReader(rc, System.Text.Encoding.UTF8);//指定返回的数据流的编码
            StringBuilder RegHtml = new StringBuilder();
            RegHtml.Append("(?is)<div class=\"tabContentDiv selected\">.*?");
            RegHtml.Append("<table width=\"100%\" cellspacing=\"0\" cellpadding=\"0\" border=\"0\">.*?<tbody>.*?");            RegHtml.Append("<tr>.*?<td class=\"border2 borderB pad5 padRL\" align=\"left\" colspan=\"2\">.*?<strong>Product</strong>.*?</td>.*?");
            RegHtml.Append("<td align=\"center\" class=\"border2 borderB\">.*?<strong>Rate</strong>.*?</td>.*?");
            RegHtml.Append("<td align=\"center\" class=\"border2 borderB\">.*?<strong>Last week</strong>.*?</td>.*?</tr>.*?");            RegHtml.Append("<tr>.*?<td height=\"18\" align=\"left\" class=\"bgListBlue pad5 padRL\">.*?<a href=\"http://www\\.bankrate\\.com/funnel/mortgages/\\?prods=[0-9]+\" class=\"oarates_box_data\">.*</a>.*?</td>.*?");            RegHtml.Append("<td align=\"center\" class=\"bgListBlue pad5 padRL\">.*?");
            RegHtml.Append("<a href=\"http://www\\.bankrate\\.com/funnel/graph/\\?&cat=[0-9]+&state=[\\w]+&d=[0-9]+&t=[\\w]+&ids=[0-9]+\">.*?");
            RegHtml.Append("<div class=\"graph-oa\">.*?</div>.*?</a>.*?</td>.*?");            RegHtml.Append("<td align=\"center\" class=\"bgListBlue pad5 padRL\">.*?");
            RegHtml.Append("(?<value1>.*).*?</td>.*?");//数据1            RegHtml.Append("<td width=\"15\" align=\"center\" class=\"bgListBlue pad5 padRL\">.*?");
            RegHtml.Append("<img width=\"13\" height=\"10\" src=\"http://www.bankrate.com/images_MRA/news_advice_averages/arrow-dwn-gold-13x10\\.gif\">.*?</td>.*?");            RegHtml.Append("<td align=\"center\" class=\"bgListBlue pad5 padRL\">.*?");
            RegHtml.Append(".*.*?</td>.*?");            RegHtml.Append("</tr>.*?");
            //1
            RegHtml.Append("<tr>.*?");
            RegHtml.Append("<td height=\"18\" align=\"left\" class=\"pad5 padRL\">.*?");
            RegHtml.Append("<a href=\"http://www\\.bankrate\\.com/funnel/mortgages/\\?prods=[0-9]+\" class=\"oarates_box_data\">");
            RegHtml.Append(".*</a>.*?");
            RegHtml.Append("</td>.*?");
            RegHtml.Append("<td align=\"center\" class=\"pad5 padRL\">.*?");
            RegHtml.Append("<a href=\"http://www\\.bankrate\\.com/funnel/graph/\\?&cat=[0-9]+&state=[\\w]+&d=[0-9]+&t=[\\w]+&ids=[0-9]+\">.*?");
//这里的.号不用转义也可以
            RegHtml.Append("<div class=\"graph-oa\">.*?</div>.*?</a>.*?</td>.*?");
            RegHtml.Append("<td align=\"center\" class=\"pad5 padRL\">.*?(?<value2>.*).*?</td>.*?");//数据2
            RegHtml.Append("<td width=\"15\" align=\"center\" class=\"pad5 padRL\">.*?");
            RegHtml.Append("<img width=\"13\" height=\"10\" src=\"http://www.bankrate\\.com/images_MRA/news_advice_averages/arrow-dwn-gold-13x10\\.gif\">.*?</td>.*?");
           RegHtml.Append("<td align=\"center\" class=\"pad5 padRL\">.*?(.*).*?</td>.*?");
            RegHtml.Append("</tr>.*?");            RegHtml.Append("<tr>.*?");
            RegHtml.Append("<td height=\"18\" align=\"left\" class=\"bgListBlue pad5 padRL\">.*?");
            RegHtml.Append(" <a href=\"http://www\\.bankrate\\.com/funnel/mortgages/\\?prods=[0-9]+\" class=\"oarates_box_data\">");
            RegHtml.Append("(.*).*?</td>.*?");
            RegHtml.Append("<td align=\"center\" class=\"bgListBlue pad5 padRL\">.*?");
            RegHtml.Append("<a href=\"http://www\\.bankrate\\.com/funnel/graph/\\?&cat=[0-9]+&state=[\\w]+&d=[0-9]+&t=[\\w]+&ids=[0-9]+\">.*?");
           RegHtml.Append("<div class=\"graph-oa\">.*?</div>.*?</a>.*?");            RegHtml.Append("</td>.*?");            RegHtml.Append("<td align=\"center\" class=\"bgListBlue pad5 padRL\">.*?");
            RegHtml.Append("(?<value3>.*).*?</td>.*?");//数据三            RegHtml.Append("<td width=\"15\" align=\"center\" class=\"bgListBlue pad5 padRL\">.*?");
            RegHtml.Append("<img width=\"13\" height=\"10\" src=\"http://www\\.bankrate\\.com/images_MRA/news_advice_averages/arrow-dwn-gold-13x10\\.gif\">.*?</td>.*?");            RegHtml.Append("<td align=\"center\" class=\"pad5 padRL\">.*?");
            RegHtml.Append("(.*).*?</td>.*?");            RegHtml.Append("</tr>.*?");
            RegHtml.Append("</tbody>.*?</table>.*?</div>");            string content = read.ReadToEnd();//输出网页的源码代码
            Regex r = new Regex(RegHtml.ToString(), RegexOptions.IgnoreCase);
            MatchCollection mc = r.Matches(content);//在源代码中,用正则表达式进行搜索,返回为一个集合
            string sMsg = "";
            foreach (Match m in mc)//不能进行循环?
            {
                sMsg += m.Groups["value1"].Value;
                sMsg += m.Groups["value2"].Value;
                sMsg += m.Groups["value3"].Value;
            }
            Response.Write(sMsg);
        }
这样是可以抓取出来,但是加载的速度非常慢,有时还会出现超时的提示。第一次做这东西,麻烦哪位指点一下,谢谢!

解决方案 »

  1.   

    你知道吗?正则在处理复杂表达式时,运行效率是很低的,何况你写的还是那么长的正则(其中大量贪婪的 .* 会引起反复回溯),没出现死循环已经不错了。建议改用 HtmlAgilityPack 来处理。
      

  2.   

    用HtmlAgilityPack:
    // Load the saved page markup from test.txt under the application root.
    string s = File.ReadAllText(Server.MapPath("~/test.txt"));
    HtmlDocument xmlDoc = new HtmlDocument();
    xmlDoc.LoadHtml(s);
    // Third cell of every row after the first one inside the "tabContentDiv selected" div.
    HtmlNodeCollection nodes = xmlDoc.DocumentNode.SelectNodes(@"//div[@class='tabContentDiv selected']//tr[position()>1]/td[3]");
    // SelectNodes returns null (not an empty collection) when nothing matches,
    // so guard before enumerating.
    if (nodes != null)
    {
        foreach (HtmlNode node in nodes)
        {
            Response.Write(node.InnerText + "<br/>");
        }
    }
    输出:
    我要抓取的数据1 
    我要抓取的数据2 
    我要抓取的数据3