我这个页面是想远程抓取一个给定URL的网页的一部分内容(例如某个div里的内容),用到了正则表达式,我的抓取页面很简单。如下:<%@ Page Language="C#" AutoEventWireup="true" CodeBehind="Test.aspx.cs" Inherits="ToHtml.Test" %><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"><html xmlns="http://www.w3.org/1999/xhtml" >
<head runat="server">
<title>无标题页</title>
</head>
<body>
<form id="form1" runat="server">
<div></div>
</form>
</body>
</html>
Test.aspx.cs里的内容是:using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.Text;
using System.Net;
using System.IO;
using System.Xml;namespace ToHtml
{
public partial class Test : System.Web.UI.Page
{
protected void Page_Load(object sender, EventArgs e)
{
XmlDocument xmlDoc = new XmlDocument();
xmlDoc.Load(Server.MapPath("Setting.xml")); XmlNode node1 = xmlDoc.SelectSingleNode("/root/HttpAdress");
XmlNode node2 = xmlDoc.SelectSingleNode("/root/RegexExpression"); string HttpAdress = node1.InnerText; string RexFat = node2.InnerText; Encoding code = Encoding.GetEncoding("UTF-8");
StreamReader sr = null;
string str = null; //读取远程路径
WebRequest temp = WebRequest.Create("http://localhost:1806/plat.htm");
WebResponse myTemp = temp.GetResponse();
sr = new StreamReader(myTemp.GetResponseStream(), code);
//读取
try
{
sr = new StreamReader(myTemp.GetResponseStream(), code);
str = sr.ReadToEnd();
}
catch (Exception ex)
{
throw ex;
}
finally
{
sr.Close();
} string StrC = str.Replace("\"", "");
Regex reg = new Regex(RexFat);
MatchCollection matches = reg.Matches(StrC);
foreach (Match m in matches)
Response.Write(m.Value);
}
}
}就是要页面一加载就执行!正则表达式在Setting.xml里,Setting.xml文件如下:<?xml version="1.0" encoding="utf-8" ?>
<root>
<HttpAdress><![CDATA[http://club.qikan.com/6/GroupIndex.aspx]]></HttpAdress>
<RegexExpression><![CDATA[<table cellspacing=0 border=0 id=_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid(?<content>.*?)</table>]]></RegexExpression>
</root>
要抓取的页面plat.htm如下:<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Untitled Page</title>
</head>
<body>
<div>
dajiahao</div>
<table cellspacing="0" border="0" id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid"
style="border-style: None; font-size: X-Small; height: 120px; border-collapse: collapse;">
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl2_NewPostsHyperLink"
href="/45195/ShowPost.aspx" target="_blank">打破完美世界</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl3_NewPostsHyperLink"
href="/45182/ShowPost.aspx" target="_blank">化妆品也会致癌?什么样的化妆品最安全?</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl4_NewPostsHyperLink"
href="/45178/ShowPost.aspx" target="_blank">化妆品节约省钱大法!</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl5_NewPostsHyperLink"
href="/45158/ShowPost.aspx" target="_blank">关于人类寿命大趋势的决定因素</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl6_NewPostsHyperLink"
href="/44904/ShowPost.aspx" target="_blank">小丽搭配 五类外套巧搭显瘦技巧</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl7_NewPostsHyperLink"
href="/44901/ShowPost.aspx" target="_blank">史上最牛逼的SUV,有钱不一定买得到</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl8_NewPostsHyperLink"
href="/44512/ShowPost.aspx" target="_blank">欧尚志*手袋站,国际名牌女士包包手袋批发集....</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl9_NewPostsHyperLink"
href="/44453/ShowPost.aspx" target="_blank">幽怨悲歌 中国历史上最能哭的五大女人 </a>
</td>
</tr>
</table>
</body>
</html>看明白了吧?我要的就是抓取plat.htm的指定id的<table>里的内容,我看过,正则表达式好像没有配置错,但为什么就抓不到内容呢?谢谢各位,能帮我解答一下吗?
<head runat="server">
<title>无标题页</title>
</head>
<body>
<form id="form1" runat="server">
<div></div>
</form>
</body>
</html>
Test.aspx.cs里的内容是:using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Text.RegularExpressions;
using System.Text;
using System.Net;
using System.IO;
using System.Xml;namespace ToHtml
{
public partial class Test : System.Web.UI.Page
{
protected void Page_Load(object sender, EventArgs e)
{
XmlDocument xmlDoc = new XmlDocument();
xmlDoc.Load(Server.MapPath("Setting.xml")); XmlNode node1 = xmlDoc.SelectSingleNode("/root/HttpAdress");
XmlNode node2 = xmlDoc.SelectSingleNode("/root/RegexExpression"); string HttpAdress = node1.InnerText; string RexFat = node2.InnerText; Encoding code = Encoding.GetEncoding("UTF-8");
StreamReader sr = null;
string str = null; //读取远程路径
WebRequest temp = WebRequest.Create("http://localhost:1806/plat.htm");
WebResponse myTemp = temp.GetResponse();
sr = new StreamReader(myTemp.GetResponseStream(), code);
//读取
try
{
sr = new StreamReader(myTemp.GetResponseStream(), code);
str = sr.ReadToEnd();
}
catch (Exception ex)
{
throw ex;
}
finally
{
sr.Close();
} string StrC = str.Replace("\"", "");
Regex reg = new Regex(RexFat);
MatchCollection matches = reg.Matches(StrC);
foreach (Match m in matches)
Response.Write(m.Value);
}
}
}就是要页面一加载就执行!正则表达式在Setting.xml里,Setting.xml文件如下:<?xml version="1.0" encoding="utf-8" ?>
<root>
<HttpAdress><![CDATA[http://club.qikan.com/6/GroupIndex.aspx]]></HttpAdress>
<RegexExpression><![CDATA[<table cellspacing=0 border=0 id=_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid(?<content>.*?)</table>]]></RegexExpression>
</root>
要抓取的页面plat.htm如下:<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Untitled Page</title>
</head>
<body>
<div>
dajiahao</div>
<table cellspacing="0" border="0" id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid"
style="border-style: None; font-size: X-Small; height: 120px; border-collapse: collapse;">
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl2_NewPostsHyperLink"
href="/45195/ShowPost.aspx" target="_blank">打破完美世界</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl3_NewPostsHyperLink"
href="/45182/ShowPost.aspx" target="_blank">化妆品也会致癌?什么样的化妆品最安全?</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl4_NewPostsHyperLink"
href="/45178/ShowPost.aspx" target="_blank">化妆品节约省钱大法!</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl5_NewPostsHyperLink"
href="/45158/ShowPost.aspx" target="_blank">关于人类寿命大趋势的决定因素</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl6_NewPostsHyperLink"
href="/44904/ShowPost.aspx" target="_blank">小丽搭配 五类外套巧搭显瘦技巧</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl7_NewPostsHyperLink"
href="/44901/ShowPost.aspx" target="_blank">史上最牛逼的SUV,有钱不一定买得到</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl8_NewPostsHyperLink"
href="/44512/ShowPost.aspx" target="_blank">欧尚志*手袋站,国际名牌女士包包手袋批发集....</a>
</td>
</tr>
<tr>
<td>
<font color="#00329a" size="2">·</font> <a id="_ctl0_MainContent_ExpandGroupIndex__ctl0_ExpandNewPosts__ctl0_NewPostDataGrid__ctl9_NewPostsHyperLink"
href="/44453/ShowPost.aspx" target="_blank">幽怨悲歌 中国历史上最能哭的五大女人 </a>
</td>
</tr>
</table>
</body>
</html>看明白了吧?我要的就是抓取plat.htm的指定id的<table>里的内容,我看过,正则表达式好像没有配置错,但为什么就抓不到内容呢?谢谢各位,能帮我解答一下吗?
string strPattern = @"你的正则表达式";
MatchCollection Matches = Regex.Matches(plat.htm的内容, strPattern, RegexOptions.IgnoreCase | RegexOptions.Compiled);
foreach (Match NextMatch in Matches)
{
str = NextMatch.Groups["src"].Value.ToString().Trim();
}这是我原来用的很好用
改为
Regex reg = new Regex(RexFat, RegexOptions.Singleline);要加上RegexOptions.Singleline,改变.的含义,因为你的源文本里面有换行\n,\n一样要作为匹配结果。