请教C#如何提取网页正文。急!急!急! C#如何提取网页正文?例如说某网页的新闻标题和新闻正文,标题我会了,关键是正文呐!!各位高手来帮帮忙!~ 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 先提取文章内容html代码,再把html元素过滤掉, 先用正则表达式,定义提取内容的模式,再提取正文,(<body></body>)部分,然后再用正则表达式过滤掉HTML标签就行了. /(body.*?.*?<\/body>)/i这样写正则式可以吗? /(body.*?>.*?<\/body>)/i这样..... 下载源码:public static string GetHttpSourceValue(string a_strUrl){string strResult;HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(new System.Uri(a_strUrl));myReq.Method = "GET";myReq.Accept = "*/*";myReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)";try{HttpWebResponse HttpWResp = (HttpWebResponse)myReq.GetResponse();string tmp = myReq.Headers.ToString();Stream myStream = HttpWResp.GetResponseStream();StreamReader sr = new StreamReader(myStream, Encoding.Default);StringBuilder strBuilder = new StringBuilder();while (-1 != sr.Peek()){strBuilder.Append(sr.ReadLine() + "\r\n");}strResult = strBuilder.ToString();//StreamWriter sw = new StreamWriter("E:\\1.txt", false, Encoding.Default);//sw.Write(strResult);myStream.Close();sr.Close();// sw.Close();}catch (Exception exp){strResult = "错误:" + exp.Message;}return strResult;//StreamWriter sw = new StreamWriter(SaveFileName(), false, Encoding.Default);//sw.Write(body);//sw.Close();} 正则:(?<Style_Block>(?<begin>\<(?<tag>style)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)(?<body>[\s\S]*?)(?<end>\</\k<tag>\>))|(?<Script_Block>(?<begin>\<(?<tag>script)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)(?<body>[\s\S]*?)(?<end>\</\k<tag>\>))|(?<Xml_Directive>\<!(?<name>[\w-:]+)(?:\s+(?<argument>[\w-:]+|\"[\s\S]*?\"|\'[\s\S]*?\'))*\s*\>)|(?<Xml_Comment>\<!--[\s\S]*?--\>)|(?<Beginning_Tag>\<(?<tag>[\w-:]+)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)|(?<Ending_Tag>\</(?<tag>[\w-:]+)\>)|(?<Xml_CDATA>\<!\[CDATA\[(?<data>[\s\S]*?)\]\]\>)|(?<Xml_Literal>(?:(?<blank>[ ]+)|[^ 
\<\>])+) match.Groups["body"] 提取.. 求助:关于时间限制的license实现原理 请高手过来看看这个代码怎么错了 信息显示不出来 刚转向C#,提个问题! 求救简单的正则表达式 现在我需要持久的实时的同时向100个网页请求获取数据,每个网页都是每隔3秒重新请求一次, 基础: 引用类型还是值类型? C#怎么引用DELPHI写的Webservice? 请不吝赐教如何使用web treeview??? 数据库查询时间问题 如何把输入到textBox1里的数值传到变量里? 冰天雪地,裸体跪求,调试Windows服务!
/(body.*?.*?<\/body>)/i 这样写正则式可以吗?
/(body.*?>.*?<\/body>)/i 这样写.....
/// <summary>
/// Fetches the resource at <paramref name="a_strUrl"/> with an HTTP GET request
/// and returns the response body as text.
/// </summary>
/// <param name="a_strUrl">Absolute URL of the page to download.</param>
/// <returns>
/// The response body, read line by line with every line terminated by "\r\n".
/// If sending the request or reading the response throws, the return value is
/// the string "错误:" followed by the exception message instead of the page text.
/// </returns>
/// <remarks>
/// Exceptions raised while building the request (for example an invalid URL passed
/// to <see cref="System.Uri"/>) happen before the try block and are not caught.
/// The body is decoded with <see cref="Encoding.Default"/>, not with the charset
/// announced by the server, so pages in another encoding may come back garbled.
/// </remarks>
public static string GetHttpSourceValue(string a_strUrl)
{
    HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(new System.Uri(a_strUrl));
    myReq.Method = "GET";
    myReq.Accept = "*/*";
    myReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)";

    try
    {
        // The using blocks release the response, stream and reader on every path,
        // including when an exception is thrown half-way through the read.
        using (HttpWebResponse response = (HttpWebResponse)myReq.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, Encoding.Default))
        {
            StringBuilder pageText = new StringBuilder();

            // Loop on ReadLine() returning null rather than on Peek(): Peek() can
            // report -1 on a network stream while more data is still in flight,
            // which would silently truncate the page.
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                pageText.Append(line).Append("\r\n");
            }

            return pageText.ToString();
        }
    }
    catch (Exception exp)
    {
        // Callers receive the failure as text; this keeps the original contract.
        return "错误:" + exp.Message;
    }
}
正则:
(?<Style_Block>(?<begin>\<(?<tag>style)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)(?<body>[\s\S]*?)(?<end>\</\k<tag>\>))|(?<Script_Block>(?<begin>\<(?<tag>script)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)(?<body>[\s\S]*?)(?<end>\</\k<tag>\>))|(?<Xml_Directive>\<!(?<name>[\w-:]+)(?:\s+(?<argument>[\w-:]+|\"[\s\S]*?\"|\'[\s\S]*?\'))*\s*\>)|(?<Xml_Comment>\<!--[\s\S]*?--\>)|(?<Beginning_Tag>\<(?<tag>[\w-:]+)(?:\s+(?<attribute>[\w-:]+)(?:=(?<value>[^\s\>\<]*|\"[\s\S]*?\"|\'[\s\S]*?\'))?)*\s*(?:/)?\>)|(?<Ending_Tag>\</(?<tag>[\w-:]+)\>)|(?<Xml_CDATA>\<!\[CDATA\[(?<data>[\s\S]*?)\]\]\>)|(?<Xml_Literal>(?:(?<blank>[ ]+)|[^ \<\>])+)
匹配之后,用 match.Groups["body"] 提取。