提取页面指定内容连接- 数据采集,读取该页面,用正则去提取源代码中的文章链接。 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 呵呵...能否给个示例代码呀 看美女面子.写个教学贴好了哈..以前刚好写过..不过最近别人改了页面..拿来教学到也不错例子,提取http://www.webcargo.com.cn/QueryPages/QuerySeaTariff.asp中的表格原理是.匹配所有的行 就是<tr>找到需要提取得行提取之用到的正则//提取页面上所有的 tr 标记<tr></tr> tRContent为提取内容protected string strPartenTr = @"(?<trBeginTag><[tT][rR].*?>)(?<tRContent>.*?)</[tT][rR]>";//提取页面上所有和 td 标记 protected string strPartenTd = @"(?<tdBeginTag><[tT][Dd].*?>)(?<tdContent>.*?)</[tT][Dd]>";//提取 font标记protected string strPartenFont = @"(?<FontBeginTag><[fF][oO][Nn][Tt].*?>)(?<FontContent>.*?)</[fF][oO][Nn][Tt]>";//提取 连取标记的内容protected string strPartenA = @"(?<FontBeginTag><[aA].*?>)(?<aContent>.*?)</[Aa]>";相关代码public class BuildChildElement { XmlElement rootElement; int pageNo; XmlDocument xmldoc ; //xmlchildNode XmlElement xmlchildNode; //子结点 XmlElement xmlchildelem ; string url; System.Net.WebClient myWebClient = new System.Net.WebClient(); Form1 form1; public BuildChildElement(ref XmlElement par,int no,ref XmlDocument doc,Form1 f) { this.rootElement = par; this.pageNo = no; xmldoc = doc; url = f.Url; form1 = f; myWebClient.Headers.Add("Content-Type","application/x-www-form-urlencoded"); special = f.Special; } protected string strPartenTr = @"(?<trBeginTag><[tT][rR].*?>)(?<tRContent>.*?)</[tT][rR]>"; protected string strPartenTd = @"(?<tdBeginTag><[tT][Dd].*?>)(?<tdContent>.*?)</[tT][Dd]>"; protected string strPartenFont = @"(?<FontBeginTag><[fF][oO][Nn][Tt].*?>)(?<FontContent>.*?)</[fF][oO][Nn][Tt]>"; protected string strPartenA = @"(?<FontBeginTag><[aA].*?>)(?<aContent>.*?)</[Aa]>"; protected string strPartenDiv = @"(?<DivBeginTag><[Dd][Ii][Vv].*?>)(?<divContent>.*?)</[Dd][Ii][Vv]>"; //特征码 protected string special ;//="<tr class=\"majorListTR\""; protected virtual string ClearFontTag(string s) { //去除font标签 s = System.Text.RegularExpressions.Regex.Replace(s,strPartenFont,new MatchEvaluator(this.CapText)); //去除 a 标签 s = System.Text.RegularExpressions.Regex.Replace(s, 
strPartenA, new MatchEvaluator(this.CapA)); //去除div标签 return System.Text.RegularExpressions.Regex.Replace(s, strPartenDiv, new MatchEvaluator(this.CapDiv)); } string CapText(System.Text.RegularExpressions.Match m) { return m.Groups["FontContent"].Value; } string CapA(System.Text.RegularExpressions.Match m) { return m.Groups["aContent"].Value; } string CapDiv(System.Text.RegularExpressions.Match m) { return m.Groups["divContent"].Value; } public void GetData(object obj) { string postData = string.Format("curpage={0}",pageNo.ToString()); byte[] byteArray = Encoding.ASCII.GetBytes(postData); // 上传数据,并获取返回的二进制数据. byte[] responseArray = myWebClient.UploadData(url,"POST",byteArray); System.Text.StringBuilder txtSys = new System.Text.StringBuilder(Encoding.Default.GetString(responseArray).Replace("\r","")); //替换换行 txtSys = txtSys.Replace("\n",string.Empty); //替换制表符 txtSys = txtSys.Replace("\t",string.Empty); AppendChild(txtSys.ToString()); lock(xmldoc) //多进程下防止访问冲突 { xmldoc.AppendChild ( rootElement ) ; xmldoc.Save(form1.FilePath); } form1.increaseProcess(); } void AppendChild(string analyseString) { //生成XML文档 //对于每一个匹配的行进行操作 foreach(System.Text.RegularExpressions.Match matchTr in System.Text.RegularExpressions.Regex.Matches(analyseString,strPartenTr)) { //是不是我们要匹配的行 if(matchTr.Groups["trBeginTag"].Value.StartsWith(special)) { //如果是添加XML节点 xmlchildelem = xmldoc.CreateElement("",form1.TotalElementName ,""); int i = 0; foreach(System.Text.RegularExpressions.Match matchTd in System.Text.RegularExpressions.Regex.Matches(matchTr.Groups["tRContent"].Value,strPartenTd)) { if(!form1.IsUseElement) { XmlAttribute n= xmldoc.CreateAttribute( form1.GetElementNameByIndex(i));//使用属性方式 //去掉网页标签 System.Text.StringBuilder Result =new System.Text.StringBuilder( this.ClearFontTag(matchTd.Groups["tdContent"].Value)); Result = Result.Replace(" ",string.Empty); n .Value = Result.ToString(); xmlchildelem.Attributes.Append(n); } else//string strElementName = GetElementNameByIndex(i); //使用元素方式 { 
xmlchildNode = xmldoc.CreateElement("",form1.GetElementNameByIndex(i),""); System.Text.StringBuilder Result =new System.Text.StringBuilder( this.ClearFontTag(matchTd.Groups["tdContent"].Value)); Result = Result.Replace(" ",string.Empty); xmlchildNode.InnerText = Result.ToString(); xmlchildelem.AppendChild(xmlchildNode); } i++; } rootElement.AppendChild ( xmlchildelem ) ; } } } //生成属性的标题 protected virtual string GetElementNameByIndex(int index) { switch(index) { case 0: return "start";//启始 case 1: return "target";//目的 case 2: return "gp20";//货币名称(E) case 3: return "gp40";//货币名称(E) case 4: return "hq40";//金额 case 5: return "box";//金额 case 6: return "leavedate";//金额 case 7: return "liabilityman";//金额 case 8: return "efecteddate"; case 9: return "corporation"; //发布公司 case 10: return "re"; default: return "abc"; }//start,target,gp20,gp40,hq40,box,leavedate,liabilityman,efecteddate,corporation,re } }整个源代码可以发邮件给你.留下邮箱既可里面用了多线程,当时对多线程不熟....结果...<?xml version="1.0"?><ROOT> <SeaTariff start="QINGDAO, CHINA" target="DAMMAM, SAUDI ARABIA" gp20="" gp40="2140" hq40="2140" box=" " leavedate="5" liabilityman="O" efecteddate="2008-06-17" corporation="青岛SUNMOON国际货运代理有限公司" re="青岛--墨尔本/布里斯班冻代干.长年特价收货...青岛-欧地干柜冻柜/东南亚/中东/非洲/加拿大/澳洲....[email protected]" /> <SeaTariff start="QINGDAO, CHINA" target="KUWAIT, KUWAIT" gp20="" gp40="2140" hq40="2140" box=" " leavedate="1" liabilityman="A" efecteddate="2008-06-17" corporation="青岛SUNMOON国际货运代理有限公司" re="青岛--墨尔本/布里斯班冻代干.长年特价收货...青岛-欧地干柜冻柜/东南亚/中东/非洲/加拿大/澳洲....[email protected]" /> <SeaTariff start="QINGDAO, CHINA" target="JEBEL ALI FREEZONE, UNITED ARAB EMIRATES" gp20="1100" gp40="1730" hq40="1730" box=" " leavedate="1" liabilityman="A" efecteddate="2008-06-17" corporation="青岛SUNMOON国际货运代理有限公司" re="青岛--墨尔本/布里斯班冻代干.长年特价收货...青岛-欧地干柜冻柜/东南亚/中东/非洲/加拿大/澳洲....[email protected]" /> <SeaTariff start="QINGDAO, CHINA" target="NHAVA SHEVA, INDIA" gp20="920" gp40="1530" hq40="1530" box=" " leavedate="1" liabilityman="A" efecteddate="2008-06-17" 
corporation="青岛SUNMOON国际货运代理有限公司" re="青岛--墨尔本/布里斯班冻代干.长年特价收货...青岛-欧地干柜冻柜/东南亚/中东/非洲/加拿大/澳洲....[email protected]" />...................</ROOT>......... stringbuilder去掉最后一个逗号 我在网上找了个flash的广告位,但是在我的ASPX页面里不能显示啊~~ 如何在用VS发布asp.net程序的时候保留部分服务器上的目录 哪种模式更合适silverlight. aspx (C#2003)在sql中创建表的问题 db.SubmitChanges();为何不起作用? 怎样获取一个DataSet中某个字段的字段类型? 如何将gridview中的文本替换成星 500分第四贴 微软的大BUG,求救!!!(高分相送) 求救:iframe刷新的问题,高手请进 问一个winform 下的datagrid问题
哈..以前刚好写过..不过最近别人改了页面..拿来教学倒也不错
例子:提取
http://www.webcargo.com.cn/QueryPages/QuerySeaTariff.asp 中的表格。原理是:匹配所有的行(就是<tr>),
找到需要提取的行,
提取之。用到的正则://提取页面上所有的 tr 标记<tr></tr> tRContent为提取内容
// Matches every <tr ...>...</tr> pair (either letter case). The trBeginTag group captures
// the opening tag and the tRContent group captures everything between the tags.
protected string strPartenTr = @"(?<trBeginTag><[tT][rR].*?>)(?<tRContent>.*?)</[tT][rR]>";
// Matches every <td ...>...</td> pair; tdBeginTag is the opening tag, tdContent the cell body.
protected string strPartenTd = @"(?<tdBeginTag><[tT][Dd].*?>)(?<tdContent>.*?)</[tT][Dd]>";
// Matches <font ...>...</font>; FontContent captures the text wrapped by the tag.
protected string strPartenFont = @"(?<FontBeginTag><[fF][oO][Nn][Tt].*?>)(?<FontContent>.*?)</[fF][oO][Nn][Tt]>";
// Matches link tags <a ...>...</a>; aContent captures the link text.
// Note: the opening-tag group is named FontBeginTag here as well, although it holds the <a> tag.
protected string strPartenA = @"(?<FontBeginTag><[aA].*?>)(?<aContent>.*?)</[Aa]>";相关代码
/// <summary>
/// Fetches one result page by POSTing its page number, picks out the table rows whose
/// opening &lt;tr&gt; tag starts with a configured marker string, and appends one XML
/// element per row to a root element, then saves the document to disk.
/// </summary>
/// <remarks>
/// The root element and the document are passed in by the caller and are expected to be
/// shared by several instances running on different threads (the original code already
/// guarded the save with <c>lock(xmldoc)</c>). Every mutation of the shared DOM therefore
/// happens inside that lock; only the download and string clean-up run unguarded.
/// </remarks>
public class BuildChildElement
{
    // Element that receives one child per extracted table row.
    XmlElement rootElement;

    // Page number sent to the server as the "curpage" form field.
    int pageNo;

    // Owning document; also used as the lock object for all DOM access.
    XmlDocument xmldoc;

    // Address the form is posted to (taken from the form's Url).
    string url;

    System.Net.WebClient myWebClient = new System.Net.WebClient();

    // Supplies configuration (Url, Special, FilePath, element names) and receives progress callbacks.
    Form1 form1;

    /// <summary>
    /// Stores the shared XML targets and reads the request settings from the form.
    /// </summary>
    /// <param name="par">Root element that extracted rows are appended to.</param>
    /// <param name="no">Page number to request.</param>
    /// <param name="doc">Document that owns <paramref name="par"/> and is saved after each page.</param>
    /// <param name="f">Form providing the URL, row marker, output path and naming rules.</param>
    public BuildChildElement(ref XmlElement par, int no, ref XmlDocument doc, Form1 f)
    {
        this.rootElement = par;
        this.pageNo = no;
        xmldoc = doc;
        url = f.Url;
        form1 = f;
        // The page number is sent as an HTML form post.
        myWebClient.Headers.Add("Content-Type", "application/x-www-form-urlencoded");
        special = f.Special;
    }

    // Matches <tr ...>...</tr>; trBeginTag is the opening tag, tRContent the row body.
    protected string strPartenTr = @"(?<trBeginTag><[tT][rR].*?>)(?<tRContent>.*?)</[tT][rR]>";

    // Matches <td ...>...</td>; tdContent is the cell body.
    protected string strPartenTd = @"(?<tdBeginTag><[tT][Dd].*?>)(?<tdContent>.*?)</[tT][Dd]>";

    // Matches <font ...>...</font>; FontContent is the wrapped text.
    protected string strPartenFont = @"(?<FontBeginTag><[fF][oO][Nn][Tt].*?>)(?<FontContent>.*?)</[fF][oO][Nn][Tt]>";

    // Matches <a ...>...</a>; aContent is the link text. The opening-tag group keeps its
    // historical name FontBeginTag so that any subclass reading it continues to work.
    protected string strPartenA = @"(?<FontBeginTag><[aA].*?>)(?<aContent>.*?)</[Aa]>";

    // Matches <div ...>...</div>; divContent is the wrapped content.
    protected string strPartenDiv = @"(?<DivBeginTag><[Dd][Ii][Vv].*?>)(?<divContent>.*?)</[Dd][Ii][Vv]>";

    // Marker that the opening <tr> tag of a wanted row must start with,
    // e.g. "<tr class=\"majorListTR\"". Assigned from Form1.Special in the constructor.
    protected string special;

    /// <summary>
    /// Unwraps font, a and div tags (in that order), replacing each matched tag pair with
    /// the content captured between its opening and closing tag.
    /// </summary>
    /// <param name="s">HTML fragment, typically the body of one table cell.</param>
    /// <returns>The fragment after the three replacement passes.</returns>
    protected virtual string ClearFontTag(string s)
    {
        s = Regex.Replace(s, strPartenFont, new MatchEvaluator(this.CapText));
        s = Regex.Replace(s, strPartenA, new MatchEvaluator(this.CapA));
        return Regex.Replace(s, strPartenDiv, new MatchEvaluator(this.CapDiv));
    }

    // Replacement callback for font tags: keeps only the wrapped text.
    string CapText(Match m)
    {
        return m.Groups["FontContent"].Value;
    }

    // Replacement callback for link tags: keeps only the link text.
    string CapA(Match m)
    {
        return m.Groups["aContent"].Value;
    }

    // Replacement callback for div tags: keeps only the wrapped content.
    string CapDiv(Match m)
    {
        return m.Groups["divContent"].Value;
    }

    /// <summary>
    /// Downloads the page, extracts the matching rows into the shared XML tree, saves the
    /// document to the form's file path and reports progress to the form.
    /// </summary>
    /// <param name="obj">Not used by this method.</param>
    public void GetData(object obj)
    {
        string postData = string.Format("curpage={0}", pageNo.ToString());
        byte[] byteArray = Encoding.ASCII.GetBytes(postData);

        // Post the form data and read the raw response bytes.
        byte[] responseArray = myWebClient.UploadData(url, "POST", byteArray);

        // Collapse the page onto a single line: the tag patterns use '.', which does not
        // match a line break, so rows spanning several lines would otherwise be missed.
        string pageText = Encoding.Default.GetString(responseArray)
            .Replace("\r", "")
            .Replace("\n", string.Empty)
            .Replace("\t", string.Empty);

        // XmlDocument instance members are not thread-safe. Node creation and insertion
        // used to run before the lock was taken; they are now performed under the same
        // lock as the save so concurrent workers cannot corrupt the shared tree.
        lock (xmldoc)
        {
            AppendChild(pageText);
            xmldoc.AppendChild(rootElement);
            xmldoc.Save(form1.FilePath);
        }

        form1.increaseProcess();
    }

    /// <summary>
    /// Adds one child element to the root for every table row whose opening tag starts
    /// with <see cref="special"/>. Each cell becomes either an attribute or a child
    /// element of that row element, depending on Form1.IsUseElement.
    /// </summary>
    /// <param name="analyseString">Page HTML with line breaks and tabs already removed.</param>
    /// <remarks>Mutates the shared DOM; call only while holding the lock on the document.</remarks>
    void AppendChild(string analyseString)
    {
        foreach (Match matchTr in Regex.Matches(analyseString, strPartenTr))
        {
            // Skip rows that do not carry the marker (headers, layout rows, ...).
            if (!matchTr.Groups["trBeginTag"].Value.StartsWith(special))
            {
                continue;
            }

            XmlElement rowElement = xmldoc.CreateElement("", form1.TotalElementName, "");
            int i = 0;
            foreach (Match matchTd in Regex.Matches(matchTr.Groups["tRContent"].Value, strPartenTd))
            {
                string cellHtml = matchTd.Groups["tdContent"].Value;
                if (!form1.IsUseElement)
                {
                    // Attribute style: <row name="value" ... />
                    XmlAttribute cellAttribute = xmldoc.CreateAttribute(form1.GetElementNameByIndex(i));
                    cellAttribute.Value = CleanCellText(cellHtml);
                    rowElement.Attributes.Append(cellAttribute);
                }
                else
                {
                    // Element style: <row><name>value</name> ... </row>
                    XmlElement cellElement = xmldoc.CreateElement("", form1.GetElementNameByIndex(i), "");
                    cellElement.InnerText = CleanCellText(cellHtml);
                    rowElement.AppendChild(cellElement);
                }
                i++;
            }
            rootElement.AppendChild(rowElement);
        }
    }

    // Strips the wrapper tags from one cell and removes the filler string from the result.
    // NOTE(review): the literal below appears as a single space, yet the sample output in
    // the post still contains spaces inside values - the original source may have removed
    // "&nbsp;" (or a non-breaking space) instead. Confirm against the real source file.
    string CleanCellText(string cellHtml)
    {
        return this.ClearFontTag(cellHtml).Replace(" ", string.Empty);
    }

    /// <summary>
    /// Maps a zero-based column index to the XML name used for that column.
    /// </summary>
    /// <param name="index">Zero-based column index within a table row.</param>
    /// <returns>
    /// One of start, target, gp20, gp40, hq40, box, leavedate, liabilityman, efecteddate,
    /// corporation, re for indexes 0 to 10; "abc" for any other index.
    /// </returns>
    /// <remarks>
    /// Nothing in this class calls this method: row extraction asks the form for the
    /// names instead. It remains available as an overridable default mapping.
    /// </remarks>
    protected virtual string GetElementNameByIndex(int index)
    {
        switch (index)
        {
            case 0:
                return "start"; // origin
            case 1:
                return "target"; // destination
            case 2:
                return "gp20";
            case 3:
                return "gp40";
            case 4:
                return "hq40";
            case 5:
                return "box";
            case 6:
                return "leavedate";
            case 7:
                return "liabilityman";
            case 8:
                return "efecteddate";
            case 9:
                return "corporation"; // publishing company
            case 10:
                return "re";
            default:
                return "abc";
        }
    }
}
整个源代码可以发邮件给你.留下邮箱既可
里面用了多线程,当时对多线程不熟....结果...<?xml version="1.0"?>
<ROOT>
<SeaTariff start="QINGDAO, CHINA" target="DAMMAM, SAUDI ARABIA" gp20="" gp40="2140" hq40="2140" box=" " leavedate="5" liabilityman="O" efecteddate="2008-06-17" corporation="青岛SUNMOON国际货运代理有限公司" re="青岛--墨尔本/布里斯班冻代干.长年特价收货...青岛-欧地干柜冻柜/东南亚/中东/非洲/加拿大/澳洲....[email protected]" />
<SeaTariff start="QINGDAO, CHINA" target="KUWAIT, KUWAIT" gp20="" gp40="2140" hq40="2140" box=" " leavedate="1" liabilityman="A" efecteddate="2008-06-17" corporation="青岛SUNMOON国际货运代理有限公司" re="青岛--墨尔本/布里斯班冻代干.长年特价收货...青岛-欧地干柜冻柜/东南亚/中东/非洲/加拿大/澳洲....[email protected]" />
<SeaTariff start="QINGDAO, CHINA" target="JEBEL ALI FREEZONE, UNITED ARAB EMIRATES" gp20="1100" gp40="1730" hq40="1730" box=" " leavedate="1" liabilityman="A" efecteddate="2008-06-17" corporation="青岛SUNMOON国际货运代理有限公司" re="青岛--墨尔本/布里斯班冻代干.长年特价收货...青岛-欧地干柜冻柜/东南亚/中东/非洲/加拿大/澳洲....[email protected]" />
<SeaTariff start="QINGDAO, CHINA" target="NHAVA SHEVA, INDIA" gp20="920" gp40="1530" hq40="1530" box=" " leavedate="1" liabilityman="A" efecteddate="2008-06-17" corporation="青岛SUNMOON国际货运代理有限公司" re="青岛--墨尔本/布里斯班冻代干.长年特价收货...青岛-欧地干柜冻柜/东南亚/中东/非洲/加拿大/澳洲....[email protected]" />
...................</ROOT>
.........