/// <summary>
/// Fetches an HTTP page: requests <paramref name="a_strUrl"/> and returns the response body as text.
/// </summary>
/// <param name="a_strUrl">URL to request.</param>
/// <param name="timeout">Value assigned to <c>HttpWebRequest.Timeout</c> (milliseconds).</param>
/// <returns>
/// The response body with "\r\n" appended after every line. If anything throws,
/// the exception is not propagated; the method returns "错误:" followed by the
/// exception message instead.
/// </returns>
public string Get_Http(string a_strUrl, int timeout)
{
    string strResult;
    try
    {
        HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(a_strUrl);
        myReq.Timeout = timeout;

        // The response, its stream and the reader are all disposed here; leaving
        // them open keeps the underlying connection occupied until the GC runs.
        using (HttpWebResponse httpWResp = (HttpWebResponse)myReq.GetResponse())
        using (Stream myStream = httpWResp.GetResponseStream())
        // Decoded with the system default encoding; the charset announced by
        // the response is not consulted.
        using (StreamReader sr = new StreamReader(myStream, Encoding.Default))
        {
            StringBuilder strBuilder = new StringBuilder();

            // Read until ReadLine() reports end of stream. Peek() is not a
            // reliable end-of-stream test on a network stream: it can return -1
            // while more data is still on its way, which would truncate the page.
            string line;
            while ((line = sr.ReadLine()) != null)
            {
                strBuilder.Append(line).Append("\r\n");
            }

            strResult = strBuilder.ToString();
        }
    }
    catch (Exception exp)
    {
        // Deliberate best-effort contract: failures are reported in the return value.
        strResult = "错误:" + exp.Message;
    }
    return strResult;
}

// After the page content has been fetched, the link addresses in the page are
// analysed to obtain the URLs to crawl:
// Process page titles and links.
// NOTE(review): SniffWebUrl continues on the following line and is cut off in this excerpt; its text is kept unchanged.
public string SniffWebUrl( string urlStr,string blockB,string blockE ) { string urlch1 = ""; string urlch2 = ""; int end_n1 = 0; int end_nums = 0; int end_nums1 = 0; int end_nums2 = 0; int end_nums3 = 0; string reUTStr = ""; string reTitle = ""; string ret = ""; try { int pos01 = urlStr.IndexOf( "." ); int pos02 = urlStr.LastIndexOf( "/" ); if( pos01 < 0 ) { return ""; } if( pos02 < 0 ) { return ""; } int pos03 = urlStr.IndexOf( "/",pos01 ); if ( pos03 < 0 ) { urlch1 = urlStr; urlch2 = urlStr; } else { urlch1 = urlStr.Substring( 0,pos03 ); urlch2 = urlStr.Substring( 0,pos02 ); } string tmpAllStr = new PublicFun().Get_Http( urlStr ,time1); int pos1 = tmpAllStr.IndexOf( blockB ); int pos2 = tmpAllStr.IndexOf( blockE,pos1 + blockB.Length ); if ( pos1>0 && pos2>0 && pos2>pos1 ) { ret = tmpAllStr.Substring( pos1 + blockB.Length,pos2 - pos1 - blockB.Length ); ret = ret.Substring( ret.IndexOf( "<" )); while( ret.IndexOf( "<A" ) >= 0 ) { ret = ret.Substring( 0,ret.IndexOf( "<A" ) ) + "<a" + ret.Substring( ret.IndexOf( "<A" ) + 2 ); } while( ret.IndexOf( "</A" ) >=0 ) { ret = ret.Substring( 0,ret.IndexOf( "</A" ) ) + "</a" + ret.Substring( ret.IndexOf( "</A" ) + 3 ); } while( ret.IndexOf( "Href=" ) >=0 ) { ret = ret.Substring( 0,ret.IndexOf( "Href=" )) + "href=" + ret.Substring( ret.IndexOf( "Href=" ) + 5 ); } while( ret.IndexOf( "HREF=" ) >=0 
) { ret = ret.Substring( 0,ret.IndexOf( "HREF=" )) + "href=" + ret.Substring( ret.IndexOf( "HREF=" ) + 5 ); } while( ret.IndexOf( "href='" ) >=0 ) { ret = ret.Substring( 0,ret.IndexOf( "href='" )) + "href=\"" + ret.Substring( ret.IndexOf( "href='" ) + 6 );
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货