上次用这个方法来抓到页面的数据,但不知道怎么去分隔出超链接地址。熟悉的朋友指点下。谢谢。
// Source thread: http://topic.csdn.net/u/20110726/17/814d9eff-e6a2-45a2-8de7-d3d132f311cb.html
// Fetches a page over HTTP and removes all <a>...</a> elements from the HTML.
HttpWebRequest httpWebRequest = null;
string str = string.Empty;
string data = string.Empty;
try
{
    // BUG fix: the URL must be a quoted string literal; the original
    // `HttpWebRequest.Create(http://www.qiutanwang.com)` does not compile.
    // Any URL can be substituted here.
    httpWebRequest = (HttpWebRequest)WebRequest.Create("http://www.qiutanwang.com");
    httpWebRequest.ContentType = "application/x-www-form-urlencoded";
    httpWebRequest.Accept = "*/*";
    httpWebRequest.UserAgent = " Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; QQDownload 538; .NET CLR 1.1.4322; CIBA; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)";
    httpWebRequest.Method = "GET";
    //httpWebRequest.CookieContainer = cc; // supply a fresh cookie container if needed
    httpWebRequest.Timeout = 3000; // timeout in milliseconds (3000 ms = 3 seconds); raise if the site is slow

    // Dispose the response and reader deterministically instead of a bare Close().
    using (HttpWebResponse hwr = (HttpWebResponse)httpWebRequest.GetResponse())
    using (StreamReader sr = new StreamReader(hwr.GetResponseStream(), Encoding.GetEncoding("gb2312"))) // if the text comes out garbled, try "utf-8" (original comment had the typo "urf-8")
    {
        str = sr.ReadToEnd();
    }

    // Strip every <a>...</a> element. Singleline makes '.' match newlines,
    // replacing the slower `(.|\n)*?` alternation form.
    str = Regex.Replace(str, @"<a[^>]*?>.*?</a>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    // Add further filtering regexes here as needed.
    data = str;
}
catch
{
    // NOTE(review): deliberately best-effort in the original, but an empty
    // catch hides DNS/timeout/encoding failures — consider logging the
    // exception before shipping this.
}
finally
{
    if (httpWebRequest != null)
        httpWebRequest.Abort();
} // at this point `data` holds the fetched (link-stripped) page content
//如果想很好的抓取网页,然后分析,最好安装抓包工具,其中火狐浏览器也有自带这种工具,操作起来很方便。有时也要把所有<li></li>的内容提出来。
// Source thread: http://topic.csdn.net/u/20110726/17/814d9eff-e6a2-45a2-8de7-d3d132f311cb.html
// (Duplicate paste of the snippet above.) Fetches a page over HTTP and
// removes all <a>...</a> elements from the HTML.
HttpWebRequest httpWebRequest = null;
string str = string.Empty;
string data = string.Empty;
try
{
    // BUG fix: the URL must be a quoted string literal; the original
    // `HttpWebRequest.Create(http://www.qiutanwang.com)` does not compile.
    // Any URL can be substituted here.
    httpWebRequest = (HttpWebRequest)WebRequest.Create("http://www.qiutanwang.com");
    httpWebRequest.ContentType = "application/x-www-form-urlencoded";
    httpWebRequest.Accept = "*/*";
    httpWebRequest.UserAgent = " Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; QQDownload 538; .NET CLR 1.1.4322; CIBA; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)";
    httpWebRequest.Method = "GET";
    //httpWebRequest.CookieContainer = cc; // supply a fresh cookie container if needed
    httpWebRequest.Timeout = 3000; // timeout in milliseconds (3000 ms = 3 seconds); raise if the site is slow

    // Dispose the response and reader deterministically instead of a bare Close().
    using (HttpWebResponse hwr = (HttpWebResponse)httpWebRequest.GetResponse())
    using (StreamReader sr = new StreamReader(hwr.GetResponseStream(), Encoding.GetEncoding("gb2312"))) // if the text comes out garbled, try "utf-8" (original comment had the typo "urf-8")
    {
        str = sr.ReadToEnd();
    }

    // Strip every <a>...</a> element. Singleline makes '.' match newlines,
    // replacing the slower `(.|\n)*?` alternation form.
    str = Regex.Replace(str, @"<a[^>]*?>.*?</a>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);
    // Add further filtering regexes here as needed.
    data = str;
}
catch
{
    // NOTE(review): deliberately best-effort in the original, but an empty
    // catch hides DNS/timeout/encoding failures — consider logging the
    // exception before shipping this.
}
finally
{
    if (httpWebRequest != null)
        httpWebRequest.Abort();
} // at this point `data` holds the fetched (link-stripped) page content
//如果想很好的抓取网页,然后分析,最好安装抓包工具,其中火狐浏览器也有自带这种工具,操作起来很方便。有时也要把所有<li></li>的内容提出来。
// Fetches the page and extracts the FIRST <a>...</a> element into `data`.
// NOTE(review): `httpWebRequest`, `str` and `data` are declared by the
// preceding fragment — this snippet is a continuation, not standalone code.
httpWebRequest.ContentType = "application/x-www-form-urlencoded";
httpWebRequest.Accept =
    "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
//httpWebRequest.Referer = Domain + "/user_bwyun268";
httpWebRequest.UserAgent =
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; MyIE2; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
httpWebRequest.Method = "GET";
//httpWebRequest.CookieContainer = cc; // supply a fresh cookie container if needed
httpWebRequest.Timeout = 10000; // 10-second timeout

// Dispose the response and reader deterministically instead of a bare Close().
using (HttpWebResponse hwr = (HttpWebResponse)httpWebRequest.GetResponse())
using (StreamReader sr = new StreamReader(hwr.GetResponseStream(), Encoding.GetEncoding("UTF-8")))
{
    str = sr.ReadToEnd();
}

// Match every <a>...</a> element; `(.|\n)*?` lets the body span newlines.
string mz = @"<a[^>]*?>(.|\n)*?</a>";
Regex rg = new Regex(mz, RegexOptions.IgnoreCase);
MatchCollection mc = rg.Matches(str);

// BUG fix: the original tested `mc.Count >= 0`, which is always true and
// makes `mc[0]` throw ArgumentOutOfRangeException when the page has no links.
if (mc.Count > 0)
{
    data += mc[0].Value; // `data` now holds the first hyperlink's full markup
}