/// <summary>
/// Downloads the resource at <paramref name="url"/> and decodes the raw bytes into a string.
/// </summary>
/// <param name="url">Address of the resource to download.</param>
/// <param name="encoding">Encoding used to decode the downloaded bytes.</param>
/// <returns>The downloaded bytes decoded with <paramref name="encoding"/>.</returns>
public static string DownloadData(string url, Encoding encoding)
{
    // WebClient is IDisposable; release it as soon as the download has finished.
    using (WebClient web = new WebClient())
    {
        return encoding.GetString(web.DownloadData(url));
    }
}

/// <summary>
/// Click handler that downloads the source of a hard-coded URL into a local variable.
/// </summary>
private void button1_Click(object sender, EventArgs e)
{
    // DownloadData requires an encoding; the original call omitted it (and the semicolon).
    // NOTE(review): GB2312 is assumed for this page - confirm against the response charset.
    string htmlstr = DownloadData("http://china.alibaba.com/member/join/common_join.htm", Encoding.GetEncoding("gb2312"));
} // 获取不到这个页面的源码。。用HttpWebRequest也不行 请教大家这种情况要怎么弄?把地址换成http://china.alibaba.com/就可以
{
WebClient web = new WebClient();
return encoding.GetString(web.DownloadData(url));
} private void button1_Click(object sender, EventArgs e)
{
string htmlstr = DownloadData("http://china.alibaba.com/member/join/common_join.htm")
}获取不到这个页面的源码,用 HttpWebRequest 也不行。请教大家,这种情况要怎么处理?把地址换成 http://china.alibaba.com/ 就可以正常获取。
/// <summary>
/// Fetches the resource at <paramref name="url"/> and returns its body decoded as GB2312 text.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>The response body, or an empty string if any step of the request fails.</returns>
protected string GetPageHtml(string url)
{
    try
    {
        WebRequest request = WebRequest.Create(url);

        // Dispose the response, its stream and the reader so the underlying
        // connection is released even when reading throws.
        using (WebResponse response = request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.GetEncoding("gb2312")))
        {
            return reader.ReadToEnd();
        }
    }
    catch
    {
        // Deliberate best-effort: callers receive "" for any failure
        // (bad URL, network error, unsupported encoding) instead of an exception.
        return "";
    }
}
按上述方法就可以在程序中获取某个 URL 的页面源文件。
但是有些网站屏蔽了爬虫,这时就需要模拟浏览器发起请求来获取页面,具体代码如下:
/// <summary>
/// Fetches the page at <paramref name="url"/> over HTTP, sending browser-like
/// Accept and User-Agent headers, and returns the body decoded with <see cref="Encoding.Default"/>.
/// </summary>
/// <param name="url">Address of the page to fetch.</param>
/// <returns>The response body, or an empty string if any step of the request fails.</returns>
protected string GetPageHtml(string url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // Present the request as coming from a desktop browser (MSIE 6 header values).
        request.Accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)";

        // Dispose the response, its stream and the reader so the underlying
        // connection is released even when reading throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.Default))
        {
            // NOTE(review): Encoding.Default depends on the host platform; confirm it
            // matches the charset of the pages being fetched.
            return reader.ReadToEnd();
        }
    }
    catch
    {
        // Deliberate best-effort: callers receive "" for any failure
        // (bad URL, non-HTTP scheme, network error) instead of an exception.
        return "";
    }
}