/// <summary>
/// Entry point: fetches the SharePoint search results page and keeps the
/// returned markup in a local so it can be inspected in a debugger.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    string url = "http://sharepoint2010.microsoft.com/search/Results.aspx?k=sharepoint";
    string responseText = GetHtmlCode(url, Encoding.Default);
}

/// <summary>
/// Requests <paramref name="url"/> with an Internet Explorer 8 user-agent string
/// and returns the entire response body decoded with <paramref name="encoding"/>.
/// </summary>
/// <param name="url">Address to request; the request is cast to <c>HttpWebRequest</c>.</param>
/// <param name="encoding">Character encoding used to decode the response stream.</param>
/// <returns>The full response body as a string.</returns>
/// <remarks>
/// Failures are not caught here; any exception raised while creating the
/// request, obtaining the response or reading the stream propagates to the caller.
/// </remarks>
private static string GetHtmlCode(string url, Encoding encoding)
{
    System.Net.HttpWebRequest request = (System.Net.HttpWebRequest)System.Net.WebRequest.Create(url);
    request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

    // Dispose the response, its stream and the reader even when reading throws;
    // previously the response was never closed and nothing was released on failure.
    using (System.Net.WebResponse response = request.GetResponse())
    using (System.IO.Stream resStream = response.GetResponseStream())
    using (System.IO.StreamReader sr = new System.IO.StreamReader(resStream, encoding))
    {
        return sr.ReadToEnd();
    }
}
我用上面的代码抓取那个 SharePoint 站点的 HTML 代码,但每次抓到的都是一个错误页面的代码;调试时用 HTML 方式查看 responseText 的值就能看明白。奇怪的是,我用 IE 直接打开那个站点却可以正常得到数据并显示出来,我不知道是什么原因,请大家试试。如果换成别的 URL(比如 Google、Baidu 之类),都可以正常抓取。
// NOTE(review): `wc` is not declared anywhere in the visible source — presumably a System.Net.WebClient instance; confirm.
// The argument passed to DownloadString is an empty string literal, so the target address still has to be supplied.
string content = wc.DownloadString("");
// NOTE(review): the method header for this body is not visible in the source — presumably an ASP.NET page-load handler; confirm.
{
// Only fetch when the page is not handling a postback.
if(!Page.IsPostBack)
{
// Fetch the address supplied in the "s" query-string parameter and decode the body as gb2312.
// The result is stored in a local and is not used further within this block.
// NOTE(review): the address comes straight from the query string, so the server will request
// whatever URL a client supplies — consider validating it before calling Download.
string responseText = Download(Request.QueryString["s"], "gb2312");
}
}
/// <summary>
/// Requests <paramref name="URL"/> and returns the response body decoded with
/// the encoding named by <paramref name="str"/>.
/// </summary>
/// <param name="URL">Address to request; the request is cast to <c>HttpWebRequest</c>.</param>
/// <param name="str">Encoding name passed to <c>Encoding.GetEncoding</c>, e.g. "gb2312".</param>
/// <returns>
/// The response body as text, or <c>null</c> when the request, the encoding
/// lookup or the read fails for any reason.
/// </returns>
public static string Download(string URL, string str)
{
    // Nothing to request; every failure is reported to the caller as null.
    if (string.IsNullOrEmpty(URL))
    {
        return null;
    }

    try
    {
        // Resolve the encoding before touching the network so a bad name fails fast.
        System.Text.Encoding encoding = System.Text.Encoding.GetEncoding(str);

        // Build the request for the given URL and present a browser-like user agent.
        HttpWebRequest wrqPage = (HttpWebRequest)WebRequest.Create(URL);
        wrqPage.UserAgent = "Mozilla/4.0(compatible; MSIE 6.0; Windows NT 5.1)";

        // Dispose the response, its stream and the reader on every path;
        // previously the response and the reader were never released.
        using (WebResponse wrpPage = wrqPage.GetResponse())
        using (Stream strmPage = wrpPage.GetResponseStream())
        using (StreamReader srPage = new StreamReader(strmPage, encoding))
        {
            // Read the whole body into a string.
            return srPage.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        // Deliberately best-effort: callers get null on failure, but the
        // cause is recorded instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("Download failed for '{0}': {1}", URL, ex.Message);
        return null;
    }
}