Encoding encode = System.Text.Encoding.GetEncoding(936); HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL); request.Timeout = 60*1000; request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"; WebResponse response = request.GetResponse(); Stream stream = response.GetResponseStream(); StreamReader reader = new StreamReader(stream,encode); string html = reader.ReadToEnd(); mShopping 为啥我测试还是通不过?
我的还是不行,代码如下: public static HtmlAndCookie getHtmlAndCookie(string v, string url, Method m, CookieContainer cookie, string Referer, WebProxy proxy, Encoding encode) { //创建一个HTTP请求 HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url); req.AllowAutoRedirect = false; //设置一个代理 if (proxy != null) req.Proxy = proxy; //req.Proxy = myproxy; //获取代理服务器的回应 if (cookie == null) cookie = new CookieContainer(); req.CookieContainer = cookie; //if (!string.IsNullOrEmpty(cookieString)) // req.CookieContainer.SetCookies(new Uri("http://www.cmfu.com"), cookieString); if (m == Method.Get) req.Method = "Get"; else req.Method = "Post"; req.Accept = "*/*"; req.KeepAlive = true; req.ContentType = "application/x-www-form-urlencoded"; //req.CookieContainer.SetCookies //req.Headers.Add("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727)"); req.Headers.Add("Accept-Language", "zh-cn"); req.Headers.Add("UA-CPU", "x86"); req.Headers.Add("Accept-Encoding", "gzip, deflate"); //req.Headers.Add("Connection", "Keep-Alive"); req.Headers.Add("Cache-Control", "no-cache"); req.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727)"; //req.UserAgent="Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"; req.ServicePoint.Expect100Continue = false; if (!string.IsNullOrEmpty(Referer)) req.Referer = Referer; if (encode == null) encode = Encoding.GetEncoding("GB2312"); if (m==Method.Post) { byte[] b = encode.GetBytes(v); req.ContentLength = b.Length; System.IO.Stream newStream = req.GetRequestStream(); newStream.Write(b, 0, b.Length); //newStream.Flush(); newStream.Close(); }
HttpWebResponse res = null; try { res = (HttpWebResponse)req.GetResponse(); } catch { //MessageBox.Show("无法连接服务器!"); return null; } //判断HTTP响应状态 if (res.StatusCode != HttpStatusCode.OK) { //richTextBox1.Invoke(new richWrite(richWrite_f), new object[] { "访问失败!" }); HtmlAndCookie hac = new HtmlAndCookie(); hac.GotoUrl = res.Headers["Location"]; hac.HttpStatusCode = res.StatusCode; hac.CookieContainer = req.CookieContainer; res.Close(); return hac; } //获取应答流 System.IO.Stream stream = res.GetResponseStream(); //if(!string.IsNullOrEmpty(res.CharacterSet)) //{ // encode=Encoding.GetEncoding(res.CharacterSet); //} System.IO.StreamReader r = new System.IO.StreamReader(stream,encode); string reV = r.ReadToEnd(); stream.Close(); res.Close(); HtmlAndCookie h = new HtmlAndCookie(); h.CookieContainer = req.CookieContainer; h.Html = reV; return h; }
问题已解决,原因是服务器传回的流使用了gzip方法压缩,需要先对流进行解压才行,关键代码: System.IO.StreamReader r = new System.IO.StreamReader(new GZipStream(res.GetResponseStream(), CompressionMode.Decompress),encode);
我用HttpWebResponse测试了下,不管是gb2312还是utf-8得到的都是乱码,和楼主的结果一样!很奇怪?
等会我用webBrowser测试下看看。
// Send a browser-style User-Agent string (IE 6 on Windows NT 5.1) with the request.
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
// Look up the text encoding for Windows code page 936; presumably chosen because
// the target page is Simplified Chinese (GB2312/GBK) - confirm against the site.
Encoding encode = System.Text.Encoding.GetEncoding(936);
代码部分如下: public frmTest()
{
// Presumably the WinForms designer-generated control setup (defined outside this snippet).
InitializeComponent();
// Hook the completion handler before navigating so the load started below is reported to it.
wb.DocumentCompleted += new WebBrowserDocumentCompletedEventHandler(wb_DocumentCompleted);
// Start loading the page; the HTML is read in wb_DocumentCompleted.
wb.Navigate("http://www.qidian.com/");
}
/// <summary>
/// Handles <c>wb.DocumentCompleted</c>: reads the whole page HTML and the body HTML
/// from the browser control once the document has finished loading.
/// </summary>
/// <param name="sender">The control raising the event (unused).</param>
/// <param name="e">Event data for the completed navigation (unused).</param>
private void wb_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
{
    // wb.DocumentText returned garbled Chinese text for this page, so decode the raw
    // document bytes ourselves using the charset the loaded document reports.
    string WholeHtml;
    using (System.IO.StreamReader reader = new System.IO.StreamReader(
        wb.DocumentStream,
        System.Text.Encoding.GetEncoding(wb.Document.Encoding)))
    {
        WholeHtml = reader.ReadToEnd();
    }

    // The body markup read through the DOM was not garbled; Body.OuterHtml works as well.
    string BodyHtml = wb.Document.Body.InnerHtml;
}
private WebBrowser wb = new WebBrowser(); 这个网页(起点)的内容里面没有链接可提取,主要都是JavaScript代码。估计数据都在后台吧,不知道是不是以XML文件存放在数据库!
用wb.DocumentText时候产生了乱码,我也不知道为什么?!
楼主想做搜索引擎么?WebBrowser 很慢,不值得取!
// Download the page at URL and decode it with the code page 936 encoding.
Encoding encode = System.Text.Encoding.GetEncoding(936);
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL);
request.Timeout = 60 * 1000; // milliseconds, i.e. one minute
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
// Let the framework inflate gzip/deflate bodies; reading a compressed body
// straight into a StreamReader yields garbage regardless of the text encoding.
request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

string html;
// Dispose the response, its stream and the reader even if reading throws,
// so the underlying connection is released.
using (WebResponse response = request.GetResponse())
using (Stream stream = response.GetResponseStream())
using (StreamReader reader = new StreamReader(stream, encode))
{
    html = reader.ReadToEnd();
}
mShopping
为啥我测试还是通不过?
/// <summary>
/// Issues a GET or POST request to <paramref name="url"/> and returns the decoded
/// response body together with the cookie container used for the request.
/// </summary>
/// <param name="v">Form body sent when <paramref name="m"/> is <c>Method.Post</c>; null is sent as an empty body.</param>
/// <param name="url">Address to request.</param>
/// <param name="m"><c>Method.Get</c> sends GET; any other value sends POST.</param>
/// <param name="cookie">Cookie container to use; a new one is created when null.</param>
/// <param name="Referer">Referer header value; omitted when null or empty.</param>
/// <param name="proxy">Proxy to route the request through; the default is kept when null.</param>
/// <param name="encode">Encoding for the POST body and the response text; GB2312 when null.</param>
/// <returns>
/// <c>null</c> when obtaining the response throws. When the status is not 200 OK, a result
/// carrying the <c>Location</c> header in <c>GotoUrl</c>, the status code and the cookie
/// container. Otherwise a result carrying the decoded body in <c>Html</c> and the cookie container.
/// </returns>
public static HtmlAndCookie getHtmlAndCookie(string v, string url, Method m, CookieContainer cookie, string Referer, WebProxy proxy, Encoding encode)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);

    // Redirects are not followed automatically; the non-OK branch below hands the
    // Location header back to the caller instead.
    req.AllowAutoRedirect = false;

    if (proxy != null)
    {
        req.Proxy = proxy;
    }

    if (cookie == null)
    {
        cookie = new CookieContainer();
    }
    req.CookieContainer = cookie;

    // HTTP verbs are case-sensitive tokens, so use the canonical upper-case spelling.
    req.Method = (m == Method.Get) ? "GET" : "POST";
    req.Accept = "*/*";
    req.KeepAlive = true;
    req.ContentType = "application/x-www-form-urlencoded";
    req.Headers.Add("Accept-Language", "zh-cn");
    req.Headers.Add("UA-CPU", "x86");
    // Advertise gzip/deflate through AutomaticDecompression rather than a hand-written
    // Accept-Encoding header: the framework then also inflates the body. Sending the
    // header alone made ReadToEnd return the compressed bytes as garbled text.
    req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
    req.Headers.Add("Cache-Control", "no-cache");
    req.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727)";
    req.ServicePoint.Expect100Continue = false;

    if (!string.IsNullOrEmpty(Referer))
    {
        req.Referer = Referer;
    }

    if (encode == null)
    {
        encode = Encoding.GetEncoding("GB2312");
    }

    if (m == Method.Post)
    {
        // A null body is sent as an empty one instead of throwing from GetBytes.
        byte[] payload = encode.GetBytes(v ?? string.Empty);
        req.ContentLength = payload.Length;
        using (System.IO.Stream requestStream = req.GetRequestStream())
        {
            requestStream.Write(payload, 0, payload.Length);
        }
    }

    HttpWebResponse res;
    try
    {
        res = (HttpWebResponse)req.GetResponse();
    }
    catch (System.Exception)
    {
        // Existing contract: any failure to obtain a response is reported as null.
        return null;
    }

    // Dispose the response on every path, including when reading the body throws.
    using (res)
    {
        if (res.StatusCode != HttpStatusCode.OK)
        {
            HtmlAndCookie hac = new HtmlAndCookie();
            hac.GotoUrl = res.Headers["Location"];
            hac.HttpStatusCode = res.StatusCode;
            hac.CookieContainer = req.CookieContainer;
            return hac;
        }

        using (System.IO.Stream stream = res.GetResponseStream())
        using (System.IO.StreamReader reader = new System.IO.StreamReader(stream, encode))
        {
            HtmlAndCookie h = new HtmlAndCookie();
            h.CookieContainer = req.CookieContainer;
            h.Html = reader.ReadToEnd();
            return h;
        }
    }
}
// Choose the decoder from the response's Content-Encoding instead of assuming gzip:
// the request advertises "gzip, deflate", so the server may also answer with deflate
// or with an uncompressed body, and GZipStream would fail on either.
System.IO.Stream body = res.GetResponseStream();
string contentEncoding = res.ContentEncoding ?? string.Empty;
if (contentEncoding.IndexOf("gzip", System.StringComparison.OrdinalIgnoreCase) >= 0)
{
    body = new System.IO.Compression.GZipStream(body, System.IO.Compression.CompressionMode.Decompress);
}
else if (contentEncoding.IndexOf("deflate", System.StringComparison.OrdinalIgnoreCase) >= 0)
{
    body = new System.IO.Compression.DeflateStream(body, System.IO.Compression.CompressionMode.Decompress);
}
System.IO.StreamReader r = new System.IO.StreamReader(body, encode);