抓取的网页格式
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
和我调用代码里设置的编码是一样的，为什么还会出现乱码？
试了其他编码格式，也是乱码。
调用代码如下：
// Namespaces
using System.Text;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

// Downloads one page and decodes it as gb2312.
try
{
    HttpWebRequest all_codeRequest = (HttpWebRequest)WebRequest.Create("http://google.gb5u.cn/qiangzhixingjizhuyan/");

    // If the server sends a gzip/deflate-compressed body, decoding the raw
    // compressed bytes as text yields garbage no matter which charset is used.
    // Let the framework advertise and undo the compression transparently.
    // NOTE(review): compression is the suspected cause of the garbled text; confirm against the response headers.
    all_codeRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

    // using blocks release the connection even when reading throws.
    using (HttpWebResponse all_codeResponse = (HttpWebResponse)all_codeRequest.GetResponse())
    {
        if (all_codeResponse.StatusCode == HttpStatusCode.OK)
        {
            Encoding encoding = Encoding.GetEncoding("gb2312");
            using (StreamReader the_Reader = new StreamReader(all_codeResponse.GetResponseStream(), encoding))
            {
                string _content = the_Reader.ReadToEnd();
            }
        }
    }
}
catch (Exception)
{
    // NOTE(review): failures are deliberately ignored here, as in the original
    // snippet; log or surface them in real code.
}
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
和我调用代码里设置的编码是一样的，为什么还会出现乱码？
试了其他编码格式，也是乱码。
调用代码如下：
// Namespaces
using System.Text;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

// Downloads one page and decodes it as gb2312.
try
{
    HttpWebRequest all_codeRequest = (HttpWebRequest)WebRequest.Create("http://google.gb5u.cn/qiangzhixingjizhuyan/");

    // If the server sends a gzip/deflate-compressed body, decoding the raw
    // compressed bytes as text yields garbage no matter which charset is used.
    // Let the framework advertise and undo the compression transparently.
    // NOTE(review): compression is the suspected cause of the garbled text; confirm against the response headers.
    all_codeRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

    // using blocks release the connection even when reading throws.
    using (HttpWebResponse all_codeResponse = (HttpWebResponse)all_codeRequest.GetResponse())
    {
        if (all_codeResponse.StatusCode == HttpStatusCode.OK)
        {
            Encoding encoding = Encoding.GetEncoding("gb2312");
            using (StreamReader the_Reader = new StreamReader(all_codeResponse.GetResponseStream(), encoding))
            {
                string _content = the_Reader.ReadToEnd();
            }
        }
    }
}
catch (Exception)
{
    // NOTE(review): failures are deliberately ignored here, as in the original
    // snippet; log or surface them in real code.
}
/// <summary>
/// Downloads the source of a web page.
/// </summary>
/// <param name="url">Page address, e.g. "http://www.yongfa365.com/"</param>
/// <param name="charset">Encoding used to decode the response body, e.g. Encoding.UTF8</param>
/// <returns>The page source; an empty string if the request or the read fails.</returns>
public static string GetHtmlSource(string url, Encoding charset)
{
    string html = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);

        // Stacked using blocks release the response, its stream and the reader
        // on every path, including when ReadToEnd throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, charset))
        {
            html = reader.ReadToEnd();
        }
    }
    catch (Exception e)
    {
        // Failures are logged and reported to the caller as an empty string.
        logServer.addLog(e.Message, "网络爬虫模块");
    }
    return html;
}
做网爬的时候，最主要的就是编码要与 HTML 页面的编码相同，不然就是乱码。我刚刚做了个，一般用 Default 就行了，有些特殊的用：
"Default",
"UTF8",
"UTF7",
"Unicode",
"ASCII" 只有这几个 没其他的什么gb2312
/// <summary>
/// Gets the source of a whole web page, decoding the body as UTF-8.
/// </summary>
/// <param name="Url">Address of the page to download.</param>
/// <returns>The page source, or the literal string "错误" if anything fails.</returns>
public static string _GetHtml(string Url)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

        // The using blocks close the response, its stream and the reader on
        // every path; the original only closed the stream.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream body = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(body, Encoding.UTF8))
        {
            // One ReadToEnd replaces the 256-char loop that rebuilt the
            // result string on every iteration.
            return reader.ReadToEnd();
        }
    }
    catch (Exception)
    {
        // Any failure is reported through this sentinel value, as before.
        return "错误";
    }
}
public static string GetHtmlSource(string url, Encoding charset)
charset为: switch (encodeValue)
{
    // Maps a configured encoding name (exact, case-sensitive match) to its Encoding.
    case "UTF8":
        return Encoding.UTF8;
    case "UTF7":
        return Encoding.UTF7;
    case "Unicode":
        return Encoding.Unicode;
    case "ASCII":
        return Encoding.ASCII;
    default:
        // Covers "Default" itself as well as null and every unrecognized name.
        return Encoding.Default;
}
不敢说百分百行，但至少 95% 的网站都不会乱码。
/// <summary>
/// Downloads a page and decodes it, detecting the charset from the HTML when none is given.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="encoding">Encoding to decode with, or null to detect it from the page's charset declaration.</param>
/// <returns>The decoded page source.</returns>
static string GetHtml(string url, Encoding encoding)
{
    byte[] buf;
    // WebClient is disposable; release it as soon as the bytes are downloaded.
    using (WebClient client = new WebClient())
    {
        buf = client.DownloadData(url);
    }
    if (encoding != null) return encoding.GetString(buf);

    // First pass: decode as UTF-8 just to be able to read the charset declaration.
    string html = Encoding.UTF8.GetString(buf);
    encoding = GetEncoding(html);

    // Compare by code page rather than by reference, so any UTF-8 instance
    // skips the redundant second decode.
    if (encoding == null || encoding.CodePage == Encoding.UTF8.CodePage) return html;
    return encoding.GetString(buf);
} // Extracts the page's Encoding from its HTML content (describes GetEncoding below)
/// <summary>
/// Extracts the character encoding declared in a page's HTML.
/// </summary>
/// <param name="html">HTML text to search for a "charset=" declaration.</param>
/// <returns>The declared encoding, or null if none is declared or the name is not a supported encoding.</returns>
static Encoding GetEncoding(string html)
{
    // The optional quote after '=' makes HTML5 <meta charset="utf-8"> match as
    // well as the older content="text/html; charset=utf-8" form.
    string pattern = @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)";
    string charset = Regex.Match(html, pattern).Groups["charset"].Value;

    // No declaration found: answer directly instead of provoking an exception.
    if (charset.Length == 0) return null;

    try { return Encoding.GetEncoding(charset); }
    catch (ArgumentException) { return null; }
}
现在的问题是：编码设置是正确的，
抓取 http://google.gb5u.cn/qiangzhixingjizhuyan/ 这个网页却是乱码，
其他的网页是没问题的。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.IO.Compression;
using System.Text.RegularExpressions;

namespace WikiPageCreater.Common
{
    /// <summary>
    /// Helpers for downloading a web page and detecting its declared character set.
    /// </summary>
    public class PageHelper
    {
        // Request timeout in milliseconds.
        private const int TimeoutMilliseconds = 20000;

        // Responses declaring a Content-Length of this many bytes or more are not read.
        private const long MaxContentLength = 1024 * 1024;

        // Captures the token after "charset=": skips an optional opening quote and
        // stops at the first quote, whitespace, ';', '/' or '>'. The previous pattern
        // captured an empty string for charset="..." and ran on to the next quote
        // for unquoted values.
        private static readonly Regex CharsetPattern = new Regex(
            @"charset\b\s*=\s*[""']?(?<charset>[^""'\s;/>]+)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Gets the character set name of the page at the given url.
        /// </summary>
        /// <param name="url">Address of the page to inspect.</param>
        /// <returns>
        /// The charset declared in the page body if one is found, otherwise the charset of the
        /// response, otherwise the body name of <see cref="Encoding.Default"/>. The default is
        /// also returned when the request fails, is not answered with 200 OK, or is too large.
        /// </returns>
        public static string GetEncoding(string url)
        {
            try
            {
                HttpWebRequest request = CreateRequest(url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (IsReadable(response))
                    {
                        string html;
                        // ASCII is sufficient here: only the charset declaration is read.
                        using (StreamReader reader = new StreamReader(OpenBody(response), Encoding.ASCII))
                        {
                            html = reader.ReadToEnd();
                        }

                        Match match = CharsetPattern.Match(html);
                        if (match.Success)
                        {
                            return match.Groups["charset"].Value;
                        }

                        if (!string.IsNullOrEmpty(response.CharacterSet))
                        {
                            return response.CharacterSet;
                        }
                    }
                }
            }
            catch (Exception)
            {
                // Best effort: any failure falls back to the default encoding name below.
            }

            return Encoding.Default.BodyName;
        }

        /// <summary>
        /// Gets the html source of the page at the given url, decoded with the given encoding.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="encoding">Encoding used to decode the response body.</param>
        /// <returns>
        /// The page source, or an empty string when the request fails, is not answered
        /// with 200 OK, or is too large.
        /// </returns>
        public static string GetHtml(string url, Encoding encoding)
        {
            try
            {
                HttpWebRequest request = CreateRequest(url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (IsReadable(response))
                    {
                        using (StreamReader reader = new StreamReader(OpenBody(response), encoding))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (Exception)
            {
                // Best effort: any failure is reported as an empty string below.
            }

            return string.Empty;
        }

        // Builds the request shared by both entry points: 20 s timeout, redirects not followed.
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = TimeoutMilliseconds;
            request.AllowAutoRedirect = false;
            return request;
        }

        // True when the response is 200 OK and its declared length is under the size limit.
        private static bool IsReadable(HttpWebResponse response)
        {
            return response.StatusCode == HttpStatusCode.OK
                && response.ContentLength < MaxContentLength;
        }

        // Returns the response body, wrapped in a decompressing stream when the
        // server marked it as gzip-encoded.
        private static Stream OpenBody(HttpWebResponse response)
        {
            Stream body = response.GetResponseStream();
            if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
            {
                return new GZipStream(body, CompressionMode.Decompress);
            }

            return body;
        }
    }
}
<!-- Excerpt of a page's markup, cut off inside the body. Its Content-Type meta tag declares charset=gb2312. -->
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
<title>强直性脊柱炎的症状_强直性脊柱炎的治疗方法_湖南强直性脊柱炎哪里治疗_湖南省军区医院骨科中心</title>
<meta name="keywords" content="强直性脊柱炎的症状,强直性脊柱炎的治疗方法,湖南强直性脊柱炎哪里治疗好" />
<meta name="description" content="强直性脊柱炎的治疗方法新突破,湖南强直性脊柱炎哪里治疗好采用的新一代“液体刀”滑膜切除术,能直接针对强直性脊柱炎的症状病灶,使关节滑膜炎症细胞失去活性并被溶解吸收,突破传统方法的局限性." />
<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7" />
<link href="/css/kmwj.css" rel="stylesheet" type="text/css" />
<SCRIPT language="javascript" type="text/javascript" src="/js/wj.js"></SCRIPT>
</head>
<body class="articlelist">
<div class="wj_main w980">
<div class="wj_banner w960">
不过 7L 的方法有误，应为：
// Maps an encoding name to its Encoding, ignoring case.
// Ordinal comparison is used instead of ToUpper(): culture-sensitive upper-casing
// breaks names containing 'i' (e.g. "unicode", "ascii") under the Turkish culture,
// and calling ToUpper() on a null value throws. A null value now falls through
// to the default branch.
if (string.Equals(encodeValue, "UTF-8", System.StringComparison.OrdinalIgnoreCase))
{
    return Encoding.UTF8;
}
else if (string.Equals(encodeValue, "UTF-7", System.StringComparison.OrdinalIgnoreCase))
{
    return Encoding.UTF7;
}
else if (string.Equals(encodeValue, "UNICODE", System.StringComparison.OrdinalIgnoreCase))
{
    return Encoding.Unicode;
}
else if (string.Equals(encodeValue, "ASCII", System.StringComparison.OrdinalIgnoreCase))
{
    return Encoding.ASCII;
}
else
{
    // "DEFAULT", null and every unrecognized name.
    return Encoding.Default;
}
C# code
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Net;
using System.IO;
using System.IO.Compression;
using System.Text.RegularExpressions;

namespace WikiPageCreater.Common
{
    /// <summary>
    /// Helpers for downloading a web page and detecting its declared character set.
    /// </summary>
    public class PageHelper
    {
        // Request timeout in milliseconds.
        private const int TimeoutMilliseconds = 20000;

        // Responses declaring a Content-Length of this many bytes or more are not read.
        private const long MaxContentLength = 1024 * 1024;

        // Captures the token after "charset=": skips an optional opening quote and
        // stops at the first quote, whitespace, ';', '/' or '>'. The previous pattern
        // captured an empty string for charset="..." and ran on to the next quote
        // for unquoted values.
        private static readonly Regex CharsetPattern = new Regex(
            @"charset\b\s*=\s*[""']?(?<charset>[^""'\s;/>]+)",
            RegexOptions.IgnoreCase);

        /// <summary>
        /// Gets the character set name of the page at the given url.
        /// </summary>
        /// <param name="url">Address of the page to inspect.</param>
        /// <returns>
        /// The charset declared in the page body if one is found, otherwise the charset of the
        /// response, otherwise the body name of <see cref="Encoding.Default"/>. The default is
        /// also returned when the request fails, is not answered with 200 OK, or is too large.
        /// </returns>
        public static string GetEncoding(string url)
        {
            try
            {
                HttpWebRequest request = CreateRequest(url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (IsReadable(response))
                    {
                        string html;
                        // ASCII is sufficient here: only the charset declaration is read.
                        using (StreamReader reader = new StreamReader(OpenBody(response), Encoding.ASCII))
                        {
                            html = reader.ReadToEnd();
                        }

                        Match match = CharsetPattern.Match(html);
                        if (match.Success)
                        {
                            return match.Groups["charset"].Value;
                        }

                        if (!string.IsNullOrEmpty(response.CharacterSet))
                        {
                            return response.CharacterSet;
                        }
                    }
                }
            }
            catch (Exception)
            {
                // Best effort: any failure falls back to the default encoding name below.
            }

            return Encoding.Default.BodyName;
        }

        /// <summary>
        /// Gets the html source of the page at the given url, decoded with the given encoding.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <param name="encoding">Encoding used to decode the response body.</param>
        /// <returns>
        /// The page source, or an empty string when the request fails, is not answered
        /// with 200 OK, or is too large.
        /// </returns>
        public static string GetHtml(string url, Encoding encoding)
        {
            try
            {
                HttpWebRequest request = CreateRequest(url);
                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                {
                    if (IsReadable(response))
                    {
                        using (StreamReader reader = new StreamReader(OpenBody(response), encoding))
                        {
                            return reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (Exception)
            {
                // Best effort: any failure is reported as an empty string below.
            }

            return string.Empty;
        }

        // Builds the request shared by both entry points: 20 s timeout, redirects not followed.
        private static HttpWebRequest CreateRequest(string url)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Timeout = TimeoutMilliseconds;
            request.AllowAutoRedirect = false;
            return request;
        }

        // True when the response is 200 OK and its declared length is under the size limit.
        private static bool IsReadable(HttpWebResponse response)
        {
            return response.StatusCode == HttpStatusCode.OK
                && response.ContentLength < MaxContentLength;
        }

        // Returns the response body, wrapped in a decompressing stream when the
        // server marked it as gzip-encoded.
        private static Stream OpenBody(HttpWebResponse response)
        {
            Stream body = response.GetResponseStream();
            if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
            {
                return new GZipStream(body, CompressionMode.Decompress);
            }

            return body;
        }
    }
}
你这句代码有问题<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
注意 有个空格 两个属性之间 content 和 charset 多出一个空格,所以http头响应后 ,给你按默认的编码解析了哦。再说一遍 content="text/html; <!--空格在这里--> charset=gb2312"