抓取的网页格式
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
和我调用里面的设置是一样的 为什么会出现乱码
试了其他格式的也是乱码
调用代码如下
///命名空间
using System.Text;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;

            // Downloads the page and decodes its body as GB2312. The response and the reader are
            // wrapped in using-blocks so they are released even when the read fails part-way.
            try
            {
                HttpWebRequest all_codeRequest = (HttpWebRequest)WebRequest.Create("http://google.gb5u.cn/qiangzhixingjizhuyan/");
                // Let the framework inflate gzip/deflate bodies transparently. A compressed body
                // decoded directly as text comes out garbled no matter which charset is chosen.
                all_codeRequest.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;

                using (HttpWebResponse all_codeResponse = (HttpWebResponse)all_codeRequest.GetResponse())
                {
                    if (all_codeResponse.StatusCode == HttpStatusCode.OK)
                    {
                        Encoding encoding = Encoding.GetEncoding("gb2312");
                        using (StreamReader the_Reader = new StreamReader(all_codeResponse.GetResponseStream(), encoding))
                        {
                            string _content = the_Reader.ReadToEnd();
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Still best-effort (nothing is rethrown), but record the failure instead of
                // discarding it silently.
                System.Diagnostics.Trace.TraceError(ex.ToString());
            }

解决方案 »

  1.   


     /// <summary>   
            /// 取得网页源码   
            /// </summary>   
            /// <param name="url">网页地址,eg: "http://www.yongfa365.com/" </param>    
            /// <param name="charset">网页编码,eg: Encoding.UTF8</param>   
            /// <returns>返回网页源文件</returns>   
            public static string GetHtmlSource(string url, Encoding charset)
            {
                //处理内容   
                string html = "";
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                    HttpWebResponse response = (HttpWebResponse)request.GetResponse();
                    Stream stream = response.GetResponseStream();
                    StreamReader reader = new StreamReader(stream, charset);
                    html = reader.ReadToEnd();
                    stream.Close();
                }
                catch (Exception e)
                {
                    logServer.addLog(e.Message, "网络爬虫模块");
                }
                return html;
            }
    做网爬的时候,最主要的就是解码用的编码要与 HTML 页面的编码相同,不然就是乱码。我刚刚做了个,一般用 Default 就行了,有些特殊的用
    "Default",
    "UTF8",
    "UTF7",
    "Unicode",
    "ASCII"  只有这几个 没其他的什么gb2312
      

  2.   

    是用这个http://google.gb5u.cn/qiangzhixingjizhuyan/网址测试的吗?
      

  3.   

    /// <summary>
            /// Downloads the full source of a web page, decoding the response body as UTF-8.
            /// </summary>
            /// <param name="Url">Address of the page to download.</param>
            /// <returns>
            /// The page source, or the literal string "错误" when the request or the read fails.
            /// </returns>
            public static string _GetHtml(string Url)
            {
                try
                {
                    HttpWebRequest MyRequest = (HttpWebRequest)WebRequest.Create(Url);
                    // The using-blocks release the response, stream and reader on every path.
                    using (HttpWebResponse MyResponse = (HttpWebResponse)MyRequest.GetResponse())
                    using (Stream MyInStream = MyResponse.GetResponseStream())
                    using (StreamReader sr = new StreamReader(MyInStream, Encoding.UTF8))
                    {
                        // ReadToEnd buffers internally, avoiding the quadratic cost of growing
                        // a string by concatenation in 256-character chunks.
                        return sr.ReadToEnd();
                    }
                }
                catch (Exception)
                {
                    return "错误";
                }
            }
      

  4.   

    传http://google.gb5u.cn/qiangzhixingjizhuyan/调用这个也是乱码
      

  5.   

    你用我的方法 把那几个
    public static string GetHtmlSource(string url, Encoding charset)
    charset为: if (encodeValue == "Default")
                {
                    return Encoding.Default;
                }
                else if (encodeValue == "UTF8")
                {
                    return Encoding.UTF8;
                }
                else if (encodeValue == "UTF7")
                {
                    return Encoding.UTF7;
                }
                else if (encodeValue == "Unicode")
                {
                    return Encoding.Unicode;
                }
                else if (encodeValue == "ASCII")
                {
                    return Encoding.ASCII;
                }
                else
                {
                    return Encoding.Default;
                }
    不敢说百分百行,至少95%的网站都不会乱码
      

  6.   

    敢问如何动态获得网页的 encodeValue 是什么值
      

  7.   

     获取网页的HTML内容,指定Encoding 
    /// <summary>
    /// Downloads a page and returns its HTML decoded with the given encoding. When
    /// <paramref name="encoding"/> is null, the encoding is taken from the charset
    /// declaration found in the page itself.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <param name="encoding">Encoding to decode with, or null to auto-detect.</param>
    /// <returns>The decoded HTML text.</returns>
    static string GetHtml(string url, Encoding encoding)
    {
        byte[] buf;
        using (WebClient client = new WebClient())
        {
            buf = client.DownloadData(url);
        }

        if (encoding != null)
        {
            return encoding.GetString(buf);
        }

        // Decode provisionally as UTF-8 only to locate the charset declaration.
        string html = Encoding.UTF8.GetString(buf);
        Encoding detected = GetEncoding(html);

        // Compare code pages rather than references: GetEncoding is not guaranteed to hand
        // back the very same instance as Encoding.UTF8.
        if (detected == null || detected.CodePage == Encoding.UTF8.CodePage)
        {
            return html;
        }
        return detected.GetString(buf);
    }

    /// <summary>
    /// Extracts the encoding named by the first charset declaration in the HTML text.
    /// </summary>
    /// <param name="html">HTML text to search.</param>
    /// <returns>
    /// The declared encoding, or null when there is no declaration or the name is not recognised.
    /// </returns>
    static Encoding GetEncoding(string html)
    {
        // Tolerates whitespace around '=' and an optional opening quote, so both
        // content="text/html; charset=gb2312" and <meta charset="gb2312"> are matched.
        string pattern = @"(?i)\bcharset\s*=\s*[""']?(?<charset>[-a-zA-Z_0-9]+)";
        Match match = Regex.Match(html, pattern);
        if (!match.Success)
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(match.Groups["charset"].Value);
        }
        catch (ArgumentException)
        {
            return null;
        }
    }
      

  8.   


    现在的问题是 编码设置的是正确的 
    抓取的http://google.gb5u.cn/qiangzhixingjizhuyan/网页为什么是乱码的 
    其他的网页是没问题的
      

  9.   

    先自动获取编码,再抓取内容
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.IO.Compression;
    using System.Text.RegularExpressions;

    namespace WikiPageCreater.Common
    {
        /// <summary>
        /// Helpers for downloading a web page and for working out which character encoding it declares.
        /// </summary>
        public class PageHelper
        {
            // Responses whose reported Content-Length reaches this many bytes are skipped.
            private const int MaxContentLength = 1024 * 1024;

            // Request timeout, in milliseconds.
            private const int TimeoutMilliseconds = 20000;

            // Matches a charset declaration case-insensitively and skips an optional opening quote,
            // so both content="text/html; charset=gb2312" and <meta charset="gb2312"> yield "gb2312".
            private static readonly Regex CharsetPattern = new Regex(
                @"charset\s*=\s*[""']?(?<charset>[-\w.:]+)",
                RegexOptions.IgnoreCase);

            /// <summary>
            /// Downloads the page at <paramref name="url"/> and returns the name of its character encoding.
            /// </summary>
            /// <param name="url">Address of the page to inspect.</param>
            /// <returns>
            /// The charset declared inside the page if one is found, otherwise the charset of the
            /// HTTP response, otherwise the body name of <see cref="Encoding.Default"/>. The latter is
            /// also returned when the request fails, is not answered with 200 OK, or is too large.
            /// </returns>
            public static string GetEncoding(string url)
            {
                try
                {
                    using (HttpWebResponse response = OpenResponse(url))
                    {
                        if (IsUsable(response))
                        {
                            // Read the header value before the body stream is consumed and closed.
                            string headerCharset = response.CharacterSet;

                            string html;
                            // ASCII is enough here: only the charset declaration is of interest.
                            using (StreamReader reader = new StreamReader(OpenBody(response), Encoding.ASCII))
                            {
                                html = reader.ReadToEnd();
                            }

                            Match match = CharsetPattern.Match(html);
                            if (match.Success)
                            {
                                return match.Groups["charset"].Value;
                            }
                            if (!string.IsNullOrEmpty(headerCharset))
                            {
                                return headerCharset;
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort lookup: fall back to the default below, but leave a trace.
                    System.Diagnostics.Trace.TraceWarning("PageHelper.GetEncoding failed: " + ex);
                }
                return Encoding.Default.BodyName;
            }

            /// <summary>
            /// Downloads the page at <paramref name="url"/> and returns its HTML decoded with
            /// <paramref name="encoding"/>.
            /// </summary>
            /// <param name="url">Address of the page to download.</param>
            /// <param name="encoding">Encoding used to decode the response body.</param>
            /// <returns>
            /// The page source, or an empty string when the request fails, is not answered with
            /// 200 OK, or reports a length of 1 MB or more.
            /// </returns>
            public static string GetHtml(string url, Encoding encoding)
            {
                try
                {
                    using (HttpWebResponse response = OpenResponse(url))
                    {
                        if (IsUsable(response))
                        {
                            using (StreamReader reader = new StreamReader(OpenBody(response), encoding))
                            {
                                return reader.ReadToEnd();
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort download: report failure as an empty result, but leave a trace.
                    System.Diagnostics.Trace.TraceWarning("PageHelper.GetHtml failed: " + ex);
                }
                return string.Empty;
            }

            // Issues the GET request with the timeout and no-redirect settings shared by both public methods.
            private static HttpWebResponse OpenResponse(string url)
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = TimeoutMilliseconds;
                request.AllowAutoRedirect = false;
                return (HttpWebResponse)request.GetResponse();
            }

            // True when the response is 200 OK and its reported length is under the size limit.
            private static bool IsUsable(HttpWebResponse response)
            {
                return response.StatusCode == HttpStatusCode.OK
                    && response.ContentLength < MaxContentLength;
            }

            // Returns the response body, wrapped in a gzip decompressor when the server says it is gzip-encoded.
            private static Stream OpenBody(HttpWebResponse response)
            {
                Stream body = response.GetResponseStream();
                if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                {
                    return new GZipStream(body, CompressionMode.Decompress);
                }
                return body;
            }
        }
    }
      

  10.   

    大侠们 谁能试下抓取http://google.gb5u.cn/qiangzhixingjizhuyan/网页不出现乱码吗
      

  11.   

    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
    <title>强直性脊柱炎的症状_强直性脊柱炎的治疗方法_湖南强直性脊柱炎哪里治疗_湖南省军区医院骨科中心</title>
    <meta name="keywords" content="强直性脊柱炎的症状,强直性脊柱炎的治疗方法,湖南强直性脊柱炎哪里治疗好" />
    <meta name="description" content="强直性脊柱炎的治疗方法新突破,湖南强直性脊柱炎哪里治疗好采用的新一代“液体刀”滑膜切除术,能直接针对强直性脊柱炎的症状病灶,使关节滑膜炎症细胞失去活性并被溶解吸收,突破传统方法的局限性." />
    <meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7" />
    <link href="/css/kmwj.css" rel="stylesheet" type="text/css" />
    <SCRIPT language="javascript" type="text/javascript" src="/js/wj.js"></SCRIPT>
    </head>
    <body class="articlelist">
    <div class="wj_main w980">
    <div class="wj_banner w960">
      

  12.   

    我用了 7L和13L的 方法 无乱码
    不过7L的 方法有误 应为
    if (encodeValue.ToUpper() == "DEFAULT")
            {
                return Encoding.Default;
            }
            else if (encodeValue.ToUpper() == "UTF-8")
            {
                return Encoding.UTF8;
            }
            else if (encodeValue.ToUpper() == "UTF-7")
            {
                return Encoding.UTF7;
            }
            else if (encodeValue.ToUpper() == "UNICODE")
            {
                return Encoding.Unicode;
            }
            else if (encodeValue.ToUpper() == "ASCII")
            {
                return Encoding.ASCII;
            }
            else
            {
                return Encoding.Default;
            }
      

  13.   

    ASP.NET正在学习中,谢谢分享!
      

  14.   

    先自动获取编码,再抓取内容
    C# code
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Text;
    using System.Net;
    using System.IO;
    using System.IO.Compression;
    using System.Text.RegularExpressions;

    namespace WikiPageCreater.Common
    {
        /// <summary>
        /// Helpers for downloading a web page and for working out which character encoding it declares.
        /// </summary>
        public class PageHelper
        {
            // Responses whose reported Content-Length reaches this many bytes are skipped.
            private const int MaxContentLength = 1024 * 1024;

            // Request timeout, in milliseconds.
            private const int TimeoutMilliseconds = 20000;

            // Matches a charset declaration case-insensitively and skips an optional opening quote,
            // so both content="text/html; charset=gb2312" and <meta charset="gb2312"> yield "gb2312".
            private static readonly Regex CharsetPattern = new Regex(
                @"charset\s*=\s*[""']?(?<charset>[-\w.:]+)",
                RegexOptions.IgnoreCase);

            /// <summary>
            /// Downloads the page at <paramref name="url"/> and returns the name of its character encoding.
            /// </summary>
            /// <param name="url">Address of the page to inspect.</param>
            /// <returns>
            /// The charset declared inside the page if one is found, otherwise the charset of the
            /// HTTP response, otherwise the body name of <see cref="Encoding.Default"/>. The latter is
            /// also returned when the request fails, is not answered with 200 OK, or is too large.
            /// </returns>
            public static string GetEncoding(string url)
            {
                try
                {
                    using (HttpWebResponse response = OpenResponse(url))
                    {
                        if (IsUsable(response))
                        {
                            // Read the header value before the body stream is consumed and closed.
                            string headerCharset = response.CharacterSet;

                            string html;
                            // ASCII is enough here: only the charset declaration is of interest.
                            using (StreamReader reader = new StreamReader(OpenBody(response), Encoding.ASCII))
                            {
                                html = reader.ReadToEnd();
                            }

                            Match match = CharsetPattern.Match(html);
                            if (match.Success)
                            {
                                return match.Groups["charset"].Value;
                            }
                            if (!string.IsNullOrEmpty(headerCharset))
                            {
                                return headerCharset;
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort lookup: fall back to the default below, but leave a trace.
                    System.Diagnostics.Trace.TraceWarning("PageHelper.GetEncoding failed: " + ex);
                }
                return Encoding.Default.BodyName;
            }

            /// <summary>
            /// Downloads the page at <paramref name="url"/> and returns its HTML decoded with
            /// <paramref name="encoding"/>.
            /// </summary>
            /// <param name="url">Address of the page to download.</param>
            /// <param name="encoding">Encoding used to decode the response body.</param>
            /// <returns>
            /// The page source, or an empty string when the request fails, is not answered with
            /// 200 OK, or reports a length of 1 MB or more.
            /// </returns>
            public static string GetHtml(string url, Encoding encoding)
            {
                try
                {
                    using (HttpWebResponse response = OpenResponse(url))
                    {
                        if (IsUsable(response))
                        {
                            using (StreamReader reader = new StreamReader(OpenBody(response), encoding))
                            {
                                return reader.ReadToEnd();
                            }
                        }
                    }
                }
                catch (Exception ex)
                {
                    // Best-effort download: report failure as an empty result, but leave a trace.
                    System.Diagnostics.Trace.TraceWarning("PageHelper.GetHtml failed: " + ex);
                }
                return string.Empty;
            }

            // Issues the GET request with the timeout and no-redirect settings shared by both public methods.
            private static HttpWebResponse OpenResponse(string url)
            {
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
                request.Timeout = TimeoutMilliseconds;
                request.AllowAutoRedirect = false;
                return (HttpWebResponse)request.GetResponse();
            }

            // True when the response is 200 OK and its reported length is under the size limit.
            private static bool IsUsable(HttpWebResponse response)
            {
                return response.StatusCode == HttpStatusCode.OK
                    && response.ContentLength < MaxContentLength;
            }

            // Returns the response body, wrapped in a gzip decompressor when the server says it is gzip-encoded.
            private static Stream OpenBody(HttpWebResponse response)
            {
                Stream body = response.GetResponseStream();
                if (string.Equals(response.ContentEncoding, "gzip", StringComparison.OrdinalIgnoreCase))
                {
                    return new GZipStream(body, CompressionMode.Decompress);
                }
                return body;
            }
        }
    }
      

  15.   

    楼主,分是我的了!告诉你原因:
    你这句代码有问题<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
    注意 有个空格 两个属性之间 content 和 charset 多出一个空格,所以http头响应后 ,给你按默认的编码解析了哦。再说一遍  content="text/html;     <!--空格在这里-->    charset=gb2312"