public static string GetHtmlAutoEncoding()
        {
            HttpWebRequest req = null;
            HttpWebResponse resp = null;
            Stream stream = null;
            StreamReader read = null;
            try
            {                string url = "http://roll.sohu.com/20110827/n317536952.shtml";
                req = (HttpWebRequest)HttpWebRequest.Create(sUrl);
                req.UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:6.0) Gecko/20100101 Firefox/6.0";
                req.Accept = "*/*";
                req.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
                req.ContentType = "text/xml";
                req.Referer = url;                resp = (HttpWebResponse)req.GetResponse();
                Encoding enc = Encoding.GetEncoding(resp.CharacterSet);                string sHTML = string.Empty;
                stream = resp.GetResponseStream();
                string sChartSet = "";
                read = new StreamReader(stream, enc);                sHTML = read.ReadToEnd();
                Match charSetMatch = Regex.Match(sHTML, @"charset=(?<code>[\s\S]+?)", RegexOptions.IgnoreCase);
                sChartSet = charSetMatch.Groups["code"].Value;
                //if it's not utf-8,we should redecode the html.
                if (!string.IsNullOrEmpty(sChartSet.Trim()))
                    sHTML = Encoding.GetEncoding(sChartSet).GetString(enc.GetBytes(sHTML));
                if (resp.CharacterSet != "iso-8859-1")
                {
                    Encoding encoding = System.Text.Encoding.GetEncoding("ISO-8859-1");
                    byte[] byteArray = encoding.GetBytes(sHTML);                    string aaXX = System.Text.Encoding.GetEncoding("GBK").GetString(byteArray);
                }
                return sHTML;
            }
            catch
            {
                return "";
            }
            finally
            {
                if (resp != null)
                {
                    resp.Close();
                }
                if (stream != null)
                {
                    stream.Close();
                }
                if (read != null)
                {
                    read.Close();
                }
            }
        }
上面代码是获取页面源码,但是很多页面resp.CharacterSet 的编码格式是iso-8859-1,尝试了几种方法,都不行正确转码,请指点。 Encoding encoding = System.Text.Encoding.GetEncoding("ISO-8859-1");
                    byte[] byteArray = encoding.GetBytes(sHTML);                    string aaXX = System.Text.Encoding.GetEncoding("GBK").GetString(byteArray);
此种方法出来还是乱码。

解决方案 »

  1.   

            private static Encoding ConvertISO88591ForEncoding(string srcString)
            {
                String sResult;
                Encoding rEncoding = Encoding.UTF8;
                Encoding ISO88591Encoding = Encoding.GetEncoding("ISO-8859-1");
                Encoding GB2312Encoding = Encoding.GetEncoding("GB2312");
                byte[] srcBytes = ISO88591Encoding.GetBytes(srcString);
                byte[] dstBytes = Encoding.Convert(GB2312Encoding, ISO88591Encoding, srcBytes);
                char[] dstChars = new char[ISO88591Encoding.GetCharCount(dstBytes, 0, dstBytes.Length)];
                ISO88591Encoding.GetChars(dstBytes, 0, dstBytes.Length, dstChars, 0);
                sResult = new string(dstChars);
                if (sResult.Contains("gb2312"))
                    rEncoding = Encoding.Default;
                return rEncoding;
            }
    楼主给分吧,这个函数是我自己的写的,确保能用
      

  2.   

    iso-8859-1是西欧拉丁语,单字节编码,根本就不支持中文...如果有内容有中文,是那些垃圾网站的问题,因为它们没有为多字节编码页面设置charset,很多浏览器就默认iso-8859-1了...直接用ANSI或UTF-8读,不能用ISO-8859-1解码...至于它正确的编码是什么,你就挨个试吧...郁闷的话就骂它们的程序员...
      

  3.   

    关键还是sohu的 是我获取错了吗?
      

  4.   

    仔细看看还真是你的request有问题...加上 Accept-Charset:GB2312,utf-8; Accept-Language 改成 Accept-Language:zh-cn,zh;
      

  5.   

    加上了,utf-8 gb2312,gbk 都试了,还是乱码。