网址:http://newspic.cn.yahoo.com/newspic/bbs/2260/
遇到个麻烦:虽然根据其标定的编码方式进行了转换,但源码中的汉字还是乱码(绝大多数yahoo网页和其它网页不存在这个问题)。首先用WebClient下载byte[]数据,然后获取网页源码:
/// <summary>
/// Decodes the raw bytes of an HTML page into text, using the charset declared
/// in the page's <c>&lt;meta&gt;</c> tag when one is present and supported.
/// </summary>
/// <param name="data">Raw bytes of the downloaded page.</param>
/// <param name="charSet">
/// Receives the charset name that was actually used to decode the page, or an
/// empty string when no usable declaration was found and the system default
/// encoding was used instead. A non-empty value is always a name accepted by
/// <see cref="Encoding.GetEncoding(string)"/>.
/// </param>
/// <returns>The decoded page source.</returns>
private string BytesToString(byte[] data, out string charSet)
{
    charSet = "";

    // First pass with the system default encoding, only to locate the <meta>
    // declaration; this string is also the fallback result.
    string sHtml = Encoding.Default.GetString(data);

    // Accepts both the legacy form
    //   <meta http-equiv="Content-Type" content="text/html; charset=gb2312">
    // and the HTML5 form <meta charset="utf-8">, with double, single or no quotes.
    Match m = Regex.Match(
        sHtml,
        "<meta\\b[^>]*?charset\\s*=\\s*[\"']?\\s*(?<name>[A-Za-z0-9_.:\\-]+)",
        RegexOptions.IgnoreCase);
    if (!m.Success)
    {
        return sHtml;
    }

    string declared = m.Groups["name"].Value;
    try
    {
        sHtml = Encoding.GetEncoding(declared).GetString(data);
        charSet = declared;
    }
    catch (ArgumentException)
    {
        // The page declares a charset this runtime does not know (misspelled,
        // or a code page that is not registered). Keep the default-decoded
        // text and leave charSet empty rather than failing the whole call.
    }

    return sHtml;
}
WebClient wc = new WebClient();
// Download the page as raw bytes so the text encoding can be chosen afterwards.
byte[] _Data = wc.DownloadData("http://newspic.cn.yahoo.com/newspic/bbs/2260/");
string _MimeType = wc.ResponseHeaders["content-type"];
string set = "";
string content = null;

// The charset in the HTTP Content-Type header takes precedence over the one in
// the page's <meta> tag. Some servers send UTF-8 while the markup still claims
// gb2312; trusting only the <meta> tag then produces garbled text.
Match headerCharset = Regex.Match(
    _MimeType ?? "",
    "charset\\s*=\\s*[\"']?\\s*(?<name>[A-Za-z0-9_.:\\-]+)",
    RegexOptions.IgnoreCase);
if (headerCharset.Success)
{
    string headerName = headerCharset.Groups["name"].Value;
    try
    {
        content = Encoding.GetEncoding(headerName).GetString(_Data);
        set = headerName;
    }
    catch (ArgumentException)
    {
        // Header names a charset this runtime does not support; fall back to
        // the <meta> declaration below.
    }
}

if (content == null)
{
    content = BytesToString(_Data, out set);
}

// Encoding.GetEncoding("") throws, so write UTF-8 when no charset was detected.
Encoding outputEncoding = set.Length > 0 ? Encoding.GetEncoding(set) : Encoding.UTF8;
System.IO.File.WriteAllText("C:\\html.txt", content, outputEncoding);
另外一个获取网页编码的办法是WebClient属性ResponseHeaders["content-type"];两种方法得到的网页编码都是gb2312,但不清楚为啥还是乱码。求解释,求成功示例!
/// <summary>
/// Decodes the raw bytes of an HTML page into text, using the charset declared
/// in the page's <c>&lt;meta&gt;</c> tag when one is present and supported.
/// </summary>
/// <param name="data">Raw bytes of the downloaded page.</param>
/// <param name="charSet">
/// Receives the charset name that was actually used to decode the page, or an
/// empty string when no usable declaration was found and the system default
/// encoding was used instead. A non-empty value is always a name accepted by
/// <see cref="Encoding.GetEncoding(string)"/>.
/// </param>
/// <returns>The decoded page source.</returns>
private string BytesToString(byte[] data, out string charSet)
{
    charSet = "";

    // First pass with the system default encoding, only to locate the <meta>
    // declaration; this string is also the fallback result.
    string sHtml = Encoding.Default.GetString(data);

    // Accepts both the legacy form
    //   <meta http-equiv="Content-Type" content="text/html; charset=gb2312">
    // and the HTML5 form <meta charset="utf-8">, with double, single or no quotes.
    Match m = Regex.Match(
        sHtml,
        "<meta\\b[^>]*?charset\\s*=\\s*[\"']?\\s*(?<name>[A-Za-z0-9_.:\\-]+)",
        RegexOptions.IgnoreCase);
    if (!m.Success)
    {
        return sHtml;
    }

    string declared = m.Groups["name"].Value;
    try
    {
        sHtml = Encoding.GetEncoding(declared).GetString(data);
        charSet = declared;
    }
    catch (ArgumentException)
    {
        // The page declares a charset this runtime does not know (misspelled,
        // or a code page that is not registered). Keep the default-decoded
        // text and leave charSet empty rather than failing the whole call.
    }

    return sHtml;
}
WebClient wc = new WebClient();
// Download the page as raw bytes so the text encoding can be chosen afterwards.
byte[] _Data = wc.DownloadData("http://newspic.cn.yahoo.com/newspic/bbs/2260/");
string _MimeType = wc.ResponseHeaders["content-type"];
string set = "";
string content = null;

// The charset in the HTTP Content-Type header takes precedence over the one in
// the page's <meta> tag. Some servers send UTF-8 while the markup still claims
// gb2312; trusting only the <meta> tag then produces garbled text.
Match headerCharset = Regex.Match(
    _MimeType ?? "",
    "charset\\s*=\\s*[\"']?\\s*(?<name>[A-Za-z0-9_.:\\-]+)",
    RegexOptions.IgnoreCase);
if (headerCharset.Success)
{
    string headerName = headerCharset.Groups["name"].Value;
    try
    {
        content = Encoding.GetEncoding(headerName).GetString(_Data);
        set = headerName;
    }
    catch (ArgumentException)
    {
        // Header names a charset this runtime does not support; fall back to
        // the <meta> declaration below.
    }
}

if (content == null)
{
    content = BytesToString(_Data, out set);
}

// Encoding.GetEncoding("") throws, so write UTF-8 when no charset was detected.
Encoding outputEncoding = set.Length > 0 ? Encoding.GetEncoding(set) : Encoding.UTF8;
System.IO.File.WriteAllText("C:\\html.txt", content, outputEncoding);
另外一个获取网页编码的办法是WebClient属性ResponseHeaders["content-type"];两种方法得到的网页编码都是gb2312,但不清楚为啥还是乱码。求解释,求成功示例!
解决方案 »
- 至少一个参数没有被指定值
- 新人请教关于sql事务的业务逻辑
- C#WinFrom的textBox取值
- C# 调用类中的函数,一定要用该类的实例来引用吗,可以直接调用吗
- 如何用SqlDataReader读出多行数据?
- 关于Excel的几个问题
- 在线问sql语句 急
- 【散分】如何往dataGrid中写入一行数据?
- 水晶报表中的TextObject(文本输入框)中的文本如何实现让程序控制换行
- 关于 未处理的“System.StackOverflowException”类型的异常
- 求一个生成16位随机数的C#方法
- C# 在类文件中调用ocx控件,引发System.Windows.Forms.AxHost异常,求助
// Fetch the raw bytes of the page.
byte[] pageBytes = wc.DownloadData("http://newspic.cn.yahoo.com/newspic/bbs/2260/");

// The Content-Type response header reports utf8 for this page.
string contentTypeHeader = wc.ResponseHeaders["content-type"];

// Decode and save with the same UTF-8 encoding.
Encoding utf8 = Encoding.UTF8;
string pageText = utf8.GetString(pageBytes);
System.IO.File.WriteAllText("C:\\html.txt", pageText, utf8);
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Text.RegularExpressions;
using System.Net;
using System.Net.Cache;
using System.Windows.Forms;namespace GetHtmlCode
{
/// <summary>
/// Form that downloads the page whose address is typed into <c>textBox2</c>
/// and shows its decoded source in <c>textBox1</c>.
/// </summary>
public partial class Form1 : Form
{
    // Matches "charset=..." in an HTTP Content-Type header value (quoted or not).
    private const string HeaderCharsetPattern =
        "charset\\s*=\\s*[\"']?\\s*(?<name>[A-Za-z0-9_.:\\-]+)";

    // Matches the charset inside a <meta> tag, both the legacy
    // http-equiv/content form and the HTML5 <meta charset="..."> form.
    private const string MetaCharsetPattern =
        "<meta\\b[^>]*?charset\\s*=\\s*[\"']?\\s*(?<name>[A-Za-z0-9_.:\\-]+)";

    public Form1()
    {
        InitializeComponent();
    }

    // Click handler: fetch the URL from textBox2 and display the page source.
    private void button1_Click(object sender, EventArgs e)
    {
        this.textBox1.Text = GetHtmlCode(this.textBox2.Text);
    }

    /// <summary>
    /// Downloads <paramref name="url"/> and decodes the response body to text.
    /// The encoding is chosen in this order: the charset of the HTTP
    /// Content-Type header, the charset declared in the page's &lt;meta&gt; tag,
    /// then the system default encoding.
    /// </summary>
    /// <param name="url">Address of the page to download.</param>
    /// <returns>The decoded page source.</returns>
    public static string GetHtmlCode(string url)
    {
        byte[] dataBuffer;
        string contentType;

        // WebClient is disposable; release it as soon as the bytes and the
        // response header have been read.
        using (WebClient wc = new WebClient())
        {
            wc.Credentials = CredentialCache.DefaultCredentials;
            wc.Headers.Add("Accept", "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*");
            wc.Headers.Add("Accept-Language", "zh-cn");
            wc.Headers.Add("UA-CPU", "x86");
            wc.Headers.Add("User-Agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)");
            dataBuffer = wc.DownloadData(url);
            contentType = wc.ResponseHeaders["content-type"];
        }

        // The header may be missing entirely; FindEncoding tolerates null.
        Encoding encoding = FindEncoding(contentType, HeaderCharsetPattern);
        if (encoding == null)
        {
            // No usable header charset: decode once with the default encoding
            // just to look for the <meta> declaration in the markup.
            string sniffed = Encoding.Default.GetString(dataBuffer);
            encoding = FindEncoding(sniffed, MetaCharsetPattern);
        }

        // Neither source named a supported charset: fall back to the default
        // encoding instead of calling Encoding.GetEncoding(""), which throws.
        return (encoding ?? Encoding.Default).GetString(dataBuffer);
    }

    // Returns the encoding named by the "name" group of pattern within text,
    // or null when there is no match or the name is not a supported encoding.
    private static Encoding FindEncoding(string text, string pattern)
    {
        if (string.IsNullOrEmpty(text))
        {
            return null;
        }

        Match m = Regex.Match(text, pattern, RegexOptions.IgnoreCase);
        if (!m.Success)
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(m.Groups["name"].Value);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name.
            return null;
        }
    }
}
}