先贴下代码
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Net;
using System.Data.Common;
using System.Data.SqlClient;
using System.Text.RegularExpressions;
using System.Collections;
using System.Configuration;
using System.Web;namespace yahooCache
{
/// <summary>
/// Main form of the sample. When the form loads it downloads one Yahoo! JAPAN
/// search-result page and saves it to a local HTML file.
/// </summary>
public partial class Form1 : Form
{
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Detects the page's charset, downloads the page with that charset and
    /// writes the result to <c>e:\yahoo.co.jp_14.html</c>.
    /// </summary>
    private void Form1_Load(object sender, EventArgs e)
    {
        string url = "http://search.yahoo.co.jp/search?p=%E3%83%AF%E3%83%BC%E3%82%AD%E3%83%B3%E3%82%B0%E3%83%9B%E3%83%AA%E3%83%87%E3%83%BC&search.x=1&fr=top_ga1_sa&tid=top_ga1_sa&ei=UTF-8&aq=&oq=";
        string encod = GetEncoding(url);
        string a = GetStringByUrl(url, encod);

        // FileMode.Create truncates an existing file. OpenOrCreate followed by a
        // seek to 0 would leave the tail of a previous, longer download behind
        // the new content.
        using (FileStream fs = new FileStream(@"e:\yahoo.co.jp_14.html", FileMode.Create, FileAccess.Write))
        // Writing with the page's own encoding keeps non-ASCII text readable;
        // otherwise the file shows up garbled when opened in Notepad.
        using (StreamWriter sw = new StreamWriter(fs, ResolveEncoding(encod)))
        {
            sw.WriteLine(a);
        }
    }

    /// <summary>
    /// Downloads the page at <paramref name="strUrl"/> and decodes the body.
    /// </summary>
    /// <param name="strUrl">Address of the page to fetch.</param>
    /// <param name="encod">
    /// Name of the encoding used to decode the response; an empty or
    /// unrecognised name falls back to UTF-8.
    /// </param>
    /// <returns>The complete response body as text.</returns>
    private string GetStringByUrl(string strUrl, string encod)
    {
        WebRequest wrt = WebRequest.Create(strUrl);

        // Dispose the response, its stream and the reader even when reading
        // throws, so the underlying connection is released.
        using (WebResponse wrse = wrt.GetResponse())
        using (Stream strM = wrse.GetResponseStream())
        using (StreamReader SR = new StreamReader(strM, ResolveEncoding(encod)))
        {
            return SR.ReadToEnd();
        }
    }

    /// <summary>
    /// Downloads the page and extracts the charset name declared in its
    /// <c>&lt;meta&gt;</c> tag.
    /// </summary>
    /// <param name="url">Address of the page to inspect.</param>
    /// <returns>
    /// The declared charset name without surrounding quotes, or an empty
    /// string when the page declares none.
    /// </returns>
    public string GetEncoding(string url)
    {
        byte[] myDataBuffer;
        using (WebClient myWebClient = new WebClient())
        {
            myWebClient.Credentials = CredentialCache.DefaultCredentials;
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // Only the charset name inside the markup is needed here, so decoding
        // with the system default encoding is sufficient for this lookup.
        string strWebData = Encoding.Default.GetString(myDataBuffer);

        // The match stays inside a single <meta ...> tag, skips an optional
        // opening quote and stops at the first quote, whitespace, ';', '/' or
        // '>'. This handles both
        //   <meta http-equiv="Content-Type" content="text/html; charset=EUC-JP">
        // and
        //   <meta charset="utf-8">
        Match charSetMatch = Regex.Match(
            strWebData,
            "<meta[^>]*?charset\\s*=\\s*[\"']?([^\"'\\s/>;]+)",
            RegexOptions.IgnoreCase);

        // Groups[1].Value is an empty string when nothing matched.
        return charSetMatch.Groups[1].Value;
    }

    /// <summary>
    /// Maps a charset name to an <see cref="Encoding"/>, using UTF-8 when the
    /// name is missing or not recognised by the platform.
    /// </summary>
    private static Encoding ResolveEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return Encoding.UTF8;
        }

        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name declared by the page.
            return Encoding.UTF8;
        }
    }
}
}这是我写的下载页面的程序,但是下载下来的页面却和从浏览器里直接搜索出来的结果不一样,有哪位高手可以帮我分析下,是我代码里缺少什么还是其他别的原因,旨在“程序下载的页面和浏览器里看到的一致”。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using System.Net;
using System.Data.Common;
using System.Data.SqlClient;
using System.Text.RegularExpressions;
using System.Collections;
using System.Configuration;
using System.Web;namespace yahooCache
{
/// <summary>
/// Main form of the sample. When the form loads it downloads one Yahoo! JAPAN
/// search-result page and saves it to a local HTML file.
/// </summary>
public partial class Form1 : Form
{
    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Detects the page's charset, downloads the page with that charset and
    /// writes the result to <c>e:\yahoo.co.jp_14.html</c>.
    /// </summary>
    private void Form1_Load(object sender, EventArgs e)
    {
        string url = "http://search.yahoo.co.jp/search?p=%E3%83%AF%E3%83%BC%E3%82%AD%E3%83%B3%E3%82%B0%E3%83%9B%E3%83%AA%E3%83%87%E3%83%BC&search.x=1&fr=top_ga1_sa&tid=top_ga1_sa&ei=UTF-8&aq=&oq=";
        string encod = GetEncoding(url);
        string a = GetStringByUrl(url, encod);

        // FileMode.Create truncates an existing file. OpenOrCreate followed by a
        // seek to 0 would leave the tail of a previous, longer download behind
        // the new content.
        using (FileStream fs = new FileStream(@"e:\yahoo.co.jp_14.html", FileMode.Create, FileAccess.Write))
        // Writing with the page's own encoding keeps non-ASCII text readable;
        // otherwise the file shows up garbled when opened in Notepad.
        using (StreamWriter sw = new StreamWriter(fs, ResolveEncoding(encod)))
        {
            sw.WriteLine(a);
        }
    }

    /// <summary>
    /// Downloads the page at <paramref name="strUrl"/> and decodes the body.
    /// </summary>
    /// <param name="strUrl">Address of the page to fetch.</param>
    /// <param name="encod">
    /// Name of the encoding used to decode the response; an empty or
    /// unrecognised name falls back to UTF-8.
    /// </param>
    /// <returns>The complete response body as text.</returns>
    private string GetStringByUrl(string strUrl, string encod)
    {
        WebRequest wrt = WebRequest.Create(strUrl);

        // Dispose the response, its stream and the reader even when reading
        // throws, so the underlying connection is released.
        using (WebResponse wrse = wrt.GetResponse())
        using (Stream strM = wrse.GetResponseStream())
        using (StreamReader SR = new StreamReader(strM, ResolveEncoding(encod)))
        {
            return SR.ReadToEnd();
        }
    }

    /// <summary>
    /// Downloads the page and extracts the charset name declared in its
    /// <c>&lt;meta&gt;</c> tag.
    /// </summary>
    /// <param name="url">Address of the page to inspect.</param>
    /// <returns>
    /// The declared charset name without surrounding quotes, or an empty
    /// string when the page declares none.
    /// </returns>
    public string GetEncoding(string url)
    {
        byte[] myDataBuffer;
        using (WebClient myWebClient = new WebClient())
        {
            myWebClient.Credentials = CredentialCache.DefaultCredentials;
            myDataBuffer = myWebClient.DownloadData(url);
        }

        // Only the charset name inside the markup is needed here, so decoding
        // with the system default encoding is sufficient for this lookup.
        string strWebData = Encoding.Default.GetString(myDataBuffer);

        // The match stays inside a single <meta ...> tag, skips an optional
        // opening quote and stops at the first quote, whitespace, ';', '/' or
        // '>'. This handles both
        //   <meta http-equiv="Content-Type" content="text/html; charset=EUC-JP">
        // and
        //   <meta charset="utf-8">
        Match charSetMatch = Regex.Match(
            strWebData,
            "<meta[^>]*?charset\\s*=\\s*[\"']?([^\"'\\s/>;]+)",
            RegexOptions.IgnoreCase);

        // Groups[1].Value is an empty string when nothing matched.
        return charSetMatch.Groups[1].Value;
    }

    /// <summary>
    /// Maps a charset name to an <see cref="Encoding"/>, using UTF-8 when the
    /// name is missing or not recognised by the platform.
    /// </summary>
    private static Encoding ResolveEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return Encoding.UTF8;
        }

        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            // Unknown or unsupported charset name declared by the page.
            return Encoding.UTF8;
        }
    }
}
}这是我写的下载页面的程序,但是下载下来的页面却和从浏览器里直接搜索出来的结果不一样,有哪位高手可以帮我分析下,是我代码里缺少什么还是其他别的原因,旨在“程序下载的页面和浏览器里看到的一致”。
如果在www.yahoo.co.jp里搜索相同的关键字得到的页面是:
两个页面的格式是完全不一样的
http://search.yahoo.co.jp/search?p=%E3%83%AF%E3%83%BC%E3%82%AD%E3%83%B3%E3%82%B0%E3%83%9B%E3%83%AA%E3%83%87%E3%83%BC&search.x=1&fr=top_ga1_sa&tid=top_ga1_sa&ei=UTF-8&aq=&oq=
两个页面的格式是完全不一样的
/// <summary>
/// Downloads the page at <paramref name="strUrl"/> and decodes the body with
/// the encoding named by <paramref name="encod"/>.
/// </summary>
/// <param name="strUrl">Address of the page to fetch.</param>
/// <param name="encod">Encoding name passed to <c>Encoding.GetEncoding</c>.</param>
/// <returns>The complete response body as text.</returns>
private string GetStringByUrl(string strUrl, string encod)
{
    WebRequest wrt = WebRequest.Create(strUrl);

    // Dispose the response, its stream and the reader even when reading
    // throws, so the underlying connection is released.
    using (WebResponse wrse = wrt.GetResponse())
    using (Stream strM = wrse.GetResponseStream())
    using (StreamReader SR = new StreamReader(strM, Encoding.GetEncoding(encod)))
    {
        return SR.ReadToEnd();
    }
} try //得到页面
/// <summary>
/// Downloads the page at <paramref name="strUrl"/> while sending a
/// browser-style User-Agent, and decodes the body with the encoding named by
/// <paramref name="encod"/>.
/// </summary>
/// <param name="strUrl">Address of the page to fetch.</param>
/// <param name="encod">Encoding name passed to <c>Encoding.GetEncoding</c>.</param>
/// <returns>The complete response body as text.</returns>
private string GetStringByUrl(string strUrl, string encod)
{
    WebRequest wrt = WebRequest.Create(strUrl);

    // The HTTP header is named "User-Agent". Headers.Add("UserAgent", ...)
    // sends a header literally called "UserAgent", and "User-Agent" is a
    // restricted header that cannot be set through Headers.Add, so it is
    // set through the HttpWebRequest.UserAgent property instead.
    HttpWebRequest httpRequest = wrt as HttpWebRequest;
    if (httpRequest != null)
    {
        httpRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)";
    }

    wrt.ContentType = "text/html; charset=utf-8"; // can be omitted if UTF-8 is not required
    wrt.Headers.Add("Accept-Charset", "UTF-8"); // can be omitted if UTF-8 is not required
    // When scraping Yahoo I usually add a proxy as well.

    // Dispose the response, its stream and the reader even when reading
    // throws, so the underlying connection is released.
    using (WebResponse wrse = wrt.GetResponse())
    using (Stream strM = wrse.GetResponseStream())
    using (StreamReader SR = new StreamReader(strM, Encoding.GetEncoding(encod)))
    {
        return SR.ReadToEnd();
    }
}
http://search.yahoo.co.jp/search?p=%E3%83%AF%E3%83%BC%E3%82%AD%E3%83%B3%E3%82%B0%E3%83%9B%E3%83%AA%E3%83%87%E3%83%BC&search.x=1&fr=top_ga1_sa&tid=top_ga1_sa&ei=UTF-8&aq=&oq=
是UTF-8
http://clockoo.com/photo/20090618/yahoo.co.jp_14.html
这个是EUC-JP
这个我也发现了,但是程序下载下来的确实是EUC-JP编码的页面,我想知道的就是为什么程序下载的页面会和浏览器搜索到的不一样。
request.Headers.Add ( "UserAgent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)" );3.Yahoo可能也会判断你的客户端Cookie支持及是否有特定Cookie等,所以还要加上一定的Cookie信息,具体可以用一些浏览器插件进行查看