目标地址 http://zhaoqing.baixing.com/ershoumotuoche/
采集过程中,会出现两种版本的列表源码:一份是TABLE排版的,一份是LI排版的。
问题:浏览器多次按F5刷新,源文件代码都是一样的,为什么程序采集时却会偶发性地出现TABLE排版?
希望高手解答。
/// <summary>
/// Downloads the page at <paramref name="Url"/> with an HTTP GET and returns its
/// text decoded with <paramref name="EnCodeType"/>.
/// </summary>
/// <param name="Url">Absolute address of the page to fetch.</param>
/// <param name="EnCodeType">Encoding used to decode the response body.</param>
/// <returns>
/// The response text with all line breaks removed (lines are concatenated as they
/// are read), or the sentinel <c>"$UrlIsFalse"</c> when <paramref name="Url"/> is
/// null or shorter than 10 characters.
/// </returns>
/// <remarks>
/// Exceptions raised while creating the request or reading the response are not
/// handled here and propagate to the caller.
/// </remarks>
public string GetHttpPageCode(string Url, Encoding EnCodeType)
{
    // Reject obviously invalid addresses up front; null is treated like a too-short URL.
    if (Url == null || Url.Length < 10)
    {
        return "$UrlIsFalse";
    }

    HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(Url);

    // UserAgent takes the header VALUE only. Including the "User-Agent: " prefix
    // sends "User-Agent: User-Agent: Mozilla/4.0 ...", which a server may not
    // recognise as a browser and may answer with a different page layout.
    myReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";
    myReq.Accept = "*/*";
    myReq.KeepAlive = true;
    myReq.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

    // using blocks release the connection, stream and reader even when reading throws.
    using (WebResponse myRes = myReq.GetResponse())
    using (Stream resStream = myRes.GetResponseStream())
    using (StreamReader sr = new StreamReader(resStream, EnCodeType))
    {
        StringBuilder sb = new StringBuilder();
        string rl;

        // Lines are appended without their terminators, so the result is a single
        // line of markup; kept as-is because callers may rely on that shape.
        while ((rl = sr.ReadLine()) != null)
        {
            sb.Append(rl);
        }

        return sb.ToString();
    }
}
采集过程中,会出现两种版本的列表源码:一份是TABLE排版的,一份是LI排版的。
问题:浏览器多次按F5刷新,源文件代码都是一样的,为什么程序采集时却会偶发性地出现TABLE排版?
希望高手解答。
/// <summary>
/// Downloads the page at <paramref name="Url"/> with an HTTP GET and returns its
/// text decoded with <paramref name="EnCodeType"/>.
/// </summary>
/// <param name="Url">Absolute address of the page to fetch.</param>
/// <param name="EnCodeType">Encoding used to decode the response body.</param>
/// <returns>
/// The response text with all line breaks removed (lines are concatenated as they
/// are read), or the sentinel <c>"$UrlIsFalse"</c> when <paramref name="Url"/> is
/// null or shorter than 10 characters.
/// </returns>
/// <remarks>
/// Exceptions raised while creating the request or reading the response are not
/// handled here and propagate to the caller.
/// </remarks>
public string GetHttpPageCode(string Url, Encoding EnCodeType)
{
    // Reject obviously invalid addresses up front; null is treated like a too-short URL.
    if (Url == null || Url.Length < 10)
    {
        return "$UrlIsFalse";
    }

    HttpWebRequest myReq = (HttpWebRequest)WebRequest.Create(Url);

    // UserAgent takes the header VALUE only. Including the "User-Agent: " prefix
    // sends "User-Agent: User-Agent: Mozilla/4.0 ...", which a server may not
    // recognise as a browser and may answer with a different page layout.
    myReq.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705;)";
    myReq.Accept = "*/*";
    myReq.KeepAlive = true;
    myReq.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");

    // using blocks release the connection, stream and reader even when reading throws.
    using (WebResponse myRes = myReq.GetResponse())
    using (Stream resStream = myRes.GetResponseStream())
    using (StreamReader sr = new StreamReader(resStream, EnCodeType))
    {
        StringBuilder sb = new StringBuilder();
        string rl;

        // Lines are appended without their terminators, so the result is a single
        // line of markup; kept as-is because callers may rely on that shape.
        while ((rl = sr.ReadLine()) != null)
        {
            sb.Append(rl);
        }

        return sb.ToString();
    }
}
// Reply from the thread: "It is just a simple fetch."

/// <summary>
/// Downloads <paramref name="url"/> and decodes the response body to a string.
/// </summary>
/// <param name="url">Address of the resource to download.</param>
/// <param name="encoding">
/// Encoding used to decode the body. When null, the body is decoded as UTF-8,
/// searched for a <c>charset=</c> declaration, and re-decoded with the declared
/// encoding if one is found, is supported, and is not UTF-8.
/// </param>
/// <returns>The decoded text of the response body.</returns>
static string GetHtml(string url, Encoding encoding)
{
    byte[] buf;

    // WebClient is disposable; release it as soon as the bytes are in memory.
    using (WebClient client = new WebClient())
    {
        buf = client.DownloadData(url);
    }

    if (encoding != null)
    {
        return encoding.GetString(buf);
    }

    // No encoding supplied: decode as UTF-8 first so the markup can be searched
    // for the charset the page declares for itself.
    string html = Encoding.UTF8.GetString(buf);
    Encoding declared = GetEncoding(html);

    // Compare code pages rather than references so any UTF-8 instance skips the
    // redundant second decode.
    if (declared == null || declared.CodePage == Encoding.UTF8.CodePage)
    {
        return html;
    }

    return declared.GetString(buf);
}

/// <summary>
/// Finds the first <c>charset=NAME</c> declaration in <paramref name="html"/>
/// (case-insensitive, value optionally preceded by a quote) and resolves it.
/// </summary>
/// <param name="html">Markup to search.</param>
/// <returns>
/// The matching <see cref="Encoding"/>, or null when no declaration is found or
/// the declared name is not a supported encoding.
/// </returns>
static Encoding GetEncoding(string html)
{
    // The optional quote lets the HTML5 form <meta charset="gbk"> match as well
    // as the legacy content="text/html; charset=gbk" form.
    string pattern = @"(?i)\bcharset=[""']?(?<charset>[-a-zA-Z_0-9]+)";
    Match match = Regex.Match(html, pattern);
    if (!match.Success)
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(match.Groups["charset"].Value);
    }
    catch (ArgumentException)
    {
        // Unknown or unsupported charset name: let the caller fall back.
        return null;
    }
}