关于爬虫 这是我爬取goole时骂我发现goole不让爬,请问应该怎么办啊? 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 应该可以吧 你可以通过url拼接得到关键字的结果 然后通过node来读取啊 这是我爬取goole时骂我发现goole不让爬 没看懂 是不是有什么东西没放上来 不好意思,刚有些忙,打错字了,就是goole不让我的爬虫爬取,不知道为什么? 不让爬是因为你缺少GOOGLE中提交的其他辅助性参数,用HTTPLOOK对GOOGLE的网页请求进行分析,把缺省的参数都提交过去,单单调用URL是绝对不行的 private string GetPage(string url) { try { string html = string.Empty; string encoding = string.Empty; //声明一个HttpWebRequest请求 HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url); webRequest.Method = "GET"; webRequest.UserAgent = "Opera/9.25 (Windows NT 6.0; U; en)"; webRequest.ContentType = "text/html"; byte[] buffer = new byte[1024]; HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse(); string encodingTypt = webResponse.CharacterSet; using (Stream stream = webResponse.GetResponseStream()) { using (Stream reader = webResponse.GetResponseStream()) { using (MemoryStream memory = new MemoryStream()) { int index = 1; int sum = 0; while (index > 0 && sum < 100 * 1024) { index = reader.Read(buffer, 0, 1024); if (index > 0) { memory.Write(buffer, 0, index); sum += index; } } html = Encoding.GetEncoding("gb2312").GetString(memory.ToArray()); if (string.IsNullOrEmpty(html)) { return html; } else { string reg_charset = "(<meta[^>]*charset=(?<charset>[^>'\"]*)[\\s\\S]*?>)|(xml[^>]+encoding=(\"|')*(?<charset>[^>'\"]*)[\\s\\S]*?>)"; Regex r = new Regex(reg_charset, RegexOptions.IgnoreCase); Match m = r.Match(html); encoding = (m.Captures.Count != 0) ? 
m.Result("${charset}") : ""; // Regex re = new Regex(@"charset=(?<charset>[\s\S]*?)[""|'|>]"); //Match m = re.Match(html.ToLower()); //encoding = m.Groups["charset"].ToString(); Console.WriteLine("22222222222=" + encoding); } if (string.Equals(encoding.ToLower(), "gb2312")) { return html; } else if (string.Equals(encoding.ToLower(), "gbk") || string.Equals(encoding.ToLower(), "utf-8")) { return Encoding.GetEncoding(encoding).GetString(memory.ToArray()); } else if (string.IsNullOrEmpty(encoding)) { Console.WriteLine("@@@@@@@@@@@@@@@=" + encodingTypt.ToLower()); if (string.Equals(encodingTypt.ToLower(), "iso-8859-1")) { return Encoding.GetEncoding("utf-8").GetString(memory.ToArray()); } else { return Encoding.GetEncoding(encodingTypt).GetString(memory.ToArray()); } } else { return Encoding.GetEncoding("utf-8").GetString(memory.ToArray()); } } } } } catch (WebException e) { System.Console.WriteLine("下载失败,错误:" + e); return null; } catch (IOException e) { System.Console.WriteLine("下载失败,错误:" + e); return null; } catch (Exception e) { System.Console.WriteLine("下载失败,错误:" + e); return null; } }这是我的取得网页源码的程序,这样不对吗?好像是在一段时间里发送请求次数太多了,就出现上图的情况。不知道改怎么解决? winform中登录窗口怎么把值传给新窗口,并关闭登录窗口 在线等,C#字符串的排序打印问题 C#中DES加密在PHP中解密 vs2008安装部署的时候,怎么把.net框架一起打包到安装文件内? .net 关闭窗体 Winfrom与Excel查询问题? 关于位操作的问题! 关于treeview的问题(webForm)? 线程中打开一个form的问题 为什么用DataAdapter的Update方法更新数据库总是不行? 数据库插入insert高级问题--高手进来 取左边的字符串
/// <summary>
/// Looks for a character set declared inside the document itself, either in an HTML
/// <c>&lt;meta&gt;</c> tag or in an XML declaration.
/// </summary>
/// <param name="markup">The document text, decoded well enough that ASCII markup is readable.</param>
/// <returns>The declared charset name, or an empty string when none is found.</returns>
private static string FindDeclaredCharset(string markup)
{
    // The optional quote after '=' lets both charset=utf-8 and charset="utf-8" match;
    // the value stops at whitespace, quotes, ';', '/', '?' or '>' so that
    // "<meta charset=utf-8 />" yields "utf-8" rather than "utf-8 /".
    const string pattern =
        "<meta[^>]*charset\\s*=\\s*[\"']?(?<charset>[^>'\"\\s;/]+)" +
        "|xml[^>]+encoding\\s*=\\s*[\"']?(?<charset>[^>'\"\\s?]+)";

    Match match = Regex.Match(markup, pattern, RegexOptions.IgnoreCase);
    return match.Success ? match.Groups["charset"].Value : string.Empty;
}

/// <summary>
/// Resolves a charset name to an <see cref="Encoding"/> without throwing.
/// </summary>
/// <param name="name">A charset name such as "gbk" or "utf-8"; may be null or empty.</param>
/// <returns>The matching encoding, or null when the name is missing or not supported.</returns>
private static Encoding TryGetEncoding(string name)
{
    if (string.IsNullOrEmpty(name))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(name.Trim());
    }
    catch (ArgumentException)
    {
        // Unknown or malformed charset name.
        return null;
    }
    catch (NotSupportedException)
    {
        // Known name, but the code page is not available on this platform.
        return null;
    }
}

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its text, decoded with the
/// character set declared by the document or, failing that, by the HTTP response.
/// </summary>
/// <remarks>
/// Reading stops once roughly 100 KB of the body has been received. The charset is taken,
/// in order of preference, from the document's own meta tag / XML declaration, then from
/// the response's character set, and finally defaults to UTF-8. A response charset of
/// "ISO-8859-1" is treated as "not specified", because that is what
/// <see cref="HttpWebResponse.CharacterSet"/> commonly reports when the server sent none.
/// This method does not throttle or retry; a caller that issues many requests in a short
/// time is responsible for pacing them.
/// </remarks>
/// <param name="url">Absolute URL of the page to fetch.</param>
/// <returns>
/// The decoded page text; an empty string when the body is empty; null when the download
/// fails for any reason (the error is written to the console).
/// </returns>
private string GetPage(string url)
{
    const int MaxBytes = 100 * 1024;

    try
    {
        // Build the HTTP GET request.
        HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url);
        webRequest.Method = "GET";
        webRequest.UserAgent = "Opera/9.25 (Windows NT 6.0; U; en)";
        // A GET has no body, so the media type belongs in Accept, not Content-Type.
        webRequest.Accept = "text/html";

        byte[] body;
        string headerCharset;

        // Dispose the response as well as the stream so the connection is always released.
        using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
        using (Stream reader = webResponse.GetResponseStream())
        using (MemoryStream memory = new MemoryStream())
        {
            headerCharset = webResponse.CharacterSet;

            byte[] buffer = new byte[1024];
            int read;
            while (memory.Length < MaxBytes && (read = reader.Read(buffer, 0, buffer.Length)) > 0)
            {
                memory.Write(buffer, 0, read);
            }

            body = memory.ToArray();
        }

        if (body.Length == 0)
        {
            return string.Empty;
        }

        // The charset declaration is plain ASCII markup, so an ASCII decode is enough to find it
        // and does not depend on any optional code page being installed.
        string declaredCharset = FindDeclaredCharset(Encoding.ASCII.GetString(body));

        Encoding encoding = TryGetEncoding(declaredCharset);
        if (encoding == null
            && !string.Equals(headerCharset, "iso-8859-1", StringComparison.OrdinalIgnoreCase))
        {
            encoding = TryGetEncoding(headerCharset);
        }

        if (encoding == null)
        {
            encoding = Encoding.UTF8;
        }

        return encoding.GetString(body);
    }
    catch (Exception e)
    {
        // Covers WebException and IOException as well; every failure is reported the same way.
        System.Console.WriteLine("下载失败,错误:" + e);
        return null;
    }
}这是我的取得网页源码的程序,这样不对吗?好像是在一段时间里发送请求次数太多了,就出现上图的情况。不知道改怎么解决?