关于使用Google搜索采集数据

/// <summary>
/// Downloads the search-result page at <paramref name="strUrl"/>, extracts the
/// link from every <c>&lt;h3 class="r"&gt;</c> heading inside the first
/// <c>&lt;ol&gt;</c> element, and hands each link to <c>datas.LinkIn</c>
/// (which, per the original comment, writes it to the database).
/// </summary>
/// <param name="strUrl">Absolute URL of the result page to fetch.</param>
/// <returns>
/// The number of links passed to <c>LinkIn</c>, whether or not the insert
/// succeeded. If any step fails, the error is traced and the count reached so
/// far is returned (0 when the download itself failed, e.g. on HTTP 503).
/// </returns>
private int CatchContent(string strUrl)
        {
            int count = 0;
            try
            {
                datas d = new datas();
                string group = GroupList.SelectedValue.ToString();
                string type = LinkList.SelectedValue.ToString();
                Regex r = new Regex("<ol>[\\s\\S]*?</ol>", RegexOptions.IgnoreCase);
                Regex r0 = new Regex("<h3 class=\"r\">[\\s\\S]*?</h3>", RegexOptions.IgnoreCase);
                Regex r1 = new Regex("http[\\s\\S]*?target", RegexOptions.IgnoreCase);

                string html;
                // 'using' guarantees the client is released even when the download throws.
                using (WebClient wc = new WebClient())
                {
                    // WebClient sends no User-Agent by default, which makes the request
                    // trivially identifiable as automated. A browser-like value lowers the
                    // chance of a 503; it does not guarantee the server will accept us.
                    wc.Headers[HttpRequestHeader.UserAgent] =
                        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36";
                    html = Encoding.UTF8.GetString(wc.DownloadData(strUrl));
                }

                // Restrict the search to the first <ol>...</ol> element.
                html = r.Match(html).Value;
                for (Match m = r0.Match(html); m.Success; m = m.NextMatch())
                {
                    Match link = r1.Match(m.Value);
                    if (!link.Success)
                    {
                        // Heading without an http link: nothing worth storing.
                        continue;
                    }

                    // The lazy match always ends with the attribute name "target" (in
                    // whatever letter case the page used), so cut it by length instead
                    // of a case-sensitive Replace, then drop quotes and the space that
                    // separated the href from the attribute.
                    string raw = link.Value;
                    string url = raw.Substring(0, raw.Length - "target".Length)
                                    .Replace("\"", "")
                                    .Trim();

                    // Write to the database.
                    d.LinkIn(url, url, "0", this.UserId, group, type);
                    count++;
                }
            }
            catch (System.Exception ex)
            {
                // Stay best-effort for callers (they still just get a count), but make
                // the failure visible instead of silently discarding it.
                System.Diagnostics.Trace.TraceError("CatchContent failed for {0}: {1}", strUrl, ex);
            }
            return count;
        }
这是源代码
string html = Encoding.UTF8.GetString(wc.DownloadData(strUrl));在这里报 503 服务器不可用错误
在网上看了很多资料，最终大致确定 Google 可能有防抓取设置。
求高手解决
不想用WebRequest 因为需改动的代码较多

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

据我了解好像Google有机制判断是否是程序访问
当判断是程序访问时会返回如上错误
会有这问题的
以前我的做法是：出现这个问题后就换代理，用代理去访问就行；但是代理也用不了太久，次数一多 Google 又给断了，只能再换另一个代理并清 cookie。
貌似这样很麻烦
看了一篇文章介绍说，以前有个 API 专门供程序员研究访问，后来被封了。
很是麻烦啊
求高手高见在线等
代理是一个方法，另一个是同一个 IP 的采集速度不要过快。
同时 HTTP 头部信息（Header）不要使用默认的，要使用多个不同的头信息来采集，否则很容易被识别为机器抓取。