Damn, I just tried to paste the captured data in here and it crashed my browser!!!
Ugh!
using System;
using System.Web;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using Boolue.Real;

namespace GoogleRefer
{
    public class Google
    {
        static string URLFormat = "http://news.google.cn/news?tab=vn&hl=zh-CN&ned=cn&scoring=n&q=|&ie=UTF-8&sa=N&start=0";
        static string StrImage;   // image regex
        static string StrURL;     // article URL regex
        static string StrCaption; // headline regex
        static string StrFrom;    // news source regex
        static string StrMark;    // summary regex
        static string StrTime;    // timestamp regex
        public Google(string StrKey)
        {
            StrImage = @"src=(.*?)\s";
            StrURL = @"href=""(.*?)""";
            StrCaption = @"target=_blank>(.*?)</a>";
            StrFrom = @"size=-1.*?>(.*?)-";
            StrMark = @"size=-1.*?size=-1>(.*?)</div>";
            StrTime = @"<nobr>(.*?)</nobr>";
            GetKey(StrKey);
        }
        // Take the keyword, build the query URL, fetch the page source, and run basic checks on the HTML.
        private void GetKey(string GetKey)
        {
            URLFormat = URLFormat.Replace("|", HttpUtility.UrlEncode(GetKey, Encoding.GetEncoding("UTF-8")));
            string Html = RealPage.Get(URLFormat);
            if (Html.Length == 0) // empty response: the URL is bad
            {
                GetError();
            }
            if (Html.IndexOf("找不到和您的查询") != -1) // Google's "nothing matched your query" message
            {
                GetError();
            }
            GetPageCount(Html);
        }

        private void GetPageCount(string HTML)
        {
            GetHtml(HTML, 0); // first page is already in hand
            Regex RePage = new Regex(@".*width=16.*?>(.*?)</a>"); // greedy prefix so the match lands on the last pager link
            Match MPage = RePage.Match(HTML);
            int PageCount = int.Parse(Regex.Replace(MPage.Groups[1].Value, "<.*?>", ""));
            if (PageCount > 9) // the pager shows at most ten links, so probe the deeper result blocks
            {
                string URL = URLFormat.Replace("start=0", "start=90");
                HTML = RealPage.Get(URL);
                PageCount = JudgePageCount(HTML, 10);
                if (PageCount > 18)
                {
                    URL = URL.Replace("start=90", "start=180");
                    HTML = RealPage.Get(URL);
                    PageCount = JudgePageCount(HTML, 19);
                    if (PageCount > 27)
                    {
                        URL = URL.Replace("start=180", "start=270");
                        HTML = RealPage.Get(URL);
                        PageCount = JudgePageCount(HTML, 28);
                        GetHtml("", PageCount);
                    }
                    else
                    {
                        GetHtml("", PageCount);
                    }
                }
                else
                {
                    GetHtml("", PageCount);
                }
            }
            else
            {
                GetHtml("", PageCount);
            }
        }
        private int JudgePageCount(string HTML, int CurrentPageCount)
        {
            int Page = 0;
            if (HTML.IndexOf("<b>下一页</b>") != -1)
            {
                Regex RePage = new Regex(@".*width=16.*?>(.*?)</a>");//贪婪匹配
                Match MPage = RePage.Match(HTML);
                int PageCount = int.Parse(Regex.Replace(MPage.Groups[1].Value, "<.*?>", ""));
                return PageCount;
            }
            else
            {
                Page = CurrentPageCount;
            }
            return Page;
        }

        private void GetHtml(string HTML, int PageCount)
        {
            if (PageCount == 0) // the first page was already fetched; just parse it
            {
                GetList(HTML);
            }
            else
            {
                for (int i = 1; i < PageCount; i++) // start=0 was page one; fetch pages 2..PageCount (i <= PageCount would request one page past the end)
                {
                    HTML = RealPage.Get(URLFormat.Replace("start=0", "start=" + (i * 10)));
                    GetList(HTML);
                }
            }
        }
        private void GetList(string HTML)
        {
            Regex ReList = new Regex(@"<table\s{1,10}border=0\s{1,10}valign=top.*?>(.*?)</table>");
            MatchCollection McList = ReList.Matches(HTML);
            if (McList.Count == 0)
            {
                GetError(); // no items parsed from this page
            }
            foreach (Match MList in McList)
            {
                GetItem(MList.Groups[1].Value);
            }
        }
        private void GetItem(string Html)
        {
            Regex ReImage = new Regex(StrImage);
            Match MImage = ReImage.Match(Html);
            string Str_Image = MImage.Groups[1].Value;

            Regex ReURL = new Regex(StrURL);
            Match MURL = ReURL.Match(Html);
            string Str_URL = MURL.Groups[1].Value;

            Regex ReCaption = new Regex(StrCaption);
            Match MCaption = ReCaption.Match(Html);
            string Str_Caption = Regex.Replace(MCaption.Groups[1].Value, "<.*?>", "");

            Regex ReFrom = new Regex(StrFrom);
            Match MFrom = ReFrom.Match(Html);
            string Str_From = Regex.Replace(MFrom.Groups[1].Value.Replace("&nbsp;", ""), "<.*?>", "");

            Regex ReMark = new Regex(StrMark);
            Match MMark = ReMark.Match(Html);
            string Str_Mark = Regex.Replace(MMark.Groups[1].Value, "<.*?>", "").Replace("&nbsp;", "");

            Regex ReTime = new Regex(StrTime);
            Match MTime = ReTime.Match(Html);
            string Str_Time = Regex.Replace(MTime.Groups[1].Value, "<.*?>", "");

            // Turn relative timestamps ("N分钟前" = N minutes ago, "N小时前" = N hours ago) into absolute times.
            if (Str_Time.IndexOf("分钟前") != -1)
            {
                Str_Time = "-" + Regex.Replace(Str_Time, @"[^\d]", "");
                Str_Time = DateTime.Now.AddMinutes(double.Parse(Str_Time)).ToString();
            }
            else if (Str_Time.IndexOf("小时前") != -1)
            {
                Str_Time = "-" + Regex.Replace(Str_Time, @"[^\d]", "");
                Str_Time = DateTime.Now.AddHours(double.Parse(Str_Time)).ToString();
            }

            // Append the parsed item to a local text file; the using block flushes and closes even if a write fails.
            using (System.IO.StreamWriter sw = new System.IO.StreamWriter(@"C:\aaa.txt", true, Encoding.Default))
            {
                sw.WriteLine("图片地址:" + Str_Image);   // image URL
                sw.WriteLine("新闻网址:" + Str_URL);     // article URL
                sw.WriteLine("新闻日期:" + Str_Time);    // date
                sw.WriteLine("新闻来源:" + Str_From);    // source
                sw.WriteLine("新闻标题:" + Str_Caption); // headline
                sw.WriteLine("新闻简介:" + Str_Mark);    // summary
                sw.WriteLine("\n\n");
            }
        }
        private void GetError()
        {
            throw new Exception("GoogleRefer: the request failed or returned no results.");
        }
    }
}

That's the original code. Here's an image of the block notice: http://p.thec.cn/xingtianzhang/Google403.html (Google's warning page). The account I applied for at 常来网 is a free one, and with no RMB to spend I can't manage .NET Framework or MSSQL on it, only static .html. Here's the data as it was logged to Notepad: http://p.thec.cn/xingtianzhang/GetGoogleRefer.html. My points are down to zero now!!!! Fine, starting over. Right now all I'm asking is: can anyone take a look and tell me whether this scraping code can be improved???
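
One concrete improvement, offered as a sketch rather than a drop-in replacement: the three nested if/else levels in GetPageCount repeat the same fetch-then-recount step for start=90, 180, and 270, so a single loop over those offsets does the same work with less code. RealPage.Get and JudgePageCount are the helpers from the code above and are assumed to behave exactly as posted.

private void GetPageCount(string HTML)
{
    GetHtml(HTML, 0); // first page is already in hand
    Regex RePage = new Regex(@".*width=16.*?>(.*?)</a>"); // greedy prefix: last pager link
    Match MPage = RePage.Match(HTML);
    int PageCount = int.Parse(Regex.Replace(MPage.Groups[1].Value, "<.*?>", ""));

    // Probe the deeper pager blocks only while the count says more pages exist.
    int[] starts = { 90, 180, 270 };
    int[] thresholds = { 9, 18, 27 };
    for (int i = 0; i < starts.Length && PageCount > thresholds[i]; i++)
    {
        string URL = URLFormat.Replace("start=0", "start=" + starts[i]);
        PageCount = JudgePageCount(RealPage.Get(URL), thresholds[i] + 1);
    }
    GetHtml("", PageCount);
}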

Solutions »

  1.   

    Looks like it rate-limits traffic per IP!! All I wanted was to grab a little data!!!!
    So frustrating!
      

  2.   

    The other day I wrote a scraper for Baidu Zhidao, and it rate-limits by IP too!!
    Nothing for it! You just have to slow the program down on purpose: run three steps, take a break; grab a few items, sleep 2 seconds!
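
    A minimal sketch of that kind of throttling, in the same C# setting as the code above; the batch size of 3 and the 2-second pause mirror this reply, and the fetch delegate is a stand-in for RealPage.Get or whatever downloader is in use:

    using System;
    using System.Collections.Generic;
    using System.Threading;

    class Throttled
    {
        // Fetch each URL, pausing after every few requests so a single IP
        // does not trip the site's rate limit.
        public static void FetchAll(IEnumerable<string> urls, Func<string, string> fetch)
        {
            int n = 0;
            foreach (string url in urls)
            {
                string html = fetch(url);   // e.g. RealPage.Get
                // ... parse html here ...
                if (++n % 3 == 0)           // "run three steps, take a break"
                    Thread.Sleep(2000);     // rest 2 seconds
            }
        }
    }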
      

  3.   

    Grab a few items, then pause. Better yet, get an ADSL line and redial once an hour to pick up a new IP.
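
    For the redial idea: Windows ships a rasdial command-line tool that can drop and re-establish a dial-up/ADSL connection, which on most ISPs yields a fresh IP. A rough sketch; the connection name "MyADSL" and the credentials are placeholders, not anything from this thread:

    using System.Diagnostics;
    using System.Threading;

    class Redial
    {
        // Disconnect and redial the ADSL entry via rasdial to get a new IP.
        public static void Reconnect()
        {
            Process.Start("rasdial", "MyADSL /disconnect").WaitForExit();
            Thread.Sleep(5000); // give the line a moment before redialing
            Process.Start("rasdial", "MyADSL user password").WaitForExit();
        }
    }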