日,刚才打算把获取的数据贴上来,结果把浏览器给搞崩溃了!!!
晕死!
using System;
using System.Web;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using Boolue.Real;namespace GoogleRefer
{
public class Google
{
static string URLFormat = "http://news.google.cn/news?tab=vn&hl=zh-CN&ned=cn&scoring=n&q=|&ie=UTF-8&sa=N&start=0";
static string StrImage;//图片
static string StrURL;//网址
static string StrCaption;//标题
static string StrFrom;//新闻源头
static string StrMark;//简介
static string StrTime;
public Google(string StrKey)
{
StrImage = @"src=(.*?)\s";
StrURL = @"href=""(.*?)""";
StrCaption = @"target=_blank>(.*?)</a>";
StrFrom = @"size=-1.*?>(.*?)-";
StrMark = @"size=-1.*?size=-1>(.*?)</div>";
StrTime = @"<nobr>(.*?)</nobr>";
GetKey(StrKey);
}
//得到关键字,然后获取到源代码以及对HTML做一些相关的判断
private void GetKey(string GetKey)
{ URLFormat= URLFormat.Replace("|", HttpUtility.UrlEncode(GetKey, Encoding.GetEncoding("UTF-8")));
string Html = RealPage.Get(URLFormat);
if (Html.Length == 0)//URL有错误
{
GetError();
}
if (Html.IndexOf("找不到和您的查询") != -1)//没有搜索到数据
{
GetError();
}
GetPageCount(Html);
} private void GetPageCount(string HTML)
{ GetHtml(HTML, 0);//第一页
Regex RePage = new Regex(@".*width=16.*?>(.*?)</a>");//贪婪匹配
Match MPage = RePage.Match(HTML);
int PageCount = int.Parse(Regex.Replace(MPage.Groups[1].Value, "<.*?>", ""));
if (PageCount > 9)
{
string URL = URLFormat.Replace("start=0", "start=90");
HTML = RealPage.Get(URL);
PageCount=JudgePageCount(HTML,10);
if (PageCount > 18)
{
URL = URL.Replace("start=90", "start=180");
HTML = RealPage.Get(URL);
PageCount = JudgePageCount(HTML,19);
if (PageCount > 27)
{
URL = URL.Replace("start=180", "start=270");
HTML = RealPage.Get(URL);
PageCount = JudgePageCount(HTML,28);
GetHtml("", PageCount);
}
else
{
GetHtml("", PageCount);
}
}
else
{
GetHtml("", PageCount);
}
}
else
{
GetHtml("", PageCount);
}
}
private int JudgePageCount(string HTML, int CurrentPageCount)
{
int Page = 0;
if (HTML.IndexOf("<b>下一页</b>") != -1)
{
Regex RePage = new Regex(@".*width=16.*?>(.*?)</a>");//贪婪匹配
Match MPage = RePage.Match(HTML);
int PageCount = int.Parse(Regex.Replace(MPage.Groups[1].Value, "<.*?>", ""));
return PageCount;
}
else
{
Page = CurrentPageCount;
}
return Page;
} private void GetHtml(string HTML, int PageCount)
{
if (PageCount == 0)
{
GetList(HTML);
}
else
{
for (int i = 1; i <= PageCount; i++)
{
HTML = RealPage.Get(URLFormat.Replace("start=0", "start=" + (i * 10)));
GetList(HTML);
}
}
}
private void GetList(string HTML)
{
Regex ReList = new Regex(@"<table\s{1,10}border=0\s{1,10}valign=top.*?>(.*?)</table>");
MatchCollection McList = ReList.Matches(HTML);
if (McList.Count == 0)
{
GetError();//没有数据
}
foreach (Match MList in McList)
{
GetItem(MList.Groups[1].Value);
}
}
private void GetItem(string Html)
{
Regex ReImage = new Regex(StrImage);
Match MImage = ReImage.Match(Html);
string Str_Image = MImage.Groups[1].Value; Regex ReURL = new Regex(StrURL);
Match MURL = ReURL.Match(Html);
string Str_URL = MURL.Groups[1].Value; Regex ReCaption = new Regex(StrCaption);
Match MCaption = ReCaption.Match(Html);
string Str_Caption = Regex.Replace(MCaption.Groups[1].Value,"<.*?>",""); Regex ReFrom = new Regex(StrFrom);
Match MFrom = ReFrom.Match(Html);
string Str_From = Regex.Replace(MFrom.Groups[1].Value.Replace(" ", ""),"<.*?>",""); Regex ReMark = new Regex(StrMark);
Match MMark = ReMark.Match(Html);
string Str_Mark = Regex.Replace(MMark.Groups[1].Value,"<.*?>","").Replace(" ",""); Regex ReTime = new Regex(StrTime);
Match Mtime = ReTime.Match(Html);
string Str_Time = Regex.Replace(Mtime.Groups[1].Value,"<.*?>","");
if (Str_Time.IndexOf("分钟前") != -1)
{
Str_Time = "-" + Regex.Replace(Str_Time, @"[^\d]", "");
Str_Time = DateTime.Now.AddMinutes(double.Parse(Str_Time)).ToString();
} else if (Str_Time.IndexOf("小时前") != -1)
{
Str_Time = "-" + Regex.Replace(Str_Time, @"[^\d]", "");
Str_Time = DateTime.Now.AddHours(double.Parse(Str_Time)).ToString();
}
System.IO.StreamWriter sw = new System.IO.StreamWriter(@"C:\aaa.txt",true,Encoding.Default);
sw.WriteLine("图片地址:"+Str_Image);
sw.WriteLine("新闻网址:"+Str_URL);
sw.WriteLine("新闻日期:"+Str_Time);
sw.WriteLine("新闻来源:"+Str_From);
sw.WriteLine("新闻标题:" + Str_Caption);
sw.WriteLine("新闻简介:"+Str_Mark);
sw.WriteLine("\n\n");
sw.Flush();
sw.Close();
}
private void GetError()
{
throw new Exception();
}
}

这个是原代码。Google 403 提示的截图:http://p.thec.cn/xingtianzhang/Google403.html 。我用常来网申请的号,苦于没有 RMB,所以没有 .NET Framework 以及 MSSQL 的管理操作,只能用 .html。这是记录在记事本里的数据:http://p.thec.cn/xingtianzhang/GetGoogleRefer.html 。现在我是零分了!只不过是从头再来,目前我的需求就是请大家帮忙看看我的抓取代码能不能改善一下?
晕死!
using System;
using System.Web;
using System.Collections.Generic;
using System.Text;
using System.Text.RegularExpressions;
using Boolue.Real;namespace GoogleRefer
{
public class Google
{
static string URLFormat = "http://news.google.cn/news?tab=vn&hl=zh-CN&ned=cn&scoring=n&q=|&ie=UTF-8&sa=N&start=0";
static string StrImage;//图片
static string StrURL;//网址
static string StrCaption;//标题
static string StrFrom;//新闻源头
static string StrMark;//简介
static string StrTime;
public Google(string StrKey)
{
StrImage = @"src=(.*?)\s";
StrURL = @"href=""(.*?)""";
StrCaption = @"target=_blank>(.*?)</a>";
StrFrom = @"size=-1.*?>(.*?)-";
StrMark = @"size=-1.*?size=-1>(.*?)</div>";
StrTime = @"<nobr>(.*?)</nobr>";
GetKey(StrKey);
}
//得到关键字,然后获取到源代码以及对HTML做一些相关的判断
private void GetKey(string GetKey)
{ URLFormat= URLFormat.Replace("|", HttpUtility.UrlEncode(GetKey, Encoding.GetEncoding("UTF-8")));
string Html = RealPage.Get(URLFormat);
if (Html.Length == 0)//URL有错误
{
GetError();
}
if (Html.IndexOf("找不到和您的查询") != -1)//没有搜索到数据
{
GetError();
}
GetPageCount(Html);
} private void GetPageCount(string HTML)
{ GetHtml(HTML, 0);//第一页
Regex RePage = new Regex(@".*width=16.*?>(.*?)</a>");//贪婪匹配
Match MPage = RePage.Match(HTML);
int PageCount = int.Parse(Regex.Replace(MPage.Groups[1].Value, "<.*?>", ""));
if (PageCount > 9)
{
string URL = URLFormat.Replace("start=0", "start=90");
HTML = RealPage.Get(URL);
PageCount=JudgePageCount(HTML,10);
if (PageCount > 18)
{
URL = URL.Replace("start=90", "start=180");
HTML = RealPage.Get(URL);
PageCount = JudgePageCount(HTML,19);
if (PageCount > 27)
{
URL = URL.Replace("start=180", "start=270");
HTML = RealPage.Get(URL);
PageCount = JudgePageCount(HTML,28);
GetHtml("", PageCount);
}
else
{
GetHtml("", PageCount);
}
}
else
{
GetHtml("", PageCount);
}
}
else
{
GetHtml("", PageCount);
}
}
private int JudgePageCount(string HTML, int CurrentPageCount)
{
int Page = 0;
if (HTML.IndexOf("<b>下一页</b>") != -1)
{
Regex RePage = new Regex(@".*width=16.*?>(.*?)</a>");//贪婪匹配
Match MPage = RePage.Match(HTML);
int PageCount = int.Parse(Regex.Replace(MPage.Groups[1].Value, "<.*?>", ""));
return PageCount;
}
else
{
Page = CurrentPageCount;
}
return Page;
} private void GetHtml(string HTML, int PageCount)
{
if (PageCount == 0)
{
GetList(HTML);
}
else
{
for (int i = 1; i <= PageCount; i++)
{
HTML = RealPage.Get(URLFormat.Replace("start=0", "start=" + (i * 10)));
GetList(HTML);
}
}
}
private void GetList(string HTML)
{
Regex ReList = new Regex(@"<table\s{1,10}border=0\s{1,10}valign=top.*?>(.*?)</table>");
MatchCollection McList = ReList.Matches(HTML);
if (McList.Count == 0)
{
GetError();//没有数据
}
foreach (Match MList in McList)
{
GetItem(MList.Groups[1].Value);
}
}
private void GetItem(string Html)
{
Regex ReImage = new Regex(StrImage);
Match MImage = ReImage.Match(Html);
string Str_Image = MImage.Groups[1].Value; Regex ReURL = new Regex(StrURL);
Match MURL = ReURL.Match(Html);
string Str_URL = MURL.Groups[1].Value; Regex ReCaption = new Regex(StrCaption);
Match MCaption = ReCaption.Match(Html);
string Str_Caption = Regex.Replace(MCaption.Groups[1].Value,"<.*?>",""); Regex ReFrom = new Regex(StrFrom);
Match MFrom = ReFrom.Match(Html);
string Str_From = Regex.Replace(MFrom.Groups[1].Value.Replace(" ", ""),"<.*?>",""); Regex ReMark = new Regex(StrMark);
Match MMark = ReMark.Match(Html);
string Str_Mark = Regex.Replace(MMark.Groups[1].Value,"<.*?>","").Replace(" ",""); Regex ReTime = new Regex(StrTime);
Match Mtime = ReTime.Match(Html);
string Str_Time = Regex.Replace(Mtime.Groups[1].Value,"<.*?>","");
if (Str_Time.IndexOf("分钟前") != -1)
{
Str_Time = "-" + Regex.Replace(Str_Time, @"[^\d]", "");
Str_Time = DateTime.Now.AddMinutes(double.Parse(Str_Time)).ToString();
} else if (Str_Time.IndexOf("小时前") != -1)
{
Str_Time = "-" + Regex.Replace(Str_Time, @"[^\d]", "");
Str_Time = DateTime.Now.AddHours(double.Parse(Str_Time)).ToString();
}
System.IO.StreamWriter sw = new System.IO.StreamWriter(@"C:\aaa.txt",true,Encoding.Default);
sw.WriteLine("图片地址:"+Str_Image);
sw.WriteLine("新闻网址:"+Str_URL);
sw.WriteLine("新闻日期:"+Str_Time);
sw.WriteLine("新闻来源:"+Str_From);
sw.WriteLine("新闻标题:" + Str_Caption);
sw.WriteLine("新闻简介:"+Str_Mark);
sw.WriteLine("\n\n");
sw.Flush();
sw.Close();
}
private void GetError()
{
throw new Exception();
}
}

这个是原代码。Google 403 提示的截图:http://p.thec.cn/xingtianzhang/Google403.html 。我用常来网申请的号,苦于没有 RMB,所以没有 .NET Framework 以及 MSSQL 的管理操作,只能用 .html。这是记录在记事本里的数据:http://p.thec.cn/xingtianzhang/GetGoogleRefer.html 。现在我是零分了!只不过是从头再来,目前我的需求就是请大家帮忙看看我的抓取代码能不能改善一下?
郁闷ing!
没办法!只能故意降低程序速度!跑三步休息下,抓取几条数据休息2秒!