/// <summary>
/// Downloads the complete content of a web page with an HTTP GET request.
/// </summary>
/// <param name="Url">Address of the page to fetch.</param>
/// <param name="CC">Cookie container attached to the request; may be null.</param>
/// <returns>
/// The response body decoded with <see cref="System.Text.Encoding.Default"/>,
/// or an empty string when <paramref name="Url"/> is null or empty.
/// </returns>
public static String Get(String Url, CookieContainer CC)
{
    // Nothing to fetch. IsNullOrEmpty also covers a null Url, which the
    // previous Url.Length check would have crashed on.
    if (String.IsNullOrEmpty(Url))
    {
        return "";
    }

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(Url);
    req.Method = "GET";
    req.ContentType = "application/x-www-form-urlencoded;charset=gb2312";
    req.Referer = Url;
    // Original note: give up when no data arrives within 75 seconds.
    // The explicit timeout is disabled, so the request default applies.
    //req.Timeout = 30000;
    req.CookieContainer = CC;

    // Receive the response. The response, its stream and the reader are all
    // wrapped in using statements so they are released even when reading
    // throws; an undisposed response keeps its connection occupied, which can
    // make later requests to the same host stall.
    using (HttpWebResponse wr = (HttpWebResponse)req.GetResponse())
    using (Stream R = wr.GetResponseStream())
    using (StreamReader SR = new StreamReader(R, System.Text.Encoding.Default))
    {
        return SR.ReadToEnd();
    }
}
以上是我的捕捉代码我出现的问题是在一个循环中执行了一次Get然后 判断是否有内容分页的时候
再次执行 Get似乎就不成功了
// Collection loop (fragment: the closing part of the for/try/if scopes is not shown).
// For every entry in ListUrl it downloads the detail page, extracts a title and a body
// with the regexes configured on Model and, when pagination regexes are configured,
// downloads each additional page and appends its extracted body to Detail.
for(DetailI=0;DetailI<ListUrl.Count;DetailI++)
{
try
{
#region 查看是否已经到了地址结尾
//lock (locker)
//{
// Only keep working while the Stop flag is not set.
if (!Stop)
{
String Html = "";
int TempI = 0;
Invoke (DWF , new object [] { "打开内容页.当前编号:" + DetailI.ToString() +" 总序列:" + ListUrl.Count.ToString() }); // Progress bar animation
//progressBar1.Minimum = DetailI;
ProgressBarMaximum PBM = new ProgressBarMaximum (FProgressBarMin);
Invoke (PBM , new object [] { DetailI });
PBM = null;
TempI = DetailI;
String Url = ListUrl [DetailI].ToString ();
String TempTempUrl = Url;
// Check whether the URL is already absolute; if not, resolve it against Model.Url.
// NOTE(review): Substring (0 , 7) throws when the URL is shorter than 7 characters,
// and the comparison only recognizes "http://" (not "https://") — confirm the inputs.
if (TempTempUrl!=""&&TempTempUrl!=null&&TempTempUrl.Substring (0 , 7).ToUpper () != "HTTP://")
{
if (Url.Substring (0 , 1) != "/")
{
// Relative path: prefix it with everything up to the last '/' of Model.Url.
Url = Model.Url.Substring (0 , Model.Url.LastIndexOf ('/') + 1) + Url;
}
else
{
// Root-relative path: skip the first 7 characters of Model.Url and keep the text
// up to the next '/' as host name (presumably Model.Url starts with "http://" — TODO confirm).
String HostName = Model.Url.Substring (7 , Model.Url.Length - 7);
HostName = HostName.Substring (0 , HostName.IndexOf ('/'));
Url = "HTTP://" + HostName + Url;
}
}
// Report the URL being collected, then pick the download routine from the IsUTF8 checkbox.
Invoke (DWF , new object [] { "正在采集:" + Url }); if (!IsUTF8.Checked)
{
Html = Panda.WebController.Get (Url , null);
}
else
{
Html = Panda.WebController.GetUTF8 (Url , null);
} //Html = Panda.WebController.Get (Url , null); Invoke (DWF , new object [] { "数据整理"});
#region 数据整理
// Extract the title with Model.RegexTitle and clean it up (line breaks, surrounding
// whitespace, spaces, then the DelHtml / DelScript helpers and a literal "<b>").
String Title = Panda.RegController.FirstRegMatch (Model.RegexTitle , Html);
Title = Title.Replace("\r\n", "");
Title = Title.Trim();
Title = Title.Replace(" ", "");
Title = Panda.RegController.DelHtml(Title);
Title = Panda.RegController.DelScript(Title);
Title = Title.Replace("<b>", "");
// Extract the body of the first page with Model.RegexDetail.
String Detail = Panda.RegController.FirstRegMatch (Model.RegexDetail , Html);
Detail = Panda.RegController.DelScript (Detail);
#region 判断是否有内容页分页正则.如果有就进行分页处理
// Pagination is handled only when both the page-link regex (Model.BodyPage) and the
// pagination-block regex (Model.RegexBlockPage) are configured.
if (Model.BodyPage != null && Model.BodyPage != ""&&Model.RegexBlockPage!=null&&Model.RegexBlockPage!="")
{
String TempPageHtml = "";
TempPageHtml = Panda.RegController.FirstRegMatch (Model.RegexBlockPage,Html);
MatchCollection MCPage = Panda.RegController.RegMatch (Model.BodyPage , TempPageHtml);
Invoke (DWF , new object [] { "包含分页:" + (MCPage.Count+1).ToString() });
if (MCPage.Count > 0)
{ String PageHtml = "";
for (int MCPageI = 0 ; MCPageI < MCPage.Count ;MCPageI++)
{
String PageUrl = MCPage [MCPageI].Value; // Check whether the page URL is absolute.
// NOTE(review): the whole regex match is used as the URL. Substring (0 , 7) throws when
// the match is shorter than 7 characters; such an exception would leave this loop body
// through the enclosing try (its catch is not visible here) — verify whether that is
// why the statements below appear not to run.
if (PageUrl.Substring (0 , 7).ToUpper () != "HTTP://")
{
if (PageUrl.Substring (0 , 1) != "/")
{
PageUrl = Model.Url.Substring (0 , Model.Url.LastIndexOf ('/') + 1) + PageUrl;
}
else
{
String HostName = Model.Url.Substring (7 , Model.Url.Length - 7);
HostName = HostName.Substring (0 , HostName.IndexOf ('/'));
PageUrl = "HTTP://" + HostName + PageUrl;
}
}
// Additional pages are always fetched with Get, regardless of IsUTF8; the
// GetUTF8 alternative below is commented out.
//if (!IsUTF8.Checked)
//{
PageHtml = Panda.WebController.Get (PageUrl , null);
// Debug output of the raw page before the body is extracted and appended.
MessageBox.Show (PageHtml);
PageHtml = Panda.RegController.FirstRegMatch (Model.RegexDetail , PageHtml);
Detail += PageHtml;
//}
//else
//{
// PageHtml = Panda.WebController.GetUTF8 (Url , null);
// PageHtml = Panda.RegController.FirstRegMatch (Model.RegexDetail , PageHtml);
// Detail += PageHtml;
//}
}
}
}
以上是循环部分的代码。
从 MessageBox.Show 到 //} 之间的代码并不会执行,
而且 PageHtml 并没有捕捉到任何内容。
/// <summary>
/// Downloads the complete content of a web page with an HTTP GET request.
/// </summary>
/// <param name="Url">Address of the page to fetch.</param>
/// <param name="CC">Cookie container attached to the request; may be null.</param>
/// <returns>
/// The response body decoded with <see cref="System.Text.Encoding.Default"/>,
/// or an empty string when <paramref name="Url"/> is null or empty.
/// </returns>
public static String Get(String Url, CookieContainer CC)
{
    // Nothing to fetch. IsNullOrEmpty also covers a null Url, which the
    // previous Url.Length check would have crashed on.
    if (String.IsNullOrEmpty(Url))
    {
        return "";
    }

    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(Url);
    req.Method = "GET";
    req.ContentType = "application/x-www-form-urlencoded;charset=gb2312";
    req.Referer = Url;
    // Original note: give up when no data arrives within 75 seconds.
    // The explicit timeout is disabled, so the request default applies.
    //req.Timeout = 30000;
    req.CookieContainer = CC;

    // Receive the response. The response, its stream and the reader are all
    // wrapped in using statements so they are released even when reading
    // throws; an undisposed response keeps its connection occupied, which can
    // make later requests to the same host stall.
    using (HttpWebResponse wr = (HttpWebResponse)req.GetResponse())
    using (Stream R = wr.GetResponseStream())
    using (StreamReader SR = new StreamReader(R, System.Text.Encoding.Default))
    {
        return SR.ReadToEnd();
    }
}
以上是我的捕捉代码我出现的问题是在一个循环中执行了一次Get然后 判断是否有内容分页的时候
再次执行 Get似乎就不成功了
// Collection loop (fragment: the closing part of the for/try/if scopes is not shown).
// For every entry in ListUrl it downloads the detail page, extracts a title and a body
// with the regexes configured on Model and, when pagination regexes are configured,
// downloads each additional page and appends its extracted body to Detail.
for(DetailI=0;DetailI<ListUrl.Count;DetailI++)
{
try
{
#region 查看是否已经到了地址结尾
//lock (locker)
//{
// Only keep working while the Stop flag is not set.
if (!Stop)
{
String Html = "";
int TempI = 0;
Invoke (DWF , new object [] { "打开内容页.当前编号:" + DetailI.ToString() +" 总序列:" + ListUrl.Count.ToString() }); // Progress bar animation
//progressBar1.Minimum = DetailI;
ProgressBarMaximum PBM = new ProgressBarMaximum (FProgressBarMin);
Invoke (PBM , new object [] { DetailI });
PBM = null;
TempI = DetailI;
String Url = ListUrl [DetailI].ToString ();
String TempTempUrl = Url;
// Check whether the URL is already absolute; if not, resolve it against Model.Url.
// NOTE(review): Substring (0 , 7) throws when the URL is shorter than 7 characters,
// and the comparison only recognizes "http://" (not "https://") — confirm the inputs.
if (TempTempUrl!=""&&TempTempUrl!=null&&TempTempUrl.Substring (0 , 7).ToUpper () != "HTTP://")
{
if (Url.Substring (0 , 1) != "/")
{
// Relative path: prefix it with everything up to the last '/' of Model.Url.
Url = Model.Url.Substring (0 , Model.Url.LastIndexOf ('/') + 1) + Url;
}
else
{
// Root-relative path: skip the first 7 characters of Model.Url and keep the text
// up to the next '/' as host name (presumably Model.Url starts with "http://" — TODO confirm).
String HostName = Model.Url.Substring (7 , Model.Url.Length - 7);
HostName = HostName.Substring (0 , HostName.IndexOf ('/'));
Url = "HTTP://" + HostName + Url;
}
}
// Report the URL being collected, then pick the download routine from the IsUTF8 checkbox.
Invoke (DWF , new object [] { "正在采集:" + Url }); if (!IsUTF8.Checked)
{
Html = Panda.WebController.Get (Url , null);
}
else
{
Html = Panda.WebController.GetUTF8 (Url , null);
} //Html = Panda.WebController.Get (Url , null); Invoke (DWF , new object [] { "数据整理"});
#region 数据整理
// Extract the title with Model.RegexTitle and clean it up (line breaks, surrounding
// whitespace, spaces, then the DelHtml / DelScript helpers and a literal "<b>").
String Title = Panda.RegController.FirstRegMatch (Model.RegexTitle , Html);
Title = Title.Replace("\r\n", "");
Title = Title.Trim();
Title = Title.Replace(" ", "");
Title = Panda.RegController.DelHtml(Title);
Title = Panda.RegController.DelScript(Title);
Title = Title.Replace("<b>", "");
// Extract the body of the first page with Model.RegexDetail.
String Detail = Panda.RegController.FirstRegMatch (Model.RegexDetail , Html);
Detail = Panda.RegController.DelScript (Detail);
#region 判断是否有内容页分页正则.如果有就进行分页处理
// Pagination is handled only when both the page-link regex (Model.BodyPage) and the
// pagination-block regex (Model.RegexBlockPage) are configured.
if (Model.BodyPage != null && Model.BodyPage != ""&&Model.RegexBlockPage!=null&&Model.RegexBlockPage!="")
{
String TempPageHtml = "";
TempPageHtml = Panda.RegController.FirstRegMatch (Model.RegexBlockPage,Html);
MatchCollection MCPage = Panda.RegController.RegMatch (Model.BodyPage , TempPageHtml);
Invoke (DWF , new object [] { "包含分页:" + (MCPage.Count+1).ToString() });
if (MCPage.Count > 0)
{ String PageHtml = "";
for (int MCPageI = 0 ; MCPageI < MCPage.Count ;MCPageI++)
{
String PageUrl = MCPage [MCPageI].Value; // Check whether the page URL is absolute.
// NOTE(review): the whole regex match is used as the URL. Substring (0 , 7) throws when
// the match is shorter than 7 characters; such an exception would leave this loop body
// through the enclosing try (its catch is not visible here) — verify whether that is
// why the statements below appear not to run.
if (PageUrl.Substring (0 , 7).ToUpper () != "HTTP://")
{
if (PageUrl.Substring (0 , 1) != "/")
{
PageUrl = Model.Url.Substring (0 , Model.Url.LastIndexOf ('/') + 1) + PageUrl;
}
else
{
String HostName = Model.Url.Substring (7 , Model.Url.Length - 7);
HostName = HostName.Substring (0 , HostName.IndexOf ('/'));
PageUrl = "HTTP://" + HostName + PageUrl;
}
}
// Additional pages are always fetched with Get, regardless of IsUTF8; the
// GetUTF8 alternative below is commented out.
//if (!IsUTF8.Checked)
//{
PageHtml = Panda.WebController.Get (PageUrl , null);
// Debug output of the raw page before the body is extracted and appended.
MessageBox.Show (PageHtml);
PageHtml = Panda.RegController.FirstRegMatch (Model.RegexDetail , PageHtml);
Detail += PageHtml;
//}
//else
//{
// PageHtml = Panda.WebController.GetUTF8 (Url , null);
// PageHtml = Panda.RegController.FirstRegMatch (Model.RegexDetail , PageHtml);
// Detail += PageHtml;
//}
}
}
}
以上是循环部分的代码。
从 MessageBox.Show 到 //} 之间的代码并不会执行,
而且 PageHtml 并没有捕捉到任何内容。
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货