大家好,我做了一个 WinForm 程序来抓取网页代码,使用了如下函数:
/// <summary>
/// Downloads the HTML source of the page at the given URL.
/// </summary>
/// <param name="Url">Target URL.</param>
/// <returns>
/// The response body decoded as UTF-8, or an empty string when the request fails
/// (in which case a message box is shown instead of throwing).
/// </returns>
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        // Request timeout; the value is in milliseconds (30 seconds).
        request.Timeout = 30000;
        // Send a "Pragma: no-cache" request header.
        request.Headers.Set("Pragma", "no-cache");

        // The response, its stream and the reader are all disposed even when
        // reading throws, so the underlying connection is always released.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.UTF8))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch
    {
        // Deliberately best-effort: any failure is reported to the user and
        // the method falls through to return the empty string.
        MessageBox.Show("出错");
    }
    return strResult;
}
// URL of the page to fetch.
string Url = "http://d1.7l.mop.com/statistics";
// Download the HTML source of that URL. The method defined in this file is
// GetWebContent; the original call used the undefined name GetPageContent.
string strWebContent = this.GetWebContent(Url);
结果这个网站需要登录,于是只能抓到登录页面,非常郁闷。请大家帮我想想办法,100分相赠!谢谢啦!
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
namespace bot
{
/// <summary>
/// Helpers built on <see cref="HttpWebRequest"/> for logging in to a web site and
/// then fetching pages with the cookies returned by the login response.
/// </summary>
public class Html
{
    /// <summary>
    /// A set of HttpWebRequest header properties that are copied onto every
    /// request built by <see cref="Html"/>.
    /// </summary>
    public struct RequestPPT
    {
        private string strAccept;
        /// <summary>
        /// Gets or sets the value copied to the request's Accept property
        /// (the content types the client accepts).
        /// </summary>
        public string Accept
        {
            get { return strAccept; }
            set { strAccept = value; }
        }

        private string strContentType;
        /// <summary>
        /// Gets or sets the value copied to the request's ContentType property
        /// (the media type of the request).
        /// </summary>
        public string ContentType
        {
            get { return strContentType; }
            set { strContentType = value; }
        }

        private string strUserAgent;
        /// <summary>
        /// Gets or sets the value copied to the request's UserAgent property
        /// (the client identification string).
        /// </summary>
        public string UserAgent
        {
            get { return strUserAgent; }
            set { strUserAgent = value; }
        }

        private string strMethod;
        /// <summary>
        /// Gets or sets the value copied to the request's Method property.
        /// Any HTTP 1.1 verb may be used: GET, HEAD, POST, PUT, DELETE, TRACE or OPTIONS.
        /// When a request body is sent, the method must be one that uploads data.
        /// When left null or empty, the methods of <see cref="Html"/> fall back to
        /// GET for body-less requests and POST for requests that carry a body.
        /// </summary>
        public string Method
        {
            get { return strMethod; }
            set { strMethod = value; }
        }
    }

    /// <summary>
    /// Sends the login request to <paramref name="url"/> with the given body and
    /// returns the cookies exposed on the response together with the response HTML.
    /// </summary>
    /// <param name="url">Address of the login page.</param>
    /// <param name="post">Raw bytes of the request body (the login form data).</param>
    /// <param name="strHtml">Receives the response body, decoded with Encoding.Default.</param>
    /// <param name="rppt">Header properties to apply to the request.</param>
    /// <param name="server">Not used by this method; kept so existing callers still compile.</param>
    /// <returns>The cookie collection of the response.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="post"/> is null.</exception>
    public CookieCollection funGetCookie(string url, byte[] post, out string strHtml, RequestPPT rppt, string server)
    {
        // Fail before any network activity instead of with a NullReferenceException later.
        if (post == null)
        {
            throw new ArgumentNullException("post");
        }

        CookieContainer cc = new CookieContainer();
        HttpWebRequest hwRequest = CreateRequest(url, cc, rppt, "POST");
        WriteBody(hwRequest, post);

        using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
        {
            strHtml = ReadBody(hwResponse);
            // Read the cookies while the response is still open.
            // NOTE(review): this returns only the cookies exposed on this response object;
            // if the site sets its session cookie on an intermediate redirect, that cookie
            // may be present only in the request's CookieContainer - confirm against the target site.
            return hwResponse.Cookies;
        }
    }

    /// <summary>
    /// Fetches the content of <paramref name="strUri"/> using previously obtained cookies,
    /// without sending a request body.
    /// </summary>
    /// <param name="strUri">URL of the target page.</param>
    /// <param name="ccl">Valid cookies obtained earlier (for example from funGetCookie).</param>
    /// <param name="rppt">Header properties to apply to the request.</param>
    /// <returns>The response body, decoded with Encoding.Default.</returns>
    public string funGetHtmlByCookies(string strUri, CookieCollection ccl, RequestPPT rppt)
    {
        CookieContainer cc = new CookieContainer();
        cc.Add(ccl);
        HttpWebRequest hwRequest = CreateRequest(strUri, cc, rppt, "GET");

        // The HttpWebRequest.GetResponse documentation lists "Method is GET or HEAD and
        // ContentLength is greater or equal to zero" as a ProtocolViolationException
        // condition, so an explicit zero length is only declared for other verbs.
        if (!IsBodylessMethod(hwRequest.Method))
        {
            hwRequest.ContentLength = 0;
        }

        using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
        {
            return ReadBody(hwResponse);
        }
    }

    /// <summary>
    /// Fetches the content of <paramref name="strUri"/> using previously obtained cookies,
    /// sending <paramref name="post"/> as the request body.
    /// </summary>
    /// <param name="strUri">URL of the target page.</param>
    /// <param name="post">Raw bytes of the request body.</param>
    /// <param name="ccl">Valid cookies obtained earlier (for example from funGetCookie).</param>
    /// <param name="rppt">Header properties to apply to the request.</param>
    /// <returns>The response body, decoded with Encoding.Default.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="post"/> is null.</exception>
    public string funGetHtmlByCookies(string strUri, byte[] post, CookieCollection ccl, RequestPPT rppt)
    {
        // Fail before any network activity instead of with a NullReferenceException later.
        if (post == null)
        {
            throw new ArgumentNullException("post");
        }

        CookieContainer cc = new CookieContainer();
        cc.Add(ccl);
        HttpWebRequest hwRequest = CreateRequest(strUri, cc, rppt, "POST");
        WriteBody(hwRequest, post);

        using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
        {
            return ReadBody(hwResponse);
        }
    }

    /// <summary>
    /// Builds a request for <paramref name="uri"/> that uses the given cookie container
    /// and the header properties in <paramref name="rppt"/>.
    /// </summary>
    private static HttpWebRequest CreateRequest(string uri, CookieContainer cookies, RequestPPT rppt, string fallbackMethod)
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(new Uri(uri));
        request.CookieContainer = cookies;
        request.Accept = rppt.Accept;
        request.ContentType = rppt.ContentType;
        request.UserAgent = rppt.UserAgent;
        // A default-initialised RequestPPT has a null Method, which the Method setter
        // rejects; use the caller-supplied fallback verb in that case.
        request.Method = string.IsNullOrEmpty(rppt.Method) ? fallbackMethod : rppt.Method;
        return request;
    }

    /// <summary>Returns true when the verb is GET or HEAD (compared case-insensitively).</summary>
    private static bool IsBodylessMethod(string method)
    {
        return string.Equals(method, "GET", StringComparison.OrdinalIgnoreCase)
            || string.Equals(method, "HEAD", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>Declares the body length and writes <paramref name="post"/> to the request stream.</summary>
    private static void WriteBody(HttpWebRequest request, byte[] post)
    {
        request.ContentLength = post.Length;
        using (Stream stream = request.GetRequestStream())
        {
            stream.Write(post, 0, post.Length);
        }
    }

    /// <summary>Reads the whole response body as text using Encoding.Default.</summary>
    private static string ReadBody(HttpWebResponse response)
    {
        using (Stream stream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(stream, Encoding.Default))
        {
            return reader.ReadToEnd();
        }
    }
}
}
不过在此之前你需要一个抓包工具,把你登录时发出的数据包抓下来,然后按照抓到的内容模拟构建 POST 数据,再用 HttpWebRequest 去 GET 目标页面就可以了。
关于验证码,你可以在验证码那部分人工识别,然后获取它的 cookies,这样以后就都能自动登录,不用再去理会验证码了。你的2个帖子我都回了,另外一个帖子里有一些具体的调用方法,你看看,应该对你有帮助。下班了下班了,有问题明天继续哈,早日解决早日结贴啊,俺可是巴巴地望着你的分的,呵呵。