大家好,我做一个winform程序抓取网页代码,使用了如下函数:
/// <summary>
/// Downloads the HTML source of the page at the given URL.
/// </summary>
/// <param name="Url">Target URL.</param>
/// <returns>
/// The response body decoded as UTF-8, or an empty string when the request fails
/// (in which case an error dialog is shown).
/// </returns>
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        // Give up if no response arrives within 30 seconds (value is in milliseconds).
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, stream and reader even when reading throws;
        // an unclosed response keeps its connection checked out of the pool.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.UTF8))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch (System.Exception ex)
    {
        // Keep the original "show a dialog and return empty" contract,
        // but surface the cause instead of hiding it.
        MessageBox.Show("出错:" + ex.Message);
    }
    return strResult;
}
// URL of the page to fetch
string Url = "http://d1.7l.mop.com/statistics";
// Get the HTML source of that URL; the helper defined above is named GetWebContent,
// the original call used a non-existent GetPageContent.
string strWebContent = this.GetWebContent(Url);
结果这个网站需要登陆,于是便只能抓到登陆页面,非常郁闷请大家帮我想想办法,100分相赠!谢谢啦!
/// <summary>
/// Downloads the HTML source of the page at the given URL.
/// </summary>
/// <param name="Url">Target URL.</param>
/// <returns>
/// The response body decoded as UTF-8, or an empty string when the request fails
/// (in which case an error dialog is shown).
/// </returns>
private string GetWebContent(string Url)
{
    string strResult = "";
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        // Give up if no response arrives within 30 seconds (value is in milliseconds).
        request.Timeout = 30000;
        request.Headers.Set("Pragma", "no-cache");

        // Dispose the response, stream and reader even when reading throws;
        // an unclosed response keeps its connection checked out of the pool.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream streamReceive = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.UTF8))
        {
            strResult = streamReader.ReadToEnd();
        }
    }
    catch (System.Exception ex)
    {
        // Keep the original "show a dialog and return empty" contract,
        // but surface the cause instead of hiding it.
        MessageBox.Show("出错:" + ex.Message);
    }
    return strResult;
}
// URL of the page to fetch
string Url = "http://d1.7l.mop.com/statistics";
// Get the HTML source of that URL; the helper defined above is named GetWebContent,
// the original call used a non-existent GetPageContent.
string strWebContent = this.GetWebContent(Url);
结果这个网站需要登陆,于是便只能抓到登陆页面,非常郁闷请大家帮我想想办法,100分相赠!谢谢啦!
解决方案 »
- DataTable为何只能增加一行
- 火狐不能用JavaScript显示表格
- 引用Dll出现"不能是外部了,无法声明主体"错误
- 关于打印模板加载不上,急,在线等!
- 问个小白问题
- 求一javascript函数,实现鼠标放到文字xx上时显示view1,鼠标放到文字xxx时显示view2.
- 关于WEB程序部署的一个迷茫问题
- 使用PageMethods提示对象不支持此属性或方法
- odp.net访问Oracle11g提示“找不到请求的 .Net Framework Data Provider”
- 如何总计DataSet某列中的值?
- asp分类查询无效,请大家帮看下哪错了
- .net,javascript常用的开发框架有那些
CookieContainer cc = new CookieContainer();
myReq.CookieContainer = cc;
先把用户名和密码 POST 过去,登录成功之后就可以用了(后续请求继续使用同一个 CookieContainer)。我没有具体做过这个,不知道是否正确,思路应该没问题吧。
http://blog.breakn.net/article.asp?id=286
http://www.zhangyongjun.com/blog/article.asp?id=154
代码要点:
1、通过附加一个 CookieContainer 到 HttpWebRequest 对象中,可以得到登录后返回的代表 Session ID 的 Cookie。
2、将此 Cookie 包含在一个 CookieContainer 中并附加到另一个 HttpWebRequest 请求中,则可以实现 Session 的还原。
部分主要代码:
CookieContainer cookieContainer = new CookieContainer();
///////////////////////////////////////////////////
// 1. Open Login.aspx and read its __VIEWSTATE and __EVENTVALIDATION values.
//    When the login page is an ASP.NET page, both hidden fields have to be
//    sent back with the login POST.
///////////////////////////////////////////////////
// Parameters for opening the page
string URI = "http://localhost/Test/Login.aspx";
HttpWebRequest request = WebRequest.Create(URI) as HttpWebRequest;
request.Method = "GET";
request.KeepAlive = false;

// Receive the returned page. Each response is disposed as soon as it has been
// read: unclosed responses keep their connection checked out, and later
// requests to the same host can then block until they time out.
string srcString;
using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream(), Encoding.UTF8))
{
    srcString = reader.ReadToEnd();
}

// Extract the page's ViewState
string viewStateFlag = "id=\"__VIEWSTATE\" value=\"";
int i = srcString.IndexOf(viewStateFlag);
if (i < 0)
{
    // Without this check a missing field silently yields an unrelated slice of the page.
    throw new System.InvalidOperationException("__VIEWSTATE was not found in the login page.");
}
i += viewStateFlag.Length;
int j = srcString.IndexOf("\"", i);
if (j < 0)
{
    throw new System.InvalidOperationException("__VIEWSTATE value is not terminated by a quote.");
}
string viewState = srcString.Substring(i, j - i);

// Extract the page's EventValidation
string eventValidationFlag = "id=\"__EVENTVALIDATION\" value=\"";
i = srcString.IndexOf(eventValidationFlag);
if (i < 0)
{
    throw new System.InvalidOperationException("__EVENTVALIDATION was not found in the login page.");
}
i += eventValidationFlag.Length;
j = srcString.IndexOf("\"", i);
if (j < 0)
{
    throw new System.InvalidOperationException("__EVENTVALIDATION value is not terminated by a quote.");
}
string eventValidation = srcString.Substring(i, j - i);

///////////////////////////////////////////////////
// 2. Fill in and submit the Login.aspx form
///////////////////////////////////////////////////
// Text of the submit button
string submitButton = "登录";

// User name and password
string userName = "1";
string password = "1";

// URL-encode every value that goes into the form body (credentials included,
// so characters such as '&' or '=' cannot corrupt the form).
viewState = System.Web.HttpUtility.UrlEncode(viewState);
eventValidation = System.Web.HttpUtility.UrlEncode(eventValidation);
submitButton = System.Web.HttpUtility.UrlEncode(submitButton);

// The form body, in the shape user=user1&password=123
string formatString =
    "userName={0}&password={1}&loginButton={2}&__VIEWSTATE={3}&__EVENTVALIDATION={4}";
string postString =
    string.Format(
        formatString,
        System.Web.HttpUtility.UrlEncode(userName),
        System.Web.HttpUtility.UrlEncode(password),
        submitButton,
        viewState,
        eventValidation);

// The body is fully URL-encoded at this point, so ASCII is lossless.
byte[] postData = Encoding.ASCII.GetBytes(postString);

// Configure the POST
request = WebRequest.Create(URI) as HttpWebRequest;
request.Method = "POST";
request.KeepAlive = false;
request.ContentType = "application/x-www-form-urlencoded";
// Important: the container receives the cookies issued by a successful login.
request.CookieContainer = cookieContainer;
request.ContentLength = postData.Length;

// Send the request body
using (System.IO.Stream outputStream = request.GetRequestStream())
{
    outputStream.Write(postData, 0, postData.Length);
}

// Receive the returned page
using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream(), Encoding.GetEncoding("GB2312")))
{
    srcString = reader.ReadToEnd();
}

///////////////////////////////////////////////////
// 3. Open Default.aspx
///////////////////////////////////////////////////
// Parameters for opening the page
// NOTE(review): this host/port/path differs from the login URI above
// (localhost/Test vs localhost:1165/WebTest) - confirm both point at the same site.
URI = "http://localhost:1165/WebTest/Default.aspx";
request = WebRequest.Create(URI) as HttpWebRequest;
request.Method = "GET";
request.KeepAlive = false;
// Reuse the container so the login cookies are sent with this request.
request.CookieContainer = cookieContainer;

// Receive the returned page
using (HttpWebResponse response = request.GetResponse() as HttpWebResponse)
using (System.IO.StreamReader reader = new System.IO.StreamReader(response.GetResponseStream(), Encoding.UTF8))
{
    srcString = reader.ReadToEnd();
}

///////////////////////////////////////////////////
// 4. Analyze the returned page (srcString)
///////////////////////////////////////////////////
//
Cookies 可以通过先登录一次来获得。
#region Web request handling
/// <summary>
/// Sends a GET or POST request and collects the response body as text.
/// </summary>
/// <param name="url">Address to request.</param>
/// <param name="post">
/// Form body to send. When null or empty a GET is issued; otherwise the text is
/// encoded as ASCII and sent as an application/x-www-form-urlencoded POST.
/// </param>
/// <param name="cookies">Cookie container assigned to the request.</param>
/// <param name="content">Receives the response text when the call succeeds.</param>
/// <param name="message">Receives the exception message when the call fails.</param>
/// <returns>true when a response was read (even partially, if cancelled); false when an exception occurred.</returns>
private bool DoWebRequest(string url, string post, CookieContainer cookies, ref string content, ref string message)
{
    try
    {
        HttpWebRequest request = (HttpWebRequest)HttpWebRequest.Create(url);
        request.Accept = "*/*";
        request.Headers.Add("Accept-Language", "zh-cn");
        request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; .NET CLR 2.0.50727;";
        // Attach the caller's cookies
        request.CookieContainer = cookies;

        if (string.IsNullOrEmpty(post))
        {
            request.Method = "GET";
        }
        else
        {
            // Send the POST data
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            byte[] data = Encoding.ASCII.GetBytes(post);
            request.ContentLength = data.Length;
            using (Stream send = request.GetRequestStream())
            {
                send.Write(data, 0, data.Length);
            }
        }

        // Read the response line by line so a cancel request can stop the download early.
        StringBuilder sb = new StringBuilder();
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader recv = new StreamReader(response.GetResponseStream()))
        {
            string line;
            // m_cancel is declared elsewhere in the class; it is checked before every read.
            // Only real lines are appended, so the end-of-stream null no longer adds a blank line.
            while (!m_cancel && (line = recv.ReadLine()) != null)
            {
                sb.AppendLine(line);
            }
        }

        content = sb.ToString();
        return true;
    }
    catch (Exception e)
    {
        message = e.Message;
        return false;
    }
}
#endregion
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.IO;
namespace bot
{
    /// <summary>
    /// HTTP helpers that log in to a site, keep the cookies it returns, and
    /// fetch further pages with those cookies.
    /// </summary>
    public class Html
    {
        /// <summary>
        /// The HttpWebRequest header properties that callers configure per request.
        /// </summary>
        public struct RequestPPT
        {
            private string strAccept;
            /// <summary>
            /// Gets or sets the value copied to the request's Accept property
            /// (the media types the client accepts).
            /// </summary>
            public string Accept
            {
                get
                {
                    return strAccept;
                }
                set
                {
                    strAccept = value;
                }
            }

            private string strContentType;
            /// <summary>
            /// Gets or sets the value copied to the request's ContentType property
            /// (the media type of the request body).
            /// </summary>
            public string ContentType
            {
                get
                {
                    return strContentType;
                }
                set
                {
                    strContentType = value;
                }
            }

            private string strUserAgent;
            /// <summary>
            /// Gets or sets the value copied to the request's UserAgent property
            /// (the client identification string).
            /// </summary>
            public string UserAgent
            {
                get
                {
                    return strUserAgent;
                }
                set
                {
                    strUserAgent = value;
                }
            }

            private string strMethod;
            /// <summary>
            /// Gets or sets the value copied to the request's Method property.
            /// Any HTTP 1.1 verb may be used: GET, HEAD, POST, PUT, DELETE, TRACE or OPTIONS.
            /// If ContentLength is set to anything other than -1, Method must be a verb
            /// that uploads data.
            /// </summary>
            public string Method
            {
                get
                {
                    return strMethod;
                }
                set
                {
                    strMethod = value;
                }
            }
        }

        /// <summary>
        /// Posts the login data to the login address and returns the cookies of the response.
        /// </summary>
        /// <param name="url">Address of the login page.</param>
        /// <param name="post">Request body to send.</param>
        /// <param name="strHtml">Receives the response body, decoded with Encoding.Default.</param>
        /// <param name="rppt">Header properties to apply to the request.</param>
        /// <param name="server">Not read by this method; kept so existing callers compile.</param>
        /// <returns>The cookies carried by the response.</returns>
        public CookieCollection funGetCookie(string url, byte[] post, out string strHtml, RequestPPT rppt, string server)
        {
            HttpWebRequest hwRequest = (HttpWebRequest)HttpWebRequest.Create(new Uri(url));
            // An empty container must be attached, otherwise response.Cookies stays empty.
            funConfigureRequest(hwRequest, new CookieContainer(), rppt, post.Length);
            funSendBody(hwRequest, post);

            using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
            {
                strHtml = funReadBody(hwResponse);
                return hwResponse.Cookies;
            }
        }

        /// <summary>
        /// Fetches a page using previously obtained cookies, sending no request body.
        /// </summary>
        /// <param name="strUri">Address of the page.</param>
        /// <param name="ccl">Cookies obtained from an earlier login.</param>
        /// <param name="rppt">Header properties to apply to the request.</param>
        /// <returns>The response body, decoded with Encoding.Default.</returns>
        public string funGetHtmlByCookies(string strUri, CookieCollection ccl, RequestPPT rppt)
        {
            HttpWebRequest hwRequest = (HttpWebRequest)HttpWebRequest.Create(new Uri(strUri));
            CookieContainer cc = new CookieContainer();
            cc.Add(ccl);
            funConfigureRequest(hwRequest, cc, rppt, 0);

            using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
            {
                return funReadBody(hwResponse);
            }
        }

        /// <summary>
        /// Sends a request body to a page using previously obtained cookies.
        /// </summary>
        /// <param name="strUri">Address of the page.</param>
        /// <param name="post">Request body to send.</param>
        /// <param name="ccl">Cookies obtained from an earlier login.</param>
        /// <param name="rppt">Header properties to apply to the request.</param>
        /// <returns>The response body, decoded with Encoding.Default.</returns>
        public string funGetHtmlByCookies(string strUri, byte[] post, CookieCollection ccl, RequestPPT rppt)
        {
            HttpWebRequest hwRequest = (HttpWebRequest)HttpWebRequest.Create(new Uri(strUri));
            CookieContainer cc = new CookieContainer();
            cc.Add(ccl);
            funConfigureRequest(hwRequest, cc, rppt, post.Length);
            funSendBody(hwRequest, post);

            using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
            {
                return funReadBody(hwResponse);
            }
        }

        // Applies the cookie container, the caller's header settings and the body length to a request.
        private static void funConfigureRequest(HttpWebRequest hwRequest, CookieContainer cc, RequestPPT rppt, long contentLength)
        {
            hwRequest.CookieContainer = cc;
            hwRequest.Accept = rppt.Accept;
            hwRequest.ContentType = rppt.ContentType;
            hwRequest.UserAgent = rppt.UserAgent;
            hwRequest.Method = rppt.Method;
            hwRequest.ContentLength = contentLength;
        }

        // Writes the request body and closes the request stream even if the write fails.
        private static void funSendBody(HttpWebRequest hwRequest, byte[] post)
        {
            using (Stream stream = hwRequest.GetRequestStream())
            {
                stream.Write(post, 0, post.Length);
            }
        }

        // Reads the whole response body with Encoding.Default and releases the stream and reader.
        private static string funReadBody(HttpWebResponse hwResponse)
        {
            using (Stream stream = hwResponse.GetResponseStream())
            using (StreamReader sReader = new StreamReader(stream, Encoding.Default))
            {
                return sReader.ReadToEnd();
            }
        }
    }
}
/// <summary>
/// 构建一个httt请求以获取目标链接的cookies,需要传入目标的登录地址和相关的post信息,返回完成登录的cookies,以及返回的html内容
/// </summary>
/// <param name="url">登录页面的地址</param>
/// <param name="post">post信息</param>
/// <param name="strHtml">输出的html代码</param>
/// <param name="rppt">请求的标头所需要的相关属性设置</param>
/// <returns>请求完成后的cookies</returns>
public CookieCollection funGetCookie(string url, byte[] post, out string strHtml, RequestPPT rppt, string server)
using System.IO;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.Data;
using System.Xml;
using System.Text.RegularExpressions;
using System.Collections;
namespace bot
{
public class SisHtml :Html
{
/// <summary>
/// Creates an instance with no host configured.
/// </summary>
public SisHtml()
{
}

// Backing field for Host.
private string strHost;

/// <summary>
/// Gets or sets the host (IP address or name) placed after "http://" when this
/// class builds forum URLs.
/// </summary>
public string Host
{
    get
    {
        return strHost;
    }
    set
    {
        strHost = value;
    }
}

/// <summary>
/// Logs in at the given address and returns the cookies of the response.
/// </summary>
/// <param name="url">Login address.</param>
/// <param name="dir">Form fields (name/value pairs) to post.</param>
/// <param name="strHtml">Receives the page returned after the login request.</param>
/// <returns>The cookies returned by the login request.</returns>
public CookieCollection funGetCookie(string url, Dictionary<string, string> dir, out string strHtml)
{
    // Build the POST body; the text is converted to bytes with the platform default encoding.
    string strPost = funMakePost(dir);
    byte[] post = Encoding.Default.GetBytes(strPost);

    // Header settings for the request
    RequestPPT rppt = new RequestPPT();
    rppt.Accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
    rppt.ContentType = "application/x-www-form-urlencoded";
    // HTTP method names are case-sensitive; use the canonical upper-case verb.
    rppt.Method = "POST";
    rppt.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; InfoPath.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

    string server = "http://" + new Uri(url).Host;
    // The base call returns the collection directly; no placeholder collection is needed.
    return base.funGetCookie(url, post, out strHtml, rppt, server);
}

/// <summary>
/// Fetches a page using cookies obtained from an earlier login.
/// </summary>
/// <param name="strUri">Address of the page.</param>
/// <param name="ccl">Cookies obtained from an earlier login.</param>
/// <returns>The body of the response.</returns>
public string funGetHtmlByCookies(string strUri, CookieCollection ccl)
{
    RequestPPT rppt = new RequestPPT();
    // "txt/html" is not a registered media type; the HTML media type is "text/html".
    rppt.Accept = "text/html";
    rppt.ContentType = "application/x-www-form-urlencoded";
    // HTTP method names are case-sensitive; use the canonical upper-case verb.
    rppt.Method = "POST";
    rppt.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; InfoPath.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
    return base.funGetHtmlByCookies(strUri, ccl, rppt);
}
/// <summary>
/// Submits a vote for a poll thread.
/// </summary>
/// <param name="strHtml">HTML of the poll thread page.</param>
/// <param name="ccl">Valid login cookies.</param>
/// <returns>
/// The HTML returned by the vote request. When the page has no form the input is
/// returned unchanged; when the form has no selectable poll answer (or its answer
/// or action cannot be parsed) the form markup is returned and no request is sent.
/// </returns>
public string funVote(string strHtml, CookieCollection ccl)
{
    // Narrow the page to the span from the first "<form" to the last "</form>".
    int formStart = strHtml.IndexOf("<form");
    int formEnd = strHtml.LastIndexOf("</form>");
    if (formStart < 0 || formEnd < formStart)
    {
        // No form on the page: there is nothing to submit.
        return strHtml;
    }
    strHtml = strHtml.Substring(formStart, formEnd - formStart + 7);

    // The poll answer inputs are only present when the vote has not been cast yet.
    string strCheck = @"name=""pollanswers[]""";
    int checkPos = strHtml.IndexOf(strCheck);
    if (checkPos < 0)
    {
        return strHtml;
    }

    // The answer value is the first quoted text within 20 characters after the input name.
    string strPollanswers = funSecondQuotedPart(strHtml, checkPos + strCheck.Length, 20);

    // The form action is the first quoted text within 100 characters after method="post".
    string strActionUrl = @"method=""post""";
    int actionPos = strHtml.IndexOf(strActionUrl);
    string strAction = null;
    if (actionPos >= 0)
    {
        strAction = funSecondQuotedPart(strHtml, actionPos + strActionUrl.Length, 100);
    }

    if (strPollanswers == null || strAction == null)
    {
        // The markup does not have the expected shape; do not send a malformed vote.
        return strHtml;
    }

    // Fields of the vote form
    // NOTE(review): formhash is hard-coded; presumably it is a per-session token that
    // should be read from the page - confirm against the target forum.
    string strFormHash = "77b49df4";
    string strPollansubmit = "提交";
    Dictionary<string, string> dir = new Dictionary<string, string>();
    dir.Add("formhash", strFormHash);
    dir.Add("pollanswers[]", strPollanswers);
    dir.Add("pollsubmit", strPollansubmit);
    byte[] post = Encoding.Default.GetBytes(funMakePost(dir));

    // Target address; "amp;" is stripped so "&amp;" in the markup becomes "&".
    string strUrl = "http://" + Host + "/bbs/" + strAction.Replace("amp;", "");

    // Header settings
    RequestPPT rppt = new RequestPPT();
    // "txt/html" is not a registered media type; the HTML media type is "text/html".
    rppt.Accept = "text/html";
    rppt.ContentType = "application/x-www-form-urlencoded";
    // HTTP method names are case-sensitive; use the canonical upper-case verb.
    rppt.Method = "POST";
    rppt.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; InfoPath.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
    return base.funGetHtmlByCookies(strUrl, post, ccl, rppt);
}

// Returns the text between the first and second double quote found in at most
// maxLength characters of text starting at start, or null when there is none.
private static string funSecondQuotedPart(string text, int start, int maxLength)
{
    int length = System.Math.Min(maxLength, text.Length - start);
    if (length <= 0)
    {
        return null;
    }
    string[] parts = text.Substring(start, length).Split('"');
    return parts.Length > 1 ? parts[1] : null;
}

/// <summary>
/// Builds a "key=value&amp;key=value" string for posting from a dictionary.
/// </summary>
/// <param name="dir">Name/value pairs to join.</param>
/// <returns>
/// The joined string, or an empty string when the dictionary has no entries.
/// An empty-string value is written as two single quotes. Keys and values are
/// appended verbatim (no URL-encoding is applied here).
/// </returns>
private string funMakePost(Dictionary<string, string> dir)
{
    StringBuilder sb = new StringBuilder();
    foreach (KeyValuePair<string, string> kvp in dir)
    {
        // Separator goes between pairs, so an empty dictionary no longer
        // needs a trailing "&" trimmed (which used to throw).
        if (sb.Length > 0)
        {
            sb.Append('&');
        }
        sb.Append(kvp.Key).Append('=');
        sb.Append(kvp.Value == "" ? "''" : kvp.Value);
    }
    return sb.ToString();
}
没有完
/// <summary>
/// Gets the address of the next list page.
/// </summary>
/// <param name="strHtml">HTML of the current page.</param>
/// <returns>
/// The address of the next list page, or an empty string when the page contains
/// a form, has no class="next" marker, or no quoted link precedes the marker.
/// </returns>
public string funGetNextUrl(string strHtml)
{
    // A page that contains "<form" is not treated as a list page.
    if (strHtml.IndexOf("<form") != -1)
    {
        return "";
    }

    string strKey = @"class=""next""";
    int keyPos = strHtml.IndexOf(strKey);
    if (keyPos < 0)
    {
        // No "next" marker on the page.
        return "";
    }

    // Look at (up to) the 100 characters before the marker; clamp at the start of
    // the text so a marker near the beginning cannot produce a negative index.
    int windowStart = keyPos - 100;
    if (windowStart < 0)
    {
        windowStart = 0;
    }
    string[] parts = strHtml.Substring(windowStart, keyPos - windowStart).Split('"');
    if (parts.Length < 2)
    {
        return "";
    }

    // "amp;" is stripped so "&amp;" in the markup becomes "&".
    return "http://" + Host + "/bbs/" + parts[1].Replace("amp;", "");
}

/// <summary>
/// Collects the thread links found in a list page.
/// </summary>
/// <param name="strHtml">HTML of the list page.</param>
/// <returns>A table with a single "Url" column holding one row per matched link.</returns>
public DataTable funGetListTable(string strHtml)
{
    DataTable dt = new DataTable();
    dt.Columns.Add(new DataColumn("Url"));

    Regex rg = new Regex(@"viewthread.php(\S)+highlight=");
    foreach (Match ms in rg.Matches(strHtml))
    {
        DataRow dr = dt.NewRow();
        dr[0] = "http://" + Host + "/bbs/" + ms.ToString().Replace("amp;", "");
        dt.Rows.Add(dr);
    }
    return dt;
}
/// <summary>
/// Creates the empty table, named "page", that holds the data extracted from a page.
/// </summary>
/// <returns>A table with seven columns and no rows.</returns>
public DataTable funGetRePlayDatatable()
{
    // Column captions, in order: title, poster, post time, content, floor number, link, type.
    string[] columnNames = { "标题", "发表人", "发表时间", "发表内容", "楼数", "所属链接", "类型" };

    DataTable table = new DataTable("page");
    foreach (string columnName in columnNames)
    {
        table.Columns.Add(new DataColumn(columnName));
    }
    return table;
}
/// <summary>
/// Prepares the result table for a thread page. Only the poster list is read
/// so far and it is not yet written into the table, so the returned table has no rows.
/// </summary>
/// <param name="strHtml">HTML of the thread page.</param>
/// <returns>The empty table produced by funGetRePlayDatatable.</returns>
public DataTable funGetCheater(string strHtml)
{
    // Table with the expected column layout
    DataTable result = funGetRePlayDatatable();

    // Poster names parsed from the page (not consumed yet)
    ArrayList posters = funGetPoster(strHtml);

    // Still to do, per the original notes: content list, floor numbers, links.
    return result;
}
/// <summary>
/// Extracts the poster names from a thread page.
/// </summary>
/// <param name="strHtml">HTML of the thread page.</param>
/// <returns>The poster names, one string per matched user link.</returns>
public ArrayList funGetPoster(string strHtml)
{
    ArrayList al = new ArrayList();

    // Matches the user link: id="userinfo..." class="dropmenu" onmouseover="showMenu(this.id)">name</a>
    string strReg = @"id=""userinfo(\S)+""(\s)+class=""dropmenu""(\s)+onmouseover=""showMenu\(this.id\)"">(\S)+</a>";
    Regex rg = new Regex(strReg);

    // Picks ">name<" out of a matched link; built once instead of once per match.
    Regex regx = new Regex(@">(\S)+<");

    foreach (Match ma in rg.Matches(strHtml))
    {
        al.Add(regx.Match(ma.Value).Value.Replace("<", "").Replace(">", ""));
    }
    return al;
}

/// <summary>
/// Extracts the post message blocks from a thread page.
/// </summary>
/// <param name="strHtml">HTML of the thread page.</param>
/// <returns>The matched markup, one string per match.</returns>
public ArrayList funGetRepostContent(string strHtml)
{
    ArrayList al = new ArrayList();

    // NOTE(review): "(.)*" is greedy and does not cross newlines, so a match runs to
    // the last "</div>" on the same line - confirm this is the intended extent.
    string strReg = @"<div(\s)*id=(\s)*""postmessage_(.)*</div>";
    Regex reg = new Regex(strReg);

    foreach (Match ma in reg.Matches(strHtml))
    {
        al.Add(ma.Value);
    }
    return al;
}
}
}
我的隐私啊= =#这个色站真的不错嘿嘿