登录这个网页(http://59.50.113.196:9080/personal/interfaces/hainan/index.jsp)时,我用抓包工具抓包,
看到了以下内容:
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.1.GA (build: SVNTag=JBoss_4_2_1_GA date=200707131605)/Tomcat-5.5
Set-Cookie: JSESSIONID=FD9D50AA193FA6E04C6E363BC9740FA2; Path=/
Content-Type: text/html;charset=GB2312
Transfer-Encoding: chunked
Date: Fri, 25 Jul 2008 15:38:23 GMT

736
这个数(736)是干什么的,怎么获取啊?
看到有以下的东西
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.1.GA (build: SVNTag=JBoss_4_2_1_GA date=200707131605)/Tomcat-5.5
Set-Cookie: JSESSIONID=FD9D50AA193FA6E04C6E363BC9740FA2; Path=/
Content-Type: text/html;charset=GB2312
Transfer-Encoding: chunked
Date: Fri, 25 Jul 2008 15:38:23 GMT

736
这个数(736)是干什么的,怎么获取啊?
这是响应头,里面JSESSIONID这个有用,在POST时需要带上这个
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
namespace bot
{
public class Html
{
/// <summary>
/// Groups the HttpWebRequest header-related values (Accept, ContentType, UserAgent, Method)
/// so they can be handed to the request helpers as one argument.
/// </summary>
public struct RequestPPT
{
    private string _accept;
    private string _contentType;
    private string _userAgent;
    private string _method;

    /// <summary>
    /// Gets or sets the value for the request's Accept property
    /// (the media types the client is willing to receive).
    /// </summary>
    public string Accept
    {
        get { return _accept; }
        set { _accept = value; }
    }

    /// <summary>
    /// Gets or sets the value for the request's ContentType property
    /// (the media type of the request).
    /// </summary>
    public string ContentType
    {
        get { return _contentType; }
        set { _contentType = value; }
    }

    /// <summary>
    /// Gets or sets the value for the request's UserAgent property
    /// (the client identification string).
    /// </summary>
    public string UserAgent
    {
        get { return _userAgent; }
        set { _userAgent = value; }
    }

    /// <summary>
    /// Gets or sets the value for the request's Method property.
    /// Any HTTP 1.1 verb may be used: GET, HEAD, POST, PUT, DELETE, TRACE or OPTIONS.
    /// When ContentLength is set to anything other than -1, Method must be a verb that uploads data.
    /// </summary>
    public string Method
    {
        get { return _method; }
        set { _method = value; }
    }
}
/// <summary>
/// Sends <paramref name="post"/> to <paramref name="url"/> (typically a login form) and returns
/// the cookies carried by the response together with the response body.
/// </summary>
/// <param name="url">Address of the login page the request is sent to.</param>
/// <param name="post">Bytes written as the request body.</param>
/// <param name="strHtml">Receives the response body, decoded with Encoding.Default.</param>
/// <param name="rppt">Header-related settings copied onto the request.</param>
/// <param name="server">Not used by this method; kept so existing callers keep compiling.</param>
/// <returns>The cookies exposed by the response.</returns>
public CookieCollection funGetCookie(string url, byte[] post, out string strHtml, RequestPPT rppt, string server)
{
    if (post == null)
    {
        throw new ArgumentNullException("post");
    }

    HttpWebRequest hwRequest = (HttpWebRequest)WebRequest.Create(new Uri(url));
    // A container is attached so the response cookies get collected
    // (see the HttpWebResponse.Cookies documentation).
    hwRequest.CookieContainer = new CookieContainer();
    hwRequest.Accept = rppt.Accept;
    hwRequest.ContentType = rppt.ContentType;
    hwRequest.UserAgent = rppt.UserAgent;
    hwRequest.Method = rppt.Method;
    hwRequest.ContentLength = post.Length;

    // Upload the request body; the stream is closed even if the write fails.
    using (Stream requestStream = hwRequest.GetRequestStream())
    {
        requestStream.Write(post, 0, post.Length);
    }

    // Dispose the response and its stream on every path so the connection is released.
    using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
    using (Stream responseStream = hwResponse.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, Encoding.Default))
    {
        // NOTE(review): the body is decoded with the machine's default code page, not the
        // charset announced by the server - confirm this matches the target site.
        strHtml = reader.ReadToEnd();
        return hwResponse.Cookies;
    }
}
/// <summary>
/// Requests <paramref name="strUri"/> with previously obtained cookies attached and returns
/// the response body. No request body is sent (ContentLength is set to 0).
/// </summary>
/// <param name="strUri">URL of the target page.</param>
/// <param name="ccl">Valid cookies obtained earlier (for example from a login).</param>
/// <param name="rppt">Header-related settings copied onto the request.</param>
/// <returns>The response body as text, decoded with Encoding.Default.</returns>
public string funGetHtmlByCookies(string strUri, CookieCollection ccl, RequestPPT rppt)
{
    HttpWebRequest hwRequest = (HttpWebRequest)WebRequest.Create(new Uri(strUri));

    CookieContainer cookieJar = new CookieContainer();
    cookieJar.Add(ccl);
    hwRequest.CookieContainer = cookieJar;

    hwRequest.Accept = rppt.Accept;
    hwRequest.ContentType = rppt.ContentType;
    hwRequest.UserAgent = rppt.UserAgent;
    hwRequest.Method = rppt.Method;
    hwRequest.ContentLength = 0;

    // Dispose the response and its stream on every path so the connection is released.
    using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
    using (Stream responseStream = hwResponse.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}
/// <summary>
/// Sends <paramref name="post"/> to <paramref name="strUri"/> with previously obtained cookies
/// attached and returns the response body.
/// </summary>
/// <param name="strUri">URL of the target page.</param>
/// <param name="post">Bytes written as the request body.</param>
/// <param name="ccl">Valid cookies obtained earlier (for example from a login).</param>
/// <param name="rppt">Header-related settings copied onto the request.</param>
/// <returns>The response body as text, decoded with Encoding.Default.</returns>
public string funGetHtmlByCookies(string strUri, byte[] post, CookieCollection ccl, RequestPPT rppt)
{
    if (post == null)
    {
        throw new ArgumentNullException("post");
    }

    HttpWebRequest hwRequest = (HttpWebRequest)WebRequest.Create(new Uri(strUri));

    CookieContainer cookieJar = new CookieContainer();
    cookieJar.Add(ccl);
    hwRequest.CookieContainer = cookieJar;

    hwRequest.Accept = rppt.Accept;
    hwRequest.ContentType = rppt.ContentType;
    hwRequest.UserAgent = rppt.UserAgent;
    hwRequest.Method = rppt.Method;
    hwRequest.ContentLength = post.Length;

    // Upload the request body; the stream is closed even if the write fails.
    using (Stream requestStream = hwRequest.GetRequestStream())
    {
        requestStream.Write(post, 0, post.Length);
    }

    // Dispose the response and its stream on every path so the connection is released.
    using (HttpWebResponse hwResponse = (HttpWebResponse)hwRequest.GetResponse())
    using (Stream responseStream = hwResponse.GetResponseStream())
    using (StreamReader reader = new StreamReader(responseStream, Encoding.Default))
    {
        return reader.ReadToEnd();
    }
}
}
}
using System;
using System.IO;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.Data;
using System.Xml;
using System.Text.RegularExpressions;
using System.Collections;
namespace bot
{
public class SisHtml :Html
{
public SisHtml()
{
}

// Browser identification string sent with the requests built in this block.
private const string DefaultUserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; InfoPath.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";

private string strHost;

/// <summary>
/// Gets or sets the host (name or IP address) of the target site.
/// </summary>
public string Host
{
    get { return strHost; }
    set { strHost = value; }
}

/// <summary>
/// Logs in by posting the given form fields to <paramref name="url"/> and returns the cookies
/// issued by the server.
/// </summary>
/// <param name="url">Login URL of the target site.</param>
/// <param name="dir">Form fields (name/value pairs) used to build the POST body.</param>
/// <param name="strHtml">Receives the page content returned after the login request.</param>
/// <returns>The cookies returned by the login request.</returns>
public CookieCollection funGetCookie(string url, Dictionary<string, string> dir, out string strHtml)
{
    // Build the POST body.
    byte[] post = Encoding.Default.GetBytes(funMakePost(dir));

    // Header settings.
    RequestPPT rppt = new RequestPPT();
    rppt.Accept = "image/gif, image/x-xbitmap, image/jpeg, image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel, application/vnd.ms-powerpoint, application/msword, */*";
    rppt.ContentType = "application/x-www-form-urlencoded";
    // HTTP method tokens are case-sensitive, so the canonical upper-case verb is used.
    rppt.Method = "POST";
    rppt.UserAgent = DefaultUserAgent;

    // Scheme + host + port of the login URL (the previous form dropped the port and forced http).
    string server = new Uri(url).GetLeftPart(UriPartial.Authority);
    return base.funGetCookie(url, post, out strHtml, rppt, server);
}

/// <summary>
/// Fetches the content of <paramref name="strUri"/> using cookies obtained earlier.
/// </summary>
/// <param name="strUri">URL of the target page.</param>
/// <param name="ccl">Cookies obtained earlier.</param>
/// <returns>The response body as text.</returns>
public string funGetHtmlByCookies(string strUri, CookieCollection ccl)
{
    RequestPPT rppt = new RequestPPT();
    // "txt/html" is not a registered media type; "text/html" is the intended value.
    rppt.Accept = "text/html";
    rppt.ContentType = "application/x-www-form-urlencoded";
    // NOTE(review): the page is requested with a body-less POST rather than a GET - confirm intended.
    rppt.Method = "POST";
    rppt.UserAgent = DefaultUserAgent;
    return base.funGetHtmlByCookies(strUri, ccl, rppt);
}
/// <summary>
/// Casts a vote on a poll thread when the page still shows a poll answer field.
/// </summary>
/// <param name="strHtml">HTML of the poll thread page.</param>
/// <param name="ccl">Valid cookies.</param>
/// <returns>
/// The HTML returned by the vote request when a vote was posted. Otherwise the form portion
/// of the page, or the input unchanged when the page contains no form.
/// </returns>
public string funVote(string strHtml, CookieCollection ccl)
{
    // The markers carry no leading space: the original added 7 (the length of "</form>")
    // while searching for " </form>", which cut the closing tag short.
    const string FormOpen = "<form";
    const string FormClose = "</form>";
    int formStart = strHtml.IndexOf(FormOpen, StringComparison.Ordinal);
    int formEnd = strHtml.LastIndexOf(FormClose, StringComparison.Ordinal);
    if (formStart < 0 || formEnd < formStart)
    {
        // No form on the page: nothing to vote on.
        return strHtml;
    }
    strHtml = strHtml.Substring(formStart, formEnd - formStart + FormClose.Length);

    // The answer field is only present while the poll has not been voted on yet.
    const string PollField = @"name=""pollanswers[]""";
    int pollAt = strHtml.IndexOf(PollField, StringComparison.Ordinal);
    if (pollAt < 0)
    {
        return strHtml;
    }

    // First quoted value within the 20 characters after the field name.
    string strPollanswers = funGetQuotedValueAfter(strHtml, pollAt + PollField.Length, 20);

    // First quoted value within the 100 characters after method="post"
    // (presumably the form's action attribute - verify against the page markup).
    const string MethodAttr = @"method=""post""";
    int methodAt = strHtml.IndexOf(MethodAttr, StringComparison.Ordinal);
    string strAction = methodAt < 0 ? null : funGetQuotedValueAfter(strHtml, methodAt + MethodAttr.Length, 100);

    if (strPollanswers == null || strAction == null)
    {
        // The markup did not have the expected shape; leave the page as it is.
        return strHtml;
    }

    // NOTE(review): formhash is hard-coded; it looks session-specific and probably has to be
    // read from the page - confirm.
    Dictionary<string, string> dir = new Dictionary<string, string>();
    dir.Add("formhash", "77b49df4");
    dir.Add("pollanswers[]", strPollanswers);
    dir.Add("pollsubmit", "提交");
    byte[] post = Encoding.Default.GetBytes(funMakePost(dir));

    // "amp;" is stripped so an HTML-escaped "&amp;" becomes "&".
    string strUrl = "http://" + Host + "/bbs/" + strAction.Replace("amp;", "");

    RequestPPT rppt = new RequestPPT();
    // "txt/html" is not a registered media type; "text/html" is the intended value.
    rppt.Accept = "text/html";
    rppt.ContentType = "application/x-www-form-urlencoded";
    // HTTP method tokens are case-sensitive, so the canonical upper-case verb is used.
    rppt.Method = "POST";
    rppt.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; SV1; .NET CLR 1.1.4322; InfoPath.1; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)";
    return base.funGetHtmlByCookies(strUrl, post, ccl, rppt);
}

/// <summary>
/// Returns the text between the first pair of double quotes found in the window of
/// <paramref name="window"/> characters starting at <paramref name="start"/>, or null when the
/// window holds no double quote. The window is clamped to the end of the text.
/// </summary>
private static string funGetQuotedValueAfter(string text, int start, int window)
{
    if (start < 0 || start >= text.Length)
    {
        return null;
    }
    int length = Math.Min(window, text.Length - start);
    string[] parts = text.Substring(start, length).Split('"');
    return parts.Length > 1 ? parts[1] : null;
}

/// <summary>
/// Builds a POST body of the form key=value&amp;key=value from the given pairs.
/// </summary>
/// <param name="dir">Dictionary holding the key/value pairs.</param>
/// <returns>The assembled string; empty when the dictionary has no entries.</returns>
private string funMakePost(Dictionary<string, string> dir)
{
    // NOTE(review): keys and values are written as-is (not URL-encoded) - confirm the server accepts that.
    StringBuilder sbPost = new StringBuilder();
    foreach (KeyValuePair<string, string> kvp in dir)
    {
        if (sbPost.Length > 0)
        {
            sbPost.Append('&');
        }
        sbPost.Append(kvp.Key).Append('=');
        // An empty value is sent as two single quotes, as before.
        sbPost.Append(kvp.Value == "" ? "''" : kvp.Value);
    }
    return sbPost.ToString();
}
// (代码未完,接下文)
/// <summary>
/// Gets the URL of the next list page.
/// </summary>
/// <param name="strHtml">HTML of the current page.</param>
/// <returns>
/// The URL of the next list page, or an empty string when the page contains a form,
/// has no class="next" marker, or no quoted value precedes the marker.
/// </returns>
public string funGetNextUrl(string strHtml)
{
    // A page containing a form is treated as "not a list page".
    // NOTE(review): the marker was " <form" with a leading space, which looks like a paste
    // artifact - confirm against real pages.
    if (strHtml.IndexOf("<form", StringComparison.Ordinal) != -1)
    {
        return "";
    }

    const string NextMarker = @"class=""next""";
    int markerAt = strHtml.IndexOf(NextMarker, StringComparison.Ordinal);
    if (markerAt < 0)
    {
        return "";
    }

    // Look at up to 100 characters before the marker; clamp so an early marker cannot
    // produce a negative start index.
    int windowStart = Math.Max(0, markerAt - 100);
    string[] parts = strHtml.Substring(windowStart, markerAt - windowStart).Split('"');
    if (parts.Length < 2)
    {
        return "";
    }

    // "amp;" is stripped so an HTML-escaped "&amp;" becomes "&".
    return "http://" + Host + "/bbs/" + parts[1].Replace("amp;", "");
}

/// <summary>
/// Collects the thread links found in a list page.
/// </summary>
/// <param name="strHtml">HTML of the list page.</param>
/// <returns>A table with a single "Url" column holding one absolute URL per match.</returns>
public DataTable funGetListTable(string strHtml)
{
    DataTable dt = new DataTable();
    dt.Columns.Add(new DataColumn("Url"));

    // The dot is escaped so it only matches a literal "viewthread.php".
    Regex rg = new Regex(@"viewthread\.php(\S)+highlight=");
    foreach (Match ms in rg.Matches(strHtml))
    {
        DataRow dr = dt.NewRow();
        dr[0] = "http://" + Host + "/bbs/" + ms.Value.Replace("amp;", "");
        dt.Rows.Add(dr);
    }
    return dt;
}
/// <summary>
/// Creates the empty table, named "page", that holds the data extracted from a page.
/// </summary>
/// <returns>A DataTable with seven columns and no rows.</returns>
public DataTable funGetRePlayDatatable()
{
    // Column captions, in the order they appear in the table.
    string[] columnNames = new string[]
    {
        "标题", "发表人", "发表时间", "发表内容", "楼数", "所属链接", "类型"
    };

    DataTable pageTable = new DataTable("page");
    foreach (string columnName in columnNames)
    {
        pageTable.Columns.Add(new DataColumn(columnName));
    }
    return pageTable;
}
/// <summary>
/// Unfinished: returns the empty page table. The poster names are extracted from
/// <paramref name="strHtml"/> but not yet written into the table.
/// </summary>
/// <param name="strHtml">HTML of the thread page.</param>
/// <returns>The table created by funGetRePlayDatatable, without rows.</returns>
public DataTable funGetCheater(string strHtml)
{
    // Table with the expected column layout.
    DataTable pageTable = funGetRePlayDatatable();

    // Poster names; the result is not used yet.
    funGetPoster(strHtml);

    // Still to do (from the original notes): content list, floor numbers, links.
    return pageTable;
}
// Matches a poster link and captures the name shown inside it. Whitespace before </a> is
// optional; the original required exactly one space there, which looks like a paste artifact.
private static readonly Regex PosterRegex = new Regex(
    @"id=""userinfo(\S)+""(\s)+class=""dropmenu""(\s)+onmouseover=""showMenu\(this\.id\)"">(?<name>\S+?)\s*</a>");

// Matches a post body container on a single line. The original pattern required a space
// before <div and before </div>, which looks like a paste artifact.
// NOTE(review): (.)* is greedy and does not cross line breaks, so a match runs to the last
// </div> on the same line - confirm that is what the page layout needs.
private static readonly Regex PostContentRegex = new Regex(
    @"<div(\s)*id=(\s)*""postmessage_(.)*</div>");

/// <summary>
/// Gets the list of poster names found in a thread page.
/// </summary>
/// <param name="strHtml">HTML of the thread page.</param>
/// <returns>The poster names, in the order they appear.</returns>
public ArrayList funGetPoster(string strHtml)
{
    ArrayList al = new ArrayList();
    foreach (Match ma in PosterRegex.Matches(strHtml))
    {
        al.Add(ma.Groups["name"].Value);
    }
    return al;
}

/// <summary>
/// Gets the post content blocks (div elements whose id starts with "postmessage_").
/// </summary>
/// <param name="strHtml">HTML of the thread page.</param>
/// <returns>The matched HTML fragments, in the order they appear.</returns>
public ArrayList funGetRepostContent(string strHtml)
{
    ArrayList al = new ArrayList();
    foreach (Match ma in PostContentRegex.Matches(strHtml))
    {
        al.Add(ma.Value);
    }
    return al;
}
}
}
Accept: */*
Accept-Language: zh-cn
UA-CPU: x86
Accept-Encoding: gzip, deflate
User-Agent: Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; EmbeddedWB 14.52 from: http://www.bsalsa.com/ EmbeddedWB 14.52; .NET CLR 2.0.50727)
Host: 59.50.113.196:9080
Proxy-Connection: Keep-Alive
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.1.GA (build: SVNTag=JBoss_4_2_1_GA date=200707131605)/Tomcat-5.5
Set-Cookie: JSESSIONID=FD9D50AA193FA6E04C6E363BC9740FA2; Path=/
Content-Type: text/html;charset=GB2312
Transfer-Encoding: chunked
Date: Fri, 25 Jul 2008 15:38:23 GMT

736
<html>
.....
</html>
http://hi.baidu.com/zkheartboy/blog/item/9216a0fd05591e1508244d74.html
看一下Chunked编码即可。
那个736是以十六进制ASCII文本表示的chunk大小(0x736 = 1846字节)。
RequestPPT rppt = new RequestPPT();
// Header values are written without leading/trailing spaces: "GET " is not a valid verb and
// HttpWebRequest.Method is expected to reject it.
rppt.Accept = "*/*";
rppt.ContentType = "text/html;charset=GB2312";
// NOTE: HttpWebRequest does not allow a request body with GET. Use the funGetHtmlByCookies
// overload without post data for GET, or set Method to "POST" when post data is uploaded.
rppt.Method = "GET";
rppt.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; EmbeddedWB 14.52";
strHtml = base.funGetHtmlByCookies(strUrl, post, ccl, rppt);
头信息你看看上面这样行不行;不行的话,你自己按照相应的属性,像上面这样设置成你抓下来的包里的信息。