网页爬虫的问题 在中华英才网上输入职位java,搜索后出现了几万条职位,共分了解100页,不过第一页和第二页的页面改变了,但是,地址栏上的地址却没有改变。看一下源代码发现还是和第一页的一样。第二页没有地址,想从它上面搞点东西来也不行了。 后面99页的东西都无法用爬虫来爬点有用的东西。它的上面是怎么搞的?怎样才能搞到第二页到第99页上的source code? 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 用firebug看了下 是post调用 并且是ajax 获取jason数据格式呵呵 网站爬虫做好有点难度呢url只是一部分包括网页的结构,url的血缘关系,url比对,分析是不是本站的url等等 我也弄过这几个网站的,当时属中华英才的难弄,中华英才使用了ajax,并且有加密,而且更新比较快,(不过你找到他返回的json的js就好办了。)你做出来,不出2个月,他会再改版,,现在我的那个爬虫都不能用了。。改朝换代好多年。 参数application/x-www-form-urlencodedhidRnd tmpJtRefRndjtUrl document.location.href源代码jtUrl=document.location.href&hidRnd=tmpJtRefRnd请求参数也加密了高人来解释一下每一次翻页都是这个 东西,后台如何知道 要取 那页的数据呢 //更新结果集4027function RefreshList(){4028 $("#dvAD").hide();4029 document.getElementById("dvSummary").innerHTML="<h5>正在加载,请稍候……< /h5>";4030 document.getElementById("dvTitle").innerHTML="<h5>正在加载,请稍候……< /h5>";4031 purge(document.getElementById("dvSummary"));40324033 $("#chkAllTop")[0].checked = false;4034 $("#chkAllBott")[0].checked = false;40354036 checkedIndex = ",";4037 //增加加密方式 防止非法爬虫抓取页面4038 var BaseSb=new StringBuilder();4039 //BaseSb.Append("&urlKey=");4040 BaseSb.Append((urlKey));40414042 BaseSb.Append("&curPage=");4043 BaseSb.Append(curPage);40444045 BaseSb.Append("&pageSize=");4046 BaseSb.Append(pageSize);40474048 BaseSb.Append("&recordCount=");4049 BaseSb.Append(recordCount);40504051 BaseSb.Append("&orderField=");4052 BaseSb.Append(orderField);40534054 BaseSb.Append("&order=");4055 BaseSb.Append(order);4056 var BaseStr=base64encode(BaseSb.toString());40574058 var httpurl = new StringBuilder();40594060 httpurl.Append("/GetSearchResult.awp?");4061 httpurl.Append("jtq=onlyrecord");4062 httpurl.Append("&urlKey=");4063 httpurl.Append(BaseStr);4064// httpurl.Append("&urlKey=");4065// httpurl.Append(UrlEncode(urlKey));4066//4067// httpurl.Append("&curPage=");4068// httpurl.Append(curPage);4069//4070// httpurl.Append("&pageSize=");4071// httpurl.Append(pageSize);4072//4073// httpurl.Append("&recordCount=");4074// httpurl.Append(recordCount);4075//4076// 
httpurl.Append("&orderField=");4077// httpurl.Append(orderField);4078//4079// httpurl.Append("&order=");4080// httpurl.Append(order);408140824083 var agentId = request.QueryString("JobAgentID");4084 if (agentId != ""){4085 httpurl.Append("&JobAgentID=");4086 httpurl.Append(agentId);4087 }4088 var Prj= request.QueryString("prj");4089 var ProjectID=$("#HidProjectID").val();4090 httpurl.Append("&ProjectID=");4091 httpurl.Append(ProjectID);4092 if($("#HidIsModel").val()=="true")4093 {4094 httpurl.Append("&IsModel=");4095 httpurl.Append($("#HidIsModel").val());4096 }4097 if(typeof(appendRnd)!="undefined")4098 httpurl=appendRnd(httpurl.toString());4099 var requestUrl =httpurl.toString();4100 httpurl=null;41014102 var tmpJtRefRnd = typeof(jtRefRnd) != "undefined" ? jtRefRnd : requestFromStr.QueryString(window.location.href, "jtr");4103 $.ajax({4104 type: "POST",4105 url: requestUrl,4106 data: "jtUrl=document.location.href&hidRnd=tmpJtRefRnd",//"name=John&location=Boston",4107 dataType: "json",4108 async: "false",4109 success: function(json){4110 if (json["recordCount"] > 0)4111 {4112 if (json["recordCount"] ==1)4113 $("#barbottom").hide();4114 else4115 $("#barbottom").show();4116 jsonResult=null;4117 jsonResult = json["dataSet"];4118 //只加载当前显示状态的列表,并将另一种列表设为需要加载的状态4119 if (curShowST=="Summary")4120 {4121 //LoadSummary(json["dataSet"]);4122 LoadSummary();4123 nedLoadST = "Title";4124 }4125 if (curShowST=="Title")4126 {4127 LoadSummary();4128 LoadTitle();4129 nedLoadST = "Summary";4130 }4131 ////4132 $("#hidRecordCount").val(json["recordCount"]);4133 $("#hidTrueCount").val(json["trueCount"]);4134 ShowPageInfo();41354136 ShowCurCond();41374138 if(IfLoadAD==orderField)4139 {4140 LoadAD();4141 }4142 else4143 {4144 $("#dvAD").show(); 先加密,然后ajax,返回jsonList,然后显示。你只需要知道BaseSb里面是怎么加密的就行了。现在jdk 1.6后出现了script包。它可以帮助我们直接java使用javascript,所以你只需要把BaseSb下载下来,然后使用java调用BaseSb,看参数加密后的样子,然后套用ajax的Url,得到的就是返回的json,然后分析json就哦了。 ps:上面的代码是他的js里面的源码。 看错了。他使用的base64encode加密。。呵呵。。你把base64encode 
down下来,然后加密参数,就o了 function base64encode(Str) { var keyStr = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="; var output = ""; var chr1, chr2, chr3 = ""; var enc1, enc2, enc3, enc4 = ""; var i = 0; do { chr1 = Str.charCodeAt(i++); chr2 = Str.charCodeAt(i++); chr3 = Str.charCodeAt(i++); enc1 = chr1 >> 2; enc2 = ((chr1 & 3) << 4) | (chr2 >> 4); enc3 = ((chr2 & 15) << 2) | (chr3 >> 6); enc4 = chr3 & 63; if (isNaN(chr2)) { enc3 = enc4 = 64; } else if (isNaN(chr3)) { enc4 = 64; } output = output + keyStr.charAt(enc1) + keyStr.charAt(enc2) + keyStr.charAt(enc3) + keyStr.charAt(enc4); chr1 = chr2 = chr3 = ""; enc1 = enc2 = enc3 = enc4 = ""; } while (i < Str.length); return output;}这个是早些时候,他们的加密代码。保存为xxx.js,运用jdk1.6后的script包下的方法直接调用js。 原来毕业论文和这个有点联系,建议看下lucence框架 lucence+nutch但是爬出来的进行检索后的效果不好啊,就是分词好像很不好呢 女程序员的悲哀啊:同一台服务器,部署两个web项目session丢失,急!!! Eclipse到期如何解决 我要实现这样的功能,请问有什么好办法吗 struts2文件下载的问题,请帮帮忙 请教一个累死asp里split的方法(jsp的) 安装JRE出错 logic:iterate问题 jsp中怎么才能调.dll文件,可以在vbscript中调用吗,如果可以的话怎么调啊 找人开发短信网站 哪有提供ftp、 空间>50M、速度比较快的免费空间,而且没有文件限制的地方 关于JDBC的问题 JSP怎么显示出页面传过来的值
呵呵
做好有点难度呢,url只是一部分,
包括网页的结构,url的血缘关系,url比对,分析是不是本站的url等等
参数application/x-www-form-urlencoded
hidRnd tmpJtRefRnd
jtUrl document.location.href
源代码
jtUrl=document.location.href&hidRnd=tmpJtRefRnd
请求参数也加密了
请高人来解释一下:每一次翻页提交的都是这个东西,后台如何知道要取哪一页的数据呢?
4027function RefreshList(){
4028 $("#dvAD").hide();
4029 document.getElementById("dvSummary").innerHTML="<h5>正在加载,请稍候……< /h5>";
4030 document.getElementById("dvTitle").innerHTML="<h5>正在加载,请稍候……< /h5>";
4031 purge(document.getElementById("dvSummary"));
4032
4033 $("#chkAllTop")[0].checked = false;
4034 $("#chkAllBott")[0].checked = false;
4035
4036 checkedIndex = ",";
4037 //增加加密方式 防止非法爬虫抓取页面
4038 var BaseSb=new StringBuilder();
4039 //BaseSb.Append("&urlKey=");
4040 BaseSb.Append((urlKey));
4041
4042 BaseSb.Append("&curPage=");
4043 BaseSb.Append(curPage);
4044
4045 BaseSb.Append("&pageSize=");
4046 BaseSb.Append(pageSize);
4047
4048 BaseSb.Append("&recordCount=");
4049 BaseSb.Append(recordCount);
4050
4051 BaseSb.Append("&orderField=");
4052 BaseSb.Append(orderField);
4053
4054 BaseSb.Append("&order=");
4055 BaseSb.Append(order);
4056 var BaseStr=base64encode(BaseSb.toString());
4057
4058 var httpurl = new StringBuilder();
4059
4060 httpurl.Append("/GetSearchResult.awp?");
4061 httpurl.Append("jtq=onlyrecord");
4062 httpurl.Append("&urlKey=");
4063 httpurl.Append(BaseStr);
4064// httpurl.Append("&urlKey=");
4065// httpurl.Append(UrlEncode(urlKey));
4066//
4067// httpurl.Append("&curPage=");
4068// httpurl.Append(curPage);
4069//
4070// httpurl.Append("&pageSize=");
4071// httpurl.Append(pageSize);
4072//
4073// httpurl.Append("&recordCount=");
4074// httpurl.Append(recordCount);
4075//
4076// httpurl.Append("&orderField=");
4077// httpurl.Append(orderField);
4078//
4079// httpurl.Append("&order=");
4080// httpurl.Append(order);
4081
4082
4083 var agentId = request.QueryString("JobAgentID");
4084 if (agentId != ""){
4085 httpurl.Append("&JobAgentID=");
4086 httpurl.Append(agentId);
4087 }
4088 var Prj= request.QueryString("prj");
4089 var ProjectID=$("#HidProjectID").val();
4090 httpurl.Append("&ProjectID=");
4091 httpurl.Append(ProjectID);
4092 if($("#HidIsModel").val()=="true")
4093 {
4094 httpurl.Append("&IsModel=");
4095 httpurl.Append($("#HidIsModel").val());
4096 }
4097 if(typeof(appendRnd)!="undefined")
4098 httpurl=appendRnd(httpurl.toString());
4099 var requestUrl =httpurl.toString();
4100 httpurl=null;
4101
4102 var tmpJtRefRnd = typeof(jtRefRnd) != "undefined" ? jtRefRnd : requestFromStr.QueryString(window.location.href, "jtr");
4103 $.ajax({
4104 type: "POST",
4105 url: requestUrl,
4106 data: "jtUrl=document.location.href&hidRnd=tmpJtRefRnd",//"name=John&location=Boston",
4107 dataType: "json",
4108 async: "false",
4109 success: function(json){
4110 if (json["recordCount"] > 0)
4111 {
4112 if (json["recordCount"] ==1)
4113 $("#barbottom").hide();
4114 else
4115 $("#barbottom").show();
4116 jsonResult=null;
4117 jsonResult = json["dataSet"];
4118 //只加载当前显示状态的列表,并将另一种列表设为需要加载的状态
4119 if (curShowST=="Summary")
4120 {
4121 //LoadSummary(json["dataSet"]);
4122 LoadSummary();
4123 nedLoadST = "Title";
4124 }
4125 if (curShowST=="Title")
4126 {
4127 LoadSummary();
4128 LoadTitle();
4129 nedLoadST = "Summary";
4130 }
4131 ////
4132 $("#hidRecordCount").val(json["recordCount"]);
4133 $("#hidTrueCount").val(json["trueCount"]);
4134 ShowPageInfo();
4135
4136 ShowCurCond();
4137
4138 if(IfLoadAD==orderField)
4139 {
4140 LoadAD();
4141 }
4142 else
4143 {
4144 $("#dvAD").show(); 先加密,然后ajax,返回jsonList,然后显示。
你只需要知道BaseSb里面是怎么加密的就行了。
JDK 1.6 之后新增了 script 包(javax.script),它可以帮助我们在 Java 中直接调用 JavaScript。所以你只需要把 BaseSb 相关的 js 下载下来,然后使用 Java 调用 BaseSb,看参数加密后的样子,再套用 ajax 的 Url,得到的就是返回的 json,然后分析 json 就 OK 了。
/**
 * Encode a string as Base64 using the standard alphabet
 * (A-Z, a-z, 0-9, "+", "/") with "=" padding.
 *
 * Every character of Str is treated as a single byte: its char code is fed
 * straight into the 6-bit regrouping below. Character codes above 255 are
 * therefore NOT supported -- they yield alphabet indexes outside keyStr and
 * the corresponding output characters are silently dropped. Callers that
 * need non-Latin-1 text must convert it to a byte-per-character string first.
 *
 * Kept in ES3 syntax (var, global isNaN) so the script can also be run by
 * old embedded engines.
 *
 * @param {string} Str - Text to encode (char codes expected in 0-255).
 * @returns {string} The Base64 representation; "" when Str is empty.
 */
function base64encode(Str) {
  var keyStr = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";
  var output = "";
  var chr1, chr2, chr3;
  var enc1, enc2, enc3, enc4;
  var i = 0;

  // A pre-tested loop (instead of do/while) so that empty input produces ""
  // rather than one bogus padded group ("AA==").
  while (i < Str.length) {
    // Read up to three bytes; charCodeAt returns NaN past the end of Str.
    chr1 = Str.charCodeAt(i++);
    chr2 = Str.charCodeAt(i++);
    chr3 = Str.charCodeAt(i++);

    // Regroup 3 x 8 bits into 4 x 6 bits (NaN behaves as 0 in bitwise ops).
    enc1 = chr1 >> 2;
    enc2 = ((chr1 & 3) << 4) | (chr2 >> 4);
    enc3 = ((chr2 & 15) << 2) | (chr3 >> 6);
    enc4 = chr3 & 63;

    // Index 64 is "=" in keyStr: pad when the final group is short.
    if (isNaN(chr2)) {
      enc3 = enc4 = 64;
    } else if (isNaN(chr3)) {
      enc4 = 64;
    }

    output = output + keyStr.charAt(enc1) + keyStr.charAt(enc2)
      + keyStr.charAt(enc3) + keyStr.charAt(enc4);
  }

  return output;
}
这个是早些时候他们的加密代码。保存为 xxx.js,运用 JDK 1.6 之后 script 包(javax.script)下的方法直接调用 js 即可。