我一直用HtmlAgilityPack,基本已经满足百度搜索中遇到的大部分类型 比如“贴吧”“相关新闻”“产品搜索”等等..
但是今天碰到一个“另类”,就是百度页面中嵌入的“winform类型”,和别的html不太一样,貌似这个嵌到了jqury中了吧,不太懂,具体界面您可以用百度搜索一下“抑郁症”,然后您会看到页面中间部分有个像winform的应用程序,名字叫做“抑郁症专家”..问题来了..
就算是用HtmlAgilityPack,最后一层分析的结果也很复杂,
HtmlAgilityPack分析的最后一层结果如下(已经没有子节点了,最后一层的内容就这么多...) ->Tag:script
Type:Element
XPath:/html[1]/body[1]/div[1]/div[1]/div[1]/div[2]/table[8]/tr[1]/td[1]/script[3]
Value->(function(){ ;(function(){ var logUrl = 'http://nsclick.baidu.com/v.gif?pid=201'; var params = { tid : '7000', pj : 'app', fm : 'apptm', item : 'apploaded', zone: 'web-list', wd : '抑郁症', qid : window.bdQid, cid : window.bdCid, _t : new Date().getTime() }; var img = window.app_temp_img = new Image(); img.onload = img.onerror = img.onabort = function(){ img.onload = img.onerror = img.onabort = null; img = null; window.app_temp_img = null; }; for(var p in params){ logUrl = logUrl + '&' + p + '=' + encodeURIComponent(params[p]); } img.src = logUrl; })(); app.data = app.data || {}; var baidu = app.baidu; var data = { appLables : 0, appSType : "500" ,otherInfo : {"STYPE":[{"sa":"500", "su":1}]} }; baidu.extend(app.data,data); var args = { keyword : '抑郁症', initKeyword : '抑郁症', dispNum : 1, totalNum : 1, firstAppId : '190015' }; baidu.extend(app.args, args); var divisor = app.args.totalNum > app.args.separation_b ? 20 : 8; if(app.data.appSType == '200') { if(app.args.totalNum <= app.args.separation_b){ divisor = 2; } else if(app.args.totalNum >= 80){ if(!app.data.appLables){ app.data.appLables = {}; app.data.appLables.labels = app.data.otherInfo['LABEL']; } var st = app.data.otherInfo['STYPE']; if(app.data.otherInfo['STYPE']){ var t; for(var i = 0, l = st.length; i < l; i++){ t = st[i]; if(t.sa == '200'){ app.args.totalNum = t.su > 760 ? 760 : t.su; break; } } } divisor = 8; } } else if(app.data.appSType == '400' && app.args.totalNum <= app.args.separation_b){ divisor = 3; } else if(app.data.appLables !=0 && app.data.appLables.labels && app.data.appLables.labels.length >= 1 && app.data.appSType != '200'){ divisor = 12; } var pageNum = app.data.pageNum = Math.ceil(app.args.totalNum / divisor); app.data.initData = { "other_data" : [{"Query":"抑郁症","app_sType":"500", "Keyword":"抑郁症", "TotalPageNum":pageNum, "PageNo":1, "ListNum":1, "DispNum":1,"NowTime":window["bdServerTime"]}], "data": [ { "app_id":'190015', "app_cid":'6', "app_leaf_cid":'25', "rankExpand" : 1, "hotLevel" : 46.745, "app_name":'抑郁症专家', "app_name_hilight":'<em>抑郁症</em>专家', "developer_name":'开发者:好大夫在线', "developer_alias":'好大夫在线', "developer_website":'http://www.haodf.com', "developer_level" : '1', "app_shortdesc":'帮你找到可信赖的正规医院抑郁症专家', "app_sType":'501', "attribute": null, "score":'0.0', "score_num":0, "app_price":'0', "app_logo":'http://apps3.bdimg.com/store/static/kvt/b2c3fd5bf0da7d1254d2e38e9e4c7aec.jpg' ,"app_useTimes":'994' ,"create_time":1323069468 }, {} ].slice(0, -1) }; app.data.searchData = app.data.initData.data; app.data.initData.data = app.data.searchData.slice(0, divisor); })(); app.init();
我的目标是,取到【软件的名称】、【作者】、以及【说明】。看了一下 对应上述html中的位置分别是
"app_name":'抑郁症专家'
"developer_name":'开发者:好大夫在线'
"app_shortdesc":'帮你找到可信赖的正规医院抑郁症专家'
但是今天碰到一个“另类”,就是百度页面中嵌入的“winform类型”,和别的html不太一样,貌似这个嵌到了jqury中了吧,不太懂,具体界面您可以用百度搜索一下“抑郁症”,然后您会看到页面中间部分有个像winform的应用程序,名字叫做“抑郁症专家”..问题来了..
就算是用HtmlAgilityPack,最后一层分析的结果也很复杂,
HtmlAgilityPack分析的最后一层结果如下(已经没有子节点了,最后一层的内容就这么多...) ->Tag:script
Type:Element
XPath:/html[1]/body[1]/div[1]/div[1]/div[1]/div[2]/table[8]/tr[1]/td[1]/script[3]
Value->(function(){ ;(function(){ var logUrl = 'http://nsclick.baidu.com/v.gif?pid=201'; var params = { tid : '7000', pj : 'app', fm : 'apptm', item : 'apploaded', zone: 'web-list', wd : '抑郁症', qid : window.bdQid, cid : window.bdCid, _t : new Date().getTime() }; var img = window.app_temp_img = new Image(); img.onload = img.onerror = img.onabort = function(){ img.onload = img.onerror = img.onabort = null; img = null; window.app_temp_img = null; }; for(var p in params){ logUrl = logUrl + '&' + p + '=' + encodeURIComponent(params[p]); } img.src = logUrl; })(); app.data = app.data || {}; var baidu = app.baidu; var data = { appLables : 0, appSType : "500" ,otherInfo : {"STYPE":[{"sa":"500", "su":1}]} }; baidu.extend(app.data,data); var args = { keyword : '抑郁症', initKeyword : '抑郁症', dispNum : 1, totalNum : 1, firstAppId : '190015' }; baidu.extend(app.args, args); var divisor = app.args.totalNum > app.args.separation_b ? 20 : 8; if(app.data.appSType == '200') { if(app.args.totalNum <= app.args.separation_b){ divisor = 2; } else if(app.args.totalNum >= 80){ if(!app.data.appLables){ app.data.appLables = {}; app.data.appLables.labels = app.data.otherInfo['LABEL']; } var st = app.data.otherInfo['STYPE']; if(app.data.otherInfo['STYPE']){ var t; for(var i = 0, l = st.length; i < l; i++){ t = st[i]; if(t.sa == '200'){ app.args.totalNum = t.su > 760 ? 760 : t.su; break; } } } divisor = 8; } } else if(app.data.appSType == '400' && app.args.totalNum <= app.args.separation_b){ divisor = 3; } else if(app.data.appLables !=0 && app.data.appLables.labels && app.data.appLables.labels.length >= 1 && app.data.appSType != '200'){ divisor = 12; } var pageNum = app.data.pageNum = Math.ceil(app.args.totalNum / divisor); app.data.initData = { "other_data" : [{"Query":"抑郁症","app_sType":"500", "Keyword":"抑郁症", "TotalPageNum":pageNum, "PageNo":1, "ListNum":1, "DispNum":1,"NowTime":window["bdServerTime"]}], "data": [ { "app_id":'190015', "app_cid":'6', "app_leaf_cid":'25', "rankExpand" : 1, "hotLevel" : 46.745, "app_name":'抑郁症专家', "app_name_hilight":'<em>抑郁症</em>专家', "developer_name":'开发者:好大夫在线', "developer_alias":'好大夫在线', "developer_website":'http://www.haodf.com', "developer_level" : '1', "app_shortdesc":'帮你找到可信赖的正规医院抑郁症专家', "app_sType":'501', "attribute": null, "score":'0.0', "score_num":0, "app_price":'0', "app_logo":'http://apps3.bdimg.com/store/static/kvt/b2c3fd5bf0da7d1254d2e38e9e4c7aec.jpg' ,"app_useTimes":'994' ,"create_time":1323069468 }, {} ].slice(0, -1) }; app.data.searchData = app.data.initData.data; app.data.initData.data = app.data.searchData.slice(0, divisor); })(); app.init();
我的目标是,取到【软件的名称】、【作者】、以及【说明】。看了一下 对应上述html中的位置分别是
"app_name":'抑郁症专家'
"developer_name":'开发者:好大夫在线'
"app_shortdesc":'帮你找到可信赖的正规医院抑郁症专家'
Regex reg = new Regex(@"(app_name"":'[^']+)[\s\S]*?(developer_name"":'[^']+)[\s\S]*?(app_shortdesc"":'[^']+)");
if (reg.IsMatch(input))
{
Console.WriteLine(reg.Match(input).Groups[1].Value);
Console.WriteLine(reg.Match(input).Groups[2].Value);
Console.WriteLine(reg.Match(input).Groups[3].Value);
}
我现在得到了三个字符串->
app_name\":'抑郁症专家
developer_name\":'开发者:好大夫在线
app_shortdesc\":'帮你找到可信赖的正规医院抑郁症专家如何只留后面的中文啊?传送门->http://topic.csdn.net/u/20111220/12/a502c677-5e0d-4deb-9cd1-92791d6e5894.html?seed=2000584284&r=77003302#r_77003302