我想抓取 http://www.travelzen.com.cn/ 这个网站的国际机票信息,思路如下:
第一步:抓取 flightLoading.php
// Step 1: POST the search form to flightLoading.php. The cookies the server
// sets in its response are written to $cookie_jar (CURLOPT_COOKIEJAR) so the
// following requests can reuse them.
$url = "http://www.travelzen.com.cn/flightLoading.php";
$referer = 'http://www.travelzen.com.cn/index.php?type=flight';
$cookie_jar = dirname(__FILE__)."/cookie.txt";
$timeout = 10;
// $_SERVER['HTTP_USER_AGENT'] is undefined when the script is started from
// the command line or cron, so fall back to a fixed User-Agent string.
$user_agent = isset($_SERVER['HTTP_USER_AGENT'])
    ? $_SERVER['HTTP_USER_AGENT']
    : 'Mozilla/5.0 (compatible; PHP cURL)';
$data = "flightType=Int&selectedDeparture=&selectedReturn=&selectedBookId=&fromCity=HKG&toCity=BKK&adult=1&child=0&datefrom=2011-06-20&searchToken=&type=&dep1=&arr1=&dep2=&arr2=&queryToken=&tripType=2&flightClass=All";
$ch = curl_init();
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_REFERER, $referer);
// This request opens the session, so marking it as a new cookie session is fine here.
curl_setopt($ch, CURLOPT_COOKIESESSION, TRUE);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_jar);
$contents = curl_exec($ch);
// curl_exec() returns false on transport errors; report them instead of
// silently continuing with an empty page. The handle is closed right after this block.
if ($contents === false) {
    trigger_error('Step 1 (flightLoading.php) failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);
第二步:抓取初步显示页面
// Step 2: fetch the preliminary result page, reusing the cookies that the
// previous request stored in $cookie_jar.
$url = "http://www.travelzen.com.cn/getFlightSchPreview.php";
$referer = "http://www.travelzen.com.cn/flightLoading.php";
$ch = curl_init();
$timeout = 10;
// $_SERVER['HTTP_USER_AGENT'] is undefined under CLI/cron; use a fallback.
$user_agent = isset($_SERVER['HTTP_USER_AGENT'])
    ? $_SERVER['HTTP_USER_AGENT']
    : 'Mozilla/5.0 (compatible; PHP cURL)';
// http_build_query() percent-encodes every value, so the non-ASCII city names
// are sent as a valid application/x-www-form-urlencoded body. The separator is
// passed explicitly so an arg_separator.output ini override cannot change it.
$data = http_build_query(array(
    'movePrevNext_dep' => '',
    'movePrevNext_ret' => '',
    'departureCityIATACode' => 'HKG',
    'departureCountryIATACode' => 'HK',
    'destinationCityIATACode' => 'BKK',
    'destinationCountryIATACode' => 'TH',
    'isReturn' => '0',
    'typ' => '2',
    'cityfromDomestic' => '香港',
    'citytoDomestic' => '曼谷',
    'datefrom' => '2011-06-20',
    'dateto' => '2011-06-21',
    'numOfAdult' => '1',
    'numOfChild' => '0',
    'flightClass' => 'All',
    'nonstop' => '',
    'searchToken' => '',
), '', '&');
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_REFERER, $referer);
// CURLOPT_COOKIESESSION is deliberately NOT set here: when true, libcurl
// ignores the session cookies it loads from CURLOPT_COOKIEFILE, which throws
// away the session established by the first request.
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_jar);
// Also write cookies back so anything set by this response reaches the next request.
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_jar);
curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
$contents = curl_exec($ch);
// Report transport errors; the handle is closed right after this block.
if ($contents === false) {
    trigger_error('Step 2 (getFlightSchPreview.php) failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);
第三步:抓取最后刷新出来的真实数据
// Step 3: fetch the final result page, reusing the cookies stored in $cookie_jar.
// NOTE(review): if this page still comes back as an empty frame, the flight
// rows are presumably loaded by a follow-up request issued from the page's
// JavaScript - confirm with a browser network trace and request that URL instead.
$url = "http://www.travelzen.com.cn/flightResult.php";
$referer = "http://www.travelzen.com.cn/flightLoading.php";
$ch = curl_init();
$timeout = 10;
// $_SERVER['HTTP_USER_AGENT'] is undefined under CLI/cron; use a fallback.
$user_agent = isset($_SERVER['HTTP_USER_AGENT'])
    ? $_SERVER['HTTP_USER_AGENT']
    : 'Mozilla/5.0 (compatible; PHP cURL)';
// http_build_query() percent-encodes every value, so the non-ASCII city names
// are sent as a valid application/x-www-form-urlencoded body. The separator is
// passed explicitly so an arg_separator.output ini override cannot change it.
$data = http_build_query(array(
    'movePrevNext_dep' => '',
    'movePrevNext_ret' => '',
    'departureCityIATACode' => 'HKG',
    'departureCountryIATACode' => 'HK',
    'destinationCityIATACode' => 'BKK',
    'destinationCountryIATACode' => 'TH',
    'isReturn' => '0',
    'typ' => '2',
    'cityfromDomestic' => '香港',
    'citytoDomestic' => '曼谷',
    'datefrom' => '2011-06-20',
    'dateto' => '2011-06-21',
    'numOfAdult' => '1',
    'numOfChild' => '0',
    'flightClass' => 'All',
    'nonstop' => '',
    'searchToken' => '',
), '', '&');
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_REFERER, $referer);
// CURLOPT_COOKIESESSION is deliberately NOT set here: when true, libcurl
// ignores the session cookies it loads from CURLOPT_COOKIEFILE, which throws
// away the session established by the earlier requests.
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_jar);
// Also write cookies back so the cookie file stays current.
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_jar);
curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
$contents = curl_exec($ch);
// Report transport errors; the handle is closed right after this block.
if ($contents === false) {
    trigger_error('Step 3 (flightResult.php) failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);
整个过程中,第 1、2 步都能正常抓回内容,但第 3 步最后只抓回了页面框架,里面的数据没有抓回来;去掉第 2 步、直接抓第 3 步,效果也一样。希望 life169 能帮我看看,万分感谢!其他高手有时间的话也请帮忙看看,分还可以再加。
第一步:抓取 flightLoading.php
// Step 1: POST the search form to flightLoading.php. The cookies the server
// sets in its response are written to $cookie_jar (CURLOPT_COOKIEJAR) so the
// following requests can reuse them.
$url = "http://www.travelzen.com.cn/flightLoading.php";
$referer = 'http://www.travelzen.com.cn/index.php?type=flight';
$cookie_jar = dirname(__FILE__)."/cookie.txt";
$timeout = 10;
// $_SERVER['HTTP_USER_AGENT'] is undefined when the script is started from
// the command line or cron, so fall back to a fixed User-Agent string.
$user_agent = isset($_SERVER['HTTP_USER_AGENT'])
    ? $_SERVER['HTTP_USER_AGENT']
    : 'Mozilla/5.0 (compatible; PHP cURL)';
$data = "flightType=Int&selectedDeparture=&selectedReturn=&selectedBookId=&fromCity=HKG&toCity=BKK&adult=1&child=0&datefrom=2011-06-20&searchToken=&type=&dep1=&arr1=&dep2=&arr2=&queryToken=&tripType=2&flightClass=All";
$ch = curl_init();
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_REFERER, $referer);
// This request opens the session, so marking it as a new cookie session is fine here.
curl_setopt($ch, CURLOPT_COOKIESESSION, TRUE);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_jar);
$contents = curl_exec($ch);
// curl_exec() returns false on transport errors; report them instead of
// silently continuing with an empty page. The handle is closed right after this block.
if ($contents === false) {
    trigger_error('Step 1 (flightLoading.php) failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);
第二步:抓取初步显示页面
// Step 2: fetch the preliminary result page, reusing the cookies that the
// previous request stored in $cookie_jar.
$url = "http://www.travelzen.com.cn/getFlightSchPreview.php";
$referer = "http://www.travelzen.com.cn/flightLoading.php";
$ch = curl_init();
$timeout = 10;
// $_SERVER['HTTP_USER_AGENT'] is undefined under CLI/cron; use a fallback.
$user_agent = isset($_SERVER['HTTP_USER_AGENT'])
    ? $_SERVER['HTTP_USER_AGENT']
    : 'Mozilla/5.0 (compatible; PHP cURL)';
// http_build_query() percent-encodes every value, so the non-ASCII city names
// are sent as a valid application/x-www-form-urlencoded body. The separator is
// passed explicitly so an arg_separator.output ini override cannot change it.
$data = http_build_query(array(
    'movePrevNext_dep' => '',
    'movePrevNext_ret' => '',
    'departureCityIATACode' => 'HKG',
    'departureCountryIATACode' => 'HK',
    'destinationCityIATACode' => 'BKK',
    'destinationCountryIATACode' => 'TH',
    'isReturn' => '0',
    'typ' => '2',
    'cityfromDomestic' => '香港',
    'citytoDomestic' => '曼谷',
    'datefrom' => '2011-06-20',
    'dateto' => '2011-06-21',
    'numOfAdult' => '1',
    'numOfChild' => '0',
    'flightClass' => 'All',
    'nonstop' => '',
    'searchToken' => '',
), '', '&');
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_REFERER, $referer);
// CURLOPT_COOKIESESSION is deliberately NOT set here: when true, libcurl
// ignores the session cookies it loads from CURLOPT_COOKIEFILE, which throws
// away the session established by the first request.
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_jar);
// Also write cookies back so anything set by this response reaches the next request.
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_jar);
curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
$contents = curl_exec($ch);
// Report transport errors; the handle is closed right after this block.
if ($contents === false) {
    trigger_error('Step 2 (getFlightSchPreview.php) failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);
第三步:抓取最后刷新出来的真实数据
// Step 3: fetch the final result page, reusing the cookies stored in $cookie_jar.
// NOTE(review): if this page still comes back as an empty frame, the flight
// rows are presumably loaded by a follow-up request issued from the page's
// JavaScript - confirm with a browser network trace and request that URL instead.
$url = "http://www.travelzen.com.cn/flightResult.php";
$referer = "http://www.travelzen.com.cn/flightLoading.php";
$ch = curl_init();
$timeout = 10;
// $_SERVER['HTTP_USER_AGENT'] is undefined under CLI/cron; use a fallback.
$user_agent = isset($_SERVER['HTTP_USER_AGENT'])
    ? $_SERVER['HTTP_USER_AGENT']
    : 'Mozilla/5.0 (compatible; PHP cURL)';
// http_build_query() percent-encodes every value, so the non-ASCII city names
// are sent as a valid application/x-www-form-urlencoded body. The separator is
// passed explicitly so an arg_separator.output ini override cannot change it.
$data = http_build_query(array(
    'movePrevNext_dep' => '',
    'movePrevNext_ret' => '',
    'departureCityIATACode' => 'HKG',
    'departureCountryIATACode' => 'HK',
    'destinationCityIATACode' => 'BKK',
    'destinationCountryIATACode' => 'TH',
    'isReturn' => '0',
    'typ' => '2',
    'cityfromDomestic' => '香港',
    'citytoDomestic' => '曼谷',
    'datefrom' => '2011-06-20',
    'dateto' => '2011-06-21',
    'numOfAdult' => '1',
    'numOfChild' => '0',
    'flightClass' => 'All',
    'nonstop' => '',
    'searchToken' => '',
), '', '&');
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_REFERER, $referer);
// CURLOPT_COOKIESESSION is deliberately NOT set here: when true, libcurl
// ignores the session cookies it loads from CURLOPT_COOKIEFILE, which throws
// away the session established by the earlier requests.
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_jar);
// Also write cookies back so the cookie file stays current.
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_jar);
curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
$contents = curl_exec($ch);
// Report transport errors; the handle is closed right after this block.
if ($contents === false) {
    trigger_error('Step 3 (flightResult.php) failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);
整个过程中,第 1、2 步都能正常抓回内容,但第 3 步最后只抓回了页面框架,里面的数据没有抓回来;去掉第 2 步、直接抓第 3 步,效果也一样。希望 life169 能帮我看看,万分感谢!其他高手有时间的话也请帮忙看看,分还可以再加。
// Lift the execution time limit for this script.
set_time_limit(0);

// File in which cURL keeps the cookies shared by all requests.
$cookie_jar = '/tmp/cookie.tmp';

// Send a simulated browser request.
/**
 * Perform an HTTP request with cURL and return the response body.
 *
 * Cookies are loaded from and written back to $cookie_jar, so consecutive
 * calls that pass the same file share one session. Redirects are followed.
 *
 * @param string       $url        URL to request.
 * @param string|array $postfields POST body; when empty ('' / null / array()) a
 *                                 plain GET is sent instead of an empty POST.
 * @param string       $cookie_jar Path of the cookie file.
 * @param string       $referer    Value of the Referer header ('' for none).
 *
 * @return string|false Response body, or false when the transfer failed
 *                      (a warning carrying the cURL error is raised).
 */
function request($url, $postfields, $cookie_jar, $referer)
{
    $ch = curl_init();

    // No CURLOPT_PORT: a hard-coded port 80 overrides the port implied by the
    // URL and breaks https:// or non-default-port URLs; plain http:// URLs
    // still use port 80 by default.
    $options = array(
        CURLOPT_URL            => $url,
        CURLOPT_HEADER         => 0,
        CURLOPT_NOBODY         => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_COOKIEJAR      => $cookie_jar,
        CURLOPT_COOKIEFILE     => $cookie_jar,
        CURLOPT_REFERER        => $referer,
    );

    // Only switch to POST when there is something to post; callers pass ''
    // when they simply want to read a page.
    if ($postfields !== '' && $postfields !== null && $postfields !== array()) {
        $options[CURLOPT_POST] = 1;
        $options[CURLOPT_POSTFIELDS] = $postfields;
    }

    curl_setopt_array($ch, $options);
    $code = curl_exec($ch);

    // Surface transport failures instead of silently handing false to the parsers.
    if ($code === false) {
        trigger_error('request(' . $url . ') failed: ' . curl_error($ch), E_USER_WARNING);
    }

    curl_close($ch);

    return $code;
}
// Fetch the thread list
/**
 * Extract thread ids from the HTML of a forum listing page.
 *
 * For each HTML comment opener ("<!--") the nearest following
 * viewthread.php link is located and its numeric tid is captured.
 *
 * @param string|false $code HTML of the listing page (false is treated as '').
 *
 * @return array Thread ids as digit strings, in page order; empty when none are found.
 */
function getThreadsList($code)
{
    // The former "[.|\r|\n]*?" was a character class matching only the literal
    // characters ".", "|", CR and LF, so any other text between the comment and
    // the link prevented a match. ".*?" with the "s" modifier matches any
    // character including newlines. The dot in "viewthread.php" is escaped too.
    preg_match_all('/<!--.*?<a href="viewthread\.php\?tid=(\d+)/s', (string) $code, $threads);

    return isset($threads[1]) ? $threads[1] : array();
}
// Check whether the thread exists
/**
 * Tell whether a fetched thread page shows a real thread.
 *
 * @param string $code HTML of the thread page.
 *
 * @return bool false when the page contains the forum's "thread does not
 *              exist / was deleted / is under review" notice, true otherwise.
 */
function isExits($code)
{
    $noticeFound = preg_match('/<p>指定的主题不存在或已被删除或正在被审核,请返回。<\/p>/', $code);

    return $noticeFound !== 1;
}
// Get the thread title
/**
 * Extract the first <h1> element of a thread page.
 *
 * @param string|false $code HTML of the thread page (false is treated as '').
 *
 * @return string The matched "<h1>...</h1>" markup (callers strip the tags),
 *                or '' when the page has no <h1> element.
 */
function getTitle($code)
{
    // The former "[<\/h1>]*" was a character class of the characters
    // < / h 1 >, so only the opening tag was ever captured and the title text
    // was lost. Match lazily up to the closing tag instead; "s" lets the
    // title span several lines.
    if (preg_match('/<h1>.*?<\/h1>/s', (string) $code, $title_tmp) !== 1) {
        return '';
    }

    return $title_tmp[0];
}
// Get the thread author
/**
 * Extract the name of the first poster from a thread page.
 *
 * Looks for the first user-profile link (space.php?uid=...) that carries the
 * "userinfoNNN" id and returns its link text with any nested tags removed.
 *
 * @param string|false $code HTML of the thread page (false is treated as '').
 *
 * @return string Author name, or '' when no such link is present.
 */
function getAuthor($code)
{
    // Capture only the link text up to </a>; the former greedy ".+" swallowed
    // the rest of the line, so unrelated text after the link leaked into the name.
    $pattern = '/<a href="space\.php\?uid=\d+" target="_blank" id="userinfo\d+" onmouseover="showMenu\(this\.id\)">(.*?)<\/a>/s';

    if (preg_match($pattern, (string) $code, $author_tmp) !== 1) {
        return '';
    }

    return strip_tags($author_tmp[1]);
}
// Get the content posted by the thread starter
/**
 * Extract the first post body of a thread page and make its image paths absolute.
 *
 * The match runs from the first <div id="postmessage_N" class="t_msgfont"> to
 * the first following </div>, so a post that itself contains nested <div>
 * elements is cut at the first inner closing tag.
 *
 * @param string|false $code HTML of the thread page (false is treated as '').
 *
 * @return string The post markup with relative "images/" paths rewritten to
 *                http://bbs.war3.cn/images/, or '' when no post block is found.
 */
function getContents($code)
{
    // ".*?" with the "s" modifier matches the same text as the former
    // "(.|\r|\n)*?" without creating a capture group for every character.
    $pattern = '/<div id="postmessage_\d+" class="t_msgfont">.*?<\/div>/s';

    if (preg_match($pattern, (string) $code, $contents_tmp) !== 1) {
        return '';
    }

    // Rewrite only paths that really start with "images/". The look-behind
    // skips occurrences that are already part of a longer path or an absolute
    // URL, which the former blanket replace turned into
    // "http://bbs.war3.cn/http://bbs.war3.cn/images/...".
    return preg_replace('/(?<![\/\w.])images\//', 'http://bbs.war3.cn/images/', $contents_tmp[0]);
}
// Print the thread title
/**
 * Output the thread title under its heading.
 *
 * @param string $title Title markup; tags are stripped before printing.
 */
function printTitle($title)
{
    $plainTitle = strip_tags($title);

    echo "<strong><h2>帖子标题:</h2></strong>" . $plainTitle . "<br/><br/>";
}
// Print the thread author
/**
 * Output the thread author under its heading.
 *
 * @param string $author Author markup; tags are stripped before printing.
 */
function printAuthor($author)
{
    $pieces = array(
        "<strong><h2>帖子作者:</h2></strong>",
        strip_tags($author),
        "<br/><br/>",
    );

    echo implode('', $pieces);
}
// Print the thread content
/**
 * Output the post body under its heading.
 *
 * @param string $contents Post markup, printed as-is (not escaped).
 */
function printContents($contents)
{
    $html = "<strong><h2>作者发表的内容:</h2>" . $contents . "</strong><br/>";

    print $html;
}
// Error output
/**
 * Output the "this thread does not exist" notice.
 */
function printError()
{
    $notice = "<i>该帖子不存在!</i>";

    print $notice;
}
/* End of function list ------------------------------------------------------- */

/* Log in to the forum: begin */
$url = 'http://bbs.war3.cn/logging.php?action=login';
// Values containing reserved or non-ASCII characters are URL-encoded so the
// body is a valid application/x-www-form-urlencoded payload.
$postfields = 'loginfield=username&username=1nject10n&password=xxxxxx&questionid=0&cookietime=315360000'
    . '&referer=' . urlencode('http://bbs.war3.cn/')
    . '&loginsubmit=' . urlencode('提交');
request($url, $postfields, $cookie_jar, '');
unset($postfields, $url);
/* Log in to the forum: end */

/* Fetch the thread list (threads on the first page): begin */
$url = 'http://bbs.war3.cn/forumdisplay.php?fid=57';
$code = request($url, '', $cookie_jar, '');
$threadsList = getThreadsList($code);
/* Fetch the thread list: end */

// Number of threads printed so far
$rows = 0;

/* Fetch and print every thread: begin */
foreach ($threadsList as $list) {
    $url = "http://bbs.war3.cn/viewthread.php?tid=$list";

    // Fetch the thread page BEFORE testing it. Previously isExits() ran on the
    // page left over from the previous iteration (the listing page on the
    // first pass), so the check never looked at the thread being printed.
    $code = request($url, '', $cookie_jar, '');

    // A failed transfer (false) is reported like a missing thread.
    if ($code !== false && isExits($code)) {
        // Alternate the background colour between consecutive threads.
        $color = $rows%2==0?'#00CCFF':'#FFFF33';
        echo "<div style='background-color:$color'>";
        echo "<h1>第",($rows+1),"贴:</h1><br/>";
        $author = getAuthor($code);
        printAuthor($author);
        $title = getTitle($code);
        printTitle($title);
        $contents = getContents($code);
        printContents($contents);
        echo "</div>";
        $rows++;
    } else {
        printError();
    }
    echo "-----------------------------------------------------------------------------------------<br/><br/>";
}
/* Fetch and print every thread: end */