我想抓取http://www.travelzen.com.cn/这个站的国际机票信息,思路如下:
第一步抓取flightLoading.php
// Step 1: POST the search form to flightLoading.php and save the cookies the
// site sets into a local cookie jar for the following requests.
$url = "http://www.travelzen.com.cn/flightLoading.php";
$referer = 'http://www.travelzen.com.cn/index.php?type=flight';
$cookie_jar = dirname(__FILE__)."/cookie.txt";
$timeout = 10;
$data = "flightType=Int&selectedDeparture=&selectedReturn=&selectedBookId=&fromCity=HKG&toCity=BKK&adult=1&child=0&datefrom=2011-06-20&searchToken=&type=&dep1=&arr1=&dep2=&arr2=&queryToken=&tripType=2&flightClass=All";
// $_SERVER['HTTP_USER_AGENT'] is undefined when the script runs from the CLI
// or when the visitor sends no User-Agent header, so fall back to a fixed value.
$userAgent = isset($_SERVER['HTTP_USER_AGENT'])
    ? $_SERVER['HTTP_USER_AGENT']
    : 'Mozilla/5.0 (compatible; PHP cURL)';
$ch = curl_init();
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_REFERER, $referer);
// First request of the sequence: start a fresh cookie session.
curl_setopt($ch, CURLOPT_COOKIESESSION, TRUE);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
// Cookies received here are written to this file when the handle is closed.
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_jar);
$contents = curl_exec($ch);
if ($contents === false) {
    // Surface transport errors instead of silently continuing with no data.
    trigger_error('flightLoading.php request failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);

第二步抓取初步显示页面
// Step 2: fetch the preliminary result page, re-using the cookies saved by the
// previous request.
$url = "http://www.travelzen.com.cn/getFlightSchPreview.php";
$referer = "http://www.travelzen.com.cn/flightLoading.php";
$ch = curl_init();
$timeout = 10;
// Build the body with http_build_query() so the non-ASCII city names are
// percent-encoded, as application/x-www-form-urlencoded requires.
// NOTE(review): searchToken is posted empty. If the site issues a token in an
// earlier response it presumably has to be extracted and sent here - confirm.
$data = http_build_query(array(
    'movePrevNext_dep' => '',
    'movePrevNext_ret' => '',
    'departureCityIATACode' => 'HKG',
    'departureCountryIATACode' => 'HK',
    'destinationCityIATACode' => 'BKK',
    'destinationCountryIATACode' => 'TH',
    'isReturn' => '0',
    'typ' => '2',
    'cityfromDomestic' => '香港',
    'citytoDomestic' => '曼谷',
    'datefrom' => '2011-06-20',
    'dateto' => '2011-06-21',
    'numOfAdult' => '1',
    'numOfChild' => '0',
    'flightClass' => 'All',
    'nonstop' => '',
    'searchToken' => '',
), '', '&');
// $_SERVER['HTTP_USER_AGENT'] is undefined under the CLI or when the visitor
// sends no User-Agent header, so fall back to a fixed value.
$userAgent = isset($_SERVER['HTTP_USER_AGENT'])
    ? $_SERVER['HTTP_USER_AGENT']
    : 'Mozilla/5.0 (compatible; PHP cURL)';
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_REFERER, $referer);
// CURLOPT_COOKIESESSION is deliberately NOT set here: it tells libcurl to
// ignore the session cookies it loads from the cookie file, which would throw
// away the session established by the first request.
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_jar);
// Also write cookies back so anything set by this response reaches step 3.
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_jar);
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
$contents = curl_exec($ch);
if ($contents === false) {
    // Surface transport errors instead of silently continuing with no data.
    trigger_error('getFlightSchPreview.php request failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);

第三步抓取最后刷新的真实数据
// Step 3: fetch the final result page, re-using the cookies from the earlier
// requests.
// NOTE(review): if the result rows are filled in by JavaScript after the page
// loads, this request can only return the page skeleton - check which URL the
// browser actually calls for the data.
$url = "http://www.travelzen.com.cn/flightResult.php";
$referer = "http://www.travelzen.com.cn/flightLoading.php";
$ch = curl_init();
$timeout = 10;
// Build the body with http_build_query() so the non-ASCII city names are
// percent-encoded, as application/x-www-form-urlencoded requires.
// NOTE(review): searchToken is posted empty. If the site issues a token in an
// earlier response it presumably has to be extracted and sent here - confirm.
$data = http_build_query(array(
    'movePrevNext_dep' => '',
    'movePrevNext_ret' => '',
    'departureCityIATACode' => 'HKG',
    'departureCountryIATACode' => 'HK',
    'destinationCityIATACode' => 'BKK',
    'destinationCountryIATACode' => 'TH',
    'isReturn' => '0',
    'typ' => '2',
    'cityfromDomestic' => '香港',
    'citytoDomestic' => '曼谷',
    'datefrom' => '2011-06-20',
    'dateto' => '2011-06-21',
    'numOfAdult' => '1',
    'numOfChild' => '0',
    'flightClass' => 'All',
    'nonstop' => '',
    'searchToken' => '',
), '', '&');
// $_SERVER['HTTP_USER_AGENT'] is undefined under the CLI or when the visitor
// sends no User-Agent header, so fall back to a fixed value.
$userAgent = isset($_SERVER['HTTP_USER_AGENT'])
    ? $_SERVER['HTTP_USER_AGENT']
    : 'Mozilla/5.0 (compatible; PHP cURL)';
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
curl_setopt($ch, CURLOPT_POST, 1);
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_REFERER, $referer);
// CURLOPT_COOKIESESSION is deliberately NOT set here: it tells libcurl to
// ignore the session cookies it loads from the cookie file, which would throw
// away the session established by the earlier requests.
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_jar);
// Keep the cookie file up to date with anything this response sets.
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_jar);
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
$contents = curl_exec($ch);
if ($contents === false) {
    // Surface transport errors instead of silently continuing with no data.
    trigger_error('flightResult.php request failed: ' . curl_error($ch), E_USER_WARNING);
}
curl_close($ch);

整个过程到第2步都可以正常抓回,但第3步最后只抓回了架子,里面的数据没抓回来;取消第2步,直接抓第3步也是一样的效果。希望life169能帮我看看,万分感谢!有高手有时间帮忙看看也行,分还可以加。

解决方案 »

  1.   

    我这里有一个抓取论坛的例子,你可以稍作修改:
    // Let the crawl run without PHP's execution time limit.
    set_time_limit(0);
    // File in which cookies are stored between requests.
    $cookie_jar = '/tmp/cookie.tmp';
    // Simulate an HTTP request
    /**
     * Send an HTTP request with cURL and return the response body.
     *
     * Redirects are followed, and the same file is used both to load and to
     * store cookies, so a login performed by one call is visible to later calls.
     * A POST is sent only when $postfields is non-empty; otherwise the page is
     * fetched with a plain GET. The port is taken from the URL rather than
     * being forced to 80, so https and non-default ports work.
     *
     * @param string       $url        Address to request.
     * @param string|array $postfields POST body; pass '' for a GET request.
     * @param string       $cookie_jar Path of the cookie file to read and write.
     * @param string       $referer    Value of the Referer header.
     * @return string|false Response body, or false when the transfer fails.
     */
    function request($url,$postfields,$cookie_jar,$referer){
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        $options = array(
            CURLOPT_URL => $url,
            CURLOPT_HEADER => 0,
            CURLOPT_NOBODY => 0,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_FOLLOWLOCATION => 1,
            CURLOPT_COOKIEJAR => $cookie_jar,
            CURLOPT_COOKIEFILE => $cookie_jar,
            CURLOPT_REFERER => $referer
        );
        // Only switch to POST when there is actually a body to send.
        $hasBody = $postfields !== '' && $postfields !== null && $postfields !== array();
        if ($hasBody) {
            $options[CURLOPT_POST] = 1;
            $options[CURLOPT_POSTFIELDS] = $postfields;
        }
        curl_setopt_array($ch, $options);
        $code = curl_exec($ch);
        curl_close($ch);
        return $code;
    }
    // Get the thread list
    /**
     * Collect the thread ids linked from a forum listing page.
     *
     * A link counts when `<a href="viewthread.php?tid=N` follows `<!--`,
     * separated only by characters from the set `.`, `|`, CR and LF.
     *
     * @param string $code HTML of the listing page.
     * @return array The captured tid digit strings, in document order.
     */
    function getThreadsList($code){
        $pattern = '/<!--[.|\r|\n]*?<a href=\"viewthread.php\?tid=(\d+)/';
        $matches = array();
        preg_match_all($pattern, $code, $matches);
        // Index 1 holds the first capture group of every match.
        return $matches[1];
    }
    // Check whether the thread exists
    /**
     * Tell whether a fetched thread page is a real thread.
     *
     * @param string $code HTML of the thread page.
     * @return bool false when the page contains the forum's "thread does not
     *              exist / deleted / under review" notice, true otherwise.
     */
    function isExits($code){
        $notice = '/<p>指定的主题不存在或已被删除或正在被审核,请返回。<\/p>/';
        $hit = array();
        preg_match($notice, $code, $hit);
        if (isset($hit[0])) {
            return false;
        }
        return true;
    }
    // Get the thread title
    /**
     * Extract the first <h1>...</h1> element of a thread page.
     *
     * The element is returned with its tags; printTitle() strips them.
     *
     * @param string $code HTML of the thread page.
     * @return string The matched <h1> element, or '' when the page has none.
     */
    function getTitle($code){
        // `.*?` with the `s` flag lazily spans any characters, including
        // newlines, up to the first closing tag. A bracketed `[<\/h1>]*` would
        // be a character class and never reach past the opening tag.
        if (preg_match('/<h1>.*?<\/h1>/s', $code, $title_tmp) !== 1) {
            return '';
        }
        return $title_tmp[0];
    }
    // Get the thread author
    /**
     * Extract the author name from a thread page.
     *
     * Looks for the user-profile link (`space.php?uid=...` with a
     * `userinfoN` id), takes everything from that link to the end of its
     * line, and strips the HTML tags from it.
     *
     * @param string $code HTML of the thread page.
     * @return string The tag-stripped author text, or '' when no profile link
     *                is found.
     */
    function getAuthor($code){
        $pattern = '/<a href=\"space.php\?uid=\d+\" target=\"_blank\" id=\"userinfo\d+\" onmouseover=\"showMenu\(this\.id\)\">.+/';
        // Guard against pages without the link instead of reading a missing index.
        if (preg_match($pattern, $code, $author_tmp) !== 1) {
            return '';
        }
        return strip_tags($author_tmp[0]);
    }
    // Get the content posted by the original poster
    /**
     * Extract the first post body of a thread page.
     *
     * Matches the first `<div id="postmessage_N" class="t_msgfont">` element
     * up to the first closing `</div>`, then prefixes every `images/`
     * occurrence with the forum's base URL.
     *
     * @param string $code HTML of the thread page.
     * @return string The post HTML with rewritten image paths, or '' when the
     *                page has no post body.
     */
    function getContents($code){
        // The `s` flag lets `.` match newlines; this matches the same text as
        // `(.|\r|\n)*?` without a capture group repeated once per character.
        $pattern = '/<div id=\"postmessage_\d+\" class=\"t_msgfont\">.*?<\/div>/s';
        if (preg_match($pattern, $code, $contents_tmp) !== 1) {
            return '';
        }
        // NOTE(review): this also rewrites `images/` inside URLs that are
        // already absolute - confirm the forum only emits relative paths.
        return str_replace('images/', 'http://bbs.war3.cn/images/', $contents_tmp[0]);
    }
    // Print the thread title
    /**
     * Echo the thread title under a heading, with any HTML tags stripped.
     *
     * @param string $title Title markup as returned by getTitle().
     */
    function printTitle($title){
        $plain = strip_tags($title);
        echo "<strong><h2>帖子标题:</h2></strong>" . $plain . "<br/><br/>";
    }
    // Output the thread author
    /**
     * Echo the thread author under a heading, with any HTML tags stripped.
     *
     * @param string $author Author text as returned by getAuthor().
     */
    function printAuthor($author){
        $plain = strip_tags($author);
        echo "<strong><h2>帖子作者:</h2></strong>" . $plain . "<br/><br/>";
    }
    // Print the thread content
    /**
     * Echo the post body under a heading. The HTML is emitted as-is.
     *
     * @param string $contents Post HTML as returned by getContents().
     */
    function printContents($contents){
        $heading = "<strong><h2>作者发表的内容:</h2>";
        echo $heading . $contents . "</strong><br/>";
    }
    // Error
    /**
     * Echo the "this thread does not exist" notice.
     */
    function printError(){
        print('<i>该帖子不存在!</i>');
    }
    /* ---- end of function list ---- */

    /* Log in to the forum: begin */
    // The login response itself is not needed; the call is made for the
    // cookies it stores in $cookie_jar.
    request(
        'http://bbs.war3.cn/logging.php?action=login',
        'loginfield=username&username=1nject10n&password=xxxxxx&questionid=0&cookietime=315360000&referer=http://bbs.war3.cn/&loginsubmit=提交',
        $cookie_jar,
        ''
    );
    /* Log in to the forum: end */

    /* Fetch the thread list (threads on the first page): begin */
    $url = 'http://bbs.war3.cn/forumdisplay.php?fid=57';
    $code = request($url, '', $cookie_jar, '');
    $threadsList = getThreadsList($code);
    /* Fetch the thread list: end */

    // Thread counter
    // Number of threads printed so far; also selects the alternating row colour.
    $rows = 0;

    /* Fetch and print every thread on the list: begin */
    foreach ($threadsList as $list) {
        $url = "http://bbs.war3.cn/viewthread.php?tid=$list";
        // Download the thread BEFORE testing it: the existence check has to
        // look at this thread's page, not at whatever page was fetched last.
        $code = request($url, '', $cookie_jar, '');

        // request() returns false on a failed transfer; treat that like a
        // missing thread.
        if (is_string($code) && isExits($code)) {
            $color = $rows % 2 == 0 ? '#00CCFF' : '#FFFF33';
            echo "<div style='background-color:$color'>";
            echo "<h1>第", ($rows + 1), "贴:</h1><br/>";
            $author = getAuthor($code);
            printAuthor($author);
            $title = getTitle($code);
            printTitle($title);
            $contents = getContents($code);
            printContents($contents);
            echo "</div>";
            $rows++;
        } else {
            printError();
        }
        echo "-----------------------------------------------------------------------------------------<br/><br/>";
    }
    /* Fetch thread source: end */
      

  2.   

    大神life169现身,针对此类问题搞一个集中答疑吧!