我想抓取http://www.travelzen.com.cn/这个站的国际机票信息,思路如下:
第一步抓取flightLoading.php
// Step 1: submit the search form to flightLoading.php. The cookie engine is
// switched on via CURLOPT_COOKIEJAR, so cookies the server sets are written to
// $cookie_jar once the handle is closed, for the later requests to replay.
$cookie_jar = dirname(__FILE__) . '/cookie.txt';
$timeout = 10;
$url = 'http://www.travelzen.com.cn/flightLoading.php';
$referer = 'http://www.travelzen.com.cn/index.php?type=flight';

// Form body, one name=value pair per entry (joined with '&' exactly as posted).
$data = implode('&', array(
    'flightType=Int',
    'selectedDeparture=',
    'selectedReturn=',
    'selectedBookId=',
    'fromCity=HKG',
    'toCity=BKK',
    'adult=1',
    'child=0',
    'datefrom=2011-06-20',
    'searchToken=',
    'type=',
    'dep1=',
    'arr1=',
    'dep2=',
    'arr2=',
    'queryToken=',
    'tripType=2',
    'flightClass=All',
));

$ch = curl_init($url);
curl_setopt_array($ch, array(
    CURLOPT_POSTFIELDS     => $data,
    CURLOPT_POST           => 1,
    CURLOPT_RETURNTRANSFER => 1,    // return the body instead of printing it
    CURLOPT_REFERER        => $referer,
    CURLOPT_COOKIESESSION  => TRUE, // start a fresh cookie session
    CURLOPT_CONNECTTIMEOUT => $timeout,
    CURLOPT_USERAGENT      => $_SERVER['HTTP_USER_AGENT'], // forward the caller's browser UA
    CURLOPT_COOKIEJAR      => $cookie_jar,
));
$contents = curl_exec($ch);
curl_close($ch);
第二步抓取初步显示页面
// Step 2: request the preliminary result page from getFlightSchPreview.php,
// reading the cookies saved in $cookie_jar by the first request.
$timeout = 10;
$url = 'http://www.travelzen.com.cn/getFlightSchPreview.php';
$referer = 'http://www.travelzen.com.cn/flightLoading.php';

// Form body, one name=value pair per entry (joined with '&' exactly as posted).
$data = implode('&', array(
    'movePrevNext_dep=',
    'movePrevNext_ret=',
    'departureCityIATACode=HKG',
    'departureCountryIATACode=HK',
    'destinationCityIATACode=BKK',
    'destinationCountryIATACode=TH',
    'isReturn=0',
    'typ=2',
    'cityfromDomestic=香港',
    'citytoDomestic=曼谷',
    'datefrom=2011-06-20',
    'dateto=2011-06-21',
    'numOfAdult=1',
    'numOfChild=0',
    'flightClass=All',
    'nonstop=',
    'searchToken=',
));

$ch = curl_init($url);
curl_setopt_array($ch, array(
    CURLOPT_POSTFIELDS     => $data,
    CURLOPT_POST           => 1,
    CURLOPT_RETURNTRANSFER => 1,    // return the body instead of printing it
    CURLOPT_REFERER        => $referer,
    // Per the PHP manual this marks a new cookie "session": session cookies
    // loaded from the cookie file are ignored.
    CURLOPT_COOKIESESSION  => TRUE,
    CURLOPT_CONNECTTIMEOUT => $timeout,
    CURLOPT_COOKIEFILE     => $cookie_jar,
    CURLOPT_USERAGENT      => $_SERVER['HTTP_USER_AGENT'], // forward the caller's browser UA
));
$contents = curl_exec($ch);
curl_close($ch);
第三步抓取最后刷新的真实数据
// Step 3: request the final, refreshed result page from flightResult.php,
// again reading the cookies stored in $cookie_jar.
$timeout = 10;
$url = 'http://www.travelzen.com.cn/flightResult.php';
$referer = 'http://www.travelzen.com.cn/flightLoading.php';

// Form body, one name=value pair per entry (joined with '&' exactly as posted).
$data = implode('&', array(
    'movePrevNext_dep=',
    'movePrevNext_ret=',
    'departureCityIATACode=HKG',
    'departureCountryIATACode=HK',
    'destinationCityIATACode=BKK',
    'destinationCountryIATACode=TH',
    'isReturn=0',
    'typ=2',
    'cityfromDomestic=香港',
    'citytoDomestic=曼谷',
    'datefrom=2011-06-20',
    'dateto=2011-06-21',
    'numOfAdult=1',
    'numOfChild=0',
    'flightClass=All',
    'nonstop=',
    'searchToken=',
));

$ch = curl_init($url);
curl_setopt_array($ch, array(
    CURLOPT_POSTFIELDS     => $data,
    CURLOPT_POST           => 1,
    CURLOPT_RETURNTRANSFER => 1,    // return the body instead of printing it
    CURLOPT_REFERER        => $referer,
    // Per the PHP manual this marks a new cookie "session": session cookies
    // loaded from the cookie file are ignored.
    CURLOPT_COOKIESESSION  => TRUE,
    CURLOPT_CONNECTTIMEOUT => $timeout,
    CURLOPT_COOKIEFILE     => $cookie_jar,
    CURLOPT_USERAGENT      => $_SERVER['HTTP_USER_AGENT'], // forward the caller's browser UA
));
$contents = curl_exec($ch);
curl_close($ch);
整个过程,到第2步都可以正常抓回,但第3步最后只抓回了架子,里面的数据没抓回来;取消第2步,直接抓第3步也是一样的效果。希望life169能帮我看看,万分感谢,也请有时间的高手帮忙看看
http://topic.csdn.net/u/20110620/15/549ba42b-102d-4244-a7ee-216753f4329a.html
能解决,这个帖子的100分也一并送上。

解决方案 »

  1.   

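    没有兴趣玩抓取,何况你代码中的参数已经过期了,现在是7月份了你还在抓6月的数据,当然是抓不到的了。
    就代码而言:
    CURLOPT_COOKIEJAR 是发送的 cookie 
    CURLOPT_COOKIEFILE 是接收的 cookie
    你刚好弄反了
    (编者注:按 PHP 手册,CURLOPT_COOKIEJAR 是句柄关闭时用来保存 cookie 的文件,CURLOPT_COOKIEFILE 是读取并随请求发送 cookie 的文件;原代码第一步保存、后两步读取的用法并没有弄反。)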
      

  2.   

    500块是真的吗?发现你第二步加了CURLOPT_COOKIESESSION,为啥啊?
    而且第三步是做什么用的,和第二步有区别?
    注释掉CURLOPT_COOKIESESSION可以获取到机票信息列表,这个东西一般都不设置什么true,有什么cookie就直接请求到服务端算了。
    <?php
    /**
     * Fetch the travelzen.com.cn international flight list (HKG -> BKK) with two POSTs:
     *   1. flightLoading.php        - starts the search; cookies are saved to cookie.txt
     *   2. getFlightSchPreview.php  - returns the result list, sending the saved cookies
     *
     * Kept compatible with PHP 5.3 (no short array syntax, no scalar type hints).
     */

    /**
     * POST a form with cURL and return the response body.
     *
     * @param string $url           Target URL.
     * @param string $referer       Value sent as the Referer header.
     * @param array  $fields        Form fields (name => value); URL-encoded by this function.
     * @param array  $cookieOptions cURL options selecting the cookie behaviour, e.g.
     *                              array(CURLOPT_COOKIEJAR => $file) to save received cookies or
     *                              array(CURLOPT_COOKIEFILE => $file) to send previously saved ones.
     * @param int    $timeout       Connect timeout in seconds.
     *
     * @return string|false Response body, or false (after raising E_USER_WARNING) on failure.
     */
    function postForm($url, $referer, array $fields, array $cookieOptions, $timeout = 10)
    {
        // HTTP_USER_AGENT only exists when this runs behind a web request; under
        // CLI/cron it is undefined, so fall back to a fixed UA instead of sending none.
        $userAgent = isset($_SERVER['HTTP_USER_AGENT'])
            ? $_SERVER['HTTP_USER_AGENT']
            : 'Mozilla/5.0 (compatible; PHP cURL)';

        $ch = curl_init();
        // CURLOPT_COOKIESESSION is deliberately NOT set: per the PHP manual it makes
        // libcurl ignore session cookies loaded from the cookie file, which would
        // throw away the cookies saved by the previous request.
        curl_setopt_array($ch, array(
            CURLOPT_URL            => $url,
            CURLOPT_POST           => true,
            // Explicit '&' separator so the arg_separator.output ini setting cannot
            // change the body; values (including the Chinese city names) get URL-encoded.
            CURLOPT_POSTFIELDS     => http_build_query($fields, '', '&'),
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_REFERER        => $referer,
            CURLOPT_CONNECTTIMEOUT => $timeout,
            CURLOPT_USERAGENT      => $userAgent,
        ) + $cookieOptions);

        $body = curl_exec($ch);
        $error = ($body === false) ? curl_error($ch) : '';
        // Closing the handle is what writes the cookies to CURLOPT_COOKIEJAR.
        curl_close($ch);

        if ($body === false) {
            trigger_error('POST ' . $url . ' failed: ' . $error, E_USER_WARNING);
            return false;
        }

        return $body;
    }

    $cookie_jar = dirname(__FILE__) . '/cookie.txt';

    // Search dates, shared by both requests. NOTE(review): the thread reports that
    // dates in the past return no flights - update these before running.
    $dateFrom = '2011-06-20';
    $dateTo   = '2011-06-21';

    // Step 1: start the search; cookies set by the server are saved to $cookie_jar.
    $contents = postForm(
        'http://www.travelzen.com.cn/flightLoading.php',
        'http://www.travelzen.com.cn/index.php?type=flight',
        array(
            'flightType'        => 'Int',
            'selectedDeparture' => '',
            'selectedReturn'    => '',
            'selectedBookId'    => '',
            'fromCity'          => 'HKG',
            'toCity'            => 'BKK',
            'adult'             => '1',
            'child'             => '0',
            'datefrom'          => $dateFrom,
            'searchToken'       => '',
            'type'              => '',
            'dep1'              => '',
            'arr1'              => '',
            'dep2'              => '',
            'arr2'              => '',
            'queryToken'        => '',
            'tripType'          => '2',
            'flightClass'       => 'All',
        ),
        array(CURLOPT_COOKIEJAR => $cookie_jar)
    );
    if ($contents === false) {
        exit(1);
    }

    // Step 2: fetch the preliminary result page, sending the cookies saved by step 1.
    $contents = postForm(
        'http://www.travelzen.com.cn/getFlightSchPreview.php',
        'http://www.travelzen.com.cn/flightLoading.php',
        array(
            'movePrevNext_dep'           => '',
            'movePrevNext_ret'           => '',
            'departureCityIATACode'      => 'HKG',
            'departureCountryIATACode'   => 'HK',
            'destinationCityIATACode'    => 'BKK',
            'destinationCountryIATACode' => 'TH',
            'isReturn'                   => '0',
            'typ'                        => '2',
            'cityfromDomestic'           => '香港',
            'citytoDomestic'             => '曼谷',
            'datefrom'                   => $dateFrom,
            'dateto'                     => $dateTo,
            'numOfAdult'                 => '1',
            'numOfChild'                 => '0',
            'flightClass'                => 'All',
            'nonstop'                    => '',
            'searchToken'                => '',
        ),
        array(CURLOPT_COOKIEFILE => $cookie_jar)
    );
    if ($contents === false) {
        exit(1);
    }

    echo $contents;
    ?>