最近在做一个spider,在爬取http://news.fblife.com/时遇到个问题:这个网站应该是做了防爬取的处理,抓取时只能抓回16K,之后就一直等待。求教。

解决方案 »

  1.   

    可以自己用fsockopen模拟浏览器访问
    自己写采集器还是越底层的编程越好~
    给你个函数
    /**
     * Fetch a URL over a raw socket while imitating a browser request.
     *
     * Sends an HTTP/1.0 GET (or a POST when $conf['post'] is non-empty) and
     * returns the response body with the header block stripped. 301/302
     * responses are followed, carrying cookies forward.
     *
     * @param string $url      Absolute http:// or https:// URL.
     * @param int    $time_out Socket read timeout, also used as the overall
     *                         transfer deadline, in seconds.
     * @param string $cookie   Value for the Cookie request header.
     * @param array  $conf     Optional overrides:
     *                           'limit'         max body bytes to read (0 = no limit)
     *                           'post'          POST body (string, or array to URL-encode)
     *                           'cookie'        overrides $cookie
     *                           'ip'            connect to this address instead of the URL host
     *                           'timeout'       overrides $time_out
     *                           'block'         TRUE for blocking reads (default)
     *                           'max_redirects' redirects still allowed (default 5)
     *                         'bysocket' is accepted for compatibility and ignored.
     *                         Other keys are ignored.
     * @return string|false Response body; '' when $conf is not an array;
     *                      false on connection failure, timeout, or a bad redirect.
     */
    function get_html($url, $time_out = 10, $cookie = '', $conf = array()) {
        $url = trim($url);
        $cookie = trim($cookie);
        if (!is_array($conf)) {
            return '';
        }

        // parse_url() returns false for seriously malformed URLs.
        $parts = parse_url($url);
        if (!is_array($parts)) {
            $parts = array();
        }
        $scheme = isset($parts['scheme']) ? strtolower($parts['scheme']) : 'http';
        $secure = ($scheme === 'https');
        $host = isset($parts['host']) ? $parts['host'] : '';
        $default_port = $secure ? 443 : 80;
        $port = !empty($parts['port']) ? (int) $parts['port'] : $default_port;
        $req_path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
        $path = $req_path;
        if (isset($parts['query']) && $parts['query'] !== '') {
            // Keep the query string even when the URL has no path.
            $path .= '?' . $parts['query'];
        }
        // The Host header must carry the port when it is not the scheme default.
        $authority = $host . ($port == $default_port ? '' : ':' . $port);
        $base_url = ($secure ? 'https' : 'http') . '://' . $authority;
        $boardurl = $base_url . '/';

        // Read only the known options; importing arbitrary $conf keys as
        // variables could silently overwrite locals such as $host or $path.
        $opt = array_merge(array(
            'limit' => 0,
            'post' => '',
            'cookie' => $cookie,
            'bysocket' => FALSE,
            'ip' => '',
            'timeout' => $time_out,
            'block' => TRUE,
            'max_redirects' => 5,
        ), $conf);
        $limit = (int) $opt['limit'];
        $post = $opt['post'];
        $cookie = trim((string) $opt['cookie']);
        $ip = $opt['ip'];
        $timeout = (int) $opt['timeout'];
        $block = (bool) $opt['block'];
        $max_redirects = (int) $opt['max_redirects'];

        // User agent to imitate.
        $agent = 'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+CIBA;+.NET+CLR+2.0.50727;+360SE)';
        $is_post = (bool) $post;
        if ($is_post && is_array($post)) {
            $post = http_build_query($post);
        }

        $headers = array();
        $headers[] = ($is_post ? 'POST' : 'GET') . " $path HTTP/1.0";
        $headers[] = 'Accept: */*';
        $headers[] = "Referer: $boardurl";
        $headers[] = 'Accept-Language: zh-cn';
        if ($is_post) {
            $headers[] = 'Content-Type: application/x-www-form-urlencoded';
        }
        $headers[] = "User-Agent: $agent";
        // Sent for GET as well as POST: name-based virtual hosts need it.
        $headers[] = "Host: $authority";
        if ($is_post) {
            $headers[] = 'Content-Length: ' . strlen($post);
        }
        $headers[] = 'Connection: Close';
        // An empty Accept-Encoding asks the server not to compress the body.
        $headers[] = 'Accept-Encoding: ';
        if ($is_post) {
            $headers[] = 'Cache-Control: no-cache';
        }
        if ($cookie !== '') {
            $headers[] = "Cookie: $cookie";
        }
        $out = implode("\r\n", $headers) . "\r\n\r\n" . ($is_post ? $post : '');

        // https needs the ssl:// transport (requires the openssl extension).
        $target = ($secure ? 'ssl://' : '') . ($ip ? $ip : $host);
        $fp = @fsockopen($target, $port, $errno, $errstr, 5);
        if (!$fp) {
            return false;
        }

        $start = time();
        stream_set_blocking($fp, $block);
        stream_set_timeout($fp, $timeout);
        if (@fwrite($fp, $out) === false) {
            @fclose($fp);
            return false;
        }
        $status = stream_get_meta_data($fp);
        if ($status['timed_out']) {
            @fclose($fp);
            return false;
        }

        // Read the header block up to the first blank line.
        $h = '';
        while (!feof($fp)) {
            $line = @fgets($fp);
            if ($line === "\r\n" || $line === "\n") {
                break;
            }
            if ($line === false) {
                // Nothing was read: give up on timeout instead of spinning.
                $status = stream_get_meta_data($fp);
                if ($status['timed_out'] || (time() - $start) > $timeout) {
                    @fclose($fp);
                    return false;
                }
                if (!$block) {
                    usleep(10000);
                }
                continue;
            }
            $h .= $line;
        }

        // Follow 301/302, judged from the status line only.
        if (preg_match('#\AHTTP/\S+\s+(\d{3})#', $h, $m) && ($m[1] === '301' || $m[1] === '302')) {
            @fclose($fp);
            if ($max_redirects <= 0 || !preg_match('/^Location:\s*(\S+)/im', $h, $loc)) {
                return false;
            }
            $next = $loc[1];
            if (strpos($next, '//') === 0) {
                // Scheme-relative target.
                $next = ($secure ? 'https' : 'http') . ':' . $next;
            } elseif (!preg_match('#^[a-z][a-z0-9+.\-]*://#i', $next)) {
                if ($next[0] === '/') {
                    $next = $base_url . $next;
                } else {
                    // Relative to the directory of the current request path.
                    $slash = strrpos($req_path, '/');
                    $dir = ($slash === false) ? '/' : substr($req_path, 0, $slash + 1);
                    $next = $base_url . $dir . $next;
                }
            }
            // Carry existing cookies plus every name=value pair the server set.
            $jar = ($cookie === '') ? array() : array($cookie);
            if (preg_match_all('/^Set-Cookie:\s*([^;\r\n]+)/im', $h, $set)) {
                foreach ($set[1] as $pair) {
                    $jar[] = trim($pair);
                }
            }
            return get_html($next, $timeout, implode('; ', $jar), array(
                'limit' => $limit,
                'block' => $block,
                'max_redirects' => $max_redirects - 1,
            ));
        }

        // Read the body until EOF, the byte limit, or the deadline.
        $chunk_size = 8192;
        $remaining = $limit;
        $return = '';
        while (!feof($fp)) {
            if ((time() - $start) > $timeout) {
                @fclose($fp);
                return false;
            }
            $want = ($limit > 0 && $remaining < $chunk_size) ? $remaining : $chunk_size;
            $data = @fread($fp, $want);
            // Refresh the metadata on every pass so a read timeout is seen.
            $status = stream_get_meta_data($fp);
            if ($status['timed_out']) {
                @fclose($fp);
                return false;
            }
            if ($data === false) {
                break;
            }
            if ($data === '') {
                if (!$block) {
                    // Non-blocking socket with no data yet: back off briefly.
                    usleep(10000);
                }
                continue;
            }
            $return .= $data;
            if ($limit > 0) {
                $remaining -= strlen($data);
                if ($remaining <= 0) {
                    break;
                }
            }
        }
        @fclose($fp);
        return $return;
    }
      

  2.   

    不知道你的spider是怎么写的。我个人认为,要么你连1k都抓不到,要么就是网络不好,抓到16k就失败了……
    另外这是不是广告贴?
      

  3.   

    curl抓取的确是不行的,16k就断了
    以下是源码
    /**
     * Fetch a URL with cURL and return the raw response plus transfer statistics.
     *
     * @param string      $url             URL to request.
     * @param string      $method          'GET' or 'POST', case-insensitive; any
     *                                     other value performs a GET.
     * @param int         $connect_timeout Connection timeout in seconds.
     * @param int         $timeout         Total transfer timeout in seconds.
     * @param mixed       $parameter       POST fields, used only for POST.
     * @param string|null $userpwd         "user:password" credentials, if any.
     * @return array On failure: array('errinfo' => message).
     *               On success: 'errinfo' (empty string), 'status' (HTTP code),
     *               'time' (seconds spent in curl_exec), 'htmlpage' (response
     *               headers followed by the body, since CURLOPT_HEADER is on),
     *               'downloadsize' and 'announcesize'.
     */
    function my_http($url, $method, $connect_timeout, $timeout, $parameter=NULL, $userpwd=NULL){
            $ci = curl_init();
            curl_setopt($ci, CURLOPT_ENCODING, 'gzip');
            curl_setopt($ci, CURLOPT_CONNECTTIMEOUT, $connect_timeout);
            curl_setopt($ci, CURLOPT_TIMEOUT, $timeout);
            curl_setopt($ci, CURLOPT_RETURNTRANSFER, TRUE);
            // CURLOPT_BINARYTRANSFER was dropped: it is a deprecated no-op and
            // CURLOPT_RETURNTRANSFER already returns the raw bytes.
            curl_setopt($ci, CURLOPT_HEADER, TRUE);
    //      curl_setopt($ci, CURLOPT_RANGE, "0-16000");
            // NOPROGRESS = FALSE turns on cURL's progress meter (debugging aid).
            curl_setopt($ci, CURLOPT_NOPROGRESS, FALSE);
            curl_setopt($ci, CURLOPT_REFERER, "http://www.fblife.com/");
            curl_setopt($ci, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible;  Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
            curl_setopt($ci, CURLOPT_URL, $url);
            if (!empty($userpwd)) {
                    curl_setopt($ci, CURLOPT_USERPWD, $userpwd);
            }

            // Normalise the case so that 'post' is not silently sent as a GET.
            switch (strtoupper((string) $method)) {
            case 'POST':
                    curl_setopt($ci, CURLOPT_POST, TRUE);
                    if (!empty($parameter)) {
                            curl_setopt($ci, CURLOPT_POSTFIELDS, $parameter);
                    }
                    break;
            case 'GET':
            default:
                    break;
            }

            $btime = microtime(true);
            $htmlpage = curl_exec($ci);
            if ($htmlpage === false) {
                    $errinfo = curl_error($ci);
                    // Release the handle on the failure path as well.
                    curl_close($ci);
                    return array('errinfo' => $errinfo);
            }
            $etime = microtime(true);

            $status = curl_getinfo($ci, CURLINFO_HTTP_CODE);
            $size1 = curl_getinfo($ci, CURLINFO_SIZE_DOWNLOAD);
            $size2 = curl_getinfo($ci, CURLINFO_CONTENT_LENGTH_DOWNLOAD);
            $time = $etime - $btime;
            curl_close($ci);

            $response = array
    ("errinfo"=>"","status"=>$status,"time"=>$time,"htmlpage"=>$htmlpage,"downloadsize"=>$size1,"announcesize"=>$size2);
            return $response;
    }
    //my_http("http://www.fblife.com/favicon.ico",'get',10,10);
    // Demo: fetch the news front page, then print its length and content.
    $response = my_http("http://news.fblife.com",'get',60,10);
    echo $response['errinfo'];
    // On failure my_http() returns only 'errinfo', so fall back to an empty page
    // instead of reading a missing key.
    $htmlpage = isset($response['htmlpage']) ? $response['htmlpage'] : '';
    // The site is encoded as UTF-8 and the page is printed unconverted. The
    // earlier UTF-8 -> GBK conversion was removed: its result was overwritten
    // on the very next line and never used.
    echo strlen($htmlpage)."\n";
    echo $htmlpage."\n";
      

  4.   

    这个函数里conf是啥,没看懂啊