PHP的curl如何模拟浏览器进行网页抓取呢?还有就是如何控制curl的执行时间,抓得太快,IP可能会被封掉!!怎么搞呢??看过文档,正在研究,想听大家的看法,或者给个例子。

解决方案 »

  1.   

    控制curl的执行时间,抓得太快,IP可能会被封掉!! 
    -----------会这样么,没控制过时间,抓就是一瞬间的事,就是正则匹配太麻烦,特别是要模拟登录的~~~
      

  2.   

    // Forum reply: "OP, try the code below."
    /**
     * Small wrapper around one cURL request that sends browser-like headers
     * (User-Agent, Referer, cookies) and keeps the response body and HTTP status.
     *
     * Typical use: construct with a URL, optionally call the setters, call
     * createCurl() to perform the request, then read getHttpStatus() and cast
     * the object to string to obtain the body.
     */
    class mycurl
    {
        /** @var string User-Agent header sent with the request. */
        protected $_useragent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';
        /** @var string URL to request. */
        protected $_url;
        /** @var bool Whether redirects (Location headers) are followed. */
        protected $_followlocation;
        /** @var int Value passed to CURLOPT_TIMEOUT (seconds). */
        protected $_timeout;
        /** @var int Value passed to CURLOPT_MAXREDIRS. */
        protected $_maxRedirects;
        /** @var string File used both as cookie jar and as cookie source. */
        protected $_cookieFileLocation = 'c:\cookies\curl_cookie.txt';
        /** @var bool True once setPost() has been called. */
        protected $_post = false;
        /** @var mixed POST payload handed to CURLOPT_POSTFIELDS. */
        protected $_postFields;
        /** @var string Referer header sent with the request. */
        protected $_referer = "http://www.google.com";

        /** @var mixed Declared by the original code but never used in this class. */
        protected $_session;
        /** @var string|false|null Result of curl_exec(); null until createCurl() runs. */
        protected $_webpage;
        /** @var bool Whether response headers are included in the output. */
        protected $_includeHeader;
        /** @var bool Whether the body is skipped (CURLOPT_NOBODY). */
        protected $_noBody;
        /** @var int|null HTTP status of the last request; null until createCurl() runs. */
        protected $_status;
        /** @var bool Whether CURLOPT_BINARYTRANSFER is requested. */
        protected $_binaryTransfer;

        /**
         * @param string $url             URL to request.
         * @param bool   $followlocation  Follow HTTP redirects.
         * @param int    $timeOut         Timeout passed to CURLOPT_TIMEOUT.
         * @param int    $maxRedirecs     Maximum number of redirects to follow.
         * @param bool   $binaryTransfer  Request CURLOPT_BINARYTRANSFER.
         * @param bool   $includeHeader   Include response headers in the output.
         * @param bool   $noBody          Skip the response body.
         */
        public function __construct($url, $followlocation = true, $timeOut = 30, $maxRedirecs = 4, $binaryTransfer = false, $includeHeader = false, $noBody = false)
        {
            // The referer is deliberately NOT assigned here: the constructor has no
            // $referer parameter, so assigning it would overwrite the default with
            // an undefined variable. Use setReferer() to change it.
            $this->_url = $url;
            $this->_followlocation = $followlocation;
            $this->_timeout = $timeOut;
            $this->_maxRedirects = $maxRedirecs;
            $this->_noBody = $noBody;
            $this->_includeHeader = $includeHeader;
            $this->_binaryTransfer = $binaryTransfer;
        }

        /**
         * Overrides the Referer header.
         *
         * @param string $referer
         */
        public function setReferer($referer)
        {
            $this->_referer = $referer;
        }

        /**
         * Overrides the cookie file used for both reading and writing cookies.
         *
         * @param string $path
         */
        public function setCookiFileLocation($path)
        {
            $this->_cookieFileLocation = $path;
        }

        /**
         * Switches the request to POST and stores its payload.
         *
         * @param mixed $postFields Passed unchanged to CURLOPT_POSTFIELDS.
         */
        public function setPost($postFields)
        {
            $this->_post = true;
            $this->_postFields = $postFields;
        }

        /**
         * Overrides the User-Agent header.
         *
         * @param string $userAgent
         */
        public function setUserAgent($userAgent)
        {
            $this->_useragent = $userAgent;
        }

        /**
         * Performs the request and stores the body and the HTTP status code.
         *
         * On failure the stored body is false/empty and the status is whatever
         * cURL reports (0 when no response was received).
         */
        public function createCurl()
        {
            $s = curl_init();
            if ($s === false) {
                // No handle could be created: record an empty result instead of
                // passing false to every curl_* call below.
                $this->_webpage = '';
                $this->_status = 0;
                return;
            }

            curl_setopt($s, CURLOPT_URL, $this->_url);
            // An empty "Expect:" header keeps cURL from sending
            // "Expect: 100-continue" (the original misspelled it as "Except:").
            curl_setopt($s, CURLOPT_HTTPHEADER, array('Expect:'));
            curl_setopt($s, CURLOPT_TIMEOUT, $this->_timeout);
            curl_setopt($s, CURLOPT_MAXREDIRS, $this->_maxRedirects);
            curl_setopt($s, CURLOPT_RETURNTRANSFER, true);
            curl_setopt($s, CURLOPT_FOLLOWLOCATION, $this->_followlocation);
            curl_setopt($s, CURLOPT_COOKIEJAR, $this->_cookieFileLocation);
            curl_setopt($s, CURLOPT_COOKIEFILE, $this->_cookieFileLocation);

            if ($this->_post) {
                curl_setopt($s, CURLOPT_POST, true);
                curl_setopt($s, CURLOPT_POSTFIELDS, $this->_postFields);
            }

            if ($this->_includeHeader) {
                curl_setopt($s, CURLOPT_HEADER, true);
            }

            if ($this->_noBody) {
                curl_setopt($s, CURLOPT_NOBODY, true);
            }

            // Reads the property the constructor actually fills; the original
            // tested an undeclared $_binary, so this branch could never run.
            // NOTE(review): the PHP manual lists CURLOPT_BINARYTRANSFER as having
            // no effect on current PHP versions - confirm before relying on it.
            if ($this->_binaryTransfer) {
                curl_setopt($s, CURLOPT_BINARYTRANSFER, true);
            }

            curl_setopt($s, CURLOPT_USERAGENT, $this->_useragent);
            curl_setopt($s, CURLOPT_REFERER, $this->_referer);

            $this->_webpage = curl_exec($s);
            $this->_status = curl_getinfo($s, CURLINFO_HTTP_CODE);
            curl_close($s);
        }

        /**
         * @return int|null HTTP status code of the last request, or null if
         *                  createCurl() has not been called yet.
         */
        public function getHttpStatus()
        {
            return $this->_status;
        }

        /**
         * @return string The fetched body; an empty string when nothing was
         *                fetched (no request yet, or curl_exec() returned false).
         */
        public function __toString()
        {
            // __toString() must return a string; null/false would be a fatal error.
            return (string) $this->_webpage;
        }
    }
     
     //using the above class
     
     $instance = new mycurl('http://google.com',true,30,4);
     $instance->createCurl();
     if($imdb->getHttpStatus() == 200)
     {
         echo $imdb;
     }
      

  3.   

    简单点的
    <?php
    // Simple example: fetch a page with GET and print it.
    // Uses the full "<?php" tag (the short "<?" tag depends on short_open_tag) and
    // CURLOPT_RETURNTRANSFER instead of an output buffer that was never read or closed.
    $url = 'http://www.163.com';
    $retrievedhtml = '';
    $ch = curl_init();                                   // start the cURL session
    if ($ch === false) {
        trigger_error('curl_init() failed', E_USER_WARNING);
    } else {
        curl_setopt($ch, CURLOPT_URL, $url);             // target URL
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);  // return the body instead of printing it
        $result = curl_exec($ch);                        // send the request
        if ($result === false) {
            trigger_error('cURL request failed: ' . curl_error($ch), E_USER_WARNING);
        } else {
            $retrievedhtml = $result;
        }
        curl_close($ch);                                 // end the session
    }
    echo $retrievedhtml;                                 // same visible output as before
    ?>
    复杂点的
    <?php
    // More involved example: send a form with POST and print the response.
    // Uses the full "<?php" tag (the short "<?" tag depends on short_open_tag) and
    // CURLOPT_RETURNTRANSFER instead of an output buffer that was never read or closed.
    $url = 'http://202.201.7.9/CSTJ/Sear.dll?SearchZK';
    // Already URL-encoded POST body.
    $param = 'E=%3C1%2C4%3E%3A%28%28Title_C%3DCAm%29*Years%3D%282002%2B2003%2B2004%2B2005%2B2006%2B2007%29%29&S=1&N=20&P=1&C=0&H=%28%CC%E2%C3%FB%3DCAD%29*Year%3D2002-2007&M=&KM=&GX=&CSID=%7BC8E8BDCF-73D3-45F2-9FE9-89C8B28A02D1%7D';
    $retrievedhtml = '';
    $ch = curl_init();                                   // start the cURL session
    if ($ch === false) {
        trigger_error('curl_init() failed', E_USER_WARNING);
    } else {
        curl_setopt($ch, CURLOPT_URL, $url);             // target URL
        curl_setopt($ch, CURLOPT_POST, 1);               // submit as POST
        curl_setopt($ch, CURLOPT_POSTFIELDS, $param);    // POST data
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);  // return the body instead of printing it
        $result = curl_exec($ch);                        // send the request
        if ($result === false) {
            trigger_error('cURL request failed: ' . curl_error($ch), E_USER_WARNING);
        } else {
            $retrievedhtml = $result;
        }
        curl_close($ch);                                 // end the session
    }
    echo $retrievedhtml;                                 // same visible output as before
    ?>