cur 模拟抓取网页 php的curl如何模拟浏览器进行网页抓取呢?还有就是如何控制curl的执行时间,抓的太块,ip可能会被封掉!!怎么搞呢??看过文档,正在研究想听大家的看法,或者给个例子 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 控制curl的执行时间,抓的太块,ip可能会被封掉!! -----------会这样么,没控制过时间,抓就是一瞬间的事,就是正则匹配太麻烦,特别是要模拟登陆的~~~ lz试一下下面的代码 class mycurl { protected $_useragent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1'; protected $_url; protected $_followlocation; protected $_timeout; protected $_maxRedirects; protected $_cookieFileLocation = 'c:\cookies\curl_cookie.txt'; protected $_post; protected $_postFields; protected $_referer ="http://www.google.com"; protected $_session; protected $_webpage; protected $_includeHeader; protected $_noBody; protected $_status; protected $_binaryTransfer; public function __construct($url,$followlocation = true,$timeOut = 30,$maxRedirecs = 4,$binaryTransfer = false,$includeHeader = false,$noBody = false) { $this->_url = $url; $this->_referer = $referer; $this->_followlocation = $followlocation; $this->_timeout = $timeOut; $this->_maxRedirects = $maxRedirecs; $this->_noBody = $noBody; $this->_includeHeader = $includeHeader; $this->_binaryTransfer = $binaryTransfer; } public function setReferer($referer){ $this->_referer = $referer;} public function setCookiFileLocation($path) { $this->_cookieFileLocation = $path; } public function setPost ($postFields) { $this->_post = true; $this->_postFields = $postFields; } public function setUserAgent($userAgent) { $this->_useragent = $userAgent; } public function createCurl() { $s = curl_init(); curl_setopt($s,CURLOPT_URL,$this->_url); curl_setopt($s,CURLOPT_HTTPHEADER,array('Except:')); curl_setopt($s,CURLOPT_TIMEOUT,$this->_timeout); curl_setopt($s,CURLOPT_MAXREDIRS,$this->_maxRedirects); curl_setopt($s,CURLOPT_RETURNTRANSFER,true); curl_setopt($s,CURLOPT_FOLLOWLOCATION,$this->_followlocation); curl_setopt($s,CURLOPT_COOKIEJAR,$this->_cookieFileLocation); curl_setopt($s,CURLOPT_COOKIEFILE,$this->_cookieFileLocation); if($this->_post) { curl_setopt($s,CURLOPT_POST,true); 
curl_setopt($s,CURLOPT_POSTFIELDS,$this->_postFields); } if($this->_includeHeader) { curl_setopt($s,CURLOPT_HEADER,true); } if($this->_noBody) { curl_setopt($s,CURLOPT_NOBODY,true); } if($this->_binary) { curl_setopt($s,CURLOPT_BINARYTRANSFER,true); } curl_setopt($s,CURLOPT_USERAGENT,$this->_useragent); curl_setopt($s,CURLOPT_REFERER,$this->_referer); $this->_webpage = curl_exec($s); $this->_status = curl_getinfo($s,CURLINFO_HTTP_CODE); curl_close($s); } public function getHttpStatus() { return $this->_status; } public function __tostring() { return $this->_webpage; } } //using the above class $instance = new mycurl('http://google.com',true,30,4); $instance->createCurl(); if($imdb->getHttpStatus() == 200) { echo $imdb; } 简单点的<?$url='http://www.163.com';ob_start(); //打开输出缓冲区$ch = curl_init(); //初始化会话curl_setopt( $ch, CURLOPT_URL, $url ); //设定目标URLcurl_exec( $ch ); //发送请求//$retrievedhtml = ob_get_contents(); //返回内部缓冲区的内容//ob_end_clean(); //删除内部缓冲区的内容并关闭内部缓冲区curl_close( $ch ); //会话结束?> 复杂点的<?$url='http://202.201.7.9/CSTJ/Sear.dll?SearchZK';$param='E=%3C1%2C4%3E%3A%28%28Title_C%3DCAm%29*Years%3D%282002%2B2003%2B2004%2B2005%2B2006%2B2007%29%29&S=1&N=20&P=1&C=0&H=%28%CC%E2%C3%FB%3DCAD%29*Year%3D2002-2007&M=&KM=&GX=&CSID=%7BC8E8BDCF-73D3-45F2-9FE9-89C8B28A02D1%7D'; //指定POST数据ob_start(); //打开输出缓冲区$ch = curl_init(); //初始化会话curl_setopt( $ch, CURLOPT_URL, $url ); //设定目标URLcurl_setopt( $ch, CURLOPT_POST, 1 ); //设定表单发送方式为POSTcurl_setopt( $ch, CURLOPT_POSTFIELDS, $param ); //POST数据curl_exec( $ch ); //发送请求//$retrievedhtml = ob_get_contents(); //返回内部缓冲区的内容//ob_end_clean(); //删除内部缓冲区的内容并关闭内部缓冲区curl_close( $ch ); //会话结束?> php对mysysql数据输入引号的问题 function main(ujn_controller $c, $params = '') 求助 采集一下分集URL 和 连接 zend studio ctrl+单击跟踪方法的问题 求一个不定参数的Url重写的正则表达式 PHP加MySQL出现如下错误,请看看!如何解决 在做一个网站文件管理器时,对于服务器上的目录或文件的根限怎么设置? 我用javascript调用PHP页面不成功 win32_ps_list_procs 怎么用,报undefined 面向对象编程思想 请问是不是浏览器缓存的问题 php 做一个网站的客户历史浏览功能 如何做,思路?
-----------会这样么,没控制过时间,抓就是一瞬间的事,就是正则匹配太麻烦,特别是要模拟登录的~~~
// User-Agent header sent with every request (the original literal was
// missing its closing parenthesis, producing a malformed UA string).
protected $_useragent = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';
// Target URL of the request.
protected $_url;
// Whether redirects (Location headers) are followed.
protected $_followlocation;
// Overall transfer timeout passed to CURLOPT_TIMEOUT.
protected $_timeout;
// Upper bound on the number of redirects to follow.
protected $_maxRedirects;
// File used both to read and to persist cookies.
protected $_cookieFileLocation = 'c:\cookies\curl_cookie.txt';
// True once setPost() has been called; switches the request to POST.
protected $_post;
// Payload sent when the request is a POST.
protected $_postFields;
// Referer header sent with the request.
protected $_referer ="http://www.google.com";
// NOTE(review): not referenced by any method visible in this class.
protected $_session;
// Result of the last transfer, as returned by curl_exec().
protected $_webpage;
// Whether response headers are included in the returned output.
protected $_includeHeader;
// Whether the response body is skipped.
protected $_noBody;
// HTTP status code of the last transfer.
protected $_status;
// Binary-transfer flag supplied to the constructor.
protected $_binaryTransfer;
/**
 * Configure a request; nothing is sent until createCurl() is called.
 *
 * The referer is deliberately NOT touched here: the constructor has no
 * referer parameter, so the property default stays in effect until
 * setReferer() is called. (The previous code assigned an undefined
 * $referer variable, which silently replaced the default with null.)
 *
 * @param string $url            Target URL.
 * @param bool   $followlocation Follow HTTP redirects.
 * @param int    $timeOut        Transfer timeout, passed to CURLOPT_TIMEOUT.
 * @param int    $maxRedirecs    Maximum number of redirects to follow.
 * @param bool   $binaryTransfer Request a binary transfer.
 * @param bool   $includeHeader  Include response headers in the output.
 * @param bool   $noBody         Skip the response body.
 */
public function __construct($url,$followlocation = true,$timeOut = 30,$maxRedirecs = 4,$binaryTransfer = false,$includeHeader = false,$noBody = false)
{
    $this->_url = $url;
    $this->_followlocation = $followlocation;
    $this->_timeout = $timeOut;
    $this->_maxRedirects = $maxRedirecs;
    $this->_noBody = $noBody;
    $this->_includeHeader = $includeHeader;
    $this->_binaryTransfer = $binaryTransfer;
}
/**
 * Replace the Referer value that will be sent with the request.
 *
 * @param string $referer New Referer header value.
 */
public function setReferer($referer)
{
    $this->_referer = $referer;
}
/**
 * Point the cookie storage at a different file.
 *
 * @param string $path Path of the file used to load and save cookies.
 */
public function setCookiFileLocation($path)
{
    $this->_cookieFileLocation = $path;
}
/**
 * Turn the request into a POST carrying the given payload.
 *
 * @param mixed $postFields Data handed to CURLOPT_POSTFIELDS.
 */
public function setPost($postFields)
{
    // Store the payload first, then raise the flag that enables POST.
    $this->_postFields = $postFields;
    $this->_post = true;
}
/**
 * Replace the User-Agent value that will be sent with the request.
 *
 * @param string $userAgent New User-Agent header value.
 */
public function setUserAgent($userAgent)
{
    $this->_useragent = $userAgent;
}
/**
 * Execute the configured request.
 *
 * Stores the curl_exec() result in $_webpage (false when the transfer
 * fails) and the HTTP status code in $_status.
 *
 * Fixes over the previous version:
 *  - the header that suppresses "Expect: 100-continue" was misspelled
 *    as 'Except:', so it had no effect;
 *  - the binary-transfer check read the undeclared $_binary property
 *    instead of $_binaryTransfer, so the flag was never honoured.
 */
public function createCurl()
{
    // Options are collected in the same order the request was
    // previously configured in, then applied in one call.
    $options = array(
        CURLOPT_URL            => $this->_url,
        // An empty Expect header stops cURL from sending
        // "Expect: 100-continue" on larger POST bodies.
        CURLOPT_HTTPHEADER     => array('Expect:'),
        CURLOPT_TIMEOUT        => $this->_timeout,
        CURLOPT_MAXREDIRS      => $this->_maxRedirects,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => $this->_followlocation,
        // Same file for reading and writing so cookies persist across calls.
        CURLOPT_COOKIEJAR      => $this->_cookieFileLocation,
        CURLOPT_COOKIEFILE     => $this->_cookieFileLocation,
    );

    if ($this->_post) {
        $options[CURLOPT_POST] = true;
        $options[CURLOPT_POSTFIELDS] = $this->_postFields;
    }

    if ($this->_includeHeader) {
        $options[CURLOPT_HEADER] = true;
    }

    if ($this->_noBody) {
        $options[CURLOPT_NOBODY] = true;
    }

    if ($this->_binaryTransfer) {
        // NOTE(review): per the PHP manual this option has had no effect
        // since PHP 5.1.3; kept so the flag is still passed through.
        $options[CURLOPT_BINARYTRANSFER] = true;
    }

    $options[CURLOPT_USERAGENT] = $this->_useragent;
    $options[CURLOPT_REFERER] = $this->_referer;

    $s = curl_init();
    curl_setopt_array($s, $options);

    $this->_webpage = curl_exec($s);
    $this->_status = curl_getinfo($s, CURLINFO_HTTP_CODE);
    curl_close($s);
}
/**
 * Report the HTTP status code recorded by the last createCurl() call.
 *
 * @return mixed The stored status; unset until createCurl() has run.
 */
public function getHttpStatus()
{
    $status = $this->_status;

    return $status;
}
/**
 * Render the fetched page as a string.
 *
 * $_webpage is null before createCurl() has run and false when the
 * transfer failed; __toString() must return a string, so both cases
 * are cast to an empty string instead of raising an error.
 *
 * @return string The downloaded content, or '' when none is available.
 */
public function __toString()
{
    return (string) $this->_webpage;
}
}
// Using the above class: fetch a page and print it on HTTP 200.
// (The original checked and echoed $imdb, a variable that was never
// defined; the object created here is $instance.)
$instance = new mycurl('http://google.com', true, 30, 4);
$instance->createCurl();
if ($instance->getHttpStatus() === 200) {
    echo $instance;
}
<?php
// Simple example: GET a page with cURL and print it.
// Uses the full open tag (short tags depend on the short_open_tag ini
// setting) and CURLOPT_RETURNTRANSFER instead of an output buffer that
// was opened but never closed.
$url = 'http://www.163.com';

$ch = curl_init();                              // start the cURL session
curl_setopt($ch, CURLOPT_URL, $url);            // target URL
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the body instead of printing it

$retrievedhtml = curl_exec($ch);                // send the request
if ($retrievedhtml === false) {
    // Surface the failure instead of silently printing nothing.
    trigger_error('cURL request failed: ' . curl_error($ch), E_USER_WARNING);
} else {
    echo $retrievedhtml;
}

curl_close($ch);                                // end the session
?>
复杂点的
<?php
// More involved example: POST a form with cURL and print the response.
// Uses the full open tag (short tags depend on the short_open_tag ini
// setting) and CURLOPT_RETURNTRANSFER instead of an output buffer that
// was opened but never closed.
$url = 'http://202.201.7.9/CSTJ/Sear.dll?SearchZK';
// Already URL-encoded POST body.
$param = 'E=%3C1%2C4%3E%3A%28%28Title_C%3DCAm%29*Years%3D%282002%2B2003%2B2004%2B2005%2B2006%2B2007%29%29&S=1&N=20&P=1&C=0&H=%28%CC%E2%C3%FB%3DCAD%29*Year%3D2002-2007&M=&KM=&GX=&CSID=%7BC8E8BDCF-73D3-45F2-9FE9-89C8B28A02D1%7D';

$ch = curl_init();                              // start the cURL session
curl_setopt($ch, CURLOPT_URL, $url);            // target URL
curl_setopt($ch, CURLOPT_POST, 1);              // submit as a form POST
curl_setopt($ch, CURLOPT_POSTFIELDS, $param);   // POST data
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // return the body instead of printing it

$retrievedhtml = curl_exec($ch);                // send the request
if ($retrievedhtml === false) {
    // Surface the failure instead of silently printing nothing.
    trigger_error('cURL request failed: ' . curl_error($ch), E_USER_WARNING);
} else {
    echo $retrievedhtml;
}

curl_close($ch);                                // end the session
?>