curl抓取超时 最近做一个spider,在爬取http://news.fblife.com/时遇到个问题,这个网站应该是做了防爬取的处理,抓取时只能抓回16K之后就一直等待,求教。 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 可以自己用fsockopen模拟浏览器访问自己写采集器还是越底层的编程越好~给你个函数function get_html($url,$time_out=10, $cookie='',$conf = array()) { $url = trim($url); $cookie = trim($cookie); $return = ''; if(!is_array($conf)){ return $return; } $matches = parse_url($url); !isset($matches['host']) && $matches['host'] = ''; !isset($matches['path']) && $matches['path'] = ''; !isset($matches['query']) && $matches['query'] = ''; !isset($matches['port']) && $matches['port'] = ''; $host = $matches['host']; $boardurl = 'http://'.$host.'/'; $path = $matches['path'] ? $matches['path'].($matches['query'] ? '?'.$matches['query'] : '') : '/'; $port = !empty($matches['port']) ? $matches['port'] : 80; $conf_arr = array( 'limit'=>0, 'post'=>'', 'cookie'=>$cookie, 'bysocket'=>FALSE, 'ip'=>'', 'timeout'=>$time_out, 'block'=>TRUE, ); foreach (array_merge($conf_arr, $conf) as $k=>$v) ${$k} = $v; if($post) { if(is_array($post)){ $post = http_build_query($post); } $out = "POST $path HTTP/1.0\r\n"; $out .= "Accept: */*\r\n"; $out .= "Referer: $boardurl\r\n"; $out .= "Accept-Language: zh-cn\r\n"; $out .= "Content-Type: application/x-www-form-urlencoded\r\n"; $out .= "User-Agent: Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+CIBA;+.NET+CLR+2.0.50727;+360SE)\r\n"; //需要模拟的agent $out .= "Host: $host\r\n"; $out .= 'Content-Length: '.strlen($post)."\r\n"; $out .= "Connection: Close\r\n"; $out .= "Accept-Encoding: \r\n"; $out .= "Cache-Control: no-cache\r\n"; $out .= "Cookie: $cookie\r\n\r\n"; $out .= $post; } else { $out = "GET $path HTTP/1.0\r\n"; $out .= "Accept: */*\r\n"; $out .= "Referer: $boardurl\r\n"; $out .= "Accept-Language: zh-cn\r\n"; $out .= "User-Agent: Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+CIBA;+.NET+CLR+2.0.50727;+360SE)\r\n"; //需要模拟的agent $out .= "Host: $host\r\n"; $out .= "Connection: Close\r\n"; $out .= "Accept-Encoding: \r\n"; $out .= "Cookie: 
$cookie\r\n\r\n"; } $fp = @fsockopen(($ip ? $ip : $host), $port, $errno, $errstr, 5); if(!$fp) { return false; } else { $start = time(); stream_set_blocking($fp, $block); stream_set_timeout($fp, $timeout); @fwrite($fp, $out); $status = stream_get_meta_data($fp); if(!$status['timed_out']) { $h=''; while (!feof($fp)) { if(($header = @fgets($fp)) && ($header == "\r\n" || $header == "\n")) { break; } $h .= $header; } if (strstr($h,' 301 ') || strstr($h,' 302 ')){ preg_match("/Location:(.*?)$/im",$h,$match); $url = $match[1]; preg_match("/Set-Cookie:(.*?)$/im",$h,$match); $cookie = $match[1]; return get_html($url,15,$cookie); } $stop = false; while(!$stop) { if( $status['timed_out'] || ( time()-$start ) > $timeout ){ @fclose($fp); return false; } $data = fread($fp, ($limit == 0 || $limit > 128 ? 128 : $limit)); if ($predata.$data==''){ break; } $predata = $data; $return .= $data; if($limit) { $limit -= strlen($data); $stop = $limit <= 0; } } } @fclose($fp); return $return; } } 不知道你的spider 怎么写的,我自己认为 要么你1k都失败不到,要么就是网不好只能到了16k就失败...另外这是不是广告贴? 
curl抓取的确是不行的,16k就断了以下是源码function my_http($url, $method, $connect_timeout, $timeout, $parameter=NULL, $userpwd=NULL){ $ci = curl_init(); curl_setopt($ci, CURLOPT_ENCODING, 'gzip'); curl_setopt($ci, CURLOPT_CONNECTTIMEOUT, $connect_timeout); curl_setopt($ci, CURLOPT_TIMEOUT, $timeout); curl_setopt($ci, CURLOPT_RETURNTRANSFER, TRUE); curl_setopt($ci,CURLOPT_BINARYTRANSFER,TRUE); curl_setopt($ci,CURLOPT_HEADER,TRUE);// curl_setopt($ci, CURLOPT_RANGE, "0-16000"); curl_setopt($ci,CURLOPT_NOPROGRESS,FALSE); curl_setopt($ci,CURLOPT_REFERER,"http://www.fblife.com/"); curl_setopt($ci, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"); curl_setopt($ci, CURLOPT_URL, $url); if( !empty($userpwd) ){ curl_setopt($ci, CURLOPT_USERPWD, $userpwd); } switch ($method) { case 'POST': curl_setopt($ci, CURLOPT_POST, TRUE); if (!empty($parameter)) { curl_setopt($ci, CURLOPT_POSTFIELDS, $parameter); } break; case 'GET': break; } $btime = microtime(true); $htmlpage = curl_exec($ci); if($htmlpage === false) { $errinfo = curl_error($ci); $info = curl_getinfo($ci); //print_r($info); if($htmlpage === false) { $errinfo = curl_error($ci); $info = curl_getinfo($ci); //print_r($info); $response = array('errinfo'=>$errinfo); return $response; }// $info = curl_getinfo($ci);// print_r($info); $etime = microtime(true); $content_type = curl_getinfo($ci,CURLINFO_CONTENT_TYPE); $status = curl_getinfo($ci, CURLINFO_HTTP_CODE); $size1 = curl_getinfo($ci,CURLINFO_SIZE_DOWNLOAD); $size2 = curl_getinfo($ci,CURLINFO_CONTENT_LENGTH_DOWNLOAD); $time = $etime-$btime; curl_close ($ci); $response = array("errinfo"=>"","status"=>$status,"time"=>$time,"htmlpage"=>$htmlpage,"downloadsize"=>$size1,"announcesize"=>$size2); //print_r($content_type); return $response;}//my_http("http://www.fblife.com/favicon.ico",'get',10,10);$response = my_http("http://news.fblife.com",'get',60,10);echo $response['errinfo'];$htmlpage = 
mb_convert_encoding($response['htmlpage'],"GBK","UTF-8");//网站编码为utf-8$htmlpage = $response['htmlpage'];echo strlen($htmlpage)."\n";echo $htmlpage."\n"; 这个函数里conf是啥,没看懂啊 www.xx.com/test_file/xx1.php能访问,却找不到test_file这个文件怎么回事? php内部编码 关于动态添加上传 并写入数据库 高分求解啊 smarty路径问题 关于用php抓取阿里巴巴的企业信息 如何根据 this 取得数组索引值? 只能用 javascript 留言版中插入图片链接 form中参数传递不出去,急 PHP微信公众平台开发如何实现定向发送??求大神!! netbeans ide 7.0.1 怎么运行代码 怎么输出变量??
自己写采集器还是越底层的编程越好~
// Lead-in from the forum reply: "here is a function for you".
/**
 * Fetch a URL over a raw socket using a hand-built HTTP/1.0 request.
 *
 * Only plain HTTP is spoken (no TLS); the port defaults to 80. The response
 * headers are consumed and discarded, and only the body is returned.
 *
 * @param string $url      Absolute http:// URL to fetch.
 * @param int    $time_out Stream timeout and overall wall-clock budget, in seconds.
 * @param string $cookie   Value sent in the Cookie request header.
 * @param array  $conf     Optional overrides. Recognised keys:
 *                         - limit:         max body bytes to read (0 = unlimited)
 *                         - post:          string or array; non-empty switches to POST
 *                         - cookie:        overrides $cookie
 *                         - bysocket:      accepted for compatibility, unused here
 *                         - ip:            connect to this address instead of the URL host
 *                         - timeout:       overrides $time_out
 *                         - block:         TRUE for a blocking stream
 *                         - max_redirects: how many 301/302 hops to follow (default 5)
 *                         Unknown keys are ignored.
 *
 * @return string|false Response body. '' when $conf is not an array or the
 *                      stream is already timed out after the write. false when
 *                      the connection fails, the time budget is exceeded, or
 *                      a redirect cannot be followed.
 */
function get_html($url, $time_out = 10, $cookie = '', $conf = array()) {
    $url    = trim($url);
    $cookie = trim($cookie);
    $return = '';
    if (!is_array($conf)) {
        return $return;
    }

    // parse_url() returns false for seriously malformed URLs.
    $matches = parse_url($url);
    if (!is_array($matches)) {
        $matches = array();
    }
    $host  = isset($matches['host'])  ? $matches['host']  : '';
    $upath = isset($matches['path'])  ? $matches['path']  : '';
    $query = isset($matches['query']) ? $matches['query'] : '';
    $port  = !empty($matches['port']) ? $matches['port']  : 80;

    $boardurl = 'http://'.$host.'/';
    // Keep the query string even when the URL has no path component.
    $path = ($upath !== '' ? $upath : '/').($query !== '' ? '?'.$query : '');
    // A non-default port must be part of the Host header.
    $host_header = ($port == 80) ? $host : $host.':'.$port;

    // Only whitelisted options are honoured, so $conf cannot clobber
    // arbitrary local variables.
    $defaults = array(
        'limit'         => 0,
        'post'          => '',
        'cookie'        => $cookie,
        'bysocket'      => FALSE,
        'ip'            => '',
        'timeout'       => $time_out,
        'block'         => TRUE,
        'max_redirects' => 5,
    );
    $opts = array_merge($defaults, array_intersect_key($conf, $defaults));
    $limit         = max(0, (int)$opts['limit']);
    $post          = $opts['post'];
    $cookie        = $opts['cookie'];
    $ip            = $opts['ip'];
    $timeout       = $opts['timeout'];
    $block         = $opts['block'];
    $max_redirects = (int)$opts['max_redirects'];

    // Decide the method before the array is flattened to a string.
    $is_post = (bool)$post;
    if (is_array($post)) {
        $post = http_build_query($post);
    }

    // User agent to impersonate.
    $agent = "Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+CIBA;+.NET+CLR+2.0.50727;+360SE)";

    $lines   = array();
    $lines[] = ($is_post ? 'POST' : 'GET')." $path HTTP/1.0";
    $lines[] = "Accept: */*";
    $lines[] = "Referer: $boardurl";
    $lines[] = "Accept-Language: zh-cn";
    if ($is_post) {
        $lines[] = "Content-Type: application/x-www-form-urlencoded";
    }
    $lines[] = "User-Agent: $agent";
    $lines[] = "Host: $host_header";
    if ($is_post) {
        $lines[] = 'Content-Length: '.strlen($post);
    }
    $lines[] = "Connection: Close";
    // Empty Accept-Encoding: ask for an uncompressed body.
    $lines[] = "Accept-Encoding: ";
    if ($is_post) {
        $lines[] = "Cache-Control: no-cache";
    }
    $lines[] = "Cookie: $cookie";
    $out = implode("\r\n", $lines)."\r\n\r\n".($is_post ? $post : '');

    $fp = @fsockopen(($ip ? $ip : $host), $port, $errno, $errstr, 5);
    if (!$fp) {
        return false;
    }

    $start = time();
    stream_set_blocking($fp, $block);
    stream_set_timeout($fp, $timeout);
    @fwrite($fp, $out);

    $status = stream_get_meta_data($fp);
    if ($status['timed_out']) {
        @fclose($fp);
        return $return;
    }

    // Read the header section up to the first blank line.
    $h = '';
    while (!feof($fp)) {
        $header = @fgets($fp);
        if ($header === false) {
            // Nothing available (non-blocking) or the read timed out:
            // give up once the time budget is spent instead of spinning.
            $status = stream_get_meta_data($fp);
            if ($status['timed_out'] || (time() - $start) > $timeout) {
                @fclose($fp);
                return false;
            }
            continue;
        }
        if ($header == "\r\n" || $header == "\n") {
            break;
        }
        $h .= $header;
    }

    // Follow 301/302, judged from the status line only.
    if (preg_match('/^HTTP\/\d+(?:\.\d+)?\s+30[12]\b/', $h)) {
        @fclose($fp);
        if ($max_redirects <= 0 || !preg_match('/^Location:\s*(\S+)/im', $h, $match)) {
            return false;
        }
        $target = trim($match[1]);
        if (substr($target, 0, 2) === '//') {
            // Scheme-relative reference.
            $target = 'http:'.$target;
        } elseif (!preg_match('#^[a-z][a-z0-9+.\-]*://#i', $target)) {
            // Relative reference: resolve against the current host and,
            // for non-rooted targets, the directory of the current path.
            $base_dir = '/';
            $slash = strrpos($upath, '/');
            if ($slash !== false) {
                $base_dir = substr($upath, 0, $slash + 1);
            }
            $target = 'http://'.$host_header.($target[0] === '/' ? '' : $base_dir).$target;
        }
        // Send back only the name=value pair of the first Set-Cookie,
        // not its attributes (path, expires, ...).
        $next_cookie = '';
        if (preg_match('/^Set-Cookie:\s*([^;\r\n]*)/im', $h, $match)) {
            $next_cookie = trim($match[1]);
        }
        return get_html($target, 15, $next_cookie, array('max_redirects' => $max_redirects - 1));
    }

    // Read the body in small chunks until EOF, the byte limit, or the
    // time budget is reached.
    $predata = '';
    while (true) {
        $status = stream_get_meta_data($fp);
        if ($status['timed_out'] || (time() - $start) > $timeout) {
            @fclose($fp);
            return false;
        }
        $data = fread($fp, (($limit == 0 || $limit > 128) ? 128 : $limit));
        if ($data === false) {
            $data = '';
        }
        // Two consecutive empty reads mean there is nothing more to get.
        if ($predata.$data == '') {
            break;
        }
        $predata = $data;
        $return .= $data;
        if ($limit) {
            $limit -= strlen($data);
            if ($limit <= 0) {
                break;
            }
        }
    }

    @fclose($fp);
    return $return;
}
另外这是不是广告贴?
以下是源码
/**
 * Perform an HTTP request with cURL and return the raw response plus
 * transfer statistics.
 *
 * The request advertises gzip support, sends a fixed Referer and a
 * Baiduspider User-Agent, and asks cURL to include the response headers in
 * the returned data (CURLOPT_HEADER), so "htmlpage" holds headers and body.
 *
 * @param string      $url             URL to request.
 * @param string      $method          'GET' or 'POST' (case-insensitive); anything else behaves like GET.
 * @param int         $connect_timeout Connection timeout in seconds.
 * @param int         $timeout         Total transfer timeout in seconds.
 * @param mixed       $parameter       POST fields (string or array); used for POST only.
 * @param string|null $userpwd         "user:password" credentials, if any.
 *
 * @return array On failure: array('errinfo' => <cURL error message>).
 *               On success: keys errinfo (''), status (HTTP code), time
 *               (seconds spent in curl_exec), htmlpage (headers + body),
 *               downloadsize (bytes downloaded), announcesize (Content-Length
 *               as reported by cURL).
 */
function my_http($url, $method, $connect_timeout, $timeout, $parameter = NULL, $userpwd = NULL) {
    $ci = curl_init();
    if ($ci === false) {
        return array('errinfo' => 'curl_init() failed');
    }

    curl_setopt($ci, CURLOPT_ENCODING, 'gzip');
    curl_setopt($ci, CURLOPT_CONNECTTIMEOUT, $connect_timeout);
    curl_setopt($ci, CURLOPT_TIMEOUT, $timeout);
    curl_setopt($ci, CURLOPT_RETURNTRANSFER, TRUE);
    // CURLOPT_BINARYTRANSFER was dropped: it has had no effect since PHP 5.1.3.
    curl_setopt($ci, CURLOPT_HEADER, TRUE);
    // curl_setopt($ci, CURLOPT_RANGE, "0-16000");
    // Progress meter left enabled, as in the original debugging setup.
    curl_setopt($ci, CURLOPT_NOPROGRESS, FALSE);
    curl_setopt($ci, CURLOPT_REFERER, "http://www.fblife.com/");
    curl_setopt($ci, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)");
    curl_setopt($ci, CURLOPT_URL, $url);
    if (!empty($userpwd)) {
        curl_setopt($ci, CURLOPT_USERPWD, $userpwd);
    }

    // Callers pass the method in either case ('get' / 'POST').
    switch (strtoupper((string)$method)) {
        case 'POST':
            curl_setopt($ci, CURLOPT_POST, TRUE);
            if (!empty($parameter)) {
                curl_setopt($ci, CURLOPT_POSTFIELDS, $parameter);
            }
            break;
        case 'GET':
        default:
            break;
    }

    $btime    = microtime(true);
    $htmlpage = curl_exec($ci);
    $etime    = microtime(true);

    if ($htmlpage === false) {
        $errinfo = curl_error($ci);
        // Release the handle on the failure path too.
        curl_close($ci);
        return array('errinfo' => $errinfo);
    }

    $status = curl_getinfo($ci, CURLINFO_HTTP_CODE);
    $size1  = curl_getinfo($ci, CURLINFO_SIZE_DOWNLOAD);
    $size2  = curl_getinfo($ci, CURLINFO_CONTENT_LENGTH_DOWNLOAD);
    $time   = $etime - $btime;
    curl_close($ci);

    return array(
        "errinfo"      => "",
        "status"       => $status,
        "time"         => $time,
        "htmlpage"     => $htmlpage,
        "downloadsize" => $size1,
        "announcesize" => $size2,
    );
}
//my_http("http://www.fblife.com/favicon.ico",'get',10,10);
// Driver: fetch the news front page and print its size followed by its content.
// NOTE(review): the connect timeout (60) exceeds the total timeout (10), so it
// can never take effect - confirm whether the two arguments were swapped.
$response = my_http("http://news.fblife.com",'get',60,10);
if (!isset($response['htmlpage'])) {
    // Failed transfers carry only 'errinfo'; there is no page to print.
    echo $response['errinfo']."\n";
} else {
    // The site is UTF-8 and the page is printed as received. The original also
    // computed mb_convert_encoding(..., "GBK", "UTF-8") but overwrote the result
    // on the next line, so that dead conversion is removed here.
    $htmlpage = $response['htmlpage'];
    echo strlen($htmlpage)."\n";
    echo $htmlpage."\n";
}