curl post方式采集列表页问题 本帖最后由 tangjianft 于 2011-02-08 20:58:28 编辑 解决方案 » 免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货 所以 你应该先连接 $url 获取随机值,然后再连接把随机值与其他数据一起post给$url这个就可以正常获取了 加随机参数是为了获取最新的页面信息吗?如果是的话,可以考虑去掉,通过HTTP头信息来发送“始终获取最新页面”的请求 以下代码验证通过,基本符合你的要求set_time_limit(0);$url = "http://www.gxrc.com/positionSearch/PositionSearchResult2011.aspx";$cookie_file = dirname(__FILE__)."/cookie.txt";$curl = curl_init(); curl_setopt($curl, CURLOPT_URL, $url); curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); curl_setopt($curl, CURLOPT_REFERER, $url);curl_setopt($curl, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']);curl_setopt($curl, CURLOPT_HEADER, 1);curl_setopt($curl, CURLOPT_NOBODY, 0);curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie_file);$data = curl_exec($curl); curl_close($curl);if(preg_match('/<input\s*type="hidden"\s*name="__VIEWSTATE"\s*id="__VIEWSTATE"\s*value="(.*?)"\s*\/>/is',$data,$match)){ $__VIEWSTATE = $match[1] ;}if(preg_match('/<input\s*type="hidden"\s*name="__EVENTVALIDATION"\s*id="__EVENTVALIDATION"\s*value="(.*?)"\s*\/>/is',$data,$match)){ $__EVENTVALIDATION = $match[1] ;}$curlPost = array() ;$curlPost['__EVENTTARGET'] = 'AspNetPager1';$curlPost['__EVENTARGUMENT'] = 25 ;//设置翻的页数 $curlPost['__VIEWSTATE'] = $__VIEWSTATE ;$curlPost['__EVENTVALIDATION'] = $__EVENTVALIDATION ;$curlPost['ddlSex'] = -1 ;$curlPost['ddlDegreeMin'] = 0 ; $curlPost['ddlDegreeMax'] = 0 ; $curlPost['txtWorkArea'] = '南宁市'; //设置搜索区域$curlPost['ddlCallingPrompt'] = '不限';//设置工作类别$curlPost['ddlCalling'] = 0 ;$curlPost['ddlPublicTime'] = 0 ;$curlPost['ddlSalary'] = 0 ;$curlPost['ddlEnterpriseKind'] = 0 ;$curlPost['ddlComputerLevel'] = 0 ;$curlPost['ddlWorkKind'] = 0 ;$curlPost['ddlTitle'] = 1 ;$curl = curl_init(); curl_setopt($curl, CURLOPT_URL, $url); curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); curl_setopt($curl, CURLOPT_REFERER, $url);curl_setopt($curl, CURLOPT_USERAGENT, $_SERVER['HTTP_USER_AGENT']);curl_setopt($curl, CURLOPT_HEADER, 1);curl_setopt($curl, CURLOPT_NOBODY, 0);curl_setopt($curl, 
CURLOPT_POST, 1); curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($curlPost)); curl_setopt($curl, CURLOPT_COOKIEFILE, $cookie_file);$data = curl_exec($curl); curl_close($curl);echo $data 相当于串行执行,先获取列表数据,再一项一项进行抓取,CURL也可以实现多两种抓取,不然太慢了:PHP Curl 多线程实现使用方法:$urls = array("http://baidu.com", "http://21andy.com", "http://google.com");$mp = new MultiHttpRequest($urls);$mp->start();下载: class_curl_multi.php<?php/* * Curl 多线程类 * 使用方法: * ========================$urls = array("http://baidu.com", "http://dzone.com", "http://google.com");$mp = new MultiHttpRequest($urls);$mp->start(); * ======================== */class MultiHttpRequest { public $urls = array(); public $curlopt_header = 1; public $method = "GET"; function __construct($urls = false) { $this->urls = $urls; } function set_urls($urls) { $this->urls = $urls; return $this; } function is_return_header($b) { $this->curlopt_header = $b; return $this; } function set_method($m) { $this->medthod = strtoupper($m); return $this; } function start() { if(!is_array($this->urls) or count($this->urls) == 0){ return false; } $curl = $text = array(); $handle = curl_multi_init(); foreach($this->urls as $k=>$v){ $curl[$k] = $this->add_handle($handle, $v); } $this->exec_handle($handle); foreach($this->urls as $k=>$v){ curl_multi_getcontent($curl[$k]); echo $curl[$k]."\n"; //$text[$k] = curl_multi_getcontent($curl[$k]); //echo $text[$k], "\n\n"; curl_multi_remove_handle($handle, $curl[$k]); } curl_multi_close($handle); } private function add_handle($handle, $url) { $curl = curl_init(); curl_setopt($curl, CURLOPT_URL, $url); curl_setopt($curl, CURLOPT_HEADER, $this->curlopt_header); curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1); curl_multi_add_handle($handle, $curl); return $curl; } private function exec_handle($handle) { $flag = null; do { curl_multi_exec($handle, $flag); } while ($flag > 0); }} 额 我也帖段代码吧private function getCookieJar() { $cookiejar = tempnam('/tmp' , '.co'); $ch = curl_init(); $options = array( CURLOPT_URL => 
$this->config->spider->login_url, CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.10) Gecko/20100914 Firefox/3.6.10', CURLOPT_POST => true, CURLOPT_RETURNTRANSFER => true, CURLOPT_COOKIEJAR => $cookiejar ); curl_setopt_array($ch , $options); $login_form = curl_exec($ch); curl_close($ch); $pattern = '#id="([0-9a-z]{32})".*?value="([0-9a-z]{32})"#is'; preg_match_all($pattern , $login_form , $matches); if($matches[1]) { $ch = curl_init(); $options = array( CURLOPT_URL => $this->config->spider->logincheck_url, CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.10) Gecko/20100914 Firefox/3.6.10', CURLOPT_REFERER => $this->config->spider->login_url, CURLOPT_POST => true, CURLOPT_RETURNTRANSFER => true, CURLOPT_COOKIEFILE => $cookiejar, CURLOPT_COOKIEJAR => $cookiejar, CURLOPT_POSTFIELDS => 'autoLogin=1&userName='.urlencode($this->config->spider->userName).'&userPass='.urlencode($this->config->spider->userPass).'&'.$matches[1][0].'='.$matches[2][0].'&login='.urlencode('立即登录'), CURLOPT_FOLLOWLOCATION => true, ); curl_setopt_array($ch , $options); $login = curl_exec($ch); curl_close($ch); return $cookiejar; } return ''; } 5楼 兄弟写的很好,谢谢,curl 这块内容看起来很麻烦啊 php关于短标签模式的开启 下拉菜单提交问题?菜鸟求教 青岛的php程序员过来看一下 多维数组的排序!! php文件求解密方法 关于正则表达式 如何在页面宽度限定的情况下用使用框架? 谁能回答用SOAP xmlhttp对象提交和接收数据,这是怎么去实现的呀? php论坛遇到的问题 php提交form的form老是过期!怎么办 php中如何遍历一个中文字串中的所有字? opera下mouseover事件div边框显示不完全!
然后再连接把随机值与其他数据一起post给$url
这个就可以正常获取了
加随机参数是为了获取最新的页面信息吗?如果是的话,可以考虑去掉,通过HTTP头信息来发送“始终获取最新页面”的请求
// Two-step scrape of an ASP.NET WebForms result list:
//   1. GET the search page to obtain the session cookie and the hidden
//      __VIEWSTATE / __EVENTVALIDATION tokens embedded in the form.
//   2. POST those tokens back together with the search filters and the
//      pager event to receive the requested result page.
$url = "http://www.gxrc.com/positionSearch/PositionSearchResult2011.aspx";
$cookie_file = dirname(__FILE__)."/cookie.txt";

// HTTP_USER_AGENT is only populated when the script is served over HTTP;
// fall back to a fixed string so a command-line run does not pass an
// undefined index to cURL.
$user_agent = isset($_SERVER['HTTP_USER_AGENT'])
    ? $_SERVER['HTTP_USER_AGENT']
    : 'Mozilla/5.0 (compatible; PHP cURL)';

// ---- Step 1: fetch the form and store the cookies -------------------------
$curl = curl_init();
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_REFERER, $url);
curl_setopt($curl, CURLOPT_USERAGENT, $user_agent);
curl_setopt($curl, CURLOPT_HEADER, 1);   // response headers are included in $data
curl_setopt($curl, CURLOPT_NOBODY, 0);
curl_setopt($curl, CURLOPT_COOKIEJAR, $cookie_file);
$data = curl_exec($curl);
if ($data === false) {
    $error = curl_error($curl);
    curl_close($curl);
    exit("Initial GET request failed: ".$error);
}
curl_close($curl);
// On PHP 8+ curl_close() is a no-op; dropping the last reference is what
// closes the handle and writes the cookie jar before the second request.
unset($curl);

// Extract the hidden form tokens. Without both of them the POST below would
// be sent with empty values, so stop with a clear message instead.
$__VIEWSTATE = null;
$__EVENTVALIDATION = null;
if(preg_match('/<input\s*type="hidden"\s*name="__VIEWSTATE"\s*id="__VIEWSTATE"\s*value="(.*?)"\s*\/>/is',$data,$match)){
    $__VIEWSTATE = $match[1] ;
}
if(preg_match('/<input\s*type="hidden"\s*name="__EVENTVALIDATION"\s*id="__EVENTVALIDATION"\s*value="(.*?)"\s*\/>/is',$data,$match)){
    $__EVENTVALIDATION = $match[1] ;
}
if ($__VIEWSTATE === null || $__EVENTVALIDATION === null) {
    exit("Could not find __VIEWSTATE / __EVENTVALIDATION in the response from ".$url);
}

// ---- Step 2: post the pager event with the search filters -----------------
$curlPost = array() ;
$curlPost['__EVENTTARGET'] = 'AspNetPager1';
$curlPost['__EVENTARGUMENT'] = 25 ; // page number to jump to
$curlPost['__VIEWSTATE'] = $__VIEWSTATE ;
$curlPost['__EVENTVALIDATION'] = $__EVENTVALIDATION ;
$curlPost['ddlSex'] = -1 ;
$curlPost['ddlDegreeMin'] = 0 ;
$curlPost['ddlDegreeMax'] = 0 ;
$curlPost['txtWorkArea'] = '南宁市'; // search region
$curlPost['ddlCallingPrompt'] = '不限'; // job category
$curlPost['ddlCalling'] = 0 ;
$curlPost['ddlPublicTime'] = 0 ;
$curlPost['ddlSalary'] = 0 ;
$curlPost['ddlEnterpriseKind'] = 0 ;
$curlPost['ddlComputerLevel'] = 0 ;
$curlPost['ddlWorkKind'] = 0 ;
$curlPost['ddlTitle'] = 1 ;

$curl = curl_init();
curl_setopt($curl, CURLOPT_URL, $url);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($curl, CURLOPT_REFERER, $url);
curl_setopt($curl, CURLOPT_USERAGENT, $user_agent);
curl_setopt($curl, CURLOPT_HEADER, 1);
curl_setopt($curl, CURLOPT_NOBODY, 0);
curl_setopt($curl, CURLOPT_POST, 1);
curl_setopt($curl, CURLOPT_POSTFIELDS, http_build_query($curlPost));
curl_setopt($curl, CURLOPT_COOKIEFILE, $cookie_file); // replay cookies saved in step 1
$data = curl_exec($curl);
if ($data === false) {
    $error = curl_error($curl);
    curl_close($curl);
    exit("POST request failed: ".$error);
}
curl_close($curl);
echo $data;
相当于串行执行:先获取列表数据,再一项一项进行抓取。CURL 也可以实现多线程抓取,不然太慢了:PHP Curl 多线程实现
使用方法:
// Usage example: pass the list of URLs to fetch to the MultiHttpRequest
// constructor.
$urls = array("http://baidu.com", "http://21andy.com", "http://google.com");
$mp = new MultiHttpRequest($urls);
$mp->start();

下载: class_curl_multi.php

<?php
/*
 * Parallel HTTP fetcher built on PHP's curl_multi API.
 * Usage:
 * ========================
$urls = array("http://baidu.com", "http://dzone.com", "http://google.com");
$mp = new MultiHttpRequest($urls);
$mp->start();
 * ========================
 */
class MultiHttpRequest {
    /** @var array|false URLs to fetch; keys are preserved in the result of start(). */
    public $urls = array();
    /** @var int|bool Passed to CURLOPT_HEADER: include response headers in the output. */
    public $curlopt_header = 1;
    /** @var string Upper-case HTTP method applied to every request. */
    public $method = "GET";

    /**
     * @param array|false $urls URLs to fetch; may also be supplied later via set_urls().
     */
    function __construct($urls = false) {
        $this->urls = $urls;
    }

    /**
     * Replace the URL list.
     *
     * @param array $urls
     * @return $this
     */
    function set_urls($urls) {
        $this->urls = $urls;
        return $this;
    }

    /**
     * Choose whether response headers are included in each fetched body.
     *
     * @param int|bool $b
     * @return $this
     */
    function is_return_header($b) {
        $this->curlopt_header = $b;
        return $this;
    }

    /**
     * Set the HTTP method (case-insensitive).
     *
     * @param string $m
     * @return $this
     */
    function set_method($m) {
        // Store into the declared $method property; the previous code wrote
        // to a misspelled property, so the setting was silently lost.
        $this->method = strtoupper($m);
        return $this;
    }

    /**
     * Fetch all URLs in parallel, echo each response followed by a newline,
     * and return the responses.
     *
     * @return array|false Responses keyed like $urls, or false when $urls is
     *                     not a non-empty array.
     */
    function start() {
        if (!is_array($this->urls) or count($this->urls) == 0) {
            return false;
        }
        $curl = $text = array();
        $handle = curl_multi_init();
        foreach ($this->urls as $k => $v) {
            $curl[$k] = $this->add_handle($handle, $v);
        }
        $this->exec_handle($handle);
        foreach ($curl as $k => $ch) {
            // Output the fetched content, not the handle itself.
            $text[$k] = curl_multi_getcontent($ch);
            echo $text[$k], "\n";
            curl_multi_remove_handle($handle, $ch);
            curl_close($ch); // release the easy handle once detached
        }
        curl_multi_close($handle);
        return $text;
    }

    /**
     * Create an easy handle for $url and attach it to the multi handle.
     *
     * @param resource|CurlMultiHandle $handle
     * @param string $url
     * @return resource|CurlHandle
     */
    private function add_handle($handle, $url) {
        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_HEADER, $this->curlopt_header);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        // Apply the configured method; GET is cURL's default and needs nothing.
        switch ($this->method) {
            case 'GET':
                break;
            case 'POST':
                curl_setopt($curl, CURLOPT_POST, 1);
                break;
            case 'HEAD':
                curl_setopt($curl, CURLOPT_NOBODY, 1);
                break;
            default:
                curl_setopt($curl, CURLOPT_CUSTOMREQUEST, $this->method);
        }
        curl_multi_add_handle($handle, $curl);
        return $curl;
    }

    /**
     * Drive the multi handle until every transfer has finished.
     *
     * @param resource|CurlMultiHandle $handle
     */
    private function exec_handle($handle) {
        $running = null;
        do {
            // Older libcurl versions ask to be called again immediately.
            do {
                $status = curl_multi_exec($handle, $running);
            } while ($status === CURLM_CALL_MULTI_PERFORM);

            if ($status !== CURLM_OK) {
                break;
            }
            // Wait for socket activity instead of spinning the CPU; when
            // select cannot wait (-1), back off briefly.
            if ($running > 0 && curl_multi_select($handle, 1.0) === -1) {
                usleep(1000);
            }
        } while ($running > 0);
    }
}
{
    // Body of the getCookieJar() method quoted in the thread (its
    // `private function getCookieJar()` header line is missing at this point).
    // Logs in through a two-step form and returns the path of the cookie jar
    // file holding the session, or '' when no login can be attempted.
    // Reads login_url, logincheck_url, userName and userPass from
    // $this->config->spider.
    $cookiejar = tempnam('/tmp' , '.co');
    if ($cookiejar === false) {
        // No temp file could be created, so there is nowhere to store cookies.
        return '';
    }

    // Step 1: request the login form and keep the cookies it sets.
    // NOTE(review): CURLOPT_POST is enabled without CURLOPT_POSTFIELDS, so this
    // is sent as an empty POST - confirm the site really expects that.
    $ch = curl_init();
    $options = array(
        CURLOPT_URL => $this->config->spider->login_url,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.10) Gecko/20100914 Firefox/3.6.10',
        CURLOPT_POST => true,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_COOKIEJAR => $cookiejar
    );
    curl_setopt_array($ch , $options);
    $login_form = curl_exec($ch);
    curl_close($ch);

    // The form carries a field whose id and value are both 32-character
    // [0-9a-z] strings; it is posted back as "<id>=<value>" below.
    $matches = array();
    if ($login_form !== false) {
        $pattern = '#id="([0-9a-z]{32})".*?value="([0-9a-z]{32})"#is';
        preg_match_all($pattern , $login_form , $matches);
    }

    if (!empty($matches[1]))
    {
        // Step 2: submit the credentials plus the token, reusing the cookies.
        $ch = curl_init();
        $options = array(
            CURLOPT_URL => $this->config->spider->logincheck_url,
            CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.10) Gecko/20100914 Firefox/3.6.10',
            CURLOPT_REFERER => $this->config->spider->login_url,
            CURLOPT_POST => true,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_COOKIEFILE => $cookiejar,
            CURLOPT_COOKIEJAR => $cookiejar,
            CURLOPT_POSTFIELDS => 'autoLogin=1&userName='.urlencode($this->config->spider->userName).'&userPass='.urlencode($this->config->spider->userPass).'&'.$matches[1][0].'='.$matches[2][0].'&login='.urlencode('立即登录'),
            CURLOPT_FOLLOWLOCATION => true,
        );
        curl_setopt_array($ch , $options);
        // The response is not inspected: as before, the jar path is returned
        // whether or not the server accepted the credentials.
        $login = curl_exec($ch);
        curl_close($ch);
        return $cookiejar;
    }

    // Request failed or no token found: remove the temp file created by
    // tempnam() so failed attempts do not accumulate files in /tmp.
    if (file_exists($cookiejar)) {
        unlink($cookiejar);
    }
    return '';
}
curl 这块内容看起来很麻烦啊