遇到了问题,请各位高手指点一下:怎样抓取 http://huaban.com/favorite/beauty/ 这个网页的数据?
这是用 JavaScript 动态生成的页面,我主要想获得响应的 JSON 数据并写入数据库。我的问题是,现在抓取到的是页面的 JavaScript 内容,而不是想要的 JSON 数据。在火狐浏览器中访问 http://huaban.com/favorite/beauty/?ihd8qpsz&since=534268926&limit=100&wfl=1 可以看到响应的 JSON 数据,就是不知道怎么用 PHP 抓取。
,这是用javascript动态生成的页面,主要想获得响应的JSON数据,写入数据库。我的问题是,现在抓取的页面是javascript的内容,而不是想要的JSON数据。http://huaban.com/favorite/beauty/?ihd8qpsz&since=534268926&limit=100&wfl=1这 在火狐浏览器中可以看到响应的JSON数据的,就是不知道怎么抓取(PHP)
// Request headers copied from the browser's own XHR call. The X-Request and
// X-Requested-With headers mark the call as an AJAX request; presumably that is
// what makes the server answer with JSON rather than the HTML page (confirm
// against the live site).
$header = array(
    'Accept: application/json',
    // 'Accept-Encoding: gzip, deflate, sdch' was disabled in the original,
    // presumably so that the body arrives uncompressed.
    'Accept-Language: zh-CN,zh;q=0.8',
    'Connection: keep-alive',
    'Cookie: sid=degsfnZpb0zZasKaJXuT2ogT.y5DrPd0n%2F1flfouGNkdAJyoG2fG9AsVfySSnJAs%2B5%2F0',
    'Host: huaban.com',
    'Referer: http://huaban.com/favorite/beauty/',
    'X-Request: JSON',
    'X-Requested-With: XMLHttpRequest',
);

// $url is expected to be set before this snippet runs.
$ch = curl_init();
curl_setopt_array($ch, array(
    CURLOPT_URL            => $url,
    CURLOPT_HTTPHEADER     => $header,
    CURLOPT_HEADER         => 0,     // do not include response headers in the output
    CURLOPT_NOBODY         => 0,     // do fetch the body
    CURLOPT_RETURNTRANSFER => true,  // return the body from curl_exec() instead of printing it
    CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
));

$response = curl_exec($ch);

// Print either the transport error or the raw response body.
echo curl_errno($ch) ? 'Curl error: ' . curl_error($ch) : $response;

curl_close($ch);
<?php
// The opening tag must be followed by whitespace; "<?php$url" is not recognised
// as a PHP open tag and the whole script would be emitted as plain text.

// URL requested by this script; the query string was copied from the browser.
$url = 'http://huaban.com/favorite/beauty/?ihd8qpsz&since=534268926&limit=100&wfl=1';

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);

// Request headers copied from the browser's XHR call.
$header = array();
$header[] = 'Accept: application/json';
// Accept-Encoding is left disabled (as in the original), presumably so the
// response is not compressed.
//$header[] = 'Accept-Encoding: gzip, deflate, sdch';
$header[] = 'Accept-Language: zh-CN,zh;q=0.8';
$header[] = 'Connection: keep-alive';
// NOTE(review): hard-coded session cookie captured from a browser session; it
// will expire and should not be committed to shared code.
$header[] = 'Cookie: sid=degsfnZpb0zZasKaJXuT2ogT.y5DrPd0n%2F1flfouGNkdAJyoG2fG9AsVfySSnJAs%2B5%2F0';
$header[] = 'Host: huaban.com';
$header[] = 'Referer: http://huaban.com/favorite/beauty/';
$header[] = 'X-Request: JSON';
$header[] = 'X-Requested-With: XMLHttpRequest';
curl_setopt($ch, CURLOPT_HTTPHEADER, $header);

curl_setopt($ch, CURLOPT_HEADER, 0);              // no response headers in the output
curl_setopt($ch, CURLOPT_NOBODY, 0);              // fetch the body
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);   // return the body instead of printing it
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36');

$response = curl_exec($ch);
if (curl_errno($ch)) {
    echo 'Curl error: ' . curl_error($ch);
} else {
    echo $response;
}
curl_close($ch);
// Driver script: reads the state file, determines the pin ID to continue from,
// fetches the next page of pins and appends their IDs to the state file.
//
// The state file ($file, expected to be set before this point) holds either
//   - a serialized array of batches, each batch being a list of pin IDs, or
//   - a bare pin ID used as the starting point on the first run.
$contents = file_get_contents($file);
if ($contents === false) {
    exit('error: cannot read ' . $file);
}

// NOTE(review): unserialize() must only ever see data this script wrote itself;
// never point $file at untrusted input.
$pin_array = unserialize($contents);
if (is_array($pin_array) && $pin_array) {
    // end() takes its argument by reference, so the last batch has to be stored
    // in a variable first; end(end(...)) raises "Only variables should be
    // passed by reference".
    $last_batch = end($pin_array);
    $pin_id = end($last_batch);
    echo '<h3>Pin IDs:</h3>';
    foreach ($pin_array as $row) {
        foreach ($row as $v) {
            echo $v.', ';
        }
        echo '<hr />';
    }
} else {
    // First run: the file contains only the seed pin ID. Start from an empty
    // batch list instead of appending to the boolean false from unserialize().
    $pin_array = array();
    $pin_id = trim($contents);
}

$page = request($pin_id);
$new_pins = get_pins($page);

// Only persist when pins were actually found. Storing an empty or null batch
// would break the end()-based lookup on the next run, and on a first run it
// would overwrite the seed pin ID with an empty list.
if (is_array($new_pins) && $new_pins) {
    $pin_array[] = $new_pins;
    file_put_contents($file, serialize($pin_array));
}
/**
 * Fetch one page of the "beauty" favorites listing, starting at the given pin.
 *
 * Sends the request with X-Requested-With / X-Request headers and an
 * "Accept: application/json" header. Terminates the script with
 * "error: <curl errno>" when the transfer fails or the body is empty.
 *
 * @param string|int $pin_id Pin ID sent as the "max" query parameter.
 * @return string Raw response body.
 */
function request($pin_id) {
    // Encode the ID so that stray whitespace or other characters cannot break the URL.
    $url = 'http://huaban.com/favorite/beauty/?max=' . rawurlencode((string) $pin_id) . '&limit=20';
    $header = array(
        'Host: huaban.com',
        'User-Agent: Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Accept: application/json',
        'Accept-Language: zh-cn,en-us;q=0.7,en;q=0.3',
        'X-Requested-With: XMLHttpRequest',
        'X-Request: JSON',
    );

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);

    $response = curl_exec($ch);
    // Read the error code and release the handle before leaving the function;
    // previously curl_close() sat after return/exit and was never reached.
    $errno = curl_errno($ch);
    curl_close($ch);

    // Same failure condition as before: any falsy result (false or empty body).
    if (!$response) {
        exit('error: ' . $errno);
    }
    return $response;
}
/**
 * Extract pin IDs from a raw JSON response body.
 *
 * Matches every run of digits that directly follows `{"pin_id":` where the
 * opening brace is itself preceded by `[` or `,`, i.e. objects that are
 * elements of an array and whose first key is "pin_id". The body is scanned
 * with a regular expression rather than decoded.
 *
 * @param string $page Raw response body.
 * @return array List of pin IDs as numeric strings, in order of appearance;
 *               an empty array when nothing matches (previously the function
 *               fell through and returned null).
 */
function get_pins($page) {
    if (preg_match_all('/(?<=[\[,]\{"pin_id":)\d+/', $page, $match)) {
        return $match[0];
    }
    // No match (or a regex error): return an empty list so callers always get an array.
    return array();
}