function get($url,$referer="",$cookies="")
{
/*
$parameter 格式为:aa=1&bb=2
$cookies 格式为:aa=1;bb=2
*/
$returnResult = array();
$returnResult["content"] = false;
$returnResult["cookies"] = false;
$context['http'] = array
(
'method' => 'GET',
'header'=>"Accept: */*\r\n".
"User-Agent:Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11\r\n" .
"Cookie:{$cookies}\r\n" .
"Referer:{$referer}\r\n" .
"\r\n"
);
$context["http"] = array
(
'encoding' => 'gb2312'
);
$returnResult["content"] = @file_get_contents($url,false,stream_context_create($context));
if(@$http_response_header)
{
preg_match_all("/Set-Cookie:\s*(.*?)\r\n/",implode("\r\n",$http_response_header),$cookies);
$cookies = preg_replace("/(;).*/","",$cookies[1]);
$returnResult["cookies"] = implode(";",$cookies);
}
return $returnResult;
}
/*
我想问的是为什么同样是utf8的两个站为为什么返回的编码不一样呢?
我想实现的功能是无论对方是utf8的站还是gb2312的站返回的编码都是gb2312编码
请问应当如何实现啊?
这个上下文件参数,究竟有什么作用如下
*/
//$result = get("http://www.dunsh.org/");得到的字符串编码仍然为utf8编码
//$result = get("http://www.google.cn/");//得到的字符串编码仍然为为gbk
echo $result["content"];
<?php
header('Content-Type: text/html; charset=utf-8');
function get($url,$referer="",$cookies="")
{
/*
$parameter 格式为:aa=1&bb=2
$cookies 格式为:aa=1;bb=2
*/
$returnResult = array();
$returnResult["content"] = false;
$returnResult["cookies"] = false;
$context['http'] = array
(
'method' => 'GET',
'header'=>"Accept: */*\r\n".
"User-Agent:Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11\r\n" .
"Cookie:{$cookies}\r\n" .
"Referer:{$referer}\r\n" .
"\r\n"
);
$context["http"] = array
(
'encoding' => 'gb2312'
);
$returnResult["content"] = @file_get_contents($url,false,stream_context_create($context));
if(@$http_response_header)
{
preg_match_all("/Set-Cookie:\s*(.*?)\r\n/",implode("\r\n",$http_response_header),$cookies);
$cookies = preg_replace("/(;).*/","",$cookies[1]);
$returnResult["cookies"] = implode(";",$cookies);
}
return $returnResult;
}
/*
我想问的是为什么同样是utf8的两个站为为什么返回的编码不一样呢?
我想实现的功能是无论对方是utf8的站还是gb2312的站返回的编码都是gb2312编码
请问应当如何实现啊?
这个上下文件参数,究竟有什么作用如下
*/
//$result = get("http://www.dunsh.org/");得到的字符串编码仍然为utf8编码
//$result = get("http://www.google.cn/");//得到的字符串编码仍然为为gbk
echo $result["content"];
?>
$context["http"] = array
(
'encoding' => 'gb2312'
);
这个地方注释掉,才正常。
你这样做,对utf8的网站是正常但对gbk网站却不正常,
在线等待好的解决方案。
我抓取别人的网站啊?别人的网站是utf-8还是gbk是不确定的啊?
但有个小问题就是如果把采集过来的字符用iconv从utf8转换到gbk时候报错:
Notice: iconv() [function.iconv]: Detected an illegal
于是我就不转换成gbk了转换成gb18030
请问一下这个gb18030字符字符集是安装在php当中的吧?php当中哪个版本都有这个字符集合吗?
<?php
define("SAFEENCODING","GB18030");
function get($url,$referer="",$cookies="")
{
/*
$parameter 格式为:aa=1&bb=2
$cookies 格式为:aa=1;bb=2
*/
$returnResult = array();
$returnResult["content"] = false;
$returnResult["cookies"] = false;
$context['http'] = array
(
'method' => 'GET',
'header'=>"Accept: */*\r\n".
"User-Agent:Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11\r\n" .
"Cookie:{$cookies}\r\n" .
"Referer:{$referer}\r\n" .
"\r\n"
);
// $context["http"] = array
// (
// 'encoding' => 'UTF-8'
// );
$returnResult["content"] = @file_get_contents($url,false,stream_context_create($context));
if($returnResult["content"])
{
$returnResult["content"] = safeEncoding($returnResult["content"],SAFEENCODING);
}
if(@$http_response_header)
{
preg_match_all("/Set-Cookie:\s*(.*?)\r\n/",implode("\r\n",$http_response_header),$cookies);
$cookies = preg_replace("/(;).*/","",$cookies[1]);
$returnResult["cookies"] = implode(";",$cookies);
}
return $returnResult;
}
function post($url,$parameter="",$referer="",$cookies="")
{
/*
$parameter 格式为:aa=1&bb=2
$cookies 格式为:aa=1;bb=2
*/
$returnResult = array();
$returnResult["content"] = false;
$returnResult["cookies"] = false;
$context['http'] = array
(
'method' => 'POST',
'header'=>"Content-type: application/x-www-form-urlencoded\r\n".
"User-Agent:Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11\r\n" .
"Cookie:{$cookies}\r\n" .
"Referer:{$referer}\r\n" .
"\r\n",
'content' => $parameter
);
$returnResult["content"] = @file_get_contents($url,false,stream_context_create($context));
if($returnResult["content"])
{
$returnResult["content"] = safeEncoding($returnResult["content"],SAFEENCODING);
}
if(@$http_response_header)
{
preg_match_all("/Set-Cookie:\s*(.*?)\r\n/",implode("\r\n",$http_response_header),$cookies);
$cookies = preg_replace("/(;).*/","",$cookies[1]);
$returnResult["cookies"] = implode(";",$cookies);
}
return $returnResult;
}
function safeEncoding($string,$outEncoding = 'UTF-8')
{
$encoding = "UTF-8";
for($i=0;$i<strlen($string);$i++)
{
if(ord($string{$i})<128)
continue;
if((ord($string{$i})&224)==224)
{
//第一个字节判断通过
$char = $string{++$i};
if((ord($char)&128)==128)
{
//第二个字节判断通过
$char = $string{++$i};
if((ord($char)&128)==128)
{
$encoding = "UTF-8";
break;
}
}
}
if((ord($string{$i})&192)==192)
{
//第一个字节判断通过
$char = $string{++$i};
if((ord($char)&128)==128)
{
//第二个字节判断通过
$encoding = "GB18030";
break;
}
}
}
if(strtoupper($encoding) == strtoupper($outEncoding))
{
return $string;
}
else
{
return iconv($encoding,$outEncoding,$string);
}
}
?>
你那返回有这个信息?.:?
======================================
Array ( [0] => HTTP/1.0 200 OK [1] => Date: Mon, 17 Aug 2009 05:38:57 GMT [2] => Expires: -1 [3] => Cache-Control: private, max-age=0 [4] => Content-Type: text/html; charset=UTF-8 [5] => Set-Cookie: PREF=ID=dd182d200a6c910a:NW=1:TM=1250487537:LM=1250487537:S=_i5KUYQXRx9A8hzX; expires=Wed, 17-Aug-2011 05:38:57 GMT; path=/; domain=.google.cn [6] => Server: gws )
不行还可以正则匹配content里的meta标签,再不行可以用mb_detect_encoding试下返回数据的编码。
万一你的函数抓取一个iso-8859-1编码的网页肯定错啊。
还有你可以用你的函数测试下www.baidu.com,这个是gb2312,貌似也没转换成功。
meta标签一般正式的站点页面都有标记的,个人觉得这个还好点。
function get($url,$referer="",$cookies="")
{
/*
$parameter 格式为:aa=1&bb=2
$cookies 格式为:aa=1;bb=2
*/
$returnResult = array();
$returnResult["content"] = false;
$returnResult["cookies"] = false;
$context['http'] = array
(
'method' => 'GET',
'header'=>"Accept: */*\r\n".
"User-Agent:Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11\r\n" .
"Cookie:{$cookies}\r\n" .
"Referer:{$referer}\r\n" .
"\r\n"
);
// $context["http"] = array
// (
// 'encoding' => 'UTF-8'
// );
$returnResult["content"] = @file_get_contents($url,false,stream_context_create($context));
preg_match("/<meta(?:.[^>]*)charset\s*=([a-zA-Z0-9-_]+)/i",$returnResult['content'],$m);
if($m[1] != "utf-8")
{
$returnResult['content'] = iconv($m[1],"utf-8",$returnResult['content']);
}
if(@$http_response_header)
{
preg_match_all("/Set-Cookie:\s*(.*?)\r\n/",implode("\r\n",$http_response_header),$cookies);
$cookies = preg_replace("/(;).*/","",$cookies[1]);
$returnResult["cookies"] = implode(";",$cookies);
}
return $returnResult;
}
$r = get('http://dir.yahoo.co.jp/');//抓个小日本的页面。
echo $r['content'];
exit;