一个用 PHP 写的采集其他网站资料的页面,以前能正常采集。现在被采集的原网站升级了,网页编码类型也更换了,我采集到的内容变成了乱码,求人解决!附上其中一个采集页面的代码。QQ 1872 32958,可以付费解决!<?
//if ($HTTP_SERVER_VARS['SERVER_ADDR']<>"58.64.136.81"){exit;}if (date('Y-m-d')>'2009-01-01'){exit;}include "../../lib/library.mem.php";
// NOTE(review): everything on the line above is one single comment, so the two
// exit checks and the library.mem.php include on it are never executed.
include "../../lib/mysqllib.php";
include "../../lib/pub_library.php";
include "../../lib/http.class.php";

// Collector script: fetches every result page of the zh-tw feed, extracts the
// "Array(...);" records it contains and inserts or completes rows in the
// foot_match table.
$langx = 'zh-tw';
$t_page = 1; // number of pages to fetch; replaced by the value found in the data

for ($pages = 0; $pages < $t_page; $pages++) {
    $data = get_uid($langx);
    $data = theif_data($data[0], $data[1], 'FT', 'r', $langx, $pages);

    // The code below treats the payload as Big5 (big52gb(), the *_tw columns).
    // NOTE(review): the source site reportedly switched to UTF-8; a payload
    // that is valid UTF-8 is converted back to Big5 here. Pure ASCII is left
    // unchanged by this conversion.
    if (is_string($data) && function_exists('mb_check_encoding') && mb_check_encoding($data, 'UTF-8')) {
        $data = mb_convert_encoding($data, 'BIG-5', 'UTF-8');
    }

    // The page count is announced as "t_page=N;". Read the text AFTER the
    // marker (index 1); index 0 is whatever precedes it and never was a count.
    $pb = explode('t_page=', $data);
    if (isset($pb[1])) {
        $pb = explode(';', $pb[1]);
        $t_page = (int) $pb[0];
    }

    // Pages without the "gamount" marker carry no records.
    if (strpos($data, 'gamount') === false) {
        continue;
    }

    $k = 0; // not read again anywhere in this script
    preg_match_all("/Array\((.+?)\);/is", $data, $matches);
    $cou = sizeof($matches[0]);
    $db = new proc_DB(DB_HOST, DB_USER, DB_PWD, DB_NAME);

    for ($i = 0; $i < $cou; $i++) {
        $messages = $matches[0][$i];
        $messages = str_replace(");", ")", $messages);
        $messages = str_replace("cha(5)", "", $messages);

        // SECURITY(review): this evaluates text downloaded from a remote site
        // as PHP code. It is kept because the record format is not visible
        // here, but it should be replaced by a real parser.
        $datainfo = eval("return $messages;");
        if (!is_array($datainfo)) {
            continue; // record did not evaluate to an array
        }

        $dtime = match_start($datainfo[1]);

        // Cast the match id so it cannot carry SQL into the statements below.
        $mid = (int) $datainfo[0];
        if ($mid == 0) {
            continue;
        }

        // Two or more <BR> separators in the time field set $m_type to 1.
        // The value is not read again anywhere in this script.
        if (substr_count(strtoupper($datainfo[1]), '<BR>') >= 2) {
            $m_type = 1;
        } else {
            $m_type = 0;
        }

        $mb_team = big52gb($datainfo[5]);
        $tg_team = big52gb($datainfo[6]);
        $league  = big52gb($datainfo[2]);

        // NOTE(review): the string values below are interpolated unescaped.
        // Escape them with whatever proc_DB provides once that class is known.
        $sql = "select mid,mb_team,tg_team,mb_team_tw,tg_team_tw from foot_match where mid=$mid";
        $db->query($sql, 1);
        if ($db->num_rows() == 0) {
            // New match. TG_MID now really receives $datainfo[4]; the original
            // misspelled the variable name, so the column was always empty.
            $sql = "INSERT INTO foot_match(MID,M_Start,M_Date,M_Time,MB_Team,TG_Team,MB_Team_tw,TG_Team_tw,M_League_tw,M_League,MB_MID,TG_MID,R_Show) VALUES
('$mid','$dtime[2]','$dtime[0]','$dtime[1]','$mb_team','$tg_team','$datainfo[5]','$datainfo[6]','$datainfo[2]','$league','$datainfo[3]','$datainfo[4]','1')";
            $db->query($sql);
        } else {
            if ($db->f('mb_team') == '' or $db->f('tg_team') == '' or $db->f('tg_team_tw') == '' or $db->f('mb_team_tw') == '') {
                // Existing row with a missing team name: fill the names in.
                // MB_Team_tw uses index 5, the same field the INSERT uses.
                $sql = "update foot_match set mb_team='$mb_team',tg_team='$tg_team',MB_Team_tw='$datainfo[5]',TG_Team_tw='$datainfo[6]',M_League='$league',M_League_tw='$datainfo[2]' where MID=$mid";
                $db->query($sql);
            }
        }
    }
    $db->close();
}
?>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=big5">
<title></title>
<link href="/style/style.css" rel="stylesheet" type="text/css">
<style type="text/css">
<!--
body {
margin-left: 0px;
margin-top: 0px;
}
-->
</style>
</head><body bgcolor="#AACCCC">
<script>
<!--
// Auto-refresh countdown: shows the remaining seconds in the "timeinfo" span
// and reloads the page when the counter reaches 1.
var limit="45"
if (document.images){
var parselimit=limit
}
function beginrefresh(){
if (!document.images)
return
if (parselimit==1)
window.location.reload()
else{
parselimit-=1
var curmin=Math.floor(parselimit)
// The original had a second branch for curmin==0 that read an undefined
// variable (cursec); both branches build the same message, so one is enough.
var curtime=curmin+"秒后自动本页获取最新数据!"
document.getElementById("timeinfo").innerText=curtime
setTimeout(beginrefresh,1000)
}
}window.onload=beginrefresh
//-->
</script>
<table width="102" height="100" border="0" cellpadding="0" cellspacing="0">
<tr>
<td width="110" height="110" align="center">
虫Αタ钡Μ<br>
<span id="timeinfo"></span><br>
<input type=button name=button value="羉蔨" onClick="window.location.reload()"></td>
</tr>
</table>
</body>
</html>
//if ($HTTP_SERVER_VARS['SERVER_ADDR']<>"58.64.136.81"){exit;}if (date('Y-m-d')>'2009-01-01'){exit;}include "../../lib/library.mem.php";
// NOTE(review): everything on the line above is one single comment, so the two
// exit checks and the library.mem.php include on it are never executed.
include "../../lib/mysqllib.php";
include "../../lib/pub_library.php";
include "../../lib/http.class.php";

// Collector script: fetches every result page of the zh-tw feed, extracts the
// "Array(...);" records it contains and inserts or completes rows in the
// foot_match table.
$langx = 'zh-tw';
$t_page = 1; // number of pages to fetch; replaced by the value found in the data

for ($pages = 0; $pages < $t_page; $pages++) {
    $data = get_uid($langx);
    $data = theif_data($data[0], $data[1], 'FT', 'r', $langx, $pages);

    // The code below treats the payload as Big5 (big52gb(), the *_tw columns).
    // NOTE(review): the source site reportedly switched to UTF-8; a payload
    // that is valid UTF-8 is converted back to Big5 here. Pure ASCII is left
    // unchanged by this conversion.
    if (is_string($data) && function_exists('mb_check_encoding') && mb_check_encoding($data, 'UTF-8')) {
        $data = mb_convert_encoding($data, 'BIG-5', 'UTF-8');
    }

    // The page count is announced as "t_page=N;". Read the text AFTER the
    // marker (index 1); index 0 is whatever precedes it and never was a count.
    $pb = explode('t_page=', $data);
    if (isset($pb[1])) {
        $pb = explode(';', $pb[1]);
        $t_page = (int) $pb[0];
    }

    // Pages without the "gamount" marker carry no records.
    if (strpos($data, 'gamount') === false) {
        continue;
    }

    $k = 0; // not read again anywhere in this script
    preg_match_all("/Array\((.+?)\);/is", $data, $matches);
    $cou = sizeof($matches[0]);
    $db = new proc_DB(DB_HOST, DB_USER, DB_PWD, DB_NAME);

    for ($i = 0; $i < $cou; $i++) {
        $messages = $matches[0][$i];
        $messages = str_replace(");", ")", $messages);
        $messages = str_replace("cha(5)", "", $messages);

        // SECURITY(review): this evaluates text downloaded from a remote site
        // as PHP code. It is kept because the record format is not visible
        // here, but it should be replaced by a real parser.
        $datainfo = eval("return $messages;");
        if (!is_array($datainfo)) {
            continue; // record did not evaluate to an array
        }

        $dtime = match_start($datainfo[1]);

        // Cast the match id so it cannot carry SQL into the statements below.
        $mid = (int) $datainfo[0];
        if ($mid == 0) {
            continue;
        }

        // Two or more <BR> separators in the time field set $m_type to 1.
        // The value is not read again anywhere in this script.
        if (substr_count(strtoupper($datainfo[1]), '<BR>') >= 2) {
            $m_type = 1;
        } else {
            $m_type = 0;
        }

        $mb_team = big52gb($datainfo[5]);
        $tg_team = big52gb($datainfo[6]);
        $league  = big52gb($datainfo[2]);

        // NOTE(review): the string values below are interpolated unescaped.
        // Escape them with whatever proc_DB provides once that class is known.
        $sql = "select mid,mb_team,tg_team,mb_team_tw,tg_team_tw from foot_match where mid=$mid";
        $db->query($sql, 1);
        if ($db->num_rows() == 0) {
            // New match. TG_MID now really receives $datainfo[4]; the original
            // misspelled the variable name, so the column was always empty.
            $sql = "INSERT INTO foot_match(MID,M_Start,M_Date,M_Time,MB_Team,TG_Team,MB_Team_tw,TG_Team_tw,M_League_tw,M_League,MB_MID,TG_MID,R_Show) VALUES
('$mid','$dtime[2]','$dtime[0]','$dtime[1]','$mb_team','$tg_team','$datainfo[5]','$datainfo[6]','$datainfo[2]','$league','$datainfo[3]','$datainfo[4]','1')";
            $db->query($sql);
        } else {
            if ($db->f('mb_team') == '' or $db->f('tg_team') == '' or $db->f('tg_team_tw') == '' or $db->f('mb_team_tw') == '') {
                // Existing row with a missing team name: fill the names in.
                // MB_Team_tw uses index 5, the same field the INSERT uses.
                $sql = "update foot_match set mb_team='$mb_team',tg_team='$tg_team',MB_Team_tw='$datainfo[5]',TG_Team_tw='$datainfo[6]',M_League='$league',M_League_tw='$datainfo[2]' where MID=$mid";
                $db->query($sql);
            }
        }
    }
    $db->close();
}
?>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=big5">
<title></title>
<link href="/style/style.css" rel="stylesheet" type="text/css">
<style type="text/css">
<!--
body {
margin-left: 0px;
margin-top: 0px;
}
-->
</style>
</head><body bgcolor="#AACCCC">
<script>
<!--
// Auto-refresh countdown: shows the remaining seconds in the "timeinfo" span
// and reloads the page when the counter reaches 1.
var limit="45"
if (document.images){
var parselimit=limit
}
function beginrefresh(){
if (!document.images)
return
if (parselimit==1)
window.location.reload()
else{
parselimit-=1
var curmin=Math.floor(parselimit)
// The original had a second branch for curmin==0 that read an undefined
// variable (cursec); both branches build the same message, so one is enough.
var curtime=curmin+"秒后自动本页获取最新数据!"
document.getElementById("timeinfo").innerText=curtime
setTimeout(beginrefresh,1000)
}
}window.onload=beginrefresh
//-->
</script>
<table width="102" height="100" border="0" cellpadding="0" cellspacing="0">
<tr>
<td width="110" height="110" align="center">
虫Αタ钡Μ<br>
<span id="timeinfo"></span><br>
<input type=button name=button value="羉蔨" onClick="window.location.reload()"></td>
</tr>
</table>
</body>
</html>
把上面代码里 <meta http-equiv="Content-Type" content="text/html; charset=big5"> 这一句中的 big5 换成网页现在的编码。
<!-- Frameset page that declares charset=utf-8 and loads app/member/ and ./ok.html as its two frames. -->
<head>
<!--<title>Welcome</title>-->
<title>Welcome </title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<frameset rows="*,0,0" frameborder="NO" border="0" framespacing="0">
<frame name="SI2_mem_index" src="app/member/">
<frame name="SI2_func" scrolling="NO" noresize src="./ok.html">
</frameset>
<noframes>
<body bgcolor="#FFFFFF" text="#000000">
</body>
</noframes>
</html>
给你一个转换编码的函数,很方便的。<?php
//UTF-8 to GB conversion
// Converts a UTF-8 string to GBK. Uses iconv() when the extension is loaded;
// otherwise falls back to a lookup table read from data/gb2312-utf8.dat and
// cached in the global $UC2GBTABLE (key: Unicode code point, value: GB code
// before the 0x8080 offset is added).
// In the fallback, characters missing from the table are emitted as numeric
// entities ("&#N;"), invalid or truncated UTF-8 bytes are dropped, and the
// result is trimmed.
function utf82gb($utfstr)
{
    if(function_exists('iconv'))
    {
        return iconv('utf-8','gbk//ignore',$utfstr);
    }
    global $UC2GBTABLE;
    if(trim($utfstr)=="")
    {
        return $utfstr;
    }
    if(empty($UC2GBTABLE))
    {
        $UC2GBTABLE = array();
        $filename = "data/gb2312-utf8.dat";
        $fp = fopen($filename,"r");
        // fopen() has already raised a warning when it fails; carry on with an
        // empty table instead of passing false to fgets()/fclose().
        if($fp !== false)
        {
            while($l = fgets($fp,15))
            {
                $UC2GBTABLE[hexdec(substr($l, 7, 6))] = hexdec(substr($l, 0, 6));
            }
            fclose($fp);
        }
    }
    $okstr = "";
    $ulen = strlen($utfstr);
    for($i=0;$i<$ulen;$i++)
    {
        $c = $utfstr[$i];
        $lead = ord($c);
        if($lead < 0x80)
        {
            // Plain ASCII is copied through unchanged.
            $okstr .= $c;
            continue;
        }
        // Number of continuation bytes announced by the lead byte. The
        // original derived this from ord() of the binary string, which always
        // produced 2 and so only handled three-byte sequences correctly.
        if($lead >= 0xC0 && $lead <= 0xDF)
        {
            $extra = 1;
        }
        else if($lead >= 0xE0 && $lead <= 0xEF)
        {
            $extra = 2;
        }
        else if($lead >= 0xF0 && $lead <= 0xF7)
        {
            $extra = 3;
        }
        else
        {
            // Stray continuation byte or invalid lead byte: drop it.
            continue;
        }
        if($i + $extra >= $ulen)
        {
            // Sequence is cut off at the end of the input: drop it.
            break;
        }
        $c = utf82u(substr($utfstr, $i, $extra + 1));
        $i += $extra;
        if(isset($UC2GBTABLE[$c]))
        {
            // Table values are GB codes without the high bits; add 0x8080 and
            // emit the two resulting bytes.
            $gb = $UC2GBTABLE[$c]+0x8080;
            $okstr .= chr(($gb >> 8) & 0xFF).chr($gb & 0xFF);
        }
        else
        {
            $okstr .= "&#".$c.";";
        }
    }
    $okstr = trim($okstr);
    return $okstr;
}//GB to UTF-8 conversion
// Converts a GBK string to UTF-8. Uses iconv() when the extension is loaded;
// otherwise falls back to a lookup table read from data/gb2312-utf8.dat and
// cached in the global $CODETABLE (key: GB code minus 0x8080, value: hex text
// of the Unicode code point).
// In the fallback, double-byte characters that are missing from the table or
// cut off at the end of the input are dropped.
function gb2utf8($gbstr)
{
    if(function_exists('iconv'))
    {
        return iconv('gbk','utf-8//ignore',$gbstr);
    }
    global $CODETABLE;
    if(trim($gbstr)=="")
    {
        return $gbstr;
    }
    if(empty($CODETABLE))
    {
        $CODETABLE = array();
        $filename = "data/gb2312-utf8.dat";
        $fp = fopen($filename,"r");
        // fopen() has already raised a warning when it fails; carry on with an
        // empty table instead of passing false to fgets()/fclose().
        if($fp !== false)
        {
            while ($l = fgets($fp,15))
            {
                $CODETABLE[hexdec(substr($l, 0, 6))] = substr($l, 7, 6);
            }
            fclose($fp);
        }
    }
    $gbstr = (string)$gbstr;
    $len = strlen($gbstr);
    $ret = "";
    // Walk the input by index instead of re-slicing the string every step.
    for($pos = 0;$pos < $len;$pos++)
    {
        $byte = $gbstr[$pos];
        if (ord($byte) <= 0x80)
        {
            // Single-byte character: copied through unchanged.
            $ret .= $byte;
            continue;
        }
        // Lead byte of a double-byte character: consume two bytes.
        $thisW = substr($gbstr, $pos, 2);
        $pos++;
        $key = hexdec(bin2hex($thisW)) - 0x8080;
        if(!isset($CODETABLE[$key]))
        {
            // Unmapped or truncated character. The original looked the key up
            // behind an "@" and ended up appending a NUL byte here.
            continue;
        }
        // u2utf8() returns the UTF-8 bytes as decimal numbers written one
        // after another (three digits per byte for non-ASCII characters).
        $utf8 = u2utf8(hexdec($CODETABLE[$key]));
        for ($i = 0;$i < strlen($utf8);$i += 3)
        {
            $ret .= chr((int)substr($utf8, $i, 3));
        }
    }
    return $ret;
}//Unicode code point to UTF-8
// Encodes one Unicode code point as UTF-8 and returns the byte VALUES as
// decimal numbers written one after another, not as raw bytes: 0x4E2D gives
// "228184173". gb2utf8() turns that text back into bytes three digits at a
// time. Code points of 0x200000 and above give an empty string.
function u2utf8($c)
{
    // The original initialised $str inside a loop over count($c); count() on
    // an integer is a TypeError on PHP 8, and the loop did nothing else.
    $str = "";
    if ($c < 0x80)
    {
        $str .= $c;
    }
    else if ($c < 0x800)
    {
        $str .= (0xC0 | $c >> 6);
        $str .= (0x80 | $c & 0x3F);
    }
    else if ($c < 0x10000)
    {
        $str .= (0xE0 | $c >> 12);
        $str .= (0x80 | $c >> 6 & 0x3F);
        $str .= (0x80 | $c & 0x3F);
    }
    else if ($c < 0x200000)
    {
        $str .= (0xF0 | $c >> 18);
        $str .= (0x80 | $c >> 12 & 0x3F);
        $str .= (0x80 | $c >> 6 & 0x3F);
        $str .= (0x80 | $c & 0x3F);
    }
    return $str;
}//UTF-8 sequence to Unicode code point
// Decodes a single UTF-8 sequence of 1 to 4 bytes into its Unicode code
// point. Any other length yields null. The bytes are not validated.
function utf82u($c)
{
    // Mask applied to the lead byte, keyed by sequence length.
    $leadMasks = array(2 => 0x3f, 3 => 0x1f, 4 => 0x0f);
    $size = strlen($c);
    if ($size == 1)
    {
        return ord($c);
    }
    if (!isset($leadMasks[$size]))
    {
        return null;
    }
    $n = ord($c[0]) & $leadMasks[$size];
    // Each continuation byte contributes its low six bits.
    for ($k = 1; $k < $size; $k++)
    {
        $n = ($n << 6) + (ord($c[$k]) & 0x3f);
    }
    return $n;
}//Big5 to GB conversion
// Converts a Big5 string to GBK. Uses iconv() when the extension is loaded;
// otherwise maps every double-byte character through the table read from
// data/big5-gb.dat and cached in the global $BIG5_DATA.
// In the fallback, characters whose table offset is out of range are left
// as they are, and the text is returned unchanged if the table cannot be opened.
function big52gb($Text)
{
    if(function_exists('iconv'))
    {
        return iconv('big5','gbk//ignore',$Text);
    }
    global $BIG5_DATA;
    if(empty($BIG5_DATA))
    {
        $filename = "data/big5-gb.dat";
        $fp = fopen($filename, "rb");
        if($fp === false)
        {
            // fopen() has already raised a warning; without the table nothing
            // can be converted.
            return $Text;
        }
        $BIG5_DATA = fread($fp,filesize($filename));
        fclose($fp);
    }
    $tableLen = strlen($BIG5_DATA);
    $max = strlen($Text)-1;
    for($i=0;$i<$max;$i++)
    {
        $h = ord($Text[$i]);
        if($h>=0x80)
        {
            $l = ord($Text[$i+1]);
            if($h==161 && $l==64)
            {
                // Big5 0xA140 is the full-width space; GBK encodes it as
                // 0xA1A1. The literal as pasted was a single character, which
                // made the second-byte assignment below read out of range.
                $gbstr = "\xA1\xA1";
            }
            else
            {
                $p = ($h-160)*510+($l-1)*2;
                if($p < 0 || $p+1 >= $tableLen)
                {
                    // Offset outside the table (for example a lead byte below
                    // 0xA0): keep the two bytes as they are.
                    $i++;
                    continue;
                }
                $gbstr = $BIG5_DATA[$p].$BIG5_DATA[$p+1];
            }
            $Text[$i] = $gbstr[0];
            $Text[$i+1] = $gbstr[1];
            $i++;
        }
    }
    return $Text;
}//GB to Big5 conversion
// Converts a GBK string to Big5. Uses iconv() when the extension is loaded;
// otherwise maps every double-byte character through the table read from
// data/gb-big5.dat and cached in the global $GB_DATA.
// In the fallback, characters whose table offset is out of range are left
// as they are, and the text is returned unchanged if the table cannot be opened.
function gb2big5($Text)
{
    if(function_exists('iconv'))
    {
        return iconv('gbk','big5//ignore',$Text);
    }
    global $GB_DATA;
    if(empty($GB_DATA))
    {
        $filename = "data/gb-big5.dat";
        $fp = fopen($filename, "rb");
        if($fp === false)
        {
            // fopen() has already raised a warning; without the table nothing
            // can be converted.
            return $Text;
        }
        // The original stored the file in a local $gb, so $GB_DATA stayed
        // empty and every lookup below failed.
        $GB_DATA = fread($fp,filesize($filename));
        fclose($fp);
    }
    $tableLen = strlen($GB_DATA);
    $max = strlen($Text)-1;
    for($i=0;$i<$max;$i++)
    {
        $h = ord($Text[$i]);
        if($h>=0x80)
        {
            $l = ord($Text[$i+1]);
            if($h==161 && $l==64)
            {
                // NOTE(review): this special case mirrors big52gb(). The
                // literal as pasted was a single character, which made the
                // second-byte assignment below read out of range. The pair is
                // kept as 0xA140, the Big5 full-width space -- confirm the
                // intended replacement.
                $big = "\xA1\x40";
            }
            else
            {
                $p = ($h-160)*510+($l-1)*2;
                if($p < 0 || $p+1 >= $tableLen)
                {
                    // Offset outside the table: keep the two bytes as they are.
                    $i++;
                    continue;
                }
                $big = $GB_DATA[$p].$GB_DATA[$p+1];
            }
            $Text[$i] = $big[0];
            $Text[$i+1] = $big[1];
            $i++;
        }
    }
    return $Text;
}//Unicode URL escapes (%uXXXX) to GBK
// Replaces "%uXXXX" escapes in $str with the matching GBK bytes. The lookup
// dictionary is read once from data/gbk-unicode.dat into
// $GLOBALS['GbkUniDic'] (key: four lower-case hex digits of the code unit,
// value: two bytes). Escapes missing from the dictionary become numeric
// entities ("&#N;"). Every '$#$' in the input is first replaced by '+'.
function UnicodeUrl2Gbk($str)
{
    // Load the lookup dictionary on first use.
    if(!isset($GLOBALS['GbkUniDic']))
    {
        $GLOBALS['GbkUniDic'] = array();
        $fp = fopen('data/gbk-unicode.dat','rb');
        // feof() on a failed fopen() never reports end-of-file on older PHP
        // versions, so the read loop must not run without a valid handle.
        if($fp !== false)
        {
            while(!feof($fp))
            {
                // Records are 4 bytes: 2 bytes of key, then 2 bytes of value.
                $key = fread($fp,2);
                $val = fread($fp,2);
                if(strlen($key) < 2 || strlen($val) < 2)
                {
                    break;
                }
                $GLOBALS['GbkUniDic'][bin2hex($key)] = $val;
            }
            fclose($fp);
        }
    } // Process the string.
    $str = str_replace('$#$','+',$str);
    $glen = strlen($str);
    $okstr = "";
    for($i=0; $i < $glen; $i++)
    {
        // An escape needs six characters: "%u" plus four hex digits. The
        // original accepted five remaining characters and any four
        // characters after "%u".
        if($str[$i]=='%' && $glen-$i >= 6 && $str[$i+1]=='u')
        {
            $uni = strtolower(substr($str,$i+2,4));
            if(preg_match('/^[0-9a-f]{4}$/', $uni))
            {
                $i = $i+5;
                if(isset($GLOBALS['GbkUniDic'][$uni]))
                {
                    $okstr .= $GLOBALS['GbkUniDic'][$uni];
                }
                else
                {
                    $okstr .= "&#".hexdec($uni).";";
                }
                continue;
            }
        }
        $okstr .= $str[$i];
    }
    return $okstr;
}?>
$data=theif_data($data[0],$data[1],'FT','r',$langx,$pages);
执行这一句之后,$data 里就是采集到的原始页面内容。
你可以用 echo $data; 确认一下;如果不是也没关系,但总之要先找到保存原始页面内容的那个变量。
找到它之后,在它的下面加入以下代码即可(其中的 $data 视具体情况换成实际的变量名):
// Detect the payload's encoding in strict mode so that only an encoding in
// which the whole string is valid is reported.
$codepage = mb_detect_encoding($data, array('ASCII','UTF-8','BIG-5','GBK'), true);
// mb_detect_encoding() returns false when none of the candidates match; in
// that case leave $data untouched instead of passing false on.
if ($codepage !== false) {
    $data = mb_convert_encoding($data, 'big5', $codepage);
}
试试看能不能解决问题吧