这个类的函数说明:
function get_td_innerText(&$tr){
function get_htmltag_innerText(&$html,$htmltag){ 得到所有 htmltag 的内容(相当于 js 中的 innerText)组成的数组。
function get_htmltag_outerText(&$html,$htmltag){ 得到 tag 的 outerText 组成的数组。
function cut($str,$from,$to,$direct='out') 截取 from 与 to 之间的字串。
function clear_td_tags(&$cells,$extag){ 用来将由 table 得到的 cells 数组中的 html 标记去掉。
function clearTag($html){
/** ensure this file is being included by a parent file */
//defined( '_VALID_MOS' ) or die( 'Direct Access to this location is not allowed.' );
//$count = intval( $params->get( 'url', 57029 ) );

// Weather ticker: downloads the forecast page for one city from
// weather.china.com.cn, cuts the first matching <table> out of it, and echoes
// the city name, weather icon, temperature and description inside a <marquee>.
// The generated markup is also written to a file named after today's date.

$today = date("Y_m_d");
$weatherfile = $today.".html"; // relative path, resolved against the current working directory

// NOTE(review): "&& 1==0" makes this condition always false, so the saved copy
// is never served and the remote page is fetched on every request. This looks
// like a debugging switch that was left in; confirm before enabling the cache.
if (is_file($weatherfile) && 1==0) {
    $file = file_get_contents($weatherfile);
    echo $file;
}
else {
    $city = 58357; // city code used in the URL (58357 = Suzhou)
    $imgroot = 'http://weather.china.com.cn/';
    $url = 'http://weather.china.com.cn/city/'.$city.'_full.html';
    //$url="w.html";
    @set_time_limit(6000);

    $from = '<table width="90%" border="0" cellspacing="0" cellpadding="0">'; // HTML at which extraction starts
    $start = '<table id="china_weather">'; // opening tag substituted for $from
    $to = '</table>'; // HTML at which extraction ends

    $weather = '';

    // Every step below can fail (download error, changed page layout), so each
    // result is checked before it is used instead of being passed on blindly.
    $file = file_get_contents($url);
    if ($file !== false) {
        $file = strstr($file, $from);
    }
    $topos = ($file === false) ? false : strpos($file, $to);

    if ($topos !== false) {
        $file = substr($file, 0, $topos + strlen($to));
        $table = str_replace($from, $start, $file);
        // Untouched copy of the table markup: get_htmltag_innerText() receives
        // $table by reference and overwrites it while scanning.
        $ystable = $table;
        //echo $table;

        $mtable = new managetable;
        $tr = $mtable->get_htmltag_innerText($table, 'tr');
        $cells = $mtable->get_td_innerText($tr);
        //print_R($cells);

        if (is_array($cells)) {
            $exceptTag = 'img'; // cells containing this tag are not cleaned
            $cells = $mtable->clear_td_tags($cells, $exceptTag);
            //print_R($cells);
        }

        // Row 0 holds city name + icon, row 1 holds temperature + description.
        if (is_array($cells) && isset($cells[0][0], $cells[0][1], $cells[1][0], $cells[1][1])) {
            $city = $cells[0][0];
            $img = $cells[0][1];
            // The icon cell still contains the original <img>; take the value of
            // its first quoted attribute and rebuild the tag with an absolute URL.
            $src = $mtable->cut($img, "\"", "\"", 'in');
            $img = '<img src="'.$imgroot.$src.'"/>';
            $degree = $cells[1][0];
            $descrip = $cells[1][1];
            $weather = '<marquee class="weather" width="200">'.$city.$img.$degree.$descrip.'</marquee>';
        }
    }

    // Only emit and store a result that was actually extracted.
    if ($weather !== '') {
        echo $weather;
        $fp = fopen($weatherfile, "wb");
        if ($fp !== false) {
            fwrite($fp, $weather);
            fclose($fp);
        }
    }
}//end else
/**
 * class managetable: helper methods used to manage an HTML table string.
 */
class managetable
{
    // Overview: string-based helpers for pulling pieces out of HTML.
    //   get_htmltag_innerText() / get_htmltag_outerText() list the contents
    //   (or the full markup) of every occurrence of one tag,
    //   get_td_innerText() turns a list of row strings into a cells[row][col] array,
    //   clear_td_tags() / clearTag() strip markup from cells,
    //   cut() extracts the text between two delimiters.
    // Matching is plain, case-sensitive substring search: nested tags of the
    // same name and '>' characters inside attribute values are not understood.

    /**
     * Remove HTML tags from every cell of a cells[row][col] array.
     *
     * A cell that contains the excepted tag is left completely untouched (all
     * of its tags are kept, not only the excepted one).
     *
     * @param array  $cells Two-dimensional array of cell strings; modified in place.
     * @param string $extag Tag name to spare, e.g. 'img'. It is inserted into a
     *                      regular expression without escaping.
     * @return array The cleaned $cells. A non-array $cells is returned unchanged.
     */
    function clear_td_tags(&$cells,$extag){
        if (!is_array($cells)) {
            return $cells;
        }
        $exre = "/<".$extag.".[^>]+>/i";
        foreach ($cells as $i => $row) {
            if (!is_array($row)) {
                continue;
            }
            foreach ($row as $j => $cell) {
                if (!preg_match($exre, $cell)) {
                    $cells[$i][$j] = $this->clearTag($cell);
                }
            }
        }
        return $cells;
    }//end func

    /**
     * Remove every HTML tag from a string.
     *
     * @param string $html
     * @return string $html without anything of the form <...>.
     */
    function clearTag($html){
        // "<[^>]+>" also matches one-letter tags such as <b> or <p>; the former
        // pattern "<.[^>]+>" needed at least two characters inside the brackets
        // and therefore left those opening tags behind.
        $re = "/<[^>]+>/";
        return preg_replace($re, "", $html);
    }//end func

    /**
     * Split each table row string into its <td> contents.
     *
     * @param array $tr List of row strings (inner text of <tr> elements). Each
     *                  entry is consumed by get_htmltag_innerText() and is
     *                  therefore modified.
     * @return array|false cells[row][col] with one (possibly empty) array per
     *                     row, or false when $tr is not an array.
     */
    function get_td_innerText(&$tr){
        if (!is_array($tr)) {
            return false;
        }
        $cells = array();
        foreach (array_keys($tr) as $i) {
            $td = $this->get_htmltag_innerText($tr[$i], 'td');
            // A row without <td> cells still gets an entry, so row indexes stay
            // aligned with $tr and count() is never called on a non-array.
            $cells[$i] = is_array($td) ? $td : array();
        }
        return $cells;
    }//end func get_td_innerText

    /**
     * Collect the content between <tag ...> and </tag> for every occurrence of
     * a tag, similar to innerText in JavaScript.
     *
     * @param string $html    Markup to scan. Passed by reference and consumed:
     *                        after a successful scan it holds only the text that
     *                        follows the last matched closing tag. Copy it first
     *                        if the original is still needed.
     * @param string $htmltag Tag name without brackets, e.g. 'tr'.
     * @return array|null List of inner strings (empty when nothing matched);
     *                    null, after echoing a notice, for 'img' and 'br'.
     */
    function get_htmltag_innerText(&$html,$htmltag){
        $selfend = array('img','br'); // self-closing tags have no inner text
        if (in_array($htmltag,$selfend)) {
            echo 'htmltag is self ended tag,no innerText';
            return ;
        }
        return $this->_collect_paired_tags($html, $htmltag, false);
    }//end func

    /**
     * Collect the complete markup (opening tag through closing tag) of every
     * occurrence of a tag.
     *
     * For the self-closing tags 'img' and 'br' an occurrence runs from "<tag"
     * to the next '>', which covers both "<br/>" and "<br>".
     *
     * @param string $html    Markup to scan. Passed by reference and consumed in
     *                        the same way as in get_htmltag_innerText().
     * @param string $htmltag Tag name without brackets.
     * @return array List of outer strings (empty when nothing matched).
     */
    function get_htmltag_outerText(&$html,$htmltag){
        $selfend = array('img','br'); // self-closing tags
        if (!in_array($htmltag,$selfend)) {
            return $this->_collect_paired_tags($html, $htmltag, true);
        }

        $tagouterText = array();
        if (!is_string($html) || $html === '') {
            return $tagouterText;
        }
        $offset = 0;
        while (($begin = $this->_find_tag_start($html, $htmltag, $offset)) !== false) {
            $end = strpos($html, '>', $begin);
            if ($end === false) {
                break; // unterminated tag: nothing more can be extracted
            }
            $tagouterText[] = substr($html, $begin, $end - $begin + 1);
            $offset = $end + 1;
        }
        if ($offset > 0) {
            $html = substr($html, $offset);
        }
        return $tagouterText;
    }//end func

    /**
     * Internal: shared scanner for tags that have a closing tag.
     *
     * @param string $html    Markup; reduced to the text after the last match.
     * @param string $htmltag Tag name without brackets.
     * @param bool   $outer   true to collect the full element, false to collect
     *                        only the content between the tags.
     * @return array Matches in document order.
     */
    function _collect_paired_tags(&$html, $htmltag, $outer)
    {
        $found = array();
        if (!is_string($html) || $html === '' || $htmltag === '') {
            return $found;
        }
        $close = '</'.$htmltag.'>';
        $offset = 0;
        while (($begin = $this->_find_tag_start($html, $htmltag, $offset)) !== false) {
            $openEnd = strpos($html, '>', $begin);
            if ($openEnd === false) {
                break; // opening tag never ends
            }
            $closeAt = strpos($html, $close, $openEnd + 1);
            if ($closeAt === false) {
                break; // no closing tag: stop instead of returning garbage
            }
            $next = $closeAt + strlen($close);
            if ($outer) {
                $found[] = substr($html, $begin, $next - $begin);
            }
            else {
                $found[] = substr($html, $openEnd + 1, $closeAt - $openEnd - 1);
            }
            $offset = $next;
        }
        if ($offset > 0) {
            $html = substr($html, $offset);
        }
        return $found;
    }

    /**
     * Internal: position of the next "<tag" that is really this tag, i.e. is
     * followed by '>', '/' or whitespace, so that 'a' does not match "<abbr".
     *
     * @param string $html
     * @param string $htmltag
     * @param int    $offset  Position at which to start searching.
     * @return int|false
     */
    function _find_tag_start($html, $htmltag, $offset)
    {
        $open = '<'.$htmltag;
        $len = strlen($open);
        $pos = strpos($html, $open, $offset);
        while ($pos !== false) {
            $after = substr($html, $pos + $len, 1);
            if ($after === '>' || $after === '/' || $after === ' '
                || $after === "\t" || $after === "\n" || $after === "\r") {
                return $pos;
            }
            $pos = strpos($html, $open, $pos + $len);
        }
        return false;
    }

    /**
     * Extract the text between the first $from and the next $to after it.
     *
     * @param string $str    Text to search.
     * @param string $from   Start delimiter.
     * @param string $to     End delimiter, searched for after $from.
     * @param string $direct 'in' returns only the text between the delimiters;
     *                       any other value (default 'out') includes both
     *                       delimiters in the result.
     * @return string The extracted text, or '' when either delimiter is missing.
     */
    function cut($str,$from,$to,$direct='out')
    {
        $str = (string) $str;
        // An empty $from means "start of the string" on every PHP version.
        $frompos = ($from === '') ? 0 : strpos($str, $from);
        if ($frompos === false) {
            return '';
        }
        $start = $frompos + strlen($from);
        $topos = strpos($str, $to, $start);
        if ($topos === false) {
            return '';
        }
        if ($direct == 'in') {
            return (string) substr($str, $start, $topos - $start);
        }
        return (string) substr($str, $frompos, $topos + strlen($to) - $frompos);
    } // end func cut
} // end class
?>
2、php5的DomDocument内置对象可接受html文档,但操作比较复杂
3、这个类过于复杂,结构也不清晰,你需要改变思路。
function get_htmltag_innerText(&$html,$htmltag){ 得到所有 htmltag 的内容(相当于 js 中的 innerText)组成的数组。
function get_htmltag_outerText(&$html,$htmltag){ 得到 tag 的 outerText 组成的数组。
function cut($str,$from,$to,$direct='out') 截取 from 与 to 之间的字串。
function clear_td_tags(&$cells,$extag){ 用来将由 table 得到的 cells 数组中的 html 标记去掉。
function clearTag($html){
cells[0][0] 表示第一行第一列,cells[0][1] 表示第一行第二列,依次类推。
还有,这个类的用意很明确,就是得到这个 table 的 cells 数组。其方法不仅可以用来操作 table,而且也可以用来操作其他的 htmltag。代码里的说明几乎没有,这大概是大家觉得乱的一个原因。其实它的结构还是很清晰的。
提取天气只是这个类的一个应用而已。这个类有个 bug,等我来改。