<?
/*$str = "<li><a href=\"http://www.canachieve.com.cn/htmlDocument/2011-09-07/detail_130499.html\" style=\"\" title=\"移民专家解析美国税收\" target=\"_blank\">移民专家解析美国税收</a> <span>2011-09-07</span>";
preg_match('#<li><a href="([^\"]*)" style="" title="(.*)" target="_blank">(.*)</a> <span>(.*)</span>#s',$str,$arr);
echo $arr[1];
exit;
*/
header("Content-type: text/html; charset=utf-8");
//采集的网址
$url = "http://www.canachieve.com.cn/htmlDocument/category444/index.html";
$file_contents = file_get_contents($url);
$start = "<div class=\"list_n_tt\">";
$end = "<div class=\"Page\">";
$s = get_sub_content($file_contents,$start,$end);
$s = str_replace(array("<ul>","</div>","</ul>"),"",$s);
$array = explode("</li>",$s);
for($i=0;$i<count($array);$i++){
//exit;
$str1 = htmlentities(str_replace('"','\"',$array[$i]),ENT_QUOTES,'utf-8');
//$str2 = str_replace("/","\/",$str1);
//echo $str1."<br>";
preg_match('#<li><a href="([^\"]*)" style="" title="(.*)" target="_blank">(.*)</a> <span>(.*)</span>#s',$str,$arr1);
if(count($arr1)<1){
echo "没有匹配成功".$i."<br>";
}else{
echo $arr1[1];
}
}
//函数
function get_sub_content($str,$start,$end){
if ( $start == '' || $end == '' ){
return;
}
$str = explode($start, $str);
$str = explode($end, $str[1]);
return $str[0];
}
?>红色部分不能正常匹配,但是我echo $str1 的然后把正则拿出for循环就可以正常匹配,这是为什么?
htmlentities 的功能不是将html代码输出吗?
我echo 了一下 $str1可以输出HTML代码
界面上当然可以输出,但是htmlentities < > 是把这些标签转换为 <这些。所以正则式匹配不到的。去掉htmlentities
header ( "Content-type: text/html; charset=utf-8" );
//采集的网址
$url = "http://www.canachieve.com.cn/htmlDocument/category444/index.html";
$file_contents = file_get_contents ( $url );
$start = "<div class=\"list_n_tt\">";
$end = "<div class=\"Page\">";
$s = get_sub_content ( $file_contents, $start, $end );
$str1= $s;
preg_match_all(
'/<a.*?(?: |\\t|\\r|\\n)?href=[\'"]?(.+?)[\'"]?(?:(?: |\\t|\\r|\\n)+.*?)?>(.+?)<\/a.*?>(.+?)<span>(.+?)<\/span.*?>/sim',
$str1, $arr1, PREG_PATTERN_ORDER);
print_r($arr1);
//函数
function get_sub_content($str, $start, $end) {
if ($start == '' || $end == '') {
return;
}
$str = explode ( $start, $str );
$str = explode ( $end, $str [1] );
return $str [0];
}
?>