我在一个程序中需要抽取HTML中的部分内容(示例如下):一是<h3 class="r">标签内<a>链接的href属性值,二是<div class="s">标签中从开头到<b>...</b>为止的内容,并去除其中夹带的标签(如<em>)。我对正则不太懂,请问在PHP中用正则表达式怎么实现?
<li class=g>
<h3 class="r">
<a href="http://daode.cycnet.com/rewu/t20040416_2649.htm" target=_blank class=l>雷锋:我愿永远做一个<em>螺丝钉</em>-青少年思想道德网</a>
</h3>
<div class="s">一个人的作用,对于革命事业来说,就如一架机器上的一颗<em>螺丝钉</em>。机器由于许许多多的<em>螺丝钉</em>的联结和固定,才成为一个坚实的整体,才能够运转自如,发挥它巨大的工作能力 <b>...</b>
<br><span class=f><cite>daode.cycnet.com/rewu/t20040416_2649.htm</cite> - <span class=gl><a href="http://webcache.googleusercontent.com/search?q=cache:-lM6FaqkWq4J:daode.cycnet.com/rewu/t20040416_2649.htm+%E8%9E%BA%E4%B8%9D%E9%92%89&cd=3&hl=zh-CN&ct=clnk&ie=UTF-8" target=_blank>网页快照</a></span></span>
</div>
<li class=g>
<h3 class="r">
<a href="http://daode.cycnet.com/rewu/t20040416_2649.htm" target=_blank class=l>雷锋:我愿永远做一个<em>螺丝钉</em>-青少年思想道德网</a>
</h3>
<div class="s">一个人的作用,对于革命事业来说,就如一架机器上的一颗<em>螺丝钉</em>。机器由于许许多多的<em>螺丝钉</em>的联结和固定,才成为一个坚实的整体,才能够运转自如,发挥它巨大的工作能力 <b>...</b>
<br><span class=f><cite>daode.cycnet.com/rewu/t20040416_2649.htm</cite> - <span class=gl><a href="http://webcache.googleusercontent.com/search?q=cache:-lM6FaqkWq4J:daode.cycnet.com/rewu/t20040416_2649.htm+%E8%9E%BA%E4%B8%9D%E9%92%89&cd=3&hl=zh-CN&ct=clnk&ie=UTF-8" target=_blank>网页快照</a></span></span>
</div>
// Demo: from one search-result item, extract
//   1. the href of the link inside <h3 class="r">, and
//   2. the snippet inside <div class="s"> up to and including <b>...</b>,
// with the <em> highlight tags removed from both pieces.
$str = <<<html
<li class=g>
<h3 class="r">
<a href="http://daode.cycnet.com/rewu/t20040416_2649.htm" target=_blank class=l>雷锋:我愿永远做一个<em>螺丝钉</em>-青少年思想道德网</a>
</h3>
<div class="s">一个人的作用,对于革命事业来说,就如一架机器上的一颗<em>螺丝钉</em>。机器由于许许多多的<em>螺丝钉</em>的联结和固定,才成为一个坚实的整体,才能够运转自如,发挥它巨大的工作能力 <b>...</b>
<br><span class=f><cite>daode.cycnet.com/rewu/t20040416_2649.htm</cite> - <span class=gl><a href="http://webcache.googleusercontent.com/search?q=cache:-lM6FaqkWq4J:daode.cycnet.com/rewu/t20040416_2649.htm+%E8%9E%BA%E4%B8%9D%E9%92%89&cd=3&hl=zh-CN&ct=clnk&ie=UTF-8" target=_blank>网页快照</a></span></span>
</div>
html;

// Highlight tags are stripped with a plain string replace so that every
// occurrence goes away, no matter how many the text contains.
$emTags = array('<em>', '</em>');

// Group 1 is the href. Lazy quantifiers keep each match inside a single
// <h3>...</h3>, and the pattern does not require an <em> to be present,
// so titles without a highlighted keyword are matched as well.
$pattern = '~<h3 class="r">\s*<a href="(.*?)".*?</h3>~is';
preg_match_all($pattern, $str, $aMatch);
print_r($aMatch[1]);

// Cleaned <h3> block of the first hit (empty string when nothing matched).
$str1 = isset($aMatch[0][0]) ? str_replace($emTags, '', $aMatch[0][0]) : '';
echo $str1;
echo "\n";

// Snippet: from the opening <div class="s"> up to the first <b>...</b>.
$pattern2 = '~<div class="s">.*?<b>\.\.\.</b>~is';
preg_match_all($pattern2, $str, $aMatch2);
$str2 = isset($aMatch2[0][0]) ? str_replace($emTags, '', $aMatch2[0][0]) : '';
echo $str2;
// Fetch a Google result page and collect, per hit, its link ("href"),
// its title ("title") and its snippet text ("content").
set_time_limit(0); // 0 removes the script execution time limit
$url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=PHP&aq=f&aqi=&aql=&oq=&gs_rfai=";

$html = file_get_contents($url);
if ($html === false) {
    // file_get_contents() returns false on failure; stop here instead of
    // running the regexes on a non-string and printing an empty result.
    exit("Failed to fetch {$url}\n");
}

// Normalise whitespace so the patterns below can assume tags are adjacent:
// drop whitespace right after '>' and right before '<', then collapse every
// remaining run (spaces, tabs, CR/LF) into a single space.
$html = preg_replace('/>(\s+)/', '>', $html);
$html = preg_replace('/(\s+)</', '<', $html);
$html = preg_replace('/(\s+)/', ' ', $html);

echo "<pre>";
$result = array();
$emTags = array("<em>", "</em>");

// Group 1: link target, group 2: title markup of each result heading.
preg_match_all('~<li class=g[^>]*?><h3 class="r[^>]*?"><a href="(.*?)"[^>]*?>(.*?)</a></h3>~is', $html, $data);
foreach ($data[1] as $key => $href) {
    $result[$key]["href"] = $href;
    $result[$key]["title"] = str_replace($emTags, "", $data[2][$key]);
}

// Group 1: snippet text, i.e. everything in <div class="s"> before the
// trailing <br><span|div class=...> footer.
// NOTE(review): snippets are paired with titles purely by match index, so a
// hit that has no snippet would shift the snippets of all later hits —
// confirm against real pages.
preg_match_all('~<div class="s[^>]*?"[^>]*?>(.*?)<br><(span|div)[^>]*?class=[^>]*?>~is', $html, $snippets);
foreach ($snippets[1] as $key => $snippet) {
    $result[$key]["content"] = str_replace(array("<b>...</b>", "<em>", "</em>"), "", $snippet);
}

print_r($result);