有下面一段页面的HTML代码:----------------------
<!-- InstanceBeginEditable name="doctitle" -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
<title>测试内容</title>
<meta name="keywords" content="获取这里的keywords">
<meta name="description" content="获取这里的description">
<SCRIPT language=javascript src="JS.js"></SCRIPT>
<script language="javascript" type="text/javascript">
var now=new Date();
var beginTime=now.getTime();
</script>
<SCRIPT LANGUAGE="JavaScript">
<!-- Hide
function killErrors() {
return true;
}
window.onerror = killErrors;
// -->
</SCRIPT>
<style type="text/css">
<!--
#mybody {
margin-left: 0px;
margin-top: 0px;
margin-right: 0px;
margin-bottom: 0px;
}
-->
</style><link href="http://www.51wisdom.com.cn/style.css" rel="stylesheet" type="text/css">
<link href="http://www.51wisdom.com.cn/cs.css" rel="stylesheet" type="text/css">
<link href="http://www.51wisdom.com.cn/a_css.css" rel="stylesheet" type="text/css">
</head>
<BODY id="mybody">
<div style="width:970px;padding-left:5px;">
<div id="wis">
<img src="http://www.51wisdom.com.cn/images/0529_2.gif" alt="这里是图片" width="160" height="88">
</div>
我要获取的内容
</div>
<embed width="700" height="620" quality="high" bgcolor="#FFFFFF" name="fotester" id="fotester" src="../images/swfplayer/2007Player18.swf?id=56744&width=700&height=570&bgcolor=000000&totalframe=1475&framerate=18&MSL=0&main=0&size=5930" type="application/x-shockwave-flash">
<div><a href="link.html">这里是一个连接</a></div>
</BODY>
</html>
--------------------------我想去掉BODY中所有的HTML标签,得到里面的纯文本,比如,上面获得的内容就是 "我要获取的内容这里是一个连接" ,请问这个正则该怎么写呢?
<!-- InstanceBeginEditable name="doctitle" -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
<title>测试内容</title>
<meta name="keywords" content="获取这里的keywords">
<meta name="description" content="获取这里的description">
<SCRIPT language=javascript src="JS.js"></SCRIPT>
<script language="javascript" type="text/javascript">
var now=new Date();
var beginTime=now.getTime();
</script>
<SCRIPT LANGUAGE="JavaScript">
<!-- Hide
function killErrors() {
return true;
}
window.onerror = killErrors;
// -->
</SCRIPT>
<style type="text/css">
<!--
#mybody {
margin-left: 0px;
margin-top: 0px;
margin-right: 0px;
margin-bottom: 0px;
}
-->
</style><link href="http://www.51wisdom.com.cn/style.css" rel="stylesheet" type="text/css">
<link href="http://www.51wisdom.com.cn/cs.css" rel="stylesheet" type="text/css">
<link href="http://www.51wisdom.com.cn/a_css.css" rel="stylesheet" type="text/css">
</head>
<BODY id="mybody">
<div style="width:970px;padding-left:5px;">
<div id="wis">
<img src="http://www.51wisdom.com.cn/images/0529_2.gif" alt="这里是图片" width="160" height="88">
</div>
我要获取的内容
</div>
<embed width="700" height="620" quality="high" bgcolor="#FFFFFF" name="fotester" id="fotester" src="../images/swfplayer/2007Player18.swf?id=56744&width=700&height=570&bgcolor=000000&totalframe=1475&framerate=18&MSL=0&main=0&size=5930" type="application/x-shockwave-flash">
<div><a href="link.html">这里是一个连接</a></div>
</BODY>
</html>
--------------------------我想去掉BODY中所有的HTML标签,得到里面的纯文本,比如,上面获得的内容就是 "我要获取的内容这里是一个连接" ,请问这个正则该怎么写呢?
采集程序大都是用前后定位的方法:
定位一个$start、一个$end进行截取,然后再处理。只要$start和$end唯一就可以了,没必要用正则。例如:
// Unique markers that delimit the fragment to extract from the page.
// Single-quoted so the double quotes of the HTML attributes inside the
// markers do not terminate the PHP string literal.
$start = '<BODY id="mybody">
<div style="width:970px;padding-left:5px;">
';
$end = '</a></div>
</BODY>
</html>
';
#<body[^>]*>(.*?)</body>#is
然后再用去标签函数
<?php
/**
 * Convert an HTML fragment to plain text by removing markup.
 *
 * preg_replace() applies the patterns in array order, so whole <script> and
 * <style> elements and comments are removed BEFORE the generic tag pattern
 * runs. If the generic pattern ran first it would delete only the opening and
 * closing tags (and the part of a comment up to its first ">"), leaving the
 * CSS or comment text in the output.
 *
 * @param string $document HTML source to clean.
 * @return string The input with scripts, styles, comments and tags removed.
 *                Remaining text is neither trimmed nor entity-decoded.
 */
function html2txt($document){
    $search = array(
        '@<script[^>]*?>.*?</script>@si', // Script elements together with their code
        '@<style[^>]*?>.*?</style>@si',   // Style elements together with their CSS (no U flag: it would make .*? greedy)
        '@<!--[\s\S]*?--[ \t\n\r]*>@',    // Comments, possibly spanning several lines
        '@<[\/\!]*?[^<>]*?>@si'           // Every remaining tag, including <!DOCTYPE ...>
    );
    $text = preg_replace($search, '', $document);
    return $text;
}
// Sample page used to demonstrate the extraction (markup kept verbatim).
$sString = <<<html
<!-- InstanceBeginEditable name="doctitle" -->
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312">
<title>测试内容</title>
<meta name="keywords" content="获取这里的keywords">
<meta name="description" content="获取这里的description">
<SCRIPT language=javascript src="JS.js"></SCRIPT>
<script language="javascript" type="text/javascript">
var now=new Date();
var beginTime=now.getTime();
</script>
<SCRIPT LANGUAGE="JavaScript">
<!-- Hide
function killErrors() {
return true;
}
window.onerror = killErrors;
// -->
</SCRIPT>
<style type="text/css">
<!--
#mybody {
margin-left: 0px;
margin-top: 0px;
margin-right: 0px;
margin-bottom: 0px;
}
-->
</style><link href="http://www.51wisdom.com.cn/style.css" rel="stylesheet" type="text/css">
<link href="http://www.51wisdom.com.cn/cs.css" rel="stylesheet" type="text/css">
<link href="http://www.51wisdom.com.cn/a_css.css" rel="stylesheet" type="text/css">
</head>
<BODY id="mybody">
<div style="width:970px;padding-left:5px;">
<div id="wis">
<img src="http://www.51wisdom.com.cn/images/0529_2.gif" alt="这里是图片" width="160" height="88">
</div>
我要获取的内容
</div>
<embed width="700" height="620" quality="high" bgcolor="#FFFFFF" name="fotester" id="fotester" src="../images/swfplayer/2007Player18.swf?id=56744&width=700&height=570&bgcolor=000000&totalframe=1475&framerate=18&MSL=0&main=0&size=5930" type="application/x-shockwave-flash">
<div><a href="link.html">这里是一个连接</a></div>
</BODY>
</html>
html;
// The heredoc terminator above sits on its own line: PHP versions before 7.3
// reject any code following it on the same line.

// Locate the body by its unique opening tag and the closing tag.
$sBodyOpenTag = '<BODY id="mybody">';
$iOpenPos = stripos($sString, $sBodyOpenTag);
$iEndBody = stripos($sString, '</BODY>');

if ($iOpenPos === false || $iEndBody === false || $iEndBody < $iOpenPos + strlen($sBodyOpenTag)) {
    // stripos() returns false when a marker is missing; do not do arithmetic on it.
    $iStartBody = 0;
    $sContent = '';
} else {
    $iStartBody = $iOpenPos + strlen($sBodyOpenTag);
    // trim() drops the line breaks next to the markers regardless of whether
    // the page uses "\n" or "\r\n", instead of assuming exactly one byte.
    $sContent = trim(substr($sString, $iStartBody, $iEndBody - $iStartBody));
}

// Variant 1: regex-based stripping via html2txt().
$sString = html2txt($sContent);
echo $sString;

// Variant 2: PHP's built-in tag stripper, for comparison.
$sString2 = strip_tags($sContent);
echo $sString2;
?>