我个人的感觉,楼主的需求已超越了普通的正则了。 如果仅仅是匹配一个span /<tagName\s*[^>]>(?!<tagName>)(.*)</tagName>/iU 能用上U 修正符,和?! 这种缝隙预判,就可以去匹配, 如果连标签名称也想知道 /<([A-Za-z0-9]+)\s*[^>]>(?!<\/\\1>)(.*)</\\1>/iU 例用\\1 来做回代处理. 更具体一点就要楼主自己动手了。 总之要了解 贪婪,缝隙预判(又有称为环视的),回代的,总能配出来的。但针对html 这个特定的问题,用php_dom_parse 会处理得更好.<?php /******************************************************************************* Version: 1.11 ($Rev: 175 $) Website: http://sourceforge.net/projects/simplehtmldom/ Author: S.C. Chen <[email protected]> Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) Contributions by: Yousuke Kumakura (Attribute filters) Vadim Voituk (Negative indexes supports of "find" method) Antcs (Constructor with automatically load contents either text or file/url) Licensed under The MIT License Redistributions of files must retain the above copyright notice. *******************************************************************************/define('HDOM_TYPE_ELEMENT', 1); define('HDOM_TYPE_COMMENT', 2); define('HDOM_TYPE_TEXT', 3); define('HDOM_TYPE_ENDTAG', 4); define('HDOM_TYPE_ROOT', 5); define('HDOM_TYPE_UNKNOWN', 6); define('HDOM_QUOTE_DOUBLE', 0); define('HDOM_QUOTE_SINGLE', 1); define('HDOM_QUOTE_NO', 3); define('HDOM_INFO_BEGIN', 0); define('HDOM_INFO_END', 1); define('HDOM_INFO_QUOTE', 2); define('HDOM_INFO_SPACE', 3); define('HDOM_INFO_TEXT', 4); define('HDOM_INFO_INNER', 5); define('HDOM_INFO_OUTER', 6); define('HDOM_INFO_ENDSPACE',7);// helper functions // ----------------------------------------------------------------------------- // get html dom form file function file_get_html() { $dom = new simple_html_dom; $args = func_get_args(); $dom->load(call_user_func_array('file_get_contents', $args), true); return $dom; }// get html dom form string function str_get_html($str, $lowercase=true) { $dom = new simple_html_dom; $dom->load($str, $lowercase); return $dom; }// dump html dom tree function dump_html_tree($node, $show_attr=true, $deep=0) { $lead = str_repeat(' ', $deep); echo 
$lead.$node->tag; if ($show_attr && count($node->attr)>0) { echo '('; foreach($node->attr as $k=>$v) echo "[$k]=>\"".$node->$k.'", '; echo ')'; } echo "\n"; foreach($node->nodes as $c) dump_html_tree($c, $show_attr, $deep+1); }// get dom form file (deprecated) function file_get_dom() { $dom = new simple_html_dom; $args = func_get_args(); $dom->load(call_user_func_array('file_get_contents', $args), true); return $dom; }// get dom form string (deprecated) function str_get_dom($str, $lowercase=true) { $dom = new simple_html_dom; $dom->load($str, $lowercase); return $dom; }// simple html dom node // ----------------------------------------------------------------------------- class simple_html_dom_node { public $nodetype = HDOM_TYPE_TEXT; public $tag = 'text'; public $attr = array(); public $children = array(); public $nodes = array(); public $parent = null; public $_ = array(); private $dom = null; function __construct($dom) { $this->dom = $dom; $dom->nodes[] = $this; } function __destruct() { $this->clear(); } function __toString() { return $this->outertext(); } // clean up memory due to php5 circular references memory leak... function clear() { $this->dom = null; $this->nodes = null; $this->parent = null; $this->children = null; }
// dump node's tree function dump($show_attr=true) { dump_html_tree($this, $show_attr); } // returns the parent of node function parent() { return $this->parent; } // returns children of node function children($idx=-1) { if ($idx===-1) return $this->children; if (isset($this->children[$idx])) return $this->children[$idx]; return null; } // returns the first child of node function first_child() { if (count($this->children)>0) return $this->children[0]; return null; } // returns the last child of node function last_child() { if (($count=count($this->children))>0) return $this->children[$count-1]; return null; } // returns the next sibling of node function next_sibling() { if ($this->parent===null) return null; $idx = 0; $count = count($this->parent->children); while ($idx<$count && $this!==$this->parent->children[$idx]) ++$idx; if (++$idx>=$count) return null; return $this->parent->children[$idx]; } // returns the previous sibling of node function prev_sibling() { if ($this->parent===null) return null; $idx = 0; $count = count($this->parent->children); while ($idx<$count && $this!==$this->parent->children[$idx]) ++$idx; if (--$idx<0) return null; return $this->parent->children[$idx]; } // get dom node's inner html function innertext() { if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER]; if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]); $ret = ''; foreach($this->nodes as $n) $ret .= $n->outertext(); return $ret; } ?>
// Demo: greedy capture of everything between the outermost <html> tags.
$str = "<html><div>oh shit</div></html>";
preg_match("#<html>(.*)</html>#", $str, $match);

// $match[0] is the whole match and $match[1] the captured inner markup; both
// are HTML-escaped so a browser shows the tags literally.
$escapedWhole = htmlspecialchars($match[0]);   // <html><div>oh shit</div></html>
$escapedInner = htmlspecialchars($match[1]);   // <div>oh shit</div>
echo $escapedWhole, "<br>", $escapedInner;
?>
// Sample markup: four <div> elements nested inside one another.
$s = <<<html
<div id=0>
<div id=1><img name="img1" id="img1" src=""/>
<div id=2><img name="img2" id="img2" src=""/>
<div id=3><img name="img3" id="img3" src=""/>
</div>
</div>
</div>
</div>
html;

// Tokenizer pattern: each alternative matches one kind of markup token.
$pattern = "/(" .
    // declarations such as <!DOCTYPE ...>
    "<\!\w+(?:\s+[^>]*?)+\s*>|" .
    // opening or self-closing tags, with optional quoted/unquoted attributes
    "<\w+(?:\s+\w+(?:\s*=\s*(?:\"[^\"]*\"|'[^']*'|[^\"'>\s]+))?)*\s*\/?>|" .
    // closing tags
    "<\/\w+\s*>|" .
    // comments: [\s\S]*? (instead of [^-]*) also accepts hyphens and newlines
    // inside the comment body, so markup inside a comment is swallowed whole
    "<\!--[\s\S]*?-->" .
    ")/";

// PREG_OFFSET_CAPTURE makes every match a [text, byte offset] pair.
preg_match_all($pattern, $s, $aMatches, PREG_OFFSET_CAPTURE);
print_r($aMatches);
/**
 * Returns the inner markup of the element whose opening tag equals $str.
 *
 * The matching end tag is found by tracking nesting depth of the same tag
 * name from the opening token onwards, so sibling elements that follow the
 * target no longer shift the result. The tag name is taken from $str, so the
 * function is not limited to <div>.
 *
 * @param string $strHTML The markup that was tokenized.
 * @param string $str     Opening tag to look up, compared against the trimmed
 *                        token text (e.g. '<div id=0>').
 * @param array  $arr     Tokens as [text, offset] pairs, i.e. element 0 of a
 *                        preg_match_all(..., PREG_OFFSET_CAPTURE) result.
 * @return string Markup between the opening tag and its matching end tag;
 *                '' when the tag is not found or is never closed.
 */
function getNested($strHTML, $str, $arr) {
    // The tag name drives the depth counting below.
    if (!preg_match('/^<(\w+)/', $str, $nameMatch)) {
        return '';
    }
    $openRe  = '/^<' . $nameMatch[1] . '\b/i';
    $closeRe = '/^<\/' . $nameMatch[1] . '\s*>$/i';

    // Locate the opening token; as before, the last identical token wins.
    $startIndex = null;
    foreach ($arr as $k => $v) {
        if (trim($v[0]) == $str) {
            $startIndex = $k;
        }
    }
    if ($startIndex === null) {
        return '';
    }

    $innerStart = $arr[$startIndex][1] + strlen($str);
    $depth = 0;
    $count = count($arr);
    for ($i = $startIndex; $i < $count; $i++) {
        $token = $arr[$i][0];
        if (preg_match($openRe, $token)) {
            // A self-closing tag (<div/>) opens no new nesting level.
            if (!preg_match('/\/\s*>$/', $token)) {
                $depth++;
            }
        } elseif (preg_match($closeRe, $token)) {
            $depth--;
            if ($depth === 0) {
                return substr($strHTML, $innerStart, $arr[$i][1] - $innerStart);
            }
        }
    }
    return '';
}
// Print the markup getNested() extracts for the outermost '<div id=0>' element.
echo getNested($s, '<div id=0>', $aMatches[0]);
?>
<?php
// Sample markup: four <div> elements nested inside one another.
$s = <<<html
<div id=0>
<div id=1><img name="img1" id="img1" src=""/>
<div id=2><img name="img2" id="img2" src=""/>
<div id=3><img name="img3" id="img3" src=""/>
</div>
</div>
</div>
</div>
html;

// Tokenizer pattern: each alternative matches one kind of markup token.
$pattern = "/(" .
    // declarations such as <!DOCTYPE ...>
    "<\!\w+(?:\s+[^>]*?)+\s*>|" .
    // opening or self-closing tags, with optional quoted/unquoted attributes
    "<\w+(?:\s+\w+(?:\s*=\s*(?:\"[^\"]*\"|'[^']*'|[^\"'>\s]+))?)*\s*\/?>|" .
    // closing tags
    "<\/\w+\s*>|" .
    // comments: [\s\S]*? (instead of [^-]*) also accepts hyphens and newlines
    // inside the comment body, so markup inside a comment is swallowed whole
    "<\!--[\s\S]*?-->" .
    ")/";

// PREG_OFFSET_CAPTURE makes every match a [text, byte offset] pair.
preg_match_all($pattern, $s, $aMatches, PREG_OFFSET_CAPTURE);
print_r($aMatches);
/**
 * Returns the inner markup of the element whose opening tag equals $str.
 *
 * The matching end tag is found by tracking nesting depth of the same tag
 * name from the opening token onwards, so sibling elements that follow the
 * target no longer shift the result. The tag name is taken from $str, so the
 * function is not limited to <div>.
 *
 * @param string $strHTML The markup that was tokenized.
 * @param string $str     Opening tag to look up, compared against the trimmed
 *                        token text (e.g. '<div id=0>').
 * @param array  $arr     Tokens as [text, offset] pairs, i.e. element 0 of a
 *                        preg_match_all(..., PREG_OFFSET_CAPTURE) result.
 * @return string Markup between the opening tag and its matching end tag;
 *                '' when the tag is not found or is never closed.
 */
function getNested($strHTML, $str, $arr) {
    // The tag name drives the depth counting below.
    if (!preg_match('/^<(\w+)/', $str, $nameMatch)) {
        return '';
    }
    $openRe  = '/^<' . $nameMatch[1] . '\b/i';
    $closeRe = '/^<\/' . $nameMatch[1] . '\s*>$/i';

    // Locate the opening token; as before, the last identical token wins.
    $startIndex = null;
    foreach ($arr as $k => $v) {
        if (trim($v[0]) == $str) {
            $startIndex = $k;
        }
    }
    if ($startIndex === null) {
        return '';
    }

    $innerStart = $arr[$startIndex][1] + strlen($str);
    $depth = 0;
    $count = count($arr);
    for ($i = $startIndex; $i < $count; $i++) {
        $token = $arr[$i][0];
        if (preg_match($openRe, $token)) {
            // A self-closing tag (<div/>) opens no new nesting level.
            if (!preg_match('/\/\s*>$/', $token)) {
                $depth++;
            }
        } elseif (preg_match($closeRe, $token)) {
            $depth--;
            if ($depth === 0) {
                return substr($strHTML, $innerStart, $arr[$i][1] - $innerStart);
            }
        }
    }
    return '';
}
// Print the markup getNested() extracts for each of the four nested <div> levels,
// from the outermost (id=0) to the innermost (id=3).
echo getNested($s, '<div id=0>', $aMatches[0]);
echo getNested($s, '<div id=1>', $aMatches[0]);
echo getNested($s, '<div id=2>', $aMatches[0]);
echo getNested($s, '<div id=3>', $aMatches[0]);
?>
如果仅仅是匹配一个span
/<tagName\s*[^>]*>(?!<tagName>)(.*)<\/tagName>/iU
能用上U 修正符,和?! 这种缝隙预判,就可以去匹配,
如果连标签名称也想知道
/<([A-Za-z0-9]+)\s*[^>]*>(?!<\/\\1>)(.*)<\/\\1>/iU
利用 \\1 来做回代（反向引用）处理。
更具体一点就要楼主自己动手了。
总之要了解 贪婪,缝隙预判(又有称为环视的),回代的,总能配出来的。但针对html 这个特定的问题,用php_dom_parse 会处理得更好.<?php
/*******************************************************************************
Version: 1.11 ($Rev: 175 $)
Website: http://sourceforge.net/projects/simplehtmldom/
Author: S.C. Chen <[email protected]>
Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
Contributions by:
Yousuke Kumakura (Attribute filters)
Vadim Voituk (Negative indexes supports of "find" method)
Antcs (Constructor with automatically load contents either text or file/url)
Licensed under The MIT License
Redistributions of files must retain the above copyright notice.
*******************************************************************************/define('HDOM_TYPE_ELEMENT', 1);
// Node type identifiers, continuing from HDOM_TYPE_ELEMENT (1) defined on the
// line above; simple_html_dom_node::$nodetype defaults to HDOM_TYPE_TEXT.
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT', 3);
define('HDOM_TYPE_ENDTAG', 4);
define('HDOM_TYPE_ROOT', 5);
define('HDOM_TYPE_UNKNOWN', 6);
// Quote-style identifiers (note the values are 0, 1 and 3; 2 is unused).
// NOTE(review): presumably they record how an attribute value was quoted in
// the source markup -- the parser that uses them is not part of this excerpt.
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO', 3);
// Keys into a node's $_ info array: innertext() reads HDOM_INFO_INNER and
// HDOM_INFO_TEXT from it. The remaining keys are presumably used the same way
// by code that is not shown here.
define('HDOM_INFO_BEGIN', 0);
define('HDOM_INFO_END', 1);
define('HDOM_INFO_QUOTE', 2);
define('HDOM_INFO_SPACE', 3);
define('HDOM_INFO_TEXT', 4);
define('HDOM_INFO_INNER', 5);
define('HDOM_INFO_OUTER', 6);
define('HDOM_INFO_ENDSPACE',7);// helper functions
// -----------------------------------------------------------------------------
/**
 * Builds a DOM object from a file or URL.
 *
 * Every argument is forwarded unchanged to file_get_contents(); whatever that
 * returns is loaded into a fresh simple_html_dom with the second load()
 * argument (the flag str_get_html() exposes as $lowercase) fixed to true.
 *
 * @return simple_html_dom The loaded document object.
 */
function file_get_html() {
    $document = new simple_html_dom;
    $fetchArgs = func_get_args();
    $contents = call_user_func_array('file_get_contents', $fetchArgs);
    $document->load($contents, true);
    return $document;
}
// get html dom from string
/**
 * Builds a DOM object from a markup string.
 *
 * @param string $str       Markup handed to simple_html_dom::load().
 * @param bool   $lowercase Passed through as the second load() argument.
 * @return simple_html_dom The loaded document object.
 */
function str_get_html($str, $lowercase=true) {
    $document = new simple_html_dom;
    $document->load($str, $lowercase);

    return $document;
}
// dump html dom tree
/**
 * Prints a node and, recursively, all entries of its $nodes list.
 *
 * Each node is written on its own line as its tag, prefixed with one space
 * per nesting level. With $show_attr enabled and a non-empty $attr array the
 * tag is followed by "(" + one `[name]=>"value", ` entry per attribute + ")",
 * where the value is read through the node property of the same name.
 *
 * @param object $node      Node exposing ->tag, ->attr and ->nodes.
 * @param bool   $show_attr Whether to print the attribute list.
 * @param int    $deep      Current nesting level (indent width).
 * @return void
 */
function dump_html_tree($node, $show_attr=true, $deep=0) {
    $line = str_repeat(' ', $deep) . $node->tag;

    if ($show_attr && count($node->attr) > 0) {
        $line .= '(';
        foreach ($node->attr as $name => $unused) {
            $line .= "[$name]=>\"" . $node->$name . '", ';
        }
        $line .= ')';
    }
    echo $line . "\n";

    foreach ($node->nodes as $child) {
        dump_html_tree($child, $show_attr, $deep + 1);
    }
}
// get dom from file (deprecated)
/**
 * Deprecated twin of file_get_html(): forwards every argument to it and
 * returns the document object it builds.
 *
 * @return simple_html_dom The loaded document object.
 */
function file_get_dom() {
    $forwarded = func_get_args();
    return call_user_func_array('file_get_html', $forwarded);
}
// get dom from string (deprecated)
/**
 * Deprecated twin of str_get_html(): builds a DOM object from a markup string.
 *
 * @param string $str       Markup to load.
 * @param bool   $lowercase Passed through as the second load() argument.
 * @return simple_html_dom The loaded document object.
 */
function str_get_dom($str, $lowercase=true) {
    return str_get_html($str, $lowercase);
}
// simple html dom node
// -----------------------------------------------------------------------------
class simple_html_dom_node {
public $nodetype = HDOM_TYPE_TEXT;
public $tag = 'text';
public $attr = array();
public $children = array();
public $nodes = array();
public $parent = null;
public $_ = array();
// $dom: the owning document object; set by the constructor, reset to null by clear().
// The constructor stores the owner and appends this node to the owner's $nodes list.
private $dom = null; function __construct($dom) {
$this->dom = $dom;
$dom->nodes[] = $this;
// Destructor (next line): releases this node's references through clear().
} function __destruct() {
$this->clear();
// String conversion (next line) yields whatever outertext() returns;
// outertext() itself is not part of this excerpt.
} function __toString() {
return $this->outertext();
} // clean up memory due to php5 circular references memory leak...
/**
 * Drops this node's references to its document, node list, parent and
 * children by setting all four properties to null, so that PHP 5's
 * circular-reference leak does not keep the tree alive.
 *
 * @return void
 */
function clear() {
    $this->dom = $this->nodes = $this->parent = $this->children = null;
}
// Prints this node and everything below it by handing the node to
// dump_html_tree(); $show_attr is passed through unchanged.
function dump($show_attr=true) {
dump_html_tree($this, $show_attr);
} // returns the parent of node
// Accessor for the $parent property (null by default, and again after clear()).
function parent() {
return $this->parent;
} // returns children of node
/**
 * Returns the child list, or a single child.
 *
 * @param int $idx -1 (the default, compared strictly) returns the whole
 *                 $children array; any other value is used as an index.
 * @return mixed The $children array, the child stored at $idx, or null when
 *               no child is set at that index.
 */
function children($idx=-1) {
    if ($idx !== -1) {
        return isset($this->children[$idx]) ? $this->children[$idx] : null;
    }
    return $this->children;
}
// returns the first child of node
/**
 * Returns the entry at index 0 of $children, or null when the list is empty.
 */
function first_child() {
    return count($this->children) > 0 ? $this->children[0] : null;
}
// returns the last child of node
/**
 * Returns the entry at index count-1 of $children, or null when the list is empty.
 */
function last_child() {
    $total = count($this->children);
    if ($total <= 0) {
        return null;
    }
    return $this->children[$total - 1];
}
// returns the next sibling of node
/**
 * Returns the entry that follows this node in its parent's $children list.
 *
 * Null is returned when there is no parent, when this node is the last
 * child, or when this node is not found in the parent's $children at all.
 */
function next_sibling() {
    if ($this->parent === null) {
        return null;
    }
    $siblings = $this->parent->children;
    $total = count($siblings);

    // Identity search for this node's position among its siblings.
    for ($pos = 0; $pos < $total; ++$pos) {
        if ($siblings[$pos] === $this) {
            break;
        }
    }

    $pos++;
    return ($pos < $total) ? $siblings[$pos] : null;
}
// returns the previous sibling of node
/**
 * Returns the entry that precedes this node in its parent's $children list.
 *
 * Null is returned when there is no parent, when this node is the first
 * child, or when this node is not found in the parent's $children at all.
 */
function prev_sibling() {
    if ($this->parent===null) return null;
    $idx = 0;
    $count = count($this->parent->children);
    while ($idx<$count && $this!==$this->parent->children[$idx])
        ++$idx;
    // Not found: the search ran off the end. Without this guard the decrement
    // below would land on count-1 and hand back the parent's last child,
    // whereas next_sibling() answers null in the same situation.
    if ($idx>=$count) return null;
    if (--$idx<0) return null;
    return $this->parent->children[$idx];
}
// get dom node's inner html
/**
 * Returns this node's inner HTML.
 *
 * Lookup order: a stored HDOM_INFO_INNER entry of $_ wins; otherwise a stored
 * HDOM_INFO_TEXT entry is returned after passing it through the document's
 * restore_noise(); otherwise the outertext() of every entry in $nodes is
 * concatenated.
 *
 * @return string
 */
function innertext() {
    if (isset($this->_[HDOM_INFO_INNER])) {
        return $this->_[HDOM_INFO_INNER];
    }
    if (isset($this->_[HDOM_INFO_TEXT])) {
        return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
    }

    $pieces = array();
    foreach ($this->nodes as $n) {
        $pieces[] = $n->outertext();
    }
    return implode('', $pieces);
}
?>
http://nchc.dl.sourceforge.net/project/simplehtmldom/simplehtmldom/1.11/simplehtmldom_1_11.zip
你只需要为每个标签找到配对的标签,并记录他们的原来位置,用substr就可以获取到内容,然后用键值对存入数组即可
不过我还有个思路,就是把 string 放到 dom 中,用 xpath 查 //div,然后循环列出节点的属性和内容。不过可能要加载一个 dll
看了他N多回帖没一个有用的
这个能匹配嵌套的标签。 但是嵌套的里面不能有其他的标签, 我希望是能嵌套其他的标签 哪位能改改吗?
mygod.......
我真不知道你为什么会如此损我,
我只是一个对普通的在职技术人员,不知道我究竟在哪里做了广告。
我觉楼主这个问题明显是在做dom 解析,所以用用dom 解析类会好一些。代码也是sourceforge 上找的。至于php,我用的是ci 框架,所以很多代码贴出来,没有框架的前提下,确实没法运行,仅表达一下思路。
本想给你留言,结果还没有这个权限。希望能加你为好友。我只是名普通的程序人员,近期喜欢上csdn 上发言。
我不否认现阶段我还是个菜鸟,很感谢您的批评。 我对技术人员的批评向来没有反感,我觉得这是提升的机会。
至于技术方面,N 多帖没用的是正则相关的,还是其它的。
我本身是用php ci 框架.
我仔细想想了想,被你批评有很大的原因是对很多问题,我仅做到了思路上如何去解决,并没有去实质性的去把问题fix 掉,这点我会努力的去改。
我也觉得,在php 行业里,做为一个菜鸟也没关系,努力的一点一点的去进步就行。
// Sample markup: a complete page whose <body> holds four nested <div> elements.
$s = <<<html
<html>
<head>
<title>nested tag test</title>
<script>
alert('fdsafdasfasd');
</script>
</head>
<body>
<div id=0>
<div id=1><img name="img1" id="img1" src=""/>
<div id=2><img name="img2" id="img2" src=""/>
<div id=3><img name="img3" id="img3" src=""/>
</div>
</div>
</div>
</div>
</body>
</html>
html;

// Tokenizer pattern: each alternative matches one kind of markup token.
$pattern = "/(" .
    // declarations such as <!DOCTYPE ...>
    "<\!\w+(?:\s+[^>]*?)+\s*>|" .
    // opening or self-closing tags, with optional quoted/unquoted attributes
    "<\w+(?:\s+\w+(?:\s*=\s*(?:\"[^\"]*\"|'[^']*'|[^\"'>\s]+))?)*\s*\/?>|" .
    // closing tags
    "<\/\w+\s*>|" .
    // comments: [\s\S]*? (instead of [^-]*) also accepts hyphens and newlines
    // inside the comment body, so markup inside a comment is swallowed whole
    "<\!--[\s\S]*?-->" .
    ")/";

// PREG_OFFSET_CAPTURE makes every match a [text, byte offset] pair.
preg_match_all($pattern, $s, $aMatches, PREG_OFFSET_CAPTURE);
/**
 * Pairs every opening tag with its matching end tag.
 *
 * Tokens are walked once while a stack of still-open tokens is kept per tag
 * name, so each end tag is paired with the nearest unclosed opening tag of
 * the same (case-insensitive) name. Self-closing tags, declarations,
 * comments and tags that are never closed produce no entry.
 *
 * @param string $s   The markup that was tokenized.
 * @param array  $arr Tokens as [text, offset] pairs, i.e. element 0 of a
 *                    preg_match_all(..., PREG_OFFSET_CAPTURE) result.
 * @return array List ordered by opening-tag position; each entry is
 *               [0 => opening token, 1 => closing token,
 *                2 => markup from the opening tag through the end tag].
 */
function getMatchTags($s, $arr) {
    $openStacks = array();  // lower-cased tag name => indexes of still-open tokens
    $pairs = array();       // index of opening token => index of its closing token

    foreach ($arr as $i => $token) {
        $text = $token[0];
        if (preg_match('/^<\/(\w+)\s*>$/', $text, $m)) {
            $name = strtolower($m[1]);
            // A stray end tag without an open partner is ignored.
            if (!empty($openStacks[$name])) {
                $pairs[array_pop($openStacks[$name])] = $i;
            }
        } elseif (preg_match('/^<(\w+)/', $text, $m) && !preg_match('/\/\s*>$/', $text)) {
            $openStacks[strtolower($m[1])][] = $i;
        }
    }

    // Pairs are discovered in closing order; report them in opening order.
    ksort($pairs);

    $arrReturn = array();
    foreach ($pairs as $openIdx => $closeIdx) {
        $open = $arr[$openIdx];
        $close = $arr[$closeIdx];
        $arrReturn[] = array(
            $open,
            $close,
            substr($s, $open[1], $close[1] + strlen($close[0]) - $open[1]),
        );
    }
    return $arrReturn;
}
// Dump every [opening token, closing token, spanned markup] triple that
// getMatchTags() builds from the tokenized sample.
print_r(getMatchTags($s, $aMatches[0]));
?>
out:
Array
(
[0] => Array
(
[0] => Array
(
[0] => <html>
[1] => 0
) [1] => Array
(
[0] => </html>
[1] => 343
) [2] => <html>
<head>
<title>nested tag test</title>
<script>
alert('fdsafdasfasd');
</script>
</head>
<body>
<div id=0>
<div id=1><img name="img1" id="img1" src=""/>
<div id=2><img name="img2" id="img2" src=""/>
<div id=3><img name="img3" id="img3" src=""/>
</div>
</div>
</div>
</div>
</body>
</html>
) [1] => Array
(
[0] => Array
(
[0] => <head>
[1] => 7
) [1] => Array
(
[0] => </head>
[1] => 87
) [2] => <head>
<title>nested tag test</title>
<script>
alert('fdsafdasfasd');
</script>
</head>
) [2] => Array
(
[0] => Array
(
[0] => <title>
[1] => 14
) [1] => Array
(
[0] => </title>
[1] => 36
) [2] => <title>nested tag test</title>
) [3] => Array
(
[0] => Array
(
[0] => <script>
[1] => 45
) [1] => Array
(
[0] => </script>
[1] => 77
) [2] => <script>
alert('fdsafdasfasd');
</script>
) [4] => Array
(
[0] => Array
(
[0] => <body>
[1] => 95
) [1] => Array
(
[0] => </body>
[1] => 335
) [2] => <body>
<div id=0>
<div id=1><img name="img1" id="img1" src=""/>
<div id=2><img name="img2" id="img2" src=""/>
<div id=3><img name="img3" id="img3" src=""/>
</div>
</div>
</div>
</div>
</body>
) [5] => Array
(
[0] => Array
(
[0] => <div id=0>
[1] => 103
) [1] => Array
(
[0] => </div>
[1] => 328
) [2] => <div id=0>
<div id=1><img name="img1" id="img1" src=""/>
<div id=2><img name="img2" id="img2" src=""/>
<div id=3><img name="img3" id="img3" src=""/>
</div>
</div>
</div>
</div>
) [6] => Array
(
[0] => Array
(
[0] => <div id=1>
[1] => 119
) [1] => Array
(
[0] => </div>
[1] => 328
) [2] => <div id=1><img name="img1" id="img1" src=""/>
<div id=2><img name="img2" id="img2" src=""/>
<div id=3><img name="img3" id="img3" src=""/>
</div>
</div>
</div>
</div>
) [7] => Array
(
[0] => Array
(
[0] => <div id=2>
[1] => 174
) [1] => Array
(
[0] => </div>
[1] => 308
) [2] => <div id=2><img name="img2" id="img2" src=""/>
<div id=3><img name="img3" id="img3" src=""/>
</div>
</div>
) [8] => Array
(
[0] => Array
(
[0] => <div id=3>
[1] => 233
) [1] => Array
(
[0] => </div>
[1] => 292
) [2] => <div id=3><img name="img3" id="img3" src=""/>
</div>
))
所以,单纯使用正则表达式是不能处理嵌套的。
楼上有几位也给出了一些解决方案,只是离正则表达式“太远”了。如果仅仅是分离出各个标记,可以:
// Sample markup with one level of <div> nesting.
$html = <<<HTML
<html>
<head>
<title>title</title>
</head>
<body>
<div>div1
<div>div2</div>
</div>
</body>
</html>
HTML;

// Collected elements; initialised here so print_r() below always has an
// array to show, even when nothing matches.
$dict = array();

/**
 * preg_replace_callback() handler: records one matched element in the global
 * $dict as [tag name => matched markup with all whitespace removed] and
 * returns '' so the element is cut out of the subject string.
 *
 * @param array $r Match array; $r[0] is the whole element, $r[1] its tag name.
 * @return string Always ''.
 */
function foo($r) {
    global $dict;
    $dict[] = array($r[1] => preg_replace('/\s/s', '', $r[0]));
    return '';
}

// Matches only elements that contain no further tags. The original first
// assigned a recursive variant ('#<([a-z]+)[^>]*>([^<>]|(?R))*</\\1>#is') and
// immediately overwrote it with this one; that dead assignment is dropped.
$p = '#<([a-z]+)[^>]*>[^<>]*</\\1>#is';

// Peel the markup from the inside out: each pass removes the current
// innermost elements, until a pass changes nothing (or PCRE reports an
// error by returning null).
do {
    $st = $html;
    $html = preg_replace_callback($p, 'foo', $html);
} while ($html !== null && $st != $html);

print_r($dict);
Array
(
[0] => Array
(
[title] => <title>title</title>
) [1] => Array
(
[div] => <div>div2</div>
) [2] => Array
(
[head] => <head></head>
) [3] => Array
(
[div] => <div>div1</div>
) [4] => Array
(
[body] => <body></body>
) [5] => Array
(
[html] => <html></html>
))
\( ( [^()] | \(([^()])*\) )* \)
<font> ( (?!</?font>). | (<font>((?!</?font>).)*</font>) )* </font>
PHP 和 GRETA 的简便之处在于,匹配嵌套(n-1)层的表达式用 (?R) 表示:
\( ( [^()] | (?R) )* \) 第四步,依此类推,可以编写出匹配有限(n)层的表达式。这种方式写出来的表达式,虽然看上去很长,但是这种表达式经过编译后,匹配效率仍然是很高的。
老大原贴,收藏了
另:
如果是模板,正则还是比较好的。但嵌套不要太多,你可看uchome的。DOm并不好用,因为 html本身不是严格dom的。
因为很多html的书写并不符合标准,而且有时候标签并不匹配。
如果用正则递归碰到这种情况,肯定会慢很多,对于复杂html,递归的层数很可能会成问题。不过可以研究研究,说不定有些情况就会用到。$s = <<<html
<html>
<head>
<title>nested tag test</title>
<script>
alert('fdsafdasfasd');
</script>
</head>
<body>
<!--oooo-->
<div id=0>
<div id=1><img name="img1" id="img1" src=""/>
<div id=2><img name="img2" id="img2" src=""/>
<div id=3><img name="img3" id="img3" src=""/>
haha
</div>
</div>
</div>
</div>
</body>
</html>
html;
// The recursive pattern below can recurse and backtrack heavily on real
// pages, so both PCRE limits are raised first.
ini_set('pcre.recursion_limit', 3000000);
ini_set('pcre.backtrack_limit', 3000000);

// Group 1: tag name, group 2: attribute text, group 5: inner HTML of a
// non-self-closing element; (?0) recurses into the whole pattern for nested
// elements. The former "e" modifier is gone: it was removed in PHP 7 and
// evaluated matched markup as PHP code.
$pattern = '#<(\w+)([^>]*?)((\s*/\s*>)|(?:>(((?0)|([^<]*?)|(<!--[\s\S]*?-->))*)</\1>))#ism';

// Collected elements, filled by mc().
$container = array();

preg_replace_callback($pattern, 'mc_match', $s);

/**
 * preg_replace_callback() adapter that replaces the old
 * "mc('$0','$5')" eval replacement: hands the whole match and its inner HTML
 * (group 5, absent for self-closing tags) to mc().
 *
 * @param array $m Match array produced by $pattern.
 * @return string Always ''; the replaced string is never used.
 */
function mc_match($m)
{
    mc($m[0], isset($m[5]) ? $m[5] : '');
    return '';
}

/**
 * Records one element in the global $container and then scans its inner HTML
 * for nested elements with the same pattern.
 *
 * The str_replace('\\"', '"', ...) calls of the original are gone: they only
 * undid the quote escaping added by the "e" modifier and would now corrupt
 * markup that really contains a backslash followed by a quote.
 *
 * @param string $parent Outer HTML of the element.
 * @param string $child  Inner HTML of the element; elements with empty
 *                       inner HTML are not recorded.
 * @return void
 */
function mc( $parent,$child )
{
    global $pattern,$container;
    if( $child === null || $child === '' )
    {
        return;
    }
    $container[] = array('element'=>$parent,'innerHTML'=>$child);
    preg_replace_callback($pattern, 'mc_match', $child);
}

echo "<pre/><xmp/>";
print_r( $container );