<?php
ini_set('pcre.backtrack_limit', 10000000);
$content = '22222';
$bad_words = "/000|22222|33333|4月25日|44444|66666|77777|81813|88888|99999|555555|恐怖/i";
if(preg_match ($bad_words, $content)) {
echo 'exists';
} else {
echo 'not exists';
}
?>
如上程序,$bad_words内容少时,程序正常,但内容很长的时候,就会报错了:
Warning: preg_match() [function.preg-match]: Compilation failed: regular expression is too large at offset 39505 in D:\www\htdocs\1.php on line 5
not exists
ini_set('pcre.backtrack_limit', 10000000);
$content = '22222';
$bad_words = "/000|22222|33333|4月25日|44444|66666|77777|81813|88888|99999|555555|恐怖/i";
if(preg_match ($bad_words, $content)) {
echo 'exists';
} else {
echo 'not exists';
}
?>
如上程序,$bad_words内容少时,程序正常,但内容很长的时候,就会报错了:
Warning: preg_match() [function.preg-match]: Compilation failed: regular expression is too large at offset 39505 in D:\www\htdocs\1.php on line 5
not exists
我有个现成的,参考一下:
其中badword.txt是过滤词汇集合,一词一行//过滤内容
function filter($text) {
//这里写其它处理的代码
filter_badwords($text);
return $text;
}
//过滤词汇
function filter_badwords($text) {
if ($badwords = @file('badword.txt')) {
foreach ($badwords as $badword) {
$badword = trim($badword);
if ($badword != '') {
$badword = preg_replace('/[0-9a-z]|[\x80-\xff]{3}/iU', '$0([^0-9a-z\x80-\xff]|(&[#a-z0-9]+;)| )*', $badword);
$badword = preg_replace('/\(\[\^0-9a-z\\\x80-\\\xff\]\|\(&\[#a-z0-9\]\+;\)\| \)\*$/i', '', $badword);
$badword = preg_replace('/^\(\[\^0-9a-z\\\x80-\\\xff\]\|\(\&\[\#a-z0-9\]\+\;\)\| \)\*/i', '', $badword);
if (preg_match('/'.$badword.'/i', $text, $matches))
die('提交被拒绝!有禁用词汇“'.preg_replace('/[0-9a-z]|[\x80-\xff]{3}/iU', '$0<span style="color:#FFFFFF;display:none;">禁</span>', $matches[0]).'”。请返回检查。');
unset($matches);
}
}
}
}
<?php
//过滤内容
function filter($text) {
//这里写其它处理的代码
filter_badwords($text);
return $text;
}
//过滤词汇
function filter_badwords($text) {
if ($badwords = @file('badword.txt')) {
foreach ($badwords as $badword) {
$badword = trim($badword);
if ($badword != '') {
$badword = preg_replace('/[0-9a-z]|[\x80-\xff]{3}/iU', '$0([^0-9a-z\x80-\xff]|(&[#a-z0-9]+;)| )*', $badword);
$badword = preg_replace('/\(\[\^0-9a-z\\\x80-\\\xff\]\|\(&\[#a-z0-9\]\+;\)\| \)\*$/i', '', $badword);
$badword = preg_replace('/^\(\[\^0-9a-z\\\x80-\\\xff\]\|\(\&\[\#a-z0-9\]\+\;\)\| \)\*/i', '', $badword);
if (preg_match('/'.$badword.'/i', $text, $matches))
die('提交被拒绝!有禁用词汇"'.preg_replace('/[0-9a-z]|[\x80-\xff]{3}/iU', '$0<span style="color:#FFFFFF;display:none;">禁</span>', $matches[0]).'"。请返回检查。');
unset($matches);
}
}
}
}
filter_badwords('5dd399');
?>同样超出了:
Warning: preg_match() [function.preg-match]: Compilation failed: missing terminating ] for character class at offset 46 in D:\www\htdocs\temp.php on line 42
$text = @iconv("utf-8", "gbk", $text);
$so = scws_new();
$dicPath = self::INC_TEXT_DIR . self::BANNDED_WORD;
$so->set_dict($dicPath, SCWS_XDICT_TXT);
$so->send_text($text);
$text = @iconv("gbk", "utf-8", $text);
$ret = $so->has_word("*");
if ($ret) {
$result = $so->get_words("*");
$words = array();
foreach ($result as $w) {
$words[] = $w['word'];
}
$message = '含有敏感词:' . iconv('gb2312', 'utf-8', implode(',', $words)) . ' 请用下划线_隔开敏感词';
KDG::log("step2.1.1 关健字审核 <font color='red'> $message </font> <font color='red'> 发送文字合法性验证未通过,发送短信关闭</font>");
return array(false, $message);
}
2建议多打开几个。 ini_set("pcre.backtrack_limit" , -1);
ini_set("pcre.recursion_limit" , -1);
ini_set("memory_limit" , "1024M");
如果程序赶着做,就建议先用scws 做分词验证。
如果有时间慢慢研究,就好好看看正则配置的问题。我之前有一次做采集,也有类似的限制,改改配置后就正常了。
<?php
ini_set("pcre.backtrack_limit" , -1);
ini_set("pcre.recursion_limit" , -1);
ini_set("memory_limit" , "1024M");
$content = '22222';
$bad_words = "/000|22222|33333|4月25日|44444|66666|77777|81813|88888|99999|555555|恐怖/i";
if(preg_match ($bad_words, $content)) {
echo 'exists';
} else {
echo 'not exists';
}
?>Warning: preg_match() [function.preg-match]: Compilation failed: regular expression is too large at offset 31311 in D:\www\htdocs\t.php on line 7
not exists
2
把 $bad_words 这个分个组吧。 一组里面只存20 个。
$bad_words = "/000|22222|33333|4月25日|44444|66666|77777|81813|88888|99999|555555|恐怖/i";
我当前正在用的badword.txt是16KB
http://info.162100.com/inc/require/badword.txt