用 Zend_Search_Lucene 搜索英文时,模糊搜索有时搜不出结果。
例如数据库中有数据(zz12、zz13、100pp1、100pp2),
建立索引之后,
搜索 zz 搜不到任何结果,
只有搜索 zz12 这样的完整词才可以。
<?php
/**
 * Lucene analyzer for mixed Chinese / ASCII text. Input is treated as UTF-8.
 *
 * - A run of ASCII letters and digits becomes one token (e.g. "zz12").
 * - A run of multi-byte characters is emitted as overlapping two-character
 *   tokens (bigrams): "ABC" yields "AB" and "BC".
 * - A multi-byte character that has no multi-byte neighbour to pair with
 *   produces no token.
 *
 * Token offsets refer to the input as rewritten by reset(), not to the text
 * originally handed to the analyzer.
 *
 * NOTE(review): every token is a whole term, so a partial query such as "zz"
 * presumably needs a wildcard/prefix query to match "zz12" - confirm against
 * the Zend_Search_Lucene query documentation.
 */
class CnLuceneAnalyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Byte offset of the next unread character in $this->_input.
     *
     * @var int
     */
    private $_position;

    /**
     * Words that reset() replaces with a separator before tokenizing.
     *
     * @var array
     */
    private $_cnStopWords = array();

    /**
     * Set the stop words removed from the input by reset().
     *
     * @param array $cnStopWords list of strings to blank out
     * @return void
     */
    public function setCnStopWords($cnStopWords)
    {
        $this->_cnStopWords = $cnStopWords;
    }

    /**
     * Reset token stream
     *
     * Rewinds the read position and replaces punctuation and stop words with
     * a single space each.
     *
     * @return void
     */
    public function reset()
    {
        $this->_position = 0;

        // Nothing to clean; keep null so nextToken() can detect "no input".
        if ($this->_input === null) {
            return;
        }

        $punctuation = array(",", "/", "\\", ".", ";", ":", "\"", "!", "~", "`", "^", "(", ")", "?", "-", "'", "<", ">", "$", "&", "%", "#", "@", "+", "=", "{", "}", "[", "]", ":", ")", "(", ".", "。", ",", "!", ";", "“", "”", "‘", "’", "〔", "〕", "、", "—", " ", "《", "》", "-", "…", "【", "】", "?", "¥");

        // Replace with a space rather than deleting: deleting glues the
        // neighbouring words together ("zz12,zz13" would become the single
        // term "zz12zz13") and lets bigrams span punctuation.
        $this->_input = str_replace($punctuation, ' ', $this->_input);
        $this->_input = str_replace($this->_cnStopWords, ' ', $this->_input);
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        $len = strlen($this->_input);

        while ($this->_position < $len) {
            // Skip leading spaces.
            while ($this->_position < $len && $this->_input[$this->_position] === ' ') {
                $this->_position++;
            }

            // Only spaces were left: stop before reading past the end.
            if ($this->_position >= $len) {
                break;
            }

            $termStartPosition = $this->_position;
            // Offset of the second character of a bigram; null when the
            // current term is not a bigram.
            $resumePosition = null;

            if (ord($this->_input[$this->_position]) > 127) {
                // Multi-byte run: consume at most two whole characters.
                $charCount = 0;
                while ($this->_position < $len && ord($this->_input[$this->_position]) > 127) {
                    if ($charCount === 1) {
                        $resumePosition = $this->_position;
                    }
                    $this->_position += $this->_utf8CharLength($this->_position, $len);
                    $charCount++;
                    if ($charCount === 2) {
                        break;
                    }
                }

                // A lone multi-byte character cannot form a bigram: drop it.
                if ($charCount === 1) {
                    continue;
                }
            } else {
                while ($this->_position < $len && ctype_alnum($this->_input[$this->_position])) {
                    $this->_position++;
                }

                // Not a letter or digit: treat the byte as a separator.
                if ($this->_position === $termStartPosition) {
                    $this->_position++;
                    continue;
                }
            }

            $termText = substr($this->_input, $termStartPosition, $this->_position - $termStartPosition);
            $token = new Zend_Search_Lucene_Analysis_Token($termText, $termStartPosition, $this->_position);
            $token = $this->normalize($token);

            // Step back to the second character so the next bigram overlaps
            // this one.
            if ($resumePosition !== null) {
                $this->_position = $resumePosition;
            }

            if ($token !== null) {
                return $token;
            }
        }

        return null;
    }

    /**
     * Byte length of the UTF-8 character starting at $offset.
     *
     * The expected length is derived from the lead byte; the result is cut
     * short when the sequence is truncated or the following bytes are not
     * continuation bytes, so the caller never steps past the end of the input
     * or swallows an unrelated byte.
     *
     * @param int $offset byte offset of the lead byte in $this->_input
     * @param int $len    byte length of $this->_input
     * @return int number of bytes to advance (always >= 1)
     */
    private function _utf8CharLength($offset, $len)
    {
        $lead = ord($this->_input[$offset]);

        if ($lead >= 0xF0) {
            $expected = 4;
        } elseif ($lead >= 0xE0) {
            $expected = 3;
        } elseif ($lead >= 0xC0) {
            $expected = 2;
        } else {
            // ASCII or a stray continuation byte.
            $expected = 1;
        }

        $width = 1;
        while ($width < $expected
            && $offset + $width < $len
            && (ord($this->_input[$offset + $width]) & 0xC0) === 0x80
        ) {
            $width++;
        }

        return $width;
    }
}
?>
以上是分词器的源码,请问这是什么原因造成的?
例如数据库中有数据(zz12、zz13、100pp1、100pp2),
建立索引之后,
搜索 zz 搜不到任何结果,
只有搜索 zz12 这样的完整词才可以。
<?php
/**
 * Lucene analyzer for mixed Chinese / ASCII text. Input is treated as UTF-8.
 *
 * - A run of ASCII letters and digits becomes one token (e.g. "zz12").
 * - A run of multi-byte characters is emitted as overlapping two-character
 *   tokens (bigrams): "ABC" yields "AB" and "BC".
 * - A multi-byte character that has no multi-byte neighbour to pair with
 *   produces no token.
 *
 * Token offsets refer to the input as rewritten by reset(), not to the text
 * originally handed to the analyzer.
 *
 * NOTE(review): every token is a whole term, so a partial query such as "zz"
 * presumably needs a wildcard/prefix query to match "zz12" - confirm against
 * the Zend_Search_Lucene query documentation.
 */
class CnLuceneAnalyzer extends Zend_Search_Lucene_Analysis_Analyzer_Common
{
    /**
     * Byte offset of the next unread character in $this->_input.
     *
     * @var int
     */
    private $_position;

    /**
     * Words that reset() replaces with a separator before tokenizing.
     *
     * @var array
     */
    private $_cnStopWords = array();

    /**
     * Set the stop words removed from the input by reset().
     *
     * @param array $cnStopWords list of strings to blank out
     * @return void
     */
    public function setCnStopWords($cnStopWords)
    {
        $this->_cnStopWords = $cnStopWords;
    }

    /**
     * Reset token stream
     *
     * Rewinds the read position and replaces punctuation and stop words with
     * a single space each.
     *
     * @return void
     */
    public function reset()
    {
        $this->_position = 0;

        // Nothing to clean; keep null so nextToken() can detect "no input".
        if ($this->_input === null) {
            return;
        }

        $punctuation = array(",", "/", "\\", ".", ";", ":", "\"", "!", "~", "`", "^", "(", ")", "?", "-", "'", "<", ">", "$", "&", "%", "#", "@", "+", "=", "{", "}", "[", "]", ":", ")", "(", ".", "。", ",", "!", ";", "“", "”", "‘", "’", "〔", "〕", "、", "—", " ", "《", "》", "-", "…", "【", "】", "?", "¥");

        // Replace with a space rather than deleting: deleting glues the
        // neighbouring words together ("zz12,zz13" would become the single
        // term "zz12zz13") and lets bigrams span punctuation.
        $this->_input = str_replace($punctuation, ' ', $this->_input);
        $this->_input = str_replace($this->_cnStopWords, ' ', $this->_input);
    }

    /**
     * Tokenization stream API
     * Get next token
     * Returns null at the end of stream
     *
     * @return Zend_Search_Lucene_Analysis_Token|null
     */
    public function nextToken()
    {
        if ($this->_input === null) {
            return null;
        }

        $len = strlen($this->_input);

        while ($this->_position < $len) {
            // Skip leading spaces.
            while ($this->_position < $len && $this->_input[$this->_position] === ' ') {
                $this->_position++;
            }

            // Only spaces were left: stop before reading past the end.
            if ($this->_position >= $len) {
                break;
            }

            $termStartPosition = $this->_position;
            // Offset of the second character of a bigram; null when the
            // current term is not a bigram.
            $resumePosition = null;

            if (ord($this->_input[$this->_position]) > 127) {
                // Multi-byte run: consume at most two whole characters.
                $charCount = 0;
                while ($this->_position < $len && ord($this->_input[$this->_position]) > 127) {
                    if ($charCount === 1) {
                        $resumePosition = $this->_position;
                    }
                    $this->_position += $this->_utf8CharLength($this->_position, $len);
                    $charCount++;
                    if ($charCount === 2) {
                        break;
                    }
                }

                // A lone multi-byte character cannot form a bigram: drop it.
                if ($charCount === 1) {
                    continue;
                }
            } else {
                while ($this->_position < $len && ctype_alnum($this->_input[$this->_position])) {
                    $this->_position++;
                }

                // Not a letter or digit: treat the byte as a separator.
                if ($this->_position === $termStartPosition) {
                    $this->_position++;
                    continue;
                }
            }

            $termText = substr($this->_input, $termStartPosition, $this->_position - $termStartPosition);
            $token = new Zend_Search_Lucene_Analysis_Token($termText, $termStartPosition, $this->_position);
            $token = $this->normalize($token);

            // Step back to the second character so the next bigram overlaps
            // this one.
            if ($resumePosition !== null) {
                $this->_position = $resumePosition;
            }

            if ($token !== null) {
                return $token;
            }
        }

        return null;
    }

    /**
     * Byte length of the UTF-8 character starting at $offset.
     *
     * The expected length is derived from the lead byte; the result is cut
     * short when the sequence is truncated or the following bytes are not
     * continuation bytes, so the caller never steps past the end of the input
     * or swallows an unrelated byte.
     *
     * @param int $offset byte offset of the lead byte in $this->_input
     * @param int $len    byte length of $this->_input
     * @return int number of bytes to advance (always >= 1)
     */
    private function _utf8CharLength($offset, $len)
    {
        $lead = ord($this->_input[$offset]);

        if ($lead >= 0xF0) {
            $expected = 4;
        } elseif ($lead >= 0xE0) {
            $expected = 3;
        } elseif ($lead >= 0xC0) {
            $expected = 2;
        } else {
            // ASCII or a stray continuation byte.
            $expected = 1;
        }

        $width = 1;
        while ($width < $expected
            && $offset + $width < $len
            && (ord($this->_input[$offset + $width]) & 0xC0) === 0x80
        ) {
            $width++;
        }

        return $width;
    }
}
?>
以上是分词器的源码,请问这是什么原因造成的?
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货