Generally speaking, the analyzers that ship with Lucene can satisfy most users' needs, and Lucene also lets you plug in your own custom analyzer. The following is reposted here for the original poster's reference.

Existing Chinese word-segmentation techniques fall into three categories:
Segmentation based on string matching
Segmentation based on understanding
Segmentation based on statistics
This article uses segmentation based on string matching, also called mechanical segmentation. It matches the Chinese text to be analyzed against the entries of a "sufficiently large" dictionary according to some strategy; if a string is found in the dictionary, the match succeeds and a word is recognized. By scanning direction, string-matching segmentation divides into forward matching and reverse matching; by which length is tried first, into maximum (longest) matching and minimum (shortest) matching; and by whether it is combined with part-of-speech tagging, into plain segmentation and segmentation combined with tagging. The most commonly used mechanical methods are the two below (a small standalone sketch of forward maximum matching follows the list):
Forward maximum matching (scanning left to right)
Reverse maximum matching (scanning right to left)
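To make forward maximum matching concrete before diving into the Lucene classes, here is a minimal standalone sketch. It is not part of the original analyzer; the tiny dictionary and sample sentence are made up for illustration.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class ForwardMaxMatchDemo {
    public static void main(String[] args) {
        // Hypothetical dictionary; a real one would be loaded from a word-list file.
        Set<String> dict = new HashSet<String>(Arrays.asList(
                "中华", "中华人民", "中华人民共和国", "人民", "共和国", "成立"));
        String text = "中华人民共和国成立";
        int maxLen = 7; // length of the longest dictionary entry
        int pos = 0;
        while (pos < text.length()) {
            int end = Math.min(pos + maxLen, text.length());
            // Try the longest candidate first and shrink from the right
            // until a dictionary entry (or a single character) remains.
            while (end > pos + 1 && !dict.contains(text.substring(pos, end))) {
                end--;
            }
            System.out.println(text.substring(pos, end));
            pos = end;
        }
        // Prints "中华人民共和国" and then "成立": the longest match always wins.
    }
}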
Analyzer implementation
This Lucene analyzer, which implements the forward maximum matching variant of mechanical segmentation, consists of two classes, CJKAnalyzer and CJKTokenizer. Their source code is as follows:

package org.solol.analysis;

import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;

/**
 * @author solo L
 */
public class CJKAnalyzer extends Analyzer { // extends Lucene's Analyzer base class, as Lucene requires
    public final static String[] STOP_WORDS = {};

    private Set stopTable;

    public CJKAnalyzer() {
        stopTable = StopFilter.makeStopSet(STOP_WORDS);
    }

    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new StopFilter(new CJKTokenizer(reader), stopTable);
    }
}
package org.solol.analysis;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.TreeMap;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

/**
 * @author solo L
 */
public class CJKTokenizer extends Tokenizer {
    // This TreeMap caches the dictionary
    private static TreeMap simWords = null;

    private static final int IO_BUFFER_SIZE = 256;
    private int bufferIndex = 0;
    private int dataLen = 0;
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
    private String tokenType = "word";

    public CJKTokenizer(Reader input) {
        this.input = input;
    }

    // This is the heart of the Lucene tokenizer implementation
    public Token next() throws IOException {
        loadWords();
        StringBuffer currentWord = new StringBuffer();
        while (true) {
            char c;
            Character.UnicodeBlock ub;
            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }
            if (dataLen == -1) {
                if (currentWord.length() == 0) {
                    return null;
                } else {
                    break;
                }
            } else {
                c = ioBuffer[bufferIndex++];
                ub = Character.UnicodeBlock.of(c);
            }
            // The condition below only handles CJK_UNIFIED_IDEOGRAPHS,
            // so other characters are dropped, e.g. Latin letters and digits.
            // That is a limitation of this tokenizer; you are welcome to build
            // on it, and improvements fed back to the author are appreciated.
            if (Character.isLetter(c) && ub == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
                tokenType = "double";
                if (currentWord.length() == 0) {
                    currentWord.append(c);
                } else {
                    // Forward maximum matching: keep extending the current word
                    // while the extended string is still a dictionary entry or prefix.
                    String temp = (currentWord.toString() + c).intern();
                    if (simWords.containsKey(temp)) {
                        currentWord.append(c);
                    } else {
                        // The extra character does not extend a match; push it back
                        // so it starts the next token.
                        bufferIndex--;
                        break;
                    }
                }
            }
        }
        // Note: the offsets are computed from the position within the current
        // I/O buffer, not from the start of the whole input stream.
        Token token = new Token(currentWord.toString(), bufferIndex - currentWord.length(), bufferIndex, tokenType);
        currentWord.setLength(0);
        return token;
    }
    // Load the dictionary. Understanding this logic, and why it is done this way,
    // is the key to seeing how forward maximum matching is implemented.
    public void loadWords() {
        if (simWords != null) return;
        simWords = new TreeMap();
        try {
            InputStream words = new FileInputStream("simchinese.txt");
            BufferedReader in = new BufferedReader(new InputStreamReader(words, "UTF-8"));
            String word = null;
            while ((word = in.readLine()) != null) {
                // The # character lets us put comments in the dictionary where needed
                if ((word.indexOf("#") == -1) && (word.length() < 5)) {
                    // Full entries are stored with value "1"
                    simWords.put(word.intern(), "1");
                    // Prefixes of longer entries are stored with value "2",
                    // so matching in next() can keep extending through them
                    if (word.length() == 3) {
                        if (!simWords.containsKey(word.substring(0, 2).intern())) {
                            simWords.put(word.substring(0, 2).intern(), "2");
                        }
                    }
                    if (word.length() == 4) {
                        if (!simWords.containsKey(word.substring(0, 2).intern())) {
                            simWords.put(word.substring(0, 2).intern(), "2");
                        }
                        if (!simWords.containsKey(word.substring(0, 3).intern())) {
                            simWords.put(word.substring(0, 3).intern(), "2");
                        }
                    }
                }
            }
            in.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
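To try the analyzer, a minimal sketch like the following can be used. It is not from the original article: the field name "content" and the sample sentence are arbitrary, and it assumes the dictionary file simchinese.txt (one word per line, lines containing # treated as comments) is in the working directory. It uses the same old-style Lucene API (Token next()) as the tokenizer above.

import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.solol.analysis.CJKAnalyzer;

public class AnalyzerDemo {
    public static void main(String[] args) throws Exception {
        CJKAnalyzer analyzer = new CJKAnalyzer();
        // "content" is just a placeholder field name.
        TokenStream ts = analyzer.tokenStream("content", new StringReader("中华人民共和国成立了"));
        Token token;
        while ((token = ts.next()) != null) {
            // Print each recognized word with its offsets within the current buffer.
            System.out.println(token.termText() + " [" + token.startOffset() + ", " + token.endOffset() + ")");
        }
    }
}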
// Boolean search
BooleanQuery booleanQuery = new BooleanQuery();

QueryParser parser = new QueryParser("newstitle", analyzer);
Query query = parser.parse("国家");
booleanQuery.add(query, Occur.SHOULD);

QueryParser parser2 = new QueryParser("newsContent", analyzer);
Query query2 = parser2.parse("民族");
booleanQuery.add(query2, Occur.SHOULD);

Hits hits = searcher.search(booleanQuery, Sort.RELEVANCE); // sort by relevance score

Occur expresses the AND/OR relationship between clauses: MUST behaves like AND, SHOULD behaves like OR, and MUST_NOT excludes matching documents.
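For reference, here is a small self-contained sketch of the three Occur values. It is not from the original post; it reuses the "newstitle"/"newsContent" field names from the snippet above, while the extra term "体育" and the use of TermQuery instead of QueryParser are assumptions made purely for illustration.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;

public class OccurDemo {
    public static void main(String[] args) {
        BooleanQuery bq = new BooleanQuery();
        // MUST: the clause has to match (logical AND).
        bq.add(new TermQuery(new Term("newstitle", "国家")), Occur.MUST);
        // SHOULD: the clause may match and only influences scoring (logical OR).
        bq.add(new TermQuery(new Term("newsContent", "民族")), Occur.SHOULD);
        // MUST_NOT: documents matching this clause are excluded (logical NOT).
        bq.add(new TermQuery(new Term("newsContent", "体育")), Occur.MUST_NOT);
        System.out.println(bq); // prints the combined query expression
    }
}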