lucene对xml检索问题

我对文件夹里的 xml 文件建立了索引，但是为什么检索不到呢？请各位大侠指点指点。
建立索引的类：LuceneIndexLocalDisk
package Test;
import java.io.IOException;
import java.io.File;import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.*;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/*******************************************************************
* 本代码完成本地指定目录的遍历和文件查找。对指定后缀的文件进行分析，利用Lucene建立
* 索引，为后续检索使用做好准备。
*******************************************************************/
/**
 * Walks a local directory tree, hands every {@code .xml} file to {@link HandleXml}
 * and stores the resulting Lucene index on disk so it can be searched later.
 */
public class LuceneIndexLocalDisk {

    /** Directory the Lucene index is written to. */
    private static final String DEST_INDEX_PATH = "D:\\jy\\";

    /**
     * Root directory scanned recursively for XML files. It is currently the same
     * directory as the index; only files ending in ".xml" are picked up, so the
     * index files themselves are not fed back into the indexer.
     */
    private static final String TEXT_FILE_PATH = "D:\\jy\\";

    /** File-name suffix (compared case-insensitively) of the files to index. */
    private static final String XML_SUFFIX = ".xml";

    /**
     * Entry point: builds the index for {@link #TEXT_FILE_PATH} into
     * {@link #DEST_INDEX_PATH} and prints the number of indexed documents.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File indexpath = new File(DEST_INDEX_PATH);
        File localPath = new File(TEXT_FILE_PATH);

        try {
            int nums = indexBuilder(indexpath, localPath);
            System.out.println("Index Finished " + nums + "  docs");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates a fresh index (any existing index in {@code indexPath} is overwritten),
     * indexes every XML file found below {@code localPath}, optimizes the index and
     * closes the writer.
     *
     * @param indexPath directory that receives the index files
     * @param localPath root directory to scan for XML files
     * @return number of documents in the index before optimization
     * @throws IOException if {@code localPath} is missing, not a directory or not
     *                     readable, or if writing the index fails
     */
    public static int indexBuilder(File indexPath, File localPath) throws IOException {
        if (!localPath.exists() || !localPath.isDirectory() || !localPath.canRead()) {
            throw new IOException(localPath + "不存在或者不允许访问");
        }
        System.out.println("目标路径完好");

        IndexWriter fsWriter = new IndexWriter(indexPath, new StandardAnalyzer(), true);
        try {
            fsWriter.setUseCompoundFile(true);
            subIndexBuilder(fsWriter, localPath);
            int num = fsWriter.docCount();
            fsWriter.optimize();
            return num;
        } finally {
            // Always release the writer (and its index lock), even when traversal fails.
            fsWriter.close();
        }
    }

    /**
     * Tells whether a file name carries the XML suffix, ignoring case.
     *
     * @param name bare file name
     * @return {@code true} if {@code name} ends with ".xml" in any letter case
     */
    private static boolean isValidType(String name) {
        // regionMatches returns false for a negative offset, so names shorter than
        // the suffix are rejected without an explicit length check.
        return name.regionMatches(true, name.length() - XML_SUFFIX.length(),
                XML_SUFFIX, 0, XML_SUFFIX.length());
    }

    /**
     * Indexes a single file through {@link HandleXml}. Hidden, missing or unreadable
     * files are skipped, as are files whose name starts with the dot (no base name).
     *
     * @param fswriter open index writer that receives the documents
     * @param subfile  file to index
     * @throws IOException declared for callers; {@link HandleXml#handle} reports its
     *                     own failures
     */
    private static void fileIndexBuilder(IndexWriter fswriter, File subfile) throws IOException {
        if (subfile.isHidden() || !subfile.exists() || !subfile.canRead()) {
            return;
        }
        if (subfile.getName().indexOf('.') > 0) {
            new HandleXml().handle(fswriter, subfile);
        }
    }

    /**
     * Recursively walks {@code subPath}: descends into sub-directories and indexes
     * every file accepted by {@link #isValidType(String)}.
     *
     * @param fswriter open index writer that receives the documents
     * @param subPath  directory to scan
     * @throws IOException if indexing a file fails
     */
    private static void subIndexBuilder(IndexWriter fswriter, File subPath) throws IOException {
        File[] filelist = subPath.listFiles();
        if (filelist == null) {
            // listFiles() returns null when the directory cannot be listed.
            System.err.println("Cannot list directory, skipped: " + subPath.getAbsolutePath());
            return;
        }
        System.out.println(subPath.getAbsolutePath() + " :子目录个数 " + filelist.length);

        for (int i = 0; i < filelist.length; i++) {
            File file = filelist[i];
            if (file.isDirectory()) {
                subIndexBuilder(fswriter, file);
            } else if (isValidType(file.getName())) {
                fileIndexBuilder(fswriter, file);
            }
        }
    }
}
解析 xml 的类：HandleXml
package Test;
import java.io.File;
import java.io.IOException;import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;public class HandleXml {

public void  handle(IndexWriter fswriter,File subPath)
{
try { // 处理分析XML文档，并索引文档内容
Directory ramdirectory = new RAMDirectory();
Analyzer TextAnalyzer = new StandardAnalyzer();            // 生成分析器
                    // 根据指定文件创建输入流

Document document = new Document() ;                       // 由Office文件生成文档对象
// 获取DOM对象树的生成器
        DocumentBuilderFactory builderfactory = DocumentBuilderFactory.newInstance();

        try {
            // 获取 DocumentBuilder实例
            DocumentBuilder builder = builderfactory.newDocumentBuilder();
            ////从 XML 文档获取 DOM 文档实例
            org.w3c.dom.Document documentW3c = builder.parse(subPath);
            //获取某节点的集合
            NodeList nodelist = documentW3c.getElementsByTagName("item");
            // 获取节点列表的总长度
            int listnum = nodelist.getLength();
            System.out.println("--------节点数量："+listnum + "--------");
            for (int i = 0; i < listnum; i++) {
             // 获取节点
                Element eltItem = (Element) nodelist.item(i);
                // 获取节点的各项属性
                Node eltTitle = eltItem.getElementsByTagName("title").item(0);
                Node eltLink  = eltItem.getElementsByTagName("addr").item(0);
                Node eltDescription  = eltItem.getElementsByTagName("content").item(0);

                String title = eltTitle.getFirstChild().getNodeValue();
                String addr = eltLink.getFirstChild().getNodeValue();
                String content = eltDescription.getFirstChild().getNodeValue();

                Field field_title=new Field("title",title,Field.Store.YES,Field.Index.UN_TOKENIZED);
                document.add(field_title);

                Field field_addr=new Field("addr",addr,Field.Store.YES,Field.Index.UN_TOKENIZED);
                document.add(field_addr);

                Field field_content=new Field("content",content,Field.Store.YES,Field.Index.UN_TOKENIZED);
                document.add(field_content);
                // 输出结果
                fswriter.addDocument(document);
                System.out.print("标题：");
                System.out.println(title);
                System.out.print("链接：");
                System.out.println(addr);
                System.out.print("描述：");
                System.out.println(content);
                System.out.println("----------------------------\n");
            }
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
System.out.println("----------创建索引：XML 文件内容  ----------");
//System.out.println(document);
fswriter.addDocument(document);                          // 添加文档到索引 }catch (IOException e) {
e.printStackTrace();
}
System.out.println("----------创建索引：Office 文件成功. ----------");
}
}
检索代码：
public class SearchTool {
/** Directory that holds the Lucene index to search. */
private static String Dest_Index_Path = "D:\\jy";

/**
 * Entry point: runs a sample search for the keyword "故宫" and prints any failure.
 *
 * @param args unused
 */
public static void main(String[] args) {
    SearchTool tool = new SearchTool();
    try {
        tool.BasicSearch("故宫");
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/**
 * Searches the {@code content} field of the index for {@code keyWord} and prints
 * the parsed query, the hit count and, for each hit, the document and its
 * {@code title} field.
 *
 * @param keyWord query text in QueryParser syntax
 * @throws ParseException if {@code keyWord} is not a valid query
 */
public void BasicSearch(String keyWord) throws ParseException {
    // The query must be analyzed the same way as the indexed text. The index is
    // written with StandardAnalyzer (see LuceneIndexLocalDisk.indexBuilder), so a
    // different analyzer here would produce terms that never occur in the index.
    Analyzer analyzer = new org.apache.lucene.analysis.standard.StandardAnalyzer();
    IndexSearcher searcher = null;
    try {
        searcher = new IndexSearcher(Dest_Index_Path);
        QueryParser parser = new QueryParser("content", analyzer);
        Query query = parser.parse(keyWord);
        System.out.println(query.toString());

        Hits hits = searcher.search(query);
        System.out.println(hits.length());
        for (int i = 0; i < hits.length(); i++) {
            System.out.println(hits.doc(i));
            System.out.println(hits.doc(i).getField("title"));
        }
    } catch (CorruptIndexException e1) {
        e1.printStackTrace();
    } catch (IOException e1) {
        e1.printStackTrace();
    } finally {
        // Release the index files held by the searcher.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException closeError) {
                closeError.printStackTrace();
            }
        }
    }
}
}
D:\jy 中的 xml 文件：
<?xml version="1.0" encoding="GB2312"?>
<root>
<channel>
<title>wood</title>
<addr/>
<content>The channel:wood。The class of code：null</content>
</channel>
<item>
<title>植物大战僵尸</title>
<addr>http://www.node.com</addr>
<content>一款非常好玩的塔防游戏</content>
</item>
<item>
<title>故宫的传说</title>
<addr>http://www.chinanet.com</addr>
<content>想知道故宫的由来吗？紫禁城里到底如何金碧辉煌</content>
</item>
<item>
<title>英语学习指导</title>
<addr>http://www.english.com</addr>
<content>指导性的文章</content>
</item>
</root>

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

1.建索引和查询要用同一个分词器
2.Document在for循环中每次new，这样才是多个document
3.不知道 Lucene 内置的分词器能否切出词条“故宫”，中文分词一般用中文分词器
谢谢您的回答，我把xml文档里的内容改为英文，还是不能检索呢，能不能帮忙调试下，谢谢啦！
没法给你调，我用的是 Lucene 3。我说的你都改了吗？又看了一下你的代码：
new Field("content",content,Field.Store.YES,Field.Index.UN_TOKENIZED)被索引的Field怎么最后一个参数设置成不切词了，这么着只有完全content匹配才会有结果。
writer用完就关了吧。
int num =  ((Object) FSWriter).docCount();在lucene3中有错误请问怎么改啦