为什么我的程序只能提取 Word 文档的一部分内容?
我用 POI 对 Word 文档进行纯文本提取,但结果只包含文档的一部分内容,不知道是什么原因。
附件:新建 Microsoft Word 文档 (3).doc
程序如下:
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
//import org.apache.poi.hdf.extractor.WordDocument;
/**
 * Turns a Microsoft Word (.doc) stream into a Lucene {@link Document} whose
 * single "body" field carries the extracted plain text.
 */
public class POIWordDocHandler implements DocumentHandler {

    /**
     * Extracts the text of a Word document and wraps it in a Lucene document.
     *
     * @param is stream to read the Word file from; this method does not close it itself
     * @return a document with one stored, tokenized "body" field
     * @throws DocumentHandlerException if anything goes wrong while extracting the text
     */
    public Document getDocument(InputStream is) throws DocumentHandlerException {
        String bodyText;
        try {
            // NOTE(review): WordDocument is presumably POI's legacy
            // org.apache.poi.hdf.extractor.WordDocument. If only part of the text
            // comes out, that extractor is the first suspect; consider
            // org.apache.poi.hwpf.extractor.WordExtractor instead -- confirm
            // against the POI version on the classpath.
            WordDocument wd = new WordDocument(is);
            StringWriter docTextWriter = new StringWriter();
            PrintWriter out = new PrintWriter(docTextWriter);
            wd.writeAllText(out);
            // Flush defensively so everything handed to the PrintWriter is in
            // the StringWriter before its contents are read.
            out.flush();
            bodyText = docTextWriter.toString();
            // Diagnostic output of the raw extracted text.
            System.out.println(bodyText);
        } catch (Exception e) {
            throw new DocumentHandlerException(
                    "Cannot extract text from a Word document", e);
        }

        // StringWriter.toString() never returns null, so a document is always built.
        Document doc = new Document();
        doc.add(new Field("body", bodyText, Field.Store.YES, Field.Index.TOKENIZED));
        return doc;
    }

    /**
     * Command-line driver: extracts one Word file and prints the resulting document.
     *
     * @param args optional; {@code args[0]} overrides the built-in sample path
     * @throws Exception if the file cannot be opened or its text cannot be extracted
     */
    public static void main(String[] args) throws Exception {
        String file = "E:\\new test\\新建文件夹\\新建 Microsoft Word 文档 (3).doc";
        if (args != null && args.length > 0) {
            file = args[0];
        }

        POIWordDocHandler handler = new POIWordDocHandler();
        InputStream in = new FileInputStream(file);
        try {
            Document doc = handler.getDocument(in);
            System.out.println(doc);
        } finally {
            // Release the file handle even when extraction fails.
            in.close();
        }
    }
}
我用 POI 对 Word 文档进行纯文本提取,但结果只包含文档的一部分内容,不知道是什么原因。
附件:新建 Microsoft Word 文档 (3).doc
程序如下:
mport org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;//import org.apache.poi.hdf.extractor.WordDocument;import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.PrintWriter;
/**
 * Turns a Microsoft Word (.doc) stream into a Lucene {@link Document} whose
 * single "body" field carries the extracted plain text.
 */
public class POIWordDocHandler implements DocumentHandler {

    /**
     * Extracts the text of a Word document and wraps it in a Lucene document.
     *
     * @param is stream to read the Word file from; this method does not close it itself
     * @return a document with one stored, tokenized "body" field
     * @throws DocumentHandlerException if anything goes wrong while extracting the text
     */
    public Document getDocument(InputStream is) throws DocumentHandlerException {
        String bodyText;
        try {
            // NOTE(review): WordDocument is presumably POI's legacy
            // org.apache.poi.hdf.extractor.WordDocument. If only part of the text
            // comes out, that extractor is the first suspect; consider
            // org.apache.poi.hwpf.extractor.WordExtractor instead -- confirm
            // against the POI version on the classpath.
            WordDocument wd = new WordDocument(is);
            StringWriter docTextWriter = new StringWriter();
            PrintWriter out = new PrintWriter(docTextWriter);
            wd.writeAllText(out);
            // Flush defensively so everything handed to the PrintWriter is in
            // the StringWriter before its contents are read.
            out.flush();
            bodyText = docTextWriter.toString();
            // Diagnostic output of the raw extracted text.
            System.out.println(bodyText);
        } catch (Exception e) {
            throw new DocumentHandlerException(
                    "Cannot extract text from a Word document", e);
        }

        // StringWriter.toString() never returns null, so a document is always built.
        Document doc = new Document();
        doc.add(new Field("body", bodyText, Field.Store.YES, Field.Index.TOKENIZED));
        return doc;
    }

    /**
     * Command-line driver: extracts one Word file and prints the resulting document.
     *
     * @param args optional; {@code args[0]} overrides the built-in sample path
     * @throws Exception if the file cannot be opened or its text cannot be extracted
     */
    public static void main(String[] args) throws Exception {
        String file = "E:\\new test\\新建文件夹\\新建 Microsoft Word 文档 (3).doc";
        if (args != null && args.length > 0) {
            file = args[0];
        }

        POIWordDocHandler handler = new POIWordDocHandler();
        InputStream in = new FileInputStream(file);
        try {
            Document doc = handler.getDocument(in);
            System.out.println(doc);
        } finally {
            // Release the file handle even when extraction fails.
            in.close();
        }
    }
}
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货