package com.goldcell.word;import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
 * Reads the content of a binary Word ({@code .doc}) file through Apache POI HWPF.
 *
 * <p>{@link #getWordText(String)} returns the document text line by line, with a
 * placeholder entry wherever table content was found. {@link #getWordValues(String)}
 * replaces those placeholders with the cell matrices of the tables.
 *
 * <p>Both methods report a failure by printing a stack trace and returning
 * {@code null}; callers must check for that.
 */
public class AnalyticWord {

    /** Entry that {@link #getWordText(String)} inserts where a table was found. */
    private static final String TABLE_PLACEHOLDER = "tables";

    /** Value stored for a cell paragraph that has no text in front of the cell mark. */
    private static final String EMPTY_CELL = " ";

    /**
     * Cell/row terminator of the Word binary format (code point U+0007).
     * The original code searched for an empty string here, which matches at
     * index 0 of every string and therefore never detected any cell content.
     */
    private static final char CELL_MARK = '\u0007';

    /** Message of the exception printed when extraction fails. */
    private static final String FAILURE_MESSAGE = "无法从该Mocriosoft Word文档中提取内容";

    /**
     * Demo entry point: prints every extracted line of a hard-coded document.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String file = "D://word.doc"; // Word file to read
        List<String> list = new AnalyticWord().getWordText(file);
        if (list == null) {
            // Extraction failed; getWordText has already printed the reason.
            return;
        }
        for (String string : list) {
            System.out.println(string);
        }
    }

    /**
     * Returns the document content in reading order, one matrix per entry.
     *
     * <p>A plain text line becomes a 1x1 matrix holding that line. Each table
     * placeholder reported by {@link #getWordText(String)} is replaced by the
     * next table's cell matrix ({@code [row][column]}).
     *
     * @param file path of the {@code .doc} file
     * @return the content, or {@code null} when the file cannot be read, when
     *         {@link #getWordText(String)} fails, or when there are more table
     *         placeholders than tables
     */
    public List<String[][]> getWordValues(String file) {
        List<String[][]> word = new ArrayList<String[][]>();
        try {
            List<String[][]> wordValue = readTables(file);
            List<String> wordText = getWordText(file);
            if (wordText == null) {
                // Text extraction failed and has already been reported.
                return null;
            }
            int nextTable = 0;
            for (String line : wordText) {
                if (TABLE_PLACEHOLDER.equals(line)) {
                    // Throws IndexOutOfBoundsException (handled below) when the
                    // placeholders outnumber the tables.
                    word.add(wordValue.get(nextTable++));
                } else {
                    word.add(new String[][] {{line}});
                }
            }
        } catch (Exception e) {
            // Keep the original failure as the cause so it shows up in the trace.
            new Exception(FAILURE_MESSAGE, e).printStackTrace();
            word = null;
        }
        return word;
    }

    /**
     * Returns the document text split on {@code "\n"}.
     *
     * <p>A line that contains a cell mark after its first character is replaced
     * by the table placeholder. When the last cell mark of that line is beyond
     * index 1, the text that follows it is added as a separate entry prefixed
     * with {@code "\n"}.
     *
     * @param file path of the {@code .doc} file
     * @return the extracted entries, or {@code null} when extraction fails
     */
    public List<String> getWordText(String file) {
        List<String> list = new ArrayList<String>();
        try {
            InputStream is = new FileInputStream(file);
            try {
                WordExtractor ex = new WordExtractor(is);
                for (String line : ex.getText().split("\n")) {
                    if (line.indexOf(CELL_MARK) > 0) {
                        list.add(TABLE_PLACEHOLDER);
                        int lastMark = line.lastIndexOf(CELL_MARK);
                        if (lastMark > 1) {
                            // Text directly after the table shares the table's line.
                            // NOTE(review): the +2 offset is kept from the original;
                            // presumably it skips the row mark plus one more
                            // character - confirm against real documents.
                            int start = Math.min(lastMark + 2, line.length());
                            list.add("\n" + line.substring(start));
                        }
                    } else {
                        list.add(line); // text outside of any table
                    }
                }
            } finally {
                is.close();
            }
        } catch (Exception e) {
            // Keep the original failure as the cause so it shows up in the trace.
            new Exception(FAILURE_MESSAGE, e).printStackTrace();
            list = null;
        }
        return list;
    }

    /** Reads every table of the document into a cell matrix and closes the file afterwards. */
    private List<String[][]> readTables(String file) throws java.io.IOException {
        List<String[][]> tables = new ArrayList<String[][]>();
        InputStream in = new FileInputStream(file);
        try {
            HWPFDocument hwpf = new HWPFDocument(new POIFSFileSystem(in));
            Range range = hwpf.getRange(); // range covering the whole document
            TableIterator it = new TableIterator(range);
            while (it.hasNext()) {
                tables.add(readTable((Table) it.next()));
            }
        } finally {
            in.close();
        }
        return tables;
    }

    /**
     * Copies one table into a rectangular {@code [row][column]} matrix.
     *
     * <p>The matrix is as wide as the widest row, so rows with more cells than
     * the first row no longer overflow it; cells missing from shorter rows stay
     * {@code null}. A table without rows yields an empty matrix.
     */
    private static String[][] readTable(Table tb) {
        int rowCount = tb.numRows();
        int width = 0;
        for (int i = 0; i < rowCount; i++) {
            width = Math.max(width, tb.getRow(i).numCells());
        }
        String[][] cells = new String[rowCount][width];
        for (int i = 0; i < rowCount; i++) {
            TableRow tr = tb.getRow(i);
            for (int j = 0; j < tr.numCells(); j++) {
                TableCell td = tr.getCell(j);
                // NOTE(review): each paragraph overwrites the previous one, so only
                // the last paragraph of a multi-paragraph cell is kept (as before).
                for (int k = 0; k < td.numParagraphs(); k++) {
                    Paragraph para = td.getParagraph(k);
                    cells[i][j] = cellText(para.text());
                }
            }
        }
        return cells;
    }

    /** Returns the text in front of the cell mark, or a single space when there is none. */
    private static String cellText(String s) {
        int mark = s.indexOf(CELL_MARK);
        return mark < 1 ? EMPTY_CELL : s.substring(0, mark);
    }
}
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;

/**
 * Reads the content of a binary Word ({@code .doc}) file through Apache POI HWPF.
 *
 * <p>{@link #getWordText(String)} returns the document text line by line, with a
 * placeholder entry wherever table content was found. {@link #getWordValues(String)}
 * replaces those placeholders with the cell matrices of the tables.
 *
 * <p>Both methods report a failure by printing a stack trace and returning
 * {@code null}; callers must check for that.
 */
public class AnalyticWord {

    /** Entry that {@link #getWordText(String)} inserts where a table was found. */
    private static final String TABLE_PLACEHOLDER = "tables";

    /** Value stored for a cell paragraph that has no text in front of the cell mark. */
    private static final String EMPTY_CELL = " ";

    /**
     * Cell/row terminator of the Word binary format (code point U+0007).
     * The original code searched for an empty string here, which matches at
     * index 0 of every string and therefore never detected any cell content.
     */
    private static final char CELL_MARK = '\u0007';

    /** Message of the exception printed when extraction fails. */
    private static final String FAILURE_MESSAGE = "无法从该Mocriosoft Word文档中提取内容";

    /**
     * Demo entry point: prints every extracted line of a hard-coded document.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String file = "D://word.doc"; // Word file to read
        List<String> list = new AnalyticWord().getWordText(file);
        if (list == null) {
            // Extraction failed; getWordText has already printed the reason.
            return;
        }
        for (String string : list) {
            System.out.println(string);
        }
    }

    /**
     * Returns the document content in reading order, one matrix per entry.
     *
     * <p>A plain text line becomes a 1x1 matrix holding that line. Each table
     * placeholder reported by {@link #getWordText(String)} is replaced by the
     * next table's cell matrix ({@code [row][column]}).
     *
     * @param file path of the {@code .doc} file
     * @return the content, or {@code null} when the file cannot be read, when
     *         {@link #getWordText(String)} fails, or when there are more table
     *         placeholders than tables
     */
    public List<String[][]> getWordValues(String file) {
        List<String[][]> word = new ArrayList<String[][]>();
        try {
            List<String[][]> wordValue = readTables(file);
            List<String> wordText = getWordText(file);
            if (wordText == null) {
                // Text extraction failed and has already been reported.
                return null;
            }
            int nextTable = 0;
            for (String line : wordText) {
                if (TABLE_PLACEHOLDER.equals(line)) {
                    // Throws IndexOutOfBoundsException (handled below) when the
                    // placeholders outnumber the tables.
                    word.add(wordValue.get(nextTable++));
                } else {
                    word.add(new String[][] {{line}});
                }
            }
        } catch (Exception e) {
            // Keep the original failure as the cause so it shows up in the trace.
            new Exception(FAILURE_MESSAGE, e).printStackTrace();
            word = null;
        }
        return word;
    }

    /**
     * Returns the document text split on {@code "\n"}.
     *
     * <p>A line that contains a cell mark after its first character is replaced
     * by the table placeholder. When the last cell mark of that line is beyond
     * index 1, the text that follows it is added as a separate entry prefixed
     * with {@code "\n"}.
     *
     * @param file path of the {@code .doc} file
     * @return the extracted entries, or {@code null} when extraction fails
     */
    public List<String> getWordText(String file) {
        List<String> list = new ArrayList<String>();
        try {
            InputStream is = new FileInputStream(file);
            try {
                WordExtractor ex = new WordExtractor(is);
                for (String line : ex.getText().split("\n")) {
                    if (line.indexOf(CELL_MARK) > 0) {
                        list.add(TABLE_PLACEHOLDER);
                        int lastMark = line.lastIndexOf(CELL_MARK);
                        if (lastMark > 1) {
                            // Text directly after the table shares the table's line.
                            // NOTE(review): the +2 offset is kept from the original;
                            // presumably it skips the row mark plus one more
                            // character - confirm against real documents.
                            int start = Math.min(lastMark + 2, line.length());
                            list.add("\n" + line.substring(start));
                        }
                    } else {
                        list.add(line); // text outside of any table
                    }
                }
            } finally {
                is.close();
            }
        } catch (Exception e) {
            // Keep the original failure as the cause so it shows up in the trace.
            new Exception(FAILURE_MESSAGE, e).printStackTrace();
            list = null;
        }
        return list;
    }

    /** Reads every table of the document into a cell matrix and closes the file afterwards. */
    private List<String[][]> readTables(String file) throws java.io.IOException {
        List<String[][]> tables = new ArrayList<String[][]>();
        InputStream in = new FileInputStream(file);
        try {
            HWPFDocument hwpf = new HWPFDocument(new POIFSFileSystem(in));
            Range range = hwpf.getRange(); // range covering the whole document
            TableIterator it = new TableIterator(range);
            while (it.hasNext()) {
                tables.add(readTable((Table) it.next()));
            }
        } finally {
            in.close();
        }
        return tables;
    }

    /**
     * Copies one table into a rectangular {@code [row][column]} matrix.
     *
     * <p>The matrix is as wide as the widest row, so rows with more cells than
     * the first row no longer overflow it; cells missing from shorter rows stay
     * {@code null}. A table without rows yields an empty matrix.
     */
    private static String[][] readTable(Table tb) {
        int rowCount = tb.numRows();
        int width = 0;
        for (int i = 0; i < rowCount; i++) {
            width = Math.max(width, tb.getRow(i).numCells());
        }
        String[][] cells = new String[rowCount][width];
        for (int i = 0; i < rowCount; i++) {
            TableRow tr = tb.getRow(i);
            for (int j = 0; j < tr.numCells(); j++) {
                TableCell td = tr.getCell(j);
                // NOTE(review): each paragraph overwrites the previous one, so only
                // the last paragraph of a multi-paragraph cell is kept (as before).
                for (int k = 0; k < td.numParagraphs(); k++) {
                    Paragraph para = td.getParagraph(k);
                    cells[i][j] = cellText(para.text());
                }
            }
        }
        return cells;
    }

    /** Returns the text in front of the cell mark, or a single space when there is none. */
    private static String cellText(String s) {
        int mark = s.indexOf(CELL_MARK);
        return mark < 1 ? EMPTY_CELL : s.substring(0, mark);
    }
}
你的poi版本多少?
poi是3.5的,只要WORD文档里面出现了中文,就全部读不出来。
无法获取属性"length" 的值 对象为null 或 未定义
报错是
java.lang.IllegalArgumentException: The end (269) must not be before the start (279)