下载包:
http://www.etymon.com/pj/
import java.io.*;
import java.util.*;
import com.etymon.pj.*;
import com.etymon.pj.object.*;
import com.etymon.pj.exception.*;/**
* This is a wrapper for the Pj PDF parser
*/
public class PjWrapper { Pdf pdf;
PjCatalog catalog;
PjPagesNode rootPage; public PjWrapper(String PdfFileName) throws IOException, PjException { pdf = new Pdf(PdfFileName); // hopefully the catalog can never be a reference... catalog = (PjCatalog) pdf.getObject(pdf.getCatalog()); // root node of pages tree is specified by a reference in the catalog rootPage = (PjPagesNode) pdf.resolve(catalog.getPages());
} public static void main(String[] args) throws IOException, PjException { PjWrapper testWrapper = new PjWrapper(args[0]);
LinkedList textList = testWrapper.getAllText();
Object[] o = textList.toArray();
for (int i = 0;i < o.length;i++) {
System.out.println(o[i].getClass() + " : " + o[i].toString());
}
} /**
* Returns as much text as we can extract from the PDF.
* This currently includes:
*
* NOTE: Pj does not support LZW, so some text in some PDF's may not
* be indexable
*/
public LinkedList getAllText() throws PjException { LinkedList stringList = new LinkedList();
Iterator streamIter = getAllContentsStreams().iterator();
PjStream stream;
String streamData;
String streamText;
boolean moreData;
int textStart, textEnd; System.out.println("Going through streams..."); while (streamIter.hasNext()) { System.out.println("Getting next stream");
stream = (PjStream) streamIter.next(); System.out.println(
"Adding text from stream with filter: " + getFilterString(stream));
stream = stream.flateDecompress(); System.out.println(
"Adding text from stream with filter afterdecompress: "
+ getFilterString(stream));
streamData = new String(stream.getBuffer()); streamText = new String();
moreData = true;
textStart = textEnd = 0; while (moreData) { if ((textStart = streamData.indexOf('(', textEnd + 1)) < 0) { moreData = false;
break;
} if ((textEnd = streamData.indexOf(')', textStart + 1)) < 0) { moreData = false;
break;
} try { streamText += PjString.decodePdf(streamData.substring(textStart, textEnd + 1));
} catch (Exception e) { System.out.println(
"malformed string: " + streamData.substring(textStart, textEnd + 1));
}
} //if(streamText.equals("inserted text"))
System.out.println(streamText); if (streamText.length() > 0)
stringList.add(streamText);
} return stringList;
} public static String getFilterString(PjStream stream) throws PjException { String filterString = new String();
PjObject filter;
System.out.println("getting filter from dictionary"); if ((filter = stream.getStreamDictionary().getFilter()) == null) {
System.out.println("Got null filter");
return "";
}
System.out.println("got it"); // filter should either be a name or an array of names
if (filter instanceof PjName) { System.out.println("getting filter string from simple name");
filterString = ((PjName) filter).getString();
} else { System.out.println("getting filter string from array of names");
Iterator nameIter;
Vector nameVector; if ((nameVector = ((PjArray) filter).getVector()) == null) { System.out.println("got null vector for list of names");
return "";
} nameIter = nameVector.iterator(); while (nameIter.hasNext()) { filterString += ((PjName) nameIter.next()).getString(); if (nameIter.hasNext())
filterString += " ";
}
} System.out.println("got filter string"); return filterString;
} /**
* Performs a post-order traversal of the pages tree
* from the root node and gets all of the contents streams
* @returns a list of all the contents of all the pages
*/ public LinkedList getAllContentsStreams() throws InvalidPdfObjectException { return getContentsStreams(getAllPages());
} /**
* Get contents streams from the list of PjPage objects
* @returns a list of all the contents of the pages
*/ public LinkedList getContentsStreams(LinkedList pages)
throws InvalidPdfObjectException { LinkedList streams = new LinkedList();
Iterator pageIter = pages.iterator();
PjObject contents; while (pageIter.hasNext()) {
contents = pdf.resolve(((PjPage) pageIter.next()).getContents()); // should only be a stream or an array of streams (or refs to streams) if (contents instanceof PjStream)
streams.add(contents);
else {
Iterator streamsIter = ((PjArray) contents).getVector().iterator(); while (streamsIter.hasNext())
streams.add(pdf.resolve((PjObject) streamsIter.next()));
}
} return streams;
} /**
* Performs a post-order traversal of the pages tree
* from the root node.
* @returns a list of all the PjPage objects
*/ public LinkedList getAllPages() throws InvalidPdfObjectException { LinkedList pages = new LinkedList();
getPages(rootPage, pages);
return pages;
} /**
* Performs a post-order traversal of the pages tree
* from the node passed to it.
* @returns a list of all the PjPage objects under node
*/ public void getPages(PjObject node, LinkedList pages)
throws InvalidPdfObjectException { PjPagesNode pageNode = null; // let's hope pdf's don't have pointers to pointers if (node instanceof PjReference)
pageNode = (PjPagesNode) pdf.resolve(node);
else
pageNode = (PjPagesNode) node; if (pageNode instanceof PjPage) {
pages.add(pageNode);
return;
} // kids better be an array and not a reference to one Iterator kidIterator =
((PjArray) ((PjPages) pageNode).getKids()).getVector().iterator(); while (kidIterator.hasNext()) {
getPages((PjObject) kidIterator.next(), pages);
}
} public Pdf getPdf() {
return pdf;
}
}
http://www.etymon.com/pj/
import java.io.*;
import java.util.*;
import com.etymon.pj.*;
import com.etymon.pj.object.*;
import com.etymon.pj.exception.*;/**
* This is a wrapper for the Pj PDF parser
*/
public class PjWrapper { Pdf pdf;
PjCatalog catalog;
PjPagesNode rootPage; public PjWrapper(String PdfFileName) throws IOException, PjException { pdf = new Pdf(PdfFileName); // hopefully the catalog can never be a reference... catalog = (PjCatalog) pdf.getObject(pdf.getCatalog()); // root node of pages tree is specified by a reference in the catalog rootPage = (PjPagesNode) pdf.resolve(catalog.getPages());
} public static void main(String[] args) throws IOException, PjException { PjWrapper testWrapper = new PjWrapper(args[0]);
LinkedList textList = testWrapper.getAllText();
Object[] o = textList.toArray();
for (int i = 0;i < o.length;i++) {
System.out.println(o[i].getClass() + " : " + o[i].toString());
}
} /**
* Returns as much text as we can extract from the PDF.
* This currently includes:
*
* NOTE: Pj does not support LZW, so some text in some PDF's may not
* be indexable
*/
public LinkedList getAllText() throws PjException { LinkedList stringList = new LinkedList();
Iterator streamIter = getAllContentsStreams().iterator();
PjStream stream;
String streamData;
String streamText;
boolean moreData;
int textStart, textEnd; System.out.println("Going through streams..."); while (streamIter.hasNext()) { System.out.println("Getting next stream");
stream = (PjStream) streamIter.next(); System.out.println(
"Adding text from stream with filter: " + getFilterString(stream));
stream = stream.flateDecompress(); System.out.println(
"Adding text from stream with filter afterdecompress: "
+ getFilterString(stream));
streamData = new String(stream.getBuffer()); streamText = new String();
moreData = true;
textStart = textEnd = 0; while (moreData) { if ((textStart = streamData.indexOf('(', textEnd + 1)) < 0) { moreData = false;
break;
} if ((textEnd = streamData.indexOf(')', textStart + 1)) < 0) { moreData = false;
break;
} try { streamText += PjString.decodePdf(streamData.substring(textStart, textEnd + 1));
} catch (Exception e) { System.out.println(
"malformed string: " + streamData.substring(textStart, textEnd + 1));
}
} //if(streamText.equals("inserted text"))
System.out.println(streamText); if (streamText.length() > 0)
stringList.add(streamText);
} return stringList;
} public static String getFilterString(PjStream stream) throws PjException { String filterString = new String();
PjObject filter;
System.out.println("getting filter from dictionary"); if ((filter = stream.getStreamDictionary().getFilter()) == null) {
System.out.println("Got null filter");
return "";
}
System.out.println("got it"); // filter should either be a name or an array of names
if (filter instanceof PjName) { System.out.println("getting filter string from simple name");
filterString = ((PjName) filter).getString();
} else { System.out.println("getting filter string from array of names");
Iterator nameIter;
Vector nameVector; if ((nameVector = ((PjArray) filter).getVector()) == null) { System.out.println("got null vector for list of names");
return "";
} nameIter = nameVector.iterator(); while (nameIter.hasNext()) { filterString += ((PjName) nameIter.next()).getString(); if (nameIter.hasNext())
filterString += " ";
}
} System.out.println("got filter string"); return filterString;
} /**
* Performs a post-order traversal of the pages tree
* from the root node and gets all of the contents streams
* @returns a list of all the contents of all the pages
*/ public LinkedList getAllContentsStreams() throws InvalidPdfObjectException { return getContentsStreams(getAllPages());
} /**
* Get contents streams from the list of PjPage objects
* @returns a list of all the contents of the pages
*/ public LinkedList getContentsStreams(LinkedList pages)
throws InvalidPdfObjectException { LinkedList streams = new LinkedList();
Iterator pageIter = pages.iterator();
PjObject contents; while (pageIter.hasNext()) {
contents = pdf.resolve(((PjPage) pageIter.next()).getContents()); // should only be a stream or an array of streams (or refs to streams) if (contents instanceof PjStream)
streams.add(contents);
else {
Iterator streamsIter = ((PjArray) contents).getVector().iterator(); while (streamsIter.hasNext())
streams.add(pdf.resolve((PjObject) streamsIter.next()));
}
} return streams;
} /**
* Performs a post-order traversal of the pages tree
* from the root node.
* @returns a list of all the PjPage objects
*/ public LinkedList getAllPages() throws InvalidPdfObjectException { LinkedList pages = new LinkedList();
getPages(rootPage, pages);
return pages;
} /**
* Performs a post-order traversal of the pages tree
* from the node passed to it.
* @returns a list of all the PjPage objects under node
*/ public void getPages(PjObject node, LinkedList pages)
throws InvalidPdfObjectException { PjPagesNode pageNode = null; // let's hope pdf's don't have pointers to pointers if (node instanceof PjReference)
pageNode = (PjPagesNode) pdf.resolve(node);
else
pageNode = (PjPagesNode) node; if (pageNode instanceof PjPage) {
pages.add(pageNode);
return;
} // kids better be an array and not a reference to one Iterator kidIterator =
((PjArray) ((PjPages) pageNode).getKids()).getVector().iterator(); while (kidIterator.hasNext()) {
getPages((PjObject) kidIterator.next(), pages);
}
} public Pdf getPdf() {
return pdf;
}
}
好好把java的功底打牢....