Re: How to read a PDF file's content using Etymon PJ?
Body: try this code :
import java.io.*;
import java.util.*;
import com.etymon.pj.*;
import com.etymon.pj.object.*;
import com.etymon.pj.exception.*;

/**
 * This is a wrapper for the Pj PDF parser.
 *
 * <p>It opens a PDF file, looks up the root of the page tree through the
 * document catalog, and offers helpers that walk the page tree, collect the
 * pages' contents streams and pull the literal text strings out of them.
 */
public class PjWrapper {

    Pdf pdf;
    PjCatalog catalog;
    PjPagesNode rootPage;

    /**
     * Opens the given PDF file and locates the root node of its page tree.
     *
     * @param pdfFileName path of the PDF file to open
     * @throws IOException if the file cannot be read
     * @throws PjException if the Pj library rejects the document
     */
    public PjWrapper(String pdfFileName) throws IOException, PjException {
        pdf = new Pdf(pdfFileName);
        // hopefully the catalog can never be a reference...
        catalog = (PjCatalog) pdf.getObject(pdf.getCatalog());
        // root node of pages tree is specified by a reference in the catalog
        rootPage = (PjPagesNode) pdf.resolve(catalog.getPages());
    }

    /**
     * Opens the given PDF file and locates the root node of its page tree.
     *
     * @param pdfFileName  path of the PDF file to open
     * @param textFileName currently unused; kept so existing callers of the
     *                     two-argument constructor keep compiling
     * @throws IOException if the file cannot be read
     * @throws PjException if the Pj library rejects the document
     */
    public PjWrapper(String pdfFileName, String textFileName)
            throws IOException, PjException {
        this(pdfFileName);
    }

    /**
     * Prints the text extracted from the PDF named by the first argument,
     * one contents stream per line.
     *
     * @param args command-line arguments; args[0] is the PDF file name
     * @throws IOException if the file cannot be read
     * @throws PjException if the Pj library rejects the document
     */
    public static void main(String[] args) throws IOException, PjException {
        if (args.length < 1) {
            System.err.println("Usage: java PjWrapper <pdf-file>");
            return;
        }
        PjWrapper wrapper = new PjWrapper(args[0]);
        Iterator textIter = wrapper.getAllText().iterator();
        while (textIter.hasNext()) {
            System.out.println(textIter.next());
        }
    }

    /**
     * Returns as much text as we can extract from the PDF.
     * This currently includes the literal strings, i.e. the parenthesised
     * (...) operands, found in every page contents stream.
     *
     * NOTE: Pj does not support LZW, so some text in some PDF's may not
     * be indexable
     *
     * @return a list of Strings, one per contents stream that yielded text
     * @throws PjException if a stream cannot be fetched or decompressed
     */
    public LinkedList getAllText() throws PjException {
        LinkedList stringList = new LinkedList();
        Iterator streamIter = getAllContentsStreams().iterator();

        while (streamIter.hasNext()) {
            PjStream stream = (PjStream) streamIter.next();
            stream = stream.flateDecompress();
            // NOTE(review): decodes the bytes with the platform default
            // charset, as the original did - confirm this is what is wanted.
            String streamData = new String(stream.getBuffer());
            StringBuffer streamText = new StringBuffer();

            // Start at offset 0 so a literal opening the stream is not skipped.
            int searchFrom = 0;
            while (true) {
                int textStart = streamData.indexOf('(', searchFrom);
                if (textStart < 0) {
                    break;
                }
                int textEnd = findLiteralEnd(streamData, textStart);
                if (textEnd < 0) {
                    break;
                }
                String literal = streamData.substring(textStart, textEnd + 1);
                try {
                    streamText.append(PjString.decodePdf(literal));
                } catch (Exception e) {
                    // best effort: report the bad literal and keep scanning
                    System.out.println("malformed string: " + literal);
                }
                searchFrom = textEnd + 1;
            }

            if (streamText.length() > 0) {
                stringList.add(streamText.toString());
            }
        }
        return stringList;
    }

    /**
     * Finds the ')' that closes the PDF literal string opening at start.
     * Backslash-escaped characters are skipped and nested, balanced
     * parentheses are counted, so "(a\)b)" and "(a(b)c)" are each treated
     * as a single literal.
     *
     * @param data  the decoded contents stream
     * @param start index of the opening '('
     * @return index of the matching ')', or -1 if the literal is unterminated
     */
    private static int findLiteralEnd(String data, int start) {
        int depth = 0;
        for (int i = start; i < data.length(); i++) {
            char c = data.charAt(i);
            if (c == '\\') {
                i++; // skip whatever character the backslash escapes
            } else if (c == '(') {
                depth++;
            } else if (c == ')') {
                depth--;
                if (depth == 0) {
                    return i;
                }
            }
        }
        return -1;
    }

    /**
     * Describes the filter entry of a stream's dictionary.
     *
     * @param stream the stream to inspect
     * @return the filter name, or the filter names separated by single
     *         spaces when the entry is an array; "" when there is no filter
     * @throws PjException if the stream dictionary cannot be read
     */
    public static String getFilterString(PjStream stream) throws PjException {
        PjObject filter = stream.getStreamDictionary().getFilter();
        if (filter == null) {
            return "";
        }

        // filter should either be a name or an array of names
        if (filter instanceof PjName) {
            return ((PjName) filter).getString();
        }

        Vector nameVector = ((PjArray) filter).getVector();
        if (nameVector == null) {
            return "";
        }

        StringBuffer filterString = new StringBuffer();
        Iterator nameIter = nameVector.iterator();
        while (nameIter.hasNext()) {
            filterString.append(((PjName) nameIter.next()).getString());
            if (nameIter.hasNext()) {
                filterString.append(" ");
            }
        }
        return filterString.toString();
    }

    /**
     * Walks the pages tree from the root node and gets all of the
     * contents streams.
     *
     * @return a list of all the contents of all the pages
     * @throws InvalidPdfObjectException if an object cannot be resolved
     */
    public LinkedList getAllContentsStreams() throws InvalidPdfObjectException {
        return getContentsStreams(getAllPages());
    }

    /**
     * Get contents streams from the list of PjPage objects.
     * Pages that have no contents are skipped.
     *
     * @param pages list of PjPage objects
     * @return a list of all the contents of the pages
     * @throws InvalidPdfObjectException if an object cannot be resolved
     */
    public LinkedList getContentsStreams(LinkedList pages)
            throws InvalidPdfObjectException {
        LinkedList streams = new LinkedList();
        Iterator pageIter = pages.iterator();

        while (pageIter.hasNext()) {
            PjObject contentsEntry = ((PjPage) pageIter.next()).getContents();
            if (contentsEntry == null) {
                continue; // page without contents: nothing to collect
            }
            PjObject contents = pdf.resolve(contentsEntry);
            if (contents == null) {
                continue;
            }

            // should only be a stream or an array of streams (or refs to streams)
            if (contents instanceof PjStream) {
                streams.add(contents);
            } else {
                Iterator streamsIter = ((PjArray) contents).getVector().iterator();
                while (streamsIter.hasNext()) {
                    streams.add(pdf.resolve((PjObject) streamsIter.next()));
                }
            }
        }
        return streams;
    }

    /**
     * Walks the pages tree depth-first from the root node.
     *
     * @return a list of all the PjPage objects
     * @throws InvalidPdfObjectException if an object cannot be resolved
     */
    public LinkedList getAllPages() throws InvalidPdfObjectException {
        LinkedList pages = new LinkedList();
        getPages(rootPage, pages);
        return pages;
    }

    /**
     * Walks the pages tree depth-first from the node passed to it and
     * appends every PjPage found under that node to the given list.
     *
     * @param node  a pages-tree node, or a reference to one
     * @param pages list that receives the PjPage objects
     * @throws InvalidPdfObjectException if an object cannot be resolved
     */
    public void getPages(PjObject node, LinkedList pages)
            throws InvalidPdfObjectException {
        PjPagesNode pageNode;

        // let's hope pdf's don't have pointers to pointers
        if (node instanceof PjReference) {
            pageNode = (PjPagesNode) pdf.resolve(node);
        } else {
            pageNode = (PjPagesNode) node;
        }

        if (pageNode instanceof PjPage) {
            pages.add(pageNode);
            return;
        }

        // the kids entry is normally a direct array, but tolerate a reference
        PjObject kids = ((PjPages) pageNode).getKids();
        if (kids instanceof PjReference) {
            kids = pdf.resolve(kids);
        }
        Iterator kidIterator = ((PjArray) kids).getVector().iterator();
        while (kidIterator.hasNext()) {
            getPages((PjObject) kidIterator.next(), pages);
        }
    }

    /**
     * @return the underlying Pdf object
     */
    public Pdf getPdf() {
        return pdf;
    }
}
// Acrobat Reader executable:
// C:\\Program Files\\Adobe\\Acrobat 5.0\\Reader\\AcroRd32.exe
//
// Open yourfile in Acrobat Reader. The command is passed as an array because
// Runtime.exec(String) splits its argument on whitespace, which would break
// the "Program Files" path (and any file name containing spaces) apart.
try {
    Runtime.getRuntime().exec(new String[] {
        "C:\\Program Files\\Adobe\\Acrobat 5.0\\Reader\\AcroRd32.exe",
        yourfile
    });
} catch (java.io.IOException e) {
    // Report the failure instead of swallowing it silently.
    System.err.println("Could not start Acrobat Reader: " + e.getMessage());
}
// Open yourfile in Acrobat Reader. The command is passed as an array because
// Runtime.exec(String) splits its argument on whitespace, which would break
// the "Program Files" path (and any file name containing spaces) apart.
try {
    Runtime.getRuntime().exec(new String[] {
        "C:\\Program Files\\Adobe\\Acrobat 5.0\\Reader\\AcroRd32.exe",
        yourfile
    });
} catch (java.io.IOException e) {
    // Report the failure instead of swallowing it silently.
    System.err.println("Could not start Acrobat Reader: " + e.getMessage());
}
Body: try this code:
import java.io.*;
import java.util.*;
import com.etymon.pj.*;
import com.etymon.pj.object.*;
import com.etymon.pj.exception.*;/**
* This is a wrapper for the Pj PDF parser
*/
public class PjWrapper {Pdf pdf;
PjCatalog catalog;
PjPagesNode rootPage;public PjWrapper(String PdfFileName,String TextFileName)throws
IOException, PjException {pdf = new Pdf(PdfFileName);// hopefully the catalog can never be a reference...catalog = (PjCatalog) pdf.getObject(pdf.getCatalog());// root node of pages tree is specified by a reference in the catalogrootPage = (PjPagesNode) pdf.resolve(catalog.getPages());
}public static void main (String [] args) throws IOException, PjException
{/*PjWrapper testWrapper = new PjWrapper(args[0]);
LinkedList textList = testWrapper.getAllText();*/
}/**
* Returns as much text as we can extract from the PDF.
* This currently includes:
*
* NOTE: Pj does not support LZW, so some text in some PDF's may not
* be indexable
*/
public LinkedList getAllText() throws PjException {LinkedList stringList = new LinkedList();
Iterator streamIter = getAllContentsStreams().iterator();
PjStream stream;
String streamData;
String streamText;
boolean moreData;
int textStart, textEnd;//System.out.println("Going through streams...");while(streamIter.hasNext()) {//System.out.println("Getting next stream");
stream = (PjStream) streamIter.next();//System.out.println("Adding text from stream with filter: "
+getFilterString(stream);
stream = stream.flateDecompress();//System.out.println("Adding text from stream with filter
afterdecompress: " + getFilterString(stream));
streamData = new String(stream.getBuffer());streamText = new String();
moreData = true;
textStart = textEnd = 0;while(moreData) {if ((textStart = streamData.indexOf('(', textEnd + 1)) < 0) {moreData = false;
break;
}if ((textEnd = streamData.indexOf(')', textStart + 1)) < 0) {moreData = false;
break;
}try {streamText +=
PjString.decodePdf(streamData.substring(textStart,textEnd + 1));
} catch (Exception e) {System.out.println("malformed string: " +
streamData.substring(textStart, textEnd + 1));
}
}//if(streamText.equals("inserted text"))
System.out.println(streamText);if (streamText.length() > 0)
stringList.add(streamText);
}return stringList;
}public static String getFilterString(PjStream stream) throws PjException
{String filterString = new String();
PjObject filter;
//System.out.println("getting filter from dictionary");if ((filter = stream.getStreamDictionary().getFilter()) == null) {
//System.out.println("Got null filter");
return "";
}
//System.out.println("got it");
// filter should either be a name or an array of names
if (filter instanceof PjName) {//System.out.println("getting filter string from simple name");
filterString = ((PjName) filter).getString();
} else {//System.out.println("getting filter string from array of names");
Iterator nameIter;
Vector nameVector;if ((nameVector = ((PjArray) filter).getVector()) == null) {//System.out.println("got null vector for list of names");
return "";
}nameIter = nameVector.iterator();while (nameIter.hasNext()) {filterString += ((PjName) nameIter.next()).getString();if (nameIter.hasNext())
filterString += " ";
}
}//System.out.println("got filter string");return filterString;
}/**
* Performs a post-order traversal of the pages tree
* from the root node and gets all of the contents streams
* @returns a list of all the contents of all the pages
*/public LinkedList getAllContentsStreams() throws
InvalidPdfObjectException {return getContentsStreams(getAllPages());
}/**
* Get contents streams from the list of PjPage objects
* @returns a list of all the contents of the pages
*/
public LinkedList getContentsStreams(LinkedList pages) throws
InvalidPdfObjectException {LinkedList streams = new LinkedList();
Iterator pageIter = pages.iterator();
PjObject contents;while(pageIter.hasNext()) {
contents = pdf.resolve(((PjPage)pageIter.next()).getContents());// should only be a stream or an array of streams (or refs to
streams)if (contents instanceof PjStream)
streams.add(contents);
else{
Iterator streamsIter = ((PjArray)contents).getVector().iterator();while(streamsIter.hasNext())
streams.add(pdf.resolve((PjObject)streamsIter.next()));
}
}
return streams ;
}/**
* Performs a post-order traversal of the pages tree
* from the root node.
* @returns a list of all the PjPage objects
*/public LinkedList getAllPages() throws InvalidPdfObjectException {LinkedList pages = new LinkedList();
getPages(rootPage, pages);
return pages;
}/**
* Performs a post-order traversal of the pages tree
* from the node passed to it.
* @returns a list of all the PjPage objects under node
*/public void getPages(PjObject node, LinkedList pages) throws
InvalidPdfObjectException {PjPagesNode pageNode = null;// let's hope pdf's don't have pointers to pointersif (node instanceof PjReference)
pageNode = (PjPagesNode) pdf.resolve(node);
else
pageNode = (PjPagesNode) node;if (pageNode instanceof PjPage) {
pages.add(pageNode);
return;
}// kids better be an array and not a reference to oneIterator kidIterator = ((PjArray) ((PjPages)
pageNode).getKids()).getVector().iterator();while(kidIterator.hasNext()) {
getPages((PjObject) kidIterator.next(), pages);
}
}public Pdf getPdf() {
return pdf;
}
}