谢谢！下面是它的代码。这个类里有 main() 函数，那又该怎么办呢？
/*
* http://www.pdfbox.org
*
*/
package org.pdfbox;import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;import org.pdfbox.exceptions.InvalidPasswordException;import org.pdfbox.pdmodel.PDDocument;import org.pdfbox.util.PDFText2HTML;
import org.pdfbox.util.PDFTextStripper;import org.apache.log4j.Logger;/**
* This is the main program that simply parses the pdf document and transforms it
* into text.
*
* @author Ben Litchfield ([email protected])
* @version $Revision: 1.9 $
*/
public class ExtractText
{
private static final Logger LOG = Logger.getLogger( ExtractText.class ); /**
* This is the default encoding of the text to be output.
*/
public static final String DEFAULT_ENCODING =
null;
//"ISO-8859-1";
//"ISO-8859-6"; //arabic
//"US-ASCII";
//"UTF-8";
//"UTF-16";
//"UTF-16BE";
//"UTF-16LE";
private static final String PASSWORD = "-password";
private static final String ENCODING = "-encoding";
private static final String CONSOLE = "-console";
private static final String START_PAGE = "-startPage";
private static final String END_PAGE = "-endPage";
private static final String SORT = "-sort";
private static final String HTML = "-html"; // jjb - added simple HTML output /**
* private constructor.
*/
private ExtractText()
{
//static class
} /**
* Infamous main method.
*
* @param args Command line arguments, should be one and a reference to a file.
*
* @throws Exception If there is an error parsing the document.
*/
public static void main( String[] args ) throws Exception
{
boolean toConsole = false;
boolean toHTML = false;
boolean sort = false;
String password = "";
String encoding = DEFAULT_ENCODING;
String pdfFile = null;
String textFile = null;
int startPage = 1;
int endPage = Integer.MAX_VALUE;
for( int i=0; i<args.length; i++ )
{
if( args[i].equals( PASSWORD ) )
{
i++;
if( i >= args.length )
{
usage();
}
password = args[i];
}
else if( args[i].equals( ENCODING ) )
{
i++;
if( i >= args.length )
{
usage();
}
encoding = args[i];
}
else if( args[i].equals( START_PAGE ) )
{
i++;
if( i >= args.length )
{
usage();
}
startPage = Integer.parseInt( args[i] );
}
else if( args[i].equals( HTML ) )
{
toHTML = true;
}
else if( args[i].equals( SORT ) )
{
sort = true;
}
else if( args[i].equals( END_PAGE ) )
{
i++;
if( i >= args.length )
{
usage();
}
endPage = Integer.parseInt( args[i] );
}
else if( args[i].equals( CONSOLE ) )
{
toConsole = true;
}
else
{
if( pdfFile == null )
{
pdfFile = args[i];
}
else
{
textFile = args[i];
}
}
} if( pdfFile == null )
{
usage();
} if( textFile == null && pdfFile.length() >4 )
{
textFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ".txt";
} Writer output = null;
PDDocument document = null;
try
{
document = PDDocument.load( pdfFile ); //document.print();
if( document.isEncrypted() )
{
try
{
document.decrypt( password );
}
catch( InvalidPasswordException e )
{
if( args.length == 4 )//they supplied the wrong password
{
System.err.println( "Error: The supplied password is incorrect." );
System.exit( 2 );
}
else
{
//they didn't suppply a password and the default of "" was wrong.
System.err.println( "Error: The document is encrypted." );
usage();
}
}
}
if( toConsole )
{
output = new OutputStreamWriter( System.out );
}
else
{
if( encoding != null )
{
output = new OutputStreamWriter(
new FileOutputStream( textFile ), encoding );
}
else
{
//use default encoding
output = new OutputStreamWriter(
new FileOutputStream( textFile ) );
}
} PDFTextStripper stripper = null;
if(toHTML)
{
stripper = new PDFText2HTML();
}
else
{
stripper = new PDFTextStripper();
}
stripper.setSortByPosition( sort );
stripper.setStartPage( startPage );
stripper.setEndPage( endPage );
stripper.writeText( document, output );
}
finally
{
if( output != null )
{
output.close();
}
if( document != null )
{
document.close();
}
}
} /**
* This will print the usage requirements and exit.
*/
private static void usage()
{
System.err.println( "Usage: java org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]\n" +
" -password <password> Password to decrypt document\n" +
" -encoding <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
" -console Send text to console instead of file\n" +
" -html Output in HTML format instead of raw text\n" +
" -sort Sort the text before writing\n" +
" -startPage <number> The first page to start extraction(1 based)\n" +
" -endPage <number> The last page to extract(inclusive)\n" +
" <PDF file> The PDF document to use\n" +
" [Text File] The file to write the text to\n"
);
System.exit( 1 );
}
}
/*
* http://www.pdfbox.org
*
*/
package org.pdfbox;import java.io.FileOutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;import org.pdfbox.exceptions.InvalidPasswordException;import org.pdfbox.pdmodel.PDDocument;import org.pdfbox.util.PDFText2HTML;
import org.pdfbox.util.PDFTextStripper;import org.apache.log4j.Logger;/**
* This is the main program that simply parses the pdf document and transforms it
* into text.
*
* @author Ben Litchfield ([email protected])
* @version $Revision: 1.9 $
*/
public class ExtractText
{
private static final Logger LOG = Logger.getLogger( ExtractText.class ); /**
* This is the default encoding of the text to be output.
*/
public static final String DEFAULT_ENCODING =
null;
//"ISO-8859-1";
//"ISO-8859-6"; //arabic
//"US-ASCII";
//"UTF-8";
//"UTF-16";
//"UTF-16BE";
//"UTF-16LE";
private static final String PASSWORD = "-password";
private static final String ENCODING = "-encoding";
private static final String CONSOLE = "-console";
private static final String START_PAGE = "-startPage";
private static final String END_PAGE = "-endPage";
private static final String SORT = "-sort";
private static final String HTML = "-html"; // jjb - added simple HTML output /**
* private constructor.
*/
private ExtractText()
{
//static class
} /**
* Infamous main method.
*
* @param args Command line arguments, should be one and a reference to a file.
*
* @throws Exception If there is an error parsing the document.
*/
public static void main( String[] args ) throws Exception
{
boolean toConsole = false;
boolean toHTML = false;
boolean sort = false;
String password = "";
String encoding = DEFAULT_ENCODING;
String pdfFile = null;
String textFile = null;
int startPage = 1;
int endPage = Integer.MAX_VALUE;
for( int i=0; i<args.length; i++ )
{
if( args[i].equals( PASSWORD ) )
{
i++;
if( i >= args.length )
{
usage();
}
password = args[i];
}
else if( args[i].equals( ENCODING ) )
{
i++;
if( i >= args.length )
{
usage();
}
encoding = args[i];
}
else if( args[i].equals( START_PAGE ) )
{
i++;
if( i >= args.length )
{
usage();
}
startPage = Integer.parseInt( args[i] );
}
else if( args[i].equals( HTML ) )
{
toHTML = true;
}
else if( args[i].equals( SORT ) )
{
sort = true;
}
else if( args[i].equals( END_PAGE ) )
{
i++;
if( i >= args.length )
{
usage();
}
endPage = Integer.parseInt( args[i] );
}
else if( args[i].equals( CONSOLE ) )
{
toConsole = true;
}
else
{
if( pdfFile == null )
{
pdfFile = args[i];
}
else
{
textFile = args[i];
}
}
} if( pdfFile == null )
{
usage();
} if( textFile == null && pdfFile.length() >4 )
{
textFile = pdfFile.substring( 0, pdfFile.length() -4 ) + ".txt";
} Writer output = null;
PDDocument document = null;
try
{
document = PDDocument.load( pdfFile ); //document.print();
if( document.isEncrypted() )
{
try
{
document.decrypt( password );
}
catch( InvalidPasswordException e )
{
if( args.length == 4 )//they supplied the wrong password
{
System.err.println( "Error: The supplied password is incorrect." );
System.exit( 2 );
}
else
{
//they didn't suppply a password and the default of "" was wrong.
System.err.println( "Error: The document is encrypted." );
usage();
}
}
}
if( toConsole )
{
output = new OutputStreamWriter( System.out );
}
else
{
if( encoding != null )
{
output = new OutputStreamWriter(
new FileOutputStream( textFile ), encoding );
}
else
{
//use default encoding
output = new OutputStreamWriter(
new FileOutputStream( textFile ) );
}
} PDFTextStripper stripper = null;
if(toHTML)
{
stripper = new PDFText2HTML();
}
else
{
stripper = new PDFTextStripper();
}
stripper.setSortByPosition( sort );
stripper.setStartPage( startPage );
stripper.setEndPage( endPage );
stripper.writeText( document, output );
}
finally
{
if( output != null )
{
output.close();
}
if( document != null )
{
document.close();
}
}
} /**
* This will print the usage requirements and exit.
*/
private static void usage()
{
System.err.println( "Usage: java org.pdfbox.ExtractText [OPTIONS] <PDF file> [Text File]\n" +
" -password <password> Password to decrypt document\n" +
" -encoding <output encoding> (ISO-8859-1,UTF-16BE,UTF-16LE,...)\n" +
" -console Send text to console instead of file\n" +
" -html Output in HTML format instead of raw text\n" +
" -sort Sort the text before writing\n" +
" -startPage <number> The first page to start extraction(1 based)\n" +
" -endPage <number> The last page to extract(inclusive)\n" +
" <PDF file> The PDF document to use\n" +
" [Text File] The file to write the text to\n"
);
System.exit( 1 );
}
}
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货