C# 如何解析PDF文件？PDFBox？还是...

/// <summary>
/// Reads the cross-reference rows found between <paramref name="startxref"/> and the next
/// "trailer" keyword and records every in-use object in <c>objects</c> (keyed by object
/// number) and <c>offsets</c> (keyed by byte offset).
/// An object number that is already registered is not overwritten.
/// </summary>
/// <param name="startxref">Position in the PDF text at which the cross-reference section starts.</param>
private void ParseXRef(int startxref)
{
    // The scan starts 4 characters in -- presumably to skip the "xref" keyword itself.
    int tableStart = startxref + 4;
    int trailerIndex = pdf.IndexOf("trailer", tableStart);
    string table = pdf.Substring(tableStart, trailerIndex - tableStart + 1);

    int currentNumber = 0;
    foreach (Match row in refRegex.Matches(table))
    {
        // Group 2 is only filled when the row is preceded by a subsection header;
        // numbering then restarts at the number given there.
        string explicitNumber = row.Groups[2].Value;
        if (explicitNumber.Length != 0)
        {
            currentNumber = Int32.Parse(explicitNumber, CultureInfo.InvariantCulture);
        }

        int entryOffset = Int32.Parse(row.Groups[4].Value, CultureInfo.InvariantCulture);
        int entryGeneration = Int32.Parse(row.Groups[5].Value, CultureInfo.InvariantCulture);
        bool inUse = row.Groups[6].Value == "n";

        if (inUse)
        {
            PdfCrossReferenceEntry entry =
                new PdfCrossReferenceEntry(currentNumber, entryGeneration, entryOffset, inUse);

            if (!objects.Contains(currentNumber))
            {
                offsets.Add(entryOffset, entry);
                objects.Add(currentNumber, entry);
            }
        }
        else if (currentNumber == 0)
        {
            // Free entry for object 0: its offset field is remembered so that
            // GetUpdate can write it into the xref table of an update.
            nullOffset = entryOffset;
        }

        currentNumber++;
    }
}
/// <summary>
/// Returns the offset at which the specified object ends, defined as the start offset of
/// the registered object with the next higher start offset.
/// Returns -1 when no registered object starts after it.
/// For an updated document the returned offset may lie beyond an xref table and trailer
/// that follow the specified object.
/// </summary>
/// <param name="objNumber">Number of the object whose end offset is wanted.</param>
/// <returns>The end offset of the object, or -1 if it is the last object.</returns>
private int GetEndOfObject(int objNumber)
{
    PdfCrossReferenceEntry current = (PdfCrossReferenceEntry)objects[objNumber];

    // offsets is sorted by start offset, so the successor sits at the next index.
    int successorIndex = offsets.IndexOfKey(current.Offset) + 1;
    if (successorIndex >= offsets.Count)
    {
        return -1;
    }

    PdfCrossReferenceEntry successor = (PdfCrossReferenceEntry)offsets.GetByIndex(successorIndex);
    return successor.Offset;
}
/// <summary>
/// Parses the Interactive Form Dictionary referenced by the /AcroForm entry of
/// the Document Catalog and stores the result in <c>form</c>.
/// When the catalog or the form object has no cross-reference entry, cannot be parsed as a
/// dictionary, or the catalog has no indirect /AcroForm reference, <c>form</c> is left
/// untouched instead of failing with a null-reference or cast exception.
/// </summary>
private void ParseAcroForm()
{
    PdfCrossReferenceEntry documentCatalog = objects[rootObjectNumber] as PdfCrossReferenceEntry;
    if (documentCatalog == null)
    {
        return;
    }

    int documentCatalogStart = documentCatalog.Offset;
    int documentCatalogEnd = GetEndOfObject(documentCatalog.ObjectNumber);
    PdfDictionary documentCatalogObject = ParseObject(documentCatalogStart, documentCatalogEnd) as PdfDictionary;
    if (documentCatalogObject == null)
    {
        // ParseObject returns null when no "n g obj" header is found.
        return;
    }

    PdfReference acroFormRef = documentCatalogObject["/AcroForm"] as PdfReference;
    if (acroFormRef == null)
    {
        // No form, or the entry is not an indirect reference.
        return;
    }

    // extract the AcroForm object
    int acroNumber = acroFormRef.ObjectNumber;
    int acroGeneration = acroFormRef.GenerationNumber;
    PdfCrossReferenceEntry acroEntry = objects[acroNumber] as PdfCrossReferenceEntry;
    if (acroEntry == null)
    {
        return;
    }

    PdfDictionary formDictionary = ParseObject(acroEntry.Offset, GetEndOfObject(acroNumber)) as PdfDictionary;
    if (formDictionary == null)
    {
        return;
    }

    form = new PdfForm(acroNumber, acroGeneration, formDictionary);
}

/// <summary>
/// Locates the first "n g obj" header in the given range of the PDF text and parses the
/// object that follows it.
/// </summary>
/// <param name="start">Position at which the search for the object header begins.</param>
/// <param name="end">End offset of the object as returned by GetEndOfObject; a negative value means "until the end of the file".</param>
/// <returns>The parsed object, or null when no object header is found in the range.</returns>
private PdfObject ParseObject(int start, int end)
{
    Match match = (end < 0)
        ? objRegex.Match(pdf, start)
        : objRegex.Match(pdf, start, end - start);

    if (!match.Success)
    {
        return null;
    }

    int endOfMatch = match.Index + match.Length;
    string afterObj;
    if (end < 0)
    {
        afterObj = pdf.Substring(endOfMatch);
    }
    else
    {
        // The original bound includes the character at 'end'; that is kept, but the length
        // is clamped so it can never run past the end of the file.
        int length = Math.Min(end - endOfMatch + 1, pdf.Length - endOfMatch);
        afterObj = pdf.Substring(endOfMatch, length);
    }

    return PdfObject.GetPdfObject(ref afterObj);
}
/// <summary>
/// Writes the original PDF text followed by the update section built from all changes
/// made after this object was created.
/// Every character is narrowed to a single byte with a plain cast.
/// </summary>
/// <param name="stream">The <see cref="System.IO.Stream"/> the PDF will be written to.</param>
public void WritePdf(Stream stream)
{
    string content = pdf + GetUpdate();
    byte[] bytes = new byte[content.Length];

    int position = 0;
    foreach (char c in content)
    {
        bytes[position] = (byte)c;
        position++;
    }

    stream.Write(bytes, 0, bytes.Length);
}

/// <summary>
/// Builds the text appended after the original file: the AcroForm object, every field
/// whose HasChanged() is true, a cross-reference table covering them, and a trailer.
/// </summary>
/// <returns>The update section, or an empty string when no form was parsed.</returns>
private string GetUpdate()
{
    if (form == null)
    {
        return "";
    }

    // Appended objects start right after the original content.
    int nextOffset = pdf.Length;
    System.Text.StringBuilder body = new System.Text.StringBuilder();
    System.Text.StringBuilder table = new System.Text.StringBuilder();

    table.Append("xref\n");
    table.Append("0 1\n");
    table.Append(nullOffset.ToString("0000000000", CultureInfo.InvariantCulture));
    table.Append(" 65535 f \n");

    // write the AcroForm object
    string formText = form.ToString();
    body.Append(formText);
    table.Append(form.ObjectNumber.ToString(CultureInfo.InvariantCulture));
    table.Append(" 1\n");
    table.Append(nextOffset.ToString("0000000000", CultureInfo.InvariantCulture));
    table.Append(" ");
    table.Append(form.GenerationNumber.ToString("00000", CultureInfo.InvariantCulture));
    table.Append(" n \n");
    nextOffset += formText.Length;

    foreach (PdfField field in Fields)
    {
        if (!field.HasChanged())
        {
            continue;
        }

        string fieldText = field.ToString();
        body.Append(fieldText);
        table.Append(field.ObjectNumber.ToString(CultureInfo.InvariantCulture));
        table.Append(" 1\n");
        table.Append(nextOffset.ToString("0000000000", CultureInfo.InvariantCulture));
        table.Append(" ");
        table.Append(field.GenerationNumber.ToString("00000", CultureInfo.InvariantCulture));
        table.Append(" n \n");
        nextOffset += fieldText.Length;
    }

    // After the last object, nextOffset is where the new xref table begins.
    string trailer = GetTrailer(nextOffset);
    return body.ToString() + table.ToString() + trailer;
}

/// <summary>
/// Builds the trailer for the update section. /Prev is taken from <c>previous</c>;
/// /Root and /Size are copied from the previous trailer.
/// </summary>
/// <param name="xrefOffset">Value written after the "startxref" keyword.</param>
/// <returns>The trailer text, ending with the "%%EOF" marker.</returns>
private string GetTrailer(int xrefOffset)
{
    Hashtable entries = new Hashtable();
    entries[new PdfName("/Prev")] = new PdfNumber(previous);
    entries[new PdfName("/Root")] = previousTrailer["/Root"];
    entries[new PdfName("/Size")] = previousTrailer["/Size"];

    PdfDictionary newTrailer = new PdfDictionary(entries);

    return "trailer\n"
        + newTrailer + "\n"
        + "startxref\n"
        + xrefOffset.ToString(CultureInfo.InvariantCulture) + "\n"
        + "%%EOF\n";
}
/// <summary>
/// Gets the array of all form fields held by this reader.
/// </summary>
public PdfField[] Fields
{
    get { return fields; }
}
/// <summary>
/// Gets the Hashtable that maps field names to form fields.
/// </summary>
public Hashtable FieldsByName
{
    get { return fieldsByName; }
}
/// <summary>
/// Reads the first file on the command line, parses it and writes it to the second file on the command line.
/// Prints a usage message to standard error when the argument count is not exactly two.
/// </summary>
/// <param name="args">Two filenames, the first must be a PDF file, the second will be written to.</param>
public static void Main(string[] args)
{
    // The usage text belongs to the wrong-argument-count case; the original test was inverted.
    if (args == null || args.Length != 2)
    {
        Console.Error.WriteLine("Usage: " + Environment.GetCommandLineArgs()[0] + " file1 file2");
        Console.Error.WriteLine("Reads file1, parses it as a PDF file, and writes to file2");
        return;
    }

    // Use the files named on the command line rather than hard-coded local paths.
    PdfReader reader = PdfReader.GetPdfReader(args[0]);

    foreach (PdfField field in reader.Fields)
    {
        Debug.WriteLine(field.FieldName);
    }

    // 'using' closes the output file even if WritePdf throws.
    using (FileStream fileStream = new FileStream(args[1], System.IO.FileMode.Create))
    {
        reader.WritePdf(fileStream);
    }
}

[CLSCompliant(true)]
public class PdfReader
{
// Complete content of the PDF file; the constructor stores one char per input byte.
private string pdf;
// Backing store of the Fields property.
private PdfField[] fields = new PdfField[0];
// Backing store of the FieldsByName property.
private Hashtable fieldsByName = new Hashtable();
// The AcroForm object; assigned by ParseAcroForm. GetUpdate returns "" while it is null.
private PdfForm form;
// "startxref <number> %%EOF"; group 1 is the number.
private static readonly Regex xrefRegex = new Regex(@"startxref\s*(\d+)\s*%%EOF", RegexOptions.Singleline);
// "trailer << ... >>"; group 1 is the text up to the first ">>".
private static readonly Regex trailerRegex = new Regex(@"trailer\s*<<(.*?)>>", RegexOptions.Singleline);
// "/Root <n> <g> R"; groups 1 and 2 are the two numbers of the reference.
private static readonly Regex rootRegex = new Regex(@"/Root\s*(\d+)\s+(\d+)\s*R", RegexOptions.Singleline);
// "/Size <n>"; group 1 is the number.
private static readonly Regex sizeRegex = new Regex(@"/Size\s*(\d+)", RegexOptions.Singleline);
// "xref 0 <count> <number>"; group 1 is the number after the count --
// presumably the offset field of the entry for object 0 (TODO confirm against Parse()).
private static readonly Regex nullRegex = new Regex(@"\sxref\s+0\s+\d+\s+(\d+)", RegexOptions.Singleline);
// One cross-reference row, optionally preceded by a "<first> <count>" subsection header.
// As read by ParseXRef: group 2 = first object number, group 4 = 10-digit offset,
// group 5 = 5-digit generation number, group 6 = "n" or "f".
private static readonly Regex refRegex = new Regex(@"\s*((\d+)\s+(\d+))?\s*(\d{10})\s+(\d{5})\s+(n|f)", RegexOptions.Singleline);
// "<n> <g> obj ... endobj"; group 3 is the text between "obj" and the first "endobj".
private static readonly Regex objectRegex = new Regex(@"(\d+)\s+(\d+)\s+obj(.*?)endobj", RegexOptions.Singleline);
// Text starting with a "<<" dictionary that contains /FT followed by /Btn, /Tx or /Ch.
private static readonly Regex fieldRegex = new Regex(@"^\s*<<(.*?/FT\s+/(Btn|Tx|Ch).*>>)", RegexOptions.Singleline);
// Text starting with a "<<" dictionary that contains "/Fields [".
private static readonly Regex formRegex = new Regex(@"^\s*<<(.*?/Fields\s+\[.*>>)", RegexOptions.Singleline);
// "/AcroForm <n> <g>"; groups 1 and 2 are the two numbers.
private static readonly Regex acroFormRegex = new Regex(@"/AcroForm\s+(\d+)\s+(\d+)", RegexOptions.Singleline);
// "trailer << ... >>" (greedy) with an optional "startxref <number>"; group 3 is that number.
private static readonly Regex trailRegex = new Regex(@"trailer\s+<<(.*>>)(\s+startxref\s+(\d+))?", RegexOptions.Singleline);
// "<n> <g> obj" object header; used by ParseObject to find where an object's content begins.
private static readonly Regex objRegex = new Regex(@"(\d+)\s+(\d+)\s+obj", RegexOptions.Singleline);
// linearizedRegex matches the text "/Linearized 1".
private static readonly Regex linearizedRegex = new Regex(@"/Linearized\s+1", RegexOptions.Singleline); private int previous = -1; // location of the previous cross-reference table
// Trailer dictionary of the parsed file; GetTrailer copies its /Root and /Size entries.
private PdfDictionary previousTrailer;
// Object number used by ParseAcroForm to look up the document catalog.
private int rootObjectNumber;
// Offset field of the free entry for object 0, recorded by ParseXRef and written back by GetUpdate.
private int nullOffset;
// linearized: not read in the code visible here -- presumably set from linearizedRegex (TODO confirm).
// objects: cross-reference entries keyed by object number.
private bool linearized = false; private Hashtable objects = new Hashtable();
// offsets: the same entries keyed by byte offset; sorted so GetEndOfObject can find the next object.
private SortedList offsets = new SortedList(); /// <summary>
/// Creates a PdfReader from the PDF file with the specified name.
/// The file is opened for reading only, so read-only files can be parsed as well.
/// </summary>
/// <param name="name">The file containing the PDF data.</param>
/// <returns>A reader holding the parsed content of the file.</returns>
public static PdfReader GetPdfReader(string name)
{
    // File.OpenRead requests FileAccess.Read; new FileStream(name, FileMode.Open)
    // asks for read/write access and fails on files that cannot be written.
    using (FileStream stream = File.OpenRead(name))
    {
        return new PdfReader(stream);
    }
}
/// <summary>
/// Initializes a new instance of PdfReader with the specified Stream.
/// The stream is read until <c>stream.Length</c> bytes were received or the stream ends;
/// every byte becomes one character of the internal PDF text, which is then parsed.
/// </summary>
/// <param name="stream">The Stream containing the PDF data.</param>
/// <exception cref="System.ArgumentNullException"><paramref name="stream"/> is null.</exception>
public PdfReader(Stream stream)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }

    DOMConfigurator.ConfigureAndWatch(new FileInfo("PdfReader.exe.log4net"));

    byte[] buffer = new byte[stream.Length];

    // Stream.Read may deliver fewer bytes than requested, so keep reading until the
    // buffer is full or the stream reports its end (0 bytes).
    int total = 0;
    while (total < buffer.Length)
    {
        int read = stream.Read(buffer, total, buffer.Length - total);
        if (read == 0)
        {
            break;
        }
        total += read;
    }

    // Only the bytes actually read become part of the text.
    char[] chars = new char[total];
    for (int i = 0; i < total; i++)
    {
        chars[i] = (char)buffer[i];
    }

    pdf = new String(chars);
    Parse();
}
/// <summary>
/// Resolves a PDF reference to the object it points to.
/// </summary>
/// <param name="reference">The reference to the object.</param>
/// <returns>
/// The parsed object, or null when the object number has no cross-reference entry
/// or its entry is not active.
/// </returns>
public PdfObject GetObjectForReference(PdfReference reference)
{
    if (!objects.Contains(reference.ObjectNumber))
    {
        return null;
    }

    PdfCrossReferenceEntry entry = (PdfCrossReferenceEntry)objects[reference.ObjectNumber];

    // is the object active?
    if (!entry.Active)
    {
        return null;
    }

    int start = entry.Offset;
    int end = GetEndOfObject(reference.ObjectNumber);
    return ParseObject(start, end);
}
/// <summary>
/// Stores the indexes of the items that should appear selected under the <c>IName</c> key
/// of the field dictionary: a PdfNull for no index, a single PdfNumber for one index,
/// and a PdfArray of PdfNumbers for several.
/// </summary>
/// <param name="indexes">The indexes to select; may be null or empty.</param>
public void SetSelectedIndexes(params int[] indexes)
{
    if (indexes == null || indexes.Length <= 0)
    {
        FieldDictionary.SetElement(IName, new PdfNull());
        return;
    }

    if (indexes.Length == 1)
    {
        FieldDictionary.SetElement(IName, new PdfNumber((double)indexes[0]));
        return;
    }

    PdfNumber[] numbers = new PdfNumber[indexes.Length];
    for (int position = 0; position < indexes.Length; position++)
    {
        numbers[position] = new PdfNumber((double)indexes[position]);
    }

    FieldDictionary.SetElement(IName, new PdfArray(numbers));
}
/// <summary>
/// Reads the indexes of the items that should appear selected from the <c>IName</c>
/// entry of the field dictionary.
/// </summary>
/// <returns>
/// An empty array when the entry is missing or a PdfNull, a one-element array when it is
/// a PdfNumber, otherwise one element per entry of the stored PdfArray.
/// </returns>
public int[] GetSelectedIndexes()
{
    PdfObject stored = FieldDictionary.GetElement(IName);
    if (stored == null)
    {
        return new int[0];
    }

    // Exact type comparison on purpose: subclasses do not take these branches.
    Type storedType = stored.GetType();
    if (storedType == typeof(PdfNull))
    {
        return new int[0];
    }

    if (storedType == typeof(PdfNumber))
    {
        return new int[] { (int)((PdfNumber)stored).Number };
    }

    PdfArray selection = (PdfArray)stored;
    int[] result = new int[selection.Elements.Count];
    for (int position = 0; position < selection.Elements.Count; position++)
    {
        result[position] = (int)((PdfNumber)selection[position]).Number;
    }

    return result;
}
} /// <summary>
/// An AcroForm object of a PDF document (PDF reference 8.6.1, Interactive Form Dictionary).
/// </summary>
[CLSCompliant(true)]
public class PdfForm : PdfField
{
    // Dictionary key that is switched on by the constructor.
    private static PdfName needAppearancesName = new PdfName("/NeedAppearances");

    /// <summary>
    /// Initializes a new instance of PdfForm with the specified object number, generation number,
    /// and field dictionary, and sets /NeedAppearances to true in that dictionary.
    /// </summary>
    /// <param name="objNumber">The object number.</param>
    /// <param name="generationNumber">The generation number.</param>
    /// <param name="fieldDictionary">The field dictionary.</param>
    public PdfForm(int objNumber, int generationNumber, PdfDictionary fieldDictionary)
        : base(objNumber, generationNumber, fieldDictionary)
    {
        // With NeedAppearances set, the viewer application regenerates the appearance
        // streams of the form fields.
        FieldDictionary.SetElement(needAppearancesName, new PdfBool(true));
    }
}
/// <summary>
/// One entry of the PDF cross-reference table
/// (PDF Reference 3.4.3, Cross-Reference Table).
/// </summary>
[CLSCompliant(true)]
public class PdfCrossReferenceEntry
{
    private int objectNumber;
    private int generationNumber;
    private int offset;
    private bool active;

    /// <summary>
    /// Initializes a new PdfCrossReferenceEntry object.
    /// </summary>
    /// <param name="objNumber">The object number</param>
    /// <param name="generationNumber">The generation number</param>
    /// <param name="offset">The byte offset within the PDF file</param>
    /// <param name="active">true if the object is not free, false otherwise</param>
    public PdfCrossReferenceEntry(int objNumber, int generationNumber, int offset, bool active)
    {
        this.objectNumber = objNumber;
        this.generationNumber = generationNumber;
        this.offset = offset;
        this.active = active;
    }

    /// <summary>
    /// The object number.
    /// </summary>
    public int ObjectNumber
    {
        get { return objectNumber; }
        set { objectNumber = value; }
    }

    /// <summary>
    /// The generation number.
    /// </summary>
    public int GenerationNumber
    {
        get { return generationNumber; }
        set { generationNumber = value; }
    }

    /// <summary>
    /// The byte offset of the object within the document.
    /// </summary>
    public int Offset
    {
        get { return offset; }
        set { offset = value; }
    }

    /// <summary>
    /// true if the object is not free, false otherwise.
    /// </summary>
    public bool Active
    {
        get { return active; }
        set { active = value; }
    }
}

请问gefangliang

(心灵彩虹<逍遥派掌门>) 上面代码引用的是什么DLL？

http://download.csdn.net/source/3209788

using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Collections;
using System.Diagnostics;
using System.Globalization;
using System.Security.Permissions;
using System.Runtime.InteropServices;
using System.Reflection;
using System.Runtime.CompilerServices;
using log4net;
using log4net.Config;[assembly: CLSCompliant(true)]
[assembly: FileIOPermission(SecurityAction.RequestMinimum)]
[assembly: ComVisible(false)]
[assembly: AssemblyVersion("1.3.*")]
什么dll也没有，你要留下邮箱我给你发过去。

那些对象PdfField，PdfObject，PdfReference，PdfDictionary，PdfForm等是什么？
.NET中有这两个引用？？
using log4net;
using log4net.Config;

不能解析，MS 没有支持过 PDF，除非你用 Adobe 的插件。
而且 PDF 就是类似图片之类的文件，想把上面的文字用计算机完全读出来十分困难，尤其是那些不是特别清晰的 PDF 文件。比如有一些书本有点旧了，或上面有标记，那些字人虽然可以看懂，但是电脑识别起来是十分困难的。现在好像没有一个软件能将 PDF 全部正确地转换为文本文件，总是存在错误几率。图像识别本来就是难题，尤其是中文，字太多，又复杂...

确实有难度
http://topic.csdn.net/u/20081104/15/e4908012-d771-414c-acce-8d5a4a05f47f.html

gefangliang，你不用给我发源码了，我自己找到了。那需要引用 JAVA 的两个 DLL，不过没用，解析不正确。

正在研究，就是不知道如何调用Acrobat Core API

// Extracts the text inside a rectangular region of page 1 of "input.pdf" with iTextSharp
// and appends it to textBox1. The region comes from the text boxes on the form.
int x = Convert.ToInt32(textBox2.Text); // start coordinates
int y = Convert.ToInt32(textBox3.Text);
int w = Convert.ToInt32(textBox4.Text); // width and height
int h = Convert.ToInt32(textBox5.Text);

PdfReader reader = new PdfReader("input.pdf"); // file to extract from
try
{
    // Only text rendered inside the rectangle is passed on to the extraction strategy.
    RectangleJ rect = new RectangleJ(x, y, w, h);
    RenderFilter filter = new RegionTextRenderFilter(rect);
    RenderFilter[] a = { filter };
    FilteredTextRenderListener strategy =
        new FilteredTextRenderListener(new LocationTextExtractionStrategy(), a);

    textBox1.AppendText(PdfTextExtractor.GetTextFromPage(reader, 1, strategy) + "\r\n"); // output
}
finally
{
    // Release the reader even when extraction throws.
    reader.Close();
}

这样可以解析指定区域里面的内容，在C#中添加对itextSharp.dll组件的引用。

调试易

C# 如何解析PDF文件？PDFBox？还是...

解决方案 »