///==============================================================
/// Office File Reader
///public static void Main()
///{
/// OfficeFileReader.OfficeFileReader objOFR = new OfficeFileReader.OfficeFileReader();
/// string output="";
/// objOFR.GetText("C:\\MyWordFile.Doc", ref output);
/// Console.WriteLine(output);
///}
///==============================================================
using System;
using System.Text;
using System.Runtime.InteropServices;
namespace OfficeFileReader
{
#region Stuff you Dont even need to look at

/// <summary>Flag values accepted by the <c>grfFlags</c> parameter of <see cref="IFilter.Init"/>.</summary>
[Flags]
public enum IFILTER_INIT
{
    NONE = 0,
    CANON_PARAGRAPHS = 1,
    HARD_LINE_BREAKS = 2,
    CANON_HYPHENS = 4,
    CANON_SPACES = 8,
    APPLY_INDEX_ATTRIBUTES = 16,
    APPLY_CRAWL_ATTRIBUTES = 256,
    APPLY_OTHER_ATTRIBUTES = 32,
    INDEXING_ONLY = 64,
    SEARCH_LINKS = 128,
    FILTER_OWNED_VALUE_OK = 512
}

/// <summary>Flag values named after the native IFILTER_FLAGS enumeration.</summary>
[Flags]
public enum IFILTER_FLAGS
{
    OLE_PROPERTIES = 1
}

/// <summary>Break type stored in <see cref="STAT_CHUNK.breakType"/>.</summary>
public enum CHUNK_BREAKTYPE
{
    CHUNK_NO_BREAK = 0,
    CHUNK_EOW = 1,
    CHUNK_EOS = 2,
    CHUNK_EOP = 3,
    CHUNK_EOC = 4
}

/// <summary>Kind of content stored in <see cref="STAT_CHUNK.flags"/>.</summary>
[Flags]
public enum CHUNKSTATE
{
    CHUNK_TEXT = 0x1,
    CHUNK_VALUE = 0x2,
    CHUNK_FILTER_OWNED_VALUE = 0x4
}

/// <summary>Discriminator values, presumably for <see cref="PROPSPEC.ulKind"/> (not referenced elsewhere in this file).</summary>
public enum PSKIND
{
    LPWSTR = 0,
    PROPID = 1
}

/// <summary>Property specifier embedded in <see cref="FULLPROPSPEC"/>.</summary>
// NOTE(review): the native PROPSPEC is believed to overlay propid and lpwstr in a union,
// whereas this declaration lays them out one after the other. Confirm the layout before
// reading STAT_CHUNK.attribute or any STAT_CHUNK field that follows it.
[StructLayout(LayoutKind.Sequential)]
public struct PROPSPEC
{
    public uint ulKind;
    public uint propid;
    public IntPtr lpwstr;
}

/// <summary>A property set GUID paired with a property specifier.</summary>
[StructLayout(LayoutKind.Sequential)]
public struct FULLPROPSPEC
{
    public Guid guidPropSet;
    public PROPSPEC psProperty;
}

/// <summary>Chunk descriptor filled in by <see cref="IFilter.GetChunk"/>.</summary>
[StructLayout(LayoutKind.Sequential)]
public struct STAT_CHUNK
{
    public uint idChunk;
    [MarshalAs(UnmanagedType.U4)]
    public CHUNK_BREAKTYPE breakType;
    [MarshalAs(UnmanagedType.U4)]
    public CHUNKSTATE flags;
    public uint locale;
    [MarshalAs(UnmanagedType.Struct)]
    public FULLPROPSPEC attribute;
    public uint idChunkSource;
    public uint cwcStartSource;
    public uint cwcLenSource;
}

/// <summary>Region descriptor passed to <see cref="IFilter.BindRegion"/>.</summary>
[StructLayout(LayoutKind.Sequential)]
public struct FILTERREGION
{
    public uint idChunk;
    public uint cwcStart;
    public uint cwcExtent;
}

#endregion

/// <summary>
/// COM import of the IFilter interface. The member order defines the COM vtable layout
/// and must not be changed.
/// </summary>
// The attributes and the interface header must NOT share a line with "#endregion":
// everything after that directive on the same line is treated as its message and dropped.
[ComImport]
[Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
    /// <summary>Initializes the filter; failures surface as COM exceptions (no PreserveSig).</summary>
    void Init([MarshalAs(UnmanagedType.U4)] IFILTER_INIT grfFlags, uint cAttributes, [MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] FULLPROPSPEC[] aAttributes, ref uint pdwFlags);

    /// <summary>Moves to the next chunk; failures surface as COM exceptions (no PreserveSig).</summary>
    void GetChunk([MarshalAs(UnmanagedType.Struct)] out STAT_CHUNK pStat);

    /// <summary>
    /// Reads text of the current chunk into <paramref name="buffer"/>. Returns the raw
    /// HRESULT (PreserveSig), so callers must inspect the return value themselves.
    /// </summary>
    [PreserveSig]
    int GetText(ref uint pcwcBuffer, [MarshalAs(UnmanagedType.LPWStr)] StringBuilder buffer);

    /// <summary>Retrieves the value of the current chunk as an unmanaged pointer.</summary>
    void GetValue(ref UIntPtr ppPropValue);

    /// <summary>Binds a region of the document to the interface identified by <paramref name="riid"/>.</summary>
    void BindRegion([MarshalAs(UnmanagedType.Struct)]FILTERREGION origPos, ref Guid riid, ref UIntPtr ppunk);
}

/// <summary>COM class that is instantiated and cast to <see cref="IFilter"/> by callers.</summary>
[ComImport]
[Guid("f07f3920-7b8c-11cf-9be8-00aa004b9986")]
public class CFilter
{
}

/// <summary>Numeric constants used with <see cref="IFilter"/>: PID_STG_* property ids and FILTER_* status codes.</summary>
public class Constants
{
    public const uint PID_STG_DIRECTORY = 0x00000002;
    public const uint PID_STG_CLASSID = 0x00000003;
    public const uint PID_STG_STORAGETYPE = 0x00000004;
    public const uint PID_STG_VOLUME_ID = 0x00000005;
    public const uint PID_STG_PARENT_WORKID = 0x00000006;
    public const uint PID_STG_SECONDARYSTORE = 0x00000007;
    public const uint PID_STG_FILEINDEX = 0x00000008;
    public const uint PID_STG_LASTCHANGEUSN = 0x00000009;
    public const uint PID_STG_NAME = 0x0000000a;
    public const uint PID_STG_PATH = 0x0000000b;
    public const uint PID_STG_SIZE = 0x0000000c;
    public const uint PID_STG_ATTRIBUTES = 0x0000000d;
    public const uint PID_STG_WRITETIME = 0x0000000e;
    public const uint PID_STG_CREATETIME = 0x0000000f;
    public const uint PID_STG_ACCESSTIME = 0x00000010;
    public const uint PID_STG_CHANGETIME = 0x00000011;
    public const uint PID_STG_CONTENTS = 0x00000013;
    public const uint PID_STG_SHORTNAME = 0x00000014;

    // HRESULT values; the 0x8004xxxx codes are failures, FILTER_S_LAST_TEXT is a success code.
    public const int FILTER_E_END_OF_CHUNKS = (unchecked((int)0x80041700));
    public const int FILTER_E_NO_MORE_TEXT = (unchecked((int)0x80041701));
    public const int FILTER_E_NO_MORE_VALUES = (unchecked((int)0x80041702));
    public const int FILTER_E_NO_TEXT = (unchecked((int)0x80041705));
    public const int FILTER_E_NO_VALUES = (unchecked((int)0x80041706));
    public const int FILTER_S_LAST_TEXT = (unchecked((int)0x00041709));
}
public class OfficeFileReader
{
/// <summary>
/// Extracts the text of a document by loading it into the <see cref="CFilter"/> COM
/// component and reading every text chunk it reports.
/// </summary>
/// <param name="path">Path of the .doc, .xls or .ppt file.</param>
/// <param name="text">
/// Receives all extracted text. If a COM error interrupts the extraction, it receives
/// whatever had been extracted up to that point (possibly an empty string).
/// </param>
/// <remarks>
/// COM errors are not propagated: they are written to the console, as before.
/// Reaching the end of the document is the normal way for the chunk loop to finish and
/// is no longer reported as an error.
/// </remarks>
public void GetText(String path, ref string text)
{
    // Accumulate in a StringBuilder; repeated string concatenation is quadratic on large documents.
    StringBuilder extracted = new StringBuilder();
    CFilter filterObject = null;
    try
    {
        filterObject = new CFilter();
        IFilter filter = (IFilter)filterObject;
        System.Runtime.InteropServices.ComTypes.IPersistFile persistFile =
            (System.Runtime.InteropServices.ComTypes.IPersistFile)filter;
        persistFile.Load(path, 0);

        uint initFlags = 0;
        filter.Init(IFILTER_INIT.NONE, 0, null, ref initFlags);

        STAT_CHUNK chunk = new STAT_CHUNK();
        while (true)
        {
            try
            {
                filter.GetChunk(out chunk);
            }
            catch (System.Runtime.InteropServices.COMException chunkError)
            {
                // GetChunk is not PreserveSig, so "no more chunks" arrives as an exception.
                if (chunkError.ErrorCode == Constants.FILTER_E_END_OF_CHUNKS)
                {
                    break;
                }
                // Any other failure is reported by the outer handler.
                throw;
            }

            if (chunk.flags != CHUNKSTATE.CHUNK_TEXT)
            {
                continue;
            }

            // GetText is PreserveSig: keep reading while it reports success
            // (S_OK or FILTER_S_LAST_TEXT) and stop on the first other code.
            int hr;
            do
            {
                uint charCount = 1000;
                StringBuilder buffer = new StringBuilder((int)charCount);
                hr = filter.GetText(ref charCount, buffer);
                if (hr >= 0)
                {
                    // Never read past what was actually marshalled back into the buffer.
                    int length = (int)charCount;
                    if (length > buffer.Length)
                    {
                        length = buffer.Length;
                    }
                    extracted.Append(buffer.ToString(0, length));
                }
            }
            while (hr == 0 || hr == Constants.FILTER_S_LAST_TEXT);
        }
    }
    catch (System.Runtime.InteropServices.COMException myE)
    {
        Console.WriteLine(myE.Data + "\n" + myE.Message + "\n");
    }
    finally
    {
        // Release the COM object deterministically so the document is not kept open
        // until the garbage collector happens to finalize the wrapper.
        if (filterObject != null)
        {
            System.Runtime.InteropServices.Marshal.ReleaseComObject(filterObject);
        }
    }

    text = extracted.ToString();
}
}}
这个是用来读 DOC 文件的，
而我的文档是以二进制形式存放在数据库里的，
谁能帮忙改一下？
///public static void Main()
///{
/// OfficeFileReader.OfficeFileReader objOFR = new OfficeFileReader.OfficeFileReader();
/// string output="";
/// objOFR.GetText("C:\\MyWordFile.Doc", ref output);
/// Console.WriteLine(output);
///}
///==============================================================
using System;
using System.Text;
using System.Runtime.InteropServices;
namespace OfficeFileReader
{
#region Stuff you Dont even need to look at

/// <summary>Flag values accepted by the <c>grfFlags</c> parameter of <see cref="IFilter.Init"/>.</summary>
[Flags]
public enum IFILTER_INIT
{
    NONE = 0,
    CANON_PARAGRAPHS = 1,
    HARD_LINE_BREAKS = 2,
    CANON_HYPHENS = 4,
    CANON_SPACES = 8,
    APPLY_INDEX_ATTRIBUTES = 16,
    APPLY_CRAWL_ATTRIBUTES = 256,
    APPLY_OTHER_ATTRIBUTES = 32,
    INDEXING_ONLY = 64,
    SEARCH_LINKS = 128,
    FILTER_OWNED_VALUE_OK = 512
}

/// <summary>Flag values named after the native IFILTER_FLAGS enumeration.</summary>
[Flags]
public enum IFILTER_FLAGS
{
    OLE_PROPERTIES = 1
}

/// <summary>Break type stored in <see cref="STAT_CHUNK.breakType"/>.</summary>
public enum CHUNK_BREAKTYPE
{
    CHUNK_NO_BREAK = 0,
    CHUNK_EOW = 1,
    CHUNK_EOS = 2,
    CHUNK_EOP = 3,
    CHUNK_EOC = 4
}

/// <summary>Kind of content stored in <see cref="STAT_CHUNK.flags"/>.</summary>
[Flags]
public enum CHUNKSTATE
{
    CHUNK_TEXT = 0x1,
    CHUNK_VALUE = 0x2,
    CHUNK_FILTER_OWNED_VALUE = 0x4
}

/// <summary>Discriminator values, presumably for <see cref="PROPSPEC.ulKind"/> (not referenced elsewhere in this file).</summary>
public enum PSKIND
{
    LPWSTR = 0,
    PROPID = 1
}

/// <summary>Property specifier embedded in <see cref="FULLPROPSPEC"/>.</summary>
// NOTE(review): the native PROPSPEC is believed to overlay propid and lpwstr in a union,
// whereas this declaration lays them out one after the other. Confirm the layout before
// reading STAT_CHUNK.attribute or any STAT_CHUNK field that follows it.
[StructLayout(LayoutKind.Sequential)]
public struct PROPSPEC
{
    public uint ulKind;
    public uint propid;
    public IntPtr lpwstr;
}

/// <summary>A property set GUID paired with a property specifier.</summary>
[StructLayout(LayoutKind.Sequential)]
public struct FULLPROPSPEC
{
    public Guid guidPropSet;
    public PROPSPEC psProperty;
}

/// <summary>Chunk descriptor filled in by <see cref="IFilter.GetChunk"/>.</summary>
[StructLayout(LayoutKind.Sequential)]
public struct STAT_CHUNK
{
    public uint idChunk;
    [MarshalAs(UnmanagedType.U4)]
    public CHUNK_BREAKTYPE breakType;
    [MarshalAs(UnmanagedType.U4)]
    public CHUNKSTATE flags;
    public uint locale;
    [MarshalAs(UnmanagedType.Struct)]
    public FULLPROPSPEC attribute;
    public uint idChunkSource;
    public uint cwcStartSource;
    public uint cwcLenSource;
}

/// <summary>Region descriptor passed to <see cref="IFilter.BindRegion"/>.</summary>
[StructLayout(LayoutKind.Sequential)]
public struct FILTERREGION
{
    public uint idChunk;
    public uint cwcStart;
    public uint cwcExtent;
}

#endregion

/// <summary>
/// COM import of the IFilter interface. The member order defines the COM vtable layout
/// and must not be changed.
/// </summary>
// The attributes and the interface header must NOT share a line with "#endregion":
// everything after that directive on the same line is treated as its message and dropped.
[ComImport]
[Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
    /// <summary>Initializes the filter; failures surface as COM exceptions (no PreserveSig).</summary>
    void Init([MarshalAs(UnmanagedType.U4)] IFILTER_INIT grfFlags, uint cAttributes, [MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] FULLPROPSPEC[] aAttributes, ref uint pdwFlags);

    /// <summary>Moves to the next chunk; failures surface as COM exceptions (no PreserveSig).</summary>
    void GetChunk([MarshalAs(UnmanagedType.Struct)] out STAT_CHUNK pStat);

    /// <summary>
    /// Reads text of the current chunk into <paramref name="buffer"/>. Returns the raw
    /// HRESULT (PreserveSig), so callers must inspect the return value themselves.
    /// </summary>
    [PreserveSig]
    int GetText(ref uint pcwcBuffer, [MarshalAs(UnmanagedType.LPWStr)] StringBuilder buffer);

    /// <summary>Retrieves the value of the current chunk as an unmanaged pointer.</summary>
    void GetValue(ref UIntPtr ppPropValue);

    /// <summary>Binds a region of the document to the interface identified by <paramref name="riid"/>.</summary>
    void BindRegion([MarshalAs(UnmanagedType.Struct)]FILTERREGION origPos, ref Guid riid, ref UIntPtr ppunk);
}

/// <summary>COM class that is instantiated and cast to <see cref="IFilter"/> by callers.</summary>
[ComImport]
[Guid("f07f3920-7b8c-11cf-9be8-00aa004b9986")]
public class CFilter
{
}

/// <summary>Numeric constants used with <see cref="IFilter"/>: PID_STG_* property ids and FILTER_* status codes.</summary>
public class Constants
{
    public const uint PID_STG_DIRECTORY = 0x00000002;
    public const uint PID_STG_CLASSID = 0x00000003;
    public const uint PID_STG_STORAGETYPE = 0x00000004;
    public const uint PID_STG_VOLUME_ID = 0x00000005;
    public const uint PID_STG_PARENT_WORKID = 0x00000006;
    public const uint PID_STG_SECONDARYSTORE = 0x00000007;
    public const uint PID_STG_FILEINDEX = 0x00000008;
    public const uint PID_STG_LASTCHANGEUSN = 0x00000009;
    public const uint PID_STG_NAME = 0x0000000a;
    public const uint PID_STG_PATH = 0x0000000b;
    public const uint PID_STG_SIZE = 0x0000000c;
    public const uint PID_STG_ATTRIBUTES = 0x0000000d;
    public const uint PID_STG_WRITETIME = 0x0000000e;
    public const uint PID_STG_CREATETIME = 0x0000000f;
    public const uint PID_STG_ACCESSTIME = 0x00000010;
    public const uint PID_STG_CHANGETIME = 0x00000011;
    public const uint PID_STG_CONTENTS = 0x00000013;
    public const uint PID_STG_SHORTNAME = 0x00000014;

    // HRESULT values; the 0x8004xxxx codes are failures, FILTER_S_LAST_TEXT is a success code.
    public const int FILTER_E_END_OF_CHUNKS = (unchecked((int)0x80041700));
    public const int FILTER_E_NO_MORE_TEXT = (unchecked((int)0x80041701));
    public const int FILTER_E_NO_MORE_VALUES = (unchecked((int)0x80041702));
    public const int FILTER_E_NO_TEXT = (unchecked((int)0x80041705));
    public const int FILTER_E_NO_VALUES = (unchecked((int)0x80041706));
    public const int FILTER_S_LAST_TEXT = (unchecked((int)0x00041709));
}
public class OfficeFileReader
{
/// <summary>
/// Extracts the text of a document by loading it into the <see cref="CFilter"/> COM
/// component and reading every text chunk it reports.
/// </summary>
/// <param name="path">Path of the .doc, .xls or .ppt file.</param>
/// <param name="text">
/// Receives all extracted text. If a COM error interrupts the extraction, it receives
/// whatever had been extracted up to that point (possibly an empty string).
/// </param>
/// <remarks>
/// COM errors are not propagated: they are written to the console, as before.
/// Reaching the end of the document is the normal way for the chunk loop to finish and
/// is no longer reported as an error.
/// </remarks>
public void GetText(String path, ref string text)
{
    // Accumulate in a StringBuilder; repeated string concatenation is quadratic on large documents.
    StringBuilder extracted = new StringBuilder();
    CFilter filterObject = null;
    try
    {
        filterObject = new CFilter();
        IFilter filter = (IFilter)filterObject;
        System.Runtime.InteropServices.ComTypes.IPersistFile persistFile =
            (System.Runtime.InteropServices.ComTypes.IPersistFile)filter;
        persistFile.Load(path, 0);

        uint initFlags = 0;
        filter.Init(IFILTER_INIT.NONE, 0, null, ref initFlags);

        STAT_CHUNK chunk = new STAT_CHUNK();
        while (true)
        {
            try
            {
                filter.GetChunk(out chunk);
            }
            catch (System.Runtime.InteropServices.COMException chunkError)
            {
                // GetChunk is not PreserveSig, so "no more chunks" arrives as an exception.
                if (chunkError.ErrorCode == Constants.FILTER_E_END_OF_CHUNKS)
                {
                    break;
                }
                // Any other failure is reported by the outer handler.
                throw;
            }

            if (chunk.flags != CHUNKSTATE.CHUNK_TEXT)
            {
                continue;
            }

            // GetText is PreserveSig: keep reading while it reports success
            // (S_OK or FILTER_S_LAST_TEXT) and stop on the first other code.
            int hr;
            do
            {
                uint charCount = 1000;
                StringBuilder buffer = new StringBuilder((int)charCount);
                hr = filter.GetText(ref charCount, buffer);
                if (hr >= 0)
                {
                    // Never read past what was actually marshalled back into the buffer.
                    int length = (int)charCount;
                    if (length > buffer.Length)
                    {
                        length = buffer.Length;
                    }
                    extracted.Append(buffer.ToString(0, length));
                }
            }
            while (hr == 0 || hr == Constants.FILTER_S_LAST_TEXT);
        }
    }
    catch (System.Runtime.InteropServices.COMException myE)
    {
        Console.WriteLine(myE.Data + "\n" + myE.Message + "\n");
    }
    finally
    {
        // Release the COM object deterministically so the document is not kept open
        // until the garbage collector happens to finalize the wrapper.
        if (filterObject != null)
        {
            System.Runtime.InteropServices.Marshal.ReleaseComObject(filterObject);
        }
    }

    text = extracted.ToString();
}
}}
这个是用来读 DOC 文件的，
而我的文档是以二进制形式存放在数据库里的，
谁能帮忙改一下？
如果我有 10 万个 DOC 文件，难道要重复写 10 万次吗？
using System.Xml;
using System.Xml.XPath;
using System.Data;
/// <summary>
/// Demonstrates five different ways of printing the text of the <c>Content</c> element
/// of the file <c>ReadXml.xml</c> in the current directory.
/// </summary>
/// <remarks>
/// The document is expected to have a <c>News</c> element under the root that contains a
/// <c>Content</c> element. Method 5 is purely positional, so it presumably relies on
/// <c>Content</c> being the fourth child of <c>News</c> — verify against the actual file.
/// None of the lookups is guarded: a document with a different shape makes the
/// corresponding method throw.
/// </remarks>
class ReadXML
{
    /// <summary>Runs the five demonstrations in order, writing each result to the console.</summary>
    public static void Main()
    {
        string sFile = "ReadXml.xml";

        // Method 1: DOM, walking child elements by name from the root.
        XmlDocument doc = new XmlDocument();
        doc.Load(sFile);
        XmlNode node = doc.DocumentElement["News"]["Content"];
        Console.WriteLine(node.InnerText);

        // Method 2: DOM with an XPath query over the whole document.
        node = doc.SelectSingleNode("//Content");
        Console.WriteLine(node.InnerText);

        // Same idea, with a path relative to the root element.
        node = doc.DocumentElement.SelectSingleNode("News/Content");
        Console.WriteLine(node.InnerText);

        // Method 3: load the XML into a DataSet and read the column of the first row.
        DataSet ds = new DataSet();
        ds.ReadXml(sFile);
        Console.WriteLine(ds.Tables[0].Rows[0]["Content"].ToString());

        // Method 4: forward-only reader; stop at the first node named "Content".
        // The using block closes the file even when Read() throws on malformed XML.
        using (XmlTextReader reader = new XmlTextReader(sFile))
        {
            while (reader.Read())
            {
                if (reader.Name == "Content")
                {
                    Console.WriteLine("***" + reader.ReadString());
                    break;
                }
            }
        }

        // Method 5: XPathNavigator, moving three levels down and then three siblings forward.
        XPathDocument xpdoc = new XPathDocument(sFile);
        XPathNavigator xpnv = xpdoc.CreateNavigator();
        xpnv.MoveToFirstChild();
        xpnv.MoveToFirstChild();
        xpnv.MoveToFirstChild();
        xpnv.MoveToNext();
        xpnv.MoveToNext();
        xpnv.MoveToNext();
        Console.WriteLine("pathnavigator:" + xpnv.Value);
    }
}