// BuildIndex
using System;
using System.Data;
using System.IO;
using System.Text.RegularExpressions;
using htmlparser;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using ChineseAnalyzer = Lucene.Net.Analysis.China.ChineseAnalyzer;
using IndexWriter = Lucene.Net.Index.IndexWriter;
namespace Lucene.Net.Demo
{
    /// <summary>
    /// Console tool that walks a file or directory tree and adds every file it finds
    /// to a Lucene.Net index, using <c>ChineseAnalyzer</c> for the text.
    /// </summary>
    class IndexFiles
    {
        /// <summary>Running count of files processed by <see cref="IndexFile"/>.</summary>
        internal static int i = 0;

        /// <summary>Passed to the IndexWriter as its "create" flag; set from the Y/N prompt.</summary>
        internal static bool b_create = false;

        /// <summary>The index is optimized each time the file counter reaches a multiple of this value.</summary>
        internal static int OPP = 800000;

        /// <summary>
        /// Prompts for the source path, index path and tuning values, then builds the index
        /// and reports success or failure on the console.
        /// </summary>
        /// <param name="argv">Command-line arguments (not used).</param>
        [STAThread]
        public static void Main(System.String[] argv)
        {
            // A single reader serves every prompt. Mixing a buffered StreamReader with
            // Console.ReadLine() on the same stdin can drop lines when input is redirected.
            StreamReader input = new StreamReader(
                Console.OpenStandardInput(),
                System.Text.Encoding.GetEncoding("UTF-8"));

            Console.Out.Write("The Dir u wanna create the INDX: ");
            string dir = input.ReadLine();
            Console.Out.Write("The INDX saved path: ");
            string indxPath = input.ReadLine();

            if (string.IsNullOrEmpty(dir) || string.IsNullOrEmpty(indxPath))
            {
                Console.WriteLine("Both the document path and the index path are required.");
                return;
            }

            int minMergeDocs;
            if (!TryReadInt(input, "minMergedocs(当内存中文档达到多少的时候才写入文件): ", out minMergeDocs))
            {
                return;
            }

            int mergeFactor;
            if (!TryReadInt(input, "mergeFactor(控制segment合并的频率和大小): ", out mergeFactor))
            {
                return;
            }

            int optimizeEvery;
            if (!TryReadInt(input, "输入索引多少个文件优化一次: ", out optimizeEvery))
            {
                return;
            }
            OPP = optimizeEvery;

            Console.Out.Write("是否重新建立数据库Y/N: ");
            string answer = input.ReadLine();
            if (answer != null && string.Equals(answer.Trim(), "Y", StringComparison.OrdinalIgnoreCase))
            {
                b_create = true;
            }

            // Refuse to create a fresh index on top of an existing directory.
            if (b_create && Directory.Exists(indxPath))
            {
                Console.WriteLine("重新建立数据库失败,指定路径存在数据库。");
                Console.WriteLine("请先删除它或指定另一数据库存放目录。");
                return;
            }

            FileInfo INDEX_DIR = new FileInfo(indxPath);
            Console.Out.WriteLine("Start to build the INDX...");
            Console.Out.WriteLine("=====================================================================");

            bool succeeded = BuildIndex(dir, INDEX_DIR, minMergeDocs, mergeFactor);
            if (!succeeded)
            {
                Console.Out.WriteLine("索引失败!");
            }
            else
            {
                Console.WriteLine("索引成功!");
            }

            Console.WriteLine("完成!请关闭程序!");
            // Keep the console window open until the user presses Enter.
            input.ReadLine();
        }

        /// <summary>
        /// Writes <paramref name="prompt"/> and reads one integer from <paramref name="reader"/>.
        /// </summary>
        /// <returns>
        /// True when a valid integer was read; false (after printing a message) on
        /// end of input or unparsable text.
        /// </returns>
        private static bool TryReadInt(TextReader reader, string prompt, out int value)
        {
            Console.Out.Write(prompt);
            string line = reader.ReadLine();
            if (line != null && Int32.TryParse(line.Trim(), out value))
            {
                return true;
            }

            value = 0;
            Console.WriteLine("Invalid number: '" + line + "'");
            return false;
        }

        /// <summary>
        /// Indexes everything under <paramref name="RootPath"/> into the index located at
        /// <paramref name="INDEX_DIR"/>, optimizes it and closes the writer.
        /// </summary>
        /// <param name="RootPath">File or directory to index.</param>
        /// <param name="INDEX_DIR">Location of the index on disk.</param>
        /// <param name="minMergeDocs">Value assigned to <c>writer.minMergeDocs</c>.</param>
        /// <param name="mergeFactor">Value assigned to <c>writer.mergeFactor</c>.</param>
        /// <returns>
        /// True when indexing completed; false when the source path does not exist,
        /// <paramref name="INDEX_DIR"/> is null, or an <see cref="IOException"/> occurred.
        /// </returns>
        public static bool BuildIndex(string RootPath, System.IO.FileInfo INDEX_DIR, int minMergeDocs, int mergeFactor)
        {
            FileInfo docDir = new FileInfo(RootPath);

            // The source may be either a single file or a directory.
            if (!File.Exists(docDir.FullName) && !Directory.Exists(docDir.FullName))
            {
                Console.Out.WriteLine("Document directory '" + docDir.FullName + "' does not exist or is not readable, please check the path");
                return false;
            }

            if (INDEX_DIR == null)
            {
                Console.Out.WriteLine("No index directory was specified");
                return false;
            }

            DateTime start = DateTime.UtcNow;
            IndexWriter writer = null;
            try
            {
                // Open the index at the caller-supplied location rather than a fixed path.
                writer = new IndexWriter(INDEX_DIR.FullName, new ChineseAnalyzer(), b_create);
                writer.minMergeDocs = minMergeDocs;
                writer.mergeFactor = mergeFactor;

                IndexDirectory(writer, docDir);

                Console.Out.Write("optimizing...");
                writer.Optimize();
                Console.Out.WriteLine("done!");
                writer.Close();
                writer = null; // closed cleanly; nothing left for the finally block

                // Ticks are 100 ns units, so convert before labelling the value as milliseconds.
                long elapsedMs = (DateTime.UtcNow.Ticks - start.Ticks) / TimeSpan.TicksPerMillisecond;
                Console.Out.WriteLine(elapsedMs + " total milliseconds");
                return true;
            }
            catch (IOException e)
            {
                Console.Out.WriteLine(" caught a " + e.GetType() + "\n with message: " + e.Message);
                return false;
            }
            finally
            {
                // Reached with a live writer only on failure: close it so it is not left open.
                if (writer != null)
                {
                    try
                    {
                        writer.Close();
                    }
                    catch (IOException closeError)
                    {
                        Console.Out.WriteLine(" could not close the index writer: " + closeError.Message);
                    }
                }
            }
        }

        /// <summary>
        /// Recursively walks <paramref name="file"/>: directories are descended into,
        /// anything else is handed to <see cref="IndexFile"/>.
        /// </summary>
        /// <param name="writer">Open index writer that receives the documents.</param>
        /// <param name="file">File or directory to process.</param>
        public static void IndexDirectory(IndexWriter writer, System.IO.FileInfo file)
        {
            if (!Directory.Exists(file.FullName))
            {
                IndexFile(file, writer);
                return;
            }

            // Entries include both files and sub-directories; recurse into each one.
            foreach (string entry in Directory.GetFileSystemEntries(file.FullName))
            {
                IndexDirectory(writer, new FileInfo(entry));
            }
        }

        /// <summary>
        /// Reads one file as gb2312 text, builds a document with "url", "size", "title"
        /// and "contents" fields and adds it to the index. Every <see cref="OPP"/> files
        /// the index is optimized.
        /// </summary>
        /// <param name="file">File to index.</param>
        /// <param name="writer">Open index writer that receives the document.</param>
        private static void IndexFile(System.IO.FileInfo file, IndexWriter writer)
        {
            try
            {
                string html;
                using (StreamReader reader = new StreamReader(file.FullName, System.Text.Encoding.GetEncoding("gb2312")))
                {
                    html = reader.ReadToEnd();
                }

                Console.Out.WriteLine("正在建立索引" + file.FullName);
                i++;
                Console.Out.WriteLine(i);

                // The lib helpers are declared outside this file (presumably in htmlparser).
                Document doc = new Document();
                doc.Add(Field.Keyword("url", lib.GetURL(file.FullName)));
                doc.Add(Field.Keyword("size", lib.GetSize(file.FullName)));
                doc.Add(Field.Keyword("title", lib.GetStringTitle(html)));
                html = lib.HtmlFilter(html);
                doc.Add(Field.Text("contents", html));
                writer.AddDocument(doc);

                // OPP comes from user input; guard against a zero or negative interval.
                if (OPP > 0 && i % OPP == 0)
                {
                    Console.WriteLine("Optimize been indexed files...");
                    writer.Optimize();
                }
            }
            catch (IOException ex)
            {
                // Best effort: one failing file must not abort the whole run, but say so.
                Console.Out.WriteLine("Skipping '" + file.FullName + "': " + ex.Message);
            }
        }
    }
}
using ChineseAnalyzer = Lucene.Net.Analysis.China.ChineseAnalyzer;
using IndexWriter = Lucene.Net.Index.IndexWriter;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.QueryParsers;
using System.Data;
using System.IO;
using System.Text.RegularExpressions;
using htmlparser;
namespace Lucene.Net.Demo
{
    /// <summary>
    /// Console tool that walks a file or directory tree and adds every file it finds
    /// to a Lucene.Net index, using <c>ChineseAnalyzer</c> for the text.
    /// </summary>
    class IndexFiles
    {
        /// <summary>Running count of files processed by <see cref="IndexFile"/>.</summary>
        internal static int i = 0;

        /// <summary>Passed to the IndexWriter as its "create" flag; set from the Y/N prompt.</summary>
        internal static bool b_create = false;

        /// <summary>The index is optimized each time the file counter reaches a multiple of this value.</summary>
        internal static int OPP = 800000;

        /// <summary>
        /// Prompts for the source path, index path and tuning values, then builds the index
        /// and reports success or failure on the console.
        /// </summary>
        /// <param name="argv">Command-line arguments (not used).</param>
        [STAThread]
        public static void Main(System.String[] argv)
        {
            // A single reader serves every prompt. Mixing a buffered StreamReader with
            // Console.ReadLine() on the same stdin can drop lines when input is redirected.
            StreamReader input = new StreamReader(
                Console.OpenStandardInput(),
                System.Text.Encoding.GetEncoding("UTF-8"));

            Console.Out.Write("The Dir u wanna create the INDX: ");
            string dir = input.ReadLine();
            Console.Out.Write("The INDX saved path: ");
            string indxPath = input.ReadLine();

            if (string.IsNullOrEmpty(dir) || string.IsNullOrEmpty(indxPath))
            {
                Console.WriteLine("Both the document path and the index path are required.");
                return;
            }

            int minMergeDocs;
            if (!TryReadInt(input, "minMergedocs(当内存中文档达到多少的时候才写入文件): ", out minMergeDocs))
            {
                return;
            }

            int mergeFactor;
            if (!TryReadInt(input, "mergeFactor(控制segment合并的频率和大小): ", out mergeFactor))
            {
                return;
            }

            int optimizeEvery;
            if (!TryReadInt(input, "输入索引多少个文件优化一次: ", out optimizeEvery))
            {
                return;
            }
            OPP = optimizeEvery;

            Console.Out.Write("是否重新建立数据库Y/N: ");
            string answer = input.ReadLine();
            if (answer != null && string.Equals(answer.Trim(), "Y", StringComparison.OrdinalIgnoreCase))
            {
                b_create = true;
            }

            // Refuse to create a fresh index on top of an existing directory.
            if (b_create && Directory.Exists(indxPath))
            {
                Console.WriteLine("重新建立数据库失败,指定路径存在数据库。");
                Console.WriteLine("请先删除它或指定另一数据库存放目录。");
                return;
            }

            FileInfo INDEX_DIR = new FileInfo(indxPath);
            Console.Out.WriteLine("Start to build the INDX...");
            Console.Out.WriteLine("=====================================================================");

            bool succeeded = BuildIndex(dir, INDEX_DIR, minMergeDocs, mergeFactor);
            if (!succeeded)
            {
                Console.Out.WriteLine("索引失败!");
            }
            else
            {
                Console.WriteLine("索引成功!");
            }

            Console.WriteLine("完成!请关闭程序!");
            // Keep the console window open until the user presses Enter.
            input.ReadLine();
        }

        /// <summary>
        /// Writes <paramref name="prompt"/> and reads one integer from <paramref name="reader"/>.
        /// </summary>
        /// <returns>
        /// True when a valid integer was read; false (after printing a message) on
        /// end of input or unparsable text.
        /// </returns>
        private static bool TryReadInt(TextReader reader, string prompt, out int value)
        {
            Console.Out.Write(prompt);
            string line = reader.ReadLine();
            if (line != null && Int32.TryParse(line.Trim(), out value))
            {
                return true;
            }

            value = 0;
            Console.WriteLine("Invalid number: '" + line + "'");
            return false;
        }

        /// <summary>
        /// Indexes everything under <paramref name="RootPath"/> into the index located at
        /// <paramref name="INDEX_DIR"/>, optimizes it and closes the writer.
        /// </summary>
        /// <param name="RootPath">File or directory to index.</param>
        /// <param name="INDEX_DIR">Location of the index on disk.</param>
        /// <param name="minMergeDocs">Value assigned to <c>writer.minMergeDocs</c>.</param>
        /// <param name="mergeFactor">Value assigned to <c>writer.mergeFactor</c>.</param>
        /// <returns>
        /// True when indexing completed; false when the source path does not exist,
        /// <paramref name="INDEX_DIR"/> is null, or an <see cref="IOException"/> occurred.
        /// </returns>
        public static bool BuildIndex(string RootPath, System.IO.FileInfo INDEX_DIR, int minMergeDocs, int mergeFactor)
        {
            FileInfo docDir = new FileInfo(RootPath);

            // The source may be either a single file or a directory.
            if (!File.Exists(docDir.FullName) && !Directory.Exists(docDir.FullName))
            {
                Console.Out.WriteLine("Document directory '" + docDir.FullName + "' does not exist or is not readable, please check the path");
                return false;
            }

            if (INDEX_DIR == null)
            {
                Console.Out.WriteLine("No index directory was specified");
                return false;
            }

            DateTime start = DateTime.UtcNow;
            IndexWriter writer = null;
            try
            {
                // Open the index at the caller-supplied location rather than a fixed path.
                writer = new IndexWriter(INDEX_DIR.FullName, new ChineseAnalyzer(), b_create);
                writer.minMergeDocs = minMergeDocs;
                writer.mergeFactor = mergeFactor;

                IndexDirectory(writer, docDir);

                Console.Out.Write("optimizing...");
                writer.Optimize();
                Console.Out.WriteLine("done!");
                writer.Close();
                writer = null; // closed cleanly; nothing left for the finally block

                // Ticks are 100 ns units, so convert before labelling the value as milliseconds.
                long elapsedMs = (DateTime.UtcNow.Ticks - start.Ticks) / TimeSpan.TicksPerMillisecond;
                Console.Out.WriteLine(elapsedMs + " total milliseconds");
                return true;
            }
            catch (IOException e)
            {
                Console.Out.WriteLine(" caught a " + e.GetType() + "\n with message: " + e.Message);
                return false;
            }
            finally
            {
                // Reached with a live writer only on failure: close it so it is not left open.
                if (writer != null)
                {
                    try
                    {
                        writer.Close();
                    }
                    catch (IOException closeError)
                    {
                        Console.Out.WriteLine(" could not close the index writer: " + closeError.Message);
                    }
                }
            }
        }

        /// <summary>
        /// Recursively walks <paramref name="file"/>: directories are descended into,
        /// anything else is handed to <see cref="IndexFile"/>.
        /// </summary>
        /// <param name="writer">Open index writer that receives the documents.</param>
        /// <param name="file">File or directory to process.</param>
        public static void IndexDirectory(IndexWriter writer, System.IO.FileInfo file)
        {
            if (!Directory.Exists(file.FullName))
            {
                IndexFile(file, writer);
                return;
            }

            // Entries include both files and sub-directories; recurse into each one.
            foreach (string entry in Directory.GetFileSystemEntries(file.FullName))
            {
                IndexDirectory(writer, new FileInfo(entry));
            }
        }

        /// <summary>
        /// Reads one file as gb2312 text, builds a document with "url", "size", "title"
        /// and "contents" fields and adds it to the index. Every <see cref="OPP"/> files
        /// the index is optimized.
        /// </summary>
        /// <param name="file">File to index.</param>
        /// <param name="writer">Open index writer that receives the document.</param>
        private static void IndexFile(System.IO.FileInfo file, IndexWriter writer)
        {
            try
            {
                string html;
                using (StreamReader reader = new StreamReader(file.FullName, System.Text.Encoding.GetEncoding("gb2312")))
                {
                    html = reader.ReadToEnd();
                }

                Console.Out.WriteLine("正在建立索引" + file.FullName);
                i++;
                Console.Out.WriteLine(i);

                // The lib helpers are declared outside this file (presumably in htmlparser).
                Document doc = new Document();
                doc.Add(Field.Keyword("url", lib.GetURL(file.FullName)));
                doc.Add(Field.Keyword("size", lib.GetSize(file.FullName)));
                doc.Add(Field.Keyword("title", lib.GetStringTitle(html)));
                html = lib.HtmlFilter(html);
                doc.Add(Field.Text("contents", html));
                writer.AddDocument(doc);

                // OPP comes from user input; guard against a zero or negative interval.
                if (OPP > 0 && i % OPP == 0)
                {
                    Console.WriteLine("Optimize been indexed files...");
                    writer.Optimize();
                }
            }
            catch (IOException ex)
            {
                // Best effort: one failing file must not abort the whole run, but say so.
                Console.Out.WriteLine("Skipping '" + file.FullName + "': " + ex.Message);
            }
        }
    }
}
解决方案 »
免费领取超大流量手机卡,每月29元包185G流量+100分钟通话, 中国电信官方发货