// BuildIndex
using System;
using System.Data;
using System.IO;
using System.Text.RegularExpressions;
using Lucene.Net.Documents;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers;
using Lucene.Net.Search;
using htmlparser;
using ChineseAnalyzer = Lucene.Net.Analysis.China.ChineseAnalyzer;
using IndexWriter = Lucene.Net.Index.IndexWriter;
namespace Lucene.Net.Demo
{
    /// <summary>
    /// Console tool that walks a directory tree and adds every file it finds to a
    /// Lucene index analyzed with <c>ChineseAnalyzer</c>. All settings are read
    /// interactively from standard input.
    /// </summary>
    class IndexFiles
    {
        /// <summary>Number of files handed to <see cref="IndexFile"/> so far.</summary>
        internal static int i = 0;

        /// <summary>Passed as the "create" flag to the <c>IndexWriter</c> constructor.</summary>
        internal static bool b_create = false;

        /// <summary>
        /// The index is optimized every <c>OPP</c> files. Values less than or equal
        /// to zero disable the periodic optimization.
        /// </summary>
        internal static int OPP = 800000;

        /// <summary>
        /// Prompts for the source directory, the index path and the tuning values,
        /// then runs <see cref="BuildIndex"/> and reports the outcome.
        /// </summary>
        /// <param name="argv">Command-line arguments; not used.</param>
        [STAThread]
        public static void Main(System.String[] argv)
        {
            // One reader serves every prompt. Mixing several buffered readers (or
            // Console.ReadLine) on the same stdin stream can lose lines when the
            // input is redirected, because each reader buffers ahead independently.
            // NOTE(review): stdin is decoded as UTF-8 as in the original code; confirm
            // this matches the console code page when paths contain non-ASCII text.
            TextReader input = new StreamReader(
                Console.OpenStandardInput(),
                System.Text.Encoding.GetEncoding("UTF-8"));

            Console.Out.Write("The Dir u wanna create the INDX: ");
            string dir = input.ReadLine();

            Console.Out.Write("The INDX saved path: ");
            string indxPath = input.ReadLine();

            if (string.IsNullOrEmpty(dir) || string.IsNullOrEmpty(indxPath))
            {
                Console.Out.WriteLine("The document directory and the index path must not be empty.");
                return;
            }

            int minMergeDocs;
            if (!TryPromptInt(input, "minMergedocs(当内存中文档达到多少的时候才写入文件): ", out minMergeDocs))
            {
                return;
            }

            int mergeFactor;
            if (!TryPromptInt(input, "mergeFactor(控制segment合并的频率和大小): ", out mergeFactor))
            {
                return;
            }

            int optimizeEvery;
            if (!TryPromptInt(input, "输入索引多少个文件优化一次: ", out optimizeEvery))
            {
                return;
            }
            OPP = optimizeEvery;

            Console.Out.Write("是否重新建立数据库Y/N: ");
            if (input.ReadLine() == "Y")
            {
                b_create = true;
            }

            // Refuse to overwrite an existing index when a rebuild was requested.
            if (b_create && Directory.Exists(indxPath))
            {
                Console.WriteLine("重新建立数据库失败,指定路径存在数据库。");
                Console.WriteLine("请先删除它或指定另一数据库存放目录。");
                return;
            }

            FileInfo indexDir = new FileInfo(indxPath);

            Console.Out.WriteLine("Start to build the INDX...");
            Console.Out.WriteLine("=====================================================================");

            if (BuildIndex(dir, indexDir, minMergeDocs, mergeFactor))
            {
                Console.WriteLine("索引成功!");
            }
            else
            {
                Console.Out.WriteLine("索引失败!");
            }

            Console.WriteLine("完成!请关闭程序!");
            input.ReadLine(); // keep the console window open until the user presses Enter
        }

        /// <summary>
        /// Writes <paramref name="prompt"/> and reads one integer from <paramref name="input"/>.
        /// </summary>
        /// <param name="input">Reader the answer is taken from.</param>
        /// <param name="prompt">Text shown before reading.</param>
        /// <param name="value">Receives the parsed number, or 0 on failure.</param>
        /// <returns>
        /// <c>true</c> if a valid integer was read; <c>false</c> (after printing a
        /// message) on end of input or when the text is not an integer.
        /// </returns>
        private static bool TryPromptInt(TextReader input, string prompt, out int value)
        {
            Console.Out.Write(prompt);
            string line = input.ReadLine();
            if (int.TryParse(line, out value))
            {
                return true;
            }

            Console.Out.WriteLine("Invalid number: '" + line + "'");
            return false;
        }

        /// <summary>
        /// Indexes everything under <paramref name="RootPath"/> into the index located
        /// at <paramref name="INDEX_DIR"/>, optimizes the index and closes it.
        /// </summary>
        /// <param name="RootPath">File or directory to index.</param>
        /// <param name="INDEX_DIR">Location of the index; its full path is given to the writer.</param>
        /// <param name="minMergeDocs">Value assigned to <c>IndexWriter.minMergeDocs</c>.</param>
        /// <param name="mergeFactor">Value assigned to <c>IndexWriter.mergeFactor</c>.</param>
        /// <returns>
        /// <c>true</c> when indexing completed; <c>false</c> when the source path does
        /// not exist or an <see cref="IOException"/> occurred.
        /// </returns>
        public static bool BuildIndex(string RootPath, System.IO.FileInfo INDEX_DIR, int minMergeDocs, int mergeFactor)
        {
            FileInfo docDir = new FileInfo(RootPath);

            // Check that the path to be indexed exists (either as a file or as a directory).
            bool sourceExists = File.Exists(docDir.FullName) || Directory.Exists(docDir.FullName);
            if (!sourceExists)
            {
                Console.Out.WriteLine("Document directory '" + docDir.FullName + "' does not exist or is not readable, please check the path");
                return false;
            }

            DateTime start = DateTime.UtcNow;
            IndexWriter writer = null;
            try
            {
                // Use the caller-supplied index location instead of a hard-coded path.
                writer = new IndexWriter(INDEX_DIR.FullName, new ChineseAnalyzer(), b_create);
                writer.minMergeDocs = minMergeDocs;
                writer.mergeFactor = mergeFactor;

                IndexDirectory(writer, docDir);

                Console.Out.Write("optimizing...");
                writer.Optimize();
                Console.Out.WriteLine("done!");

                writer.Close();
                writer = null; // closed successfully; nothing left for the finally block

                // Ticks are 100 ns units, so convert through TimeSpan to get real milliseconds.
                TimeSpan elapsed = DateTime.UtcNow - start;
                Console.Out.WriteLine((long)elapsed.TotalMilliseconds + " total milliseconds");
                return true;
            }
            catch (IOException e)
            {
                Console.Out.WriteLine(" caught a " + e.GetType() + "\n with message: " + e.Message);
                return false;
            }
            finally
            {
                if (writer != null)
                {
                    try
                    {
                        writer.Close();
                    }
                    catch (IOException)
                    {
                        // A failure is already being reported or propagated; a second
                        // error while closing must not mask it.
                    }
                }
            }
        }

        /// <summary>
        /// Recursively visits <paramref name="file"/>: directories are descended into,
        /// anything else is passed to <see cref="IndexFile"/>.
        /// </summary>
        /// <param name="writer">Open index writer that receives the documents.</param>
        /// <param name="file">File or directory to process.</param>
        public static void IndexDirectory(IndexWriter writer, System.IO.FileInfo file)
        {
            if (!Directory.Exists(file.FullName))
            {
                IndexFile(file, writer);
                return;
            }

            // Entries include both files and sub-directories; recurse into each one.
            foreach (string entry in Directory.GetFileSystemEntries(file.FullName))
            {
                IndexDirectory(writer, new FileInfo(entry));
            }
        }

        /// <summary>
        /// Reads one file as gb2312 text, builds a document with the fields
        /// <c>url</c>, <c>size</c>, <c>title</c> and <c>contents</c>, and adds it to the index.
        /// Every <see cref="OPP"/> files the index is optimized.
        /// </summary>
        /// <remarks>
        /// I/O errors are reported and the file is skipped so that one unreadable file
        /// does not abort the whole run. The field values come from the <c>lib</c>
        /// helpers, which are defined outside this file.
        /// </remarks>
        /// <param name="file">File to index.</param>
        /// <param name="writer">Open index writer that receives the document.</param>
        private static void IndexFile(System.IO.FileInfo file, IndexWriter writer)
        {
            try
            {
                string html;
                // The using statement closes the reader even when ReadToEnd throws.
                using (StreamReader reader = new StreamReader(file.FullName, System.Text.Encoding.GetEncoding("gb2312")))
                {
                    html = reader.ReadToEnd();
                }

                Document doc = new Document();
                Console.Out.WriteLine("正在建立索引" + file.FullName);
                i++;
                Console.Out.WriteLine(i);

                doc.Add(Field.Keyword("url", lib.GetURL(file.FullName)));
                doc.Add(Field.Keyword("size", lib.GetSize(file.FullName)));
                // The title is taken from the raw text, before HtmlFilter is applied.
                doc.Add(Field.Keyword("title", lib.GetStringTitle(html)));
                html = lib.HtmlFilter(html);
                doc.Add(Field.Text("contents", html));

                writer.AddDocument(doc);

                // OPP comes from user input; guard against 0 to avoid DivideByZeroException.
                if (OPP > 0 && i % OPP == 0)
                {
                    Console.WriteLine("Optimize been indexed files...");
                    writer.Optimize();
                }
            }
            catch (System.IO.IOException fnfe)
            {
                // Best effort: report the problem and continue with the next file.
                Console.Out.WriteLine(" skipped '" + file.FullName + "': " + fnfe.Message);
            }
        }
    }
}