我做了一个蜘蛛程序,每次下载前先判断该URL是否已经下载过,我把下载过的URL存储到文件中(每个网站的URL存一个文件)。怎样才能实现快速的判断和存储
一下是我用文本存储:(缺点,速度很慢)
public static string[] GetNewUrl(string SiteID, string[] strUrls)
{
string path = "data\\" + SiteID;
if (!File.Exists(path))
return strUrls;
string[] reslutUrls = strUrls;
string strLine;
StreamReader sw = new StreamReader(path);
strLine = sw.ReadToEnd();
for (int i = 0; i < reslutUrls.Length; i++)
if (strLine.IndexOf(reslutUrls[i]) > -1)
reslutUrls[i] = "";
sw.Close();
return reslutUrls;
}
public static void WriteUrl(string SiteID, string[] strUrls)
{
string path = "data\\" + SiteID;
string str="";
StreamWriter sw = new StreamWriter(path, true);
for (int i = 0; i < strUrls.Length; i++)
str += strUrls[i]+"\r\n";
if(str!="")
sw.Write(str);
sw.Close();
}一下是用xml存储:(缺点,当多个文件同时读时,内存溢出。若改为单个文件读,速度慢) public static string[] GetExit(string SiteID, string[] strUrls)
{
string path = "data\\" + SiteID;
//if (!GetKeyword(path, strKeyword))
// DeleteXML(path);
string[] reslutUrls = strUrls;
//int n=0;
XmlDocument xDoc = new XmlDocument();
XmlNode xNode;
XmlElement xElem = null; try
{
if (!File.Exists(path))
return strUrls;
xDoc.Load(path);
xNode = xDoc.SelectSingleNode("//UrlSettings");
for (int i = 0; i < strUrls.Length; i++)
{
xElem = (XmlElement)xNode.SelectSingleNode("//web[@url=\"" + strUrls[i] + "\"]");
if (xElem != null)
{
reslutUrls[i] = "";
}
} }
catch (Exception ex)
{
WriteLog.writeTxt("exist" + path + ex.Message);
}
return reslutUrls; } public static void InsertUrl(string SiteID, string[] strUrls)
{
string path = "data\\" + SiteID;
XmlDocument xDoc = new XmlDocument();
try
{
if (!File.Exists(path))
CreateXML(path);
xDoc.Load(path);
XmlNode xNode;
XmlElement xElem1;
XmlElement xElem2;
xNode = xDoc.SelectSingleNode("//UrlSettings");
for (int i = 0; i < strUrls.Length; i++)
{
if (strUrls[i] == "" || strUrls[i] == null)
continue;
xElem1 = (XmlElement)xNode.SelectSingleNode("//web[@url=\"" + strUrls[i] + "\"]");
if (xElem1 == null)
{
xElem2 = xDoc.CreateElement("web");
xElem2.SetAttribute("url", strUrls[i]);
//xElem2.SetAttribute("keyword", strKeyword);
xNode.AppendChild(xElem2);
}
}
xDoc.Save(path);
}
catch (Exception e)
{
WriteLog.writeTxt("insert" + path + e.Message);
}
}有没有更好的方法?
一下是我用文本存储:(缺点,速度很慢)
public static string[] GetNewUrl(string SiteID, string[] strUrls)
{
string path = "data\\" + SiteID;
if (!File.Exists(path))
return strUrls;
string[] reslutUrls = strUrls;
string strLine;
StreamReader sw = new StreamReader(path);
strLine = sw.ReadToEnd();
for (int i = 0; i < reslutUrls.Length; i++)
if (strLine.IndexOf(reslutUrls[i]) > -1)
reslutUrls[i] = "";
sw.Close();
return reslutUrls;
}
public static void WriteUrl(string SiteID, string[] strUrls)
{
string path = "data\\" + SiteID;
string str="";
StreamWriter sw = new StreamWriter(path, true);
for (int i = 0; i < strUrls.Length; i++)
str += strUrls[i]+"\r\n";
if(str!="")
sw.Write(str);
sw.Close();
}一下是用xml存储:(缺点,当多个文件同时读时,内存溢出。若改为单个文件读,速度慢) public static string[] GetExit(string SiteID, string[] strUrls)
{
string path = "data\\" + SiteID;
//if (!GetKeyword(path, strKeyword))
// DeleteXML(path);
string[] reslutUrls = strUrls;
//int n=0;
XmlDocument xDoc = new XmlDocument();
XmlNode xNode;
XmlElement xElem = null; try
{
if (!File.Exists(path))
return strUrls;
xDoc.Load(path);
xNode = xDoc.SelectSingleNode("//UrlSettings");
for (int i = 0; i < strUrls.Length; i++)
{
xElem = (XmlElement)xNode.SelectSingleNode("//web[@url=\"" + strUrls[i] + "\"]");
if (xElem != null)
{
reslutUrls[i] = "";
}
} }
catch (Exception ex)
{
WriteLog.writeTxt("exist" + path + ex.Message);
}
return reslutUrls; } public static void InsertUrl(string SiteID, string[] strUrls)
{
string path = "data\\" + SiteID;
XmlDocument xDoc = new XmlDocument();
try
{
if (!File.Exists(path))
CreateXML(path);
xDoc.Load(path);
XmlNode xNode;
XmlElement xElem1;
XmlElement xElem2;
xNode = xDoc.SelectSingleNode("//UrlSettings");
for (int i = 0; i < strUrls.Length; i++)
{
if (strUrls[i] == "" || strUrls[i] == null)
continue;
xElem1 = (XmlElement)xNode.SelectSingleNode("//web[@url=\"" + strUrls[i] + "\"]");
if (xElem1 == null)
{
xElem2 = xDoc.CreateElement("web");
xElem2.SetAttribute("url", strUrls[i]);
//xElem2.SetAttribute("keyword", strKeyword);
xNode.AppendChild(xElem2);
}
}
xDoc.Save(path);
}
catch (Exception e)
{
WriteLog.writeTxt("insert" + path + e.Message);
}
}有没有更好的方法?
然后搜索的时候,逐个搜索