获取了如下链接:
http://www.seckeep.com/
http://www.seckeep.com/
http://www.seckeep.com/?post=2
http://www.seckeep.com/?post=5
http://www.seckeep.com/?author=1
http://www.seckeep.com/?tag=%E4%BF%9D%E6%8C%81%E5%AE%89%E5%85%A8
http://www.seckeep.com/?tag=%E5%8F%A4%E5%85%B8%E9%BB%91%E5%AE%A2
http://www.seckeep.com/?post=168#comment
http://www.seckeep.com/?tag=%E7%BD%91%E7%BB%9C%E5%AE%89%E5%85%A8
http://www.seckeep.com/test3/?tag=%E7%BD%91%E7%BB%9C%E5%AE%89%E5%85%A8
http://www.seckeep.com/test1/?tag=%E7%BD%91%E7%BB%9C%E5%AE%89%E5%85%A8
http://www.seckeep.com/test2/?tag=%E7%BD%91%E7%BB%9C%E5%AE%89%E5%85%A8
http://123.seckeep.com/?post=168#comment
http://456.seckeep.com/?post=168#comment

如何剔除掉相同的链接名称?显然,把上面的链接存入数组再调用 text_list1.Distinct().ToArray(); 是不合适的。我的想法是:链接中为 post 的文件只取一次,tag 也只取一次。当然还要考虑目录,比如 test1、test2、test3 都包含 tag,在这样的情况下我都必须收录。还要考虑域名,如 123.seckeep.com 和 456.seckeep.com 中的 post 文件,我也许都需要收录。请问有什么好的方法吗?

解决方案 »

  1.   

    http://www.a.com/testweb/default.aspx?id=1
    Request.ApplicationPath: /testweb
    Request.CurrentExecutionFilePath: /testweb/default.aspx
    Request.FilePath: /testweb/default.aspx
    Request.Path: /testweb/default.aspx
    Request.RawUrl: /testweb/default.aspx?id=1
    Request.Url.AbsolutePath: /testweb/default.aspx
    Request.Url.LocalPath: /testweb/default.aspx 
    把获取到的数据保存到 Dictionary 中,再用 ContainsKey 判断是否已存在。
     
      

  2.   

    错误 1 “System.Uri”不包含“ApplicationPath”的定义,并且找不到可接受类型为“System.Uri”的第一个参数的扩展方法“ApplicationPath”(是否缺少 using 指令或程序集引用?) C:\Users\yby\Desktop\WindowsFormsApplication2\Form1.cs 152 34 WindowsFormsApplication2
      

  3.   

                     HttpRequest request =new HttpRequest("",http://topic.csdn.net/u/20110302/09/1a.php,"");
                   
                    string link=request.ApplicationPath;
                    string aa = request.Path;
                    contents.Add(link);
                    contents.Add(aa);

    但是 link 为 NULL,而 aa 里面的内容是期望的!这是为什么呢?
      

  4.   

    http://www.xinfengit.com/200907/1566217.html
      

  5.   


    using System;
    using System.Collections;
    using System.Collections.Generic;
    using System.IO;
    using System.Net;
    using System.Text;
    using System.Text.RegularExpressions;
    using System.Xml;

    /// <summary>
    /// Console tool that downloads a web page, extracts the distinct http:// links
    /// found in its HTML and writes them to HyperLinks.xml.
    /// </summary>
    public class App
    {
        // Patterns are compiled once and reused instead of being rebuilt on every call.
        private static readonly Regex HyperLinkRegex =
            new Regex(@"http://([\w-]+\.)+[\w-]+(/[\w- ./?%&=]*)?", RegexOptions.IgnoreCase);

        private static readonly Regex DomainSuffixRegex =
            new Regex(@"(\.com/|\.net/|\.cn/|\.org/|\.gov/)", RegexOptions.IgnoreCase);

        private static readonly Regex DotOrTrailingSlashRegex = new Regex(@"\.|/$");

        /// <summary>
        /// Prompts for a page address, downloads the page, extracts its links and
        /// writes them to HyperLinks.xml.
        /// </summary>
        public static void Main()
        {
            Console.Write("请输入一个网页地址:");
            string strURL = Console.ReadLine();

            // ReadLine returns null at end of input; an empty address cannot be fetched either.
            if (strURL == null || strURL.Trim().Length == 0)
            {
                Console.WriteLine("未输入网页地址。");
                return;
            }

            strURL = strURL.Trim();

            // StartsWith is safe for inputs shorter than the prefix (Substring(0, 7) threw),
            // and an address that already carries a scheme must not get a second one.
            if (!strURL.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                && !strURL.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                strURL = "http://" + strURL;
            }

            Console.WriteLine("正在获取页面代码,请稍侯...");
            string strCode = GetPageSource(strURL);

            Console.WriteLine("正在提取超链接,请稍侯...");
            ArrayList alLinks = GetHyperLinks(strCode);

            Console.WriteLine("正在写入文件,请稍侯...");
            WriteToXml(strURL, alLinks);
        }

        /// <summary>
        /// Downloads the HTML source of the given page.
        /// </summary>
        /// <param name="URL">Absolute address of the page to download.</param>
        /// <returns>The response body decoded as GB2312 text.</returns>
        static string GetPageSource(string URL)
        {
            Uri uri = new Uri(URL);
            HttpWebRequest hwReq = (HttpWebRequest)WebRequest.Create(uri);

            // Request settings must be applied before GetResponse() sends the request;
            // the original assigned them afterwards, too late to have any effect.
            hwReq.Method = WebRequestMethods.Http.Get;
            hwReq.KeepAlive = false;

            // Dispose the response and the reader so the connection is released
            // even when reading fails.
            using (HttpWebResponse hwRes = (HttpWebResponse)hwReq.GetResponse())
            using (StreamReader reader = new StreamReader(
                hwRes.GetResponseStream(), System.Text.Encoding.GetEncoding("GB2312")))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Extracts the distinct http:// URLs that appear in a piece of HTML.
        /// </summary>
        /// <param name="htmlCode">HTML text to scan; null is treated as empty.</param>
        /// <returns>The distinct matched URLs, sorted with the default comparer.</returns>
        static ArrayList GetHyperLinks(string htmlCode)
        {
            ArrayList al = new ArrayList();
            if (string.IsNullOrEmpty(htmlCode))
            {
                return al;
            }

            // A hash set filters duplicates in O(1) per URL instead of rescanning the list.
            HashSet<string> seen = new HashSet<string>();
            foreach (Match match in HyperLinkRegex.Matches(htmlCode))
            {
                if (seen.Add(match.Value))
                {
                    al.Add(match.Value);
                }
            }

            al.Sort();
            return al;
        }

        /// <summary>
        /// Writes the extracted URLs to HyperLinks.xml, one element per URL named
        /// after the URL's domain suffix.
        /// </summary>
        /// <param name="strURL">Address of the page the links were taken from.</param>
        /// <param name="alHyperLinks">URLs to write; each item must be a string.</param>
        static void WriteToXml(string strURL, ArrayList alHyperLinks)
        {
            // The using block flushes and closes the file even if a write throws.
            using (XmlTextWriter writer = new XmlTextWriter("HyperLinks.xml", Encoding.UTF8))
            {
                writer.Formatting = Formatting.Indented;
                writer.WriteStartDocument(false);
                writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
                writer.WriteComment("提取自" + strURL + "的超链接");
                writer.WriteStartElement("HyperLinks");
                writer.WriteStartElement("HyperLinks", null);
                writer.WriteAttributeString("DateTime", DateTime.Now.ToString());

                foreach (string str in alHyperLinks)
                {
                    string title = GetDomain(str);
                    string body = str;
                    writer.WriteElementString(title, null, body);
                }

                writer.WriteEndElement();
                writer.WriteEndElement();
                writer.Flush();
            }
        }

        /// <summary>
        /// Gets the domain suffix of a URL.
        /// </summary>
        /// <param name="strURL">URL to inspect.</param>
        /// <returns>
        /// The first ".com/", ".net/", ".cn/", ".org/" or ".gov/" occurrence with the dot
        /// and trailing slash removed (case as written in the URL), or "other" when none
        /// is found.
        /// </returns>
        static string GetDomain(string strURL)
        {
            string retVal = DomainSuffixRegex.Match(strURL).Value;
            retVal = DotOrTrailingSlashRegex.Replace(retVal, "");
            if (retVal == "")
            {
                retVal = "other";
            }

            return retVal;
        }
    }