哪位高手使用 C# 做过抓取网页中栏目的链接和名称?请指导一下,有什么方法可以定位到网页中的栏目位置?

解决方案 »

  1.   


    /// <summary>
    /// Reads a web page with an HTTP GET request and returns its body as text.
    /// </summary>
    /// <param name="uri">Address of the page to download.</param>
    /// <returns>
    /// The response body when the server answers 200 OK; an empty string when the
    /// server answers 404 Not Found; <c>null</c> for any other non-error status
    /// (redirects are not followed because AllowAutoRedirect is false).
    /// </returns>
    /// <exception cref="WebException">
    /// The request failed for any reason other than a 404 response.
    /// </exception>
    public string ReadHtml(string uri)
    {
        string retVal = null;
        try
        {
            // Direct cast: a non-HTTP URI fails here with a clear InvalidCastException
            // instead of a NullReferenceException on the next line.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(uri);
            request.Method = "GET";
            request.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0)";
            request.AllowAutoRedirect = false;

            // Dispose the response so the underlying connection is released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode == HttpStatusCode.OK)
                {
                    Console.Write("正在读取 {0}  页面         ", uri);
                    using (Stream body = response.GetResponseStream())
                    using (StreamReader sReader = new StreamReader(body, Encoding.Default))
                    {
                        retVal = sReader.ReadToEnd();
                    }
                    Console.WriteLine(response.StatusDescription);
                }
            }
        }
        catch (WebException ex)
        {
            // Response can be null (or not an HTTP response), so check before use.
            HttpWebResponse res = ex.Response as HttpWebResponse;
            bool notFound = ex.Status == WebExceptionStatus.ProtocolError
                            && res != null
                            && res.StatusCode == HttpStatusCode.NotFound;
            if (!notFound)
            {
                // "throw;" keeps the original stack trace; "throw ex;" would reset it.
                throw;
            }

            // A missing page is reported as empty content rather than an error.
            res.Close();
            retVal = "";
        }
        return retVal;
    }
    // Then apply a regular expression, e.g. to capture <a href="www.csdn.com">csdn</a>.
    // `text` is expected to hold the page HTML (presumably the value returned by ReadHtml).
    // Inside a verbatim (@"...") string a double quote is written as "" -- a \" would
    // terminate the literal and break compilation.
    string pattern = @"<a\s+href=""([^""]+)"">([^<]+)</a>";
    Match match = Regex.Match(text, pattern);

    // Group 1 is the link target and group 2 the visible link text; both are empty
    // strings when nothing matched. The text result is named linkText so it does not
    // clash with the `text` input used above.
    string href = match.Groups[1].Value;
    string linkText = match.Groups[2].Value;
      

  2.   

    用 WebBrowser。不会的话,到 C#-Home 群里问。