本帖最后由 Erric77 于 2010-04-22 15:24:54 编辑

解决方案 »

  1.   

    HTML 和 XML 类似,都是由标签组成的文本。如果你只是要获取其中的文本信息,可以用正则把所有 <… />、<…> 这类标签内容替换掉。
      

  2.   

    如果不用登录的话,可以直接这样:

    System.Net.HttpWebRequest hwr = (System.Net.HttpWebRequest)System.Net.WebRequest.Create("请求页面");
    System.IO.StreamReader sr = new System.IO.StreamReader(hwr.GetResponse().GetResponseStream(), Encoding.Default);
    string str =  sr.ReadToEnd();

            #region 过滤掉 html代码
            public static string StripHTML(string strHtml)
            {
                string[] aryReg ={ 
                            @"<script[^>]*?>.*?</script>",                         @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", 
                            @"([\r\n])[\s]+", 
                            @"&(quot|#34);", 
                            @"&(amp|#38);", 
                            @"&(lt|#60);", 
                            @"&(gt|#62);", 
                            @"&(nbsp|#160);", 
                            @"&(iexcl|#161);", 
                            @"&(cent|#162);", 
                            @"&(pound|#163);", 
                            @"&(copy|#169);", 
                            @"&#(\d+);", 
                            @"-->", 
                            @"<!--.*\n" 
                            };                                    string[] aryRep = { 
                            "", 
                            "", 
                            "", 
                            "\"", 
                            "&", 
                            "<", 
                            ">", 
                            " ", 
                            "\xa1",//chr(161), 
                            "\xa2",//chr(162), 
                            "\xa3",//chr(163), 
                            "\xa9",//chr(169), 
                            "", 
                            "\r\n", 
                            "" 
                            };            string newReg = aryReg[0];
                string strOutput = strHtml;
                for (int i = 0; i < aryReg.Length; i++)
                {
                    System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex(aryReg[i], System.Text.RegularExpressions.RegexOptions.IgnoreCase);
                    strOutput = regex.Replace(strOutput, aryRep[i]);
                }
                strOutput.Replace("<", "");
                strOutput.Replace(">", "");
                strOutput.Replace("\r\n", "");
                return strOutput;
            }
            #endregion

    这样就成了纯文本信息。