求 C# 过滤 HTML 的方法

求达人解决如何用Ｃ#　过滤垃圾html代码。　网上出现的几个方法，如以如下代码为代表的类似代码就不要贴出来了public string checkStr(string html)
      {
          // Runs the input through nine regex replacements followed by three literal
          // string replacements and returns the result.
          // All patterns are case-insensitive. The [\s\S]+ / [\s\S]* quantifiers are greedy,
          // so a single match extends from the first opening text to the LAST closing text
          // found in the input.
          // A new Regex instance is constructed for every pattern on every call.
          System.Text.RegularExpressions.Regex regex1 = new System.Text.RegularExpressions.Regex(@"<script[\s\S]+</script *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
          System.Text.RegularExpressions.Regex regex2 = new System.Text.RegularExpressions.Regex(@" href *= *[\s\S]*script *:", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
          // NOTE(review): the pattern below reads " no", while the comment on its Replace call
          // talks about on... event handlers -- it looks like it was meant to be " on"; confirm.
          System.Text.RegularExpressions.Regex regex3 = new System.Text.RegularExpressions.Regex(@" no[\s\S]*=", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
          System.Text.RegularExpressions.Regex regex4 = new System.Text.RegularExpressions.Regex(@"<iframe[\s\S]+</iframe *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
          System.Text.RegularExpressions.Regex regex5 = new System.Text.RegularExpressions.Regex(@"<frameset[\s\S]+</frameset *>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
          System.Text.RegularExpressions.Regex regex6 = new System.Text.RegularExpressions.Regex(@"\<img[^\>]+\>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
          System.Text.RegularExpressions.Regex regex7 = new System.Text.RegularExpressions.Regex(@"</p>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
          System.Text.RegularExpressions.Regex regex8 = new System.Text.RegularExpressions.Regex(@"<p>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
          System.Text.RegularExpressions.Regex regex9 = new System.Text.RegularExpressions.Regex(@"<[^>]*>", System.Text.RegularExpressions.RegexOptions.IgnoreCase);
          html = regex1.Replace(html, ""); // remove <script>...</script> blocks
          html = regex2.Replace(html, ""); // remove href=...script: text (javascript: links on <a>)
          html = regex3.Replace(html, " _disibledevent="); // replace each regex3 match with " _disibledevent=" (intended for on... event attributes of other elements)
          html = regex4.Replace(html, ""); // remove <iframe>...</iframe> blocks
          html = regex5.Replace(html, ""); // remove <frameset>...</frameset> blocks
          html = regex6.Replace(html, ""); // remove <img ...> tags
          html = regex7.Replace(html, ""); // remove </p> closing tags
          html = regex8.Replace(html, ""); // remove <p> opening tags
          // Remove every remaining <...> tag.
          html = regex9.Replace(html, "");
          // Remove every occurrence of the quoted character.
          // NOTE(review): it appears as a plain space here; it may have been "&nbsp;" before
          // the page rendered it -- confirm against the original code.
          html = html.Replace(" ", "");
          // NOTE(review): these two replacements look redundant, because regex9 above has
          // already removed every <...> tag, including <strong> and </strong>.
          html = html.Replace("</strong>", "");
          html = html.Replace("<strong>", "");
          return html;
}类似如上的效率实在不敢恭维，而且还过滤不严。　求解了。

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

过滤垃圾html?
哪些是垃圾html，什么标签？
UP

>--------------强烈要求论坛版面改回去!--------------<
>--------------我要赚好多的分，给我的小弟弟买糖吃!--------------<
  /// <summary>
        /// Extracts text from HTML by deleting every tag except <c>br</c>, <c>p</c> and
        /// <c>img</c> (opening, closing and self-closing forms, in any letter case).
        /// Only the tags themselves are deleted; the text between them is left in place.
        /// </summary>
        /// <param name="HTML">The HTML fragment to strip. May be null or empty.</param>
        /// <returns>
        /// The input with all other tags removed, or an empty string when
        /// <paramref name="HTML"/> is null or empty.
        /// </returns>
        public static string GetTextFromHTML(string HTML)
        {
            if (string.IsNullOrEmpty(HTML))
            {
                return string.Empty;
            }

            // The negative lookahead keeps a tag only when its name is exactly br, p or img:
            // the inner (?=[\s/>]) requires the name to end right there, so <pre>, <param>
            // and other tags that merely start with a kept name are stripped as well.
            // The optional "/" sits inside the lookahead so that closing tags such as </p>
            // are recognised as kept tags too.
            // CultureInvariant keeps case-insensitive matching independent of the current
            // culture (for example the Turkish dotted/dotless i in "img").
            return Regex.Replace(
                HTML,
                @"<(?!/?(?:br|p|img)(?=[\s/>]))[^>]*>",
                "",
                RegexOptions.IgnoreCase | RegexOptions.CultureInvariant);
        }

        /// <summary>
        /// Defuses unsafe-looking HTML by editing the text rather than parsing it.
        /// First, the leading "o" is dropped from any name that starts with "o", directly
        /// follows "&lt;" or whitespace, and is followed by "=" (so <c>onclick=</c> becomes
        /// <c>nclick=</c>). Second, a "." is inserted after the words script, frame, form,
        /// meta, behavior and style whenever they are followed by whitespace, ":" or "&gt;"
        /// (so <c>&lt;script&gt;</c> becomes <c>&lt;script.&gt;</c>).
        /// </summary>
        /// <remarks>
        /// This is a textual heuristic, not an HTML parser: it also rewrites ordinary text
        /// that happens to match (for example "form " becomes "form. "), and it should not be
        /// relied on as the only defence against script injection.
        /// </remarks>
        /// <param name="content">The HTML fragment to filter. May be null or empty.</param>
        /// <returns>
        /// The filtered text, or an empty string when <paramref name="content"/> is null or empty.
        /// </returns>
        public static string RemoveUnsafeHtml(string content)
        {
            if (string.IsNullOrEmpty(content))
            {
                return string.Empty;
            }

            const RegexOptions Options = RegexOptions.IgnoreCase | RegexOptions.CultureInvariant;

            // \s* (rather than a single optional space) so that "onclick   =" with any
            // amount of whitespace before the "=" is caught as well.
            content = Regex.Replace(content, @"(\<|\s+)o([a-z]+\s*=)", "$1$2", Options);

            // The whole run of separator characters is captured and written back, so no
            // character of the input is dropped; "|" is not a separator and is left untouched.
            content = Regex.Replace(content, @"(script|frame|form|meta|behavior|style)([\s:>]+)", "$1.$2", Options);

            return content;
        }
我觉得应该用DOM，然后提取InnerText
HTML语言的复杂度要高于正则表达式语言，所以不可能用正则表达式解析所有HTML。
鉴于你对正则表达式的效率都不满意，一个完整的HTML解析引擎更加不可能满足你的需求。你的需求不现实，和你的用户重新讨论一下。