我的方法,无法过滤掉javascrip标记,有些有用的文字也被误删了.代码如下:
//过滤掉html代码 去除HTML标记
public static string ClearHtmlCode(string txtHtml)
{
txtHtml = txtHtml.Trim();
if (string.IsNullOrEmpty(txtHtml))
return string.Empty; //去掉多余的空格
txtHtml = txtHtml.Replace(" ", ""); //将两个半角空格替换为空
txtHtml = txtHtml.Replace(" ", ""); //将一个全角空格替换为空 ////删除空白行
//txtHtml = txtHtml.Replace("\r\n", ""); //过滤掉<span></span>标签
txtHtml = Regex.Replace(txtHtml, @"<span[^<]*</span>", "", RegexOptions.IgnoreCase);
//txtHtml = Regex.Replace(txtHtml, @"<span[^>]*?>.*?</span>", "", RegexOptions.IgnoreCase); ////过滤掉<span></span>标签
//Regex span = new Regex(@"<span[^<]*</span>", RegexOptions.IgnoreCase);
//MatchCollection spanColl = span.Matches(txtHtml);
////检索span标签到arraylist
//ArrayList spankey = new ArrayList();
//for (int i = 0; i < spanColl.Count; i++)
//{
// spankey.Add(spanColl[i].Value);
// //System.Windows.Forms.MessageBox.Show(spanColl[i].Value.ToString());
//}
////过滤span
//for (int i = 0; i < spankey.Count; i++)
//{
// txtHtml = txtHtml.Replace(spankey[i].ToString(), "");
// //System.Windows.Forms.MessageBox.Show(spankey[i].ToString());
//} //过滤掉style
Regex style = new Regex(@"<style[^<]*</style>", RegexOptions.IgnoreCase);
MatchCollection styleColl = style.Matches(txtHtml);
//检索STYLE脚本到arraylist
ArrayList stylekey = new ArrayList();
for (int i = 0; i < styleColl.Count; i++)
{
stylekey.Add(styleColl[i].Value);
}
//过滤style
for (int i = 0; i < stylekey.Count; i++)
{
txtHtml = txtHtml.Replace(stylekey[i].ToString(), "");
} //过滤掉javascript脚本
Regex java = new Regex(@"<script[^<]*</script>", RegexOptions.IgnoreCase);
MatchCollection javaColl = java.Matches(txtHtml);
//检索javascript脚本到arraylist
ArrayList javakey = new ArrayList();
for (int i = 0; i < javaColl.Count; i++)
{
javakey.Add(javaColl[i].Value);
}
//过滤java
for (int i = 0; i < javakey.Count; i++)
{
txtHtml = txtHtml.Replace(javakey[i].ToString(), "");
} //过滤掉常规html标签
Regex r = new Regex(@"<[^<]*>", RegexOptions.IgnoreCase); //定义一个Regex对象实例
MatchCollection mc = r.Matches(txtHtml);
//检索常规html符号集合到arraylist列表
ArrayList wordkey = new ArrayList();
for (int i = 0; i < mc.Count; i++) //在输入字符串中找到所有匹配
{
wordkey.Add(mc[i].Value);
}
//过滤常规html
for (int i = 0; i < wordkey.Count; i++)
{
txtHtml = txtHtml.Replace(wordkey[i].ToString(), "");
}
//--------------------------------- //删除脚本
txtHtml = Regex.Replace(txtHtml, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase); //删除HTML
//txtHtml = Regex.Replace(txtHtml, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase); //此处代码删除空白行,但导致段落混乱
txtHtml = Regex.Replace(txtHtml, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"-->", "", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"<!--.*", "", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&#(\d+);", "", RegexOptions.IgnoreCase);
return txtHtml;
}-----------------------此100分只给完全解决者!
//过滤掉html代码 去除HTML标记
public static string ClearHtmlCode(string txtHtml)
{
txtHtml = txtHtml.Trim();
if (string.IsNullOrEmpty(txtHtml))
return string.Empty; //去掉多余的空格
txtHtml = txtHtml.Replace(" ", ""); //将两个半角空格替换为空
txtHtml = txtHtml.Replace(" ", ""); //将一个全角空格替换为空 ////删除空白行
//txtHtml = txtHtml.Replace("\r\n", ""); //过滤掉<span></span>标签
txtHtml = Regex.Replace(txtHtml, @"<span[^<]*</span>", "", RegexOptions.IgnoreCase);
//txtHtml = Regex.Replace(txtHtml, @"<span[^>]*?>.*?</span>", "", RegexOptions.IgnoreCase); ////过滤掉<span></span>标签
//Regex span = new Regex(@"<span[^<]*</span>", RegexOptions.IgnoreCase);
//MatchCollection spanColl = span.Matches(txtHtml);
////检索span标签到arraylist
//ArrayList spankey = new ArrayList();
//for (int i = 0; i < spanColl.Count; i++)
//{
// spankey.Add(spanColl[i].Value);
// //System.Windows.Forms.MessageBox.Show(spanColl[i].Value.ToString());
//}
////过滤span
//for (int i = 0; i < spankey.Count; i++)
//{
// txtHtml = txtHtml.Replace(spankey[i].ToString(), "");
// //System.Windows.Forms.MessageBox.Show(spankey[i].ToString());
//} //过滤掉style
Regex style = new Regex(@"<style[^<]*</style>", RegexOptions.IgnoreCase);
MatchCollection styleColl = style.Matches(txtHtml);
//检索STYLE脚本到arraylist
ArrayList stylekey = new ArrayList();
for (int i = 0; i < styleColl.Count; i++)
{
stylekey.Add(styleColl[i].Value);
}
//过滤style
for (int i = 0; i < stylekey.Count; i++)
{
txtHtml = txtHtml.Replace(stylekey[i].ToString(), "");
} //过滤掉javascript脚本
Regex java = new Regex(@"<script[^<]*</script>", RegexOptions.IgnoreCase);
MatchCollection javaColl = java.Matches(txtHtml);
//检索javascript脚本到arraylist
ArrayList javakey = new ArrayList();
for (int i = 0; i < javaColl.Count; i++)
{
javakey.Add(javaColl[i].Value);
}
//过滤java
for (int i = 0; i < javakey.Count; i++)
{
txtHtml = txtHtml.Replace(javakey[i].ToString(), "");
} //过滤掉常规html标签
Regex r = new Regex(@"<[^<]*>", RegexOptions.IgnoreCase); //定义一个Regex对象实例
MatchCollection mc = r.Matches(txtHtml);
//检索常规html符号集合到arraylist列表
ArrayList wordkey = new ArrayList();
for (int i = 0; i < mc.Count; i++) //在输入字符串中找到所有匹配
{
wordkey.Add(mc[i].Value);
}
//过滤常规html
for (int i = 0; i < wordkey.Count; i++)
{
txtHtml = txtHtml.Replace(wordkey[i].ToString(), "");
}
//--------------------------------- //删除脚本
txtHtml = Regex.Replace(txtHtml, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase); //删除HTML
//txtHtml = Regex.Replace(txtHtml, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase); //此处代码删除空白行,但导致段落混乱
txtHtml = Regex.Replace(txtHtml, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"-->", "", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"<!--.*", "", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
txtHtml = Regex.Replace(txtHtml, @"&#(\d+);", "", RegexOptions.IgnoreCase);
return txtHtml;
}-----------------------此100分只给完全解决者!
{
System.Windows.Forms.WebBrowser wb = new System.Windows.Forms.WebBrowser();
wb.Navigate("about:blank");
wb.Document.Write(HtmlStr);
return wb.Document.Body.InnerText;
}