之前的帖是介绍怎么删除<>之间的,但我去掉整段javascript代码,应该怎样写?谢谢…………例如:<SCRIPT language=JavaScript>
function checkemail(form)
{
if(!form.LoginName.value){
form.LoginName.focus();
return false;
}
</script>
function checkemail(form)
{
if(!form.LoginName.value){
form.LoginName.focus();
return false;
}
</script>
string regex = @"^<script language=javascript>.*</script>";
string result = Regex.Replace(input, regex, "");
<script[^>]*?>.*?</script>
<script[^>]*?>[.\n]*?</script>
<script[^>]*>([\s\S](?!<script))*?</script>以上四个都不行~~~
{
WebClient astroWebClient = new WebClient();
astroWebClient.Credentials = CredentialCache.DefaultCredentials; string page2 = "http://www.21cn.com/"; Byte[] pageData = astroWebClient.DownloadData(page2);
string pageHtml = Encoding.Default.GetString(pageData); //GB2312
//pageHtml = Regex.Replace(pageHtml, @"<[^>]+>",""); Console.Write(StripHTML1(pageHtml));
}public static string StripHTML1(string strHtml)
{
string strOutput = strHtml;
//Regex regex1 = new Regex(@"<script.*</script>");
//Regex regex1 = new Regex(@"<script[^>]*?>.*?</script>");
//Regex regex1 = new Regex(@"<script[^>]*?>[.\n]*?</script>", RegexOptions.IgnoreCase);
//Regex regex1 = new Regex(@"<script[^>]*>([\s\S](?!<script))*?</script>");
Regex regex1 = new Regex(@"<script>[\s\S]+?</script>", RegexOptions.IgnoreCase); bool a = regex1.IsMatch(strOutput);
MessageBox.Show(a.ToString());
strOutput = regex1.Replace(strOutput,"");
//Regex regex2 = new Regex(@"<[^>]+>|</[^>]+>");
//strOutput = regex2.Replace(strOutput,"");
return strOutput;
}
改为:
strOutput = strOutput.Replace(regex1,"");
strOutput = regex1.Replace(strOutput,"");Regex regex2 = new Regex(@"<[^>]+>|</[^>]+>");
strOutput = regex2.Replace(strOutput,"");
(@"<SCRIPT language=JavaScript>(.|\s)*?</script>",RegexOptions.IgnoreCase)r.replace(...);
document.writemmm");
document.write("<script>\n");
document.write("\n");
</SCRIPT>由于中间出现了document.write("<script>\n");于是正则表达式子会把之前得到
<SCRIPT LANGUAGE="JavaScript">
document.writemmm");
抛弃,然后从
("<script>\n");
document.write("\n");
</SCRIPT>
这里继续截取,于是那段被抛弃的代码便去不到,大家有什么解决方案?想知道百度是怎么解决的
如果如楼上所说,可以在regex最前面加^试试
/// <summary>
/// 去除HTML标记
/// </summary>
/// <param name="Htmlstring">包括HTML的源码 </param>
/// <returns>已经去除后的文字</returns>
public static string NoHTML(string Htmlstring)
{
//删除脚本
Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
//删除HTML
Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase); Htmlstring.Replace("<", "");
Htmlstring.Replace(">", "");
Htmlstring.Replace("\r\n", "");
Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim(); return Htmlstring;
}
#endregion
2.仍然没有去掉<script></script>标签里面的内容不过字符转换的处理比较详细,再次感谢!
测试数据:
<SCRIPT LANGUAGE="JavaScript">
document.writemmm");
document.write("<script>\n");
document.write("\n");
</SCRIPT>
<SCRIPT LANGUAGE="JavaScript">
document.writemmm");
document.write("<script>\n");
document.write("\n");
</SCRIPT>
正则模式:Singleline
匹配数:2
*******************
<SCRIPT LANGUAGE="JavaScript">
document.writemmm");
document.write("<script>\n");
document.write("\n");
</SCRIPT>
*******************
<SCRIPT LANGUAGE="JavaScript">
document.writemmm");
document.write("<script>\n");
document.write("\n");
</SCRIPT>