我从一个 HTML 网页中抓取到了一些值,
但是有的值里包含了如下代码(HTML 注释),我该如何过滤掉这段注释代码呢?
求具体方法,或者正则表达式应该怎么写?
求解,谢谢各位···············内容标题
<!--
/* Font Definitions */
@font-face{font-family:宋体;panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face{font-family:新宋体;panose-1:2 1 6 9 3 1 1 1 1 1;}
@font-face{font-family:仿宋_GB2312;panose-1:2 1 6 9 3 1 1 1 1 1;}
@font-face{font-family:"\@宋体";panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face{font-family:"\@仿宋_GB2312";panose-1:2 1 6 9 3 1 1 1 1 1;}
@font-face{font-family:"\@新宋体";panose-1:2 1 6 9 3 1 1 1 1 1;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal{margin:0cm;margin-bottom:.0001pt;text-align:justify;text-justify:inter-ideograph;font-size:16.0pt;font-family:仿宋_GB2312;color:#003366;}
p.MsoHeader, li.MsoHeader, div.MsoHeader{margin:0cm;margin-bottom:.0001pt;text-align:center;layout-grid-mode:char;border:none;padding:0cm;font-size:9.0pt;font-family:仿宋_GB2312;color:#003366;}
p.MsoFooter, li.MsoFooter, div.MsoFooter{margin:0cm;margin-bottom:.0001pt;layout-grid-mode:char;font-size:9.0pt;font-family:仿宋_GB2312;color:#003366;}
p.MsoBodyTextIndent, li.MsoBodyTextIndent, div.MsoBodyTextIndent{margin:0cm;margin-bottom:.0001pt;text-align:justify;text-justify:inter-ideograph;text-indent:52.5pt;line-height:25.0pt;font-size:16.0pt;font-family:仿宋_GB2312;color:#003366;}
p.MsoBodyTextIndent2, li.MsoBodyTextIndent2, div.MsoBodyTextIndent2{margin:0cm;margin-bottom:.0001pt;text-align:justify;text-justify:inter-ideograph;text-indent:31.35pt;font-size:16.0pt;font-family:仿宋_GB2312;color:#003366;}
p.MsoAcetate, li.MsoAcetate, div.MsoAcetate{margin:0cm;margin-bottom:.0001pt;text-align:justify;text-justify:inter-ideograph;font-size:9.0pt;font-family:仿宋_GB2312;color:#003366;}
p.Char, li.Char, div.Char{margin:0cm;margin-bottom:.0001pt;text-align:justify;text-justify:inter-ideograph;font-size:12.0pt;font-family:"Times New Roman";}
/* Page Definitions */
@page Section1{size:595.3pt 841.9pt;margin:54.55pt 62.35pt 62.3pt 79.4pt;layout-grid:15.6pt;}
div.Section1{page:Section1;}
/* List Definitions */
ol{margin-bottom:0cm;}
ul{margin-bottom:0cm;}
-->具体内容································
就是内容中突然多出了这么一段注释,如何过滤掉呢?
但是有的值包含了如下代码(html注释),我如何来过滤掉这段注释的代码呢?
求具体方法,或者正则表达式应该怎么写?
求解,谢谢各位···············内容标题
<!--
/* Font Definitions */
@font-face{font-family:宋体;panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face{font-family:新宋体;panose-1:2 1 6 9 3 1 1 1 1 1;}
@font-face{font-family:仿宋_GB2312;panose-1:2 1 6 9 3 1 1 1 1 1;}
@font-face{font-family:"\@宋体";panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face{font-family:"\@仿宋_GB2312";panose-1:2 1 6 9 3 1 1 1 1 1;}
@font-face{font-family:"\@新宋体";panose-1:2 1 6 9 3 1 1 1 1 1;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal{margin:0cm;margin-bottom:.0001pt;text-align:justify;text-justify:inter-ideograph;font-size:16.0pt;font-family:仿宋_GB2312;color:#003366;}
p.MsoHeader, li.MsoHeader, div.MsoHeader{margin:0cm;margin-bottom:.0001pt;text-align:center;layout-grid-mode:char;border:none;padding:0cm;font-size:9.0pt;font-family:仿宋_GB2312;color:#003366;}
p.MsoFooter, li.MsoFooter, div.MsoFooter{margin:0cm;margin-bottom:.0001pt;layout-grid-mode:char;font-size:9.0pt;font-family:仿宋_GB2312;color:#003366;}
p.MsoBodyTextIndent, li.MsoBodyTextIndent, div.MsoBodyTextIndent{margin:0cm;margin-bottom:.0001pt;text-align:justify;text-justify:inter-ideograph;text-indent:52.5pt;line-height:25.0pt;font-size:16.0pt;font-family:仿宋_GB2312;color:#003366;}
p.MsoBodyTextIndent2, li.MsoBodyTextIndent2, div.MsoBodyTextIndent2{margin:0cm;margin-bottom:.0001pt;text-align:justify;text-justify:inter-ideograph;text-indent:31.35pt;font-size:16.0pt;font-family:仿宋_GB2312;color:#003366;}
p.MsoAcetate, li.MsoAcetate, div.MsoAcetate{margin:0cm;margin-bottom:.0001pt;text-align:justify;text-justify:inter-ideograph;font-size:9.0pt;font-family:仿宋_GB2312;color:#003366;}
p.Char, li.Char, div.Char{margin:0cm;margin-bottom:.0001pt;text-align:justify;text-justify:inter-ideograph;font-size:12.0pt;font-family:"Times New Roman";}
/* Page Definitions */
@page Section1{size:595.3pt 841.9pt;margin:54.55pt 62.35pt 62.3pt 79.4pt;layout-grid:15.6pt;}
div.Section1{page:Section1;}
/* List Definitions */
ol{margin-bottom:0cm;}
ul{margin-bottom:0cm;}
-->具体内容································
就是内容中突然多出了这么一段注释,如何过滤掉呢?
参考一下
我是读取一个 HTML 页面来获取有用的数据(也就是采集),其他无用的内容已经过滤掉了,但取出来的数据里多出了这么一段 HTML 注释代码,现在不知道该怎么把它也过滤掉············
正则不难的,网上找源代码不如自己学一下,以后还能用
// Sample input: a page whose <head> holds a multi-line HTML comment wrapping CSS.
var html = @"<html><head>
<!--
/* Font Definitions */
@font-face{font-family:宋体;panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face{font-family:新宋体;panose-1:2 1 6 9 3 1 1 1 1 1;}
@font-face{font-family:仿宋_GB2312;panose-1:2 1 6 9 3 1 1 1 1 1;}
--></head></html>";
// Match from "<!--" up to the FIRST following "-->":
//   * ".*?" is lazy, so two separate comments are removed individually and the
//     text between them is kept;
//   * RegexOptions.Singleline lets "." match newlines, so multi-line comments match.
// The earlier pattern "<!--[^<]+-->" skipped any comment that contained a '<'
// (for example commented-out markup) as well as the empty comment "<!---->".
Regex regex = new Regex("<!--.*?-->", RegexOptions.Singleline);
html = regex.Replace(html, "");
// Show the stripped markup in the form's text box.
this.textBox1.Text = html;
/// <summary>
/// Rewrites a string so that angle brackets become square brackets, every
/// carriage return (char 13) becomes "&lt;br&gt;", and every line feed
/// (char 10) is removed.
/// </summary>
/// <param name="strContent">The text to convert.</param>
/// <returns>The converted text.</returns>
public static string setPinglunContent(string strContent)
{
    // Neutralise markup delimiters first so the "<br>" inserted below is the
    // only angle-bracket tag left in the result.
    string result = strContent.Replace("<", "[");
    result = result.Replace(">", "]");

    // CR -> "<br>", LF -> dropped; a CRLF pair therefore yields one "<br>".
    result = result.Replace("\r", "<br>");
    result = result.Replace("\n", "");

    // NOTE(review): as transcribed, both replacements below swap a one-character
    // literal for a one-character literal and look like no-ops; the originals may
    // have contained an entity or special space that was lost -- confirm.
    result = result.Replace(" ", " ");
    result = result.Replace(" ", " ");

    return result;
}
类似的这种
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Text.RegularExpressions;

namespace test
{
    /// <summary>
    /// Demo Web Forms page that strips HTML comments from a sample string and
    /// shows the result in <c>Label1</c>.
    /// </summary>
    public partial class re : System.Web.UI.Page
    {
        /// <summary>
        /// Removes every HTML comment from the sample markup and displays what is left.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        // EventArgs is fully qualified because no "using System;" is visible in this file.
        protected void Page_Load(object sender, System.EventArgs e)
        {
            string html = @"111111<!--过滤过滤过滤-->222222222";

            // Lazy ".*?" ends each match at the first "-->", so text between two
            // comments on the same line is kept; the greedy ".*" used before
            // would have deleted it. RegexOptions.Singleline lets "." match
            // newlines so comments spanning several lines are removed as well.
            Regex commentPattern = new Regex("<!--.*?-->", RegexOptions.Singleline);
            html = commentPattern.Replace(html, "");

            // Label1 is not declared in this snippet; presumably it comes from the
            // page's designer partial class.
            Label1.Text = html;
        }
    }
}