C#如何用正则表达式过滤网页的JavaScript代码?例如:用户用记事本打开一个.htm网页。复制里面的代码到TextBox1里。点击Button1后,过滤掉里面的JavaScript代码。在TextBox2里显示过滤后的结果。这个用C#如何实现?代码在下面:

解决方案 »

  1.   

    <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3c.org/TR/1999/REC-html401-19991224/loose.dtd">
    <!-- saved from url=(0064)http://download.cnet.com/Google-Chrome/3640-2356_4-11015161.html -->
    <HTML xmlns="http://www.w3.org/1999/xhtml" xmlns:fb = 
    "http://www.facebook.com/2008/fbml"><HEAD><TITLE>Google Chrome - Reviews and free Google Chrome downloads at Download.com</TITLE><!-- Vader loves you  -->
    <META http-equiv=Content-Type content="text/html; charset=gb2312">
    <SCRIPT src="1.files/oreo.moo.rb.combined.js" type=text/javascript></SCRIPT><SCRIPT type=text/javascript>            UserVars = new LocalVars.UserVars({
                    loggedIn: ' ',
                    userName: ' ',
                    ursRegId: ' ',
                    rememberMe: ' ',
                    confirmed: ' '
                });
                
                PageVars = new LocalVars.PageVars({
                    pageType: '3640',
                    nodeId: '2356',
                    siteId: '4',
                    assetId: '11015161',
                    pageNumber: '',
                    channelId: '20',
                    editionId: '3',
                    brandId: '6',
                    breadcrumb: '20:2137:2356',
                    userIP: '114.240.162.243',
                    guid: 'slH0fwoPjAUAAGFgNtsAAABo'
                   });
            </SCRIPT>
    <!--[if IE 6]>
    <SCRIPT src="1.files/FixPNG.js" type=text/javascript></SCRIPT>
    <![endif]--><LINK rev=stylesheet media=screen href="1.files/matrix.css" 
    type=text/css rel=stylesheet><LINK rev=stylesheet media=print 
    href="1.files/print.css" type=text/css rel=stylesheet>
    <META 
    content="Come to CNET Download.com for free and safe Google Chrome downloads. Browse the Web, Google-style." 
    name=description>
    <META content="Google Chrome, Web Browsers, downloads, software, trial, free" 
    name=keywords><LINK rev=stylesheet href="1.files/default.css" type=text/css 
    rel=stylesheet>
    <SCRIPT src="1.files/download.global.framework.compressed.js" 
    type=text/javascript></SCRIPT><SCRIPT src="1.files/download.tron.title.detail2.compressed.js" 
    type=text/javascript></SCRIPT><META content="MSHTML 6.00.2900.2912" name=GENERATOR></HEAD>
    <BODY class="siteId4 pageType3640"><IMG 
    style="LEFT: 0px; POSITION: absolute; TOP: 0px" src=""> <!-- MAC T EA.EB.ED.ED --><!-- MAC [r20081117-1345-OptimizeOn:1.13.10] c18-rb-tron-xw6.cnet.com::1524803936 2009.04.18.05.37.44 --><!-- NO AD TEXT: _QUERY_STRING="POS=100&SP=80" _REQ_NUM="0"  --><!-- default ad --><IMG 
    style="LEFT: 0px; POSITION: absolute; TOP: 0px" height=0 alt="" 
    src="1.files/dotclear.gif" width=0><!-- MAC ad -->
    <SCRIPT src="1.files/dw.js" type=text/javascript></SCRIPT><SCRIPT type=text/javascript>
                DW.pageParams = {
                    siteid: '4',
                    edid:   '3',
                    prtnr:  'CNET Networks, Inc.',
                    ptid:   '3640',
                    onid:   '2356',
                    asid:   '11015161',
                    pgnbr:  '',
                    astId:  '1'
                };
                DW.regSilo = '1';
                
            </SCRIPT><SCRIPT type=text/javascript>
        DW.trackClicks();
    </SCRIPT>
     
    <DIV id=rb_bodyWrap>
    <DIV id=rb_shell>
    <DIV id=rb_header name="rb_header" section="hdr">
    <DIV id=headerContents>
    <UL class=section id=brandNavBar section="brandnav">
      <LI class=brandNav>
      <UL>
        <LI class="siteId1 "><A 
        href="http://www.cnet.com/2001-1_1-0.html"><SPAN>home</SPAN></A> </LI>
        <LI class="siteId7 "><A 
        href="http://reviews.cnet.com/2001-1_7-0.html"><SPAN>reviews</SPAN></A> 
    </LI>
        <LI class="siteId3 "><A 
        href="http://news.cnet.com/2001-1_3-0.html"><SPAN>news</SPAN></A> </LI>
        <LI class="siteId4 active"><A 
        href="http://download.cnet.com/windows/"><SPAN>downloads</SPAN></A> </LI>
        <LI class="siteId53 "><A 
        href="http://cnettv.cnet.com/2000-1_53-0.html"><SPAN>cnet tv</SPAN></A> 
      </LI></UL><!-- /brandNav --></LI>
      <LI class=rubics>
      <DIV id=rubicsTextAd><!-- MAC T ED.ED.F1.F1 --><!-- MAC [r20081117-1345-OptimizeOn:1.13.10] c18-rb-tron-xw6.cnet.com::1524803936 2009.04.18.05.37.44 -->
      <DIV class=rubics_netxp1_main id=rbx_netxp1_parent>
      <SCRIPT type=text/javascript>
    function do_rbx_ctrk_netxp1 (aElem, iTrkId, cTrkPrefix) {
    var imgElem = document.getElementById(iTrkId);
    if (!imgElem) {
    return true;
    } if (!aElem.rbxCPrefixed) {
    aElem.rbxCPrefixed = true;
    var destUri = encodeURIComponent(aElem.href);
    var newUri = cTrkPrefix + destUri;
    aElem.href = newUri;
    }
    return true;
    }
    function do_rbx_itrk_netxp1 (parentId, iTrkId, iTrkUri) {
    if (!document.images) {
    return false;
    } var imgId = iTrkId;
    var imgElem = document.getElementById(imgId); if (!imgElem) { var parentElem = document.getElementById(parentId);
    if (!parentElem) {
    return false;
    } var imgElem = document.createElement('img');
    if (!imgElem) {
    return false;
    } imgElem.id = imgId;
    imgElem.src = iTrkUri;
    imgElem.alt = '';
    imgElem.width = '1';
    imgElem.height = '1'; parentElem.appendChild(imgElem);
    }
    return true;
    }
    </SCRIPT>
      On The Insider: <A class=xpromolink 
      onclick="return do_rbx_ctrk_netxp1(this, 'rbx_netxp1_1', 'http://dw.com.com/rubicsclk?ver=2&amp;ts=2009.04.17.22.37.44&amp;edId=3&amp;onId=2356&amp;ptId=3640&amp;sId=4&amp;appId=19&amp;offId=5869&amp;unitId=45&amp;poolId=1&amp;f1=1&amp;f2=%2d0&amp;f3=%2d0&amp;alg=8&amp;&amp;opt=1&amp;linkPos=1&amp;destUrl=');" 
      href="http://www.theinsider.com/news/2019525_Michael_Jackson_Auction_Off_the_Block" 
      target=_top>Michael Jackson Auction Cancelled</A>
      <SCRIPT type=text/javascript>
    //<![CDATA[
    do_rbx_itrk_netxp1('rbx_netxp1_parent', 'rbx_netxp1_1', 'http://dw.com.com/rubicsimp/c.gif?ver=2&ts=2009.04.17.22.37.44&edId=3&onId=2356&ptId=3640&sId=4&appId=19&unitId=45&poolId=1&f1=1&f2=%2d0&f3=%2d0&alg=8&opt=1&off=5869,-1');
    //]]>
    </SCRIPT>
       </DIV><!-- ros [r20070521-1832-ronr-v1-13-5:1.13.5] c18-ad-rubics-ros6.cnet.com::4025142192 2009.04.17.22.37.44 --><!-- ros t 0.0.0.0.1.1.1.1 --><!-- MAC ad --></DIV></LI>
      

  2.   

     public static string NoHTML(string Htmlstring)
            {
                //删除脚本
                Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase);
                //删除HTML
                Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);            Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
                Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);            Htmlstring.Replace("<", "");
                Htmlstring.Replace(">", "");
                Htmlstring.Replace("\r\n", "");
                Htmlstring = HttpContext.Current.Server.HtmlEncode(Htmlstring).Trim();            return Htmlstring;
            }
      

  3.   

    Regex.Replace(Htmlstring, @" <script[^>]*?>.*? </script>", "", RegexOptions.IgnoreCase); 
    光这句还不行,还得替换所有类似 onclick="alert();"的东西,加上
    // Remove inline event-handler attributes such as onclick="..." / onload='...' / onfoo=bar.
    // The pattern is anchored to "on<name>=<value>" preceded by whitespace, so words that merely
    // contain "on" (e.g. content="...") are left untouched. Best-effort only: text that happens
    // to look like such an attribute outside a tag is removed as well.
    Htmlstring = Regex.Replace(Htmlstring, @"\s+on\w+\s*=\s*(?:""[^""]*""|'[^']*'|[^\s>]+)", "", RegexOptions.IgnoreCase); 
      

  4.   

    Regex.Replace(Htmlstring, @" <script[^>]*?>.*? </script>", "", RegexOptions.IgnoreCase); 
    有错误,应该为
    // Remove every <script>...</script> element. Singleline makes '.' match newlines so that
    // multi-line script bodies are removed too; the lazy '.*?' stops at the nearest closing tag.
    // Regex.Replace returns a new string, so the result has to be assigned.
    Htmlstring = Regex.Replace(Htmlstring, @"<script\b.*?</script\s*>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline); 
      

  5.   


    /// <summary>
    /// Click handler for button1: takes the text of textBox1, removes every
    /// &lt;script&gt;...&lt;/script&gt; element from it, and shows the result in textBox2.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
            {
                // The body quantifier is lazy ([\s\S]*?) so that each script element is
                // matched on its own; a greedy match would run from the first opening
                // tag to the last closing tag and delete all HTML between the scripts.
                textBox2.Text = Regex.Replace(textBox1.Text, @"<script\b[^>]*>[\s\S]*?</script\s*>", "", RegexOptions.IgnoreCase);
            }
      

  6.   

    我不贴了。你打开Google的首页就行了。不要用IE保存到硬盘里。直接“查看”--“源文件”。
    试下行不?肯定是不行的。
    回复太快!如果您是恶意刷楼,将会受到严厉惩罚!