本帖最后由 csui2008 于 2010-07-02 12:44:11 编辑

解决方案 »

  1.   


            private static void TestRegex19()
            {
                string html = @"<div align='center' id='year_copper_lme_mt'>
    <table>
    <tr>
    <th colspan='6' class='tbltitle'>
    copper lme cash and 3 month official seller / settlement prices 2010 <small>usd/mt</small>
    </th>
    </tr>
    <tr>
    <td>
    avg 7,386.25 <span>7,412.03</span>
    </td>
    <td>
    avg 6,848.18 <span>6,872.45</span>
    </td>
    <td>
    avg 7,462.83 <span>7,494.85</span>
    </td>
    <td>
    avg 7,745.08 <span>7,780.68</span>
    </td>
    <td>
    avg 6,837.68 <span>6,874.08</span>
    </td>
    <td>
    avg 6,499.30 <span>6,530.00</span>
    </td>
    </td></tr>
    <tr>
    <td>
    </td>
    </tr>
    <tr>
    <th class='tblhead' align='center'>
    jul
    </th>
    <th class='tblhead' align='center'>
    aug
    </th>
    <th class='tblhead' align='center'>
    sep
    </th>
    <th class='tblhead' align='center'>
    oct
    </th>
    <th class='tblhead' align='center'>
    nov
    </th>
    <th class='tblhead' align='center'>
    dec
    </th>
    </tr>
    <tr>
    <td>
    <table>
    <tr>
    <td>
    </td>
    <td >
    cash
    </td>
    <td>
    3 mo
    </td>
    </tr>
    <tr>
    <td>
    <span>1</span>
    </td>
    <td>
    6,354.00
    </td>
    <td>
    6,389.00
    </td>
    </tr>
    </table>
    </td>
    <td>
    </td>
    <td>
    </td>
    <td>
    </td>
    <td>
    </td>
    <td>
    </td>
    </tr>
    </table>
    </table>
    </div>";
                Match m = Regex.Match(html, @"(?isx)<\w+[^>]*>(?><\w+[^>]*>(?'b')|</\w+>(?'-b')|(?:(?!</?\w+\b).)*)*(?(b)(?!))</\w+>");
                string result = m.Value;//这里就是你要的结果。html.中去掉 result就是多余的。
            }
      

  2.   

    首先我在想xmldocument对象是否可以加载这种不规范的格式,
    其次如果可以加载的话,我想到了一个思路你可试试。
    遍历每个element元素及其子元素,看它们的上一个element和下一个element是否拥有InnerText属性,如果没有的话,应该会报错,在catch里删除这个错误的对象(当然前提还是xmldocument对象能识别这种格式)。
    我觉得吧,你与其研究这个,不如好好研究下:怎么把你的html代码改清楚。
      

  3.   

    forget that...
    我的只能去除最外层的。逐层的过滤的话貌似不行。
    状态机?自己实现?xml能否可以就不知道了。
      

  4.   


    你把最后的
    </table>
    </div>
    都干掉了,我只想把上面红色的两个不匹配的删除,其它的留下
      

  5.   


            private static void TestRegex02()
            {            string html = @"<div align='center' id='year_copper_lme_mt'>
        <table>
            <tr>
                <th colspan='6' class='tbltitle'>
                    copper lme cash and 3 month official seller / settlement prices 2010 <small>usd/mt</small>
                </th>
            </tr>
            <tr>
                <td>
                    avg 7,386.25 <span>7,412.03</span>
                </td>
                <td>
                    avg 6,848.18 <span>6,872.45</span>
                </td>
                <td>
                    avg 7,462.83 <span>7,494.85</span>
                </td>
                <td>
                    avg 7,745.08 <span>7,780.68</span>
                </td>
                <td>
                    avg 6,837.68 <span>6,874.08</span>
                </td>
                <td>
                    avg 6,499.30 <span>6,530.00</span>
                </td>
            </td></tr>
            <tr>
                <td>
                </td>
            </tr>
            <tr>
                <th class='tblhead' align='center'>
                    jul
                </th>
                <th class='tblhead' align='center'>
                    aug
                </th>
                <th class='tblhead' align='center'>
                    sep
                </th>
                <th class='tblhead' align='center'>
                    oct
                </th>
                <th class='tblhead' align='center'>
                    nov
                </th>
                <th class='tblhead' align='center'>
                    dec
                </th>
            </tr>
            <tr>
                <td>
                    <table>
                        <tr>
                            <td>
                            </td>
                            <td >
                                cash
                            </td>
                            <td>
                                3 mo
                            </td>
                        </tr>
                        <tr>
                            <td>
                                <span>1</span>
                            </td>
                            <td>
                                6,354.00
                            </td>
                            <td>
                                6,389.00
                            </td>
                        </tr>
                    </table>
                </td>
                <td>
                </td>
                <td>
                </td>
                <td>
                </td>
                <td>
                </td>
                <td>
                </td>
            </tr>
        </table>
        </table>
    </div>";
                Regex regExp = new Regex(@"(?<=<)/?(\w+)", RegexOptions.Compiled);
                Match m = regExp.Match(html);
                Stack<string> tags = new Stack<string>();
                List<Match> err = new List<Match>();
                while (m.Success)
                {
                    if (!m.Value.StartsWith("/")) tags.Push(m.Value);
                    else
                    {
                        if (string.Compare(tags.Peek(), m.Groups[1].Value, true) == 0) tags.Pop();//移除
                        else err.Add(m);
                    }
                    m = m.NextMatch();
                }
                foreach (Match e in err)
                {
                    Console.WriteLine(e.Value + "位置:" + m.Index);
                }
            }
    可以找到结果。别说我刷屏哇。~~
      

  6.   

    // Revised version that gives somewhat better results.
    /// <summary>
    /// Scans <paramref name="html"/> for closing tags that do not pair with the
    /// most recently opened tag and writes each one to the console with its full
    /// text, character position and length.
    /// </summary>
    /// <param name="html">
    /// Markup to scan. Defaults to the placeholder string used in the original
    /// snippet ("略", i.e. "omitted"), which contains no tags.
    /// </param>
    /// <remarks>
    /// Opening-tag matches are pushed on a stack; a closing tag pops the stack only
    /// when its name equals the name on top (case-insensitive). Any other closing
    /// tag, including one met while the stack is empty, is reported. Opening tags
    /// left on the stack at the end are not reported by this version.
    /// </remarks>
    private static void TestRegex02(string html = "略")
    {
        // Group 1 = optional '/' plus tag name, group 2 = tag name alone.
        Regex regExp = new Regex(@"<(/?(\w+))[^>]*>", RegexOptions.Compiled);
        Stack<Match> tags = new Stack<Match>();
        List<Match> err = new List<Match>();

        for (Match m = regExp.Match(html); m.Success; m = m.NextMatch())
        {
            if (!m.Value.StartsWith("</", StringComparison.Ordinal))
            {
                tags.Push(m);
            }
            else if (tags.Count > 0
                && string.Equals(tags.Peek().Groups[2].Value, m.Groups[2].Value, StringComparison.OrdinalIgnoreCase))
            {
                // Closing tag pairs with the innermost open tag.
                tags.Pop();
            }
            else
            {
                // Checking Count first avoids the InvalidOperationException that
                // Peek() throws when a closing tag appears with nothing open.
                err.Add(m);
            }
        }

        foreach (Match e in err)
        {
            Console.WriteLine(e.Value + "位置:" + e.Index + "长度:" + e.Length);
        }
    }
      

  7.   

    可能还有点遗漏,最后再加上一个遍历stack的循环:如果前面有多余的开始标签,也要检查出来。
    /// <summary>
    /// Scans <paramref name="html"/> for unbalanced tags and writes each one to the
    /// console with its full text, character position and length: first the opening
    /// tags that were never closed, then the closing tags that had no partner.
    /// </summary>
    /// <param name="html">
    /// Markup to scan. Defaults to the placeholder string used in the original
    /// snippet ("略", i.e. "omitted"), which contains no tags.
    /// </param>
    /// <remarks>
    /// Opening-tag matches are pushed on a stack; a closing tag pops the stack only
    /// when its name equals the name on top (case-insensitive). Any other closing
    /// tag, including one met while the stack is empty, is recorded as surplus.
    /// Leftover opening tags are listed in stack order, i.e. innermost first.
    /// </remarks>
    private static void TestRegex02(string html = "略")
    {
        // Group 1 = optional '/' plus tag name, group 2 = tag name alone.
        Regex regExp = new Regex(@"<(/?(\w+))[^>]*>", RegexOptions.Compiled);
        Stack<Match> tags = new Stack<Match>();
        List<Match> err = new List<Match>();

        for (Match m = regExp.Match(html); m.Success; m = m.NextMatch())
        {
            if (!m.Value.StartsWith("</", StringComparison.Ordinal))
            {
                tags.Push(m);
            }
            else if (tags.Count > 0
                && string.Equals(tags.Peek().Groups[2].Value, m.Groups[2].Value, StringComparison.OrdinalIgnoreCase))
            {
                // Closing tag pairs with the innermost open tag.
                tags.Pop();
            }
            else
            {
                // Checking Count first avoids the InvalidOperationException that
                // Peek() throws when a closing tag appears with nothing open.
                err.Add(m);
            }
        }

        // Surplus opening tags (<...) that never received a closing tag.
        foreach (Match e in tags)
        {
            Console.WriteLine(e.Value + "位置:" + e.Index + "长度:" + e.Length);
        }

        // Surplus closing tags (</...) that matched no open tag.
        foreach (Match e in err)
        {
            Console.WriteLine(e.Value + "位置:" + e.Index + "长度:" + e.Length);
        }
    }
      

  8.   

    <table width="605" height="190" border="0">
    <tr>
    <td  width="70" height="21" class="f1">好评商家:</td>
    <td width="60" class="f2">4层</td>
    <td width="40"><a href="" title="">S042</a></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    </tr>
    <tr>
    <td  width="70" height="21" class="f1">中评商家:</td>
    <td width="60" class="f2">4层</td>
    <td width="40">S042<a href="" title="">S042</a></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    </tr>
    <tr>
    <td  width="70" height="21" class="f1">差评商家:</td>
    <td width="60">1层</td>
    <td width="40"><a href="" title="强迫用户购买巨额3G卡">S039</a></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"><a href="http://tieba.baidu.com/p/1156822179" title=""></a></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"><a href="http://zhidao.baidu.com/question/358711937.html?fr=qrl&cid=220&index=3&fr2=query" title="iphone3g充当iphone3gs卖">s005</a></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"><a href="http://tieba.baidu.com/p/1308343803" title="">S035</a></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40">
    <a href="http://tieba.baidu.com/p/1112257492" title="">S039</a></td>
    <td width="20"></td>
    <td width="20"></td>
    </tr>
    <tr>
    <td  width="70" height="21" class="f1"></td>
    <td width="60" class="f2">
    <td width="40">
    <a href="
    http://sjbbs.zol.com.cn/3/11_27220.html
    " title="">
    S019
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40">
    <a href="
    http://sjbbs.zol.com.cn/3/11_27219.html
    " title="">
    S039
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40">
    <a href="
    http://tieba.baidu.com/p/1112257493
    " title="
    售卖翻新机
    ">
    S039
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    </tr>
    <tr>
    <td  width="70" height="21" class="f1"></td>
    <td width="60">
    3

    </td>
    <td width="40">
    <a href="" title="
    维修偷配件
    ">
    3H01
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    </tr>
    <tr>
    <td  width="70" height="21" class="f1"></td>
    <td width="60">
    11

    </td>
    <td width="40">
    <a href="
    http://diybbs.zol.com.cn/4/20_31194.html
    " title="">
    1118A
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40">
    <a href="
    http://bbs.pcpop.com/thread-6019507-1-1.html
    " title="
    虚高价格
    ">
    1128
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    </tr>
    <tr>
    <td  width="70" height="21" class="f1"></td>
    <td width="60">
    15

    </td>
    <td width="40">
    <a href="
    http://tieba.baidu.com/p/1031125303
    " title="
    被劝买了翻新HTC G24
    ">
    1521
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40">
    <a href="
    http://tieba.baidu.com/p/1031125302
    " title="">
    1521
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40">
    <a href="
    http://tieba.baidu.com/p/1031125301
    " title="
    iphone3g充当iphone3gs卖
    ">
    1521
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40">
    <a href="
    http://tieba.baidu.com/p/627361509
    " title="
    售假
    ">
    1516B
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40">
    <a href="
    http://tieba.baidu.com/p/627361508
    " title="
    售假
    ">
    1516B
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    </tr>
    <tr>
    <td  width="70" height="21" class="f1"></td>
    <td width="60" class="f2">
    <td width="40">
    <a href="
    http://tieba.baidu.com/p/1031125300
    " title="">
    1523
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    </tr>
    <tr>
    <td  width="70" height="21" class="f1"></td>
    <td width="60">
    19

    </td>
    <td width="40">
    <a href="
    http://zhidao.baidu.com/question/322068822.html?fr=qrl&cid=220&index=1&fr2=query
    " title="">
    1911
    </a>
    </td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    <td width="40"></td>
    <td width="20"></td>
    <td width="20"></td>
    </tr>
     <tr>
        <td  class="f1"></td>
        <td class="f2">&nbsp;</td>
        <td>&nbsp;</td>
        <td>&nbsp;</td>
        <td>&nbsp;</td>
        <td>&nbsp;</td>
        <td>&nbsp;</td>
        <td >&nbsp;</td>
        <td >&nbsp;</td>
        <td >&nbsp;</td>
        <td >&nbsp;</td>
        <td >&nbsp;</td>
        <td >&nbsp;</td>
        <td >&nbsp;</td>
        <td >&nbsp;</td>
        <td >&nbsp;</td>
        <td >&nbsp;</td>
      </tr>
    </table>