比如有个网页的DOM结构:
<TABLE>
  <TBODY>
  <TR>
    <TD>
        <A ***>***</A>
        文本1  
        <A ***>***</A>
        文本2<br>
文本3<br>
<A ***>***</A>
我使用的是MSHTML组件,想分别获取文本1、文本2、文本3,有没有比较方便快捷的方法?文本2、文本3一次获取也可以。我可以使用get_innerText获取<TD>节点的所有文本,也可以获取每个<A>节点的文本,就是没找到获取文本节点文本的方法。

解决方案 »

  1.   

    麻烦小胖贴个例子的关键代码给我看看。
    不要告诉我用IHTMLElement::innerText啊,文本1、文本2、文本3的父节点都是<TD>,这三个文本节点没有对应的IHTMLElement,而我需要分别提取这几个文本,所以也不能用他们的父节点提取。
      

  2.   

    遍历<TD>节点的集合 IHTMLDOMChildrenCollection,这个集合包括HTML Elements and TextNode objects 元素
      

  3.   


    IHTMLDocument2* pDoc;
    BSTR tagName, tdtag, itext;
    tdtag = SysAllocString( L"TD");
    SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
    if (CoInitializeEx( 0L, COINIT_MULTITHREADED) == S_OK)
    __try{
    if (CoCreateInstance( CLSID_HTMLDocument, 0L, CLSCTX_INPROC, IID_IHTMLDocument2, (void**)&pDoc) == S_OK)
    __try{
    VARIANT *param;
    SafeArrayAccessData(psa, (LPVOID*)&param);
    param->vt = VT_BSTR;
    param->bstrVal = SysAllocString(
    L"<!doctype html><html><head><title>None</title></head>\
    <body>\
    <table>\
    <tr>\
    <th>Header</th>\
    <td>Text1</td>\
    <td>Text2</td>\
    </tr>\
    </table>\
    </body>\
    </html>");
    if ((pDoc->write( psa) != S_OK)&&(pDoc->close() != S_OK))
    return 1; IDispatch *all, *disp;
    IHTMLElement *body, *item, *td;
    IHTMLElementCollection *alls, *tds;
    IHTMLElement2 *tbl2;
    long alen, tdlen;
    pDoc->get_body( &body);
    body->get_all( &all);
    body->Release();
    body = 0L;
    all->QueryInterface( IID_IHTMLElementCollection, (void**)&alls);
    all->Release();
    all = 0L;
    alls->get_length( &alen);
    VARIANT dummy;
    dummy.vt = VT_I4;
    for( int ai = 0; ai < alen; ai++)
    {
    dummy.intVal = ai;
    alls->item( dummy, dummy, (IDispatch**)&disp);
    if (disp)
    {
    disp->QueryInterface( IID_IHTMLElement, (void**)&item);
    if (item){
    disp->Release();
    disp = 0L;
    item->get_tagName( &tagName);
    if (!lstrcmpW( tagName, L"TABLE")){//if the element is a table at the root
    item->QueryInterface( IID_IHTMLElement2, (void**)&tbl2);
    item->Release();
    item = 0L;
    if(tbl2){
    tbl2->getElementsByTagName( tdtag, &tds);//get the tr
    if (tds){
    tds->get_length( &tdlen);
    for (int tri = 0; tri < tdlen; tri++){
    dummy.intVal = tri;
    tds->item( dummy, dummy, &disp);
    if( disp){
    disp->QueryInterface( IID_IHTMLElement, (void**)&td);
    if(td){
    td->get_innerText( &itext);

    wprintf( L"%s\r\n", (LPWSTR)itext);

    SysFreeString( itext);
    td->Release();
    td = 0L;
    }
    disp->Release();
    disp = 0L;
    }else{
    disp->Release();
    disp = 0L;
    }
    }
    tds->Release();
    tds = 0L;
    }
    tbl2->Release();
    tbl2 = 0L;
    }else{
    item->Release();
    item = 0L;
    }
    }else{
    item->Release();
    item = 0L;
    }
    }else{
    disp->Release();
    disp = 0L;
    }
    }else{
    disp->Release();
    disp = 0L;
    }
    }
    if (alls){
    alls->Release();
    alls = 0L;
    }
    }__finally
    {
    pDoc->Release();
    pDoc = 0L;
    }
    }__finally
    {
    SafeArrayDestroy( psa);
    SysFreeString( tdtag);
    CoUninitialize();
    }
    return 0;
      

  4.   

    谢谢laiyiling的回复。你的方法不适合我的情况,我要解析的网页格式是这样的:
    <TABLE>
      <TBODY>
        <TR>
          <TD>
            <A href="http://www.baidu.com">百度</A>
            文本1<BR>   
            <A href="http://www.sina.com.cn">新浪</A>
            文本2<BR>
            文本3<BR>
            <A href="http://www.neteays.com">网易</A>
          </TD>
        </TR>
      </TBODY>
    </TABLE>
    百度、文本1、新浪、文本2、文本3、网易,需要分别解析出来。你的方法会把他们一起提取出来。
    文本1、文本2、文本3是<TD>的文字,并不是一个节点,所以,不可能有对应的IHTMLElement。
      

  5.   

    IE打开网页后按F12,分析的DOM树
      

  6.   

    http://hi.csdn.net/attachment/201004/16/1534521_127143371534AU.jpg
      

  7.   

    建议楼主把文本1单独用一个SPAN标签包含起来,最后得到这个SPAN标签的Element,其他两个也一样
      

  8.   


    你这个页面确实比较特殊,试试这样:1) 得到 td 的IHTMLElement
    2) IHTMLElement QI 得到 IHTMLDOMNode 
    3)  IHTMLDOMNode::get_childNodes 得到 IDispatch 
    4) IDispatch QI 得到 IHTMLDOMChildrenCollection 
    5) 遍历,通过 IHTMLDOMChildrenCollection::item 得到IDispatch
    6) 该IDispatch上QI IHTMLDOMTextNode 即可
      

  9.   

      做网站就像做爱,首先你要懂得“How To Make Love”,简称HTML ;  如果觉得你对HTML已经精通了,你应该学学3P(ASP,PHP,JSP) 
      

  10.   


    这种方法确实可行,但现在又有一问题:
    IHTMLDOMTextNode有两个函数可以获取文字,get_data、toString。当字符串中有"&nbsp;"时,获取的都不是空格,而是'?',怎么解决这个问题?
      

  11.   

    IHTMLDOMNode::get_nodeValue(VARIANT *p);
      

  12.   


    这个估计是页面编码的原因,如果编码转换正确,应该显示的就是"&nbsp;"
    你可以用 IHTMLDocument2::get_charset 得到页面编码。另外,你可以试试在 IHTMLDOMTextNode 上 QI IHTMLElement,看能否成功;IHTMLElement::innerText 不会是乱码的。ps:新问题请开新帖哈~ ^_^
      

  13.   

    IHTMLDocument2* pDoc;
        BSTR tagName, tdtag, itext;
        tdtag = SysAllocString( L"TD");
        SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
        if (CoInitializeEx( 0L, COINIT_MULTITHREADED) == S_OK)
        __try{
            if (CoCreateInstance( CLSID_HTMLDocument, 0L, CLSCTX_INPROC, IID_IHTMLDocument2, (void**)&pDoc) == S_OK)
            __try{
                VARIANT *param;
                SafeArrayAccessData(psa, (LPVOID*)&param);
                param->vt = VT_BSTR;
                param->bstrVal = SysAllocString(
                    L"<!doctype html><html><head><title>None</title></head>\
                        <body>\
                            <table>\
                                <tr>\
                                    <th>Header</th>\
                                    <td>Text1</td>\
                                    <td>Text2</td>\
                                </tr>\
                            </table>\
                        </body>\
                    </html>");
                if ((pDoc->write( psa) != S_OK)&&(pDoc->close() != S_OK))
                    return 1;            IDispatch *all, *disp;
                IHTMLElement *body, *item, *td;
                IHTMLElementCollection *alls, *tds;
                IHTMLElement2 *tbl2;
                long alen, tdlen;
                pDoc->get_body( &body);
                body->get_all( &all);
                body->Release();
                body = 0L;
                all->QueryInterface( IID_IHTMLElementCollection, (void**)&alls);
                all->Release();
                all = 0L;
                alls->get_length( &alen);
                VARIANT dummy;
                dummy.vt = VT_I4;
                for( int ai = 0; ai < alen; ai++)
                {
                    dummy.intVal = ai;
                    alls->item( dummy, dummy, (IDispatch**)&disp);
                    if (disp)
                    {
                        disp->QueryInterface( IID_IHTMLElement, (void**)&item);
                        if (item){
                            disp->Release();
                            disp = 0L;
                            item->get_tagName( &tagName);
                            if (!lstrcmpW( tagName, L"TABLE")){//if the element is a table at the root
                                item->QueryInterface( IID_IHTMLElement2, (void**)&tbl2);
                                item->Release();
                                item = 0L;
                                if(tbl2){
                                    tbl2->getElementsByTagName( tdtag, &tds);//get the tr
                                    if (tds){
                                        tds->get_length( &tdlen);
                                        for (int tri = 0; tri < tdlen; tri++){
                                            dummy.intVal = tri;
                                            tds->item( dummy, dummy, &disp);
                                            if( disp){
                                                disp->QueryInterface( IID_IHTMLElement, (void**)&td);
                                                if(td){
                                                    td->get_innerText( &itext);
                                                    
                                                    wprintf( L"%s\r\n", (LPWSTR)itext);
                                                    
                                                    SysFreeString( itext);
                                                    td->Release();
                                                    td = 0L;
                                                }
                                                disp->Release();
                                                disp = 0L;
                                            }else{
                                                disp->Release();
                                                disp = 0L;
                                            }
                                        }
                                        tds->Release();
                                        tds = 0L;
                                    }
                                    tbl2->Release();
                                    tbl2 = 0L;
                                }else{
                                    item->Release();
                                    item = 0L;
                                }
                            }else{
                                item->Release();
                                item = 0L;
                            }
                        }else{
                            disp->Release();
                            disp = 0L;
                        }
                    }else{
                        disp->Release();
                        disp = 0L;
                    }
                }
                if (alls){
                    alls->Release();
                    alls = 0L;
                }
            }__finally
            {
                pDoc->Release();
                pDoc = 0L;
            }
        }__finally
        {
            SafeArrayDestroy( psa);
            SysFreeString( tdtag);
            CoUninitialize();
        }
        return 0;