小弟有个程序要解析html to txt  用了mshtml.dll
能解析,但是遇到一些网页就会弹出一些提示,比如“安全隐患”、“未找到xxx.js”等。
不知道有没有什么别的好点的办法来解析html,
部分代码
HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,IID_IHTMLDocument2, (void**)&pDoc);
SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
VARIANT *param;
bstr_t bsData = (LPCTSTR)html;
AfxMessageBox(bsData);
hr = SafeArrayAccessData(psa, (LPVOID*)&param);
param->vt = VT_BSTR;
param->bstrVal = (BSTR)bsData;
hr = pDoc->write(psa);
hr = pDoc->close();
SafeArrayDestroy(psa);
BSTR bstr;
pDoc->body->get_outerText(&bstr);
pElement->get_outerText(&bstr);
如果可以在程序中忽略这些提示也可以,或者用别的方式也可以(不要用正则)。哪位大虾帮帮我啊!

解决方案 »

  1.   

    就是想提取html中的文字。我把html代码放到了CString中,不知道还有没有别的方法能提取出来
      

  2.   

    参考
    void CIllegalCheck::ParseHtml(byte* lpOrigBuffers,  size_t dwBufferCount,byte* pOutbuf,size_t& nOutBufSize,CConverAndOrigPosHelp& caoPosHelp)
    {
    caoPosHelp.m_pOrigHead = lpOrigBuffers; byte* pCurrentOutIndex = pOutbuf;
    nOutBufSize = 0; byte* lpBuffers = new byte[dwBufferCount + 1 + m_strPreNeedCheckStream.length()];
    if( lpBuffers == NULL)
    {
    return ;
    }
    lpBuffers[dwBufferCount+m_strPreNeedCheckStream.length()]='\0'; memcpy(lpBuffers +  m_strPreNeedCheckStream.length(),lpOrigBuffers,dwBufferCount); byte* pPoshelpOrigHead = lpBuffers + m_strPreNeedCheckStream.length(); //if( NULL == strlwr((char*)(lpBuffers +  m_strPreNeedCheckStream.length())))
    //{
    // if( lpBuffers)
    // {
    // delete[] lpBuffers;
    // }
    // return;
    //}
    CUtility::FastStrLwr( ( char*)(lpBuffers +  m_strPreNeedCheckStream.length()) );
    memcpy(lpBuffers,m_strPreNeedCheckStream.c_str(),m_strPreNeedCheckStream.length()); dwBufferCount += (DWORD)m_strPreNeedCheckStream.length();
    m_strPreNeedCheckStream=""; byte* pStart = lpBuffers ; 
    byte* pFindPos = NULL;
    byte* pEndFindPos=NULL;
    byte* pWholeBufEnd = lpBuffers+dwBufferCount;
    while (pStart < pWholeBufEnd)
    {
    pFindPos = (byte*)memchr(pStart,'<',dwBufferCount - (pStart - lpBuffers) );
    if( NULL == pFindPos )
    {
    while( aHtmlRemoveChar[*pStart])
    {
    ++pStart;
    } size_t nNewContentSize = dwBufferCount - (pStart - lpBuffers);
    if( nNewContentSize > 0 )
    {
    memcpy(pCurrentOutIndex,pStart,nNewContentSize); if( m_bIsUtf8 == CHART_SET_UTF8)
    {
    pCurrentOutIndex[nNewContentSize] = '\0';
    nNewContentSize = CUtility::ConvertUtf8ToGBK((char*)pCurrentOutIndex,nNewContentSize);
    if( CConfig::GetInstance().m_bCheckTraditional )
    {
    size_t nCountSize = CUtility::ConvertGBKToGB2312((char*)pCurrentOutIndex);
    }
    } caoPosHelp.m_vecRangeConver.push_back(pCurrentOutIndex-pOutbuf);
    caoPosHelp.m_vecRangeOrigal.push_back(lpOrigBuffers + (pStart -pPoshelpOrigHead) ); pCurrentOutIndex +=nNewContentSize; }
    break;
    } if( pStart != pFindPos)
    {
    while( aHtmlRemoveChar[*pStart] )
    {
    ++pStart;
    } size_t nNewContentSize = pFindPos-pStart;
    if( nNewContentSize > 0 )
    {
    memcpy(pCurrentOutIndex,pStart,nNewContentSize); if( m_bIsUtf8 == CHART_SET_UTF8)
    {
    pCurrentOutIndex[nNewContentSize] = '\0';
    nNewContentSize = CUtility::ConvertUtf8ToGBK((char*)pCurrentOutIndex,nNewContentSize) - 1;
    if( CConfig::GetInstance().m_bCheckTraditional )
    {
    size_t nCountSize = CUtility::ConvertGBKToGB2312((char*)pCurrentOutIndex);
    }
    }

    caoPosHelp.m_vecRangeConver.push_back(pCurrentOutIndex-pOutbuf);
    caoPosHelp.m_vecRangeOrigal.push_back(lpOrigBuffers + (pStart -pPoshelpOrigHead) ); pCurrentOutIndex +=nNewContentSize;
    } }  switch( dwHtmlLabelCheck[*(pFindPos + 1)]) 
    {
    case 0: // "< ",这不是标签flag了,只是一个“< "文本
    {
    *((WORD*)pCurrentOutIndex) = *((WORD*)pFindPos);
    pCurrentOutIndex +=2;
    pStart = pFindPos + 2;
    }
    break;
    case 1: //<!-
    {
    if(*(pFindPos + 2) == '-' )
    {
    pEndFindPos = (byte*)strstr((LPCTSTR)pFindPos+3,"->");
    if( NULL  == pEndFindPos)
    {
    m_strPreNeedCheckStream += (char*)pFindPos;
    goto parseend;
    } pStart = pEndFindPos + 2;
    }
    else
    {
    goto noMatchCase;
    }
    }
    break;
    case 2: //<style or <script
    {
    if( strncmp((LPCTSTR)pFindPos,"<style",6) == 0)
    {
    pEndFindPos =(byte*) strstr((LPCTSTR)pFindPos+6,"</style>");
    if( NULL  == pEndFindPos)
    {
    m_strPreNeedCheckStream += (char*)pFindPos;
    goto parseend;
    } pStart = pEndFindPos + 8;
    }
    else if( strncmp((LPCTSTR)pFindPos,"<script",7 ) == 0 )
    {
    pEndFindPos = (byte*)strstr((LPCTSTR)pFindPos+7,"/script>");
    if( NULL  == pEndFindPos)
    {
    m_strPreNeedCheckStream += (char*)pFindPos;
    goto parseend;
    } pStart = pEndFindPos + 8;
    }
    else
    {
    goto noMatchCase;
    }
    }
    break;
    case 3:
    {
    if( strncmp((LPCTSTR)pFindPos,"<meta",5) == 0)
    {
    pEndFindPos = (byte*)strchr((LPCTSTR)pFindPos+5,'>');
    if( NULL  == pEndFindPos)
    {
    m_strPreNeedCheckStream += (char*)pFindPos;
    goto parseend;
    }
    if( m_bIsUtf8 == CHART_SET_UNKOWN)
    {
    CString strMeta;
    strMeta.Append((char*)pFindPos,(int)(pEndFindPos-pFindPos) ); int nCharsetPos = strMeta.Find("charset");
    if( -1 != nCharsetPos )
    {
    m_bIsUtf8 =  -1 != strMeta.Find("utf-8",nCharsetPos+7) ? CHART_SET_UTF8:CHART_SET_GB2312;
    }
    } pStart = pEndFindPos + 1;
    }
    else
    {
    goto noMatchCase;
    }
    }
    break;
    default:
    noMatchCase:
    {
    pEndFindPos = (byte*)memchr(pFindPos+1,'>',dwBufferCount - (pFindPos + 1 - lpBuffers) );
    if( NULL  == pEndFindPos)
    {
    m_strPreNeedCheckStream += (char*)pFindPos;
    goto parseend;
    } pStart = pEndFindPos + 1; } } }parseend:
    if( lpBuffers)
    {
    delete[] lpBuffers;
    } nOutBufSize = pCurrentOutIndex - pOutbuf;
    }