小弟有个程序要解析html to txt 用了mshtml.dll
能解析,但是遇到一些网页就会弹出一些提示,比如“安全隐患”、“未找到xxx.js”等。
不知道有没有什么别的好点的办法来解析html,
部分代码
HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,IID_IHTMLDocument2, (void**)&pDoc);
SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
VARIANT *param;
bstr_t bsData = (LPCTSTR)html;
AfxMessageBox(bsData);
hr = SafeArrayAccessData(psa, (LPVOID*)&param);
param->vt = VT_BSTR;
param->bstrVal = (BSTR)bsData;
hr = pDoc->write(psa);
hr = pDoc->close();
SafeArrayDestroy(psa);
BSTR bstr;
pDoc->body->get_outerText(&bstr);
pElement->get_outerText(&bstr);
如果能在程序中忽略这些提示也可以,或者用别的方式也可以(不要用正则)。哪位大虾帮帮我啊!
能解析,但是遇到一些网页就会弹出一些提示,比如“安全隐患”、“未找到xxx.js”等。
不知道有没有什么别的好点的办法来解析html,
部分代码
HRESULT hr = CoCreateInstance(CLSID_HTMLDocument, NULL, CLSCTX_INPROC_SERVER,IID_IHTMLDocument2, (void**)&pDoc);
SAFEARRAY* psa = SafeArrayCreateVector(VT_VARIANT, 0, 1);
VARIANT *param;
bstr_t bsData = (LPCTSTR)html;
AfxMessageBox(bsData);
hr = SafeArrayAccessData(psa, (LPVOID*)&param);
param->vt = VT_BSTR;
param->bstrVal = (BSTR)bsData;
hr = pDoc->write(psa);
hr = pDoc->close();
SafeArrayDestroy(psa);
BSTR bstr;
pDoc->body->get_outerText(&bstr);
pElement->get_outerText(&bstr);
如果能在程序中忽略这些提示也可以,或者用别的方式也可以(不要用正则)。哪位大虾帮帮我啊!
/**
 * Extracts the text between HTML tags from one chunk of an HTML byte stream.
 *
 * The chunk is copied into a scratch buffer and lower-cased with
 * CUtility::FastStrLwr; whatever unterminated markup the previous call left in
 * m_strPreNeedCheckStream is then placed in front of it. The scratch buffer is
 * scanned for '<':
 *   - text runs between tags are copied to pOutbuf (leading bytes flagged in
 *     aHtmlRemoveChar are skipped first);
 *   - comments, <style>...</style>, <script>...</script>, <meta ...> and every
 *     other <...> tag are dropped;
 *   - a construct whose terminator is not in this chunk is appended to
 *     m_strPreNeedCheckStream and parsing stops, so the next call can resume.
 *
 * When m_bIsUtf8 == CHART_SET_UTF8 each emitted run is converted in place with
 * CUtility::ConvertUtf8ToGBK (and CUtility::ConvertGBKToGB2312 when
 * CConfig::GetInstance().m_bCheckTraditional is set). While m_bIsUtf8 is
 * CHART_SET_UNKOWN, a <meta> tag containing "charset" decides the encoding.
 *
 * @param lpOrigBuffers  Input chunk; dwBufferCount bytes are read from it.
 * @param dwBufferCount  Number of bytes in lpOrigBuffers.
 * @param pOutbuf        Destination for the extracted text. Its capacity is not
 *                       checked here; the UTF-8 path also writes one NUL byte
 *                       just past each copied run before converting.
 * @param nOutBufSize    [out] Number of bytes written to pOutbuf.
 * @param caoPosHelp     Receives m_pOrigHead = lpOrigBuffers and, for every
 *                       emitted run, its offset in pOutbuf (m_vecRangeConver)
 *                       and the matching pointer into lpOrigBuffers
 *                       (m_vecRangeOrigal).
 */
void CIllegalCheck::ParseHtml(byte* lpOrigBuffers, size_t dwBufferCount,byte* pOutbuf,size_t& nOutBufSize,CConverAndOrigPosHelp& caoPosHelp)
{
    caoPosHelp.m_pOrigHead = lpOrigBuffers;
    byte* pCurrentOutIndex = pOutbuf;
    nOutBufSize = 0;

    // Length of the markup carried over from the previous call. It is cached
    // because m_strPreNeedCheckStream is cleared below.
    const size_t nCarryLen = m_strPreNeedCheckStream.length();

    // Scratch layout: [carried prefix][lower-cased chunk]['\0'].
    byte* lpBuffers = new byte[dwBufferCount + 1 + nCarryLen];
    if( lpBuffers == NULL)
    {
        // Only reachable with a non-throwing operator new; kept for such builds.
        return ;
    }
    lpBuffers[dwBufferCount + nCarryLen] = '\0';
    memcpy(lpBuffers + nCarryLen, lpOrigBuffers, dwBufferCount);

    // Scratch address that corresponds to lpOrigBuffers[0]; used to map scratch
    // positions back to the caller's buffer.
    byte* pPoshelpOrigHead = lpBuffers + nCarryLen;

    // Lower-case only the new chunk; the carried prefix is copied in afterwards.
    CUtility::FastStrLwr( (char*)(lpBuffers + nCarryLen) );
    memcpy(lpBuffers, m_strPreNeedCheckStream.c_str(), nCarryLen);
    dwBufferCount += nCarryLen;
    m_strPreNeedCheckStream = "";

    byte* pStart = lpBuffers;
    byte* pFindPos = NULL;
    byte* pEndFindPos = NULL;
    byte* pWholeBufEnd = lpBuffers + dwBufferCount;

    while (pStart < pWholeBufEnd)
    {
        pFindPos = (byte*)memchr(pStart, '<', (size_t)(pWholeBufEnd - pStart));
        if( NULL == pFindPos )
        {
            // No more markup: everything that is left is text.
            // The skip is bounded so it can never walk past the buffer end and
            // make the size computation below underflow.
            while( pStart < pWholeBufEnd && aHtmlRemoveChar[*pStart] )
            {
                ++pStart;
            }
            size_t nNewContentSize = (size_t)(pWholeBufEnd - pStart);
            if( nNewContentSize > 0 )
            {
                memcpy(pCurrentOutIndex, pStart, nNewContentSize);
                if( m_bIsUtf8 == CHART_SET_UTF8 )
                {
                    pCurrentOutIndex[nNewContentSize] = '\0';
                    // NOTE(review): the mid-buffer path below subtracts 1 from
                    // this return value and this one does not; confirm which is
                    // intended against ConvertUtf8ToGBK's contract.
                    nNewContentSize = CUtility::ConvertUtf8ToGBK((char*)pCurrentOutIndex, nNewContentSize);
                    if( CConfig::GetInstance().m_bCheckTraditional )
                    {
                        CUtility::ConvertGBKToGB2312((char*)pCurrentOutIndex);
                    }
                }
                caoPosHelp.m_vecRangeConver.push_back(pCurrentOutIndex - pOutbuf);
                // NOTE(review): if pStart lies inside the carried prefix this
                // yields a pointer before lpOrigBuffers - confirm callers cope.
                caoPosHelp.m_vecRangeOrigal.push_back(lpOrigBuffers + (pStart - pPoshelpOrigHead));
                pCurrentOutIndex += nNewContentSize;
            }
            break;
        }

        if( pStart != pFindPos )
        {
            // Text run in front of the next '<'. Bounded at pFindPos so the
            // length below cannot go negative.
            while( pStart < pFindPos && aHtmlRemoveChar[*pStart] )
            {
                ++pStart;
            }
            size_t nNewContentSize = (size_t)(pFindPos - pStart);
            if( nNewContentSize > 0 )
            {
                memcpy(pCurrentOutIndex, pStart, nNewContentSize);
                if( m_bIsUtf8 == CHART_SET_UTF8 )
                {
                    pCurrentOutIndex[nNewContentSize] = '\0';
                    nNewContentSize = CUtility::ConvertUtf8ToGBK((char*)pCurrentOutIndex, nNewContentSize) - 1;
                    if( CConfig::GetInstance().m_bCheckTraditional )
                    {
                        CUtility::ConvertGBKToGB2312((char*)pCurrentOutIndex);
                    }
                }
                caoPosHelp.m_vecRangeConver.push_back(pCurrentOutIndex - pOutbuf);
                caoPosHelp.m_vecRangeOrigal.push_back(lpOrigBuffers + (pStart - pPoshelpOrigHead));
                pCurrentOutIndex += nNewContentSize;
            }
        }

        // Classify the construct by the byte that follows '<'. If '<' is the
        // last byte, this reads the NUL terminator, which is still in bounds.
        switch( dwHtmlLabelCheck[*(pFindPos + 1)] )
        {
        case 0: // "< " is not a tag opener, just literal "< " text: copy both bytes through
            {
                *((WORD*)pCurrentOutIndex) = *((WORD*)pFindPos);
                pCurrentOutIndex += 2;
                pStart = pFindPos + 2;
            }
            break;

        case 1: // "<!-": comment
            {
                if( *(pFindPos + 2) == '-' )
                {
                    // Matching "->" also matches the tail of "-->".
                    pEndFindPos = (byte*)strstr((const char*)pFindPos + 3, "->");
                    if( NULL == pEndFindPos )
                    {
                        // Comment continues in the next chunk: carry it over.
                        m_strPreNeedCheckStream += (char*)pFindPos;
                        goto parseend;
                    }
                    pStart = pEndFindPos + 2;
                }
                else
                {
                    goto noMatchCase;
                }
            }
            break;

        case 2: // "<style" or "<script": drop the element together with its content
            {
                if( strncmp((const char*)pFindPos, "<style", 6) == 0 )
                {
                    pEndFindPos = (byte*)strstr((const char*)pFindPos + 6, "</style>");
                    if( NULL == pEndFindPos )
                    {
                        m_strPreNeedCheckStream += (char*)pFindPos;
                        goto parseend;
                    }
                    pStart = pEndFindPos + 8;   // strlen("</style>")
                }
                else if( strncmp((const char*)pFindPos, "<script", 7) == 0 )
                {
                    pEndFindPos = (byte*)strstr((const char*)pFindPos + 7, "/script>");
                    if( NULL == pEndFindPos )
                    {
                        m_strPreNeedCheckStream += (char*)pFindPos;
                        goto parseend;
                    }
                    pStart = pEndFindPos + 8;   // strlen("/script>")
                }
                else
                {
                    goto noMatchCase;
                }
            }
            break;

        case 3: // "<meta": may declare the document charset
            {
                if( strncmp((const char*)pFindPos, "<meta", 5) == 0 )
                {
                    pEndFindPos = (byte*)strchr((const char*)pFindPos + 5, '>');
                    if( NULL == pEndFindPos )
                    {
                        m_strPreNeedCheckStream += (char*)pFindPos;
                        goto parseend;
                    }
                    if( m_bIsUtf8 == CHART_SET_UNKOWN )
                    {
                        // Encoding not decided yet: look for charset=... in this tag.
                        CString strMeta;
                        strMeta.Append((char*)pFindPos, (int)(pEndFindPos - pFindPos));
                        int nCharsetPos = strMeta.Find("charset");
                        if( -1 != nCharsetPos )
                        {
                            m_bIsUtf8 = -1 != strMeta.Find("utf-8", nCharsetPos + 7) ? CHART_SET_UTF8 : CHART_SET_GB2312;
                        }
                    }
                    pStart = pEndFindPos + 1;
                }
                else
                {
                    goto noMatchCase;
                }
            }
            break;

        default:
        noMatchCase:
            {
                // Any other tag: skip up to and including the closing '>'.
                pEndFindPos = (byte*)memchr(pFindPos + 1, '>', (size_t)(pWholeBufEnd - (pFindPos + 1)));
                if( NULL == pEndFindPos )
                {
                    m_strPreNeedCheckStream += (char*)pFindPos;
                    goto parseend;
                }
                pStart = pEndFindPos + 1;
            }
        }
    }

parseend:
    delete[] lpBuffers;
    nOutBufSize = pCurrentOutIndex - pOutbuf;
}