UTF8格式的网页如何解析?

http://young.xh.blog.163.com/blog/static/95625320071154108561/
从网上找的，用着挺好的:)，实际用的话还要整理一下
// Converts a NUL-terminated multi-byte string from one Windows code page to
// another, going through UTF-16 as the intermediate form.
//
// Parameters:
//   str            - NUL-terminated input encoded in sourceCodepage.
//   sourceCodepage - code page identifier of the input (e.g. CP_UTF8).
//   targetCodepage - code page identifier of the desired output.
//
// Returns a NUL-terminated string allocated with malloc(); the caller owns it
// and must release it with free(). Returns NULL when str is NULL, when either
// conversion fails, or when memory cannot be allocated.
//
// The result is heap-allocated because a function-local array would go out of
// scope on return and a fixed-size array cannot hold arbitrarily long input.
char *Convert(char *str, int sourceCodepage, int targetCodepage)
{
    if (str == NULL)
        return NULL;

    // With a source length of -1 the API processes the terminator too, so the
    // reported sizes already include room for the trailing NUL.
    int unicodeLen = MultiByteToWideChar(sourceCodepage, 0, str, -1, NULL, 0);
    if (unicodeLen <= 0)
        return NULL;

    wchar_t *pUnicode = (wchar_t *)malloc(unicodeLen * sizeof(wchar_t));
    if (pUnicode == NULL)
        return NULL;

    char *pTargetData = NULL;
    if (MultiByteToWideChar(sourceCodepage, 0, str, -1, pUnicode, unicodeLen) > 0)
    {
        // Size query first: output pointer NULL and output size 0.
        int targetLen = WideCharToMultiByte(targetCodepage, 0, pUnicode, -1,
            NULL, 0, NULL, NULL);
        if (targetLen > 0)
        {
            pTargetData = (char *)malloc(targetLen);
            if (pTargetData != NULL &&
                WideCharToMultiByte(targetCodepage, 0, pUnicode, -1,
                    pTargetData, targetLen, NULL, NULL) <= 0)
            {
                free(pTargetData);
                pTargetData = NULL;
            }
        }
    }

    free(pUnicode);
    return pTargetData;
}
使用：编码 Convert(NewName, 936, CP_UTF8);解码 Convert(NewName, CP_UTF8, 936);

谢谢楼上的,这个我也有
不过用在UNICODE字符集的工程里行不行哇:)

unicode工程下直接转化成unicode就可以了
MultiByteToWideChar(CP_UTF8, ... );
转化一次就可以了.

UTF8转Unicode: CA2T(szUtf8, CP_UTF8)
Unicode转UTF8: AtlUnicodeToUtf8(...)

UTF8toUnicode(char *s)
{
int len = 0;
WCHAR* r = new WCHAR[strlen(s) * 2];
while(s[0])
{
int bytes = 1;
if(s[0] & 0x80)
while(s[0] & (0x80 >> bytes))
{
bytes++;
}
if(bytes == 1)
{
r[len] = s[0];
}
else
{
r[len] = 0;
for(char*p = s + (bytes - 1); p > s; p--)
r[len] |= ((*p) & 0x3F) << ((bytes - (p - s) - 1) * 6);
r[len] |= (s[0] & ((1 << (7 - bytes)) - 1)) << ((bytes - 1) * 6);
}
len++;
s += bytes;
}
r[len] = 0;
char*buffer = new char[len * 2 + 1];
ZeroMemory(buffer, len * 2 + 1);
::WideCharToMultiByte(CP_ACP, NULL, r, len, buffer, 1+ 2 * len, NULL, NULL);
CString str = buffer;
delete[] r;
delete[] buffer;
return str;
}

// Excerpt: converts the UTF-8 bytes in buf to wide characters, then converts
// those wide characters to the system ANSI code page (CP_ACP).
// NOTE(review): buf, l, headerSz and bUnicode are declared outside this
// excerpt; the surrounding function is not shown here.
// headerSz is a byte offset skipped at the start of the converted buffer below
// - presumably the converted byte-order mark (one 16-bit unit); verify.
headerSz = 2;
        bUnicode = TRUE;        char* pUTF8 = (char*)malloc((l=l*2)+1);
        // l has just been doubled; the new buffer is l+1 bytes long.
        ::MultiByteToWideChar(
            CP_UTF8,
            NULL,
            buf,
            // Source length in bytes: the pre-doubling value of l, plus one.
            l/2+1,
            reinterpret_cast<LPWSTR>(pUTF8),
            // NOTE(review): this argument is a count of wide characters, while
            // the buffer above was allocated as l+1 bytes - confirm the size.
            l+1);
        // The original byte buffer is released and buf now points at the wide text.
        free(buf);
        buf = pUTF8;if (bUnicode)
    {
        // Zero-filled destination of l+1 bytes for the ANSI text.
        char* pGB2312 = (char*)malloc(l+1);
        memset(pGB2312, NULL, l+1);
        ::WideCharToMultiByte(
            CP_ACP,
            NULL,
            // Skip headerSz bytes at the start of the wide buffer.
            reinterpret_cast<LPWSTR>(buf+headerSz),
            // Wide-character count derived from the byte size l, not from the
            // number of characters actually produced by the previous call.
            // NOTE(review): may cover bytes past the converted text - confirm.
            (l+1-headerSz)/2,
            pGB2312,
            l+1,
            NULL,
            NULL);
        // The wide buffer is released; buf now holds the ANSI text from offset 0.
        free(buf);
        buf = pGB2312;
        headerSz = 0;
    }即：UTF8->Unicode->GB2312;

// Loads an XML file into memory, converts it to the system ANSI code page when
// it starts with a Unicode byte-order mark, and hands the text to parseString.
//
// Recognised byte-order marks:
//   EF BB BF  UTF-8
//   FE FF     UTF-16 big endian
//   FF FE     UTF-16 little endian
// A file without one of these marks is passed to parseString unchanged.
//
// Parameters:
//   filename - path of the file to read (opened in binary mode).
//   tag      - forwarded unchanged to parseString.
//   pResults - optional; receives the line/column reset and any error code.
//
// Returns the node produced by parseString, or emptyXMLNode with
// pResults->error set to:
//   eXMLErrorFileNotFound         - the file could not be opened;
//   eXMLErrorEmpty                - the file is empty or nothing could be read;
//   eXMLErrorCharConversionError  - a buffer could not be allocated or a
//                                   character-set conversion failed.
XMLNode XMLNode::parseFile(const char *filename, XMLCSTR tag, XMLResults *pResults)
{
    if (pResults) { pResults->nLine=0; pResults->nColumn=0; }
    FILE *f=fopen(filename,"rb");
    if (f==NULL) { if (pResults) pResults->error=eXMLErrorFileNotFound; return emptyXMLNode; }

    fseek(f,0,SEEK_END);
    int l=(int)ftell(f);
    fseek(f,0,SEEK_SET);
    // Close the file on every early return so the handle is never leaked.
    if (l<=0) { fclose(f); if (pResults) pResults->error=eXMLErrorEmpty; return emptyXMLNode; }

    char *buf=(char*)malloc(l+1);
    if (!buf)
    {
        fclose(f);
        if (pResults) pResults->error=eXMLErrorCharConversionError;
        return emptyXMLNode;
    }
    // Use the number of bytes actually read as the content length.
    l=(int)fread(buf,1,l,f);
    fclose(f);
    if (l<=0) { free(buf); if (pResults) pResults->error=eXMLErrorEmpty; return emptyXMLNode; }
    buf[l]=0;

    const BYTE btUTF8[3] = {0xEF, 0xBB, 0xBF};
    const BYTE btUnicode[2] = {0xFF, 0xFE};
    const BYTE btUnicodeBE[2] = {0xFE, 0xFF};

    BOOL bUnicode = FALSE;   // TRUE when the content must be converted to ANSI
    BOOL bFailed = FALSE;    // TRUE once any conversion step has failed
    WCHAR *pWide = NULL;     // first UTF-16 code unit after the byte-order mark
    WCHAR *pWideOwned = NULL;// separately allocated wide buffer (UTF-8 case only)
    int wideLen = 0;         // number of UTF-16 code units at pWide

    // The length guards keep memcmp inside the bytes that were read.
    if (l>=3 && memcmp(buf, btUTF8, 3) == 0)
    {
        // UTF-8: decode everything after the 3-byte mark into a wide buffer
        // whose size is obtained from the API.
        bUnicode = TRUE;
        int srcLen = l-3;
        if (srcLen > 0)
        {
            wideLen = ::MultiByteToWideChar(CP_UTF8, 0, buf+3, srcLen, NULL, 0);
            if (wideLen > 0)
                pWideOwned = (WCHAR*)malloc(wideLen*sizeof(WCHAR));
            if (!pWideOwned ||
                ::MultiByteToWideChar(CP_UTF8, 0, buf+3, srcLen, pWideOwned, wideLen) <= 0)
                bFailed = TRUE;
            pWide = pWideOwned;
        }
    }
    else if (l>=2 && memcmp(buf, btUnicodeBE, 2) == 0)
    {
        // UTF-16 big endian: swap each complete byte pair after the mark in
        // place; a trailing odd byte is ignored.
        bUnicode = TRUE;
        for (int i=2; i+1<l; i+=2)
        {
            char bt = buf[i];
            buf[i] = buf[i+1];
            buf[i+1] = bt;
        }
        pWide = reinterpret_cast<WCHAR*>(buf+2);
        wideLen = (l-2)/2;
    }
    else if (l>=2 && memcmp(buf, btUnicode, 2) == 0)
    {
        // UTF-16 little endian: usable as-is after the mark.
        bUnicode = TRUE;
        pWide = reinterpret_cast<WCHAR*>(buf+2);
        wideLen = (l-2)/2;
    }

    if (bUnicode)
    {
        char *pAnsi = NULL;
        if (!bFailed)
        {
            int ansiLen = 0;
            if (wideLen > 0)
            {
                ansiLen = ::WideCharToMultiByte(CP_ACP, 0, pWide, wideLen, NULL, 0, NULL, NULL);
                if (ansiLen <= 0) bFailed = TRUE;
            }
            if (!bFailed)
                pAnsi = (char*)malloc(ansiLen+1);
            if (pAnsi)
            {
                if (ansiLen > 0 &&
                    ::WideCharToMultiByte(CP_ACP, 0, pWide, wideLen, pAnsi, ansiLen, NULL, NULL) <= 0)
                {
                    free(pAnsi);
                    pAnsi = NULL;
                }
                else
                {
                    // The counted conversion does not write a terminator.
                    pAnsi[ansiLen] = 0;
                }
            }
        }
        free(pWideOwned);
        free(buf);
        buf = pAnsi;
    }

    if (!buf) { if (pResults) pResults->error=eXMLErrorCharConversionError; return emptyXMLNode; }
    XMLNode x=parseString((XMLSTR)buf,tag,pResults);
    free(buf);
    return x;
}

本来挺简单的问题,就获取UTF8编码的网页,再转换解析
搞得我头都大了,测试了网上的和楼上的代码,乱码越解越乱了
汗~~~~~~接着试,先谢谢楼上各位

调试易

UTF8格式的网页如何解析?

解决方案 »