我到网上找了几个流行的utf-8编码算法,但是每个编码出来都不完全一样。而且都可以完全解码出来!我都不知道怎么了!比如一句话:“你是我的好朋友”有些算法编码出来是:“浣犳槸鎴戠殑濂芥湅鍙”
有些算法编码出来是:“浣犳槸鎴戠殑濂芥湅鍙嬪”这些不同编码都可以还原回去。请问怎么回事??大家怎么编码的?能给一个算法么??vc6这方面真烦?
有些算法编码出来是:“浣犳槸鎴戠殑濂芥湅鍙嬪”这些不同编码都可以还原回去。请问怎么回事??大家怎么编码的?能给一个算法么??vc6这方面真烦?
{
if(!utf8 || length == 0) return true;
const byte* pc = (const byte*)utf8;
const byte* last = pc + length;
uint b;
uint num_errors = 0;
int i = 0;
while (pc < last)
{
b = *pc++; if( !b ) break; // 0 - is eos in all utf encodings if ((b & 0x80) == 0)
{
// 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
;
}
else if ((b & 0xe0) == 0xc0)
{
// 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
if(pc == last) { outbuf[i++]='?'; ++num_errors; break; }
b = (b & 0x1f) << 6;
b |= (*pc++ & 0x3f);
}
else if ((b & 0xf0) == 0xe0)
{
// 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
if(pc >= last - 1) { outbuf[i++]='?'; ++num_errors; break; }
b = (b & 0x0f) << 12;
b |= (*pc++ & 0x3f) << 6;
b |= (*pc++ & 0x3f);
if(b == 0xFEFF &&
i == 0) // bom at start
continue; // skip it
}
else if ((b & 0xf8) == 0xf0)
{
// 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
if(pc >= last - 2) { outbuf[i++]='?'; break; } b = (b & 0x07) << 18;
b |= (*pc++ & 0x3f) << 12;
b |= (*pc++ & 0x3f) << 6;
b |= (*pc++ & 0x3f);
// b shall contain now full 21-bit unicode code point.
assert((b & 0x1fffff) == b);
if((b & 0x1fffff) != b)
{
outbuf[i++]='?';
++num_errors;
continue;
}
if( sizeof(wchar_t) == 16 ) // Seems like Windows, wchar_t is utf16 code units sequence there.
{
outbuf[i++] = wchar_t(0xd7c0 + (b >> 10));
outbuf[i++] = wchar_t(0xdc00 | (b & 0x3ff));
}
else if( sizeof(wchar_t) >= 21 ) // wchar_t is full ucs-4
{
outbuf[i++] = wchar_t(b);
}
else
{
assert(0); // what? wchar_t is single byte here?
}
}
else
{
assert(0); //bad start for UTF-8 multi-byte sequence"
++num_errors;
b = '?';
}
outbuf[i++] = wchar_t(b);
}
return num_errors == 0;
}

/**
 * Encodes `length` wide characters starting at `wcs` as UTF-8 into `outbuf`.
 *
 * Every input unit yields 1 to 4 bytes. A UTF-16 surrogate pair (high unit
 * immediately followed by a low unit) is merged into a single code point and
 * written as one 4-byte sequence. An unpaired surrogate is still written as a
 * 3-byte sequence, as it was before this change.
 *
 * No terminating zero is appended and the number of bytes written is not
 * reported.
 *
 * @param wcs     Input characters. NULL is accepted and treated as empty input.
 * @param length  Number of wchar_t units to read from `wcs`.
 * @param outbuf  Destination buffer; the caller must provide room for up to
 *                4 bytes per input unit.
 * @return true if every unit was encoded; false if at least one value was
 *         2^21 or larger (such values produce no output).
 */
bool utf8fromwcs(const wchar_t* wcs, size_t length, byte* outbuf)
{
    // Same convention as the decoder in this file: nothing to do is a success.
    if (!wcs || length == 0) return true;

    const wchar_t* pc = wcs;
    const wchar_t* end = wcs + length;
    uint num_errors = 0;
    int i = 0;

    // Bounds are checked BEFORE each read, so neither wcs[0] of an empty
    // input nor the element one past the end is ever dereferenced.
    while (pc < end)
    {
        unsigned int c = *pc++;

        // High surrogate followed by a low surrogate: rebuild the
        // supplementary-plane code point and consume both units.
        if (c >= 0xd800 && c <= 0xdbff && pc < end)
        {
            const unsigned int low = *pc;
            if (low >= 0xdc00 && low <= 0xdfff)
            {
                c = 0x10000 + ((c - 0xd800) << 10) + (low - 0xdc00);
                ++pc;
            }
        }

        if (c < (1 << 7))
        {
            // 0xxxxxxx
            outbuf[i++] = (byte(c));
        }
        else if (c < (1 << 11))
        {
            // 110yyyyy 10xxxxxx
            outbuf[i++] = (byte((c >> 6) | 0xc0));
            outbuf[i++] = (byte((c & 0x3f) | 0x80));
        }
        else if (c < (1 << 16))
        {
            // 1110zzzz 10yyyyyy 10xxxxxx
            outbuf[i++] = (byte((c >> 12) | 0xe0));
            outbuf[i++] = (byte(((c >> 6) & 0x3f) | 0x80));
            outbuf[i++] = (byte((c & 0x3f) | 0x80));
        }
        else if (c < (1 << 21))
        {
            // 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
            outbuf[i++] = (byte((c >> 18) | 0xf0));
            outbuf[i++] = (byte(((c >> 12) & 0x3f) | 0x80));
            outbuf[i++] = (byte(((c >> 6) & 0x3f) | 0x80));
            outbuf[i++] = (byte((c & 0x3f) | 0x80));
        }
        else
        {
            // Does not fit in 21 bits: nothing is written for this unit.
            ++num_errors;
        }
    }
    return num_errors == 0;
}
以下是我的代码:
// Re-encodes strGBK in place: system ANSI code page (CP_ACP) -> UTF-16 -> UTF-8.
//
// NOTE(review): CP_ACP is whatever ANSI code page the machine is configured
// with, so the input is only decoded as GBK when that code page is 936 —
// confirm this matches the callers' expectations.
//
// If either conversion step fails, strGBK is left unchanged (previously it was
// silently replaced by an empty string).
//
// The resulting UTF-8 bytes are stored in an ANSI CString; showing them in a
// GBK-based UI or debugger displays mojibake although the bytes are correct.
void CHttpSend::ConvertGBKToUtf8(CString& strGBK)
{
    // Size query. Passing -1 as the input length makes the API include the
    // terminating zero in the returned character count.
    const int wideLen = MultiByteToWideChar(CP_ACP, 0, (LPCTSTR)strGBK, -1, NULL, 0);
    if (wideLen <= 0)
        return;

    // WCHAR matches the LPWSTR parameter on every compiler; `unsigned short`
    // only did so where wchar_t is a typedef of it. sizeof(WCHAR) replaces
    // the hard-coded 2 bytes per character.
    WCHAR* wide = new WCHAR[wideLen + 1];
    memset(wide, 0, (wideLen + 1) * sizeof(WCHAR));

    if (MultiByteToWideChar(CP_ACP, 0, (LPCTSTR)strGBK, -1, wide, wideLen) > 0)
    {
        // Size query for the UTF-8 form, again including the terminator.
        const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wide, -1, NULL, 0, NULL, NULL);
        if (utf8Len > 0)
        {
            char* utf8 = new char[utf8Len + 1];
            memset(utf8, 0, utf8Len + 1);

            // Only overwrite the caller's string once the conversion succeeded.
            if (WideCharToMultiByte(CP_UTF8, 0, wide, -1, utf8, utf8Len, NULL, NULL) > 0)
                strGBK = utf8;

            delete[] utf8;
        }
    }

    delete[] wide;
}
帮忙看看有什么错误!