前些天提了一个关于一些网页的UTF8字符不能通过DELPHI自带的UTF8DECODE函数正确处理的问题。
有朋友提供了一个函数来解决这种UTF8字符串含有乱码仍能正确解码的情况,如下
{ Decodes a UTF-8 byte string into a WideString using the Win32
  MultiByteToWideChar converter.

  The conversion is deliberately lenient: dwFlags is 0, so malformed byte
  sequences do not make the call fail. This function therefore cannot be used
  to test whether S is really UTF-8 (that would need MB_ERR_INVALID_CHARS).

  Returns an empty string when S is empty or when the API reports failure. }
function DecodeUtf8Str(const S: UTF8String): WideString;
var
  lenSrc, lenDst: Integer;
begin
  // A managed-type function result is not guaranteed to start out empty, so
  // every early exit below must leave a well-defined value behind.
  Result := '';
  lenSrc := Length(S);
  if lenSrc = 0 then
    Exit;
  // First pass: no output buffer, the API only reports the required length
  // in WideChars.
  lenDst := MultiByteToWideChar(CP_UTF8, 0, PAnsiChar(S), lenSrc, nil, 0);
  if lenDst <= 0 then
    Exit;
  SetLength(Result, lenDst);
  // Second pass: perform the actual conversion into the sized buffer.
  if MultiByteToWideChar(CP_UTF8, 0, PAnsiChar(S), lenSrc,
    PWideChar(Result), lenDst) = 0 then
    Result := '';
end;
但又出现了新问题,因为原来我用DELPHI自带的UTF8DECODE函数来判断一个字符串是否UTF8编码,如果函数返回空值,表示非UTF8编码,如果返回值不为空则返回值为解码后的字符串。现在的这个函数,即使非UTF8字符串,它仍会进行解码,导致GB2312字符串等被处理成乱码。这里向大家寻求一个好用的识别字符串是否UTF8的方法。
有朋友提供了一个函数来解决这种UTF8字符串含有乱码仍能正确解码的情况,如下
{ Decodes a UTF-8 byte string into a WideString using the Win32
  MultiByteToWideChar converter.

  The conversion is deliberately lenient: dwFlags is 0, so malformed byte
  sequences do not make the call fail. This function therefore cannot be used
  to test whether S is really UTF-8 (that would need MB_ERR_INVALID_CHARS).

  Returns an empty string when S is empty or when the API reports failure. }
function DecodeUtf8Str(const S: UTF8String): WideString;
var
  lenSrc, lenDst: Integer;
begin
  // A managed-type function result is not guaranteed to start out empty, so
  // every early exit below must leave a well-defined value behind.
  Result := '';
  lenSrc := Length(S);
  if lenSrc = 0 then
    Exit;
  // First pass: no output buffer, the API only reports the required length
  // in WideChars.
  lenDst := MultiByteToWideChar(CP_UTF8, 0, PAnsiChar(S), lenSrc, nil, 0);
  if lenDst <= 0 then
    Exit;
  SetLength(Result, lenDst);
  // Second pass: perform the actual conversion into the sized buffer.
  if MultiByteToWideChar(CP_UTF8, 0, PAnsiChar(S), lenSrc,
    PWideChar(Result), lenDst) = 0 then
    Result := '';
end;
但又出现了新问题,因为原来我用DELPHI自带的UTF8DECODE函数来判断一个字符串是否UTF8编码,如果函数返回空值,表示非UTF8编码,如果返回值不为空则返回值为解码后的字符串。现在的这个函数,即使非UTF8字符串,它仍会进行解码,导致GB2312字符串等被处理成乱码。这里向大家寻求一个好用的识别字符串是否UTF8的方法。
s是utf8
protected
type
TParseResult = (prMatch, prNotMatch, prOutOfRange, prIgnore);
TFuncParser = reference to function(var Run: PAnsiChar; Last: PAnsiChar): TParseResult;
class var
FMapped : Boolean;
FParsers : array[AnsiChar]of TFuncParser;
class function SubMatch(var Run: PAnsiChar; Last: PAnsiChar; Count: Byte): TParseResult; static; inline;
public
class function Test(const Buffer: TBuffer; MaxParseSize: Integer): Byte; override;
end;

{ Scans the leading bytes of Buffer and scores how well they parse as UTF-8.

  Buffer       - source bytes; Buffer.Buffer is the first byte and Buffer.Size
                 the number of available bytes.
  MaxParseSize - upper bound on the number of bytes to scan; 0 means "scan the
                 whole buffer".

  Every byte is dispatched through the FParsers table (built lazily on first
  use). ASCII bytes are counted as ignored, malformed bytes or sequences as
  failed. The result is GetDivRate(failed, scanned - ignored); returns 0 when
  there is nothing to scan.

  NOTE(review): for pure-ASCII input the second argument of GetDivRate is 0.
  GetDivRate is not visible here - confirm that it copes with a zero divisor. }
class function TLiteEncodingParser.TUtf8Parser.Test(const Buffer: TBuffer;
  MaxParseSize: Integer): Byte;

  // Builds the 256-entry lead-byte dispatch table and marks it as initialised.
  procedure InitParsersMap;
  var
    reffuncs: array[0..6] of TFuncParser;

    // One handler per byte class; the index matches the Flags table below,
    // with slot 0 reserved for bytes that can never start a sequence.
    procedure InitReferenceFunctions;
    begin
      // Stray continuation byte or invalid lead byte: count as a failure.
      reffuncs[0] := function(var Run: PAnsiChar; Last: PAnsiChar): TParseResult
        begin
          Result := prNotMatch;
          Inc(Run);
        end;
      // Plain ASCII: says nothing about the encoding, so it is ignored.
      reffuncs[1] := function(var Run: PAnsiChar; Last: PAnsiChar): TParseResult
        begin
          Result := prIgnore;
          Inc(Run);
        end;
      // Lead bytes that must be followed by 1..5 continuation bytes.
      reffuncs[2] := function(var Run: PAnsiChar; Last: PAnsiChar): TParseResult
        begin
          Result := SubMatch(Run, Last, 1);
        end;
      reffuncs[3] := function(var Run: PAnsiChar; Last: PAnsiChar): TParseResult
        begin
          Result := SubMatch(Run, Last, 2);
        end;
      reffuncs[4] := function(var Run: PAnsiChar; Last: PAnsiChar): TParseResult
        begin
          Result := SubMatch(Run, Last, 3);
        end;
      reffuncs[5] := function(var Run: PAnsiChar; Last: PAnsiChar): TParseResult
        begin
          Result := SubMatch(Run, Last, 4);
        end;
      reffuncs[6] := function(var Run: PAnsiChar; Last: PAnsiChar): TParseResult
        begin
          Result := SubMatch(Run, Last, 5);
        end;
    end;

  const
    // Lead-byte bit patterns. The legacy 5- and 6-byte forms are still
    // accepted here, exactly as in the original table.
    Flags: array[1..6] of record
      flgAnd, flgMatch: Byte;
    end = (
      ( flgAnd: $80; flgMatch: $00 ), //0xxx xxxx
      ( flgAnd: $E0; flgMatch: $C0 ), //110x xxxx
      ( flgAnd: $F0; flgMatch: $E0 ), //1110 xxxx
      ( flgAnd: $F8; flgMatch: $F0 ), //1111 0xxx
      ( flgAnd: $FC; flgMatch: $F8 ), //1111 10xx
      ( flgAnd: $FE; flgMatch: $FC )  //1111 110x
    );
  var
    a: AnsiChar;
    i: Integer;
  begin
    InitReferenceFunctions;
    for a := Low(a) to High(a) do
    begin
      // Default: the byte cannot start a sequence.
      FParsers[a] := reffuncs[0];
      for i := Low(Flags) to High(Flags) do
        with Flags[i] do
          if (Byte(a) and flgAnd) = flgMatch then
          begin
            FParsers[a] := reffuncs[i];
            Break;
          end;
    end;
    FMapped := True;
  end;

var
  pRun, pEnd: PAnsiChar;
  nIgnore, nFailed: Integer;
begin
  if MaxParseSize = 0 then
    MaxParseSize := Buffer.Size
  else
    MaxParseSize := Min(Buffer.Size, MaxParseSize);
  // Nothing to scan (empty buffer, or a negative size was passed in).
  if MaxParseSize <= 0 then
    Exit(0);

  if not FMapped then
    InitParsersMap;

  pRun := Buffer.Buffer;
  pEnd := pRun + MaxParseSize; // one past the last byte that may be read
  nIgnore := 0;
  nFailed := 0;
  // Strictly "<": pEnd itself lies outside the scanned range, so it must
  // never be dereferenced or counted.
  while pRun < pEnd do
    case FParsers[pRun^](pRun, pEnd) of
      {prMatch :
        Inc(nMatch);}
      prIgnore :
        Inc(nIgnore);
      prOutOfRange :
        Break; // sequence truncated by the scan limit: stop without penalty
      prNotMatch :
        Inc(nFailed);
    end;
  Result := GetDivRate(nFailed, (MaxParseSize - nIgnore));
  // 255 - (nFailed shl 8) div (pEnd-pRun - nIgnore);
end;

{ Validates the continuation bytes of one multi-byte sequence.

  Run   - on entry points at the lead byte; it is advanced as bytes are
          consumed.
  Last  - one past the last byte that may be read.
  Count - number of continuation bytes (10xx xxxx) the lead byte requires.

  Returns prOutOfRange when the sequence would extend beyond Last (Run is left
  just after the lead byte), prNotMatch at the first byte that is not a
  continuation byte (Run is left ON that byte so the caller classifies it
  again), and prMatch with Run just past the sequence otherwise. }
class function TLiteEncodingParser.TUtf8Parser.SubMatch(var Run: PAnsiChar;
  Last: PAnsiChar; Count: Byte): TParseResult;
begin
  Inc(Run);
  // The continuation bytes occupy Run .. Run+Count-1, so the sequence fits as
  // long as Run+Count does not go beyond the one-past-the-end pointer.
  if (Run + Count) > Last then
    Exit(prOutOfRange);
  //(flgAnd: $C0; flgMatch: $80), 10xx xxxx
  while Count > 0 do
  begin
    if (Byte(Run^) and $C0) <> $80 then
      Exit(prNotMatch);
    Inc(Run);
    Dec(Count);
  end;
  Result := prMatch;
end;
这个办法和1楼一样,遇到UTF8编码字符串中有非法字符就会判断错误
纯英文(ASCII)的文本就无法判断了:UTF-8 和 ANSI 对英文字符的编码是完全一样的。