费了一下午时间,终于把解析UTF8做好了

import java.io.*;

/**
 * Decodes percent-encoded UTF-8 text (for example {@code "%e6%a1%8c"}) into
 * Java strings using a hand-written UTF-8 decoder.
 *
 * <p>All methods are static. The implicit public constructor is kept so that
 * existing code that instantiates the class keeps compiling.
 */
public class StringCoder
{
    // NOTE(review): none of the BYTES_OF_* constants is referenced inside this
    // class; they are kept unchanged for any external callers.
    public static final int BYTES_OF_CHAR = 4;
    public static final int BYTES_OF_INTEGER = 4;
    public static final int BYTES_OF_LONG = 8;
    public static final int BYTES_OF_FLOAT = 4;
    public static final int BYTES_OF_DOUBLE = 8;

    // Hex digit lookup table; currently unused by the decoder.
    private static final char[] hexCharArray = {'0', '1', '2', '3',
        '4', '5', '6', '7',
        '8', '9', 'A', 'B',
        'C', 'D', 'E', 'F'};

    /** Emitted in place of every byte that is not part of well-formed UTF-8. */
    private static final char REPLACEMENT_CHAR = (char) 0xFFFD;

    /** Longest UTF-8 sequence accepted by the decoder (RFC 3629 limit). */
    private static final int MAX_SEQUENCE_LENGTH = 4;

    /** Smallest code point that legitimately needs a sequence of the given length. */
    private static final int[] MIN_CODE_POINT = {0, 0, 0x80, 0x800, 0x10000};

    /**
     * Decodes a percent-encoded string whose escapes carry UTF-8 bytes.
     *
     * <p>Each {@code %XX} escape contributes one byte; runs of consecutive
     * escapes are decoded together as UTF-8. Characters that are not part of
     * an escape are copied to the result unchanged (the previous version
     * silently dropped them).
     *
     * @param src the percent-encoded text
     * @return the decoded text; an empty string for empty input
     * @throws NullPointerException if {@code src} is {@code null}
     * @throws IllegalArgumentException if a {@code %} is not followed by two
     *         hexadecimal digits
     */
    public static String decodeUTF8(String src)
    {
        int length = src.length();
        StringBuilder decoded = new StringBuilder(length);
        // Every escape occupies three characters, so this is the most bytes we can collect.
        byte[] pending = new byte[length / 3];
        int pendingCount = 0;

        int i = 0;
        while (i < length)
        {
            char c = src.charAt(i);
            if (c == '%')
            {
                if (i + 3 > length)
                {
                    throw new IllegalArgumentException(
                        "Incomplete percent escape at index " + i + " in \"" + src + "\"");
                }
                pending[pendingCount++] = hexStringToByte(src.substring(i + 1, i + 3));
                i += 3;
            }
            else
            {
                // A literal character ends the current run of escaped bytes.
                if (pendingCount > 0)
                {
                    decoded.append(decodeUTF8(pending, 0, pendingCount));
                    pendingCount = 0;
                }
                decoded.append(c);
                i++;
            }
        }
        if (pendingCount > 0)
        {
            decoded.append(decodeUTF8(pending, 0, pendingCount));
        }
        return decoded.toString();
    }

    /**
     * Decodes the UTF-8 bytes in {@code buffer[begin .. end)}.
     *
     * <p>Sequences of one to four bytes are supported, including code points
     * above U+FFFF, which are emitted as surrogate pairs. Every byte that
     * cannot start a well-formed sequence (stray continuation byte, truncated
     * or overlong sequence, encoded surrogate, value above U+10FFFF, or a lead
     * byte announcing more than four bytes) is replaced by U+FFFD and decoding
     * resumes at the next byte.
     *
     * @param buffer the bytes to decode
     * @param begin index of the first byte to decode (inclusive)
     * @param end index after the last byte to decode (exclusive)
     * @return the decoded text; an empty string when {@code begin >= end}
     * @throws ArrayIndexOutOfBoundsException if the non-empty range does not
     *         lie inside {@code buffer}
     */
    public static String decodeUTF8(byte[] buffer, int begin, int end)
    {
        if (begin >= end)
        {
            return "";
        }
        if (begin < 0 || end > buffer.length)
        {
            throw new ArrayIndexOutOfBoundsException(
                "Range [" + begin + ", " + end + ") is outside a buffer of length " + buffer.length);
        }

        StringBuilder decoded = new StringBuilder(end - begin);
        int i = begin;
        while (i < end)
        {
            byte lead = buffer[i];
            int sequenceLength = getLeftCountOf1InByte(lead);
            int codePoint = -1;

            if ((lead & 0x80) == 0)
            {
                // Top bit clear: a single-byte (ASCII) character.
                codePoint = lead;
                sequenceLength = 1;
            }
            else if (sequenceLength >= 2
                && sequenceLength <= MAX_SEQUENCE_LENGTH
                && i + sequenceLength <= end)
            {
                codePoint = readSequence(buffer, i, sequenceLength);
            }

            if (codePoint < 0)
            {
                // Malformed input: substitute and resynchronise on the next byte.
                decoded.append(REPLACEMENT_CHAR);
                i++;
            }
            else
            {
                decoded.appendCodePoint(codePoint);
                i += sequenceLength;
            }
        }
        return decoded.toString();
    }

    /**
     * Reads one multi-byte sequence and returns its code point, or -1 when the
     * sequence is malformed, overlong, a surrogate, or beyond U+10FFFF.
     */
    private static int readSequence(byte[] buffer, int start, int length)
    {
        int codePoint = maskFirstByte(buffer[start]);
        for (int k = 1; k < length; k++)
        {
            byte next = buffer[start + k];
            if ((next & 0xC0) != 0x80)
            {
                return -1; // not a 10xxxxxx continuation byte
            }
            codePoint = (codePoint << 6) | (next & 0x3F);
        }

        boolean overlong = codePoint < MIN_CODE_POINT[length];
        boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
        if (overlong || surrogate || codePoint > 0x10FFFF)
        {
            return -1;
        }
        return codePoint;
    }

    /**
     * Converts the first two characters of {@code hexStr} into one byte,
     * e.g. {@code "A1"} becomes {@code (byte) 0xA1}. Upper- and lower-case
     * digits are accepted; characters after the second one are ignored.
     *
     * @param hexStr text starting with two hexadecimal digits
     * @return the byte value of those two digits
     * @throws NullPointerException if {@code hexStr} is {@code null}
     * @throws IllegalArgumentException if {@code hexStr} has fewer than two
     *         characters or either of the first two is not a hexadecimal digit
     */
    public static byte hexStringToByte(String hexStr)
    {
        if (hexStr.length() < 2)
        {
            throw new IllegalArgumentException(
                "Expected two hexadecimal digits but got \"" + hexStr + "\"");
        }
        int highVal = hexValue(hexStr.charAt(0));
        int lowVal = hexValue(hexStr.charAt(1));
        if (highVal < 0 || lowVal < 0)
        {
            throw new IllegalArgumentException(
                "Not a hexadecimal byte: \"" + hexStr.substring(0, 2) + "\"");
        }
        return (byte) ((highVal << 4) | lowVal);
    }

    /** Returns the value of an ASCII hexadecimal digit, or -1 for any other character. */
    private static int hexValue(char c)
    {
        if (c >= '0' && c <= '9')
        {
            return c - '0';
        }
        if (c >= 'A' && c <= 'F')
        {
            return 10 + (c - 'A');
        }
        if (c >= 'a' && c <= 'f')
        {
            return 10 + (c - 'a');
        }
        return -1;
    }

    /**
     * Counts the consecutive 1 bits at the most significant end of {@code b}.
     * For a UTF-8 lead byte this is the number of bytes in the sequence,
     * e.g. {@code 0xE6} (1110 0110) yields 3.
     *
     * @param b the byte to inspect
     * @return the number of leading 1 bits, from 0 to 8
     */
    public static int getLeftCountOf1InByte(byte b)
    {
        int count = 0;
        // Walk from bit 7 downwards and stop at the first 0 bit.
        for (int mask = 1 << 7; mask != 0 && (b & mask) != 0; mask >>= 1)
        {
            count++;
        }
        return count;
    }

    /**
     * Clears the run of leading 1 bits of {@code b}, leaving only the payload
     * bits of a UTF-8 lead byte, e.g. {@code 0xE6} becomes {@code 0x06}.
     */
    private static byte maskFirstByte(byte b)
    {
        int leadingOnes = getLeftCountOf1InByte(b);
        return (byte) (b & (0xFF >> leadingOnes));
    }

    /**
     * Decodes the first command-line argument, or a built-in sample when no
     * argument is given, and prints the result to standard output.
     */
    public static void main(String[] args) throws Exception
    {
        String src = "%e6%a1%8c%e9%9d%a2";
        if (args.length > 0)
        {
            src = args[0];
        }
        String rs = StringCoder.decodeUTF8(src);
        System.out.println(rs);
    }
}

解决方案 »

免费领取超大流量手机卡，每月29元包185G流量+100分钟通话, 中国电信官方发货

修正一个BUG 可以读取英文public static int getLeftCountOf1InByte(byte b)
    {
        int count = 0;
        int mask = 1 << 7;
        for (int i = 0; i < 8; i++)
        {
            if ((b & mask) == 0)
            {
                break;
            }
            else
            {
                count++;
                mask >>= 1;
            }
        }
        return count == 0 ? 1 : count;
    }
方法 hexStringToByte 用来把16进制的串变成一个字节如 "A1" -> 161
方法 getLeftCountOf1InByte 用来把首字节里的字节数信息读出来. 如E6 -> 3
方法 maskFirstByte 用来把首字节的提示信息部分去掉只留下数字部分如E6 -> 00000110
有了这三个方法就大功告成啦
((b & mask) == 0) // 如果相与的结果是0 说明什么？这个不是很理解
是否是说明这个byte是下列规则中的第几类
U+00000000 - U+0000007F:  0 xxxxxxx  0x - 7x
U+00000080 - U+000007FF:  110 xxxxx 10 xxxxxx  Cx 8x - Dx Bx
U+00000800 - U+0000FFFF:  1110 xxxx 10 xxxxxx 10 xxxxxx  Ex 8x 8x - Ex Bx Bx
U+00010000 - U+001FFFFF:  11110 xxx 10 xxxxxx 10 xxxxxx 10 xxxxxx  F0 8x 8x 8x - F7 Bx Bx Bx
U+00200000 - U+03FFFFFF:  111110 xx 10 xxxxxx 10 xxxxxx 10 xxxxxx 10 xxxxxx  F8 8x 8x 8x 8x - FB Bx Bx Bx Bx
U+04000000 - U+7FFFFFFF:  1111110 x 10 xxxxxx 10 xxxxxx 10 xxxxxx 10 xxxxxx 10 xxxxxx  FC 8x 8x 8x 8x 8x - FD Bx Bx Bx Bx Bx
这个在判断首字节有几个1. 如果这个条件成立,表示当前的BIT上是0. 可以结束查找,返回结果了.
比如11100000
       |
就指向这个位置