输出中的 -2 -1(即 0xFE 0xFF,按有符号 byte 打印)是 Unicode(UTF-16)big endian 的字节序标记(BOM):
FE FF: big endian
FF FE: little endian
从输出结果看,一个汉字的 UTF-8 编码占 3 个 byte,好像一个 char 是 3 byte,但 Java 中一个 char 是 2 byte。
其实无论原始数据是什么字符集,Java 中的 String 在内存里都以 Unicode(UTF-16)编码来存储,
所以每个 char 都是一个 UTF-16 编码单元,占两个 byte。
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
/**
 * Small demo that shows how the same three-character Chinese string
 * (U+4E2D U+56FD U+4EBA) looks as UTF-8 bytes, as in-memory {@code char}
 * values, and as UTF-16 bytes.
 *
 * <p>All values are printed one per line as <em>signed</em> numbers in radix 16,
 * so a byte such as 0xE4 is shown as {@code -1c} and 0xFE as {@code -2}.
 */
public class TestUtf8File {

    /**
     * Encodes the sample string with UTF-8 and UTF-16, decodes it back, and prints
     * the UTF-8 byte count, the UTF-8 bytes, the two bytes of every {@code char},
     * and finally the UTF-16 bytes (which start with the byte-order mark FE FF).
     *
     * @param args ignored
     * @throws UnsupportedEncodingException no longer thrown, because the charsets are
     *         passed as {@link StandardCharsets} constants; the clause is kept so that
     *         existing callers that catch it still compile
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        // Written with Unicode escapes so the value does not depend on the
        // encoding the compiler uses to read this source file.
        String s = "\u4e2d\u56fd\u4eba";

        // UTF-8 round trip: encode, decode, and encode again.
        byte[] utf8Bytes = s.getBytes(StandardCharsets.UTF_8);
        String utf8RoundTrip = new String(utf8Bytes, StandardCharsets.UTF_8);
        byte[] reencodedUtf8 = utf8RoundTrip.getBytes(StandardCharsets.UTF_8);

        System.out.println(reencodedUtf8.length);
        System.out.println("utf-8 bytes:");
        printByteArray(reencodedUtf8);

        System.out.println("chars:");
        printCharArray(utf8RoundTrip.toCharArray());

        // The charset previously requested by the name "unicode" is UTF-16:
        // the encoder emits a big-endian byte-order mark followed by big-endian
        // code units.
        byte[] utf16Bytes = s.getBytes(StandardCharsets.UTF_16);
        String utf16RoundTrip = new String(utf16Bytes, StandardCharsets.UTF_16);

        System.out.println("unicode bytes:");
        printByteArray(utf16RoundTrip.getBytes(StandardCharsets.UTF_16));
    }

    /**
     * Prints every byte on its own line as a signed value in radix 16.
     *
     * @param b the bytes to print
     */
    private static void printByteArray(byte[] b) {
        for (byte value : b) {
            System.out.println(Integer.toString(value, 16));
        }
    }

    /**
     * Prints the high byte and then the low byte of every {@code char}, each on its
     * own line, using the same signed radix-16 format as {@link #printByteArray}.
     *
     * @param c the chars to print
     */
    private static void printCharArray(char[] c) {
        for (char ch : c) {
            // Narrowing to byte keeps the output format identical to printByteArray.
            byte high = (byte) (ch >> 8);
            byte low = (byte) (ch & 0xff);
            System.out.println(Integer.toString(high, 16));
            System.out.println(Integer.toString(low, 16));
        }
    }
}
结果是output:
9
utf-8 bytes:
-1c
-48
-53
-1b
-65
-43
-1c
-46
-46
chars:
4e
2d
56
-3
4e
-46
unicode bytes:
-2
-1
4e
2d
56
-3
4e
-46