Another UTF-8-related problem. Chinese characters in Java, when encoded with 'UTF-8', sometimes become 3 bytes long. I don't know why; I thought the code points of all Chinese characters were 2 bytes wide. But when I try to detect that manually, it doesn't seem to turn out that way either. Is there a way to detect the byte width (non-zero bytes) of a UTF-8 character?
import java.io.UnsupportedEncodingException;
/**
 * Small demo that measures how many bytes the code points of a string occupy.
 *
 * <p>The width reported here is the width of the raw code point value (the
 * position of its highest non-zero byte), which is 2 for every character in the
 * Basic Multilingual Plane above U+00FF. This is NOT the UTF-8 encoded length:
 * UTF-8 spends part of every byte on framing bits, so code points in the range
 * U+0800..U+FFFF (which includes the common CJK ideographs) need 3 bytes once
 * encoded. The UTF-8 length of a string can be obtained with
 * {@code s.getBytes("UTF-8").length}.
 */
public class a {

    /**
     * Prints the total code point byte width of a sample string made of three
     * CJK ideographs and one ASCII digit.
     *
     * @param args ignored
     * @throws UnsupportedEncodingException never thrown by the current body; the
     *         clause is kept so the method signature stays unchanged
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        // U+6211, U+662F, U+4E00 followed by the ASCII digit '1'. Unicode escapes
        // keep the literal independent of the encoding used to compile this file.
        final String s = "\u6211\u662F\u4E001"; // 2 + 2 + 2 + 1 = 7
        System.out.println(countCodePointBytes(s));
    }

    /** Sums the byte widths of all code points in {@code text}. */
    private static int countCodePointBytes(String text) {
        int total = 0;
        final int length = text.length();
        for (int offset = 0; offset < length; ) {
            final int codePoint = text.codePointAt(offset);
            total += codePointByteWidth(codePoint);
            // Advance by 1 or 2 chars so surrogate pairs are consumed as a whole.
            offset += Character.charCount(codePoint);
        }
        return total;
    }

    /**
     * Returns the number of bytes needed to hold {@code codePoint} as a plain
     * integer.
     *
     * <p>The width is decided by the highest occupied byte rather than by counting
     * non-zero bytes; counting non-zero bytes under-reports values such as U+4E00
     * whose low byte is zero. Valid code points never exceed U+10FFFF, so the
     * result is at most 3.
     */
    private static int codePointByteWidth(int codePoint) {
        if (codePoint > 0xFFFF) {
            return 3;
        }
        if (codePoint > 0xFF) {
            return 2;
        }
        return 1;
    }
}