ASCII characters exist in Unicode: they are Unicode codepoints U+0000 - U+007F, inclusive.
Java strings are represented in UTF-16, which is an encoding of Unicode that uses 16-bit code units. Each Java char is a UTF-16 code unit. Unicode codepoints U+0000 - U+FFFF use 1 UTF-16 code unit and thus fit in a single char, whereas Unicode codepoints U+10000 and higher require a UTF-16 surrogate pair and thus need two chars.
If the string has UTF-16 code units represented as actual char values, then you can use Java's String methods that work with codepoints, e.g.:
/**
 * Splits a string into one element per Unicode code point.
 *
 * A valid surrogate pair stays together as a two-char element; every other
 * char, including an unpaired surrogate, becomes its own one-char element.
 *
 * @param unicodeStr the text to split
 * @return the code points of {@code unicodeStr}, in order, each as a String
 */
private String[] getCharArray(String unicodeStr) {
    final int length = unicodeStr.length();
    ArrayList<String> codePoints = new ArrayList<>();
    int pos = 0;
    while (pos < length) {
        // charCount is 2 for a supplementary code point (surrogate pair), else 1.
        int next = pos + Character.charCount(unicodeStr.codePointAt(pos));
        codePoints.add(unicodeStr.substring(pos, next));
        pos = next;
    }
    return codePoints.toArray(new String[0]);
}
On the other hand, if the string has UTF-16 code units represented in an encoded "\uXXXX" format (i.e., as 6 distinct characters - '\', 'u', and four hex digits), then things get a little more complicated, as you have to parse the encoded sequences manually.
If you want to preserve the "\uXXXX" strings in your array, you could do something like this:
/**
 * Tells whether a six-character textual Unicode escape (a backslash, the
 * letter 'u', and four hex digits) starts at the given position.
 *
 * The four digits are validated with {@link Character#digit(char, int)}, the
 * same test {@link Integer#parseInt(String, int)} applies, so a caller that
 * parses them with radix 16 after a {@code true} result will not get a
 * NumberFormatException, and sign characters are rejected.
 *
 * @param s     the text to inspect
 * @param index position in {@code s} where the escape would start
 * @return {@code true} if a complete, well-formed escape starts at {@code index}
 */
private boolean isUnicodeEncoded(String s, int index)
{
    // Not enough room left for all six characters.
    if (index > s.length() - 6) {
        return false;
    }
    if ((s.charAt(index) != '\\') || (s.charAt(index + 1) != 'u')) {
        return false;
    }
    for (int k = index + 2; k < index + 6; ++k) {
        if (Character.digit(s.charAt(k), 16) < 0) {
            return false;
        }
    }
    return true;
}
/**
 * Splits a string into one element per character, keeping textual Unicode
 * escapes (backslash, 'u', four hex digits) in their escaped form.
 *
 * Each element is a substring of the input holding either a single code unit
 * (one literal char or one six-character escape) or a surrogate pair whose
 * two halves may each be literal or escaped.
 *
 * @param unicodeStr the text to split
 * @return the pieces of {@code unicodeStr}, in order
 */
private String[] getCharArray(String unicodeStr) {
    final int length = unicodeStr.length();
    ArrayList<String> pieces = new ArrayList<>();
    int pos = 0;
    while (pos < length) {
        final int begin = pos;

        // First code unit: an escape spans 6 chars, a literal spans 1.
        final boolean firstEscaped = isUnicodeEncoded(unicodeStr, pos);
        final char first = firstEscaped
            ? (char) Integer.parseInt(unicodeStr.substring(pos + 2, pos + 6), 16)
            : unicodeStr.charAt(pos);
        pos += firstEscaped ? 6 : 1;

        // A high surrogate absorbs the following unit only if it is a low surrogate.
        if ((pos < length) && Character.isHighSurrogate(first)) {
            final boolean secondEscaped = isUnicodeEncoded(unicodeStr, pos);
            final char second = secondEscaped
                ? (char) Integer.parseInt(unicodeStr.substring(pos + 2, pos + 6), 16)
                : unicodeStr.charAt(pos);
            if (Character.isLowSurrogate(second)) {
                pos += secondEscaped ? 6 : 1;
            }
        }

        pieces.add(unicodeStr.substring(begin, pos));
    }
    return pieces.toArray(new String[0]);
}
If you want to decode the "\uXXXX" strings into actual chars in your array, you could do something like this instead:
/**
 * Tells whether a six-character textual Unicode escape (a backslash, the
 * letter 'u', and four hex digits) starts at the given position.
 *
 * The four digits are validated with {@link Character#digit(char, int)}, the
 * same test {@link Integer#parseInt(String, int)} applies, so a caller that
 * parses them with radix 16 after a {@code true} result will not get a
 * NumberFormatException, and sign characters are rejected.
 *
 * @param s     the text to inspect
 * @param index position in {@code s} where the escape would start
 * @return {@code true} if a complete, well-formed escape starts at {@code index}
 */
private boolean isUnicodeEncoded(String s, int index)
{
    // Not enough room left for all six characters.
    if (index > s.length() - 6) {
        return false;
    }
    if ((s.charAt(index) != '\\') || (s.charAt(index + 1) != 'u')) {
        return false;
    }
    for (int k = index + 2; k < index + 6; ++k) {
        if (Character.digit(s.charAt(k), 16) < 0) {
            return false;
        }
    }
    return true;
}
/**
 * Splits a string into one element per character, decoding textual Unicode
 * escapes (backslash, 'u', four hex digits) into the chars they denote.
 *
 * Each element holds either a single decoded code unit or a surrogate pair;
 * a high surrogate is paired only when the unit right after it, literal or
 * escaped, is a low surrogate.
 *
 * @param unicodeStr the text to split and decode
 * @return the decoded characters of {@code unicodeStr}, in order
 */
private String[] getCharArray(String unicodeStr) {
    final int length = unicodeStr.length();
    ArrayList<String> decoded = new ArrayList<>();
    int pos = 0;
    while (pos < length) {
        // Leading code unit: an escape spans 6 chars, a literal spans 1.
        final boolean leadEscaped = isUnicodeEncoded(unicodeStr, pos);
        final char lead = leadEscaped
            ? (char) Integer.parseInt(unicodeStr.substring(pos + 2, pos + 6), 16)
            : unicodeStr.charAt(pos);
        pos += leadEscaped ? 6 : 1;

        String element = String.valueOf(lead);
        if ((pos < length) && Character.isHighSurrogate(lead)) {
            final boolean trailEscaped = isUnicodeEncoded(unicodeStr, pos);
            final char trail = trailEscaped
                ? (char) Integer.parseInt(unicodeStr.substring(pos + 2, pos + 6), 16)
                : unicodeStr.charAt(pos);
            // Consume the trailing unit only when it completes the pair.
            if (Character.isLowSurrogate(trail)) {
                element = new String(new char[] {lead, trail});
                pos += trailEscaped ? 6 : 1;
            }
        }
        decoded.add(element);
    }
    return decoded.toArray(new String[0]);
}
Or, something like this (per https://stackoverflow.com/a/24046962/65863):
/**
 * Decodes textual Unicode escapes by letting {@link Properties} parse the
 * input as the value of a property, then splits the decoded text into one
 * element per code point (surrogate pairs stay together).
 *
 * Caveat: the input goes through the full properties-file parser, so other
 * backslash sequences and line terminators in it are interpreted by that
 * parser as well, and a malformed Unicode escape makes
 * {@code Properties.load} throw IllegalArgumentException.
 *
 * @param unicodeStr the text to decode and split
 * @return the decoded characters, in order, each as a String
 */
private String[] getCharArray(String unicodeStr) {
    Properties p = new Properties();
    try {
        p.load(new StringReader("key=" + unicodeStr));
    } catch (java.io.IOException e) {
        // load() declares IOException; rethrow unchecked so the signature stays the same.
        throw new java.io.UncheckedIOException(e);
    }
    unicodeStr = p.getProperty("key");

    ArrayList<String> list = new ArrayList<>();
    int i = 0;
    while (i < unicodeStr.length()) {
        // charCount is 2 for a valid surrogate pair, 1 for anything else.
        int next = i + Character.charCount(unicodeStr.codePointAt(i));
        list.add(unicodeStr.substring(i, next));
        i = next;
    }
    return list.toArray(new String[0]);
}