// Latin to ASCII - mostly.
// Replacement table for the code points U+00C0..U+017F: the entry at index
// (c - 0xC0) is the single character substituted for c. Most entries are the
// plain ASCII base letter; Ä Ö Ü ä ö ü ß and the signs × ÷ map to themselves.
// Each row below covers 16 consecutive code points.
private static final String TAB_00C0 = "" +
"AAAAÄAACEEEEIIII" + // U+00C0..U+00CF
"DNOOOOÖ×OUUUÜYTß" + // U+00D0..U+00DF
"aaaaäaaceeeeiiii" + // U+00E0..U+00EF
"dnooooö÷ouuuüyty" + // U+00F0..U+00FF
"AaAaAaCcCcCcCcDd" + // U+0100..U+010F
"DdEeEeEeEeEeGgGg" + // U+0110..U+011F
"GgGgHhHhIiIiIiIi" + // U+0120..U+012F
"IiJjJjKkkLlLlLlL" + // U+0130..U+013F
"lLlNnNnNnnNnOoOo" + // U+0140..U+014F
"OoOoRrRrRrSsSsSs" + // U+0150..U+015F
"SsTtTtTtUuUuUuUu" + // U+0160..U+016F
"UuUuWwYyYZzZzZzs"; // U+0170..U+017F
private static HashMap<Character, String> LIGATURES = new HashMap<>(){{
put('æ', "ae");
put('œ', "oe");
put('þ', "th");
put("ij", "ij");
put('ð', "dh");
put("Æ", "AE");
put("Œ", "OE");
put("Þ", "TH");
put("Ð", "DH");
put("IJ", "IJ");
//TODO
}};
/**
 * Reduces accented Latin text to its base letters while keeping the German
 * umlauts (Ä Ö Ü ä ö ü) and ß.
 *
 * Each character of the NFC-normalized input is handled as follows:
 * - a key of {@code LIGATURES} is replaced by its mapped string;
 * - a character below U+00C0 is copied unchanged;
 * - a character in U+00C0..U+017F is replaced via {@code TAB_00C0};
 * - anything else is NFKD-decomposed and its combining diacritical marks
 *   are removed.
 *
 * @param value the text to convert
 * @return the converted text
 * @throws NullPointerException if {@code value} is null (raised by the normalizer)
 */
public static String removeAllButUmlauts(String value) {
    // Compose first so "a" + combining diaeresis is seen as the single char "ä".
    value = Normalizer.normalize(value, Normalizer.Form.NFC);
    StringBuilder sb = new StringBuilder(value.length());
    for (int i = 0; i < value.length(); i++) {
        char c = value.charAt(i);
        String l = LIGATURES.get(c);
        if (l != null) {
            sb.append(l);
        } else if (c < 0xc0) {
            sb.append(c); // ASCII, C1 control codes and Latin-1 symbols
        } else if (c <= 0x17f) {
            sb.append(TAB_00C0.charAt(c - 0xc0)); // common single latin letters
        } else {
            // anything else, including Vietnamese and rare diacritics
            l = Normalizer.normalize(Character.toString(c), Normalizer.Form.NFKD)
                    .replaceAll("[\\p{InCombiningDiacriticalMarks}]+", "");
            sb.append(l);
        }
    }
    return sb.toString();
}
Example usage:
// Umlauts are kept, other accents are stripped, and the ligature is expanded.
String value = "üöäâÇæôøñÁ";
String after = removeAllButUmlauts(value);
System.out.println(after);
which prints:
üöäaCaeoonA