Updated code: be aware that @Lakedaemon's Kotlin code does not take the tone placement rules into account. The rules are:
- A and e trump all other vowels and always take the tone mark. There are no Mandarin syllables in Hanyu Pinyin that contain both a and e.
- In the combination ou, o takes the mark.
- In all other cases, the final vowel takes the mark.
I originally ported @Lakedaemon's Kotlin code to Java. I have now modified it, and I urge anyone who used this code or @Lakedaemon's Kotlin code to update it.
I added an extra auxiliary function to get the correct tone mark position.
/**
 * Returns the index, within a pinyin vowel group, of the vowel that carries the tone mark.
 *
 * <p>The comparison is case-insensitive. The first 'a' wins; failing that, the first 'e';
 * the group "ou" puts the mark on the 'o'; in every other case the last character is chosen.
 *
 * @param r the vowel group to inspect
 * @return the zero-based index of the character to mark ({@code -1} for an empty string)
 */
private static int getTonePosition(String r) {
    final String vowels = r.toLowerCase();

    // 'a' outranks every other vowel.
    final int indexOfA = vowels.indexOf('a');
    if (indexOfA != -1) {
        return indexOfA;
    }

    // 'e' is next in line; 'a' and 'e' never appear in the same group.
    final int indexOfE = vowels.indexOf('e');
    if (indexOfE != -1) {
        return indexOfE;
    }

    // "ou" is the one exception to "the last vowel takes the mark".
    return "ou".equals(vowels) ? 0 : vowels.length() - 1;
}
/**
 * Returns the UTF-16 code unit at {@code position} in {@code string} as a one-character string.
 *
 * @param string   the text to read from
 * @param position the zero-based index of the wanted character
 * @return a string of length one holding that character
 * @throws ArrayIndexOutOfBoundsException if {@code position} is outside the string
 */
static public String getCharacter(String string, int position) {
    final char picked = string.toCharArray()[position];
    return Character.toString(picked);
}
/**
 * Matches one numbered pinyin final: a group of 1-3 vowels (with v/V standing in for ü/Ü),
 * an optional n / ng / r coda, and a tone digit 0-5. Upper- and lowercase are both accepted
 * so that capitalised syllables ("An1", "Ou1", "ZHONG1") are converted as well.
 */
private static final Pattern PINYIN_SYLLABLE_PATTERN =
        Pattern.compile("([aeiouüvAEIOUÜV]{1,3})([nN]?[gG]?[rR]?)([012345])");

/** Maps each plain vowel to its four tone-marked forms, ordered by tone 1 to 4. */
private static final Map<Character, String> PINYIN_TONE_MARKS = buildPinyinToneMarks();

/** Builds the vowel-to-tone-marks lookup table; called once when the class is initialised. */
private static Map<Character, String> buildPinyinToneMarks() {
    Map<Character, String> marks = new HashMap<>();
    marks.put('a', "āáǎà");
    marks.put('e', "ēéěè");
    marks.put('i', "īíǐì");
    marks.put('o', "ōóǒò");
    marks.put('u', "ūúǔù");
    marks.put('ü', "ǖǘǚǜ");
    marks.put('A', "ĀÁǍÀ");
    marks.put('E', "ĒÉĚÈ");
    marks.put('I', "ĪÍǏÌ");
    marks.put('O', "ŌÓǑÒ");
    marks.put('U', "ŪÚǓÙ");
    marks.put('Ü', "ǕǗǙǛ");
    return marks;
}

/**
 * Converts numbered pinyin (e.g. {@code "ni3 hao3"}) into pinyin with tone marks
 * (e.g. {@code "nǐ hǎo"}).
 *
 * <p>Every vowel group followed by an optional n/ng/r coda and a tone digit is rewritten:
 * {@code v}/{@code V} become {@code ü}/{@code Ü}, the digit is removed, and for tones 1-4 the
 * vowel selected by {@code getTonePosition} receives the matching diacritic. Tones 0 and 5 are
 * both treated as the neutral tone and add no mark. Text that does not match is copied through
 * unchanged.
 *
 * @param asciiPinyin the text containing numbered pinyin; must not be {@code null}
 * @return the text with tone digits replaced by tone marks
 * @throws NullPointerException if {@code asciiPinyin} is {@code null}
 */
public static String toPinyin(String asciiPinyin) {
    Matcher matcher = PINYIN_SYLLABLE_PATTERN.matcher(asciiPinyin);
    StringBuilder s = new StringBuilder();
    int start = 0;
    while (matcher.find()) {
        // Copy whatever sits between the previous syllable and this vowel group
        // (initial consonants, spaces, punctuation, ...).
        s.append(asciiPinyin, start, matcher.start(1));

        // Group 3 is a single digit 0-5; "% 5" folds tone 5 onto the neutral tone 0.
        int tone = (matcher.group(3).charAt(0) - '0') % 5;
        String r = matcher.group(1).replace('v', 'ü').replace('V', 'Ü');
        if (tone != 0) {
            int pos = getTonePosition(r);
            // r holds only vowels present in the table, so the lookup cannot miss.
            String marks = PINYIN_TONE_MARKS.get(r.charAt(pos));
            s.append(r, 0, pos).append(marks.charAt(tone - 1)).append(r, pos + 1, r.length());
        } else {
            s.append(r);
        }
        s.append(matcher.group(2));
        start = matcher.end(3);
    }
    // Trailing text after the last syllable (a no-op when start is already at the end).
    s.append(asciiPinyin, start, asciiPinyin.length());
    return s.toString();
}