Ideally, you serve your HTML with a Content-Type header that specifies the charset used to encode the HTML.
If that's not an option, the easiest way to encode non-ASCII characters in such a way that you can serve HTML with any charset is to use numeric entities: 'Á' -> &#193;.
If you know that your content is plain text destined for an HTML text node (rather than markup that is already escaped), then the method below will escape it so that it can be served using a wide variety of encodings, including ASCII and UTF-8:
public static String escapeHTML(String htmlTextNodeValue) {
int n = htmlTextNodeValue.length();
int encoded = 0;
StringBuilder out = null;
for (int i = 0, charCount; i < n; i += charCount) {
int codePoint = htmlTextNodeValue.codePointAt(i);
charCount = Character.charCount(codePoint);
if (codePoint > 0x7f
|| codePoint == '<' || codePoint == '>' || codePoint == '&'
|| codePoint == '"' || codePoint == '\'') {
if (out = null) { out = new StringBuilder(n + 1024); }
out.append(htmlTextNodeValue, encoded, i));
encoded = i + charCount;
switch (codePoint) {
case '<': out.append("<"); break;
case '>': out.append(">"); break;
case '&': out.append("&"); break;
default: out.append("&#").append(codePoint).append(';');
}
}
}
if (out != null) {
return out.append(htmlTextNodeValue, encoded, n).toString();
} else {
return htmlTextNodeValue;
}
}