I am working with a large string that represents an HTML page and is then processed. What I do is the following:
String data = <HTML PAGE CONTENT>;
// Strip the first and last character (the apostrophes wrapping the page content).
data = data.substring(1, data.length() - 1);
// Unescape the sequences \u003C, \u003E and \" into <, > and ".
data = StringUtils.replace(data, "\\u003C", "<");
data = StringUtils.replace(data, "\\u003E", ">");
data = StringUtils.replace(data, "\\\"", "\"");
// The content of the <head> element is not needed, so it is dropped beforehand
// (with replaceWithEmpty == true the element is replaced by an empty <head></head>).
data = removeTag(data, "head", true);
// Round-trip the data through UTF-8.
// This is necessary; otherwise unwanted characters show up in the data.
data = cleanString(data);
// Continue... from here on only the list of relevant tags is parsed out,
// using an HTML parser, which is memory efficient...
Problem
For some users I get OutOfMemoryError (OOM) exceptions, mostly somewhere inside my string-processing functions, so I'm looking to improve those functions. I appreciate any suggestion that improves the memory efficiency of my code (speed is not important!).
Functions
/**
 * Removes every {@code <tag>...</tag>} element from the given HTML string.
 *
 * <p>The element is matched with a reluctant regular expression, so each opening tag is
 * paired with the nearest following closing tag. Only opening tags written exactly as
 * {@code <tag>} (no attributes) are matched.
 *
 * @param html the HTML text to process
 * @param tag the tag name; it is inserted into the regular expression unescaped
 * @param replaceWithEmpty if {@code true}, each match is replaced by an empty
 *     {@code <tag></tag>} element; otherwise the match is removed entirely
 * @return the HTML text with all matching elements replaced
 */
private static String removeTag(String html, String tag, boolean replaceWithEmpty) {
    // (?s) turns on DOTALL so that '.' also matches line terminators. Without it an
    // element whose content spans several lines is never matched and stays in the text.
    String regex = "(?s)<" + tag + ">.*?</" + tag + ">";
    String replacement = replaceWithEmpty ? "<" + tag + "></" + tag + ">" : "";
    return StringUtils.replaceAll(html, regex, replacement);
}
/**
 * Encodes the given string to UTF-8 bytes and decodes those bytes back into a string.
 *
 * <p>The round trip allocates a byte array holding the whole encoded text plus a new
 * string. If the "UTF-8" charset name is reported as unsupported, the exception is logged
 * and the input string is returned unchanged.
 *
 * @param s the string to round-trip
 * @return the re-decoded string, or {@code s} itself if the conversion failed
 */
private static String cleanString(String s) {
    String roundTripped = s;
    try {
        // String -> UTF-8 bytes -> String
        final byte[] encoded = s.getBytes("UTF-8");
        roundTripped = new String(encoded, "UTF-8");
    } catch (UnsupportedEncodingException e) {
        L.e(e);
    }
    return roundTripped;
}
StringUtils
/**
 * Static string helpers: cached regex compilation, literal and regex replacement, and
 * extraction of a substring between two markers.
 *
 * <p>The pattern cache is a plain {@link HashMap} that is accessed without synchronization
 * and never evicts entries, so every distinct regex string stays cached.
 */
public class StringUtils {

    // Compile each pattern once only: regex source -> compiled pattern.
    private static final HashMap<String, Pattern> COMPILED_PATTERNS = new HashMap<>();

    /**
     * Returns the compiled pattern for {@code regex}, compiling and caching it on first use.
     */
    private static Pattern getPattern(String regex) {
        Pattern p = COMPILED_PATTERNS.get(regex);
        if (p == null) {
            p = Pattern.compile(regex);
            COMPILED_PATTERNS.put(regex, p);
        }
        return p;
    }

    /**
     * Creates a matcher for {@code data} using the cached pattern for {@code regex}.
     *
     * @param regex the regular expression
     * @param data the text to match against
     * @return a new matcher over {@code data}
     */
    public static Matcher match(String regex, String data) {
        Pattern p = getPattern(regex);
        return p.matcher(data);
    }

    /**
     * Replaces every literal occurrence of {@code searchChars} in {@code str} with
     * {@code replaceChars}; thin wrapper around {@link String#replace(CharSequence, CharSequence)}.
     *
     * @param str the text to process
     * @param searchChars the literal sequence to look for
     * @param replaceChars the literal replacement
     * @return the resulting string
     */
    public static String replace(final String str, final CharSequence searchChars, CharSequence replaceChars) {
        return str.replace(searchChars, replaceChars);
    }

    /**
     * Replaces every match of {@code regex} in {@code str} with {@code replacement}, using the
     * cached compiled pattern. The replacement is passed to {@link Matcher#replaceAll(String)}
     * as is, so {@code $} and {@code \} keep their special meaning there.
     *
     * @param str the text to process
     * @param regex the regular expression
     * @param replacement the replacement string
     * @return the resulting string
     */
    public static String replaceAll(final String str, final String regex, String replacement) {
        Pattern p = getPattern(regex);
        return p.matcher(str).replaceAll(replacement);
    }

    /**
     * Same as {@link #findContentBetween(String, String, String, boolean)} with
     * {@code searchEndFirst} set to {@code false}.
     */
    public static String findContentBetween(String content, String prefix, String postfix) {
        return findContentBetween(content, prefix, postfix, false);
    }

    /**
     * Extracts a substring of {@code content}.
     *
     * <p>With {@code searchEndFirst == false}: returns the text between the first occurrence
     * of {@code prefix} and the next occurrence of {@code postfix}. The search for
     * {@code postfix} starts one character after the end of the prefix, so the returned text
     * is never empty.
     *
     * <p>With {@code searchEndFirst == true}: {@code prefix} is not used. Starting at the first
     * occurrence of {@code postfix}, the text is scanned backwards until both a {@code '?'}
     * and a {@code '/'} have been seen; the text between that {@code '/'} and that {@code '?'}
     * is returned, provided the {@code '/'} lies before the {@code '?'}.
     *
     * @param content the text to search; may be {@code null} or empty
     * @param prefix the start marker (forward mode only)
     * @param postfix the end marker
     * @param searchEndFirst selects the backward-scanning mode described above
     * @return the extracted text, or {@code null} if nothing was found
     */
    public static String findContentBetween(String content, String prefix, String postfix, boolean searchEndFirst) {
        if (content == null || content.length() == 0) {
            return null;
        }
        if (searchEndFirst) {
            int end = -1;
            int start = -1;
            int index = content.indexOf(postfix);
            while (index >= 0) {
                // charAt avoids allocating a one-character String for every scanned position.
                char c = content.charAt(index);
                if (c == '?') {
                    end = index;
                } else if (c == '/') {
                    start = index + 1;
                }
                if (end != -1 && start != -1) {
                    break;
                }
                index--;
            }
            // start must have been found too; otherwise substring(-1, end) would throw.
            if (start >= 0 && end > start) {
                return content.substring(start, end);
            }
        } else {
            int start = content.indexOf(prefix);
            // >= 0 so that a prefix located at the very beginning of content is accepted.
            if (start >= 0) {
                start += prefix.length();
                int end = content.indexOf(postfix, start + 1);
                if (end > start) {
                    return content.substring(start, end);
                }
            }
        }
        return null;
    }
}