Here's a solution that takes care of the raw char as well as the escaped char in the stream works with stax or sax. It needs extending for the other invalid chars but you get the idea
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import org.apache.commons.io.IOUtils;
import org.apache.xerces.util.XMLChar;
public class IgnoreIllegalCharactersXmlReader extends Reader {
private final BufferedReader underlyingReader;
private StringBuilder buffer = new StringBuilder(4096);
private boolean eos = false;
public IgnoreIllegalCharactersXmlReader(final InputStream is) throws UnsupportedEncodingException {
underlyingReader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
}
private void fillBuffer() throws IOException {
final String line = underlyingReader.readLine();
if (line == null) {
eos = true;
return;
}
buffer.append(line);
buffer.append('\n');
}
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
if(buffer.length() == 0 && eos) {
return -1;
}
int satisfied = 0;
int currentOffset = off;
while (false == eos && buffer.length() < len) {
fillBuffer();
}
while (satisfied < len && buffer.length() > 0) {
char ch = buffer.charAt(0);
final char nextCh = buffer.length() > 1 ? buffer.charAt(1) : '\0';
if (ch == '&' && nextCh == '#') {
final StringBuilder entity = new StringBuilder();
// Since we're reading lines it's safe to assume entity is all
// on one line so next char will/could be the hex char
int index = 0;
char entityCh = '\0';
// Read whole entity
while (entityCh != ';') {
entityCh = buffer.charAt(index++);
entity.append(entityCh);
}
// if it's bad get rid of it and clean it from the buffer and point to next valid char
if (entity.toString().equals("")) {
buffer.delete(0, entity.length());
continue;
}
}
if (XMLChar.isValid(ch)) {
satisfied++;
cbuf[currentOffset++] = ch;
}
buffer.deleteCharAt(0);
}
return satisfied;
}
@Override
public void close() throws IOException {
underlyingReader.close();
}
public static void main(final String[] args) {
final File file = new File(
<XML>);
final File outFile = new File(file.getParentFile(), file.getName()
.replace(".xml", ".cleaned.xml"));
Reader r = null;
Writer w = null;
try {
r = new IgnoreIllegalCharactersXmlReader(new FileInputStream(file));
w = new OutputStreamWriter(new FileOutputStream(outFile),"UTF-8");
IOUtils.copyLarge(r, w);
w.flush();
} catch (Exception e) {
e.printStackTrace();
} finally {
IOUtils.closeQuietly(r);
IOUtils.closeQuietly(w);
}
}
}