I have a 35 GB XML file (yes, some organizations do that and I have no control over it) that I would like to SAX parse. I found an example here:
http://www.java2s.com/Code/Java/XML/SAXDemo.htm
of how to run a SAX parser and avoid loading everything. However, I get an out of memory error immediatly. Why does this happens and how I can make this code perfectly scalable for any XML file size?
Here my code:
import org.apache.log4j.Logger;
import org.xml.sax.AttributeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
public class XMLSAXTools extends org.xml.sax.helpers.DefaultHandler {
/**
* Logging facility
*/
static Logger logger = Logger.getLogger(XMLSAXTools.class);
private String fileName = "C:/Data/hugefile.xml";
private int counter = 0;
/** The main method sets things up for parsing */
public void test() throws IOException, SAXException,
ParserConfigurationException {
// Create a JAXP "parser factory" for creating SAX parsers
javax.xml.parsers.SAXParserFactory spf = SAXParserFactory.newInstance();
// Configure the parser factory for the type of parsers we require
spf.setValidating(false); // No validation required
// Now use the parser factory to create a SAXParser object
// Note that SAXParser is a JAXP class, not a SAX class
javax.xml.parsers.SAXParser sp = spf.newSAXParser();
// Create a SAX input source for the file argument
org.xml.sax.InputSource input = new InputSource(new FileReader(fileName));
// Give the InputSource an absolute URL for the file, so that
// it can resolve relative URLs in a <!DOCTYPE> declaration, e.g.
input.setSystemId("file://" + new File(fileName).getAbsolutePath());
// Create an instance of this class; it defines all the handler methods
XMLSAXTools handler = new XMLSAXTools();
// Finally, tell the parser to parse the input and notify the handler
sp.parse(input, handler);
// Instead of using the SAXParser.parse() method, which is part of the
// JAXP API, we could also use the SAX1 API directly. Note the
// difference between the JAXP class javax.xml.parsers.SAXParser and
// the SAX1 class org.xml.sax.Parser
//
// org.xml.sax.Parser parser = sp.getParser(); // Get the SAX parser
// parser.setDocumentHandler(handler); // Set main handler
// parser.setErrorHandler(handler); // Set error handler
// parser.parse(input); // Parse!
}
StringBuffer accumulator = new StringBuffer(); // Accumulate parsed text
String servletName; // The name of the servlet
String servletClass; // The class name of the servlet
String servletId; // Value of id attribute of <servlet> tag
// When the parser encounters plain text (not XML elements), it calls
// this method, which accumulates them in a string buffer
public void characters(char[] buffer, int start, int length) {
accumulator.append(buffer, start, length);
}
// Every time the parser encounters the beginning of a new element, it
// calls this method, which resets the string buffer
public void startElement(String name, AttributeList attributes) {
accumulator.setLength(0); // Ready to accumulate new text
if (name.equals("item")) {
logger.info("item tag opened");
counter++;
}
}
// When the parser encounters the end of an element, it calls this method
public void endElement(String name) {
if (name.equals("item")) {
logger.info("item tag closed. Counter: " + counter);
}
}
/** This method is called when warnings occur */
public void warning(SAXParseException exception) {
System.err.println("WARNING: line " + exception.getLineNumber() + ": "
+ exception.getMessage());
}
/** This method is called when errors occur */
public void error(SAXParseException exception) {
System.err.println("ERROR: line " + exception.getLineNumber() + ": "
+ exception.getMessage());
}
/** This method is called when non-recoverable errors occur. */
public void fatalError(SAXParseException exception) throws SAXException {
System.err.println("FATAL: line " + exception.getLineNumber() + ": "
+ exception.getMessage());
throw (exception);
}
public static void main(String[] args){
XMLSAXTools t = new XMLSAXTools();
try {
t.test();
} catch (Exception e){
logger.error("Exception in XMLSAXTools: " + e.getMessage());
e.printStackTrace();
}
}
}