15

I have the XML file below:

<book>
<person>
  <first>Kiran</first>
  <last>Pai</last>
  <age>22</age>
</person>
<person>
  <first>Bill</first>
  <last>Gates</last>
  <age>46</age>
</person>
<person>
  <first>Steve</first>
  <last>Jobs</last>
  <age>40</age>
</person>
</book>

The Java program I use to read the data from this XML file is shown below:

import java.io.File;
import org.w3c.dom.Document;
import org.w3c.dom.*;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.DocumentBuilder;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException; 

/**
 * Parses a "book" XML file with the DOM API and prints, for every
 * {@code person} element, its first name, last name and age.
 *
 * <p>The file to read is taken from the first command-line argument when one
 * is given; otherwise {@code book.xml} in the working directory is used.
 */
public class ReadAndPrintXMLFile{

    /** File parsed when no path is supplied on the command line. */
    private static final String DEFAULT_FILE = "book.xml";

    /**
     * Program entry point.
     *
     * @param argv optional; {@code argv[0]} is the path of the XML file to read
     */
    public static void main (String argv []){
        String path = (argv != null && argv.length > 0) ? argv[0] : DEFAULT_FILE;
        try {
            DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
            DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
            Document doc = docBuilder.parse(new File(path));

            // Merge adjacent text nodes so each element's text is contiguous.
            doc.getDocumentElement().normalize();
            System.out.println("Root element of the doc is " +
                 doc.getDocumentElement().getNodeName());

            NodeList listOfPersons = doc.getElementsByTagName("person");
            int totalPersons = listOfPersons.getLength();
            System.out.println("Total no of people : " + totalPersons);

            for (int s = 0; s < totalPersons; s++) {
                Node personNode = listOfPersons.item(s);
                if (personNode.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                Element personElement = (Element) personNode;

                System.out.println("First Name : " + childText(personElement, "first"));
                System.out.println("Last Name : " + childText(personElement, "last"));
                System.out.println("Age : " + childText(personElement, "age"));
            }

        } catch (SAXParseException err) {
            System.out.println ("** Parsing error" + ", line "
                 + err.getLineNumber () + ", uri " + err.getSystemId ());
            System.out.println(" " + err.getMessage ());

        } catch (SAXException e) {
            // Prefer the wrapped exception, when present, for the stack trace.
            Exception x = e.getException ();
            ((x == null) ? e : x).printStackTrace ();

        } catch (Exception e) {
            // I/O and parser-configuration failures end up here.
            e.printStackTrace ();
        }
    }//end of main

    /**
     * Returns the trimmed text of the first descendant of {@code parent} named
     * {@code tagName}.
     *
     * @param parent  element to search in
     * @param tagName tag name of the wanted descendant
     * @return the trimmed text content, or an empty string when the element is
     *         absent or has no text (instead of failing with a
     *         NullPointerException)
     */
    private static String childText(Element parent, String tagName) {
        Node child = parent.getElementsByTagName(tagName).item(0);
        if (child == null) {
            return "";
        }
        return child.getTextContent().trim();
    }

}

and the outcome was ..

Root element of the doc is book
Total no of people : 3
First Name : Kiran
Last Name : Pai
Age : 22
First Name : Bill
Last Name : Gates
Age : 46
First Name : Steve
Last Name : Jobs
Age : 40

Now my question is: is there a faster way to read this XML file? I am looking for the fastest approach. Please advise!

user1633823
  • 347
  • 2
  • 5
  • 14
  • 1
    fast and xml generally don't go together. text processing is inherently slow. – Marc B Sep 04 '12 at 18:42
  • 1
    First, you should identify WHERE it is slow instead of asking which is the fastest way to read the XML...any time you want to improve performance, you should start by identifying the places where the program runs slow and go from there...so, at what point in this particular program is it running too slow? – Zack Macomber Sep 04 '12 at 19:01
  • You should prefer [SAX](http://fr.wikipedia.org/wiki/Simple_API_for_XML) to [DOM](http://en.wikipedia.org/wiki/Document_Object_Model). – gontard Sep 04 '12 at 18:45
  • 1
    StAX (JSR-173) would be faster than SAX. – bdoughan Sep 04 '12 at 18:49
  • @BlaiseDoughan Thanks a lot dude , could you please convert my above program into your implementation of Stax so that I can understand in detail too..!! – user1633823 Sep 04 '12 at 18:56

3 Answers3

26

When I compare ReadAndPrintXMLFileWithStAX (below) with ReadAndPrintXMLFileWithSAX from the answer given by gontard, the StAX approach is faster. My test involved running each sample 500000 times on JDK 1.7.0_07 for the Mac.

ReadAndPrintXMLFileWithStAX:  103 seconds
ReadAndPrintXMLFileWithSAX:   125 seconds

ReadAndPrintXMLFileWithStAX (using Java SE 7)

Below is a more optimized StAX (JSR-173) example using XMLStreamReader instead of XMLEventReader.

import java.io.FileInputStream;
import java.io.InputStream;
import javax.xml.stream.*;

/**
 * Reads a "book" XML file with the StAX cursor API ({@link XMLStreamReader})
 * and prints the first name, last name and age of every person, followed by
 * the number of {@code person} elements seen.
 *
 * <p>The file to read is taken from the first command-line argument when one
 * is given; otherwise {@code book.xml} in the working directory is used.
 */
public class ReadAndPrintXMLFileWithStAX {

    /**
     * Program entry point.
     *
     * @param argv optional; {@code argv[0]} is the path of the XML file to read
     * @throws Exception if the file cannot be opened or is not well-formed XML
     */
    public static void main(String argv[]) throws Exception {
        String path = (argv != null && argv.length > 0) ? argv[0] : "book.xml";
        XMLInputFactory inputFactory = XMLInputFactory.newInstance();

        int persons = 0;
        // The stream is closed by try-with-resources; XMLStreamReader is not
        // AutoCloseable, so it is closed explicitly in the finally block.
        try (InputStream in = new FileInputStream(path)) {
            XMLStreamReader streamReader = inputFactory.createXMLStreamReader(in);
            try {
                streamReader.nextTag(); // Advance to "book" element
                streamReader.nextTag(); // Advance to "person" element

                while (streamReader.hasNext()) {
                    if (streamReader.isStartElement()) {
                        switch (streamReader.getLocalName()) {
                        case "first":
                            System.out.print("First Name : ");
                            // getElementText() leaves the cursor on the end tag.
                            System.out.println(streamReader.getElementText());
                            break;
                        case "last":
                            System.out.print("Last Name : ");
                            System.out.println(streamReader.getElementText());
                            break;
                        case "age":
                            System.out.print("Age : ");
                            System.out.println(streamReader.getElementText());
                            break;
                        case "person":
                            persons++;
                            break;
                        default:
                            // Any other element is of no interest.
                            break;
                        }
                    }
                    streamReader.next();
                }
            } finally {
                streamReader.close();
            }
        }
        System.out.print(persons);
        System.out.println(" persons");
    }

}

Output

First Name : Kiran
Last Name : Pai
Age : 22
First Name : Bill
Last Name : Gates
Age : 46
First Name : Steve
Last Name : Jobs
Age : 40
3 persons
Community
  • 1
  • 1
bdoughan
  • 147,609
  • 23
  • 300
  • 400
  • 8
    I'm sorry, but I disagree with how you compare the programs' performance. You should not launch the programs `n` times on a very small file, but once on a very large file; otherwise too much time is spent on JVM loading. On top of that, removing all the printing to the console is important, since it is very expensive. – gontard Sep 05 '12 at 05:29
  • 3
    I tried your code on my "benchmark" and effectively `StAX` is faster than `SAX`. I was surprised so i try to explain that in my answer. Please take a look at it and give me feedback. – gontard Sep 05 '12 at 06:50
9

If performance is important in your case, you should prefer SAX or StAX (http://en.wikipedia.org/wiki/StAX) to DOM.

With DOM, the XML file is first parsed into an in-memory object model, which you can then query. So for your algorithm there are two passes.

With SAX, callbacks are invoked during the parse (startDocument, endElement...). SAX is event-based, i.e. a push model.

With StAX, you control the parsing. You move a cursor from an element to another one. This is a pull model.

With a file containing 32910000 persons, I compared my SAX version with the StAX version from the other answer (by Blaise Doughan). I removed all the System.out.println instructions. My program took 106 seconds to read the whole file and the other took 94 seconds. I suppose that SAX is slower because all the callbacks are invoked even if they do nothing (the push model), whereas with StAX the cursor is moved only to the "interesting" elements (the pull model).

For example with java 7 :

import java.io.File;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/**
 * Reads a "book" XML file with a SAX parser and prints the first name, last
 * name and age of every person, followed by the number of {@code person}
 * elements seen.
 *
 * <p>The file to read is taken from the first command-line argument when one
 * is given; otherwise {@code book.xml} in the working directory is used.
 */
public class ReadAndPrintXMLFileWithSax {

    /**
     * Program entry point.
     *
     * @param args optional; {@code args[0]} is the path of the XML file to read
     * @throws Exception if the parser cannot be created or the file cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : "book.xml";

        SAXParserFactory factory = SAXParserFactory.newInstance();
        SAXParser parser = factory.newSAXParser();

        File file = new File(path);
        BookHandler handler = new BookHandler();
        parser.parse(file, handler);
    }

    /**
     * SAX handler that prints one labelled line per {@code first}, {@code last}
     * and {@code age} element and counts {@code person} elements.
     */
    public static class BookHandler extends DefaultHandler {
        /** Number of {@code person} start tags seen so far. */
        private int count = 0;
        /**
         * Label plus text of the element currently being collected, or
         * {@code null} when the parser is outside first/last/age.
         */
        private StringBuilder buffer;

        /** Counts persons and starts a new labelled buffer for first/last/age. */
        @Override
        public void startElement(String uri, String localName, String qName,
                Attributes attributes) throws SAXException {
            switch (qName) {
            case "person":
                count++;
                break;
            case "first":
                buffer = new StringBuilder("First Name : ");
                break;
            case "last":
                buffer = new StringBuilder("Last Name : ");
                break;
            case "age":
                buffer = new StringBuilder("Age : ");
                break;
            default:
                break;
            }
        }

        /** Appends character data to the active buffer, if there is one. */
        @Override
        public void characters(char[] ch, int start, int length)
                throws SAXException {
            if (buffer != null) {
                // Append the slice directly; no temporary String is needed.
                buffer.append(ch, start, length);
            }
        }

        /** Prints the collected line when a first/last/age element ends. */
        @Override
        public void endElement(String uri, String localName, String qName)
                throws SAXException {
            switch (qName) {
            case "first":
            case "last":
            case "age":
                if (buffer != null) {
                    System.out.println(buffer.toString());
                    // Stop collecting so whitespace between elements is not
                    // appended to a buffer that has already been printed.
                    buffer = null;
                }
                break;
            default:
                break;
            }
        }

        /** Prints the total number of persons once the document is finished. */
        @Override
        public void endDocument() throws SAXException {
            System.out.println(count + " persons");
        }
    }
}
gontard
  • 28,720
  • 11
  • 94
  • 117
  • I would expect the StAX approach to be faster. Below is a more optimized approach using `XMLStreamReader` that in my testing is faster than your SAX sample: http://stackoverflow.com/a/12273296/383861 – bdoughan Sep 05 '12 at 01:15
2

A Stax Example

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

/**
 * Reads a "book" XML file with the StAX event API ({@link XMLEventReader})
 * and prints the first name, last name and age of every person.
 *
 * <p>The file to read is taken from the first command-line argument when one
 * is given; otherwise {@code c:/source/book.xml} is used.
 */
public class ReadAndPrintXMLFile {

    /** File parsed when no path is supplied on the command line. */
    private static final String DEFAULT_INPUT_FILE = "c:/source/book.xml";

    /**
     * Program entry point.
     *
     * @param argv optional; {@code argv[0]} is the path of the XML file to read
     */
    public static void main(String argv []) {

        String inputFile = (argv != null && argv.length > 0) ? argv[0] : DEFAULT_INPUT_FILE;

        // try-with-resources closes the file even when parsing fails.
        try (InputStream in = new FileInputStream(inputFile)) {
            XMLInputFactory inputFactory = XMLInputFactory.newInstance();
            XMLEventReader eventReader = inputFactory.createXMLEventReader(in);
            try {
                while (eventReader.hasNext()) {
                    XMLEvent event = eventReader.nextEvent();
                    if (!event.isStartElement()) {
                        continue;
                    }
                    StartElement startElement = event.asStartElement();
                    String label = labelFor(startElement.getName().getLocalPart());
                    if (label != null) {
                        // readText stops on the matching end tag and consumes
                        // nothing beyond it, so an immediately following
                        // start tag (e.g. "</first><last>") is not skipped.
                        System.out.println(label + readText(eventReader));
                    }
                }
            } finally {
                eventReader.close();
            }
        } catch (FileNotFoundException e) {
            System.out.println("File not Found: " + inputFile);
        } catch (java.io.IOException e) {
            // Raised when closing the input stream fails.
            e.printStackTrace();
        } catch (XMLStreamException e) {
            e.printStackTrace();
        }
    }

    /**
     * Maps an element name to the label printed in front of its text.
     *
     * @param localName local name of a start element
     * @return the output label, or {@code null} when the element is not printed
     */
    private static String labelFor(String localName) {
        if ("first".equals(localName)) {
            return "First Name : ";
        }
        if ("last".equals(localName)) {
            return "Last Name : ";
        }
        if ("age".equals(localName)) {
            return "Age : ";
        }
        return null;
    }

    /**
     * Collects the character data of the element whose start tag was just read,
     * up to and including its matching end tag.
     *
     * <p>Events that carry no text (comments, processing instructions) are
     * ignored; text inside nested elements is included.
     *
     * @param eventReader reader positioned just after the element's start tag
     * @return the concatenated character data of the element
     * @throws XMLStreamException if the underlying document is malformed
     */
    private static String readText(XMLEventReader eventReader) throws XMLStreamException {
        StringBuilder text = new StringBuilder();
        int depth = 0; // number of nested elements currently open
        while (eventReader.hasNext()) {
            XMLEvent event = eventReader.nextEvent();
            if (event.isCharacters()) {
                text.append(event.asCharacters().getData());
            } else if (event.isStartElement()) {
                depth++;
            } else if (event.isEndElement()) {
                if (depth == 0) {
                    break;
                }
                depth--;
            }
        }
        return text.toString();
    }

}


Output:

First Name : Kiran
Last Name : Pai
Age : 22
First Name : Bill
Last Name : Gates
Age : 46
First Name : Steve
Last Name : Jobs
Age : 40
Mike
  • 3,186
  • 3
  • 26
  • 32