1

I have big xml files (~1GB) with this structure:

<?xml version="1.0" encoding="UTF-8"?>
<GenoExchange xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.ncbi.nlm.nih.gov/SNP/geno" xsi:schemaLocation="http://www.ncbi.nlm.nih.gov/SNP/geno ftp://ftp.ncbi.nlm.nih.gov/snp/specs/genoex_1_5.xsd" dbSNPBuildNo="146" reportId="MT" reportType="chromosome">
    <Population popId="638" handle="TSC-CSHL" locPopId="TSC_42_AA">
        <popClass self="NORTH AMERICA"/>
    </Population>
 <SnpInfo rsId="1041870" observed="C/T">
        <SnpLoc genomicAssembly="107:GRCh38.p2" geneId="4512" geneSymbol="COX1" chrom="MT" start="6150" locType="2" rsOrientToChrom="fwd" contigAllele="T" contig="NC_012920:1"/>
        <SsInfo ssId="1508548" locSnpId="TSC0349089" ssOrientToRs="fwd">
            <ByPop popId="1303" sampleSize="184">
                <AlleleFreq allele="T" freq="1"/>
                <AlleleFreq allele="C" freq="0"/>
            </ByPop>
        </SsInfo>
    </SnpInfo>
<SnpInfo rsId="1029293" observed="C/T">
        <SnpLoc genomicAssembly="107:GRCh38.p2" geneId="4512" geneSymbol="COX1" chrom="MT" start="6307" locType="2" rsOrientToChrom="fwd" contigAllele="C" contig="NC_012920:1"/>
        <SsInfo ssId="1494519" locSnpId="TSC0254145" ssOrientToRs="fwd">
            <ByPop popId="639" sampleSize="82">
                <AlleleFreq allele="T" freq="0"/>
                <AlleleFreq allele="C" freq="1"/>
            </ByPop>
            <ByPop popId="1303" sampleSize="184">
                <AlleleFreq allele="T" freq="0"/>
                <AlleleFreq allele="C" freq="1"/>
            </ByPop>
        </SsInfo>
    </SnpInfo>

I want to find a specific rsId, for example rsId="1029293", and extract all the information inside that node. I don't want to process the whole file: I only want to find that ID, extract its information, and stop parsing. From what I have read, it is better to use a SAX or StAX parser. I'm using SAX; this is my code:

/**
 * SAX handler that prints the contents of the {@code SnpInfo} element whose
 * {@code rsId} attribute equals {@link #i}.
 *
 * <p>Element names are matched case-insensitively against the qualified name.
 * Every other {@code SnpInfo} element is read but produces no output.
 */
class UserHandler extends DefaultHandler {

   /** Value of the {@code rsId} attribute of the most recently opened SnpInfo element (initially null). */
   String rsID = null;
   /** The rsId being searched for. */
   String i = "1029293";

   /**
    * Remembers the rsId of each opening {@code SnpInfo} and, while that rsId is the
    * wanted one, prints the attributes of the element being opened.
    */
   @Override
   public void startElement(String uri,
      String localName, String qName, Attributes attributes) throws SAXException {

      boolean opensSnpInfo = qName.equalsIgnoreCase("SnpInfo");
      if (opensSnpInfo) {
         rsID = attributes.getValue("rsId");
      }

      // Nothing to report unless we are inside (or just entering) the wanted SnpInfo.
      if (!i.equals(rsID)) {
         return;
      }

      if (opensSnpInfo) {
         System.out.println("Start Element: " + qName + " " + rsID);
      } else if (qName.equalsIgnoreCase("SsInfo")) {
         System.out.println("SSID: " + attributes.getValue("ssId"));
      } else if (qName.equalsIgnoreCase("ByPop")) {
         System.out.println("POPID: " + attributes.getValue("popId"));
      } else if (qName.equalsIgnoreCase("AlleleFreq")) {
         String allele = attributes.getValue("allele");
         String frequency = attributes.getValue("freq");
         System.out.println("ALLELE: " + allele + " FREQ: " + frequency);
      } else if (qName.equalsIgnoreCase("GTypeFreq")) {
         String genotype = attributes.getValue("gtype");
         String frequency = attributes.getValue("freq");
         System.out.println("GTYPE: " + genotype + " FREQ: " + frequency);
      }
   }

   /** Prints a closing marker when the wanted {@code SnpInfo} element ends. */
   @Override
   public void endElement(String uri,
      String localName, String qName) throws SAXException {
      if (qName.equalsIgnoreCase("SnpInfo") && i.equals(rsID)) {
         System.out.println("End Element: " + qName);
      }
   }
}
public class XMLParser {

    public static void main(String argv[]) {
        try {   
            InputStream fileStream = new FileInputStream("/home/xml/gt_chr10.xml.gz");
            InputStream gzipStream = new GZIPInputStream(fileStream);
            SAXParserFactory factory = SAXParserFactory.newInstance();
            SAXParser saxParser = factory.newSAXParser();
            UserHandler userhandler = new UserHandler();
            saxParser.parse(gzipStream, userhandler);
        } catch (Exception e) {
            e.printStackTrace();
        }
    } 

My problem is that my code searches the whole file for the ID, which takes more than 2 minutes each time. I can't afford code that takes this long. Is there a better approach?

diborbi
  • 83
  • 2
  • 10
  • Throw all the code away and use XPath fed by SAX or STAX. NB you shouldn't keep testing for `i.equals(rsID)`, or use redundant parentheses either. – user207421 Mar 16 '16 at 10:32
  • Is it possible to use XPath with SAX or StAX? I have never worked with XML files before, so the only things I know are what I found in forums. But a lot of people say that XPath needs an approach like DOM to work. – diborbi Mar 16 '16 at 11:05

5 Answers5

1

You can throw an exception in your end element handler to make the parser abort parsing (http://www.ibm.com/developerworks/library/x-tipsaxstop/):

   /**
    * Aborts the parse once the wanted {@code SnpInfo} element has been fully read.
    *
    * @throws SAXException when the closing tag of the matching element is reached;
    *         the caller of {@code parse} should treat this as "found", not as an error
    */
   @Override
   public void endElement(String uri, 
      String localName, String qName) throws SAXException {
      if (qName.equalsIgnoreCase("SnpInfo") && i.equals(rsID)) {
         System.out.println("End Element: " + qName);
         // Throwing is the only way to make a SAX parser stop early; the braces keep
         // the throw tied to the match instead of firing on the first SnpInfo.
         throw new SAXException("Element found.");
      }
   }
Martin Honnen
  • 160,499
  • 6
  • 90
  • 110
1

Using STAX gives you more control when parsing XML, since you actively pull elements from the stream. This way you can pull the next event, handle it and once you found your data, simply terminate the loop (using a flag or even a return statement if you must)

// Source of the XML document (elided in this example).
InputStream in = ...
XMLInputFactory factory = XMLInputFactory.newInstance();
XMLEventReader eventReader = factory.createXMLEventReader(in);

// Pull events one at a time; the loop ends as soon as `found` is set
// or the reader has no more events.
boolean found = false;
while (!found && eventReader.hasNext()) {
    XMLEvent event = eventReader.nextEvent();
    switch (event.getEventType()) {
    case XMLStreamConstants.START_ELEMENT:
        // your logic here 
        // once you found your element, you can terminate the loop 
        found = true;
        break;
    case XMLStreamConstants.END_ELEMENT:
        // your logic here
        break;
    }
}

(omitted exception and resource handling for brevity)

On a side note, you will gain some performance by combining your repeated if ((i).equals(rsID) && ...) checks into a single outer if, with the detail checks in nested ifs:

// Test the rsID match once, then check the element name inside it.
if ((i).equals(rsID)) {
    if(qName.equalsIgnoreCase("GTypeFreq")) {
       ...
    }
}
nyname00
  • 2,496
  • 2
  • 22
  • 25
1

The only way to avoid parsing the whole file every time you run this is to put the data in an XML database. Parsing a 1GB file is going to take about a minute, plus or minus depending on the speed of your machine and what processing you do on each node.

A streamed XSLT 3.0 solution is simply:

<xsl:transform version="3.0"
     xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
     xpath-default-namespace="http://www.ncbi.nlm.nih.gov/SNP/geno">
  <!-- Entry point: reads input.xml in streaming mode and copies the first
       SnpInfo element whose rsId matches to the output. -->
  <xsl:template name="xsl:initial-template">
    <!-- xsl:source-document with streamable="yes" is the name used in the final
         XSLT 3.0 Recommendation; earlier drafts called this instruction xsl:stream. -->
    <xsl:source-document href="input.xml" streamable="yes">
       <xsl:copy-of select="/GenoExchange/SnpInfo[@rsId='1041870'][1]"/>
    </xsl:source-document>
  </xsl:template>
</xsl:transform>

No need to write all that pesky SAX or StAX code.

I put the "[1]" predicate in to allow the processor to abandon the search when it has found the first hit.

Michael Kay
  • 156,231
  • 11
  • 92
  • 164
  • When I use this I always get the message "No element found". – diborbi Mar 16 '16 at 17:18
  • It works for me. Perhaps you forgot to specify xsl:initial-template as the entry point, or perhaps your source file wasn't called input.xml, or perhaps XSLT 3.0 wasn't enabled. – Michael Kay Mar 17 '16 at 11:38
1

The best approach is to use vtd-xml and XPath: a 1GB XML file takes about 1.5GB of heap space and < 10 sec on a 3~4 year old Intel processor. See the code example below. One more thing: if you want to eliminate parsing entirely, you can create a vtd+XML file format so any subsequent query can directly access the vtd index portion, which could easily triple or quadruple your app performance...

import com.ximpleware.*;

    public class simpleXpathSearch{
        public  static  void main(String s[]) throws VTDException,java.io.UnsupportedEncodingException,java.io.IOException{
            VTDGen vg = new VTDGen();
            vg.setLCLevel(5);
            if (!vg.parseFile("input.xml", false))
                return;
            VTDNav vn = vg.getNav();
            AutoPilot ap = new AutoPilot(vn);
            ap.selectXPath("/*/*[@rsID='1029293']");
            int i=0;
            while((i=ap.evalXPath())!=-1){
               // your code logic here
            }
Community
  • 1
  • 1
vtd-xml-author
  • 3,319
  • 4
  • 22
  • 30
  • I tried this example and it works fine for files less than 1GB, but for bigger files I get: java.lang.OutOfMemoryError: Java heap space – diborbi Apr 19 '16 at 11:02
  • I don't understand how I can create a vtd-xml file without first parsing the file. As I have only 4GB of RAM I can't parse my files. – diborbi Apr 19 '16 at 11:53
  • But I have files with more than 10GB that's where my problem is. – diborbi Apr 20 '16 at 09:43
  • if 4gb is all you got, then you have no choice but to use StAX to process that 10GB file... but again, why can't you generate files that are somewhat smaller than that? 1GB per file is pretty reasonable isn't it? – vtd-xml-author Apr 20 '16 at 17:46
  • The files are not mine. Thank you for the help. – diborbi Apr 21 '16 at 16:44
0

//Main class

/** Program entry point: delegates to {@code SAXReader.read()}. */
public static void main(String[] args) {
    SAXReader.read();
}

//SAXReader

/**
 * Parses {@code MyXML.xml} with a namespace-aware SAX parser and sends the events
 * to a new {@code SAXController}. Parser or I/O failures are reported on standard error.
 */
public static void read(){
    try {
        // XMLReaderFactory is deprecated since Java 9; SAXParserFactory is its replacement.
        javax.xml.parsers.SAXParserFactory factory = javax.xml.parsers.SAXParserFactory.newInstance();
        // The handler prints localName, which SAX leaves empty unless namespace processing is on.
        factory.setNamespaceAware(true);
        XMLReader processor = factory.newSAXParser().getXMLReader();
        processor.setContentHandler(new SAXController());
        processor.parse(new InputSource("MyXML.xml"));
    } catch (javax.xml.parsers.ParserConfigurationException | SAXException | IOException e) {
        System.err.println(e.getMessage());
    }
}

//SAXController

// The SAXController extends DefaultHandler

// Current nesting depth; each level is rendered as two spaces.
private int tab = 0;

/** Writes the indentation for the current depth to standard output (no newline). */
private void tabulation() {
    int remaining = tab;
    while (remaining-- > 0) {
        System.out.print("  ");
    }
}

/** Prints the document-start banner at the current indentation, then indents one level deeper. */
@Override
public void startDocument() {
    tabulation();
    System.out.println("Starting XML Document");
    tab++;
}

/** Steps the indentation back one level, then prints the document-end banner. */
@Override
public void endDocument() {
    // Decrement first so the closing banner is printed at the depth the opening one used.
    tab--;
    tabulation();
    System.out.println("Ending XML Document");
}

/**
 * Prints the element's local name followed by its attributes as {@code name: value}
 * pairs on one indented line, then indents one level deeper for its children.
 */
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes)
        throws SAXException {
    tabulation();
    System.out.print(localName);
    for (int i=0; i<attributes.getLength(); i++) {
        // The leading space keeps the element name and each attribute from running together.
        System.out.print(" "+attributes.getLocalName(i)+": "+attributes.getValue(i));
    }
    System.out.println();
    tab++;
}

/** Steps the indentation back one level and prints the closing element's local name. */
@Override
public void endElement(String uri, String localName, String qName)
        throws SAXException {
    // Decrement first so the closing name is printed at the depth the opening name used.
    tab--;
    tabulation();
    System.out.println(localName);
}

/**
 * Prints a chunk of character data at the current indentation, after removing
 * tabs and newlines and trimming it; chunks that end up empty are skipped.
 */
@Override
public void characters(char[] ch, int start, int length)
        throws SAXException {
    String text = new String(ch, start, length).replaceAll("[\t\n]", "").trim();
    if (text.isEmpty()) {
        return;
    }
    tabulation();
    System.out.println(text);
}
Jordi72
  • 3
  • 5