4

I'm using scala.xml.pull to parse through a very large XML file. This works great for event processing, but what I want to do is have my parser cough up a mini-document for particular nodes, and I don't see an easy way to do this, or at least not a "Scala" way.

I'm thinking I build a seek function like this, that can use the iterator to find an EvElemStart event that matches my tag:

def seek(tag: String) = {
  while (it.hasNext) {
    it.next match {
      case EvElemStart(_, `tag`, _, _) => 

After that I'm less clear. Is there a simple way to grab all of the children of this tag into a document rather than having to iterate through every event the XMLEventReader pops out?

What I'm ultimately looking for is a process that scans the file and emits an xml element (an Elem?) for each instance of a particular tag or set of tags that I can process using the normal scala xml processing.

Jim B.
  • 4,512
  • 3
  • 25
  • 53

2 Answers

2

Here's what I ended up doing. slurp(tag) seeks to the next instance of the tag and returns a complete node tree for that tag.

/** Advances the shared event iterator `it` to the next opening tag named `tag`
  * and returns the element rebuilt from it by `subTree`.
  *
  * Events read before the matching start tag are consumed and discarded.
  *
  * @param tag label of the element to look for
  * @return the rebuilt element, or `None` when `it` runs out of events first
  */
def slurp(tag: String): Option[Node] = {
  // Local helper so the scan can be a verified tail call (constant stack use).
  @scala.annotation.tailrec
  def scan(): Option[Node] =
    if (!it.hasNext) None
    else it.next() match {
      case EvElemStart(_, `tag`, attributes, _) => Some(subTree(tag, attributes))
      case _ => scan()
    }

  scan()
}

/** Rebuilds one element from the shared event iterator `it`.
  *
  * Must be called right after the element's start event has been read. Events
  * are consumed up to and including the matching end event; nested start
  * events are rebuilt recursively.
  *
  * Text and entity references (`&amp;`, `&lt;`, ...) are kept as children.
  * Comments and processing instructions are dropped. The rebuilt element has
  * no namespace prefix and uses `TopScope`.
  *
  * @param tag   label of the element being rebuilt
  * @param attrs attributes taken from the element's start event
  * @return the element with all of its descendants
  * @throws IllegalStateException if the events run out before the element is closed
  */
def subTree(tag: String, attrs: MetaData): Node = {
  // A builder appends in amortised O(1); `List :+` copies the list on every append.
  val children = List.newBuilder[Node]
  var finished: Option[Node] = None

  while (finished.isEmpty && it.hasNext) {
    it.next() match {
      case EvElemStart(_, childTag, childAttrs, _) =>
        children += subTree(childTag, childAttrs)
      case EvText(text) =>
        children += Text(text)
      // The pull parser reports predefined entities as separate events; without
      // this case "a &amp; b" would be rebuilt as "a  b".
      case scala.xml.pull.EvEntityRef(entity) =>
        children += scala.xml.EntityRef(entity)
      case EvElemEnd(_, _) =>
        // minimizeEmpty = true only affects elements without children, which is
        // what the deprecated five-argument constructor did as well.
        finished = Some(new Elem(null, tag, attrs, xml.TopScope, true, children.result(): _*))
      case _ => // comments and processing instructions are skipped
    }
  }

  // Fail loudly instead of handing a null Node to the caller.
  finished.getOrElse(
    throw new IllegalStateException("XML ended before closing tag of <" + tag + ">"))
}
Jim B.
  • 4,512
  • 3
  • 25
  • 53
2

Based on Jim Baldwin's answer I created an iterator, which gets nodes at a specific level (instead of specific tag):

import scala.io.Source
import scala.xml.parsing.FatalError
import scala.xml.{Elem, MetaData, Node, Text, TopScope}
import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader}


/**
  * Streaming XML parser which yields Scala XML Nodes.
  *
  * Iterates over every element found at nesting depth `wantedNodeLevel`
  * (the root element is level 0) and rebuilds each one, together with its
  * descendants, as a Node.
  *
  * Usage:
  *
  * val it = new StreamingXMLParser(pathToXML, 1)
  *
  * Will give you all book-nodes of
  *
  * <?xml version="1.0" encoding="UTF-8"?>
  * <books>
  *     <book>
  *         <title>A book title</title>
  *     </book>
  *     <book>
  *         <title>Another book title</title>
  *     </book>
  * </books>
  *
  * Rebuilt elements keep their label, attributes, text and entity references.
  * Namespace prefixes are not carried over (elements use `TopScope`), and
  * comments and processing instructions are dropped.
  *
  * The first event is read eagerly on construction. The file is closed once
  * the last event has been read; a caller that abandons the iterator early
  * should close `file` itself.
  *
  * @param filename        path of the XML file to read
  * @param wantedNodeLevel nesting depth of the elements to emit (0 = root element)
  */
class StreamingXMLParser(filename: String, wantedNodeLevel: Int) extends Iterator[Node] {
    val file = Source.fromFile(filename)
    val it = new XMLEventReader(file)
    var currentLevel = 0
    var nextEvent = it.next() // one-event lookahead

    // True once `nextEvent` has been handed out and the reader had nothing left
    // to refill it with. Checking `it.hasNext` alone would lose the last event.
    private var exhausted = false

    /** Returns the buffered event and refills the lookahead.
      *
      * When the reader has no further events the parser is marked exhausted and
      * the file is closed.
      *
      * @throws NoSuchElementException if every event has already been returned
      */
    def getNext(): scala.xml.pull.XMLEvent = {
        if (exhausted) throw new NoSuchElementException("no more XML events")
        val currentEvent = nextEvent
        if (it.hasNext) {
            nextEvent = it.next()
        } else {
            exhausted = true
            file.close()
        }
        currentEvent
    }

    /** Skips ahead until the lookahead is the start of an element at
      * `wantedNodeLevel`, tracking the nesting depth of everything skipped.
      *
      * Calling this repeatedly without `next()` in between is safe: once a
      * wanted start event is buffered no further events are consumed.
      *
      * @return true if such an element exists, false once the events run out
      */
    def hasNext: Boolean = {
        var found = false
        while (!found && !exhausted) {
            nextEvent match {
                case _: EvElemStart if currentLevel == wantedNodeLevel =>
                    found = true
                case _: EvElemStart =>
                    // an ancestor of the wanted level: descend into it
                    currentLevel += 1
                    getNext()
                case _: EvElemEnd =>
                    currentLevel -= 1
                    getNext()
                case _ =>
                    getNext()
            }
        }
        found
    }

    /** Returns the next element at `wantedNodeLevel`, fully rebuilt.
      *
      * @throws NoSuchElementException if there is no further element at that level
      */
    def next(): Node = {
        if (!hasNext) {
            throw new NoSuchElementException("no more elements at level " + wantedNodeLevel)
        }

        // hasNext left the wanted start event in the lookahead.
        getNext() match {
            case EvElemStart(_, tag, attrs, _) =>
                currentLevel += 1
                getElemWithChildren(tag, attrs)
            case other =>
                throw new IllegalStateException("expected a start tag but found " + other)
        }
    }

    /** Rebuilds the element whose start event was just consumed.
      *
      * Reads events up to and including the matching end event, recursing into
      * nested elements and keeping `currentLevel` in step.
      *
      * @param tag   label of the element being rebuilt
      * @param attrs attributes taken from the element's start event
      * @throws FatalError if the events run out before the element is closed
      */
    def getElemWithChildren(tag: String, attrs: MetaData): Node = {
        // A builder appends in amortised O(1); `List :+` copies the list each time.
        val children = List.newBuilder[Node]
        var finished: Option[Node] = None

        while (finished.isEmpty && !exhausted) {
            getNext() match {
                case EvElemStart(_, t, a, _) =>
                    currentLevel += 1
                    children += getElemWithChildren(t, a)
                case EvText(t) =>
                    children += Text(t)
                // Predefined entities arrive as their own events; dropping them
                // would turn "a &amp; b" into "a  b".
                case scala.xml.pull.EvEntityRef(entity) =>
                    children += scala.xml.EntityRef(entity)
                case EvElemEnd(_, _) =>
                    currentLevel -= 1
                    finished = Some(new Elem(null, tag, attrs, TopScope, true, children.result(): _*))
                case _ => // comments and processing instructions are skipped
            }
        }

        finished.getOrElse(throw new FatalError("Failed to parse XML."))
    }
}
arve0
  • 3,424
  • 26
  • 33