I have to process a large XML document, for which I have several data-cleaning and manipulation tasks to do.
The basic code below uses `xml.etree.ElementTree`.
As the file is very large (about 2 GB), I would like to be able to print the value of my `tagCounts`
accumulator variable on a regular basis.
What is the cleanest way to implement a timer with ElementTree
that prints the content of `self.tagCounts` every 3 minutes?
Thanks
import pprint
import time
import xml.etree.ElementTree as ET
class TagCounter:
    """Parser target that counts how many times each element tag is opened.

    Meant to be passed as ``target`` to ``xml.etree.ElementTree.XMLParser``,
    which invokes ``start``/``end``/``data`` while parsing and ``close`` when
    the parser itself is closed.

    Args:
        report_interval: Minimum number of seconds between progress reports,
            or ``None`` (the default) to never report. Pass ``180`` to get a
            report roughly every 3 minutes.
        report: Callable invoked with ``tagCounts`` whenever a report is due.
            Defaults to ``pprint.pprint``.

    Reports are triggered from ``start``, so they are only emitted while
    elements are actually being parsed; no background thread is involved.
    """

    def __init__(self, report_interval=None, report=pprint.pprint):
        # Maps tag name -> number of start events seen so far.
        self.tagCounts = {}
        self._report_interval = report_interval
        self._report = report
        # Monotonic deadline of the next report; None disables reporting.
        self._next_report = (
            None
            if report_interval is None
            else time.monotonic() + report_interval
        )

    def start(self, tag, attrib):
        """Record one more occurrence of ``tag`` and report if it is time."""
        self.tagCounts[tag] = self.tagCounts.get(tag, 0) + 1
        if self._next_report is None:
            return
        now = time.monotonic()
        if now >= self._next_report:
            self._report(self.tagCounts)
            self._next_report = now + self._report_interval

    def end(self, tag):
        """Ignore closing tags; only opening tags are counted."""

    def data(self, data):
        """Ignore character data."""

    def close(self):
        """Return the dict of tag counts accumulated so far."""
        return self.tagCounts
def count_tags(filename, target=None, chunk_size=1 << 16):
    """Stream-parse an XML file and return what the parser target produces.

    Args:
        filename: Path of the XML file to parse.
        target: Parser target object handed to ``ET.XMLParser``. Defaults to
            a fresh ``TagCounter()``.
        chunk_size: Number of bytes read from the file per ``feed`` call.

    Returns:
        The value returned by ``parser.close()``; with the default target
        this is the dict of tag counts.
    """
    if target is None:
        target = TagCounter()
    parser = ET.XMLParser(target=target)
    # Binary mode lets the parser honour the encoding declared in the XML
    # itself instead of decoding with the platform's default text encoding.
    with open(filename, mode="rb") as f:
        # Fixed-size chunks keep memory bounded even when the document has
        # no line breaks at all.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            parser.feed(chunk)
    return parser.close()
if __name__ == "__main__":
    # Script entry point: count the tags in the OSM extract and show them.
    tags = count_tags("file.osm")
    pprint.pprint(tags)