I'm trying to iterate through a lot of xml files that have ~1000 individual nodes that I want to iterate through to extract specific attributes (each node has 15 or so attributes, I only want one). In the end, there should be about 4 million rows. My code is below, but I have a feeling that it's not time efficient. What can I optimize about this?
import os, pandas as pd, xml.etree.ElementTree as xml
#init master df as accumulator of temp dfs
master_df = pd.DataFrame(
columns = [
'col1',
'col2',
'col3',
'col4'
])
dir = 'C:\\somedir'
#iterate through files
for file in os.listdir(dir):
#init xml handle and parse
file = open(str(dir+"{}").format('\\'+file)
parse = xml.parse(file)
root = parse.getroot()
#var assignments with desired data
parent_node1 = str(root[0][0].get('pn1'))
parent_node2 = str(root[0][1].get('pn2'))
#resetting iteration dependent variables
count = 0
a_dict = {}
#iterating through list of child nodes
for i in list(root[1].iter())[1:]:
child_node1 = str(i.get('cn1'))
child_node2 = str(i.get('cn2'))
a_dict.update({
count: {
"col1" : parent_node1,
'col2': child_node1,
"col3": parent_node2,
"col4" : child_node2
}})
count = count+1
temp_df = pd.DataFrame(a_dict).T
master_df = pd.merge(
left = master_df,
right = temp_df,
how = 'outer'
)