how do you convert xml file to data frame or csv output in python

Question

I have this xml file that would like to convert the contents to data frame for csv file in python:

<?xml version="1.0" encoding="utf-8"?>
<dashboardreport name="jvm_report" version="7.0.21.1017" reportdate="2018-08-08T10:37:01.510-04:00" description="">
  <source name="CORP_GTM">
    <filters summary="from Jul-30 23:40 to Jul-31 02:40">
      <filter>tf:CustomTimeframe?1533008450802:1533019250802</filter>
    </filters>
  </source>
  <reportheader>
    <reportdetails>
      <user>test1</user>
    </reportdetails>
  </reportheader>
  <data>
    <chartdashlet name="jvm_mem_percent" description="" showabsolutevalues="false">
      <measures structuretype="tree">
        <measure measure="Memory Utilization - Memory Utilization (split by Agent)" color="#800080" aggregation="Maximum" unit="%" thresholds="false" drawingorder="1">
          <measure measure="Memory Utilization - test@server1" color="#7aebd0" aggregation="Maximum" unit="%" thresholds="false">
            <measurement timestamp="1533008460000" avg="11.116939544677734" min="11.007165908813477" max="11.143875122070312" sum="66.7016372680664" count="6"></measurement>
            <measurement timestamp="1533008520000" avg="11.204706827799479" min="11.144883155822754" max="11.268420219421387" sum="67.22824096679688" count="6"></measurement>
          </measure>
          <measure measure="Memory Utilization - test@server2" color="#a6f2e0" aggregation="Maximum" unit="%" thresholds="false">
            <measurement timestamp="1533008460000" avg="11.900418599446615" min="10.386141777038574" max="13.744248390197754" sum="71.40251159667969" count="6"></measurement>
            <measurement timestamp="1533008520000" avg="11.139397939046225" min="10.617960929870605" max="11.427289009094238" sum="66.83638763427734" count="6"></measurement>
          </measure>
          <measure measure="Memory Utilization - test@server3" color="#dd2271" aggregation="Maximum" unit="%" thresholds="false">
            <measurement timestamp="1533008460000" avg="8.395787556966146" min="8.340044021606445" max="8.429450035095215" sum="50.374725341796875" count="6"></measurement>
            <measurement timestamp="1533008520000" avg="8.490419387817383" min="8.456218719482422" max="8.5205659866333" sum="50.9425163269043" count="6"></measurement>
           </measure>
            </measure>
      </measures>
    </chartdashlet>
    <chartdashlet name="jvm_trans_errors" description="" showabsolutevalues="false">
      <measures structuretype="tree"></measures>
    </chartdashlet>
    <chartdashlet name="jvm_trans" description="" showabsolutevalues="false">
      <measures structuretype="tree">
        <measure measure="Count Backend - Count Backend (split by Agent)" color="#8080c0" aggregation="Sum" unit="num" thresholds="false" drawingorder="1">
          <measure measure="Count Backend - test@server1" color="#e44e8d" aggregation="Sum" unit="num" thresholds="false">
            <measurement timestamp="1533010380000" avg="1.0" min="1.0" max="1.0" sum="1.0" count="1"></measurement>
            <measurement timestamp="1533011340000" avg="1.0" min="1.0" max="1.0" sum="10.0" count="10"></measurement>
            <measurement timestamp="1533013080000" avg="1.0" min="1.0" max="1.0" sum="1.0" count="1"></measurement>
            <measurement timestamp="1533013200000" avg="1.0" min="1.0" max="1.0" sum="1.0" count="1"></measurement>
            <measurement timestamp="1533014940000" avg="1.0" min="1.0" max="1.0" sum="2.0" count="2"></measurement>
            <measurement timestamp="1533015780000" avg="1.0" min="1.0" max="1.0" sum="1.0" count="1"></measurement>
            <measurement timestamp="1533018480000" avg="1.0" min="1.0" max="1.0" sum="1.0" count="1"></measurement>
            <measurement timestamp="1533018540000" avg="1.0" min="1.0" max="1.0" sum="2.0" count="2"></measurement>
          </measure>
          <measure measure="Count Backend - test@server2" color="#e5cf4d" aggregation="Sum" unit="num" thresholds="false">
            <measurement timestamp="1533009060000" avg="1.0" min="1.0" max="1.0" sum="10.0" count="10"></measurement>
            <measurement timestamp="1533009120000" avg="1.0" min="1.0" max="1.0" sum="1.0" count="1"></measurement>
            <measurement timestamp="1533009420000" avg="1.0" min="1.0" max="1.0" sum="3.0" count="3"></measurement>
            <measurement timestamp="1533009480000" avg="1.0" min="1.0" max="1.0" sum="5.0" count="5"></measurement>
            <measurement timestamp="1533010020000" avg="1.0" min="1.0" max="1.0" sum="4.0" count="4"></measurement>
            <measurement timestamp="1533010320000" avg="1.0" min="1.0" max="1.0" sum="1200.0" count="1200"></measurement>
          </measure>
          <measure measure="Count Backend - test@server3" color="#dec321" aggregation="Sum" unit="num" thresholds="false">
            <measurement timestamp="1533008460000" avg="1.0" min="1.0" max="1.0" sum="4.0" count="4"></measurement>
            <measurement timestamp="1533008520000" avg="1.0" min="1.0" max="1.0" sum="5.0" count="5"></measurement>
            <measurement timestamp="1533008580000" avg="1.0" min="1.0" max="1.0" sum="9.0" count="9"></measurement>
            <measurement timestamp="1533008640000" avg="1.0" min="1.0" max="1.0" sum="5.0" count="5"></measurement>
          </measure>       
          </measure>
        </measures>
    </chartdashlet>
  </data>
</dashboardreport>

the output needs to look like this:

timestamp    max           count    node
1.53301E+12 11.14387512 6   Memory Utilization - test@server1
1.53301E+12 11.26842022 6   Memory Utilization - test@server1
1.53301E+12 13.74424839 6   Memory Utilization - test@server2
1.53301E+12 11.42728901 6   Memory Utilization - test@server2
1.53301E+12 8.429450035 6   Memory Utilization - test@server3
1.53301E+12 8.520565987 6   Memory Utilization - test@server3
1.53301E+12 1   1   Count Backend - test@server1
1.53301E+12 1   10  Count Backend - test@server1
1.53301E+12 1   1   Count Backend - test@server1
1.53301E+12 1   1   Count Backend - test@server1

I can do this in R like this:

doc <- read_xml("C:/test1/test.xml")
  dat<-xml_find_all(doc, ".//measure/measure") %>%
    map_df(function(x) {
      xml_find_all(x, ".//measurement") %>%
        map_df(~as.list(xml_attrs(.))) %>%
        select(-min, -avg, -sum) %>%
        mutate(node=xml_attr(x, "measure"))
    })

I need to do this in python, any ideas?

Some useful solutions here: https://stackoverflow.com/questions/28259301/how-to-convert-an-xml-file-to-nice-pandas-dataframe — Michael B, Aug 08 '18 at 15:53
I believe there is an error in your xml file, there are some "measure" tags inside some other "measure" tags — Nicolò Gasparini, Aug 10 '18 at 17:52
Does this answer your question? [How to convert an XML file to nice pandas dataframe?](https://stackoverflow.com/questions/28259301/how-to-convert-an-xml-file-to-nice-pandas-dataframe) — iacob, Apr 21 '21 at 07:53

Rakesh · Accepted Answer · 2018-08-14T05:57:30.630

One approach is to pre-process your XML file and then feed it to pandas. I am using ElementTree in this example.

Ex:

import pandas as pd
import xml.etree.ElementTree as ET

def getMetrics(file_name):
    tree = ET.parse(file_name)
    root = tree.getroot()
    result = []
    for measure in root.iter('measure'):                         #Get all 'measure' tag
        node = measure.attrib["measure"].split("-")[0].strip()    #Get Node
        for measurement in measure:                              #Get Metrics Information
            if "timestamp" in measurement.attrib:
                result.append(dict(node=node, timestamp=measurement.attrib.get("timestamp"), max=measurement.attrib["max"], count=measurement.attrib["count"]))
    return result

df = pd.DataFrame(getMetrics(filename), columns=["timestamp", "max", "count", "node"])          #Form Dataframe
print(df)

df.to_csv("Your_Output.csv")     #Write to CSV.

Output:

        timestamp                 max count                node
0   1533008460000  11.143875122070312     6  Memory Utilization
1   1533008520000  11.268420219421387     6  Memory Utilization
2   1533008460000  13.744248390197754     6  Memory Utilization
3   1533008520000  11.427289009094238     6  Memory Utilization
4   1533008460000   8.429450035095215     6  Memory Utilization
5   1533008520000     8.5205659866333     6  Memory Utilization
6   1533010380000                 1.0     1       Count Backend
7   1533011340000                 1.0    10       Count Backend
8   1533013080000                 1.0     1       Count Backend
9   1533013200000                 1.0     1       Count Backend
10  1533014940000                 1.0     2       Count Backend
11  1533015780000                 1.0     1       Count Backend
12  1533018480000                 1.0     1       Count Backend
13  1533018540000                 1.0     2       Count Backend
14  1533009060000                 1.0    10       Count Backend
15  1533009120000                 1.0     1       Count Backend
16  1533009420000                 1.0     3       Count Backend
17  1533009480000                 1.0     5       Count Backend
18  1533010020000                 1.0     4       Count Backend
19  1533010320000                 1.0  1200       Count Backend
20  1533008460000                 1.0     4       Count Backend
21  1533008520000                 1.0     5       Count Backend
22  1533008580000                 1.0     9       Count Backend
23  1533008640000                 1.0     5       Count Backend

Edit as per comment. If you want to pass the xml from requests use ET.fromstring and pass r.content or r.text.

Ex:

import pandas as pd
import xml.etree.ElementTree as ET

def getMetrics(file_name):
    root = ET.fromstring(file_name)
    result = []
    for measure in root.iter('measure'):                         #Get all 'measure' tag
        node = measure.attrib["measure"].split("-")[0].strip()    #Get Node
        for measurement in measure:                              #Get Metrics Information
            if "timestamp" in measurement.attrib:
                result.append(dict(node=node, timestamp=measurement.attrib.get("timestamp"), max=measurement.attrib["max"], count=measurement.attrib["count"]))
    return result

df = pd.DataFrame(getMetrics(r.content), columns=["timestamp", "max", "count", "node"])          #Form Dataframe
print(df)

my data xml data stored in an object called "r", when I call your function "df = pd.DataFrame(getMetrics(r), columns=["timestamp", "max", "count", "node"]) ", not able to extract the data. — user1471980, Aug 13 '18 at 19:18
is `r` the path to your xml file?...You need to provide the full path to your xml file Ex: `getMetrics(r"C:\Users\User\Desktop\testFiles\A.xml")` — Rakesh, Aug 13 '18 at 19:20
import requests from requests.auth import HTTPDigestAuth url = 'http://url' r = requests.get(url, auth=('user', 'pass')) — user1471980, Aug 13 '18 at 19:23
can I feed an oject to getMetrics function. I rather feed the object than write and read files to this function. — user1471980, Aug 13 '18 at 19:34
I'm almost there. I also need the server column, I need to extract "Count Backend - test@server1" from this line "". The xml file has data per each server, there fore I also need the server info as column. — user1471980, Aug 14 '18 at 12:59
replace `node = measure.attrib["measure"].split("-")[0].strip()` with `node = measure.attrib["measure"].strip() ` — Rakesh, Aug 14 '18 at 13:04

Nicolò Gasparini · Answer 2 · 2018-08-11T08:53:58.653

You should use the builtin library xml in Python.

Now, your tags and attributes aren't standard so I had to create a function that might be hard-coded for your problem, but others can use it as a guideline.

Considering this kind of tag as the only source of data you have and getting its node attribute from the parent tag:

<measurement timestamp="1533008520000" avg="8.490419387817383" min="8.456218719482422" max="8.5205659866333" sum="50.9425163269043" count="6"></measurement>

The following function should work, using Pandas to create a dataframe and exporting it to a .csv file:

from xml.dom import minidom
import pandas as pd

def convert():
    filename = 'teststack.xml'
    document = minidom.parse(filename)
    items = document.getElementsByTagName('measurement')

    df = pd.DataFrame(columns=["timestamp", "max", "count", "node"])
    for i, item in enumerate(items):
        # Creating new line for every item
        df.loc[i] = [
            item.getAttribute('timestamp'),
            item.getAttribute('max'),
            item.getAttribute('count'),
            item.parentNode.getAttribute('measure')
        ]

    # Exporting file
    df.to_csv("export.csv")
    return df

Just change the filename with your .xml file and it should work. Once you have the dataframe you can work however you like to modify precision, approximation and other features of your data.

tchatow · Answer 3 · 2018-08-13T20:57:51.723

Here is a solution using included libraries only and Python 3.6 - no need for pandas

CSV:

import csv
import xml.etree.ElementTree

e = xml.etree.ElementTree.parse('data.xml').getroot()

with open('out.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    for data in e.iter('measures'):
        measures = data.findall('measure/measure')
        for measure in measures:
            for row in measure:
                csv_writer.writerow([row.get('timestamp'), row.get('max'), row.get('count'), measure.get('measure')])

Columns:

import xml.etree.ElementTree

e = xml.etree.ElementTree.parse('data.xml').getroot()

row_data = [['timestamp', 'max', 'count', 'node']]
widths = [len(i) for i in row_data[0]]

for data in e.iter('measures'):
    measures = data.findall('measure/measure')
    for measure in measures:
        for row in measure:
            row_list = [row.get('timestamp'), row.get('max'), row.get('count'), measure.get('measure')]
            row_data.append(row_list)

            for i, val in enumerate(row_list):
                if len(val) > widths[i]:
                    widths[i] = len(val)

with open('out.txt', 'w') as txt_writer:
    for row in row_data:
        txt_writer.write(' '.join([f"{row[i]: <{widths[i]}}" for i in range(4)]) + '\n')

score 0 · Answer 4 · answered Aug 17 '18 at 12:40

import pandas as pd
import xml.etree.ElementTree as ET

def getMetrics(file_name):
    tree = ET.parse(file_name)
    root = tree.getroot()
    result = []
    for measure in root.iter('measure'):                         #Get all 'measure' tag
        node = measure.attrib["measure"].split("-")[0].strip()    #Get Node
        for measurement in measure:                              #Get Metrics Information
            if "timestamp" in measurement.attrib:
                result.append(dict(node=node, timestamp=measurement.attrib.get("timestamp"), max=measurement.attrib["max"], count=measurement.attrib["count"]))
    return result

df = pd.DataFrame(getMetrics(filename), columns=["timestamp", "max", "count", "node"])          #Form Dataframe
print(df)

df.to_csv("Your_Output.csv")     #Write to CSV.

how do you convert xml file to data frame or csv output in python

4 Answers4