Consider parsing data with DOM tools like etree
(or feature-rich, third-party lxml
) and then build a list of dictionaries at the repeating <FullSchedule>
element to be passed into DataFrame
constructor:
import urllib.request as rq
import xml.etree.ElementTree as et
import pandas as pd
url = "https://wbes.nrldc.in/xml/FullSchedule-(184)-30-01-2021.xml"
doc = rq.urlopen(url)
tree = et.fromstring(doc.read()) # NOTE: TAKES SEVERAL MINUTES DUE TO SIZE
data = [{t.tag:t.text.strip() if t.text is not None else None
for t in fs.findall("*")
} for fs in tree.findall(".//FullSchedule")]
df = pd.DataFrame(data)
df.shape
# (1152, 21)
df.columns
# Index(['Buyer', 'Seller', 'ScheduleName', 'ScheduleSubTypeName', 'ScheduleDate',
# 'ScheduleAmount', 'BuyerAmount', 'SellerAmount', 'PocInjectionLoss',
# 'PocDrawalLoss', 'StateInjectionLoss', 'StateDrawalLoss',
# 'DiscomInjectionLoss', 'DiscomDrawalLoss', 'Trader', 'LinkName',
# 'OffBarTotal', 'OffBarAllocatedFromPool', 'Open', 'Combined',
# 'ApprovalNo'], dtype='object')
Because <Buyer>
and <Seller>
contain nested elements, they are blank above. Hence consider additional parsing and compilation. Only difference above is the findall
XPath.
data = [{t.tag:t.text.strip() if t.text is not None else None
for t in fs.findall("*")
} for fs in tree.findall(".//FullSchedule/Buyer")]
df = pd.DataFrame(data)
print(df)
# Acronym ParentState WBESParentStateAcronym
# 0 HARYANA HARYANA HARYANA_STATE
# 1 JK&LADAKH JAMMU AND KASHMIR JK&LADAKH_UT
# 2 UPPCL UTTAR PRADESH UTTARPRADESH_STATE
# 3 JK&LADAKH JAMMU AND KASHMIR JK&LADAKH_UT
# 4 UPPCL UTTAR PRADESH UTTARPRADESH_STATE
# ... ... ...
# 1147 CHANDIGARH CHANDIGARH CHANDIGARH_UT
# 1148 PUNJAB PUNJAB PUNJAB_STATE
# 1149 DELHI DELHI DELHI_UT
# 1150 HARYANA HARYANA HARYANA_STATE
# 1151 CHANDIGARH CHANDIGARH CHANDIGARH_UT
data = [{t.tag:t.text.strip() if t.text is not None else None
for t in fs.findall("*")
} for fs in tree.findall(".//FullSchedule/Seller")]
df = pd.DataFrame(data)
print(df)
# Acronym ParentState WBESParentStateAcronym
# 0 KAMENG None None
# 1 KAPS None None
# 2 VSTPS V None None
# 3 SOLAPUR None None
# 4 LARA-I None None
# ... ... ...
# 1147 NAPP None None
# 1148 BHAKRA None None
# 1149 CHAMERA3 None None
# 1150 RAPPC None None
# 1151 BHAKRA None None
By the way, pandas.read_xml()
is in the works by me and uses above algorithm where above may soon be handled with below. See Git issues post.
url = "https://wbes.nrldc.in/xml/FullSchedule-(184)-30-01-2021.xml"
fs_df = pd.read_xml(url, xpath=".//FullScheule", parser="lxml")
fs_df = pd.read_xml(url, xpath=".//FullScheule", parser="etree")
buyer_df = pd.read_xml(url, xpath=".//FullScheule/Buyer")
seller_df = pd.read_xml(url, xpath=".//FullScheule/Seller")