I am trying to read a large XML file, around 84 GB: the "Posts.xml" file from the Stack Overflow data dump.
I noticed pandas has a pandas.read_xml() API, but when I try to use it, it fails with a MemoryError:
MemoryError Traceback (most recent call last)
<ipython-input-9-86c9119d39ba> in <module>
----> 1 df = pd.read_xml(posts_file)
~\anaconda3\envs\analytics\lib\site-packages\pandas\io\xml.py in read_xml(path_or_buffer, xpath, namespaces, elems_only, attrs_only, names, encoding, parser, stylesheet, compression, storage_options)
925 """
926
--> 927 return _parse(
928 path_or_buffer=path_or_buffer,
929 xpath=xpath,
~\anaconda3\envs\analytics\lib\site-packages\pandas\io\xml.py in _parse(path_or_buffer, xpath, namespaces, elems_only, attrs_only, names, encoding, parser, stylesheet, compression, storage_options, **kwargs)
726 raise ValueError("Values for parser can only be lxml or etree.")
727
--> 728 data_dicts = p.parse_data()
729
730 return _data_to_frame(data=data_dicts, **kwargs)
~\anaconda3\envs\analytics\lib\site-packages\pandas\io\xml.py in parse_data(self)
389 from lxml.etree import XML
390
--> 391 self.xml_doc = XML(self._parse_doc(self.path_or_buffer))
392
393 if self.stylesheet is not None:
~\anaconda3\envs\analytics\lib\site-packages\pandas\io\xml.py in _parse_doc(self, raw_doc)
540 )
541
--> 542 handle_data = get_data_from_filepath(
543 filepath_or_buffer=raw_doc,
544 encoding=self.encoding,
~\anaconda3\envs\analytics\lib\site-packages\pandas\io\xml.py in get_data_from_filepath(filepath_or_buffer, encoding, compression, storage_options)
611 ) as handle_obj:
612 filepath_or_buffer = (
--> 613 handle_obj.handle.read()
614 if hasattr(handle_obj.handle, "read")
615 else handle_obj.handle
MemoryError:
I could not find any option to read the XML file chunk by chunk. Can anyone suggest how I can overcome this limitation?
*** Updated ***
As it turns out, pandas' read_xml() cannot parse an XML file larger than about 1 GB, even if the system has 100 GB of RAM.
So I ended up parsing the XML file iteratively. The code is shared below; I hope it helps someone.
import csv
import xml.etree.ElementTree as ET

import pandas as pd


def save_df(df, file_name, append=False):
    # Append to the CSV without repeating the header, or write a fresh file.
    if append:
        df.to_csv(file_name, index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC, mode="a", header=False)
    else:
        df.to_csv(file_name, index=False, quotechar='"', quoting=csv.QUOTE_NONNUMERIC)


def append_to_file(rows, output_csv_file, append):
    # Turn the buffered rows into a DataFrame, drop duplicates within the chunk,
    # and flush it to disk.
    df = pd.DataFrame(rows)
    df.drop_duplicates('Id', inplace=True)
    save_df(df, output_csv_file, append=append)
    print("%d rows have been appended to the CSV file %s" % (len(df), output_csv_file))
    return len(df)


def convert_xml_to_csv_iteratively(XML_file, columns, output_csv_file, threshold):
    if output_csv_file is None:
        output_csv_file = XML_file[:-4] + ".csv"
    print("Going to convert the XML file to a CSV file")
    # iterparse streams the file and fires an event each time an element is
    # closed, so the whole document never has to be held in memory.
    context = ET.iterparse(XML_file, events=("end",))
    # Variables to process the output
    total_questions = 0
    unique_rows = 0
    rows = []
    cur_count = 0
    append = False
    for event, elem in context:
        if elem.tag == "row":
            dic = {}
            for col in columns:
                dic[col] = elem.attrib.get(col, '')
            rows.append(dic)
            # Progress report
            if total_questions % 100000 == 0:
                print('Total questions: %d' % total_questions)
            elem.clear()  # free the parsed element to keep memory usage bounded
            total_questions += 1
            cur_count += 1
            # Flush the buffered rows to the CSV once the threshold is reached
            if cur_count > threshold:
                unique_rows += append_to_file(rows, output_csv_file, append)
                append = True
                cur_count = 0
                rows = []
    # Write out any rows left in the buffer
    if len(rows) > 0:
        unique_rows += append_to_file(rows, output_csv_file, append)
    print("Total number of rows: %d vs unique rows: %d" % (total_questions, unique_rows))
    return total_questions
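For reference, a call for converting Posts.xml might look like the sketch below. The column list is only an illustrative subset of the attributes in Posts.xml, and the file names and threshold are placeholders to adapt to your own setup:

# Hypothetical example invocation; adjust the paths, columns and threshold as needed.
POSTS_COLS = ['Id', 'PostTypeId', 'CreationDate', 'Score', 'ViewCount',
              'Title', 'Body', 'Tags', 'AnswerCount']
convert_xml_to_csv_iteratively("Posts.xml", POSTS_COLS, "Posts.csv", threshold=500000)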