I am working on a project that requires me to convert massive XML files to JSON. I have written code to do this; however, it is too slow. I have looked at using lxml and BeautifulSoup, but I am unsure how to proceed.
I have included my code below. It works exactly as it is supposed to, except that it is too slow: it took around 24 hours to parse 100,000 records from a file smaller than 100 MB.
# Load the whole XML document into memory as one string. The `with` block
# guarantees the file handle is closed as soon as the read finishes (the
# original left it open for the life of the process).
with open('productdata_29.xml', 'r') as product_data:
    read_product_data = product_data.read()
def record_string_to_dict(record_string, node_map=None, target=None):
    """Extract the mapped fields of one record into a dictionary.

    The text of a single record is consumed tag by tag. For every tag whose
    name is a key of ``node_map``, the text that follows the tag (up to the
    next ``<``) is stored in ``target`` under ``node_map[tag_name]``. Tags
    that are not in ``node_map`` are skipped.

    Args:
        record_string: Inner text of one record, one element per line.
        node_map: Mapping from XML tag name to output key. Defaults to the
            module-level ``mapped_nodes``.
        target: Dictionary that receives the parsed fields. Defaults to the
            module-level ``single_record_dict``, which is what the
            one-argument call relies on.

    Returns:
        The ``target`` dictionary (mutated in place).

    Parsing stops quietly at the first point where the expected
    ``<tag>value<`` / newline layout is no longer found; fields stored
    before that point are kept. Note that a field is only stored once the
    newline that ends its line has been found.
    """
    # Resolve the defaults at call time so that a caller which rebinds the
    # module-level names between calls is still honoured.
    if node_map is None:
        node_map = mapped_nodes
    if target is None:
        target = single_record_dict

    # Loop until the text is used up. (The previous condition compared the
    # text with its own reverse, which cost a full copy per iteration and
    # skipped any input that happened to read the same backwards.)
    while record_string:
        try:
            tag_open = record_string.index('<')
            tag_close = record_string.index('>')
            temp_key = record_string[tag_open + 1:tag_close]
            record_string = record_string[tag_close + 1:]
            temp_value = record_string[:record_string.index('<')]
            # Cleaning the keys and values of unnecessary characters.
            # A value containing a newline loses its first three characters
            # (presumably the line break plus indentation after a tag with
            # no text of its own -- confirm against the input layout).
            if '\n' in temp_value:
                temp_value = temp_value[3:]
            # Self-closing tags (<name/>) keep only the name.
            if temp_key[-1] == '/':
                temp_key = temp_key[:-1]
            # Advance past the end of the line plus one further character.
            line_end = record_string.index('\n')
            record_string = record_string[line_end + 2:]
        except (ValueError, IndexError):
            # ValueError: an expected '<', '>' or newline is missing.
            # IndexError: the tag name was empty. Either way the remaining
            # text no longer matches the expected layout, so stop here.
            # Any other exception is a genuine bug and is left to propagate.
            break
        # Only tags listed in node_map are kept, renamed to the mapped key.
        if temp_key in node_map:
            target[node_map[temp_key]] = temp_value
    return target
# Walk through read_product_data one <record>...</record> block at a time.
#
# A scan cursor is advanced through the text instead of re-slicing the whole
# remaining document after every record: each such slice copies everything
# that is left, so the total work grew with (file size x number of records).
scan_pos = 0
while len(read_product_data) - scan_pos > 10:
    # Locate the next record. If there is no further complete record in the
    # tail, stop cleanly instead of raising ValueError.
    i = read_product_data.find('<record>', scan_pos)
    if i == -1:
        break
    close_at = read_product_data.find('</record>', i)
    if close_at == -1:
        break
    j = close_at + 8
    # Cut out the record, then trim 9 characters from the front and 10 from
    # the back to drop the opening tag, the closing tag and the characters
    # directly adjacent to them.
    single_record_string = read_product_data[i:j][9:-10]
    # Runs the parsing function on the single record found above; it fills
    # the module-level single_record_dict.
    record_string_to_dict(single_record_string)
    # Flushes single_record_dict to final_list, and starts a fresh dict for
    # the next record.
    final_list.append(single_record_dict)
    single_record_dict = {}
    # Moves the cursor past the record that was just processed.
    scan_pos = j
    # For keeping track/ease of use.
    print('Record ' + str(break_counter) + ' has been appended.')
    # Keeps track of the number of records. Once the condition below is met,
    # the collected records are written to a new file.
    break_counter += 1
    flush_counter += 1
    # NOTE(review): the second half of this condition depends on the initial
    # values of the counters, which are set outside this block -- confirm it
    # does what is intended. Records still in final_list when the loop ends
    # are not written anywhere by this block.
    if break_counter == 100 or flush_counter == break_counter:
        # NOTE(review): str(final_list) writes a Python repr, not JSON;
        # json.dumps would be needed for a valid JSON file.
        with open('record_list_' + str(file_counter) + '.txt', 'w') as record_list:
            record_list.write(str(final_list))
        # file_counter keeps track of how many files have been created, so
        # the next file has a different int at the end.
        file_counter += 1
        # Resets the per-file counter and buffer.
        break_counter = 0
        final_list = []
    # For testing purposes. Causes execution to stop once the number of
    # files written matches the integer.
    if file_counter == 2:
        break
# Leave read_product_data holding only the unprocessed tail, as before.
read_product_data = read_product_data[scan_pos:]
print('All records have been appended.')