I have an XML file of almost 2 GB. The task is to load the data from this XML file into a database. To process the file I use a custom generator function, but as it runs, memory usage keeps accumulating until my RAM is full and the process is killed before it finishes. Please tell me what is wrong in my code. As far as I know, the generator approach is meant precisely for very large files and should not fill the RAM completely. (For reference, the general iterparse pattern I was trying to follow is included at the end of the post.)
from lxml import etree
from bs4 import BeautifulSoup
from itertools import chain
import os, glob, re
def generator_get_element(xmlfile):
    # generator to get the top-level elements of an XML file
    doc = etree.iterparse(xmlfile, events=('start', 'end'))
    _, root = next(doc)
    start_tag = None
    for event, element in doc:
        if event == 'start' and start_tag is None:
            start_tag = element.tag
        if event == 'end' and element.tag == start_tag:
            yield element
            start_tag = None
            root.clear()
def get_info_from_element():
    # parse all information from the received element of the xml file
    # download_file()
    intermediate_list = list()
    os.chdir("./xml_files")
    for file in glob.glob("*.xml"):
        my_fun = generator_get_element(file)
        for i in my_fun:
            try:
                parts = ([i.text] +
                         list(chain(*([etree.tostring(c)] for c in i.getchildren()))) +
                         [i.tail])
                filter_parts = filter(None, parts)
                for i in filter_parts:
                    bf = BeautifulSoup(i, 'lxml')
                    intermediate_list.append(bf.text)
                intermediate_list = [line.replace('\n', ' ') for line in intermediate_list]
                trade_name = ' '.join(list(set(intermediate_list[4].split(' '))))
                insert_program(
                    db_con,
                    id=re.sub(">", "", intermediate_list[1].strip()),
                    type=re.sub(">", "", intermediate_list[2].strip()),
                    tradeName=trade_name.strip(),
                    ico=re.sub("[^0-9]", "", intermediate_list[5].strip()),
                    fullAddress=re.sub(">", "", intermediate_list[6].strip())
                )
                intermediate_list.clear()
            except:
                intermediate_list.clear()
                pass
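
For reference, this is the general iterparse streaming pattern I was trying to follow (a minimal sketch; iter_records, the tag name 'record', and 'big.xml' are just placeholders, not the names in my real file):

from lxml import etree

def iter_records(xmlfile, tag='record'):
    # only 'end' events for the record tag are reported
    context = etree.iterparse(xmlfile, events=('end',), tag=tag)
    for _, elem in context:
        yield elem
        # free the finished element and everything parsed before it,
        # so memory stays roughly constant regardless of file size
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]
    del context

# usage sketch: stream one file without loading it whole
# for rec in iter_records('big.xml'):
#     process(rec)

In my code I clear the whole root after each yield instead, which I expected to have the same effect.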