I have a python 3 script below that is supposed to download an xml file and split it into smaller files with only 500 items each. I am having two problems:
- the last item in the original xml is not present in the split files
- if the original xml was 1000 items long it will create a 3rd empty xml file.
Can anyone tell me where there could be such an error in my code to cause these symptoms?
import urllib.request as urllib2
from lxml import etree
def _yield_str_from_net(url, car_tag):
xml_file = urllib2.urlopen(url)
for _, element in etree.iterparse(xml_file, tag=car_tag):
yield etree.tostring(element, pretty_print=True).decode('utf-8')
element.clear()
def split_xml(url, car_tag, save_as):
output_file_num = 1
net_file_iter = _yield_str_from_net(url, car_tag)
while True:
file_name = "%s%s.xml" % (save_as, output_file_num)
print("Making %s" % file_name)
with open(file_name, mode='w', encoding='utf-8') as the_file:
for elem_count in range(500): # want only 500 items
try:
elem = next(net_file_iter)
except StopIteration:
return
the_file.write(elem)
print("processing element #%s" % elem_count)
output_file_num += 1
if __name__ == '__main__':
split_xml("http://www.my_xml_url.com/",
'my_tag',
'my_file')