I have a folder full of metadata xml files I am parsing them and write the output as single lines to a csv. I am trying to use multiprocessing to speed this up and I have about 13000 files in the folder however I see that as I increase my pool size the program does not process all files. This is my code till now
from multiprocessing.dummy import Pool as ThreadPool
from multiprocessing import Pool, Queue
import os,subprocess,csv
mfile=r"C:\demo\metadata_parsed.csv"
errorlog=r"C:\demo\errorlog.csv"
with open(mfile,'wb') as csvfile:
writer=csv.DictWriter(csvfile,fieldnames=["id_no", "system:time_start", "platform"], delimiter=',')
writer.writeheader()
with open(errorlog,'wb') as csvfile:
writer=csv.DictWriter(csvfile,fieldnames=["id_no"], delimiter=',')
writer.writeheader()
def get_image_paths(folder):
return(os.path.join(folder,f)
for f in os.listdir(folder))
def download_item(images):
infilename=images
fsp=images.split("_x")[0]
from xml.dom import minidom #This gets the main xml parse tree
xmldoc=minidom.parse(infilename)
ps=xmldoc.getElementsByTagName("ps:EarthObservationMetaData")[0]
observation=xmldoc.getElementsByTagName("ps:EarthObservationResult") [0]
eopfilename=xmldoc.getElementsByTagName("eop:fileName")[0].firstChild.data
meta=xmldoc.getElementsByTagName("ps:EarthObservationMetaData")[0]
acquisition= meta.getElementsByTagName("eop:acquisitionDate")[0].firstChild.data
date_time = acquisition.split("T")[0]
pattern = '%Y-%m-%d'
epoch = int(time.mktime(time.strptime(date_time, pattern)))*1000
print("epoch time", epoch)
with open("C:\demo\metadata_parsed.csv",'a') as csvfile:
writer=csv.writer(csvfile,delimiter=',',lineterminator='\n')
writer.writerow([os.path.basename(fsp),epoch,platform])
csvfile.close()
if __name__ == '__main__':
t1=time.time()
folder=r"F:\demo\metadata"
images=get_image_paths(folder)
pool = Pool(20)
pool.map(download_item,images)
pool.close() # signal that we won't submit any more tasks to pool
pool.join() # wait until all processes are done
print("Pool took:",time.time()-t1)
And and all help would be appreciated