I am attempting to write a script that joins two compressed files based on a match on the first column. I would like to do this in chunks, because the original code I work with was written for CSV files and raises a MemoryError when used with these larger files.
The code that raises the memory error (but works with smaller files):

import csv

f1 = open('file1.csv', 'r')
f2 = open('file2.csv', 'r')
f3 = open('output.csv', 'w')
c1 = csv.reader(f1)
c2 = csv.reader(f2)
c3 = csv.writer(f3)
file2 = list(c2)  # reads all of file2 into memory
for file1_row in c1:
    found = False
    results_row = file1_row  # moved out from nested loop
    for file2_row in file2:
        x = file2_row[1:]
        if file1_row[0] == file2_row[0]:
            results_row.append(x)
            found = True
            break
    if not found:
        results_row.append('Not found')
    c3.writerow(results_row)
f1.close()
f2.close()
f3.close()
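
From what I have read, the nested scan could be replaced by loading file2 into a dictionary keyed on the first column, so each file1 row becomes a single lookup instead of a pass over the whole list. A minimal sketch of that idea, using the same file names as above (it still holds file2 in memory once, and it assumes Python 3):

import csv

# Build a lookup table from file2: first column -> remaining columns.
# Note: if a key repeats in file2, the last row wins, whereas the
# nested loop above kept the first match.
with open('file2.csv', 'r', newline='') as f2:
    lookup = {row[0]: row[1:] for row in csv.reader(f2)}

with open('file1.csv', 'r', newline='') as f1, \
     open('output.csv', 'w', newline='') as f3:
    writer = csv.writer(f3)
    for row in csv.reader(f1):
        match = lookup.get(row[0])
        if match is not None:
            row.extend(match)  # flat columns, not a nested list like append(x)
        else:
            row.append('Not found')
        writer.writerow(row)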
I have tried to rework this so that it reads a chunk at a time, but I think I have the structure wrong:
f1 = open('final1.gz', 'r')  # opens the gzip file as plain text, not decompressed
f2 = open('final2.gz', 'r')
f3 = open('results.gz.DONE', 'w')
c1 = csv.reader(f1)
c2 = csv.reader(f2)
c3 = csv.writer(f3)
file2 = list(c2)
fileList = ['final_balance.gz', 'final_service.gz']
for fileName in fileList:
    with open(fileName, 'rb') as sourceFile:
        chunk = True
        while chunk:
            chunk = sourceFile.read(bufferSize)  # chunk is read but never used below
            #file2 = list(c2)  # MemoryError occurs on this line.
            for file1_row in c1:
                found = False
                results_row = file1_row  # moved out from nested loop
                for file2_row in file2:
                    x = file2_row[1:]
                    if file1_row[0] == file2_row[0]:
                        results_row.append(x)
                        found = True
                        break
                if not found:
                    results_row.append('Not found')
                c3.writerow(results_row)
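
I suspect part of the problem is that a plain open() on a .gz file feeds compressed bytes to csv.reader. As far as I understand, gzip.open() in text mode ('rt') streams the decompressed lines, so csv.reader can consume the file directly with no manual read(bufferSize) loop. A minimal sketch combining that with the dictionary lookup from above (same file names as in my attempt; assumes Python 3 and that final2.gz fits in memory as a dict):

import csv
import gzip

# gzip.open in 'rt' mode yields decompressed text line by line,
# so no manual chunking is needed to keep memory bounded for file1.
with gzip.open('final2.gz', 'rt', newline='') as f2:
    lookup = {row[0]: row[1:] for row in csv.reader(f2)}

with gzip.open('final1.gz', 'rt', newline='') as f1, \
     gzip.open('results.gz.DONE', 'wt', newline='') as f3:
    writer = csv.writer(f3)
    for row in csv.reader(f1):
        match = lookup.get(row[0])
        if match is not None:
            row.extend(match)
        else:
            row.append('Not found')
        writer.writerow(row)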
At this point I am getting the error at:

File "function.py", line 20
    file2 = list(c2)
MemoryError
I can't use pandas as I don't have access to it.