I have a folder containing about 100 GB of csv files that I want to merge into a single csv. The file names sort in the order the rows should appear in the output. I've written a single-threaded script to tackle this, but it is understandably slow.
def JoinRows(rows_to_join):
    # rows_to_join is a sorted list of csv paths.
    # join_rows.csv is the output file that collects all the rows.
    with open('join_rows.csv', 'a') as f1:
        for row in rows_to_join:
            with open(row, 'r') as f2:
                for line in f2:
                    f1.write(line)
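For reference, streaming each file in larger blocks instead of line by line is a bit faster but still single threaded. This is just a rough sketch: join_rows_bulk is a made-up name, and it assumes there are no per-file headers and every file ends with a newline.

import shutil

def join_rows_bulk(rows_to_join, out_path='join_rows.csv'):
    # Stream every input file into the output in large blocks instead of line by line.
    with open(out_path, 'wb') as f_out:
        for path in sorted(rows_to_join):
            with open(path, 'rb') as f_in:
                shutil.copyfileobj(f_in, f_out)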
I also wrote a recursive version that isn't parallel yet. The idea is to join each csv onto another, delete the second file of the pair, and keep repeating until only one file is left, so the pairwise merges could be split among the available threads. Any suggestions on how best to parallelize this, or on a faster approach in general?
import os

def JoinRows(rows_to_join, init=False):
    # Merge every csv in rows_to_join into the first file and delete the rest.
    if init:
        rows_to_join.sort()
    n = len(rows_to_join)
    print(n)
    if n == 1:
        return rows_to_join[0]
    if n == 2:
        with open(rows_to_join[0], 'a') as f1, open(rows_to_join[1], 'r') as f2:
            for line in f2:
                f1.write(line)
        os.remove(rows_to_join[1])  # the second file has been folded into the first
        return rows_to_join[0]
    # Merge each half down to a single file, then merge the two survivors.
    left = JoinRows(rows_to_join[:n // 2])
    right = JoinRows(rows_to_join[n // 2:])
    return JoinRows([left, right])
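For the parallel part, what I have in mind is something like the untested sketch below: merge adjacent pairs of files in rounds until only one file is left. merge_pair and parallel_join are just placeholder names, and it assumes header-free files whose names sort into the right order. I used threads rather than processes since the work is almost entirely file I/O, but I don't know if that's the right call.

import os
import shutil
from concurrent.futures import ThreadPoolExecutor

def merge_pair(pair):
    # Append the second file of the pair onto the first, delete the second,
    # and return the surviving path.
    dst, src = pair
    with open(dst, 'ab') as f_out, open(src, 'rb') as f_in:
        shutil.copyfileobj(f_in, f_out)
    os.remove(src)
    return dst

def parallel_join(paths, max_workers=4):
    # Repeatedly merge adjacent pairs until only one file remains.
    paths = sorted(paths)
    while len(paths) > 1:
        pairs = list(zip(paths[0::2], paths[1::2]))
        leftover = [paths[-1]] if len(paths) % 2 else []
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            paths = list(pool.map(merge_pair, pairs)) + leftover
    return paths[0]

Is this a reasonable direction, or is a single drive going to be the bottleneck no matter how many threads I throw at it?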