I have a code that reads a certain csv file and splits it into parts with 500000 rows each. After spliting, it deletes de original file. The problem is that sometimes, this file is too big and I run out of disk space before the splits finishes, so I needed to delete the file after reading and before spliting it. Here's the full code:
import pandas as pd
import csv
import os
def csv_splitter():
chunk_size = 500000
batch_no = 1
file_count = 0
file_location = r'C:\Users\Documents'
valid_files = []
file_name = 'file_output'
for file in os.listdir(file_location):
if file_name in file:
if file.partition(file_name)[0] == "":
valid_files.append(file)
if len(valid_files) == 0:
print('File not found')
return()
archive = str(file_location) + '\\' + str(valid_files[0])
for chunk in pd.read_csv(archive, chunksize = chunk_size, encoding ='latin1', delimiter = '|', dtype = 'str'):
chunk.to_csv(file_name + '_split_' + str(batch_no) + '.csv', quoting = csv.QUOTE_ALL, sep = '|', index = False)
batch_no += 1
if os.path.exists(archive):
os.remove(archive)
for path in os.listdir(file_location):
if os.path.isfile(os.path.join(file_location, path)):
file_count += 1
print('The file was split in ' + str(file_count) + ' parts')
return()
I tried to put the read_csv in a variable and after that delete the archive, but it returns an error saying the file is in use by another program. It ended like this:
test = pd.read_csv(archive, chunksize = chunk_size, encoding ='latin1', delimiter = '|', dtype = 'str')
if os.path.exists(archive):
os.remove(archive)
for chunk in test:
chunk.to_csv(archive[0:-4] + '_split_' + str(batch_no) + '.csv', quoting = csv.QUOTE_ALL, sep = '|', index = False)
batch_no += 1
Can someone please help me?