0

My code takes 17 minutes to merge multiple CSV files. How can I reduce the processing time? Is there a faster approach?

import glob
import pandas as pd

# Gather every CSV directly inside dir_name and stack them into a single
# DataFrame with a fresh 0..n-1 index.
# NOTE(review): this excerpt relies on `os` and `dir_name` being defined
# elsewhere; neither is set up in the lines shown here.
all_csv_files = glob.glob(os.path.join(dir_name, '*.csv'))
combined_csv = pd.concat(
    (pd.read_csv(csv_path) for csv_path in all_csv_files),
    ignore_index=True,
)

Here is the full code. Unzipping is also slow (around 3 minutes); what changes should I make?

import os
import concurrent.futures
import glob
import time
import zipfile
import pandas as pd

def find_zip_files(root_dir):
    """Return the paths of all ``.zip`` files below *root_dir*, recursively.

    Only names that actually end in ``.zip`` are accepted. A plain substring
    test would also pick up names such as ``notes.zip.txt`` or ``data.zipx``,
    which are not archives.

    Args:
        root_dir: Directory to walk. A missing directory yields an empty list
            because ``os.walk`` produces nothing for it.

    Returns:
        A list of paths, each built by joining the containing folder and the
        file name.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(root_dir)
        for name in names
        if name.endswith('.zip')
    ]


# Start the wall-clock timer before any filesystem work so the final report
# includes the directory scan.
start = time.perf_counter()
source = "C:....."
destination = "C:....."
# get the zipped files
zip_files = find_zip_files(source)

def main(item, source_dir=None, destination_dir=None):
    """Merge the CSV files that belong to one zip archive into a single CSV.

    The function does not unzip anything itself. It looks for ``*.csv`` files
    in the directory whose path equals the archive path without its
    extension (``.../data.zip`` -> ``.../data``), concatenates them with a
    fresh index and writes the result to the destination directory.

    The output name is the archive's path relative to the source directory,
    without the extension, with path separators replaced by ``_``
    (``sub/data.zip`` -> ``sub_data.csv``).

    Args:
        item: Path of the zip archive.
        source_dir: Root directory the archive path is made relative to.
            Defaults to the module-level ``source``.
        destination_dir: Directory the merged CSV is written to. Defaults to
            the module-level ``destination``.

    Returns:
        None. Nothing is written when the directory contains no CSV files.
    """
    if source_dir is None:
        source_dir = source
    if destination_dir is None:
        destination_dir = destination

    # splitext only strips the final extension, so dots elsewhere in the path
    # (e.g. a directory called "john.doe") no longer truncate the name.
    dir_name = os.path.splitext(item)[0]
    all_csv_files = glob.glob(os.path.join(dir_name, '*.csv'))
    if not all_csv_files:
        # pd.concat raises ValueError on an empty sequence; there is nothing
        # to merge, so skip this archive instead of failing.
        return

    combined_csv = pd.concat(
        [pd.read_csv(csv_path) for csv_path in all_csv_files],
        ignore_index=True,
    )

    # relpath returns a normalised path without a leading separator, which
    # replaces the manual split-on-source / drop-first-character steps.
    relative_stem = os.path.splitext(os.path.relpath(item, source_dir))[0]
    save_filename = relative_stem.replace(os.sep, '_')
    save_filepath = os.path.join(destination_dir, f"{save_filename}.csv")
    combined_csv.to_csv(save_filepath, encoding='cp932')


# Run main() for every archive on a thread pool and report the elapsed time.
# The futures are kept so that an exception raised inside main() is reported
# instead of being silently dropped with the discarded future.
# NOTE(review): if profiling shows CSV parsing is CPU-bound, a
# ProcessPoolExecutor under an `if __name__ == "__main__":` guard may scale
# better than threads - measure before switching.
with concurrent.futures.ThreadPoolExecutor() as exe:
    submitted = {}
    for file in zip_files:
        submitted[exe.submit(main, file)] = file
        print(file)
    for finished in concurrent.futures.as_completed(submitted):
        error = finished.exception()
        if error is not None:
            # Best effort: report the failing archive and keep going.
            print(f"{submitted[finished]} failed: {error!r}")
end = time.perf_counter()

print(end - start)
SuperStormer
  • 4,997
  • 5
  • 25
  • 35
rin
  • 1
  • 3
  • Timing alone is not sufficient. There are many variables: size of data in rows and cols, number of files, data types, etc. Try list comprehension over `map`, appending even zipping csvs by command line, read all as string types, etc. – Parfait Nov 25 '21 at 22:15

1 Answer

0

I would consider skipping pandas and just doing this:

# Concatenate the listed files into one output file without parsing them.
# Each input is streamed through the file iterator, so no file is loaded
# into memory as a whole.
# NOTE: every line is copied verbatim, so for CSV inputs the header row of
# each file ends up in the output as well.
filenames = ['file1.txt', 'file2.txt', ...]
with open('path/to/output/file', 'w') as merged:
    for source_name in filenames:
        with open(source_name) as part:
            merged.writelines(part)

Code comes from this answer: https://stackoverflow.com/a/13613375/6699433

klutt
  • 30,332
  • 17
  • 55
  • 95