My code takes about 17 minutes to merge multiple CSV files. How do I reduce the processing time? Is there any solution?
import glob
import os
import pandas as pd

all_csv_files = glob.glob(os.path.join(dir_name, '*.csv'))
combined_csv = pd.concat(map(pd.read_csv, all_csv_files), ignore_index=True)
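One cheaper approach I am considering, assuming every CSV shares an identical header row and ends with a newline, is to concatenate the raw bytes and skip parsing entirely (merged.csv is a placeholder output name; dir_name is the same folder as above):

import glob
import os
import shutil

# Sketch: merge at the byte level instead of building DataFrames.
# Assumes all files have the same header row.
all_csv_files = glob.glob(os.path.join(dir_name, '*.csv'))

with open('merged.csv', 'wb') as out:
    for i, path in enumerate(all_csv_files):
        with open(path, 'rb') as src:
            if i > 0:
                src.readline()  # drop the duplicate header line
            shutil.copyfileobj(src, out)

This avoids pandas parsing altogether, which is usually the dominant cost when only a straight concatenation is needed.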
Here is the full code. The unzipping time is also long, around 3 minutes. What changes should I make?
import os
import concurrent.futures
import glob
import time
import zipfile
import pandas as pd
start = time.perf_counter()
source = "C:....."
destination = "C:....."
# get the zipped files
zip_files = []
for root, dirs, files in os.walk(source):
    for name in files:
        if name.endswith('.zip'):
            zip_files.append(os.path.join(root, name))
def main(item):
    # extract each archive into a folder named after it
    dir_name = os.path.splitext(item)[0]
    with zipfile.ZipFile(item) as zf:
        zf.extractall(dir_name)
    # merge every CSV inside the extracted folder
    all_csv_files = glob.glob(os.path.join(dir_name, '*.csv'))
    combined_csv = pd.concat(
        map(pd.read_csv, all_csv_files), ignore_index=True
    )
    # turn the path relative to source into a flat file name,
    # e.g. \sub\data.zip -> sub_data.csv
    remove_source_path = item.split(source)[1]
    remove_zip_extension = remove_source_path.split('.zip')[0]
    save_filename = '_'.join(remove_zip_extension.split('\\'))[1:]
    save_filepath = os.path.join(destination, f"{save_filename}.csv")
    combined_csv.to_csv(save_filepath, encoding='cp932')
with concurrent.futures.ThreadPoolExecutor() as exe:
    for file in zip_files:
        exe.submit(main, file)
        print(file)
end = time.perf_counter()
print(end - start)
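Since pd.read_csv is CPU-bound, I suspect the threads above mostly serialize on the GIL. Here is a minimal sketch of the same pipeline on a process pool, under that assumption (process_zip is a hypothetical rename of main, the output name is derived with os.path.relpath instead of the string splitting above, and the __main__ guard is required for multiprocessing on Windows):

import concurrent.futures
import glob
import os
import time
import zipfile
import pandas as pd

source = "C:....."
destination = "C:....."

def process_zip(item):
    # same work as main() above, one zip per worker process
    dir_name = os.path.splitext(item)[0]
    with zipfile.ZipFile(item) as zf:
        zf.extractall(dir_name)
    csv_files = glob.glob(os.path.join(dir_name, '*.csv'))
    combined = pd.concat(map(pd.read_csv, csv_files), ignore_index=True)
    # flatten the path relative to source into the output file name
    rel = os.path.splitext(os.path.relpath(item, source))[0]
    save_name = '_'.join(rel.split(os.sep))
    combined.to_csv(os.path.join(destination, f"{save_name}.csv"),
                    encoding='cp932')
    return item

if __name__ == '__main__':
    start = time.perf_counter()
    zip_files = []
    for root, dirs, files in os.walk(source):
        for name in files:
            if name.endswith('.zip'):
                zip_files.append(os.path.join(root, name))
    with concurrent.futures.ProcessPoolExecutor() as exe:
        # results come back in submission order; worker exceptions
        # are re-raised here instead of being silently swallowed
        for finished in exe.map(process_zip, zip_files):
            print(finished)
    print(time.perf_counter() - start)

Would switching to processes like this be the right direction, or is the bottleneck elsewhere?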