I have written code that iterates over each CSV file in a folder, reads it into a DataFrame, and appends it to a master DataFrame which will be used later.
import glob
import os
import pandas as pd
import time
import multiprocessing as mp
from multiprocessing.dummy import Pool
# NOTE(review): this module-level frame cannot be filled from multiprocessing
# workers — each child process gets its own copy of the global, so appends in
# a worker never reach the parent. Workers should return their data instead.
constituent_df= pd.DataFrame()
def process(file):
'''
This Function reads csv and appends it to a global data-frame
Parameters:
file-> csv file
'''
fields= ('REGION', 'CURR')
print("Pandas Reading:", file)
csv_df= pd.read_csv(file, skipinitialspace=True, usecols=fields)
constituent_df= constituent_df.append(csv_df, ignore_index=True)
def main():
    '''
    Glob the CSV files in the 2653AM directory, read them in a worker
    pool, concatenate the results into one DataFrame, and save it to
    an Excel workbook.

    Fixes the original bug: the apply_async results were discarded, so
    the local `constituent_df` (which also shadowed the module-level
    one) was always empty when written out. The AsyncResult objects are
    now kept and their `.get()` values concatenated — `.get()` also
    re-raises any exception that occurred in a worker instead of
    silently swallowing it.
    '''
    start = time.time()
    pattern = os.path.join(os.getcwd(), '2653AM\\*.csv')
    # Context manager guarantees the pool is terminated even on error.
    with mp.Pool(processes=4) as pool:
        async_results = [pool.apply_async(process, [file])
                         for file in glob.glob(pattern)]
        frames = [res.get() for res in async_results]
    # pd.concat raises on an empty list; fall back to an empty frame.
    constituent_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    end = time.time()
    print("It took:", end-start)
    print(constituent_df)
    # Requires an Excel engine (e.g. openpyxl) to be installed.
    constituent_df.to_excel(excel_writer="Constituent_Data_MP.xlsx", index=False)
# Entry guard is mandatory here: under multiprocessing's spawn start method
# (the default on Windows), worker processes re-import this module and must
# not re-run main().
if __name__=='__main__':
    main()
#print(constituent_df)
I am not able to save constituent_df — it comes out empty when written. Can anyone guide me on how to collect the workers' data into constituent_df correctly? Is there a better way to do this?