Further to a post I made a couple of weeks ago, I'm reading rows from a spreadsheet (nearly 215,000) and attempting to match them with text files contained in in a sub-directory. On average the number of text files files contained in the sub-directory is 14000. Although my code is working, it is taking an inordinate amount of time to copy the matched files to a second sub-directory. At this rate it's going to be end of August before the job is complete (average processing time is six hours)
Is there a way to improve the efficiency of this algorithm, or indeed is there a better way? My code is below
regards
import glob
import os,sys
import csv
import shutil
import pandas as pd
import fnmatch
import string
import xlrd
from os import listdir
from os.path import isfile
MDA_Path = 'D:/1994_QTR3' # contains Loughram and MacDonald 10-K files for QTR3
MDA_Path_2 = 'D:/1994_QTR4' # Contains L&M 10-K files for QTR4
MDA_Path_3 = 'D:/1995_QTR1'
MDA_Path_4 = 'D:/1995_QTR2'
MDA_Path_5 = 'D:/1995_QTR3'
MDA_Path_6 = 'D:/1995_QTR4'
MDA_Path_7 = 'D:/1996_QTR1'
MDA_Path_8 = 'D:/1996_QTR2'
MDA_Path_9 = 'D:/1996_QTR3'
MDA_Path_10 = 'D:/1996_QTR4'
MDA_Path_11 = 'D:/1997_QTR1'
MDA_Path_12 = 'D:/1997_QTR2'
MDA_Path_13 = 'D:/1997_QTR3'
MDA_Path_14 = 'D:/1997_QTR4'
MDA_Path_15 = 'D:/1998/QTR1'
MDA_Path_16 = 'D:/1998/QTR2'
MDA_Path_17 = 'D:/1998/QTR3'
MDA_Path_18 = 'D:/1998/QTR4'
MDA_Path_19 = 'D:/1999/QTR1'
MDA_Path_20 = 'D:/1999/QTR2'
MDA_Path_21 = 'D:/1999/QTR3'
MDA_Path_22 = 'D:/1999/QTR4'
MDA_Path_23 = 'D:/2000/QTR1'
MDA_Path_24 = 'D:/2000/QTR2'
MDA_Path_25 = 'D:/2000/QTR3'
MDA_Path_26 = 'D:/2000/QTR4'
MDA_Path_27 = 'D:/2001/QTR1'
MDA_Path_28 = 'D:/2001/QTR2'
MDA_Path_29 = 'D:/2001/QTR3'
MDA_Path_30 = 'D:/2001/QTR4'
MDA_Path_31 = 'D:/2002/QTR1'
MDA_Path_32 = 'D:/2002/QTR2'
MDA_Path_33 = 'D:/2002/QTR3'
MDA_Path_34 = 'D:/2002/QTR4'
MDA_Target_List = r'D:/PhD_Data/Wenrui_Filing_list' # stores wenruis data
MDA_For_Parsing_1994_QTR3 = 'D:/Required_MDA_1994_QTR3' # will hold all 10-Ks from wenrui's spreadsheet once detected
MDA_For_Parsing_1994_QTR4 = 'D:/Required_MDA_1994_QTR4'
MDA_For_Parsing_1995_QTR1 = 'D:/Required_MDA_1995_QTR1'
MDA_For_Parsing_1995_QTR2 = 'D:/Required_MDA_1995_QTR2'
MDA_For_Parsing_1995_QTR3 = 'D:/Required_MDA_1995_QTR3'
MDA_For_Parsing_1995_QTR4 = 'D:/Required_MDA_1995_QTR4'
MDA_For_Parsing_1996_QTR1 = 'D:/Required_MDA_1996_QTR1'
MDA_For_Parsing_1996_QTR2 = 'D:/Required_MDA_1996_QTR2'
MDA_For_Parsing_1996_QTR3 = 'D:/Required_MDA_1996_QTR3'
MDA_For_Parsing_1996_QTR4 = 'D:/Required_MDA_1996_QTR4'
MDA_For_Parsing_1997_QTR1 = 'D:/Required_MDA_1997_QTR1'
MDA_For_Parsing_1997_QTR2 = 'D:/Required_MDA_1997_QTR2'
MDA_For_Parsing_1997_QTR3 = 'D:/Required_MDA_1997_QTR3'
MDA_For_Parsing_1997_QTR4 = 'D:/Required_MDA_1997_QTR4'
MDA_For_Parsing_1998_QTR1 = 'D:/Required_MDA_1998_QTR1'
MDA_For_Parsing_1998_QTR2 = 'D:/Required_MDA_1998_QTR2'
MDA_For_Parsing_1998_QTR3 = 'D:/Required_MDA_1998_QTR3'
MDA_For_Parsing_1998_QTR4 = 'D:/Required_MDA_1998_QTR4'
MDA_For_Parsing_1999_QTR1 = 'D:/Required_MDA_1999_QTR1'
MDA_For_Parsing_1999_QTR2 = 'D:/Required_MDA_1999_QTR2'
MDA_For_Parsing_1999_QTR3 = 'D:/Required_MDA_1999_QTR3'
MDA_For_Parsing_1999_QTR4 = 'D:/Required_MDA_1999_QTR4'
MDA_For_Parsing_2000_QTR1 = 'D:/Required_MDA_2000_QTR1'
MDA_For_Parsing_2000_QTR2 = 'D:/Required_MDA_2000_QTR2'
MDA_For_Parsing_2000_QTR3 = 'D:/Required_MDA_2000_QTR3'
MDA_For_Parsing_2000_QTR4 = 'D:/Required_MDA_2000_QTR4'
MDA_For_Parsing_2001_QTR1 = 'D:/Required_MDA_2001_QTR1'
MDA_For_Parsing_2001_QTR2 = 'D:/Required_MDA_2001_QTR2'
MDA_For_Parsing_2001_QTR3 = 'D:/Required_MDA_2001_QTR3'
MDA_For_Parsing_2001_QTR4 = 'D:/Required_MDA_2001_QTR4'
MDA_FOR_Parsing_2002_QTR1 = 'D:/Required_MDA_2002_QTR1'
MDA_FOR_Parsing_2002_QTR2 = 'D:/Required_MDA_2002_QTR2'
MDA_FOR_Parsing_2002_QTR3 = 'D:/Required_MDA_2002_QTR3'
MDA_FOR_Parsing_2002_QTR4 = 'D:/Required_MDA_2002_QTR4'
# open the csv file and extract the column containing the location of the text file(s)
datas = pd.read_excel(r'D:/PhD_Data/Wenrui_Filing_list/1994-2017filingslist_Wenrui_13Jul2020.xlsx')
df = pd.DataFrame(datas, columns = ['FILE_NAME']) # extract the data contained in FILE_NAME column
df['FILE_NAME'] = df['FILE_NAME'].str[26:] # remove the first 26 characters which contain the edgar drive info
df['FILE_NAME'] = df['FILE_NAME'].str.strip() # remove all leading and trailing
file_length = len(df) # count number of files in Wenrui's list (will need this later to loop through all occurrences)
dirs = os.listdir(MDA_Path_32)
# dirs1 = os.listdir(MDA_Path_3)
for x in range(file_length):
for file in dirs:
# if file == df['FILE_NAME'][x]:
if df['FILE_NAME'][x] in file:
print(file)
shutil.copy(MDA_Path_32 + '/' + file, MDA_FOR_Parsing_2002_QTR2) # Move it to QTR directory```