import pandas as pd
import glob
dataset = pd.read_csv('masterfeedproduction-EURNA_2016-06-27.csv',sep =
',',delimiter = None) # select 1 file in the directory
datasets_cols = ['transactionID','gvkey','companyName']
df= dataset.transactionID
df.shape
df.loc[df.duplicated()]
returns the duplicates in the selected file. displays row number and transactionID. so this is correct.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
df_result = df.loc[df.duplicated()]
for file in file_list:
return(df_result)
here I am stuck.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
for file in file_list:
dataset = pd.read_csv(file)
df = dataset.transactionID
duplicated = df.loc[df.duplicated()]
if duplicated.empty == False:
print(file)
print(duplicated)