There are several folders (named DT_20180102, DT_20180103, ...) inside the ComputedTEsCsv folder. Each DT_... folder contains 498 CSV files. I want to read all of them into a dictionary and save that dictionary as a pickle.
I wrote the code below, but it raises this error:
'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte
How can I correct this?
import os
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

# Directory containing the joined datasets of all companies.
_dir = "/Users/admin/Desktop/TransferEntropyEarningsAnnouncements/SP500Data/ComputedTEsCsv/"

# Collect the DT_... directory names.
dates = [i for i in os.listdir(_dir) if 'DT' in i]
# Create/Populate dictionary to contain all network data
network_dfs = {}
for _date in dates:
    network_dfs[_date] = {}
load_pickle = False # Process to read in data is costly. Set to True to read in from pickle file
p_path = "SP500Data/NetworkJoinAll.pickle" # Save all files here ...
#if load_pickle is not True:
for date in tqdm(dates, total=len(dates), desc='JoiningAllNetworkDates'):
    try:
        base_path = "{0}{1}/".format(_dir, date)
        company_files = os.listdir(base_path)
        if '.ipynb_checkpoints' in company_files:
            company_files.remove('.ipynb_checkpoints')
        if '.rda' in company_files:
            company_files.remove('.rda')
        for i, company_file in enumerate(company_files):
            # Only read in 1st 34 columns with 2hr 10 min periods
            tmp_df = pd.read_csv(base_path + company_file)
            if i == 0:
                network_dfs[date] = tmp_df
            else:
                network_dfs[date] = pd.concat([network_dfs[date], tmp_df], ignore_index=True)
        # Clean data: set any negative TE values to NaN.
        for col in network_dfs[date].columns[3:]:
            # Use .loc so the assignment modifies the DataFrame itself
            # (chained indexing only writes to a temporary copy).
            network_dfs[date].loc[network_dfs[date][col] < 0, col] = np.nan
    except FileNotFoundError:
        pass
print('Writing network data to {0}'.format(p_path))
with open(p_path, 'wb') as f:
pickle.dump(network_dfs, f, pickle.HIGHEST_PROTOCOL)
print('Done.')
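For completeness, the load_pickle flag above is meant to skip this costly rebuild once the pickle exists; the read-back branch is not shown in the failing code, but a minimal sketch of what I have in mind (assuming the same p_path) is:

if load_pickle:
    with open(p_path, 'rb') as f:
        network_dfs = pickle.load(f)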
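For what it's worth, byte 0x8b at position 1 makes me suspect that some of the files are not plain text but gzip-compressed (the gzip magic number is 1f 8b), which pandas will not detect automatically when the filename ends in .csv. A minimal diagnostic I could run on one DT_... folder (only a sketch that reuses _dir and dates from above, not part of my pipeline) would be:

base_path = _dir + dates[0] + "/"
for name in os.listdir(base_path):
    with open(base_path + name, 'rb') as f:
        magic = f.read(2)
    if magic == b'\x1f\x8b':  # gzip magic bytes
        print(name, 'appears to be gzip-compressed')
        # In that case pd.read_csv(base_path + name, compression='gzip') should decode it.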