
There are several folders (called DT_20180102, DT_20180103, ...) in the ComputedTEsCsv folder. Each DT_... folder contains 498 CSV files. I want to read all of these into a dictionary and then save that dictionary to a pickle file.

I wrote the code below, but it raises an error:

'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte

How can I correct this?

import os
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

# Directory containing joined datasets of all companies.
_dir = "/Users/admin/Desktop/TransferEntropyEarningsAnnouncements/SP500Data/ComputedTEsCsv/"

# Collect the per-date directory names, e.g. ['DT_20180201', 'DT_20180202', ...]
dates = [i for i in os.listdir(_dir) if 'DT' in i]


# Create/Populate dictionary to contain all network data
network_dfs = {}

for _date in dates:
    network_dfs[_date] = {}

load_pickle = False  # Process to read in data is costly. Set to True to read in from pickle file
p_path = "SP500Data/NetworkJoinAll.pickle"   # Save all files here ...

#if load_pickle is not True:
    
for date in tqdm(dates, total=len(dates), desc='JoiningAllNetworkDates'):
    try:
        base_path = "{0}{1}/".format(_dir, date)
        company_files = os.listdir(base_path)

        if '.ipynb_checkpoints' in company_files:
            company_files.remove('.ipynb_checkpoints')
        if '.rda' in company_files:
            company_files.remove('.rda')

        for i, company_file in enumerate(company_files):

            # Only read in 1st 34 columns with 2hr 10 min periods
            tmp_df = pd.read_csv(base_path+company_file)
            if i == 0:
                network_dfs[date] = tmp_df
            else:
                network_dfs[date] = pd.concat([network_dfs[date], tmp_df], ignore_index=True)

        # Clean the data: set any negative TE values to NaN.
        # Use .loc so the assignment modifies the DataFrame itself
        # (chained indexing would only write to a temporary copy).
        for col in network_dfs[date].columns[3:]:
            network_dfs[date].loc[network_dfs[date][col] < 0, col] = np.nan
    except FileNotFoundError:
        pass


print('Writing network data to {0}'.format(p_path))
with open(p_path, 'wb') as f:
    pickle.dump(network_dfs, f, pickle.HIGHEST_PROTOCOL)
print('Done.')
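
Since the whole point of the pickle is to avoid re-aggregating the separate CSVs on every run (which is presumably what the load_pickle flag above is meant to gate), a later session would restore the dictionary with the counterpart pickle.load call. A minimal sketch, reusing p_path from the code above:

import pickle

p_path = "SP500Data/NetworkJoinAll.pickle"

# Restore the previously aggregated per-date DataFrames without re-reading the CSVs.
with open(p_path, 'rb') as f:
    network_dfs = pickle.load(f)
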
  • Can you provide the whole output of the error and why do you store it as a pickle? – Dimitar May 12 '22 at 18:15
  • @Dimitar UnicodeDecodeError Traceback (most recent call last): /var/folders/d9/dplb0tqs0kjdq5lhn_8f65nc0000gn/T/ipykernel_14272/3189939502.py, line 33: tmp_df = pd.read_csv(base_path+company_file) raises UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte. Sorry for the compressed formatting; it seems I cannot use line breaks here. – Xinyao Qian May 12 '22 at 18:28
  • Line 33, **tmp_df = pd.read_csv(base_path+company_file)**, is the line that raises the error. – Xinyao Qian May 12 '22 at 18:30
  • possible duplicate of [https://stackoverflow.com/questions/42339876/error-unicodedecodeerror-utf-8-codec-cant-decode-byte-0xff-in-position-0-in](https://stackoverflow.com/questions/42339876/error-unicodedecodeerror-utf-8-codec-cant-decode-byte-0xff-in-position-0-in) – Rdimo May 12 '22 at 18:31
  • @Dimitar I want to store it as a pickle because aggregating the separate CSV files with TE estimates for Q1 2018 is time-consuming. Instead of doing that every time it is needed, it is done once and stored in a dictionary. The dictionary is then saved to a pickle file, which can be loaded much faster later. – Xinyao Qian May 12 '22 at 18:33
  • The csv file evidently isn't encoded as UTF-8. You need to pass the correct encoding to `read_csv`. – snakecharmerb May 12 '22 at 18:33
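
Following up on the last comment: the failing file is clearly not valid UTF-8, and byte 0x8b at position 1 is also what the second byte of the gzip magic number (0x1f 0x8b) looks like, so another possibility is that some of the files are gzip-compressed CSVs. Below is a sketch of how the read could be made robust to both cases; the helper name read_company_csv and the latin-1 fallback are illustrative assumptions, not something from the original code:

import pandas as pd

def read_company_csv(path):
    # Illustrative helper: handle files that are gzip-compressed or not UTF-8 encoded.
    with open(path, 'rb') as f:
        magic = f.read(2)
    if magic == b'\x1f\x8b':
        # The file starts with the gzip magic number; let pandas decompress it.
        return pd.read_csv(path, compression='gzip')
    try:
        # Plain text: try the default UTF-8 first ...
        return pd.read_csv(path)
    except UnicodeDecodeError:
        # ... and fall back to a permissive single-byte encoding if that fails.
        return pd.read_csv(path, encoding='latin-1')

# In the loop above, this would replace the failing call:
# tmp_df = read_company_csv(base_path + company_file)

If the files turn out to be gzip-compressed, the compression branch is the appropriate fix; if they are plain text in some other encoding, inspecting a few raw bytes of one file will show which encoding to pass to read_csv.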

0 Answers