Looping through a DataFrame row by row is slow, so both solutions below avoid an explicit Python loop over rows.
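For context, this is a minimal sketch of the row-by-row pattern being replaced (it assumes the same depth column of stringified dictionaries used below):

import ast
import pandas as pd

df = pd.read_csv('brr.csv', index_col=[0])   # same file as in the Usage section

parsed = []
for _, row in df.iterrows():                 # Python-level loop over every row: slow
    d = ast.literal_eval(row['depth'])       # parse the stringified dictionary
    parsed.append(d['bids'] + d['asks'])     # concatenate the bids and asks lists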
Solution 1: Working in memory
Code
import ast
import pandas as pd

def restructure_data(df):
    # The depth column holds a dictionary stored as a string:
    # - convert it back to a dictionary with ast.literal_eval
    # - concatenate the bids and asks lists
    # (the walrus operator below requires Python 3.8+)
    df['bids-asks'] = df["depth"].astype('str').apply(
        lambda x: p['bids'] + p['asks'] if (p := ast.literal_eval(x)) else x)
    # Names of the new columns
    columns = [f"{name}{j}"
               for j in range(1, (len(df['bids-asks'][0]) // 4) + 1)
               for name in ["bid_price", "bid_volume", "ask_price", "ask_volume"]]
    # Split the bids-asks column into separate columns
    split_df = pd.DataFrame(df['bids-asks'].tolist(), columns=columns)
    # Concatenate df and split_df
    df = pd.concat([df, split_df], axis=1)
    # Drop the bids-asks column
    df.drop('bids-asks', axis=1, inplace=True)
    return df
Usage
# Read the CSV file into a DataFrame
df = pd.read_csv('brr.csv', index_col=[0])
df = restructure_data(df)
# Display the first rows
print(df.head())
Output
Close depth bid_price1 bid_volume1 ask_price1 ask_volume1 bid_price2 bid_volume2 ask_price2 ask_volume2 ... ask_price498 ask_volume498 bid_price499 bid_volume499 ask_price499 ask_volume499 bid_price500 bid_volume500 ask_price500 ask_volume500
0 7.245 {'lastUpdateId': 1787277998027, 'E': 165985484... [7.2450, 652] [7.2440, 545] [7.2430, 2643] [7.2420, 1308] [7.2410, 2201] [7.2400, 4207] [7.2390, 4878] [7.2380, 5604] ... [8.2930, 12] [8.2940, 619] [8.2950, 16] [8.2960, 33] [8.2970, 10] [8.2980, 13] [8.2990, 2] [8.3000, 1884] [8.3010, 3] [8.3020, 1]
1 7.246 {'lastUpdateId': 1787278048764, 'E': 165985485... [7.2450, 842] [7.2440, 1121] [7.2430, 1825] [7.2420, 2728] [7.2410, 3047] [7.2400, 5431] [7.2390, 4790] [7.2380, 5526] ... [8.2930, 12] [8.2940, 619] [8.2950, 16] [8.2960, 33] [8.2970, 10] [8.2980, 13] [8.2990, 2] [8.3000, 1884] [8.3010, 3] [8.3020, 1]
2 7.247 {'lastUpdateId': 1787278510622, 'E': 165985487... [7.2460, 861] [7.2450, 1298] [7.2440, 1636] [7.2430, 3669] [7.2420, 4127] [7.2410, 3330] [7.2400, 3865] [7.2390, 4951] ... [8.2940, 619] [8.2950, 16] [8.2960, 33] [8.2970, 10] [8.2980, 13] [8.2990, 2] [8.3000, 1884] [8.3010, 3] [8.3020, 1] [8.3030, 9]
3 7.246 {'lastUpdateId': 1787278719415, 'E': 165985488... [7.2460, 809] [7.2450, 1872] [7.2440, 1230] [7.2430, 3774] [7.2420, 3498] [7.2410, 3744] [7.2400, 3906] [7.2390, 5100] ... [8.2940, 619] [8.2950, 16] [8.2960, 33] [8.2970, 10] [8.2980, 13] [8.2990, 2] [8.3000, 1884] [8.3010, 3] [8.3020, 1] [8.3030, 9]
4 7.247 {'lastUpdateId': 1787278800142, 'E': 165985488... [7.2470, 840] [7.2460, 2145] [7.2450, 1797] [7.2440, 2720] [7.2430, 6232] [7.2420, 3448] [7.2410, 2882] [7.2400, 3198] ... [8.2950, 16] [8.2960, 33] [8.2970, 10] [8.2980, 13] [8.2990, 2] [8.3000, 1884] [8.3010, 3] [8.3020, 1] [8.3030, 9] [8.3040, 7]
5 rows × 2002 columns
Solution 2: Working on Chunks of Data
The issue with Solution 1 is that it quickly exhausts RAM for large data files (e.g. 30K records). The alternative is to process the data in chunks (sketched below, full code after) by:
- Using the pandas read_csv chunksize parameter so the file is read and processed in chunks
- Using a tqdm progress bar to monitor progress
- Appending each processed chunk to an output CSV file
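Before the full listing, here is a minimal sketch of that read/append pattern (process_chunk and the output file name 'brr-result.csv' are placeholders for illustration; the full implementation follows under Code):

import pandas as pd
from tqdm import tqdm

def process_chunk(chunk):
    # Placeholder for the per-chunk transformation implemented below
    return chunk

for i, chunk in enumerate(tqdm(pd.read_csv('brr.csv', chunksize=10))):
    result = process_chunk(chunk)
    # Append each processed chunk; write the header only once
    result.to_csv('brr-result.csv', mode='a', index=False, header=(i == 0))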
Code
import os
import pandas as pd
import ast
import csv
from pathlib import Path
import time
from tqdm import tqdm
# Main function
def restructure_data(df, columns):
    '''
    Split the lists in the depth column into separate columns
    '''
    # Split the depth column into columns
    split_df = pd.DataFrame(df['depth'].tolist(),
                            columns=columns,   # use the specified column names
                            index=df.index)    # keep the index from df
    # Concatenate df and split_df
    df = pd.concat([df, split_df], axis=1)
    return df
# Function utilities
def append_result(filename, modifier="-result"):
    '''
    Creates an output file path with the modifier appended to the base name
    '''
    p = Path(filename)
    return f"{Path.joinpath(p.parent, p.stem)}{modifier}{p.suffix}"
def repeat_csv(filenm, n_repeats=3):
    '''
    Duplicates each data row of the CSV file n_repeats times
    '''
    new_file = append_result(filenm, "-repeats")
    with open(new_file, 'w', newline='') as fout, open(filenm, 'r') as fin:
        reader = csv.reader(fin)
        writer = csv.writer(fout)
        writer.writerow(next(reader))      # write the header
        index = 0                          # keep track of the row index
        for l in reader:
            for _ in range(n_repeats):     # repeat each row
                l[0] = index               # update the index value (column 0)
                index += 1
                writer.writerow(l)
    return new_file
def get_bids_asks(x):
    '''
    Parses the depth column's stringified dictionary and
    returns the concatenated bids and asks lists
    '''
    p = ast.literal_eval(x)
    return p['bids'] + p['asks']
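# Illustrative example (hypothetical values):
#   get_bids_asks("{'bids': [['7.245', '652']], 'asks': [['7.246', '861']]}")
#   returns [['7.245', '652'], ['7.246', '861']]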
#################################################
# 1. Generate data by repeating the rows of the input file n_repeats times.
#    The index (column 0) is updated as well.
repeated_file = repeat_csv('temp.csv', n_repeats=2)

#################################################
# 2. Main processing
# Results file name
results_file = append_result(repeated_file, "-result")

# Delete the results file if it already exists
if os.path.exists(results_file):
    os.remove(results_file)

# Process the input file in chunks
for i, df in enumerate(tqdm(pd.read_csv(repeated_file, index_col=[0],
                                        converters={'depth': get_bids_asks},
                                        chunksize=10))):  # process 10 input rows at a time
    if i == 0:
        # Names of the new columns (computed once, from the first chunk)
        columns = [f"{name}{j}"
                   for j in range(1, (len(df['depth'][0]) // 4) + 1)
                   for name in ["bid_price", "bid_volume", "ask_price", "ask_volume"]]
    df_result = restructure_data(df, columns)
    # Append to the results file
    df_result.to_csv(results_file, mode='a',
                     index=False,
                     header=(i == 0))  # only write the header on the first iteration
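Optionally, the output can be spot-checked by reading back the first few rows of the results file (results_file is the path produced by append_result above):

# Sanity check: read back the first rows of the generated results file
check_df = pd.read_csv(results_file, nrows=5)
print(check_df.head())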
Performance
Tested using n_repeats = 1000 on the file linked by the OP in the question.
- The repeated data contained 10,000 records
- The file was processed in ~11 minutes (wall-clock time) on an older Windows PC (10+ years old)
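For reference, the wall-clock time can be measured by wrapping the chunk-processing loop with the already-imported time module (a sketch, not part of the original listing):

start = time.perf_counter()        # start of wall-clock measurement
# ... run the chunked processing loop from step 2 above ...
print(f"Wall clock time: {(time.perf_counter() - start) / 60:.1f} minutes")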