A possible solution is to store the data in another format. Consider the following code: the very same data is stored with pickle, and the only difference is the format — a dictionary of lists versus a pandas DataFrame. As you can see (and can try yourself), the pandas DataFrame loads about 50 times faster. I suspect the reason is that unpickling the dictionary of lists has to reconstruct millions of individual Python objects, while the DataFrame deserializes its underlying arrays in bulk. So consider switching to a more efficient data format.
import functools
import time
import pickle
import numpy as np
import pandas as pd
def measure_running_time(echo=True):
    """Decorator factory: report the wall-clock running time of each call.

    Args:
        echo: When True (default), print the elapsed time after every call
            of the wrapped function.

    Returns:
        A decorator that forwards all arguments and the return value
        unchanged while measuring how long the call took.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapped(*args, **kwargs):
            # perf_counter is monotonic and higher-resolution than
            # time.time(), so the interval cannot go negative if the
            # system clock is adjusted mid-call.
            t_1 = time.perf_counter()
            ans = func(*args, **kwargs)
            t_2 = time.perf_counter()
            if echo:
                print(f'{func.__name__}() running time is {t_2 - t_1:.2f} s')
            return ans
        return wrapped
    return decorator
def f_0():
    """Build a ~0.1 GB random array and pickle it in two formats.

    Writes '1.pickle' (dictionary of lists) and '2.pickle' (pandas
    DataFrame) to the current directory for the loading benchmarks.
    """
    values = np.random.random((14_000, 1_000))  # ~0.1 GB of float64
    # format 1 - dictionary of lists
    as_dict = {row_idx: list(row) for row_idx, row in enumerate(values)}
    # format 2 - pandas dataframe
    as_frame = pd.DataFrame(values)
    # save data
    with open('1.pickle', 'wb') as sink:
        pickle.dump(as_dict, sink)
    as_frame.to_pickle('2.pickle')
@measure_running_time()
def f_1():
    """Load the dictionary-of-lists pickle (the slow format)."""
    with open('1.pickle', 'rb') as source:
        pickle.load(source)
@measure_running_time()
def f_2():
    """Load the DataFrame pickle (the fast format)."""
    pd.read_pickle('2.pickle')
if __name__ == '__main__':
    # f_0()  # run once to generate '1.pickle' and '2.pickle' on disk
    f_1()  # time loading the dictionary-of-lists pickle
    f_2()  # time loading the pandas DataFrame pickle
Output:
f_1() running time is 5.52 s
f_2() running time is 0.11 s