I created two alternative approaches to this problem, and timed the results. Also included the other answer as well as the original function.
from copy import deepcopy
import pandas as pd
from collections import defaultdict
import numpy as np
x = {'a': {'x': [0, 1, 2], 'y': [1 ,2 ,3]},
'b': {'x': [4, 6, 8], 'y': [9, 8, 7]}}
test = deepcopy(x)
for i in range(1000):
test.update({f'a_{i}':test['a']})
test2 = {k:{key: val*300 for key, val in v.items()} for k, v in x.items()}
print(len(test2['a']['x']))
for i in range(1000):
test2.update({f'a_{i}':test2['a']})
def dict_of_dicts_of_collections_to_multiindex_df(dict_of_dicts_of_collections):
x = dict_of_dicts_of_collections
result = {}
for outer_key, intermediate_dict in x.items():
result[outer_key] = {}
for intermediate_key, collection in intermediate_dict.items():
try:
for i, e in enumerate(collection):
result[outer_key][(intermediate_key, i)] = e
except TypeError:
pass
return pd.DataFrame(result).T
def out_from_other_answer(data):
df = pd.DataFrame(data).T
cols = df.columns
df = pd.concat([df[col].apply(pd.Series) for col in cols], axis=1)
#tweaked to avoid hardcoding [0, 1, 2]
df.columns = pd.MultiIndex.from_product([cols, range(len(df.columns)//len(cols))])
return df
def out2(dict_of_dicts):
df = pd.DataFrame(list(dict_of_dicts.values()))
out_df = pd.concat([pd.DataFrame(df[col].values.tolist())
for col in df.columns
],
axis=1,
keys=df.columns,
)
out_df.index = dict_of_dicts.keys()
return out_df
def out3(data):
temp = defaultdict(list)
for d in list(data.values()):
for k, v in d.items():
temp[k].append(v)
out = pd.concat([pd.DataFrame(v) for v in temp.values()], axis=1, keys=temp.keys())
out.index = data.keys()
return out
Let us see the results.
With small data x.
%timeit dict_of_dicts_of_collections_to_multiindex_df(x)
%timeit out_from_other_answer(x)
%timeit out2(x)
%timeit out3(x)
1.63 ms ± 102 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
4.49 ms ± 492 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
2.44 ms ± 102 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
1.49 ms ± 98.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
With more rows but same columns test.
%timeit dict_of_dicts_of_collections_to_multiindex_df(test)
%timeit out_from_other_answer(test)
%timeit out2(test)
%timeit out3(test)
70.3 ms ± 10.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
494 ms ± 40.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
4.81 ms ± 185 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
3.37 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
With more rows and more columns, test2
%timeit dict_of_dicts_of_collections_to_multiindex_df(test2)
%timeit out_from_other_answer(test2)
%timeit out2(test2)
%timeit out3(test2)
1.24 s ± 19.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.1 s ± 63.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
590 ms ± 39.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
598 ms ± 44 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
It seems each solution is affected differently in terms of what dimension of data increases. Overall it seems that out3 is the best bet. Essentially, it's best to change the layout of input data before working on it.