This approach works with any nested structure: dicts, lists, tuples, pandas DataFrames, NumPy arrays, etc.
class Tuppsub(tuple):
    # Marker subclass of tuple: results already wrapped in Tuppsub are
    # treated as atomic values and are never flattened again.
    pass


class ProtectedTuple(tuple):
    # Wrap a tuple in this class to protect it from being flattened.
    pass


class ProtectedList(list):
    # Wrap a list in this class to protect it from being flattened.
    pass


class ProtectedDict(dict):
    # Wrap a dict in this class to protect it from being flattened.
    pass


class ProtectedSet(set):
    # Wrap a set in this class to protect it from being flattened.
    pass


def aa_flatten_dict_tu(
    v,
    listitem,
    forbidden=(list, tuple, set, frozenset),
    allowed=(
        str,
        int,
        float,
        complex,
        bool,
        bytes,
        type(None),
        ProtectedTuple,
        ProtectedList,
        ProtectedDict,
        ProtectedSet,
        Tuppsub,
    ),
):
    """Recursively flatten *v*, yielding ``Tuppsub((value, key_path))`` pairs.

    Parameters
    ----------
    v : object
        The (possibly nested) object to flatten.
    listitem : tuple
        Key path accumulated so far (dict keys and sequence indices).
    forbidden : tuple of types
        Keyless container types that are always flattened element by element.
    allowed : tuple of types
        Leaf types yielded as-is; the Protected* wrappers and Tuppsub are
        listed here so they escape flattening.
    """
    if isinstance(v, dict) or (hasattr(v, "items") and hasattr(v, "keys")):
        # dict or dict-like (has keys()/items()): recurse into every value,
        # extending the key path with the dict key. Anything dict-like that
        # slips past this check would only return its keys.
        for k, v2 in v.items():
            newtu = listitem + (k,)
            yield from aa_flatten_dict_tu(
                v2, listitem=newtu, forbidden=forbidden, allowed=allowed
            )
    elif isinstance(v, forbidden):
        # Keyless iterable (list/tuple/set/frozenset): enumerate so the
        # original value stays addressable later: di['blabla'][0] instead
        # of di['blabla'].
        for indi, v2 in enumerate(v):
            if isinstance(v2, allowed):
                # Fix: include the element index in the key path (the
                # original dropped it, so list leaves were not addressable).
                yield Tuppsub((v2, listitem + (indi,)))
            else:
                # Not an allowed leaf: check recursively for more iterables.
                yield from aa_flatten_dict_tu(
                    v2,
                    listitem=(listitem + (indi,)),
                    forbidden=forbidden,
                    allowed=allowed,
                )
    elif isinstance(v, allowed):
        # Allowed leaf value: yield it together with its key path.
        yield Tuppsub((v, listitem))
    else:
        # Brute force for every other iterable (generators, custom
        # sequences, ...). We have to catch all iterables!
        try:
            for indi2, v2 in enumerate(v):
                try:
                    if isinstance(v2, allowed):
                        yield Tuppsub((v2, listitem + (indi2,)))
                    else:
                        # Fix: delegate with ``yield from`` - the original
                        # used a bare ``yield`` here and emitted raw
                        # generator objects instead of flattened pairs.
                        yield from aa_flatten_dict_tu(
                            v2,
                            listitem=(listitem + (indi2,)),
                            forbidden=forbidden,
                            allowed=allowed,
                        )
                except Exception:
                    # Probably not iterable after all: yield the raw value.
                    yield Tuppsub((v2, listitem + (indi2,)))
        except Exception:
            # Not iterable at all: yield the object itself as a leaf.
            yield Tuppsub((v, listitem))
def fla_tu(
    item,
    walkthrough=(),  # key path accumulated so far
    forbidden=(list, tuple, set, frozenset),  # must be flattened, never yielded whole
    allowed=(  # data types we don't want to touch
        str,
        int,
        float,
        complex,
        bool,
        bytes,
        type(None),
        ProtectedTuple,
        ProtectedList,
        ProtectedDict,
        ProtectedSet,
        # The secret: Tuppsub inherits from tuple but is excluded from being
        # flattened - ProtectedTuple does the same thing.
        Tuppsub,
    ),
    dict_variation=(
        # Compared against str(type(item)) so that collections does not have
        # to be imported.
        "collections.defaultdict",
        "collections.UserDict",
        "collections.OrderedDict",
    ),
):
    """Flatten any nested iterable, yielding ``(value, key_path)`` pairs.

    Dicts and dict look-alikes are delegated to :func:`aa_flatten_dict_tu`;
    pandas DataFrames are converted to dicts first; everything else that
    looks iterable is enumerated by brute force.
    """
    if isinstance(item, allowed):
        # Allowed leaf: yield it with the accumulated key path.
        yield item, walkthrough
    elif isinstance(item, forbidden):
        for ini, xaa in enumerate(item):
            try:
                # If we have an iterable, check recursively for others.
                yield from fla_tu(
                    xaa,
                    walkthrough=(walkthrough + (ini,)),
                    forbidden=forbidden,
                    allowed=allowed,
                    dict_variation=dict_variation,
                )
            except Exception:
                # Probably not iterable: yield (value, (key1, key2, ...)).
                yield xaa, Tuppsub(
                    (walkthrough + Tuppsub((ini,)))
                )
    elif isinstance(item, dict):
        # Dicts need special treatment; without it only the keys come back.
        yield from aa_flatten_dict_tu(
            item, listitem=walkthrough, forbidden=forbidden, allowed=allowed
        )
    elif (str(type(item)) in dict_variation) or (
        hasattr(item, "items") and hasattr(item, "keys")
    ):
        # Dict look-alikes (defaultdict, OrderedDict, anything with
        # keys()/items()) are converted and delegated as well.
        yield from aa_flatten_dict_tu(
            dict(item), listitem=walkthrough, forbidden=forbidden, allowed=allowed
        )
    elif "DataFrame" in str(type(item)):
        # pandas must be converted to a dict first, otherwise only the
        # columns come back; copy to avoid touching the original.
        yield from aa_flatten_dict_tu(
            item.copy().to_dict(),
            listitem=walkthrough,
            forbidden=forbidden,
            allowed=allowed,
        )
    else:
        # Many iterables are hard to identify with isinstance()/type(), so we
        # brute-force. If one iterable escapes, we are screwed!
        try:
            for ini2, xaa in enumerate(item):
                try:
                    if isinstance(xaa, allowed):
                        # Always the same format: (value, (key1, key2, ...)).
                        yield xaa, Tuppsub(
                            (walkthrough + (ini2,))
                        )
                    else:
                        # Fix: the original built Tuppsub(ini2,), i.e.
                        # tuple(int), which raises TypeError and silently
                        # cancelled the recursion via the except below.
                        yield from fla_tu(
                            xaa,
                            walkthrough=Tuppsub(
                                (walkthrough + (ini2,))
                            ),
                            forbidden=forbidden,
                            allowed=allowed,
                            dict_variation=dict_variation,
                        )
                except Exception:
                    # In case of an exception, yield (value, keys) as-is.
                    yield xaa, Tuppsub(
                        (walkthrough + (ini2,))
                    )
        except Exception:
            # Fix: the original rebuilt Tuppsub(item,) here, which raises
            # again for non-iterable items and crashed the generator; a
            # non-iterable object is simply a leaf at the current key path.
            yield item, Tuppsub(walkthrough)
# Sample payload: one grouped survey result whose 'value' holds a doubly
# nested list of per-question answer dicts - the structure the flattener
# is demonstrated on below.
data = {'results': [{'key': 'survey_x',
                     'value': [[{'key': 'q1',
                                 'value': 2,
                                 'endTime': '2021-01-21',
                                 'skipped': False,
                                 'startTime': '2021-01-21',
                                 'resultType': 'multipleChoice'},
                                {'key': 'q2',
                                 'value': 0,
                                 'endTime': '2021-01-21',
                                 'skipped': False,
                                 'startTime': '2021-01-21',
                                 'resultType': 'multipleChoice'},
                                {'key': 'q3',
                                 'value': 2,
                                 'endTime': '2021-01-21',
                                 'skipped': False,
                                 'startTime': '2021-01-21',
                                 'resultType': 'multipleChoice'},
                                {'key': 'q4',
                                 'value': 0,
                                 'endTime': '2021-01-21',
                                 'skipped': False,
                                 'startTime': '2021-01-21',
                                 'resultType': 'multipleChoice'}]],
                     'skipped': False,
                     'end_time': '2021-01-21',
                     'start_time': '2021-01-21',
                     'result_type': 'grouped'}]}
# First pass flattens `data`; the second pass iterates the resulting
# generator and normalizes every yielded pair.
# NOTE(review): the double fla_tu call appears to rely on the pairs being
# Tuppsub instances (which are in `allowed`) - verify before simplifying.
nested_whatever=fla_tu(data)
flattenddict = list((fla_tu(nested_whatever)))
# Some entries come back as raw generator objects; unwrap their first item.
resultfinal=[list(x)[0] if "generator" in str(type(x)) else x for x in flattenddict]
# Each entry is ((value, key_path), outer_keys): split values and key paths.
allvalues = [x[0][0] for x in resultfinal]
allkeys = [x[0][1] for x in resultfinal]
Here are the results:
#result (allvalues)
['survey_x',
'q1',
2,
'2021-01-21',
False,
'2021-01-21',
'multipleChoice',
'q2',
0,
'2021-01-21',
False,
'2021-01-21',
'multipleChoice',
'q3',
2,
'2021-01-21',
False,
'2021-01-21',
'multipleChoice',
'q4',
0,
'2021-01-21',
False,
'2021-01-21',
'multipleChoice',
False,
'2021-01-21',
'2021-01-21',
'grouped' ....]
#result(allkeys)
[('results', 0, 'key'),
('results', 0, 'value', 0, 0, 'key'),
('results', 0, 'value', 0, 0, 'value'),
('results', 0, 'value', 0, 0, 'endTime'),
('results', 0, 'value', 0, 0, 'skipped'),
('results', 0, 'result_type')....]
If you want to have a DataFrame, you can use this:
import math
import re

import numpy as np
import pandas as pd
class Tuppsub(tuple):
    # Marker subclass of tuple: results already wrapped in Tuppsub are
    # treated as atomic values and are never flattened again.
    pass


class ProtectedTuple(tuple):
    # Wrap a tuple in this class to protect it from being flattened.
    pass


class ProtectedList(list):
    # Wrap a list in this class to protect it from being flattened.
    pass


class ProtectedDict(dict):
    # Wrap a dict in this class to protect it from being flattened.
    pass


class ProtectedSet(set):
    # Wrap a set in this class to protect it from being flattened.
    pass


def aa_flatten_dict_tu(
    v,
    listitem,
    forbidden=(list, tuple, set, frozenset),
    allowed=(
        str,
        int,
        float,
        complex,
        bool,
        bytes,
        type(None),
        ProtectedTuple,
        ProtectedList,
        ProtectedDict,
        ProtectedSet,
        Tuppsub,
    ),
):
    """Recursively flatten *v*, yielding ``Tuppsub((value, key_path))`` pairs.

    Parameters
    ----------
    v : object
        The (possibly nested) object to flatten.
    listitem : tuple
        Key path accumulated so far (dict keys and sequence indices).
    forbidden : tuple of types
        Keyless container types that are always flattened element by element.
    allowed : tuple of types
        Leaf types yielded as-is; the Protected* wrappers and Tuppsub are
        listed here so they escape flattening.
    """
    if isinstance(v, dict) or (hasattr(v, "items") and hasattr(v, "keys")):
        # dict or dict-like (has keys()/items()): recurse into every value,
        # extending the key path with the dict key.
        for k, v2 in v.items():
            newtu = listitem + (k,)
            yield from aa_flatten_dict_tu(
                v2, listitem=newtu, forbidden=forbidden, allowed=allowed
            )
    elif isinstance(v, forbidden):
        # Keyless iterable (list/tuple/set/frozenset): enumerate so the
        # original value stays addressable later: di['blabla'][0].
        for indi, v2 in enumerate(v):
            if isinstance(v2, allowed):
                # Fix: include the element index in the key path (the
                # original dropped it, so list leaves were not addressable).
                yield Tuppsub((v2, listitem + (indi,)))
            else:
                # Not an allowed leaf: check recursively for more iterables.
                yield from aa_flatten_dict_tu(
                    v2,
                    listitem=(listitem + (indi,)),
                    forbidden=forbidden,
                    allowed=allowed,
                )
    elif isinstance(v, allowed):
        # Allowed leaf value: yield it together with its key path.
        yield Tuppsub((v, listitem))
    else:
        # Brute force for every other iterable (generators, custom
        # sequences, ...). We have to catch all iterables!
        try:
            for indi2, v2 in enumerate(v):
                try:
                    if isinstance(v2, allowed):
                        yield Tuppsub((v2, listitem + (indi2,)))
                    else:
                        # Fix: delegate with ``yield from`` - the original
                        # used a bare ``yield`` here and emitted raw
                        # generator objects instead of flattened pairs.
                        yield from aa_flatten_dict_tu(
                            v2,
                            listitem=(listitem + (indi2,)),
                            forbidden=forbidden,
                            allowed=allowed,
                        )
                except Exception:
                    # Probably not iterable after all: yield the raw value.
                    yield Tuppsub((v2, listitem + (indi2,)))
        except Exception:
            # Not iterable at all: yield the object itself as a leaf.
            yield Tuppsub((v, listitem))
def fla_tu(
    item,
    walkthrough=(),  # key path accumulated so far
    forbidden=(list, tuple, set, frozenset),  # must be flattened, never yielded whole
    allowed=(  # data types we don't want to touch
        str,
        int,
        float,
        complex,
        bool,
        bytes,
        type(None),
        ProtectedTuple,
        ProtectedList,
        ProtectedDict,
        ProtectedSet,
        # Tuppsub inherits from tuple but is excluded from being flattened.
        Tuppsub
    ),
    dict_variation=(
        # Compared against str(type(item)) so that collections does not have
        # to be imported.
        "collections.defaultdict",
        "collections.UserDict",
        "collections.OrderedDict",
    ),
):
    """Flatten any nested iterable, yielding ``(value, key_path)`` pairs.

    Dicts and dict look-alikes are delegated to :func:`aa_flatten_dict_tu`;
    pandas DataFrames are converted to dicts first; everything else that
    looks iterable is enumerated by brute force.
    """
    if isinstance(item, allowed):
        # Allowed leaf: yield it with the accumulated key path.
        yield item, walkthrough
    elif isinstance(item, forbidden):
        for ini, xaa in enumerate(item):
            try:
                # If we have an iterable, check recursively for others.
                yield from fla_tu(
                    xaa,
                    walkthrough=(walkthrough + (ini,)),
                    forbidden=forbidden,
                    allowed=allowed,
                    dict_variation=dict_variation,
                )
            except Exception:
                # Probably not iterable: yield (value, (key1, key2, ...)).
                yield xaa, Tuppsub(
                    (walkthrough + Tuppsub((ini,)))
                )
    elif isinstance(item, dict):
        # Dicts need special treatment; without it only the keys come back.
        yield from aa_flatten_dict_tu(
            item, listitem=walkthrough, forbidden=forbidden, allowed=allowed
        )
    elif (str(type(item)) in dict_variation) or (
        hasattr(item, "items") and hasattr(item, "keys")
    ):
        # Dict look-alikes (defaultdict, OrderedDict, anything with
        # keys()/items()) are converted and delegated as well.
        yield from aa_flatten_dict_tu(
            dict(item), listitem=walkthrough, forbidden=forbidden, allowed=allowed
        )
    elif "DataFrame" in str(type(item)):
        # pandas must be converted to a dict first, otherwise only the
        # columns come back; copy to avoid touching the original.
        yield from aa_flatten_dict_tu(
            item.copy().to_dict(),
            listitem=walkthrough,
            forbidden=forbidden,
            allowed=allowed,
        )
    else:
        # Many iterables are hard to identify with isinstance()/type(), so we
        # brute-force. If one iterable escapes, we are screwed!
        try:
            for ini2, xaa in enumerate(item):
                try:
                    if isinstance(xaa, allowed):
                        # Always the same format: (value, (key1, key2, ...)).
                        yield xaa, Tuppsub(
                            (walkthrough + (ini2,))
                        )
                    else:
                        # Fix: the original built Tuppsub(ini2,), i.e.
                        # tuple(int), which raises TypeError and silently
                        # cancelled the recursion via the except below.
                        yield from fla_tu(
                            xaa,
                            walkthrough=Tuppsub(
                                (walkthrough + (ini2,))
                            ),
                            forbidden=forbidden,
                            allowed=allowed,
                            dict_variation=dict_variation,
                        )
                except Exception:
                    # In case of an exception, yield (value, keys) as-is.
                    yield xaa, Tuppsub(
                        (walkthrough + (ini2,))
                    )
        except Exception:
            # Fix: the original rebuilt Tuppsub(item,) here, which raises
            # again for non-iterable items and crashed the generator; a
            # non-iterable object is simply a leaf at the current key path.
            yield item, Tuppsub(walkthrough)
def qq_d_sort_columns_alphabetically(df, reverse=False):
    """Return a copy of *df* with its columns sorted alphabetically.

    Crucial for this module: it guarantees the exploded key columns
    (aa_key_0, aa_key_1, ...) line up in the right order.
    Set ``reverse=True`` for descending order.
    """
    # sorted(..., reverse=...) builds a concrete, correctly ordered list in
    # one step; the original passed a lazy ``reversed`` iterator to
    # DataFrame.filter, which expects a list-like of labels.
    return df.filter(sorted(df.columns, reverse=reverse)).copy()
def qq_s_lists_to_df(df):
    """Explode a Series of key tuples into one column per key level.

    Nested iterables produce key paths of different depths, so every entry
    is padded with pd.NA up to the longest path before being turned into
    per-level columns.
    """
    series_copy = df.copy()
    # The longest key tuple determines the common target length.
    maxlen = series_copy.dropna().map(len).max()
    # Pad every key tuple to ``maxlen``; seriesback=True makes each entry a
    # Series, which is what yields the per-level columns.
    padded = series_copy.apply(
        lambda entry: _exs_normalize_lists_in_series(entry, maxlen, seriesback=True)
    )
    return padded.copy()
def qq_ds_merge_multiple_dfs_and_series_on_index(
    df,
    list_with_ds,
    how="inner",
    on=None,
    sort=False,
    suffixes=("_x", "_y"),
    indicator=False,
    validate=None,
):
    """Merge *df* with every DataFrame/Series in *list_with_ds*, one after
    another, always joining on the index (pd.merge in a loop).

    Series are converted to one-column frames first. Each round gets
    numbered suffixes (e.g. ``_x_000``/``_y_000``) so clashing column names
    stay distinguishable and easy to filter later.
    """
    merged = df.copy()
    for counter, other in enumerate(list_with_ds):
        # Series cannot be merged directly - promote them to a frame.
        frame = other.to_frame().copy() if isinstance(other, pd.Series) else other
        suffix_pair = (
            f"{suffixes[0]}_{str(counter).zfill(3)}",
            f"{suffixes[1]}_{str(counter).zfill(3)}",
        )
        # Copy aggressively - changing the caller's data is a no-go.
        merged = pd.merge(
            merged.copy(),
            frame.copy(),
            how=how,
            on=on,
            sort=sort,
            indicator=indicator,
            validate=validate,
            left_index=True,
            right_index=True,
            suffixes=suffix_pair,
        ).copy()
    return merged
def _exs_normalize_lists_in_series(list_, maxlen, seriesback=True):
    """Pad *list_* with pd.NA until it has *maxlen* entries.

    NaN-like inputs become a full pd.NA row. With seriesback=True (the way
    this module always calls it) a pd.Series is returned, which is the
    easiest route to a correctly shaped DataFrame.
    """
    # The NaN check matters: pd.isna / np.isnan alone don't cover every case,
    # hence the dedicated helper.
    if qq_s_isnan(list_):
        # pd.NA is the most tolerant NaN flavour (fewest exceptions).
        filler = [pd.NA] * maxlen
        return pd.Series(filler) if seriesback else filler
    values = _if_not_list_to_list(list_)  # make sure we have a plain list
    # Example: maxlen (max key depth) is 8 and this entry has 2 keys ->
    # append 6 pd.NA values so all rows share one length.
    padded = values + [pd.NA] * (maxlen - len(values))
    return pd.Series(padded) if seriesback else padded
def _if_not_list_to_list(list_):
if not isinstance(list_, list):
try:
list_ = list_.tolist() # we try it first like that (only for numpy arrays)
except Exception:
list_ = list(
list_
) # if it doesn't work, we do it like this. Otherwise, it won't work, resulting in many exceptions later on
return list_
def qq_s_isnan(wert, nan_back=False, debug=False):
    """Robust NaN check covering the many NaN flavours that exist.

    No single stdlib/pandas check handles everything:
      - np.isnan(np.array(['ddd', 3])) raises TypeError
      - pd.isna(np.array(['ddd', 3])) returns an array, not a bool
      - math.isnan(np.array(['ddd', 3])) raises TypeError
    so several checks are tried in turn, each wrapped defensively.

    Parameters
    ----------
    wert : object
        Value to test.
    nan_back : bool
        If True, return np.nan instead of True for NaN-like input.
    debug : bool
        If True, print the exceptions that are otherwise suppressed.

    Returns
    -------
    True / np.nan if *wert* is NaN-like, otherwise False.
    """
    # String spellings that should count as NaN (e.g. after a CSV import).
    allenanvalues = [
        "<NA>",
        "<NAN>",
        "<nan>",
        "np.nan",
        "NoneType",
        "None",
        "-1.#IND",
        "1.#QNAN",
        "1.#IND",
        "-1.#QNAN",
        "#N/A N/A",
        "#N/A",
        "N/A",
        "n/a",
        "NA",
        "",
        "#NA",
        "NULL",
        "null",
        "NaN",
        "-NaN",
        "nan",
        "-nan",
    ]
    # 1) pandas: the ``is True`` comparison is deliberate - for array input
    #    pd.isna returns an array, which must not count as a hit here.
    try:
        if pd.isna(wert) is True:
            if nan_back is True:
                return np.nan
            return True
    except Exception as err:
        if debug is True:
            print(err)
    # 2) pd.isnull (alias of pd.isna; kept for parity with the original).
    try:
        if pd.isnull(wert) is True:
            if nan_back is True:
                return np.nan
            return True
    except Exception as err:
        if debug is True:
            print(err)
    # 3) math.isnan raises for non-numeric input - that's fine, we move on.
    try:
        if math.isnan(wert) is True:
            if nan_back is True:
                return np.nan
            return True
    except Exception as err:
        if debug is True:
            print(err)
    # 4) plain None.
    if wert is None:
        return True
    # 5) Last resort (string imports etc.): compare wert against all known
    #    NaN spellings. Fix: the original had a ``return False`` inside the
    #    loop, so only the first spelling ("<NA>") was ever checked.
    #    NOTE(review): wert is interpolated unescaped into the pattern, so
    #    regex metacharacters in wert are interpreted - kept for
    #    compatibility; the except below swallows invalid patterns.
    for allaaa in allenanvalues:
        try:
            nanda = re.findall(fr"^\s*{wert}\s*$", allaaa)
            if any(nanda):
                return True
        except Exception as err:
            if debug is True:
                print(err)
    return False
def nested_something_to_df(nested_whatever, unstack: bool = True,) -> pd.DataFrame:
    """Flatten any nested iterable into a pandas DataFrame.

    Parameters
    ----------
    nested_whatever : object
        Anything fla_tu() can handle (nested dicts, lists, DataFrames, ...).
    unstack : bool
        True  -> one column per key level plus 'aa_all_keys' and 'aa_value'.
        False -> two-column frame with the key tuples forming the index.
    """
    flattenddict = list(fla_tu(nested_whatever))  # flatten every iterable
    # Some entries come back as generator objects; unwrap their first (and
    # only) item so every entry is a 2-tuple: (value, key_path).
    flattenddict = [
        list(x)[0] if "generator" in str(type(x)) else x for x in flattenddict
    ]
    # All key tuples land in one column next to their values.
    df = pd.DataFrame(flattenddict)
    df.columns = [
        "aa_value",
        "aa_all_keys",
    ]
    # Explode the 'aa_all_keys' tuples into one column per key level.
    indexdf = qq_s_lists_to_df(
        df["aa_all_keys"]
    )
    # Enumerate the exploded columns so they can be told apart.
    indexdf.columns = [f"aa_key_{x}" for x in indexdf.columns]
    # Merge the exploded key columns back onto the two-column frame. The key
    # data is needed twice: once for the index built below, and the
    # 'aa_all_keys' column is kept for later stacked<->unstacked transforms
    # and for updating the original iterable.
    df = qq_ds_merge_multiple_dfs_and_series_on_index(df, [indexdf])
    # Build a (multi-)index from the key columns - everything except
    # 'aa_value' and 'aa_all_keys', hence len(df.columns) - 2.
    df.index = [df[f"aa_key_{x}"].__array__() for x in range(len(df.columns) - 2)]
    # Very important: sort the columns so the key levels of the nested
    # iterable appear in the right order.
    df = qq_d_sort_columns_alphabetically(df)
    # The key data now lives in the index, so the helper columns can go.
    df = df.drop(columns=[x for x in df.columns if x.startswith("aa_key_")])
    # At this point only ["aa_value", "aa_all_keys"] remain. Copying below
    # may be redundant, but the caller's data must never be mutated.
    if unstack:
        # df = adjust_dataframe_and_dtypes(df, nested_whatever)
        # One column per key level (key1, key2, ...), one for the value[s]
        # ("aa_value") and one for all keys as a tuple ("aa_all_keys").
        return df.reset_index().copy()
    # The two-column (stacked) version.
    return df.copy()
# Demo: build the stacked (two-column) DataFrame from the sample payload.
nested_something_to_df(nested_whatever=data, unstack = False,)
And here is the result:
aa_all_keys (can be dropped) aa_value
results 0 key NaN NaN NaN (results, 0, key) survey_x
value 0 0 key (results, 0, value, 0, 0, key) q1
value (results, 0, value, 0, 0, value) 2
endTime (results, 0, value, 0, 0, endTime) 2021-01-21
skipped (results, 0, value, 0, 0, skipped) False
startTime (results, 0, value, 0, 0, startTime) 2021-01-21
resultType (results, 0, value, 0, 0, resultType) multipleChoice
1 key (results, 0, value, 0, 1, key) q2
value (results, 0, value, 0,
.......
Here is the result when you merge the two dicts:
nested_something_to_df(nested_whatever=[test_dict_1,test_dict_2], unstack = True,)
level_0 level_1 ... aa_all_keys aa_value
0 0 results ... (0, results, 0, key) q1
1 0 results ... (0, results, 0, value) 1
2 0 results ... (0, results, 0, end_time) 2021-01-21
3 0 results ... (0, results, 0, start_time) 2021-01-21
4 0 results ... (0, results, 0, result_type) multipleChoice
5 0 results ... (0, results, 1, key) q2
6 0 results ... (0, results, 1, value) False
7 0 results ... (0, results, 1, end_time) 2021-01-21
8 0 results ... (0, results, 1, start_time) 2021-01-21
9 0 results ... (0, results, 1, result_type) multipleChoice
10 0 results ... (0, results, 2, key) q3
11 0 results ... (0, results, 2, value) 3
12 0 results ... (0, results, 2, end_time) 2021-01-21
13 0 results ... (0, results, 2, start_time) 2021-01-21
14 0 results ... (0, results, 2, result_type) multipleChoice
15 0 results ... (0, results, 3, key) q4
16 0 results ... (0, results, 3, value) 3
17 0 results ... (0, results, 3, end_time) 2021-01-21
18 0 results ... (0, results, 3, start_time) 2021-01-21
19 0 results ... (0, results, 3, result_type) multipleChoice
20 1 results ... (1, results, 0, key) survey_x
21 1 results ... (1, results, 0, value, 0, 0, key) q1
22 1 results ... (1, results, 0, value, 0, 0, value) 2
23 1 results ... (1, results, 0, value, 0, 0, endTime)
.................
[49 rows x 9 columns]
I updated the code (small bug fixes and speed improvements). It's on GitHub
https://github.com/hansalemaos/flatten_any_dict_iterable_or_whatsoever
for pandas: https://github.com/hansalemaos/a_pandas_ex_plode_tool