This approach works with any nested structure: dicts, lists, tuples, pandas DataFrames, NumPy arrays, etc.
class Tuppsub(tuple):
    # Marker subclass of tuple: results already wrapped in Tuppsub are
    # treated as atomic values and are never flattened again.
    pass


class ProtectedTuple(tuple):
    # Wrap a tuple in this class to protect it from being flattened.
    pass


class ProtectedList(list):
    # Wrap a list in this class to protect it from being flattened.
    pass


class ProtectedDict(dict):
    # Wrap a dict in this class to protect it from being flattened.
    pass


class ProtectedSet(set):
    # Wrap a set in this class to protect it from being flattened.
    pass


def aa_flatten_dict_tu(
    v,
    listitem,
    forbidden=(list, tuple, set, frozenset),
    allowed=(
        str,
        int,
        float,
        complex,
        bool,
        bytes,
        type(None),
        ProtectedTuple,
        ProtectedList,
        ProtectedDict,
        ProtectedSet,
        Tuppsub,
    ),
):
    """Recursively flatten *v*, yielding ``Tuppsub((value, key_path))`` pairs.

    Parameters
    ----------
    v : object
        The (possibly nested) object to flatten.
    listitem : tuple
        Key path accumulated so far (dict keys and sequence indices).
    forbidden : tuple of types
        Keyless container types that are always flattened element by element.
    allowed : tuple of types
        Leaf types yielded as-is; the Protected* wrappers and Tuppsub are
        listed here so they escape flattening.
    """
    if isinstance(v, dict) or (hasattr(v, "items") and hasattr(v, "keys")):
        # dict or dict-like (has keys()/items()): recurse into every value,
        # extending the key path with the dict key. Anything dict-like that
        # slips past this check would only return its keys.
        for k, v2 in v.items():
            newtu = listitem + (k,)
            yield from aa_flatten_dict_tu(
                v2, listitem=newtu, forbidden=forbidden, allowed=allowed
            )
    elif isinstance(v, forbidden):
        # Keyless iterable (list/tuple/set/frozenset): enumerate so the
        # original value stays addressable later: di['blabla'][0] instead
        # of di['blabla'].
        for indi, v2 in enumerate(v):
            if isinstance(v2, allowed):
                # Fix: include the element index in the key path (the
                # original dropped it, so list leaves were not addressable).
                yield Tuppsub((v2, listitem + (indi,)))
            else:
                # Not an allowed leaf: check recursively for more iterables.
                yield from aa_flatten_dict_tu(
                    v2,
                    listitem=(listitem + (indi,)),
                    forbidden=forbidden,
                    allowed=allowed,
                )
    elif isinstance(v, allowed):
        # Allowed leaf value: yield it together with its key path.
        yield Tuppsub((v, listitem))
    else:
        # Brute force for every other iterable (generators, custom
        # sequences, ...). We have to catch all iterables!
        try:
            for indi2, v2 in enumerate(v):
                try:
                    if isinstance(v2, allowed):
                        yield Tuppsub((v2, listitem + (indi2,)))
                    else:
                        # Fix: delegate with ``yield from`` - the original
                        # used a bare ``yield`` here and emitted raw
                        # generator objects instead of flattened pairs.
                        yield from aa_flatten_dict_tu(
                            v2,
                            listitem=(listitem + (indi2,)),
                            forbidden=forbidden,
                            allowed=allowed,
                        )
                except Exception:
                    # Probably not iterable after all: yield the raw value.
                    yield Tuppsub((v2, listitem + (indi2,)))
        except Exception:
            # Not iterable at all: yield the object itself as a leaf.
            yield Tuppsub((v, listitem))
def fla_tu(
    item,
    walkthrough=(),  # key path accumulated so far
    forbidden=(list, tuple, set, frozenset),  # must be flattened, never yielded whole
    allowed=(  # data types we don't want to touch
        str,
        int,
        float,
        complex,
        bool,
        bytes,
        type(None),
        ProtectedTuple,
        ProtectedList,
        ProtectedDict,
        ProtectedSet,
        # The secret: Tuppsub inherits from tuple but is excluded from being
        # flattened - ProtectedTuple does the same thing.
        Tuppsub,
    ),
    dict_variation=(
        # Compared against str(type(item)) so that collections does not have
        # to be imported.
        "collections.defaultdict",
        "collections.UserDict",
        "collections.OrderedDict",
    ),
):
    """Flatten any nested iterable, yielding ``(value, key_path)`` pairs.

    Dicts and dict look-alikes are delegated to :func:`aa_flatten_dict_tu`;
    pandas DataFrames are converted to dicts first; everything else that
    looks iterable is enumerated by brute force.
    """
    if isinstance(item, allowed):
        # Allowed leaf: yield it with the accumulated key path.
        yield item, walkthrough
    elif isinstance(item, forbidden):
        for ini, xaa in enumerate(item):
            try:
                # If we have an iterable, check recursively for others.
                yield from fla_tu(
                    xaa,
                    walkthrough=(walkthrough + (ini,)),
                    forbidden=forbidden,
                    allowed=allowed,
                    dict_variation=dict_variation,
                )
            except Exception:
                # Probably not iterable: yield (value, (key1, key2, ...)).
                yield xaa, Tuppsub(
                    (walkthrough + Tuppsub((ini,)))
                )
    elif isinstance(item, dict):
        # Dicts need special treatment; without it only the keys come back.
        yield from aa_flatten_dict_tu(
            item, listitem=walkthrough, forbidden=forbidden, allowed=allowed
        )
    elif (str(type(item)) in dict_variation) or (
        hasattr(item, "items") and hasattr(item, "keys")
    ):
        # Dict look-alikes (defaultdict, OrderedDict, anything with
        # keys()/items()) are converted and delegated as well.
        yield from aa_flatten_dict_tu(
            dict(item), listitem=walkthrough, forbidden=forbidden, allowed=allowed
        )
    elif "DataFrame" in str(type(item)):
        # pandas must be converted to a dict first, otherwise only the
        # columns come back; copy to avoid touching the original.
        yield from aa_flatten_dict_tu(
            item.copy().to_dict(),
            listitem=walkthrough,
            forbidden=forbidden,
            allowed=allowed,
        )
    else:
        # Many iterables are hard to identify with isinstance()/type(), so we
        # brute-force. If one iterable escapes, we are screwed!
        try:
            for ini2, xaa in enumerate(item):
                try:
                    if isinstance(xaa, allowed):
                        # Always the same format: (value, (key1, key2, ...)).
                        yield xaa, Tuppsub(
                            (walkthrough + (ini2,))
                        )
                    else:
                        # Fix: the original built Tuppsub(ini2,), i.e.
                        # tuple(int), which raises TypeError and silently
                        # cancelled the recursion via the except below.
                        yield from fla_tu(
                            xaa,
                            walkthrough=Tuppsub(
                                (walkthrough + (ini2,))
                            ),
                            forbidden=forbidden,
                            allowed=allowed,
                            dict_variation=dict_variation,
                        )
                except Exception:
                    # In case of an exception, yield (value, keys) as-is.
                    yield xaa, Tuppsub(
                        (walkthrough + (ini2,))
                    )
        except Exception:
            # Fix: the original rebuilt Tuppsub(item,) here, which raises
            # again for non-iterable items and crashed the generator; a
            # non-iterable object is simply a leaf at the current key path.
            yield item, Tuppsub(walkthrough)
# Sample payload: one grouped survey result whose 'value' holds a doubly
# nested list of per-question answer dicts - the structure the flattener
# is demonstrated on below.
data = {'results': [{'key': 'survey_x',
                     'value': [[{'key': 'q1',
                                 'value': 2,
                                 'endTime': '2021-01-21',
                                 'skipped': False,
                                 'startTime': '2021-01-21',
                                 'resultType': 'multipleChoice'},
                                {'key': 'q2',
                                 'value': 0,
                                 'endTime': '2021-01-21',
                                 'skipped': False,
                                 'startTime': '2021-01-21',
                                 'resultType': 'multipleChoice'},
                                {'key': 'q3',
                                 'value': 2,
                                 'endTime': '2021-01-21',
                                 'skipped': False,
                                 'startTime': '2021-01-21',
                                 'resultType': 'multipleChoice'},
                                {'key': 'q4',
                                 'value': 0,
                                 'endTime': '2021-01-21',
                                 'skipped': False,
                                 'startTime': '2021-01-21',
                                 'resultType': 'multipleChoice'}]],
                     'skipped': False,
                     'end_time': '2021-01-21',
                     'start_time': '2021-01-21',
                     'result_type': 'grouped'}]}
# First pass flattens `data`; the second pass iterates the resulting
# generator and normalizes every yielded pair.
# NOTE(review): the double fla_tu call appears to rely on the pairs being
# Tuppsub instances (which are in `allowed`) - verify before simplifying.
nested_whatever=fla_tu(data)
flattenddict = list((fla_tu(nested_whatever)))
# Some entries come back as raw generator objects; unwrap their first item.
resultfinal=[list(x)[0] if "generator" in str(type(x)) else x for x in flattenddict]
# Each entry is ((value, key_path), outer_keys): split values and key paths.
allvalues = [x[0][0] for x in resultfinal]
allkeys = [x[0][1] for x in resultfinal]
Here are the results:
#result (allvalues)
['survey_x',
'q1',
2,
'2021-01-21',
False,
'2021-01-21',
'multipleChoice',
'q2',
0,
'2021-01-21',
False,
'2021-01-21',
'multipleChoice',
'q3',
2,
'2021-01-21',
False,
'2021-01-21',
'multipleChoice',
'q4',
0,
'2021-01-21',
False,
'2021-01-21',
'multipleChoice',
False,
'2021-01-21',
'2021-01-21',
'grouped' ....]
#result(allkeys)
[('results', 0, 'key'),
('results', 0, 'value', 0, 0, 'key'),
('results', 0, 'value', 0, 0, 'value'),
('results', 0, 'value', 0, 0, 'endTime'),
('results', 0, 'value', 0, 0, 'skipped'),
('results', 0, 'result_type')....]
If you want to have a DataFrame, you can use this:
import math
import re

import numpy as np
import pandas as pd
class Tuppsub(tuple):
    # Marker subclass of tuple: results already wrapped in Tuppsub are
    # treated as atomic values and are never flattened again.
    pass


class ProtectedTuple(tuple):
    # Wrap a tuple in this class to protect it from being flattened.
    pass


class ProtectedList(list):
    # Wrap a list in this class to protect it from being flattened.
    pass


class ProtectedDict(dict):
    # Wrap a dict in this class to protect it from being flattened.
    pass


class ProtectedSet(set):
    # Wrap a set in this class to protect it from being flattened.
    pass


def aa_flatten_dict_tu(
    v,
    listitem,
    forbidden=(list, tuple, set, frozenset),
    allowed=(
        str,
        int,
        float,
        complex,
        bool,
        bytes,
        type(None),
        ProtectedTuple,
        ProtectedList,
        ProtectedDict,
        ProtectedSet,
        Tuppsub,
    ),
):
    """Recursively flatten *v*, yielding ``Tuppsub((value, key_path))`` pairs.

    Parameters
    ----------
    v : object
        The (possibly nested) object to flatten.
    listitem : tuple
        Key path accumulated so far (dict keys and sequence indices).
    forbidden : tuple of types
        Keyless container types that are always flattened element by element.
    allowed : tuple of types
        Leaf types yielded as-is; the Protected* wrappers and Tuppsub are
        listed here so they escape flattening.
    """
    if isinstance(v, dict) or (hasattr(v, "items") and hasattr(v, "keys")):
        # dict or dict-like (has keys()/items()): recurse into every value,
        # extending the key path with the dict key.
        for k, v2 in v.items():
            newtu = listitem + (k,)
            yield from aa_flatten_dict_tu(
                v2, listitem=newtu, forbidden=forbidden, allowed=allowed
            )
    elif isinstance(v, forbidden):
        # Keyless iterable (list/tuple/set/frozenset): enumerate so the
        # original value stays addressable later: di['blabla'][0].
        for indi, v2 in enumerate(v):
            if isinstance(v2, allowed):
                # Fix: include the element index in the key path (the
                # original dropped it, so list leaves were not addressable).
                yield Tuppsub((v2, listitem + (indi,)))
            else:
                # Not an allowed leaf: check recursively for more iterables.
                yield from aa_flatten_dict_tu(
                    v2,
                    listitem=(listitem + (indi,)),
                    forbidden=forbidden,
                    allowed=allowed,
                )
    elif isinstance(v, allowed):
        # Allowed leaf value: yield it together with its key path.
        yield Tuppsub((v, listitem))
    else:
        # Brute force for every other iterable (generators, custom
        # sequences, ...). We have to catch all iterables!
        try:
            for indi2, v2 in enumerate(v):
                try:
                    if isinstance(v2, allowed):
                        yield Tuppsub((v2, listitem + (indi2,)))
                    else:
                        # Fix: delegate with ``yield from`` - the original
                        # used a bare ``yield`` here and emitted raw
                        # generator objects instead of flattened pairs.
                        yield from aa_flatten_dict_tu(
                            v2,
                            listitem=(listitem + (indi2,)),
                            forbidden=forbidden,
                            allowed=allowed,
                        )
                except Exception:
                    # Probably not iterable after all: yield the raw value.
                    yield Tuppsub((v2, listitem + (indi2,)))
        except Exception:
            # Not iterable at all: yield the object itself as a leaf.
            yield Tuppsub((v, listitem))
def fla_tu(
    item,
    walkthrough=(),  # key path accumulated so far
    forbidden=(list, tuple, set, frozenset),  # must be flattened, never yielded whole
    allowed=(  # data types we don't want to touch
        str,
        int,
        float,
        complex,
        bool,
        bytes,
        type(None),
        ProtectedTuple,
        ProtectedList,
        ProtectedDict,
        ProtectedSet,
        # Tuppsub inherits from tuple but is excluded from being flattened.
        Tuppsub
    ),
    dict_variation=(
        # Compared against str(type(item)) so that collections does not have
        # to be imported.
        "collections.defaultdict",
        "collections.UserDict",
        "collections.OrderedDict",
    ),
):
    """Flatten any nested iterable, yielding ``(value, key_path)`` pairs.

    Dicts and dict look-alikes are delegated to :func:`aa_flatten_dict_tu`;
    pandas DataFrames are converted to dicts first; everything else that
    looks iterable is enumerated by brute force.
    """
    if isinstance(item, allowed):
        # Allowed leaf: yield it with the accumulated key path.
        yield item, walkthrough
    elif isinstance(item, forbidden):
        for ini, xaa in enumerate(item):
            try:
                # If we have an iterable, check recursively for others.
                yield from fla_tu(
                    xaa,
                    walkthrough=(walkthrough + (ini,)),
                    forbidden=forbidden,
                    allowed=allowed,
                    dict_variation=dict_variation,
                )
            except Exception:
                # Probably not iterable: yield (value, (key1, key2, ...)).
                yield xaa, Tuppsub(
                    (walkthrough + Tuppsub((ini,)))
                )
    elif isinstance(item, dict):
        # Dicts need special treatment; without it only the keys come back.
        yield from aa_flatten_dict_tu(
            item, listitem=walkthrough, forbidden=forbidden, allowed=allowed
        )
    elif (str(type(item)) in dict_variation) or (
        hasattr(item, "items") and hasattr(item, "keys")
    ):
        # Dict look-alikes (defaultdict, OrderedDict, anything with
        # keys()/items()) are converted and delegated as well.
        yield from aa_flatten_dict_tu(
            dict(item), listitem=walkthrough, forbidden=forbidden, allowed=allowed
        )
    elif "DataFrame" in str(type(item)):
        # pandas must be converted to a dict first, otherwise only the
        # columns come back; copy to avoid touching the original.
        yield from aa_flatten_dict_tu(
            item.copy().to_dict(),
            listitem=walkthrough,
            forbidden=forbidden,
            allowed=allowed,
        )
    else:
        # Many iterables are hard to identify with isinstance()/type(), so we
        # brute-force. If one iterable escapes, we are screwed!
        try:
            for ini2, xaa in enumerate(item):
                try:
                    if isinstance(xaa, allowed):
                        # Always the same format: (value, (key1, key2, ...)).
                        yield xaa, Tuppsub(
                            (walkthrough + (ini2,))
                        )
                    else:
                        # Fix: the original built Tuppsub(ini2,), i.e.
                        # tuple(int), which raises TypeError and silently
                        # cancelled the recursion via the except below.
                        yield from fla_tu(
                            xaa,
                            walkthrough=Tuppsub(
                                (walkthrough + (ini2,))
                            ),
                            forbidden=forbidden,
                            allowed=allowed,
                            dict_variation=dict_variation,
                        )
                except Exception:
                    # In case of an exception, yield (value, keys) as-is.
                    yield xaa, Tuppsub(
                        (walkthrough + (ini2,))
                    )
        except Exception:
            # Fix: the original rebuilt Tuppsub(item,) here, which raises
            # again for non-iterable items and crashed the generator; a
            # non-iterable object is simply a leaf at the current key path.
            yield item, Tuppsub(walkthrough)
def qq_d_sort_columns_alphabetically(df, reverse=False):
    """Return a copy of *df* with its columns sorted alphabetically.

    Crucial for this module: it guarantees the exploded key columns
    (aa_key_0, aa_key_1, ...) line up in the right order.
    Set ``reverse=True`` for descending order.
    """
    # sorted(..., reverse=...) builds a concrete, correctly ordered list in
    # one step; the original passed a lazy ``reversed`` iterator to
    # DataFrame.filter, which expects a list-like of labels.
    return df.filter(sorted(df.columns, reverse=reverse)).copy()
def qq_s_lists_to_df(df):
    """Explode a Series of key tuples into one column per key level.

    Nested iterables produce key paths of different depths, so every entry
    is padded with pd.NA up to the longest path before being turned into
    per-level columns.
    """
    series_copy = df.copy()
    # The longest key tuple determines the common target length.
    maxlen = series_copy.dropna().map(len).max()
    # Pad every key tuple to ``maxlen``; seriesback=True makes each entry a
    # Series, which is what yields the per-level columns.
    padded = series_copy.apply(
        lambda entry: _exs_normalize_lists_in_series(entry, maxlen, seriesback=True)
    )
    return padded.copy()
def qq_ds_merge_multiple_dfs_and_series_on_index(
    df,
    list_with_ds,
    how="inner",
    on=None,
    sort=False,
    suffixes=("_x", "_y"),
    indicator=False,
    validate=None,
):
    """Merge *df* with every DataFrame/Series in *list_with_ds*, one after
    another, always joining on the index (pd.merge in a loop).

    Series are converted to one-column frames first. Each round gets
    numbered suffixes (e.g. ``_x_000``/``_y_000``) so clashing column names
    stay distinguishable and easy to filter later.
    """
    merged = df.copy()
    for counter, other in enumerate(list_with_ds):
        # Series cannot be merged directly - promote them to a frame.
        frame = other.to_frame().copy() if isinstance(other, pd.Series) else other
        suffix_pair = (
            f"{suffixes[0]}_{str(counter).zfill(3)}",
            f"{suffixes[1]}_{str(counter).zfill(3)}",
        )
        # Copy aggressively - changing the caller's data is a no-go.
        merged = pd.merge(
            merged.copy(),
            frame.copy(),
            how=how,
            on=on,
            sort=sort,
            indicator=indicator,
            validate=validate,
            left_index=True,
            right_index=True,
            suffixes=suffix_pair,
        ).copy()
    return merged
def _exs_normalize_lists_in_series(list_, maxlen, seriesback=True):
    """Pad *list_* with pd.NA until it has *maxlen* entries.

    NaN-like inputs become a full pd.NA row. With seriesback=True (the way
    this module always calls it) a pd.Series is returned, which is the
    easiest route to a correctly shaped DataFrame.
    """
    # The NaN check matters: pd.isna / np.isnan alone don't cover every case,
    # hence the dedicated helper.
    if qq_s_isnan(list_):
        # pd.NA is the most tolerant NaN flavour (fewest exceptions).
        filler = [pd.NA] * maxlen
        return pd.Series(filler) if seriesback else filler
    values = _if_not_list_to_list(list_)  # make sure we have a plain list
    # Example: maxlen (max key depth) is 8 and this entry has 2 keys ->
    # append 6 pd.NA values so all rows share one length.
    padded = values + [pd.NA] * (maxlen - len(values))
    return pd.Series(padded) if seriesback else padded
def _if_not_list_to_list(list_):
if not isinstance(list_, list):
try:
list_ = list_.tolist() # we try it first like that (only for numpy arrays)
except Exception:
list_ = list(
list_
) # if it doesn't work, we do it like this. Otherwise, it won't work, resulting in many exceptions later on
return list_
def qq_s_isnan(wert, nan_back=False, debug=False):
    """Robust NaN check covering the many NaN flavours that exist.

    No single stdlib/pandas check handles everything:
      - np.isnan(np.array(['ddd', 3])) raises TypeError
      - pd.isna(np.array(['ddd', 3])) returns an array, not a bool
      - math.isnan(np.array(['ddd', 3])) raises TypeError
    so several checks are tried in turn, each wrapped defensively.

    Parameters
    ----------
    wert : object
        Value to test.
    nan_back : bool
        If True, return np.nan instead of True for NaN-like input.
    debug : bool
        If True, print the exceptions that are otherwise suppressed.

    Returns
    -------
    True / np.nan if *wert* is NaN-like, otherwise False.
    """
    # String spellings that should count as NaN (e.g. after a CSV import).
    allenanvalues = [
        "<NA>",
        "<NAN>",
        "<nan>",
        "np.nan",
        "NoneType",
        "None",
        "-1.#IND",
        "1.#QNAN",
        "1.#IND",
        "-1.#QNAN",
        "#N/A N/A",
        "#N/A",
        "N/A",
        "n/a",
        "NA",
        "",
        "#NA",
        "NULL",
        "null",
        "NaN",
        "-NaN",
        "nan",
        "-nan",
    ]
    # 1) pandas: the ``is True`` comparison is deliberate - for array input
    #    pd.isna returns an array, which must not count as a hit here.
    try:
        if pd.isna(wert) is True:
            if nan_back is True:
                return np.nan
            return True
    except Exception as err:
        if debug is True:
            print(err)
    # 2) pd.isnull (alias of pd.isna; kept for parity with the original).
    try:
        if pd.isnull(wert) is True:
            if nan_back is True:
                return np.nan
            return True
    except Exception as err:
        if debug is True:
            print(err)
    # 3) math.isnan raises for non-numeric input - that's fine, we move on.
    try:
        if math.isnan(wert) is True:
            if nan_back is True:
                return np.nan
            return True
    except Exception as err:
        if debug is True:
            print(err)
    # 4) plain None.
    if wert is None:
        return True
    # 5) Last resort (string imports etc.): compare wert against all known
    #    NaN spellings. Fix: the original had a ``return False`` inside the
    #    loop, so only the first spelling ("<NA>") was ever checked.
    #    NOTE(review): wert is interpolated unescaped into the pattern, so
    #    regex metacharacters in wert are interpreted - kept for
    #    compatibility; the except below swallows invalid patterns.
    for allaaa in allenanvalues:
        try:
            nanda = re.findall(fr"^\s*{wert}\s*$", allaaa)
            if any(nanda):
                return True
        except Exception as err:
            if debug is True:
                print(err)
    return False
def nested_something_to_df(nested_whatever, unstack: bool = True,) -> pd.DataFrame:
    """Flatten any nested iterable into a pandas DataFrame.

    Parameters
    ----------
    nested_whatever : object
        Anything fla_tu() can handle (nested dicts, lists, DataFrames, ...).
    unstack : bool
        True  -> one column per key level plus 'aa_all_keys' and 'aa_value'.
        False -> two-column frame with the key tuples forming the index.
    """
    flattenddict = list(fla_tu(nested_whatever))  # flatten every iterable
    # Some entries come back as generator objects; unwrap their first (and
    # only) item so every entry is a 2-tuple: (value, key_path).
    flattenddict = [
        list(x)[0] if "generator" in str(type(x)) else x for x in flattenddict
    ]
    # All key tuples land in one column next to their values.
    df = pd.DataFrame(flattenddict)
    df.columns = [
        "aa_value",
        "aa_all_keys",
    ]
    # Explode the 'aa_all_keys' tuples into one column per key level.
    indexdf = qq_s_lists_to_df(
        df["aa_all_keys"]
    )
    # Enumerate the exploded columns so they can be told apart.
    indexdf.columns = [f"aa_key_{x}" for x in indexdf.columns]
    # Merge the exploded key columns back onto the two-column frame. The key
    # data is needed twice: once for the index built below, and the
    # 'aa_all_keys' column is kept for later stacked<->unstacked transforms
    # and for updating the original iterable.
    df = qq_ds_merge_multiple_dfs_and_series_on_index(df, [indexdf])
    # Build a (multi-)index from the key columns - everything except
    # 'aa_value' and 'aa_all_keys', hence len(df.columns) - 2.
    df.index = [df[f"aa_key_{x}"].__array__() for x in range(len(df.columns) - 2)]
    # Very important: sort the columns so the key levels of the nested
    # iterable appear in the right order.
    df = qq_d_sort_columns_alphabetically(df)
    # The key data now lives in the index, so the helper columns can go.
    df = df.drop(columns=[x for x in df.columns if x.startswith("aa_key_")])
    # At this point only ["aa_value", "aa_all_keys"] remain. Copying below
    # may be redundant, but the caller's data must never be mutated.
    if unstack:
        # df = adjust_dataframe_and_dtypes(df, nested_whatever)
        # One column per key level (key1, key2, ...), one for the value[s]
        # ("aa_value") and one for all keys as a tuple ("aa_all_keys").
        return df.reset_index().copy()
    # The two-column (stacked) version.
    return df.copy()
# Demo: build the stacked (two-column) DataFrame from the sample payload.
nested_something_to_df(nested_whatever=data, unstack = False,)
And here is the result:
aa_all_keys (can be dropped) aa_value
results 0 key NaN NaN NaN (results, 0, key) survey_x
value 0 0 key (results, 0, value, 0, 0, key) q1
value (results, 0, value, 0, 0, value) 2
endTime (results, 0, value, 0, 0, endTime) 2021-01-21
skipped (results, 0, value, 0, 0, skipped) False
startTime (results, 0, value, 0, 0, startTime) 2021-01-21
resultType (results, 0, value, 0, 0, resultType) multipleChoice
1 key (results, 0, value, 0, 1, key) q2
value (results, 0, value, 0,
.......
Here is the result when you merge the two dicts:
nested_something_to_df(nested_whatever=[test_dict_1,test_dict_2], unstack = True,)
level_0 level_1 ... aa_all_keys aa_value
0 0 results ... (0, results, 0, key) q1
1 0 results ... (0, results, 0, value) 1
2 0 results ... (0, results, 0, end_time) 2021-01-21
3 0 results ... (0, results, 0, start_time) 2021-01-21
4 0 results ... (0, results, 0, result_type) multipleChoice
5 0 results ... (0, results, 1, key) q2
6 0 results ... (0, results, 1, value) False
7 0 results ... (0, results, 1, end_time) 2021-01-21
8 0 results ... (0, results, 1, start_time) 2021-01-21
9 0 results ... (0, results, 1, result_type) multipleChoice
10 0 results ... (0, results, 2, key) q3
11 0 results ... (0, results, 2, value) 3
12 0 results ... (0, results, 2, end_time) 2021-01-21
13 0 results ... (0, results, 2, start_time) 2021-01-21
14 0 results ... (0, results, 2, result_type) multipleChoice
15 0 results ... (0, results, 3, key) q4
16 0 results ... (0, results, 3, value) 3
17 0 results ... (0, results, 3, end_time) 2021-01-21
18 0 results ... (0, results, 3, start_time) 2021-01-21
19 0 results ... (0, results, 3, result_type) multipleChoice
20 1 results ... (1, results, 0, key) survey_x
21 1 results ... (1, results, 0, value, 0, 0, key) q1
22 1 results ... (1, results, 0, value, 0, 0, value) 2
23 1 results ... (1, results, 0, value, 0, 0, endTime)
.................
[49 rows x 9 columns]
I updated the code (small bug fixes and speed improvements). It's on GitHub
https://github.com/hansalemaos/flatten_any_dict_iterable_or_whatsoever
for pandas: https://github.com/hansalemaos/a_pandas_ex_plode_tool