0

@MichelePiccolini contributed an excellent generic function to flatten a pandas dataframe with json columns here https://stackoverflow.com/a/61269285/11620388. When I run the function on the test code then do df or df.info() the original dataframe is returned, not the flattened dataframe:

def flatten_nested_json_df(df):

df = df.reset_index()

print(f"original shape: {df.shape}")
print(f"original columns: {df.columns}")


# search for columns to explode/flatten
s = (df.applymap(type) == list).all()
list_columns = s[s].index.tolist()

s = (df.applymap(type) == dict).all()
dict_columns = s[s].index.tolist()

print(f"lists: {list_columns}, dicts: {dict_columns}")
while len(list_columns) > 0 or len(dict_columns) > 0:
    new_columns = []

    for col in dict_columns:
        print(f"flattening: {col}")
        # explode dictionaries horizontally, adding new columns
        horiz_exploded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
        horiz_exploded.index = df.index
        df = pd.concat([df, horiz_exploded], axis=1).drop(columns=[col])
        new_columns.extend(horiz_exploded.columns) # inplace

    for col in list_columns:
        print(f"exploding: {col}")
        # explode lists vertically, adding new columns
        df = df.drop(columns=[col]).join(df[col].explode().to_frame())
        new_columns.append(col)

    # check if there are still dict o list fields to flatten
    s = (df[new_columns].applymap(type) == list).all()
    list_columns = s[s].index.tolist()

    s = (df[new_columns].applymap(type) == dict).all()
    dict_columns = s[s].index.tolist()

    print(f"lists: {list_columns}, dicts: {dict_columns}")

print(f"final shape: {df.shape}")
print(f"final columns: {df.columns}")
return df

test:

df = pd.DataFrame(
columns=['id','name','columnA','columnB'],
data=[
    [1,'John',{"dist": "600", "time": "0:12.10"},[{"pos": "1st", "value": "500"},{"pos": "2nd", "value": "300"},{"pos": "3rd", "value": "200"}, {"pos": "total", "value": "1000"}]],
    [2,'Mike',{"dist": "600"},[{"pos": "1st", "value": "500"},{"pos": "2nd", "value": "300"},{"pos": "total", "value": "800"}]]
])

flatten_nested_json_df(df)

df.info()

returns original dataframe, not the flattened dataframe.

DavidH
  • 57
  • 6
  • Python is sensitive to indents, be careful... you should leave 2 spaces for all the code which is below the function `def flatten_nested_json_df(df):` till the Return statement – NoobVB May 28 '22 at 05:15

1 Answers1

0

as per my comment, your code should look like the below (with the correct Indentation):

def flatten_nested_json_df(df):

    df = df.reset_index()

    print(f"original shape: {df.shape}")
    print(f"original columns: {df.columns}")


    # search for columns to explode/flatten
    s = (df.applymap(type) == list).all()
    list_columns = s[s].index.tolist()

    s = (df.applymap(type) == dict).all()
    dict_columns = s[s].index.tolist()

    print(f"lists: {list_columns}, dicts: {dict_columns}")
    while len(list_columns) > 0 or len(dict_columns) > 0:
        new_columns = []

    for col in dict_columns:
        print(f"flattening: {col}")
        # explode dictionaries horizontally, adding new columns
        horiz_exploded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
        horiz_exploded.index = df.index
        df = pd.concat([df, horiz_exploded], axis=1).drop(columns=[col])
        new_columns.extend(horiz_exploded.columns) # inplace

    for col in list_columns:
        print(f"exploding: {col}")
        # explode lists vertically, adding new columns
        df = df.drop(columns=[col]).join(df[col].explode().to_frame())
        new_columns.append(col)

    # check if there are still dict o list fields to flatten
    s = (df[new_columns].applymap(type) == list).all()
    list_columns = s[s].index.tolist()

    s = (df[new_columns].applymap(type) == dict).all()
    dict_columns = s[s].index.tolist()

    print(f"lists: {list_columns}, dicts: {dict_columns}")

    print(f"final shape: {df.shape}")
    print(f"final columns: {df.columns}")
    return df

# test your function below
flatten_nested_json_df(test_df)
NoobVB
  • 989
  • 6
  • 10
  • Thanks NoobVB. I cut & pasted your code into my notebook, and it hangs when I run it and fails to complete. Hangs on this line: new_columns = [] – DavidH May 30 '22 at 20:17
  • @MichelePiccolini solution prints the solution I need, I just can't figure out how to access the flattened dataframe her function returns. – DavidH May 30 '22 at 20:24
  • Problem solved, I wasn't running the function correctly. Should be something like:
    df_2 = flatten_nested_json_df(test_df)
    – DavidH Jun 12 '22 at 03:13