
I have some complex nested JSON data saved in a pandas dataframe column and I am trying to flatten the data into columns. The functions and methods I have tested seem very slow and would take days to process all my data, so they are not really a good option.

I am reproducing a simplified example of the data below, repeating the records to demonstrate how the number of records affects the processing time. Note: not all records have information for all the existing columns.

Please let me know of any other ways to improve the speed of that process. Thanks

import json
import numpy as np
import pandas as pd
str_json= {"json_col":[ '{"DayId":1,"Details":[{"LocationId":101,"Items":[{"Stock":[["A1",100],["A2",105],["A3",90],["A4",85]],"Product":"A"},{"Stock":[["B1",220],["B2",240]],"Product":"B"},{"Stock":[["C1",50]],"Product":"C"},{"Sold":[["A1",5],["A2",8],["A3",4]],"Product":"A"},{"Sold":[["C1",12]],"Product":"C"}]}]}','{"DayId":2,"Details":[{"LocationId":101,"Items":[{"Stock":[["D1",150],["D2",145],["D3",130]],"Product":"D"}]}]}','{"DayId":3,"Details":[{"LocationId":101,"Items":[{"Stock":[["A2",97],["A5",90]],"Product":"A"},{"Stock":[["E1",25],["E2",30],["E3",22],["E4",30]],"Product":"E"},{"Sold":[["B2",32]],"Product":"B"},{"Sold":[["D1",20],["D3",15]],"Product":"D"},{"Sold":[["E2",4],["E3",1],["E4",2]],"Product":"E"}]}]}']}


small_df = pd.DataFrame(str_json)

Example of small_df: a single json_col column holding the three JSON strings above.
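
To make the nesting easier to see, here is the second record reformatted by hand (each item carries a Product plus either a Stock or a Sold list of [code, quantity] pairs):

{"DayId": 2,
 "Details": [
     {"LocationId": 101,
      "Items": [
          {"Stock": [["D1", 150], ["D2", 145], ["D3", 130]], "Product": "D"}
      ]}
 ]}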

Creating a relatively bigger dataframe to illustrate how the processing speed is affected as the number of records grows:

bigger_df = pd.DataFrame(np.repeat(small_df.values, 1000, axis=0), columns=small_df.columns)

First function, step 1 (very slow). Extracting columns day_id, location_id and details_items:

def extract_details(row):
    # render the single-cell row back to a string, then parse the JSON
    data = row.to_string(header=False, index=False)
    json_data = json.loads(data)
    result = pd.json_normalize(json_data)
    day_id = result.iloc[0].loc['DayId']
    details = result.iloc[0].loc['Details']

    # Details is a one-element list; normalize it again to reach LocationId and Items
    details_df = pd.json_normalize(details)
    location_id = details_df.iloc[0].loc['LocationId']
    details_items = details_df.iloc[0].loc['Items']

    return day_id, location_id, details_items


%%time
step1_df = bigger_df.copy()
step1_df[['day_id', 'location_id', 'details_items']] = step1_df.apply(extract_details, axis=1, result_type='expand')
step1_df.drop('json_col', axis=1, inplace=True)

Example of step 1 running time and output (screenshot not reproduced here).
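
As a quick check of what step 1 extracts, applying extract_details to the first record by hand gives values that follow directly from the sample JSON (a sketch, assuming the code above has been run):

day_id, location_id, details_items = extract_details(small_df.iloc[0])
print(day_id)              # 1
print(location_id)         # 101
print(len(details_items))  # 5 item dicts (Stock/Sold plus Product) for day 1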

Second function, step 2 (relatively fast):

def flatten_nested_json_df(df):

    df = df.reset_index()

    print(f"original shape: {df.shape}")
    print(f"original columns: {df.columns}")

    # search for columns to explode/flatten
    s = (df.applymap(type) == list).all()
    list_columns = s[s].index.tolist()

    s = (df.applymap(type) == dict).all()
    dict_columns = s[s].index.tolist()

    print(f"lists: {list_columns}, dicts: {dict_columns}")
    while len(list_columns) > 0 or len(dict_columns) > 0:
        new_columns = []

        for col in dict_columns:
            print(f"flattening: {col}")
            # explode dictionaries horizontally, adding new columns
            horiz_exploded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
            horiz_exploded.index = df.index
            df = pd.concat([df, horiz_exploded], axis=1).drop(columns=[col])
            new_columns.extend(horiz_exploded.columns)

        for col in list_columns:
            print(f"exploding: {col}")
            # explode lists vertically, adding new rows
            df = df.drop(columns=[col]).join(df[col].explode().to_frame())
            new_columns.append(col)

        # check if there are still dict or list fields to flatten
        s = (df[new_columns].applymap(type) == list).all()
        list_columns = s[s].index.tolist()

        s = (df[new_columns].applymap(type) == dict).all()
        dict_columns = s[s].index.tolist()

        print(f"lists: {list_columns}, dicts: {dict_columns}")

    print(f"final shape: {df.shape}")
    print(f"final columns: {df.columns}")
    return df
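
Step 2 is then applied to the step-1 output; a minimal usage sketch (assuming the function is run on step1_df from above; it prints the columns it explodes/flattens as it goes):

%%time
step2_df = flatten_nested_json_df(step1_df)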

Example of step 2 running time and output (screenshot not reproduced here).

  • Consider writing a custom LR(1) JSON parser / transformer: https://stackoverflow.com/questions/68756628/how-to-merge-values-of-multiple-keys-with-the-same-name-using-python/68758831#68758831 ; you can optimise how you transform the parse tree. – Rob Raymond Aug 13 '21 at 12:48

1 Answer
  • Here's a different approach: put in place a parser / transformer that generates a dict that can be flattened by json_normalize() in a single call (see the small illustration after this list).
  • The output structure is generic and fully flattened, hence the column names include the index of each embedded list.
  • Timings are below; it's not clear to me what timings you are seeing.
  • There are opportunities to further optimise what the transformer outputs (emit less), but that would make it less generic.
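
As a small illustration of why the list-to-dict conversion matters: json_normalize() flattens nested dicts but leaves lists as single cell values, so turning every embedded list into a dict keyed by position lets one call flatten everything (a plain-Python sketch, no lark involved):

import pandas as pd

nested   = {"a": 1, "b": {"c": [5, 6]}}            # the list stays as one cell
as_dicts = {"a": 1, "b": {"c": {"0": 5, "1": 6}}}  # what the transformer emits instead

print(pd.json_normalize(nested).columns.tolist())    # ['a', 'b.c']
print(pd.json_normalize(as_dicts).columns.tolist())  # ['a', 'b.c.0', 'b.c.1']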

create specialised JSON parser

from lark import Transformer
from lark import Lark

json_grammar = r"""
    ?value: dict
          | list
          | string
          | SIGNED_NUMBER      -> number
          | "true"             -> true
          | "false"            -> false
          | "null"             -> null

    list : "[" [value ("," value)*] "]"

    dict : "{" [pair ("," pair)*] "}"
    pair : string ":" value

    string : ESCAPED_STRING

    %import common.ESCAPED_STRING
    %import common.SIGNED_NUMBER
    %import common.WS
    %ignore WS

    """
class TreeToJson(Transformer):
    def string(self, s):
        (s,) = s
        return s[1:-1]
    def number(self, n):
        (n,) = n
        return float(n)
    # convert embedded lists to embedded dicts
    def list(self, l):
        return {str(i):v for i,v in enumerate(l)}
    pair = tuple
    dict = dict 

    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False

json_parser = Lark(json_grammar, start='value', parser='lalr', transformer=TreeToJson())
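
A quick sanity check of the parser on a tiny document (the same example that comes up in the comments below); with the str(i) keys from the transformer, the embedded list should come back as a dict:

print(json_parser.parse('{"a": 1, "b": {"c": [5, 6]}}'))
# expected, roughly: {'a': 1.0, 'b': {'c': {'0': 5.0, '1': 6.0}}}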

app code

import pandas as pd
import numpy as np
str_json= {"json_col":[ '{"DayId":1,"Details":[{"LocationId":101,"Items":[{"Stock":[["A1",100],["A2",105],["A3",90],["A4",85]],"Product":"A"},{"Stock":[["B1",220],["B2",240]],"Product":"B"},{"Stock":[["C1",50]],"Product":"C"},{"Sold":[["A1",5],["A2",8],["A3",4]],"Product":"A"},{"Sold":[["C1",12]],"Product":"C"}]}]}','{"DayId":2,"Details":[{"LocationId":101,"Items":[{"Stock":[["D1",150],["D2",145],["D3",130]],"Product":"D"}]}]}','{"DayId":3,"Details":[{"LocationId":101,"Items":[{"Stock":[["A2",97],["A5",90]],"Product":"A"},{"Stock":[["E1",25],["E2",30],["E3",22],["E4",30]],"Product":"E"},{"Sold":[["B2",32]],"Product":"B"},{"Sold":[["D1",20],["D3",15]],"Product":"D"},{"Sold":[["E2",4],["E3",1],["E4",2]],"Product":"E"}]}]}']}

small_df = pd.DataFrame(str_json)

def wide(js):
    df = pd.json_normalize(json_parser.parse(js))
    return pd.Series(df.values[0], index=df.columns)

bigger_df = pd.DataFrame(np.repeat(small_df.values, 1000, axis=0), columns=small_df.columns)
small_df["json_col"].apply(wide)

output

DayId Details.0.LocationId Details.0.Items.0.Stock.0.0 Details.0.Items.0.Stock.0.1 Details.0.Items.0.Stock.1.0 Details.0.Items.0.Stock.1.1 Details.0.Items.0.Stock.2.0 Details.0.Items.0.Stock.2.1 Details.0.Items.0.Stock.3.0 Details.0.Items.0.Stock.3.1 Details.0.Items.0.Product Details.0.Items.1.Stock.0.0 Details.0.Items.1.Stock.0.1 Details.0.Items.1.Stock.1.0 Details.0.Items.1.Stock.1.1 Details.0.Items.1.Product Details.0.Items.2.Stock.0.0 Details.0.Items.2.Stock.0.1 Details.0.Items.2.Product Details.0.Items.3.Sold.0.0 Details.0.Items.3.Sold.0.1 Details.0.Items.3.Sold.1.0 Details.0.Items.3.Sold.1.1 Details.0.Items.3.Sold.2.0 Details.0.Items.3.Sold.2.1 Details.0.Items.3.Product Details.0.Items.4.Sold.0.0 Details.0.Items.4.Sold.0.1 Details.0.Items.4.Product Details.0.Items.1.Stock.2.0 Details.0.Items.1.Stock.2.1 Details.0.Items.1.Stock.3.0 Details.0.Items.1.Stock.3.1 Details.0.Items.2.Sold.0.0 Details.0.Items.2.Sold.0.1 Details.0.Items.4.Sold.1.0 Details.0.Items.4.Sold.1.1 Details.0.Items.4.Sold.2.0 Details.0.Items.4.Sold.2.1
0 1 101 A1 100 A2 105 A3 90 A4 85 A B1 220 B2 240 B C1 50 C A1 5 A2 8 A3 4 A C1 12 C nan nan nan nan nan nan nan nan nan nan
1 2 101 D1 150 D2 145 D3 130 nan nan D nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
2 3 101 A2 97 A5 90 nan nan nan nan A E1 25 E2 30 E nan nan B D1 20 D3 15 nan nan D E2 4 E E3 22 E4 30 B2 32 E3 1 E4 2

timing

  • parsing and transforming < 1ms
  • parsing, transforming and creation of series ~2ms
  • parsing, transforming, creation of series and results in data frame ~3.5ms
  • processing 3000 rows, 5.5s
%timeit json_parser.parse(small_df.values[0][0])
%timeit wide(small_df.values[0][0])
%timeit small_df.loc[0, ["json_col"]].apply(wide)
%timeit bigger_df["json_col"].apply(wide)
864 µs ± 8.22 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.99 ms ± 83.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
3.55 ms ± 117 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
5.49 s ± 529 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
  • I am seeing a KeyError: '0' when copying/pasting and running the suggested method. Not sure what would be the difference and the reason. – Wyse09 Aug 14 '21 at 01:21
  • is it this line `json_parser.parse(small_df.values[0][0])` line of code that fails? try `json_parser.parse("""{"a":1,"b":{"c":[5,6]}}""")` – Rob Raymond Aug 14 '21 at 06:35
  • json_parser.parse("""{"a":1,"b":{"c":[5,6]}}""") returns {'a': 1.0, 'b': {'c': {0: 5.0, 1: 6.0}}} – Wyse09 Aug 14 '21 at 06:54
  • good, therefore `pd.DataFrame([{"js":"""{"a":1,"b":{"c":[5,6]}}"""}])["js"].apply(wide)` should work and return a dataframe with 3 columns. Which line gave the key error? – Rob Raymond Aug 14 '21 at 08:22
  • ~\anaconda3\lib\site-packages\pandas\io\json\_normalize.py in _json_normalize(data, record_path, meta, meta_prefix, record_prefix, errors, sep, max_level) 276 # TODO: handle record value which are lists, at least error 277 # reasonably --> 278 data = nested_to_record(data, sep=sep, max_level=max_level) 279 return DataFrame(data) 280 elif not isinstance(record_path, list): ... KeyError: '0' – Wyse09 Aug 14 '21 at 09:04
  • what pandas version? I'm using 1.3.1 – Rob Raymond Aug 14 '21 at 09:37
  • Let us [continue this discussion in chat](https://chat.stackoverflow.com/rooms/235995/discussion-between-wyse09-and-rob-raymond). – Wyse09 Aug 14 '21 at 10:15
  • The updated alternative ran with my pandas v1.3.1. I then did a comparison of the execution speed which unfortunately didn't improve. I'll try and do the pre-processing in postgresql beforehand and see how it compares. Thank you very much for your assistance which was much appreciated! – Wyse09 Aug 15 '21 at 00:05