I have some complex nested JSON data saved in a pandas DataFrame column and I am trying to flatten the data into columns. The functions and methods I have tested seem very slow and would take days to process all my data, so they are not really a good option.
I am reproducing a simplified example of the data below, and repeating the records to demonstrate how the number of records affects the processing time. Note: not all records have information for all the existing columns.
Please let me know of any other ways to improve the speed of this process. Thanks.
import json

import numpy as np
import pandas as pd
# Simplified sample data: each row holds one day's inventory record as a raw JSON
# string. Note that the item dicts are heterogeneous — some carry "Stock", others
# "Sold" — so downstream flattening must tolerate missing fields.
str_json= {"json_col":[ '{"DayId":1,"Details":[{"LocationId":101,"Items":[{"Stock":[["A1",100],["A2",105],["A3",90],["A4",85]],"Product":"A"},{"Stock":[["B1",220],["B2",240]],"Product":"B"},{"Stock":[["C1",50]],"Product":"C"},{"Sold":[["A1",5],["A2",8],["A3",4]],"Product":"A"},{"Sold":[["C1",12]],"Product":"C"}]}]}','{"DayId":2,"Details":[{"LocationId":101,"Items":[{"Stock":[["D1",150],["D2",145],["D3",130]],"Product":"D"}]}]}','{"DayId":3,"Details":[{"LocationId":101,"Items":[{"Stock":[["A2",97],["A5",90]],"Product":"A"},{"Stock":[["E1",25],["E2",30],["E3",22],["E4",30]],"Product":"E"},{"Sold":[["B2",32]],"Product":"B"},{"Sold":[["D1",20],["D3",15]],"Product":"D"},{"Sold":[["E2",4],["E3",1],["E4",2]],"Product":"E"}]}]}']}
small_df = pd.DataFrame(str_json)
Creating a relatively larger dataframe to illustrate how processing speed is affected by the number of records:
# Repeat each row 1000x to build the benchmark frame. The pandas-native
# Index.repeat produces the same consecutive-row-repetition order as the
# original np.repeat(small_df.values, 1000, axis=0), but fixes two defects:
# the post never imports numpy (NameError as written), and round-tripping
# through the raw .values ndarray coerced every column to object dtype.
bigger_df = small_df.loc[small_df.index.repeat(1000)].reset_index(drop=True)
First function, step 1 (very slow). It extracts the columns day_id, location_id and details_items:
def extract_details(row):
    """Extract (DayId, LocationId, Items) from a one-column row holding a JSON string.

    Parameters
    ----------
    row : pd.Series
        A single-column row whose only value is the raw JSON document.

    Returns
    -------
    tuple
        (day_id, location_id, details_items), where details_items is the raw
        list of item dicts that step 2 flattens further.
    """
    # Parse the JSON string directly instead of round-tripping the row through
    # to_string() + json.loads + two json_normalize calls. to_string() re-renders
    # the Series as display text (slow, and subject to display truncation for
    # long values), and json_normalize built two throwaway DataFrames per row
    # just to read three fields. Plain dict access does the same work in
    # microseconds.
    data = json.loads(row.iloc[0])
    # NOTE(review): like the original (which read only .iloc[0] of the
    # normalized frame), this assumes exactly one entry in "Details" per
    # record — confirm against the full dataset.
    first_detail = data['Details'][0]
    return data['DayId'], first_detail['LocationId'], first_detail['Items']
%%time
# Work on a copy so the raw frame stays intact for re-runs.
step1_df= bigger_df.copy()
# Row-wise apply: extract_details is called once per record and its tuple is
# expanded into three new columns. This per-row apply is the slow part being
# benchmarked.
step1_df[['day_id', 'location_id', 'details_items']]= step1_df.apply(lambda row: extract_details(row), axis=1, result_type ='expand')
# The raw JSON column is no longer needed once the fields are extracted.
step1_df.drop('json_col', axis=1, inplace=True)
Example of step 1 running time and output:
Second function, step 2 (relatively fast):
def flatten_nested_json_df(df):
    """Iteratively flatten every all-dict column (horizontally) and explode
    every all-list column (vertically) until no nested values remain.

    Returns a new DataFrame; the input is not modified.
    """

    def _typed_columns(frame, kind):
        # Columns where every single cell is an instance of `kind`.
        mask = (frame.applymap(type) == kind).all()
        return mask[mask].index.tolist()

    df = df.reset_index()
    print(f"original shape: {df.shape}")
    print(f"original columns: {df.columns}")

    list_columns = _typed_columns(df, list)
    dict_columns = _typed_columns(df, dict)
    print(f"lists: {list_columns}, dicts: {dict_columns}")

    while list_columns or dict_columns:
        # Track only the columns created this pass; the next scan is limited
        # to them, since older columns are already flat.
        new_columns = []

        for col in dict_columns:
            print(f"flattening: {col}")
            # Spread each dict's keys into sibling columns, prefixed with the
            # parent column name, then drop the original dict column.
            expanded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
            expanded.index = df.index
            df = pd.concat([df, expanded], axis=1).drop(columns=[col])
            new_columns.extend(expanded.columns)

        for col in list_columns:
            print(f"exploding: {col}")
            # One output row per list element; the join on the (duplicated)
            # index replicates the other columns alongside each element.
            df = df.drop(columns=[col]).join(df[col].explode().to_frame())
            new_columns.append(col)

        list_columns = _typed_columns(df[new_columns], list)
        dict_columns = _typed_columns(df[new_columns], dict)
        print(f"lists: {list_columns}, dicts: {dict_columns}")

    print(f"final shape: {df.shape}")
    print(f"final columns: {df.columns}")
    return df