The following code provides everything you need to accomplish this:
data = {
    "key1": {
        "subkey1": "1.1",
        "subkey2": "1.2"
    },
    "key2": {
        "subkey1": "2.1",
        "subkey2": "2.2",
        "subkey3": {
            "child3": {
                "subchild3": "2.3.3.3"
            }
        }
    }
}
print(data)

ID = 0
lstRows = []

def getTableRow(data, key):
    # Recursively walk the nested dictionary and collect one table row
    # per leaf value; the nesting levels are joined with '.' in the key.
    global lstRows, ID
    for k, v in data.items():
        if isinstance(v, dict):
            if key == '':
                getTableRow(v, k)
            else:
                getTableRow(v, key + '.' + k)
        else:
            ID += 1
            lstRows.append({"ID": ID, "key": key + '.' + k, "value": v})

getTableRow(data, '')
print(lstRows)
dctTable = {"ID":[],"key":[], "value":[]}
for dct in lstRows:
dctTable["ID"].append(dct["ID"])
dctTable["key"].append(dct["key"])
dctTable["value"].append(dct["value"])
print( dctTable )
import pandas as pd
df = pd.DataFrame.from_dict(dctTable)
# df = pd.DataFrame(lstRows) # equivalent to above .from_dict()
# df = pd.DataFrame(dctTable) # equivalent to above .from_dict()
print(df)
This prints:
{'key1': {'subkey1': '1.1', 'subkey2': '1.2'}, 'key2': {'subkey1': '2.1', 'subkey2': '2.2', 'subkey3': {'child3': {'subchild3': '2.3.3.3'}}}}
[{'ID': 1, 'key': 'key1.subkey1', 'value': '1.1'}, {'ID': 2, 'key': 'key1.subkey2', 'value': '1.2'}, {'ID': 3, 'key': 'key2.subkey1', 'value': '2.1'}, {'ID': 4, 'key': 'key2.subkey2', 'value': '2.2'}, {'ID': 5, 'key': 'key2.subkey3.child3.subchild3', 'value': '2.3.3.3'}]
{'ID': [1, 2, 3, 4, 5], 'key': ['key1.subkey1', 'key1.subkey2', 'key2.subkey1', 'key2.subkey2', 'key2.subkey3.child3.subchild3'], 'value': ['1.1', '1.2', '2.1', '2.2', '2.3.3.3']}
   ID                            key    value
0   1                   key1.subkey1      1.1
1   2                   key1.subkey2      1.2
2   3                   key2.subkey1      2.1
3   4                   key2.subkey2      2.2
4   5  key2.subkey3.child3.subchild3  2.3.3.3
It uses a recursive function to create the rows of the resulting table.
As I don't use pyspark, the table shown above was created with Pandas.
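If you do have pyspark, the rows should translate directly. A minimal, untested sketch, assuming pyspark is installed and reusing the Pandas df built above (the SparkSession setup is my assumption, not part of the original code):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
# Build a spark DataFrame from the Pandas DataFrame created above:
sdf = spark.createDataFrame(df)
sdf.show(truncate=False)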
See also here ( "Flatten nested dictionaries, compressing keys"
) for a general and flexible way of flattening a nested dictionary handling also values being lists.
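For illustration, here is a minimal sketch of such a flattener that also descends into lists (this is my own variant, not the code from the linked answer; list elements are keyed by their index):

def flatten(obj, key='', rows=None):
    # Flatten nested dicts and lists into rows of (dotted key, value).
    if rows is None:
        rows = []
    if isinstance(obj, dict):
        for k, v in obj.items():
            flatten(v, key + '.' + k if key else k, rows)
    elif isinstance(obj, list):
        for i, v in enumerate(obj):
            flatten(v, key + '.' + str(i) if key else str(i), rows)
    else:
        rows.append({"key": key, "value": obj})
    return rows

print(flatten({"a": {"b": [1, 2, {"c": 3}]}}))
# [{'key': 'a.b.0', 'value': 1}, {'key': 'a.b.1', 'value': 2}, {'key': 'a.b.2.c', 'value': 3}]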
The code below contains the further instructions and explanations requested in the comments:
# ======================================================================
# You can read the content of a json file directly into dct_data using
# Python's json.load(fp) function (with fp = open(filename)).
# This code starts with the json file content stored in str_data:
str_data = """
{ "key1": {
"subkey1":"1.1",
"subkey2":"1.2"
},
"key2": {
"subkey1":"2.1",
"subkey2":"2.2",
"subkey3": {
"child3": {
"subchild3":"2.3.3.3"
}
}
}
}"""
# print(str_data)
# Let's create a Python dictionary from the json data string:
import json
dct_data = json.loads(str_data) # or = json.load(open(filename))
print(dct_data)
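# If your json data lives in a file, you would typically read it with a
# context manager instead ('data.json' is just a placeholder name):
# with open('data.json') as fp:
#     dct_data = json.load(fp)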
# Here is the function for flattening the dictionary dct_data; it
# returns a dictionary with the flattened content:
def flattenNestedDictionary(dct_data, key='', lstRows=None):
    if lstRows is None:  # avoid the mutable-default-argument pitfall
        lstRows = []
    for k, v in dct_data.items():
        new_key = key + '.' + k if key else k
        if isinstance(v, dict):
            flattenNestedDictionary(v, new_key, lstRows)
        else:
            # leaf value reached: the current row count yields a unique ID
            lstRows.append({"ID": len(lstRows) + 1, "key": new_key,
                            "value": v})
    if key == '':
        # only true in the top-level call: lstRows now has all the
        # required content, so let's create the flattened dictionary:
        print('lstRows:', lstRows)
        dct_flattened_json = {"ID": [], "key": [], "value": []}
        for dct in lstRows:
            dct_flattened_json["ID"].append(dct["ID"])
            dct_flattened_json["key"].append(dct["key"])
            dct_flattened_json["value"].append(dct["value"])
        print('#', dct_flattened_json)
        return dct_flattened_json
dct_flattened_json = flattenNestedDictionary(dct_data)
# Let's create a valid json data string out of the dictionary:
str_flattened_json = json.dumps(dct_flattened_json)
print('>', str_flattened_json)
# You can now write the str_flattened_json string to a file and load
# the new json file with the flattened data into a spark DataFrame.
# Or you can load the string str_flattened_json into a spark DataFrame
# directly.
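To round this off, a minimal, untested sketch of that last step, assuming pyspark is installed (the file name 'flattened.json' and the SparkSession setup are placeholders of mine, not part of the original code):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Write the flattened json string to a file ...
with open('flattened.json', 'w') as fp:
    fp.write(str_flattened_json)

# ... and read it back. multiLine=True is needed because the file holds
# one single json object, not one object per line (spark's default).
# Note this gives a single row whose columns are arrays; for one row
# per leaf value, build the DataFrame from the row data instead:
df_spark = spark.read.json('flattened.json', multiLine=True)

import pandas as pd
df_rows = spark.createDataFrame(pd.DataFrame(dct_flattened_json))
df_rows.show(truncate=False)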