In addition to my previous answer, you might also want to convert a JSON file to tuples that can be inserted into a database. In that case, you can use the following approach.
Before running the following script, you must import the `Dict` class defined in this post: https://stackoverflow.com/a/70908985/16109419 (just copy and paste it). This class is fundamental to our approach to serializing objects in JSON.
Python Script
Definition of the `JSON` class:
from __future__ import annotations
from collections import OrderedDict
import logging
# Configure the root logger at import time so that DEBUG and INFO messages are emitted.
logging.basicConfig(level="DEBUG")
# Module-level logger used by the code in this script.
LOGGER = logging.getLogger(__name__)
class JSON:
    """
    Flattens a list of (possibly nested) dictionaries into column keys and value rows.

    Notes
    -----
    ``serialize`` (and therefore ``__init__``) relies on the external ``Dict`` class,
    which must be available in the module namespace (see reference [2] in ``__init__``).
    """

    def __init__(self, dict_list: list[dict], key_sep: str = "_", dump_objects: bool = False):
        """
        Instantiates a JSON processing object.

        Parameters
        ----------
        dict_list: list[dict]
            List of dictionaries.
        key_sep: str
            Nested keys separator.
        dump_objects: bool
            Whether to dump objects.

        References
        ----------
        [1] 'JSON' class: https://stackoverflow.com/a/70791993/16109419
        [2] 'Dict' class: https://stackoverflow.com/a/70908985/16109419
        """
        self.key_sep = key_sep
        self.dump_objects = dump_objects

        # Serializing dictionaries before processing them:
        self.dict_list = [self.serialize(data=d, dump_objects=dump_objects) for d in dict_list]

    @staticmethod
    def serialize(data: dict, dump_objects: bool = False) -> dict:
        """
        Serializes the objects contained in the dictionary.

        Parameters
        ----------
        data: dict
            Dictionary.
        dump_objects: bool
            Whether to dump objects.

        Returns
        -------
        data: dict
            Dictionary.

        Notes
        -----
        This method is required to handle data types not supported by the JSON standard.
        Values exposing ``isoformat`` (e.g., date/datetime) are converted to their ISO string
        only when `dump_objects` is set; otherwise they are kept as they are.
        Values exposing ``__dict__`` (custom objects) are replaced by their serialized
        attributes when `dump_objects` is set or when ``str(value)`` equals ``str(vars(value))``;
        otherwise they are replaced by ``str(value)``.
        """
        serialized_d = Dict(data=data)

        for keys, value in serialized_d.items():
            if hasattr(value, 'isoformat'):  # Date/Datetime object.
                parsed_value = value.isoformat() if dump_objects else value
            elif hasattr(value, '__dict__'):  # Custom object.
                value_vars = vars(value)
                value_str = str(value)
                # "Dict-based" object: its string form is exactly the one of its attributes.
                is_dict_based = str(value_vars) == value_str

                if is_dict_based or dump_objects:
                    parsed_value = JSON.serialize(data=value_vars, dump_objects=dump_objects)
                else:
                    parsed_value = value_str
            else:  # Natively supported value: nothing to replace.
                continue

            serialized_d.set(keys=keys, value=parsed_value)

        return serialized_d.data

    def dict_to_list(
        self,
        sub_tree: object,
        current_list: list,
        items_list: list[list]
    ) -> list[list]:
        """
        Convert dictionary to items list.

        Parameters
        ----------
        sub_tree: object
            Current node: a dictionary (branch), a list/tuple (repeated values) or a leaf value.
        current_list: list
            Keys path leading to `sub_tree`.
        items_list: list[list]
            Accumulator, extended in place with ``[*keys_path, value]`` items.

        Returns
        -------
        items_list: list[list]
            List of items list (the same object received as `items_list`).

        Notes
        -----
        The node type is checked explicitly instead of relying on a bare ``except``:
        every non-container value (including empty strings) is kept as a leaf, and
        lists nested in lists are flattened under the same keys path.
        """
        if isinstance(sub_tree, dict):  # Tree branches.
            for key, value in sub_tree.items():
                self.dict_to_list(
                    sub_tree=value,
                    current_list=current_list + [key],
                    items_list=items_list
                )
        elif isinstance(sub_tree, (list, tuple)):  # Repeated values under the same keys path.
            for sub_item in sub_tree:
                self.dict_to_list(
                    sub_tree=sub_item,
                    current_list=current_list,
                    items_list=items_list
                )
        else:  # Tree leaf.
            items_list.append(current_list + [sub_tree])

        return items_list

    def extract_entries(self) -> list[list[tuple[str, object]]]:
        """
        Extracts entries from a dictionary.

        Returns
        -------
        entries: list[list[tuple[str, object]]]
            List of key-value items list (one list per dictionary in ``self.dict_list``).
            Keys are the nested keys joined with ``self.key_sep``.
        """
        entries = []

        for parent in self.dict_list:
            children = self.dict_to_list(sub_tree=parent, current_list=[], items_list=[])
            # Last position holds the value; everything before it is the keys path.
            # Key parts are converted to str so non-string dictionary keys can be joined.
            entries.append(
                [(self.key_sep.join(map(str, child[:-1])), child[-1]) for child in children]
            )

        return entries

    @staticmethod
    def get_nth_element(
        items: list[tuple[str, object]],
        element: str,
        nth: int = 1
    ) -> tuple[tuple[str, object] | None, bool]:
        """
        Get nth element (occurrence) from items list.

        Parameters
        ----------
        items: list[tuple[str, object]]
            Items list.
        element: str
            Item key.
        nth: int
            Nth element position.

        Returns
        -------
        nth_element, index_out_of_bounds: (tuple[str, object] | None, bool)
            Nth element, and whether it was not found.
            When `nth` exceeds the number of occurrences, the last occurrence is returned
            (with the flag set); when there is no occurrence at all, ``None`` is returned.

        Raises
        ------
        ValueError
            If `nth` is lower than 1.
        """
        # Explicit check: an `assert` would be stripped when running with `-O`.
        if nth < 1:
            raise ValueError(f"'nth' ({nth}) must be >= 1.")

        occurrences = [item for item in items if item[0] == element]

        if not occurrences:
            return None, True

        n_occurrences = len(occurrences)

        return occurrences[min(nth, n_occurrences) - 1], nth > n_occurrences

    def to_tuples(self) -> tuple[list[str], list[list]]:
        """
        Convert JSON semi-structured data into structured tuples data.

        Returns
        -------
        keys, values: (list[str], list[list])
            List of keys and list of value rows. Each row is a list aligned with `keys`.

        Notes
        -----
        A key occurring several times in an entry (values coming from a list) produces
        one row per occurrence; keys with fewer occurrences repeat their last value,
        and keys missing from the entry are filled with ``None``.
        Rows containing only ``None`` values are skipped.

        Examples
        --------
        data = {
            "values":
                [
                    {
                        "A": 0,
                        "B": 1,
                        "C": 2
                    },
                    {
                        "C": {
                            "E": 3,
                            "F": 4
                        },
                        "D": [
                            {
                                "G": 5
                            },
                            {
                                "H": 6
                            }
                        ]
                    }
                ]
        }

        self.dict_list = data['values']

        ... return (
            ["A", "B", "C", "C_E", "C_F", "D_G", "D_H"],
            [
                [0, 1, 2, None, None, None, None],
                [None, None, None, 3, 4, 5, 6]
            ]
        )
        """
        LOGGER.info("Extracting values tuples from JSON file...")

        entries = self.extract_entries()

        # Unique keys, in order of first appearance (dict preserves insertion order).
        keys = list(
            dict.fromkeys(
                key
                for samples in entries
                for key, _ in samples
            )
        )

        n_entries = len(entries)
        n_keys = len(keys)
        values = []

        for index, tuples in enumerate(entries, start=1):
            LOGGER.debug(
                "Processing values from entry %d/%d (%.2f%%)...",
                index, n_entries, (index / n_entries) * 100
            )

            # Grouping the values by key once, instead of rescanning the entry per key and row:
            occurrences = {}
            for key, value in tuples:
                occurrences.setdefault(key, []).append(value)

            # One row per occurrence of the most repeated key.
            n_rows = max((len(found) for found in occurrences.values()), default=0)

            for i in range(1, n_rows + 1):
                row = []

                for c in keys:
                    found = occurrences.get(c)
                    # Same rule as `get_nth_element`: clamp to the last occurrence.
                    row.append(found[min(i, len(found)) - 1] if found else None)

                if row.count(None) != n_keys:
                    values.append(row)

        LOGGER.info("Successfully extracted values tuples from JSON file!")

        return keys, values
Definition of the `main` function, used to run the examples:
from datetime import date, datetime
class B:
    """Sample custom object whose single attribute holds nested dictionaries and a list."""

    def __init__(self) -> None:
        # Nested structure (dict -> dict -> list of dicts) used by the examples.
        self.data = {'a': {'b': [{'c': 0}, {'d': 1}]}}
class A:
    """Sample custom object holding two string attributes and a nested `B` instance."""

    def __init__(self) -> None:
        self.name = "my_name"
        self.attr = "my_attr"
        # Nested custom object, so the examples cover objects contained in objects.
        self.b = B()
def main(dump_objects: bool) -> None:
    """
    Runs the JSON to tuples example and prints its results.

    Builds a single sample dictionary (custom object, integer, date, datetime and
    nested dictionaries), processes it with the `JSON` class, and prints the
    serialized dictionaries followed by the extracted keys and value rows.

    Parameters
    ----------
    dump_objects: bool
        Whether to dump objects (forwarded to the `JSON` class).
    """
    data = {'values': [{'A': A(), 'C': 3, 'D': date.today(), 'E': datetime.now(), 'F': {'G': {'H': 8}}}]}
    json_data = JSON(dict_list=data['values'], dump_objects=dump_objects)

    print("JSON to tuples test:\n")

    # Dictionaries as stored by the `JSON` object (i.e., after serialization):
    for i, d in enumerate(json_data.dict_list, start=1):
        print(f"d{i} = {d}\n")

    keys, values = json_data.to_tuples()

    print("\nKeys:")
    for i, key in enumerate(keys, start=1):
        print(f"\t{i}. {key}")

    print("\nValues:")
    for i, row in enumerate(values, start=1):
        print(f"\t{i}:")
        # Each value is printed with its type to show the effect of `dump_objects`.
        for j, value in enumerate(row, start=1):
            print(f"\t\t{j}. {value} ({type(value)})")
Example 1: not dumping objects
Calling the `main` function:
# Objects are not dumped: date/datetime values are kept as objects and custom objects become their str() form.
main(dump_objects=False)
Output:
JSON to tuples test:
d1 = {'A': '<__main__.A object at 0x000002002F436290>', 'C': 3, 'D': datetime.date(2022, 2, 3), 'E': datetime.datetime(2022, 2, 3, 15, 54, 4, 874847), 'F': {'G': {'H': 8}}}
INFO:__main__:Extracting values tuples from JSON file...
DEBUG:__main__:Processing values from entry 1/1 (100.00%)...
INFO:__main__:Successfully extracted values tuples from JSON file!
Keys:
1. A
2. C
3. D
4. E
5. F_G_H
Values:
1:
1. <__main__.A object at 0x000002002F436290> (<class 'str'>)
2. 3 (<class 'int'>)
3. 2022-02-03 (<class 'datetime.date'>)
4. 2022-02-03 15:54:04.874847 (<class 'datetime.datetime'>)
5. 8 (<class 'int'>)
Example 2: dumping objects
Calling the `main` function:
# Objects are dumped: date/datetime values become isoformat() strings and custom objects are expanded into their attributes.
main(dump_objects=True)
Output:
JSON to tuples test:
d1 = {'A': {'name': 'my_name', 'attr': 'my_attr', 'b': {'data': {'a': {'b': [{'c': 0}, {'d': 1}]}}}}, 'C': 3, 'D': '2022-02-03', 'E': '2022-02-03T15:55:29.014941', 'F': {'G': {'H': 8}}}
INFO:__main__:Extracting values tuples from JSON file...
DEBUG:__main__:Processing values from entry 1/1 (100.00%)...
INFO:__main__:Successfully extracted values tuples from JSON file!
Keys:
1. A_name
2. A_attr
3. A_b_data_a_b_c
4. A_b_data_a_b_d
5. C
6. D
7. E
8. F_G_H
Values:
1:
1. my_name (<class 'str'>)
2. my_attr (<class 'str'>)
3. 0 (<class 'int'>)
4. 1 (<class 'int'>)
5. 3 (<class 'int'>)
6. 2022-02-03 (<class 'str'>)
7. 2022-02-03T15:55:29.014941 (<class 'str'>)
8. 8 (<class 'int'>)