2

I want to convert nested json into csv format including sub rows for grouped list/dict.

Here my json

data =\
{
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }, {
            "id": "2",
            "label": "Package 3",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }
            ]
        }
    ]
}
import pandas as pd

df = pd.json_normalize(data)

# display(df)
  description id       name                                                                                                                                                                                                packages item.description item.id
0         HLD  1  HIGHLEVEL  [{'id': '1', 'label': 'Package 1', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}, {'id': '2', 'label': 'Package 3', 'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]      description      11

Output of JSON to CSV Converter

"id","name","description","item__id","item__description","packages__id","packages__label","packages__products__id","packages__products__price"
"1","HIGHLEVEL","HLD","11","description","1","Package 1","1","5"
"","","","","","","","2","3"
"","","","","","2","Package 3","1","5"
"","","","","","","","2","3"

I tried pandas normalization but the results are not the same as wanted. JSON Array are not converted into sub rows in csv. I want to keep empty string in the csv.

I want to do the same but with a Python Script.

Achraf Bentabib
  • 72
  • 1
  • 13
  • What is the correct data? – yoonghm Aug 23 '21 at 09:44
  • The csv I want is the same as in my question – Achraf Bentabib Aug 23 '21 at 09:54
  • The data is the same as in the question, but it might be different. So the script has to be generalized for different structured of json data – Achraf Bentabib Aug 23 '21 at 10:10
  • The only way to write a general converter is to define how you want to handle nested lists and dicts. For example, what do you think the CSV for the following JSONs should look like? `{"a": ["b"]}`, `["a", {"b": ["c", "d"]}]`, `{"a": [{"b": {"c": "d"}}]}` – Kache Aug 23 '21 at 18:36
  • Does this answered post helpful? [Convert nested JSON to CSV file in Python](https://stackoverflow.com/questions/41180960/convert-nested-json-to-csv-file-in-python) – CMYang Aug 23 '21 at 18:50

3 Answers3

3

This should work for you:

from copy import deepcopy
import pandas


def cross_join(left, right):
    new_rows = [] if right else left
    for left_row in left:
        for right_row in right:
            temp_row = deepcopy(left_row)
            for key, value in right_row.items():
                temp_row[key] = value
            new_rows.append(deepcopy(temp_row))
    return new_rows


def flatten_list(data):
    for elem in data:
        if isinstance(elem, list):
            yield from flatten_list(elem)
        else:
            yield elem


def json_to_dataframe(data_in):
    def flatten_json(data, prev_heading=''):
        if isinstance(data, dict):
            rows = [{}]
            for key, value in data.items():
                rows = cross_join(rows, flatten_json(value, prev_heading + '_' + key))
        elif isinstance(data, list):
            rows = []
            if(len(data) != 0):
                for i in range(len(data)):
                    [rows.append(elem) for elem in flatten_list(flatten_json(data[i], prev_heading))]
            else:
                data.append("")
                [rows.append(elem) for elem in flatten_list(flatten_json(data[0], prev_heading))]
        else:
            rows = [{prev_heading[1:]: data}]
        return rows

    return pandas.DataFrame(flatten_json(data_in))

def remove_duplicates(df):
    columns = list(df)[:7]
    for c in columns:
        df[c] = df[c].mask(df[c].duplicated(), "")

    return df


if __name__ == '__main__':
    df = json_to_dataframe(data)
    df = remove_duplicates(df)

    print(df)
    df.to_csv('data.csv', index=False)

Input 01:

data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }, {
                    "id": "3",
                    "price": 9
                }
            ]
        }, {
            "id": "2",
            "label": "Package 3",
            "products": [{
                    "id": "1",
                    "price": 5
                }, {
                    "id": "2",
                    "price": 3
                }, {
                    "id": "3",
                    "price": 9
                }
            ]
        }
    ]
}

Output 01:

enter image description here

Input 02:

data = {
    "id": "1",
    "name": "HIGHLEVEL",
    "description": "HLD",
    "item": {
        "id": "11",
        "description": "description"
    },
    "packages": [{
            "id": "1",
            "label": "Package 1",
            "products": []
        }, {
            "id": "2",
            "label": "Package 3",
            "products": []
        }
    ]
}

Output 02: enter image description here

Hope it will resolve your issue. If you need any explanation then please let me know.

Thanks

Sabil
  • 3,750
  • 1
  • 5
  • 16
0

Thanks @Trenton McKinney

import pandas as pd
import json

data =\
{'description': 'HLD',
 'id': '1',
 'item': {'description': 'description', 'id': '11'},
 'name': 'HIGHLEVEL',
 'packages': [{'id': '1',
               'label': 'Package 1',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]},
              {'id': '2',
               'label': 'Package 3',
               'products': [{'id': '1', 'price': 5}, {'id': '2', 'price': 3}]}]}


df = pd.json_normalize(data, record_path=['packages'], meta=['id', 'name', 'description', ['item', 'id'], ['item', 'description']], meta_prefix='top_', sep='_')
df = df.explode('products')

df.rename({'id': 'packages_id', 'label': 'packages_label'}, axis=1, inplace=True)
df = df.join(pd.DataFrame(df.pop('products').values.tolist()))

df.rename({'id': 'packages_products_id', 'price': 'packages_products_price'}, axis=1, inplace=True)

df.columns = df.columns.str.replace('top_', '')

df = df[['id', 'name', 'description', 'item_id', 'item_description', 'packages_id', 'packages_label', 'packages_products_id', 'packages_products_price']]

columns_to_group = ["name", "description", "item_id", "item_description", "packages_id", "packages_label"]

for c in columns_to_group:
  df[c] = df[c].mask(
    df[c].duplicated(), ""
  )

print(df)

df.to_csv('data.csv', index=False)

Now I have to make it more general so it can be used for any structured json.

Achraf Bentabib
  • 72
  • 1
  • 13
0
import json  # For JSON loading
import csv  # For CSV dict writer


def get_leaves(item, key=None, key_prefix=""):
    """
    This function converts nested dictionary structure to flat
    """
    if isinstance(item, dict):
        leaves = {}
        """Iterates the dictionary and go to leaf node after that calls to get_leaves function recursively to go to leaves level"""
        for item_key in item.keys():
            """Some times leaves and parents or some other leaves might have same key that's why adding leave node key to distinguish"""
            temp_key_prefix = (
                item_key if (key_prefix == "") else (key_prefix + "_" + str(item_key))
            )
            leaves.update(get_leaves(item[item_key], item_key, temp_key_prefix))
        return leaves
    elif isinstance(item, list):
        leaves = {}
        elements = []
        """Iterates the list and go to leaf node after that if it is leave then simply add value to current key's list or 
        calls to get_leaves function recursively to go to leaves level"""
        for element in item:
            if isinstance(element, dict) or isinstance(element, list):
                leaves.update(get_leaves(element, key, key_prefix))
            else:
                elements.append(element)
        if len(elements) > 0:
            leaves[key] = elements
        return leaves
    else:
        return {key_prefix: item}


with open("./campaign-summary.json") as f_input, open("./finalised_output.csv", "w", newline="") as f_output:
    json_data = json.load(f_input, strict=False)
    """'First parse all entries to get the unique fieldnames why because already we have file in RAM level and
    if we put each dictionary after parsing in list or some data structure it will crash your system due to memory constraint
    that's why first we will get the keys first then we convert each dictionary and put it to CSV"""
    fieldnames = set()
    for entry in json_data:
        fieldnames.update(get_leaves(entry).keys())
    csv_output = csv.DictWriter(f_output, delimiter=";", fieldnames=sorted(fieldnames))
    csv_output.writeheader()
    csv_output.writerows(get_leaves(entry) for entry in json_data)