I've been having to do a lot of this lately (pretend it's a csv, tabular is just easier read). Converting csv files to json with varying schemas.
I'd never call myself a "python programmer" but I work a lot with data and it's my language of choice for wrangling.
I'd like some help generalizing this, I think it'd be a good learning exercise so "code-for-me" isn't required. Just some nudges in the right direction. If you read all this ... thanks.
What is an appropriate structure for the metadata input? Just any tips/thoughts
Sample input, expected result, current state, and quarter baked goal state.
Toplvl TopLvlAttr1 TopLvlAttr2 SecondLvl SecondLvlAttr1 ThirdLvl ThirdLLvlAttr1
T1 TA1_1 TA2_1 S1 SA1_1 TR1 TRA_1
T1 TA1_1 TA2_1 S1 SA1_1 TR2 TRA_2
T1 TA1_1 TA2_1 S1 SA1_1 TR3 TRA_3
T1 TA1_1 TA2_1 S2 SA1_2 TR4 TRA_1
T1 TA1_1 TA2_1 S2 SA1_2 TR5 TRA_2
T1 TA1_1 TA2_1 S2 SA1_2 TR6 TRA_3
T2 TA2_1 TA2_2 S3 SA1_3 TR7 TRA_1
T2 TA2_1 TA2_2 S3 SA1_3 TR8 TRA_2
T2 TA2_1 TA2_2 S3 SA1_3 TR9 TRA_3
The expected json looks like
{
"T1": {
"TopLevelAttribute1": "TA1_1",
"TopLevelAttribute2": "TA1_2",
"SecondLevels": {
"S1": {
"SecondLevelAttribute1": "SA1_1",
"ThirdLevels": {
"TR1": {
"ThirdLevelAttribute1": "TRA_1"
},
"TR2": {
"ThirdLevelAttribute1": "TRA_2"
},
"TR3": {
"ThirdLevelAttribute1": "TRA_3"
}
}
},
"S2": {
"SecondLevelAttribute1": "SA2_1",
"ThirdLevels": {
"TR4": {
"ThirdLevelAttribute1": "TRA_5"
},
"TR5": {
"ThirdLevelAttribute1": "TRA_5"
},
"TR6": {
"ThirdLevelAttribute1": "TRA_6"
}
}
}
}
},
"T2": {
"TopLevelAttribute1": "TA2_1",
"TopLevelAttribute2": "TA2_2",
"SecondLevels": {
"S3": {
"SecondLevelAttribute1": "SA1_3",
"ThirdLevels": {
"TR7": {
"ThirdLevelAttribute1": "TRA_7"
},
"TR8": {
"ThirdLevelAttribute1": "TRA_8"
},
"TR9": {
"ThirdLevelAttribute1": "TRA_9"
}
}
}
}
}
}
The pattern that I just generally plug&chug with looks like
import json
with open('sample_csv_input.csv','r') as sd:
data_lines = sd.readlines()
sample_output = {}
for line in data_lines:
data_attributes = line.split(',')
toplvl, toplvl_attr1, toplvl_attr2, secondlvl, secondlvl_attr1, thirdlvl, thirdlvl_attr1 = data_attributes
third_level_element = {
thirdlvl: {
"ThirdLevelAttribute1": thirdlvl_attr1
}
}
if toplvl in sample_output.keys():
if secondlvl in sample_output[toplvl]["SecondLevels"].keys():
sample_output[toplvl]["SecondLevels"][secondlvl]["ThirdLevels"].update(third_level_element)
else:
sample_output[toplvl]["SecondLevels"][secondlvl] = {
"SecondLevelAttribute1": secondlvl_attr1,
"ThirdLevels": third_level_element}
else:
sample_output[toplvl] = {
"TopLevelAttribute1": toplvl_attr1,
"TopLevelAttribute2": toplvl_attr2,
"SecondLevels":{
secondlvl:{
"SecondLevelAttribute1": secondlvl_attr1,
"ThirdLevels": third_level_element
}
}
}
with open('sample_json.output.json','w') as so:
json.dump(sample_output, so, indent=4)
I know this is bad.
I'm trying to do something more generalized, like have a piece of metadata that defines the structure and a generic function that takes it from there. But getting stuck.
# Hierarchy Attribute Map
{
"TopLvl":{
"Attributes":[{"TopLevelAttribute1": "TopLvlAttr1","TopLevelAttribute2": "TopLvlAttr2"}],
"Children": [{"SecondLevels": "SecondLvl"}]
},
"SecondLvl":{
"Attributes":[{"SecondLevelAttribute1": "SecondLvlAttr1"}],
"Children": [{"ThirdLevels": "ThirdLvl"}]
},
"ThirdLvl"{
"Attributes":[{"SecondLevelAttribute1": "ThirdLLvlAttr1"}]
}
}
import csv
import json
class csv2json:
def __init__(self, input_csv_file, hierarchy_map):
self.input_csv_file = input_csv_file
self.labeled_rows = self.read_with_labels()
self.converted_json = self.convert2json()
def get_children(self):
pass
def get_attributes(self):
pass
def insert_element(self):
pass
def read_with_labels(self, input_csv_file):
csv_rows = []
with open(input_csv_file, 'r') as csvfile:
reader = csv.DictReader(csvfile)
field = reader.fieldnames
for row in reader:
csv_rows.extend([{field[i]: row[field[i]] for i in range(len(field))}])
return csv_rows
def convert2json(self):
pass
Any thoughts are appreciated