I have a DataFrame which is constituted of the following fields:
{
"cohort_key": datetime,
"last_name": str,
"file_identifier": str,
"file_status": str,
"responsible_label": str,
"status": str,
"earliest_appointment_date_key": datetime,
"latest_appointment_date_key_as_of_today": datetime,
"months_between_first_last_apppointment": int,
"count_appointments": int,
"count_hours": float,
"procedures": List[{"procedure_key": str, "unit_count": int, "charges_amount_laboratory": float}]
}
with "procedure" being a sparse column where all dicts have the same format (obtained from ARRAY_AGG(STRUCT<...>())
in SQL). Here is some example data:
[
{
"cohort_key": 1475280000000,
"last_name": "Georges",
"file_identifier": "5672",
"file_status": "open",
"responsible_label": "Dr John Doe",
"status": "probably_done",
"earliest_appointment_date_key": 1475452800000,
"latest_appointment_date_key_as_of_today": 1544054400000,
"months_between_first_last_apppointment": 26,
"count_appointments": 16,
"count_hours": 15.5,
"procedures": [
{
"procedure_key": "84000",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "89600",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "80000",
"unit_count": 1,
"charges_amount_laboratory": 0.0
}
],
"avg_charges_gross_amount_per_appointment": 464.06,
"avg_charges_gross_amount_per_hour": 479.03,
"charges_gross_amount_total": 7425.0,
"charges_amount_laboratory": 0.0
},
{
"cohort_key": 1475280000000,
"last_name": "White",
"file_identifier": "6690",
"file_status": "open",
"responsible_label": "Dr John Doe",
"status": "probably_done",
"earliest_appointment_date_key": 1480291200000,
"latest_appointment_date_key_as_of_today": 1579737600000,
"months_between_first_last_apppointment": 37,
"count_appointments": 13,
"count_hours": 10.6666666667,
"procedures": [
{
"procedure_key": "84000",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "80000",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "80000",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "80000",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "80672",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "80672",
"unit_count": 1,
"charges_amount_laboratory": 0.0
},
{
"procedure_key": "83200",
"unit_count": 1,
"charges_amount_laboratory": 83.0
}
],
"avg_charges_gross_amount_per_appointment": 558.92,
"avg_charges_gross_amount_per_hour": 681.19,
"charges_gross_amount_total": 7266.0,
"charges_amount_laboratory": 83.0
},
{
"cohort_key": 1475280000000,
"last_name": "Nguyen",
"file_identifier": "6568",
"file_status": "open",
"responsible_label": "Dr John Doe",
"status": "probably_done",
"earliest_appointment_date_key": 1479081600000,
"latest_appointment_date_key_as_of_today": 1479081600000,
"months_between_first_last_apppointment": 0,
"count_appointments": 1,
"count_hours": 0.5,
"procedures": [],
"avg_charges_gross_amount_per_appointment": 0.0,
"avg_charges_gross_amount_per_hour": 0.0,
"charges_gross_amount_total": 0.0,
"charges_amount_laboratory": 0.0
},
{
"cohort_key": 1475280000000,
"last_name": "Lee",
"file_identifier": "9502",
"file_status": "open",
"responsible_label": "Dr John Doe",
"status": "probably_done",
"earliest_appointment_date_key": 1478736000000,
"latest_appointment_date_key_as_of_today": 1478736000000,
"months_between_first_last_apppointment": 0,
"count_appointments": 1,
"count_hours": 0.5,
"procedures": [],
"avg_charges_gross_amount_per_appointment": 0.0,
"avg_charges_gross_amount_per_hour": 0.0,
"charges_gross_amount_total": 0.0,
"charges_amount_laboratory": 0.0
}
]
I would like to change the format of this table so that procedures
is expanded as a pivot table with:
index=[all other fields]
columns=procedure[procedure_key]
values=[procedure[unit_count], prodedure[charges_amount_laboratory]
So far my efforts have been mostly fruitless - I reached the point where I have the index and the columns but still have a dict as the value of the columns:
[
{
"cohort_key": 1467331200000,
"last_name": "Johnson",
"file_identifier": "1227",
"file_status": "open",
"responsible_label": "Dr John Doe",
"status": "probably_done",
"earliest_appointment_date_key": 1473206400000,
"latest_appointment_date_key_as_of_today": 1524009600000,
"months_between_first_last_apppointment": 19,
"count_appointments": 17,
"count_hours": 17.5,
"avg_charges_gross_amount_per_appointment": 397.06,
"avg_charges_gross_amount_per_hour": 385.71,
"charges_gross_amount_total": 6750.0,
"charges_amount_laboratory": 0.0,
"15109": null,
"15110": null,
"80000": { "unit_count": 1, "charges_amount_laboratory": 0.0 },
"80630": null,
"80650": null,
"80671": null,
"80672": null,
"81130": null,
"81207": null,
"81222": null,
"83113": null,
"83114": null,
"83200": null,
"83210": null,
"84000": { "unit_count": 1, "charges_amount_laboratory": 0.0 },
"84500": null,
"89600": { "unit_count": 18, "charges_amount_laboratory": 0.0 }
}
]
which was obtained through the following code
df_patients = df.copy()
def agg_procedures(procedures):
"""Returns single dict from list with procedure_key as key and sum of unit_count, charges_amount_laboratory as values"""
result = defaultdict(dict)
for procedure in procedures:
proc_dict = result[procedure['procedure_key']]
proc_dict['unit_count'] = procedure['unit_count'] + proc_dict.get('unit_count', 0)
proc_dict['charges_amount_laboratory'] = procedure['charges_amount_laboratory'] + proc_dict.get('charges_amount_laboratory', 0)
return result
df_procedures = df_patients[['cohort_key', 'last_name', 'file_identifier', 'procedures']].copy()
df_normalized_procedures = (
df_procedures["procedures"]
.apply(agg_procedures)
.apply(pd.Series) # see https://stackoverflow.com/questions/38231591/split-explode-a-column-of-dictionaries-into-separate-columns-with-pandas
)
df_normalized_procedures = df_normalized_procedures.reindex(sorted(df_normalized_procedures.columns), axis=1)
df_patients = df_patients.drop("procedures", axis=1)
df_concat = pd.concat([df_patients, df_normalized_procedures], axis=1)
df_concat = df_concat.set_index([col for col in df_patients])
but in the end this method feels like a dead-end - I don't know how to transform the dict into a sublevel. I figured my way of doing it may be wrong.
How would you go about doing this transformation?