If you use nested list comprehension instead of nested for-loops:
# import pandas as pd
# import ast ## I just needed this to parse the RESPONSE column from csv
# df = pd.read_csv('https://raw.githubusercontent.com/ajayvd/dataframe/main/data_sub.csv')
# df['RESPONSE'] = df['RESPONSE'].apply(ast.literal_eval) # maybe only after read_csv
# Default inner keys to extract from every estimate entry.
k3List = ('provider', 'storeExternalId')


def get_e(resp_v):
    """Return the entries stored under 'store-boundary-dsp' -> 'estimates'."""
    return resp_v['store-boundary-dsp']['estimates']


def get_separate_lists(data_frame, column='RESPONSE', k3List=k3List, get_l=get_e):
    """Collect the values of one or more inner keys into flat lists.

    Args:
        data_frame: Object indexable by ``column`` that yields one response
            value per row (e.g. a DataFrame).
        column: Name of the column holding the response values.
        k3List: A single key, or a list/tuple/set of keys, to look up in
            each inner entry.
        get_l: Function mapping one response value to an iterable of
            inner entries (dicts).

    Returns:
        One list of values when ``k3List`` is a single key; otherwise a
        list of such lists, one per key, in the iteration order of
        ``k3List``.
    """
    def get_k3(k3):
        # Entries lacking the key are skipped, so the lists returned for
        # different keys are not guaranteed to have the same length.
        return [e[k3] for resp in data_frame[column] for e in get_l(resp) if k3 in e]

    is_list = isinstance(k3List, (list, tuple, set))
    lists = [get_k3(k) for k in (k3List if is_list else [k3List])]
    return lists if is_list else lists[0]
provider, store_id = get_separate_lists(df)
# provider = get_separate_lists(df, k3List='provider')
# store_id = get_separate_lists(df, 'RESPONSE', 'storeExternalId')
[k3List
can be a single key or a list (or tuple or set) of keys, and get_l
should be a function.]
If you want parallel lists, you can start with a list of tuples and then unpack and zip
to basically "unzip" them into separate lists:
# k3List, get_e = ... ## as before
def get_tuple_lists(data_frame, column='RESPONSE', k3List=k3List, get_l=get_e):
    """Return one tuple per inner entry, holding its value for each key.

    Args:
        data_frame: Object indexable by ``column`` that yields one response
            value per row (e.g. a DataFrame).
        column: Name of the column holding the response values.
        k3List: Iterable of keys to look up in each inner entry; a single
            string is treated as one key.
        get_l: Function mapping one response value to an iterable of
            inner entries (dicts).

    Returns:
        A list of tuples, one per inner entry, ordered like ``k3List``.
        A key missing from an entry yields ``None`` in that position, so
        every tuple has the same length.
    """
    # A bare string would otherwise be iterated character by character.
    keys = (k3List,) if isinstance(k3List, str) else tuple(k3List)
    return [
        tuple(e.get(k3) for k3 in keys)
        for resp in data_frame[column]
        for e in get_l(resp)
    ]
provider_stores = get_tuple_lists(df)
provider, store_id = [list(t) for t in zip(*provider_stores)]
# provider, store_id = list(zip(*provider_stores)) ## 2 tuples instead of 2 lists
With either function, print(f'{store_id=}\n{provider=}')
should print
store_id=['1504', '1504', '9346', '9346', '1035', '4883', '3791', '5464', '5464', '3869', '3869', '7510', '6221', '5708', '5708', '3465']
provider=['Instacart', 'DoorDash', 'DoorDash', 'Uber', 'DoorDash', 'DoorDash', 'DoorDash', 'Postmates', 'DoorDash', 'Skipcart', 'DoorDash', 'DoorDash', 'DoorDash', 'Postmates', 'DoorDash', 'DoorDash']
But the direct output of get_tuple_lists
would look like
provider_stores=[('Instacart', '1504'), ('DoorDash', '1504'), ('DoorDash', '9346'), ('Uber', '9346'), ('DoorDash', '1035'), ('DoorDash', '4883'), ('DoorDash', '3791'), ('Postmates', '5464'), ('DoorDash', '5464'), ('Skipcart', '3869'), ('DoorDash', '3869'), ('DoorDash', '7510'), ('DoorDash', '6221'), ('Postmates', '5708'), ('DoorDash', '5708'), ('DoorDash', '3465')]
If you're not sure that the outer keys used (like store-boundary-dsp
and estimates
above) exist on every row, you can use try...except
in get_l
:
def get_b(resp_v):
    """Return the entries under 'store-boundary-dsp' -> 'boundaries'.

    Returns an empty list when either key is missing or when a level of
    the response is not subscriptable (e.g. None or NaN).
    """
    try:
        return resp_v['store-boundary-dsp']['boundaries']
    except (KeyError, TypeError):
        # Missing key, or a non-dict value somewhere along the path.
        return []
boundary_names = set(get_separate_lists(df, k3List='name', get_l=get_b))
# --> # boundary_names={'9346 - Area - 1', '1504 - Primary', '1504-Primary'}
Just some notes about the snippet in your question:
for index, row in data_frame.iterrows():
for i in range(0,len(row[column])):
for k,v in row[column]['store-boundary-dsp']['estimates'][i].items():
# if k==....
- You don't really need to use
.iterrows()
or range
or .items()
here - you can just use
# Walk the column values directly; each one holds the list of estimate dicts.
for rc in data_frame[column]:
    for est in rc['store-boundary-dsp']['estimates']:
        # Each key is checked on its own, so an estimate missing one key
        # still contributes its other value.
        if 'storeExternalId' in est:
            store_id.append(est['storeExternalId'])
        if 'provider' in est:
            provider.append(est['provider'])
- Even in the
for k,v
loop, defining store_val
or provider_val
(as v
) is redundant [unless you plan to use them outside of their respective if
blocks or plan to modify v
in some way] when you can just .append(v)
- You could also extract any number of lists [from
store-boundary-dsp.estimates
] by adding them to the all_lists
dictionary below [instead of coding more if
s inside for est...
]
# Map each inner key to the list that collects its values; the walrus
# assignments also bind store_id and provider as ordinary names.
all_lists = {
    'storeExternalId': (store_id := []),
    'provider': (provider := []),
}
for rc in data_frame[column]:
    for est in rc['store-boundary-dsp']['estimates']:
        for k, collected in all_lists.items():
            # Keys absent from this estimate are skipped.
            if k in est:
                collected.append(est[k])
- You could also use a nested list comprehension instead of nested for-loops:
def get_k3(k3, k2='estimates', k1='store-boundary-dsp'):
    """Return every value stored under ``k3`` in the entries at ``resp[k1][k2]``.

    Reads ``data_frame`` and ``column`` from the enclosing scope rather
    than taking them as parameters. Entries without ``k3`` are skipped.
    """
    return [e[k3] for resp in data_frame[column] for e in resp[k1][k2] if k3 in e]
k3List = ['provider', 'storeExternalId'] ## line them up EXACTLY
provider, store_id = [get_k3(k) for k in k3List]
Btw, you can also use .explode
and json_normalize
to completely flatten the DataFrame:
# Normalize each RESPONSE value into columns and keep CREATEDAT alongside them.
df1 = pd.concat([df[['CREATEDAT']], pd.json_normalize(df['RESPONSE'])], axis=1)
# Drop everything up to and including the first '.' in each column name
# (names without a '.' are left unchanged).
df1.columns = [c.split('.',1)[-1] for c in df1.columns]
# NOTE(review): presumably lCols hold plain lists and dlCols hold lists of
# dicts - confirm against the actual data.
lCols = ['value', 'errors', 'tags.1504']
dlCols = ['boundaries', 'distances', 'estimates', 'fulfillments']
# Explode the listed columns one after another.
for c in (lCols+dlCols): df1 = df1.explode(c)
# Replace each dlCols column with its normalized columns, renamed to
# '<column>.<key>'; the index is reset first so the pieces line up by position.
df1 = pd.concat([df1.drop(dlCols, axis=1).reset_index(drop=True), *[
pd.json_normalize(df1[c]).rename(columns=lambda cn: f'{c}.{cn}')
for c in dlCols
]], axis=1)#.dropna(axis='columns', thresh=140)
## dropna(axis='columns',thresh=N)--> only keep columns with at least N non-empty cells

