I have a dataframe with the following schema:
I try to fetch all the data from this dataframe. I use df.collect()
method to iterate through the entire dataframe and then pulling the values out of the columns one-by-one. But it seems like its not iterating through the entire tree and just pulling through the initial parent row only.
def parseCol(landing_df,data):
for i in landing_df.collect():
parent_id = i["parent_id"]
shared = "null"
if (len(i["children"]))>1:
# print(len(i["children"]))
# if(len(i["children"])>1):
data.append([i["project_id"],i["id"],i["name"],i["order"],i["pid"],i["created_date"],i["last_modified_date"], str(parent_id),i["description"],i["recursive"],i["links"][0][0],str(shared)])
for j in i["children"]:
if(('shared') not in (i)):
shared = 'null'
else:
shared = i['shared']
if(('project_id') not in (j)):
project_id = "null"
else:
project_id = j['project_id']
data.append([project_id,j["id"],j["name"],j["order"],j["pid"],j["created_date"],j["last_modified_date"],str(j["parent_id"]),j["description"],j["recursive"],j["links"][0][0],str(shared)])
# print(-1)
elif(len(i["children"])==0):
data.append([i["project_id"],i["id"],i["name"],i["order"],i["pid"],i["created_date"],i["last_modified_date"],"null",i["description"],i["recursive"],i["links"][0][0],str(shared)])
return data
Can someone suggest some better way to do this.