I want to create a nested JSON file in PySpark from the data shown below.
I want to convert it into a nested JSON file with the following structure:
{ "NewData" : [ {"id":"1","number":"smith","name":"uber","age":12}, {"id":"2","number":"jon","name":"lunch","age":13}, {"id":"3","number":"jocelyn","name":"rental","age":15}, {"id":"4","number":"megan","name":"sds","age":15}
] }
How can I write the correct output to a JSON file?
Can you help me achieve this?
# Sample rows; each tuple is laid out in schema order: (id, age, number, name).
data = [
    (1, 12, "smith", "uber"),
    (2, 13, "jon", "lunch"),
    (3, 15, "jocelyn", "rental"),
    (4, 15, "megan", "sds"),
]

# Explicit schema so column names and types are fixed rather than inferred.
# Every field is declared nullable; field order must match the tuples above.
schema = StructType(
    [
        StructField(column, dtype, True)
        for column, dtype in (
            ("id", IntegerType()),
            ("age", IntegerType()),
            ("number", StringType()),
            ("name", StringType()),
        )
    ]
)

# Build the DataFrame and print it without truncating cell contents.
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)
# Collapse the whole DataFrame into ONE row holding a single array column
# named "NewData", so the row serializes as {"NewData": [ {...}, {...} ]}.
#
# - A global agg() (no groupBy) replaces the literal helper column: the
#   helper only existed to force everything into one group, and its name
#   would collide with the "NewData" alias on the collected array.
# - The elements are structs, not to_json() strings: an array of JSON
#   strings is written as escaped text ("{\"id\":1,...}") instead of
#   nested objects.
# - id is cast to string because the target structure shows it quoted
#   ("id":"1") while age stays numeric.
# - Struct field order (id, number, name, age) fixes the key order of each
#   object in the output.
#
# NOTE: collect_list does not guarantee element order after a shuffle; sort
# the array afterwards if a stable order is required.
df2 = df.agg(
    F.collect_list(
        F.struct(
            F.col("id").cast("string").alias("id"),
            "number",
            "name",
            "age",
        )
    ).alias("NewData")
)

# Writing df2 with the DataFrame JSON writer (df2.write.json(<path>)) emits
# one JSON object per row, i.e. exactly the single nested document above.
df2.show(truncate=False)