How can I transform the first dataframe into the second dataframe (as shown in the image) with PySpark? From my research, I believe I need something like an unpivot function. The column "Ano" is the year.
At the moment I have this code, based on the answer Unpivot in Spark SQL / PySpark:
from pyspark.sql.types import StructType, StructField, StringType, FloatType
from pyspark.sql.functions import split

# Sample wide-format data: one column per (year, waste-stream) pair,
# plus the material name and its group.
data = [
    ("High Paper Grade", 0.007, 0.005, 0.001, 0.026, 0.013, 0.009, 0.001, 0.059, "Paper"),
    ("Mixed Low Grade Paper", 0.087, 0.068, 0.024, 0.314, 0.093, 0.074, 0.024, 0.319, "Paper"),
]
schema = StructType([
    StructField("Material", StringType(), True),
    StructField("2017_Aggregate", FloatType(), True),
    StructField("2017_Refuse", FloatType(), True),
    StructField("2017_MGP", FloatType(), True),
    StructField("2017_Paper", FloatType(), True),
    StructField("2013_Aggregate", FloatType(), True),
    StructField("2013_Refuse", FloatType(), True),
    StructField("2013_MGP", FloatType(), True),
    StructField("2013_Paper", FloatType(), True),
    StructField("Material_Group", StringType(), True),
])
df = spark.createDataFrame(data=data, schema=schema)

# Unpivot with stack(). NOTE: identifiers that start with a digit
# (2017_Aggregate, ...) must be backtick-quoted in Spark SQL, otherwise
# the parser rejects them.
df = df.selectExpr(
    "Material",
    "Material_Group",
    """stack(8,
        '2017_Aggregate', `2017_Aggregate`,
        '2017_Refuse',    `2017_Refuse`,
        '2017_MGP',       `2017_MGP`,
        '2017_Paper',     `2017_Paper`,
        '2013_Aggregate', `2013_Aggregate`,
        '2013_Refuse',    `2013_Refuse`,
        '2013_MGP',       `2013_MGP`,
        '2013_Paper',     `2013_Paper`) as (Year, Value)""",
).where("Value is not null")

# BUG FIX: the original code chained .show() onto the assignment
# (df = df.selectExpr(...).show()). show() returns None, so df became
# None and every subsequent step failed. Keep show() as a separate,
# side-effect-only call.
df.show()

# The "Year" label has the shape "YYYY_Stream"; take the part before the
# underscore instead of enumerating every column in a when() chain
# (the original chain also duplicated its first branch).
df = df.withColumn("Year", split(df.Year, "_")[0])
df.toPandas()
This gives me the following output: Output
This is what i am trying to achieve: