Is there a possibility to make a pivot for different columns at once in PySpark? I have a dataframe like this:
from pyspark.sql import functions as sf
import pandas as pd
sdf = spark.createDataFrame(
pd.DataFrame([[1, 'str1', 'str4'], [1, 'str1', 'str4'], [1, 'str2', 'str4'], [1, 'str2', 'str5'],
[1, 'str3', 'str5'], [2, 'str2', 'str4'], [2, 'str2', 'str4'], [2, 'str3', 'str4'],
[2, 'str3', 'str5']], columns=['id', 'col1', 'col2'])
)
# +----+------+------+
# | id | col1 | col2 |
# +----+------+------+
# | 1 | str1 | str4 |
# | 1 | str1 | str4 |
# | 1 | str2 | str4 |
# | 1 | str2 | str5 |
# | 1 | str3 | str5 |
# | 2 | str2 | str4 |
# | 2 | str2 | str4 |
# | 2 | str3 | str4 |
# | 2 | str3 | str5 |
# +----+------+------+
I want to pivot it on multiple columns ("col1", "col2", ...) to have a dataframe that looks like this:
+----+-----------+-----------+-----------+-----------+-----------+
| id | col1_str1 | col1_str2 | col1_str3 | col2_str4 | col2_str5 |
+----+-----------+-----------+-----------+-----------+-----------+
| 1 | 2 | 2 | 1 | 3 | 3 |
| 2 | 0 | 2 | 2 | 3 | 1 |
+----+-----------+-----------+-----------+-----------+-----------+
I've found a solution that works:
sdf_pivot_col1 = (
sdf
.groupby('id')
.pivot('col1')
.agg(sf.count('id'))
)
sdf_pivot_col2 = (
sdf
.groupby('id')
.pivot('col2')
.agg(sf.count('id'))
)
sdf_result = (
sdf
.select('id').distinct()
.join(sdf_pivot_col1, on = 'id' , how = 'left')
.join(sdf_pivot_col2, on = 'id' , how = 'left')
).show()
# +---+----+----+----+----+----+
# | id|str1|str2|str3|str4|str5|
# +---+----+----+----+----+----+
# | 1| 2| 2| 1| 3| 2|
# | 2|null| 2| 2| 3| 1|
# +---+----+----+----+----+----+
But I'm looking for a more compact solution.