This answer may be a bit of overkill, but it does not use Pandas or collect anything to the driver, and it also works when you have multiple rows. We can simply pass an empty list as id_vars to the melt function from "How to melt Spark DataFrame?"
A working example would be as follows:
import findspark
findspark.init()

import pyspark as ps
from pyspark.sql import SQLContext
from pyspark.sql import DataFrame
from pyspark.sql.functions import array, col, explode, lit, struct
from typing import Iterable

# Reuse an existing SparkContext if one is already running
try:
    sc
except NameError:
    sc = ps.SparkContext()
sqlContext = SQLContext(sc)

# From https://stackoverflow.com/questions/41670103/how-to-melt-spark-dataframe
def melt(
        df: DataFrame,
        id_vars: Iterable[str], value_vars: Iterable[str],
        var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format."""
    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))

    # Add to the DataFrame and explode
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))

    cols = id_vars + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)

# Sample data: a single wide row
df1 = sqlContext.createDataFrame(
    [(0, 1, 2, 3, 4)],
    ("col1", "col2", "col3", "col4", "col5"))
df1.show()

# Melt every column: an empty id_vars list turns the row into (variable, value) pairs
df2 = melt(df1, id_vars=[], value_vars=df1.columns)
df2.show()
Output:
+----+----+----+----+----+
|col1|col2|col3|col4|col5|
+----+----+----+----+----+
| 0| 1| 2| 3| 4|
+----+----+----+----+----+

+--------+-----+
|variable|value|
+--------+-----+
| col1| 0|
| col2| 1|
| col3| 2|
| col4| 3|
| col5| 4|
+--------+-----+
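
To back up the multiple-rows claim, here is a quick follow-on sketch reusing the sqlContext and melt defined above; df3 and its values are made-up illustration data. The idea is to hold an id column fixed and melt the rest:

# Hypothetical multi-row data with an id column to keep (illustration only)
df3 = sqlContext.createDataFrame(
    [(1, 10, 20), (2, 30, 40)],
    ("id", "a", "b"))

# Hold "id" fixed and melt the remaining columns
melt(df3, id_vars=["id"], value_vars=["a", "b"]).show()

which should print something like:

+---+--------+-----+
| id|variable|value|
+---+--------+-----+
|  1|       a|   10|
|  1|       b|   20|
|  2|       a|   30|
|  2|       b|   40|
+---+--------+-----+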
Hope this helps.