from pyspark import SparkContext, SQLContext
# Import only the functions actually used; a wildcard import from
# pyspark.sql.functions would shadow Python built-ins such as max().
from pyspark.sql.functions import when, max, col

sc = SparkContext('local')
sqlContext = SQLContext(sc)
df = sc.parallelize([
    [1, 'A', 'B'],
    [2, 'B', 'A'],
    [3, 'D', 'C'],
    [4, 'A', 'N'],
    [5, 'N', 'B'],
    [6, 'A', 'A'],
    [6, 'B', 'B']
]).toDF('id: integer, col1: string, col2: string')
df.show(20, False)
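# when() conditions are evaluated in order, so a row matching both tests
# (e.g. id 1, where col1 == 'A' and col2 == 'B') gets the first value, 1.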
df = df.withColumn("my_col",
                   when(df.col1 == 'A', 1)
                   .when(df.col2 == 'B', 2)
                   .otherwise(3))
df.show(20, False)
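# max() aggregates over all rows of my_col, returning a one-row DataFrame.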
df_answer = df.select(max(col("my_col")))
df_answer.show(20, False)
Outputs:
+---+----+----+
|id |col1|col2|
+---+----+----+
|1 |A |B |
|2 |B |A |
|3 |D |C |
|4 |A |N |
|5 |N |B |
|6 |A |A |
|6 |B |B |
+---+----+----+
+---+----+----+------+
|id |col1|col2|my_col|
+---+----+----+------+
|1 |A |B |1 |
|2 |B |A |3 |
|3 |D |C |3 |
|4 |A |N |1 |
|5 |N |B |2 |
|6 |A |A |1 |
|6 |B |B |2 |
+---+----+----+------+
+-----------+
|max(my_col)|
+-----------+
|3 |
+-----------+
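On Spark 2.0 and later, SQLContext is deprecated in favour of SparkSession. A minimal sketch of the same pipeline against that API (the local-mode builder settings are just an assumption for a standalone run):

from pyspark.sql import SparkSession
from pyspark.sql.functions import when, max, col

spark = SparkSession.builder.master('local').getOrCreate()

# Same data and schema as above, built directly from tuples.
df = spark.createDataFrame(
    [(1, 'A', 'B'), (2, 'B', 'A'), (3, 'D', 'C'), (4, 'A', 'N'),
     (5, 'N', 'B'), (6, 'A', 'A'), (6, 'B', 'B')],
    'id: int, col1: string, col2: string')

df = df.withColumn('my_col',
                   when(df.col1 == 'A', 1)
                   .when(df.col2 == 'B', 2)
                   .otherwise(3))

# agg() is equivalent to select() for a plain aggregate: both yield one row.
df.agg(max(col('my_col'))).show()

Note that greatest(), imported but unused in the original snippet, does something different: it compares values across columns within each row, whereas max() aggregates down a single column.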