If your schema is the following:
# Inspect the schema: id is a long, and X, Y and Z are arrays of longs.
df.printSchema()
#root
# |-- id: long (nullable = true)
# |-- X: array (nullable = true)
# |    |-- element: long (containsNull = true)
# |-- Y: array (nullable = true)
# |    |-- element: long (containsNull = true)
# |-- Z: array (nullable = true)
# |    |-- element: long (containsNull = true)
and your PySpark version is 2.4 or later, you can use `array_intersect` and `array_except`:
from pyspark.sql.functions import array_except, array_intersect
# Keep the elements present in both X and Y, then drop any that also appear in Z.
# Both array_intersect and array_except return arrays without duplicates.
df = df.withColumn("new_col", array_except(array_intersect("X", "Y"), "Z"))
# truncate=False prints wide array cells in full (the default cuts cells longer
# than 20 characters), which matches the output shown below.
df.show(truncate=False)
#+---+---------------+---------------------+----+--------+
#|id |X |Y |Z |new_col |
#+---+---------------+---------------------+----+--------+
#|1 |[12, 23, 1, 24]|[13, 412, 12, 23, 24]|[12]|[23, 24]|
#|2 |[1, 2, 3] |[2, 4, 5, 6] |[] |[2] |
#+---+---------------+---------------------+----+--------+