I'm using PySpark 2.4.
I have a weird issue with dropping columns after a join: dropping one column gives the correct result, but dropping two raises an error.
I want to drop the 'address' and 'role' columns that come from the workers1 data frame out of the joined data frame (joined_workers).
from pyspark.sql import functions as f
workers1 = spark.createDataFrame(
    [("barmen", "Paris", "25"),
     ("waitress", None, "22")],
    ["role", "address", "age"])
workers1.toPandas()
>>>
       role address age
0    barmen   Paris  25
1  waitress    None  22
workers2 = spark.createDataFrame(
    [("barmen", "Paris"),
     (None, "Berlin")],
    ["role", "address"])
workers2.toPandas()
>>>
     role address
0  barmen   Paris
1    None  Berlin
columns_to_join_on = ["role", "address"]
joined_workers = workers1.alias("workers1").join(
    workers2.alias("workers2"),
    # null-safe equality so None matches None on the join keys
    [getattr(workers1, col).eqNullSafe(getattr(workers2, col))
     for col in columns_to_join_on],
    how="right",
)
joined_workers.toPandas()
>>>
     role address   age    role address
0    None    None  None    None  Berlin
1  barmen   Paris    25  barmen   Paris
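Note that the joined frame keeps duplicate column names coming from both sides, which is why I address them through the "workers1" alias instead of by plain string:
joined_workers.columns
>>>
['role', 'address', 'age', 'role', 'address']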
# Dropping one column gives the expected result:
joined_workers.drop(*[f.col("workers1.role")]).toPandas()
>>>
  address   age    role address
0    None  None    None  Berlin
1   Paris    25  barmen   Paris
# Works as expected
joined_workers.drop(*[f.col("workers1.address")]).toPandas()
>>>
     role   age    role address
0    None  None    None  Berlin
1  barmen    25  barmen   Paris
# Works as expected

# But dropping both columns in one call fails:
joined_workers.drop(*[f.col("workers1.role"), f.col("workers1.address")]).toPandas()
>>>
TypeError: each col in the param list should be a string
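From reading the PySpark source, DataFrame.drop accepts either a single Column or any number of column names given as strings, but not several Column objects at once, which matches the TypeError above. Here is a minimal sketch of the workaround I'm using for now, chaining one drop per Column (assuming the "workers1" alias still resolves after the first drop, as the single-column examples above suggest):

columns_to_drop = ["role", "address"]
result = joined_workers
for col in columns_to_drop:
    # Each call passes exactly one Column, which drop() does accept.
    result = result.drop(f.col("workers1." + col))
result.toPandas()

Is there a way to drop both Columns in a single call, or is dropping by Column limited to one column at a time?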