Assuming that your DataFrame has the following schema (here the page_list
column is a JSON string):
df.printSchema()
#root
# |-- page_name: string (nullable = true)
# |-- page_list: string (nullable = true)
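For reference, a DataFrame matching this schema can be built from the sample
rows shown in the output at the end of this answer:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# sample rows taken from the example output below
df = spark.createDataFrame(
    [
        ("home", '{"page_list":["home","something"]}'),
        ("about", '{"page_list":["something"]}'),
    ],
    ["page_name", "page_list"],
)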
You can use from_json to parse page_list into an array of strings, and then
use array_contains to check whether page_name is in that list. The trick is
that you have to use expr to pass a column value as a parameter to
array_contains, because pyspark.sql.functions.array_contains interprets its
second argument as a literal value rather than a column reference (at least
in older Spark versions).
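For contrast, a direct call like the following sketch (hypothetical; parsed
stands for the DataFrame after the from_json step) would search the array for
the literal string "page_name", not the value of the page_name column:

from pyspark.sql import functions as f

# WRONG: "page_name" here is treated as a literal string, not a column
# reference, so this checks whether the array contains the text "page_name"
parsed.withColumn("flag", f.array_contains(f.col("flag"), "page_name"))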
from pyspark.sql.types import StructType, StructField, ArrayType, StringType
from pyspark.sql.functions import expr, from_json
df.withColumn(
    "flag",
    # parse the JSON string and pull out the array of page names
    from_json(
        "page_list",
        schema=StructType([StructField("page_list", ArrayType(StringType()))])
    )["page_list"]
).withColumn(
    "flag",
    # expr lets us reference the page_name column inside array_contains
    expr("array_contains(flag, page_name)")
).show(truncate=False)
#+---------+----------------------------------+-----+
#|page_name|page_list |flag |
#+---------+----------------------------------+-----+
#|home |{"page_list":["home","something"]}|true |
#|about |{"page_list":["something"]} |false|
#+---------+----------------------------------+-----+
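If you prefer, the parsing and the membership test can be combined into a
single expr call (a sketch assuming Spark 2.3+, where the SQL from_json
function accepts a DDL-formatted schema string):

df.withColumn(
    "flag",
    # parse the JSON and test membership in one SQL expression
    expr("array_contains(from_json(page_list, 'struct<page_list:array<string>>').page_list, page_name)")
).show(truncate=False)

In newer Spark versions, array_contains may also accept a Column directly as
the value argument, which would make the expr workaround unnecessary; check
the docs for your version.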