You can use filter in conjunction with exists, one of the higher-order functions, to check whether any of the elements within the array starts with the word.
The other approach would be a UDF -
Data Preparation
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import ArrayType, StringType

sql = SparkSession.builder.getOrCreate()

sparkDF = sql.createDataFrame([(['apple', 'banana', 'orange'],),
                               (['strawberry', 'raspberry'],),
                               (['apple', 'pineapple', 'grapes'],)],
                              ['arr_column'])

sparkDF.show(truncate=False)
+--------------------------+
|arr_column |
+--------------------------+
|[apple, banana, orange] |
|[strawberry, raspberry] |
|[apple, pineapple, grapes]|
+--------------------------+
Filter & Exists - Spark 2.4+

Note that the exists SQL function itself is available from Spark 2.4, but the Python wrapper F.exists only arrived in PySpark 3.1; on Spark 2.4.x you can get the same behaviour through F.expr (see the sketch after the output below).

# The predicate receives each array element as a Column;
# Column.startswith returns a boolean Column
starts_with_app = lambda s: s.startswith("app")

sparkDF_filtered = sparkDF.filter(F.exists(F.col("arr_column"), starts_with_app))
sparkDF_filtered.show(truncate=False)
+--------------------------+
|arr_column |
+--------------------------+
|[apple, banana, orange] |
|[apple, pineapple, grapes]|
+--------------------------+
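If you are on Spark 2.4.x, where F.exists is not yet exposed in the Python API, a minimal sketch of the same filter via F.expr - here the SQL predicate like 'app%' stands in for startswith:

# exists() is evaluated as a SQL higher-order function, so this runs on Spark 2.4+
sparkDF_filtered = sparkDF.filter(F.expr("exists(arr_column, x -> x like 'app%')"))
sparkDF_filtered.show(truncate=False)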
UDF - works on lower Spark versions as well
def filter_string(inp):
    # Guard against null arrays; without this, iterating over None raises a TypeError
    if inp is None:
        return None
    res = [s for s in inp if s.startswith("app")]
    return res if res else None

filter_string_udf = F.udf(filter_string, ArrayType(StringType()))

sparkDF_filtered = sparkDF.withColumn('arr_filtered', filter_string_udf(F.col('arr_column'))) \
                          .filter(F.col('arr_filtered').isNotNull())
sparkDF_filtered.show(truncate=False)
+--------------------------+------------+
|arr_column |arr_filtered|
+--------------------------+------------+
|[apple, banana, orange] |[apple] |
|[apple, pineapple, grapes]|[apple] |
+--------------------------+------------+
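On Spark 2.4+, the same arr_filtered column can also be built without a UDF by using the filter higher-order function, which avoids the Python serialization cost of a UDF. A minimal sketch via F.expr, with like 'app%' again standing in for startswith; note that filter returns an empty array rather than null when nothing matches, hence the size check:

# filter() keeps only the array elements matching the lambda predicate
sparkDF_filtered = sparkDF.withColumn('arr_filtered',
                                      F.expr("filter(arr_column, x -> x like 'app%')")) \
                          .filter(F.size('arr_filtered') > 0)

sparkDF_filtered.show(truncate=False)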