UPDATE: Without UDF
df.withColumn("data", split(col("data"), ","))
.withColumn("data", explode_outer("data"))
.withColumn("data", regexp_replace(col("data"), '\"|\[|\]','')).show()
+--------------------+--------------------+--------+
| identifier_field| values| data|
+--------------------+--------------------+--------+
|PCR-0020779-SAMKG...|prohibited_restri...| no|
|PCR-0020779-SAMKG...| physical_form|crystals|
|PCR-0020779-SAMKG...| physical_form| powder|
|PCR-0020779-SAMKG...| region_country| Emea_EU|
+--------------------+--------------------+--------+
You can use UDF to convert string value to array in column data
and then explode.
data = [("PCR-0020779-SAMKGO-FH", "prohibited_restricted_authorised_chemicals", "no"),
("PCR-0020779-SAMKGO-FH", "physical_form", '["crystals", "powder"]'),
("PCR-0020779-SAMKGO-FH", "region_country", ' ["Emea_EU"]')]
columns = ["identifier_field", "values", "data"]
spark.createDataFrame(data, columns)
+---------------------+------------------------------------------+----------------------+
|identifier_field |values |data |
+---------------------+------------------------------------------+----------------------+
|PCR-0020779-SAMKGO-FH|prohibited_restricted_authorised_chemicals|no |
|PCR-0020779-SAMKGO-FH|physical_form |["crystals", "powder"]|
|PCR-0020779-SAMKGO-FH|region_country | ["Emea_EU"] |
+---------------------+------------------------------------------+----------------------+
def string_to_array(c):
out = eval(c) if c.strip().startswith('[') else [c]
return out
string_to_array_udf = udf(string_to_array, ArrayType(StringType()))
out_df = df.withColumn("data", string_to_array_udf(col("data"))).withColumn("data", explode_outer("data"))
+--------------------+--------------------+--------+
| identifier_field| values| data|
+--------------------+--------------------+--------+
|PCR-0020779-SAMKG...|prohibited_restri...| no|
|PCR-0020779-SAMKG...| physical_form|crystals|
|PCR-0020779-SAMKG...| physical_form| powder|
|PCR-0020779-SAMKG...| region_country| Emea_EU|
+--------------------+--------------------+--------+