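Try with the regex `"go".*?"go":"(.*)"`: it matches the first `"go"` key, lazily skips ahead to the second `"go"` key, and captures that second key's value.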
# Column holding the raw JSON text that both patterns are applied to.
fields_col = col("fields")

# Variant 1: match the first "go" key, lazily skip to the second "go" key, and
# capture its value with a greedy `.*` (which runs up to the last double quote
# that follows on the line).
second_go_any = regexp_extract(fields_col, '"go".*?"go":"(.*)"', 1)
df.withColumn("go", second_go_any).show(10, False)

# Variant 2: same anchoring, but the captured value is restricted to ASCII
# letters and digits, so the capture stops at the value's closing quote.
second_go_alnum = regexp_extract(fields_col, '"go".*?"go":"([A-Za-z0-9]*)"', 1)
df.withColumn("go", second_go_alnum).show(10, False)
#+--------------------------------------------+-----+
#|fields |go |
#+--------------------------------------------+-----+
#|{"go":"NEW123", "hey":"OLD32", "go":"BYE89"}|BYE89|
#+--------------------------------------------+-----+
Another way would be to use the `from_json` function: the second occurrence of `go` will overwrite the first occurrence (the same as in a Python dict), so we will end up with only one value for `go`.
# Print the input DataFrame: at most 10 rows, with cell truncation disabled.
df.show(n=10, truncate=False)
#+--------------------------------------------+
#|fields |
#+--------------------------------------------+
#|{"go":"NEW123", "hey":"OLD32", "go":"BYE89"}|
#+--------------------------------------------+
from pyspark.sql.types import *
from pyspark.sql.functions import *
# Schema used to parse the JSON text in `fields`: two string-typed keys.
sch = StructType(
    [
        StructField("go", StringType()),
        StructField("hey", StringType()),
    ]
)

# Step 1: parse `fields` into a struct column named "go".
parsed = df.withColumn("go", from_json(col("fields"), sch))
# Step 2: replace that struct with its own "go" member and print the result.
parsed.withColumn("go", col("go.go")).show(10, False)
#+--------------------------------------------+-----+
#|fields |go |
#+--------------------------------------------+-----+
#|{"go":"NEW123", "hey":"OLD32", "go":"BYE89"}|BYE89|
#+--------------------------------------------+-----+