I need the top n most frequently occurring consecutive subsequences (i.e., more like substrings) of the 2nd column. Is it possible to use Structured Streaming with PrefixSpan? Can anyone help me with it? I am new to PySpark and would love it if someone could explain it in a way a newbie like me can understand. Each line arriving on the socket holds two comma-separated hex values (a made-up example: 1a,2f) that I convert to decimal. Here is what I have so far:
from pyspark.sql import SparkSession
from pyspark.sql.types import LongType
from pyspark.sql.functions import split, conv
from pyspark.ml.fpm import PrefixSpan
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("PrefixSpan") \
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("OFF")
# The socket source delivers each line as a single string column named 'value'.
deft_col = 'value'
split_col = split(deft_col, ",")
# Read lines from the socket and parse the two hex fields into decimal longs.
trace = spark \
    .readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9998) \
    .load() \
    .withColumn("Col1", conv(split_col.getItem(0), 16, 10).cast(LongType())) \
    .withColumn("Col2", conv(split_col.getItem(1), 16, 10).cast(LongType())) \
    .drop(deft_col)
prefixSpan = PrefixSpan()
prefixSpan.setSequenceCol('Col2')   # Col2 is a plain LongType column, one value per row
prefixSpan.setMinSupport(0.5)
prefixSpan.setMaxPatternLength(5)
prefixSpan.findFrequentSequentialPatterns(trace).show(truncate=False)
trace.awaitTermination()   # presumably wrong too: awaitTermination belongs to a StreamingQuery, not a DataFrame
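
For reference, the plain batch usage below (adapted from the PrefixSpan example in the Spark ML docs) runs fine, which makes me think one problem is my sequence column: findFrequentSequentialPatterns expects an array-of-arrays column (each inner array being one itemset), whereas my Col2 holds a single long per row.

from pyspark.sql import Row
from pyspark.ml.fpm import PrefixSpan

# Batch data: each row is one sequence, each inner array one itemset.
df = spark.createDataFrame([
    Row(sequence=[[1, 2], [3]]),
    Row(sequence=[[1], [3, 2], [1, 2]]),
    Row(sequence=[[1, 2], [5]]),
    Row(sequence=[[6]])
])

prefixSpan = PrefixSpan(minSupport=0.5, maxPatternLength=5)
prefixSpan.findFrequentSequentialPatterns(df).show(truncate=False)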
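
Also, from what I can tell a streaming DataFrame cannot be fed straight into a batch ML call like findFrequentSequentialPatterns (Spark insists that streaming queries be started with writeStream.start()). The closest workaround I have found is foreachBatch, which hands each micro-batch to a function as an ordinary DataFrame. Below is an untested sketch of what I mean; collapsing all of a batch's Col2 values into a single sequence is my own assumption (and collect_list gives no ordering guarantee), so I am not sure it captures the "consecutive" subsequences I actually want:

from pyspark.sql.functions import array, collect_list
from pyspark.ml.fpm import PrefixSpan

def mine_batch(batch_df, batch_id):
    # Collapse the micro-batch into one row whose 'sequence' column is an
    # array of single-item itemsets: [[v1], [v2], ...]. Order not guaranteed.
    seq_df = batch_df.agg(collect_list(array("Col2")).alias("sequence"))
    # With only one sequence per batch, every pattern trivially meets
    # minSupport; this is just to show the mechanics.
    prefixSpan = PrefixSpan(minSupport=0.5, maxPatternLength=5,
                            sequenceCol="sequence")
    prefixSpan.findFrequentSequentialPatterns(seq_df).show(truncate=False)

query = trace.writeStream.foreachBatch(mine_batch).start()
query.awaitTermination()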