I'm trying to use the PyEnchant module to spell-check a column in a PySpark DataFrame.
I can use the module in plain Python after running pip install pyenchant,
but when I try to use it inside a udf
or an rdd.map
function, I get ModuleNotFoundError: No module named 'enchant'.
Example:
import os
import sys

import enchant
import pyspark.sql.functions as F
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.session import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
# Python code inside udf / rdd.map runs in separate worker processes that Spark
# launches with the interpreter named by PYSPARK_PYTHON. If that is not the
# virtualenv interpreter, packages installed only in the virtualenv (such as
# pyenchant) are missing there and the workers raise
# "ModuleNotFoundError: No module named 'enchant'".
# Default the workers to the interpreter running this script; setdefault keeps
# any value that was configured explicitly. This must happen before the
# SparkSession (and its SparkContext) is created.
# NOTE(review): on a real cluster this path must also exist on every worker
# node (or the environment must be shipped to the cluster) -- confirm for your
# deployment.
os.environ.setdefault("PYSPARK_PYTHON", sys.executable)

# Driver-side dictionary, usable for local (non-distributed) spell checks.
english_dict = enchant.Dict("en_US")
spark = SparkSession.builder.getOrCreate()
# Build the sample DataFrame: one (id, sentence) row per example sentence.
data = [
    (1, "This sentence is correct"),
    (2, "This sentence is incorrrect"),
    (3, "THis senecne is incorect :)"),
]
# Explicit schema: an integer id and the raw sentence text.
schema = StructType(
    [
        StructField("id", IntegerType()),
        StructField("sentence", StringType()),
    ]
)
nodes_df = spark.createDataFrame(data=data, schema=schema)
# Normalize the sentence column: lowercase it, drop every character that is
# not a-z or a space, split on whitespace, and discard empty tokens. The column
# is replaced in place by the resulting array of words.
nodes_df = nodes_df.withColumn(
    "sentence",
    F.array_remove(
        F.split(
            F.regexp_replace(F.lower(F.col("sentence")), r"[^a-z ]", ""),
            # Raw string: "\s" in a normal literal is an invalid escape
            # sequence and triggers a warning on recent Python versions.
            r"\s",
        ),
        "",
    ),
)
# Take the stop-word list reported by a default-constructed remover and use it
# to filter the tokenized sentence column into a new sentence_cleaned column.
stop_word_list = StopWordsRemover().getStopWords()
remover = StopWordsRemover(
    stopWords=stop_word_list,
    inputCol="sentence",
    outputCol="sentence_cleaned",
)
# Apply the remover; the original sentence column is kept alongside the result.
nodes_df = remover.transform(dataset=nodes_df)
# nodes_df.show(truncate=False)
# +---+--------------------------------+----------------------+
# |id |sentence |sentence_cleaned |
# +---+--------------------------------+----------------------+
# |1 |[this, sentence, is, correct] |[sentence, correct] |
# |2 |[this, sentence, is, incorrrect]|[sentence, incorrrect]|
# |3 |[this, senecne, is, incorect] |[senecne, incorect] |
# +---+--------------------------------+----------------------+
# "ModuleNotFoundError: No module named 'enchant'" error thrown here
def _spellcheck_partition(rows):
    """Yield (id, sentence, corrected sentence_cleaned) for each row of a partition.

    Every word in ``sentence_cleaned`` is checked individually: a correctly
    spelled word is kept, a misspelled word is replaced by the first
    suggestion, and a word with no suggestions is kept unchanged.
    """
    # Import and build the dictionary on the executor, once per partition,
    # instead of pickling a driver-side enchant.Dict into the closure.
    # pyenchant must be installed in the interpreter the executors run
    # (the one selected by PYSPARK_PYTHON).
    import enchant

    checker = enchant.Dict("en_US")
    for row in rows:
        corrected = []
        for word in row.sentence_cleaned:
            if checker.check(word):
                corrected.append(word)
                continue
            suggestions = checker.suggest(word)
            # suggest() may return an empty list; keep the word in that case.
            corrected.append(suggestions[0] if suggestions else word)
        yield row.id, row.sentence, corrected


# sentence_cleaned is an array of words, so check()/suggest() must be applied
# per word, not to the whole list. The existing schema is passed to toDF() so
# the result keeps the same columns and types instead of relying on inference.
nodes_df = nodes_df.rdd.mapPartitions(_spellcheck_partition).toDF(nodes_df.schema)
I'm using a virtualenv
for my project.