0

I'm trying to use the PyEnchant module to spell-check a column in a PySpark DataFrame.

I can use the module in plain Python after running `pip install pyenchant`, but when I try to use it inside a UDF or an `rdd.map` function I get `ModuleNotFoundError: No module named 'enchant'`.

Example:

from pyspark.ml.feature import StopWordsRemover
from pyspark.sql.session import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
import pyspark.sql.functions as F

import enchant

# "en_US" dictionary object from pyenchant, created once at module level
# (i.e. in the process that runs this script).
english_dict = enchant.Dict("en_US")

# Obtain a SparkSession via the builder's getOrCreate().
spark = SparkSession.builder.getOrCreate()

# initialize dataframe: three sample (id, sentence) rows; the sentences in
# rows 2 and 3 contain misspelled words
data = [
    (1, "This sentence is correct"),
    (2, "This sentence is incorrrect"),
    (3, "THis senecne is incorect :)"),
]

# explicit schema: integer "id" column and string "sentence" column
schema = StructType([StructField("id", IntegerType()), StructField("sentence", StringType())])
nodes_df = spark.createDataFrame(data, schema)

# Normalize the sentence column: lowercase it, strip every character that is
# not a-z or a space, split on whitespace, and drop the empty tokens that
# consecutive spaces leave behind.
nodes_df = nodes_df.withColumn(
    "sentence",
    F.array_remove(
        # Both patterns are raw strings so the backslash reaches the regex
        # engine untouched ("\s" in a normal string is an invalid escape).
        F.split(F.regexp_replace(F.lower(F.col("sentence")), r"[^a-z ]", ""), r"\s"),
        "",
    ),
)

# stop-word list returned by a StopWordsRemover constructed with no arguments
stop_word_list = StopWordsRemover().getStopWords()

# reads the tokenized "sentence" column and writes the result to a new
# "sentence_cleaned" column
remover = StopWordsRemover(inputCol="sentence", outputCol="sentence_cleaned", stopWords=stop_word_list)

# remove stop words from sentence column

nodes_df = remover.transform(nodes_df)

# nodes_df.show(truncate=False)
# +---+--------------------------------+----------------------+                   
# |id |sentence                        |sentence_cleaned      |
# +---+--------------------------------+----------------------+
# |1  |[this, sentence, is, correct]   |[sentence, correct]   |
# |2  |[this, sentence, is, incorrrect]|[sentence, incorrrect]|
# |3  |[this, senecne, is, incorect]   |[senecne, incorect]   |
# +---+--------------------------------+----------------------+

# "ModuleNotFoundError: No module named 'enchant'" error thrown here
nodes_df = nodes_df.rdd.map(
    lambda x: x.sentence_cleaned if english_dict.check(x.sentence_cleaned) else english_dict.suggest(x.sentence_cleaned)[0]
).toDF()

I'm using a virtualenv for my project.

VectorXY
  • 349
  • 1
  • 3
  • 12

0 Answers0