I have a training file and a testing file, and I want to detect emotion in tweets using machine learning algorithms. In the code below I apply the preprocessing steps to the Arabic training dataset, but the error shown at the end is raised while removing stop words. Do I need to install an Arabic stopwords file, or can I import one from NLTK?
import re
import pandas as pd
from pandas import DataFrame
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer

# CSV file for training
df = pd.read_csv("C:/Users/User/Desktop/2018-EI-oc-Ar-fear-train.csv")
# CSV file for testing
df_test = pd.read_csv("C:/Users/User/Desktop/2018-EI-oc-Ar-fear-test-gold.csv")
def stopWordRmove(text):
    ar_stop_list = open("ar_stop_word_list.txt", "r")  # the error is raised on this line
    stop_words = ar_stop_list.read().split('\n')
    needed_words = []
    words = word_tokenize(text)
    for w in words:
        if w not in stop_words:
            needed_words.append(w)
    filtered_sentence = " ".join(needed_words)
    return filtered_sentence
def noramlize(Tweet):
    # Unify common letter variants
    Tweet = re.sub(r"[إأٱآا]", "ا", Tweet)
    Tweet = re.sub(r"ى", "ي", Tweet)
    Tweet = re.sub(r"ؤ", "ء", Tweet)
    Tweet = re.sub(r"ئ", "ء", Tweet)
    # Keep only Arabic letters and spaces; the range starts at ء so the
    # hamza produced by the substitutions above is not stripped out
    Tweet = re.sub(r'[^ء-ي ]', "", Tweet)
    noise = re.compile(""" ّ    | # Tashdid
                           َ    | # Fatha
                           ً    | # Tanwin Fath
                           ُ    | # Damma
                           ٌ    | # Tanwin Damm
                           ِ    | # Kasra
                           ٍ    | # Tanwin Kasr
                           ْ    | # Sukun
                           ـ     # Tatwil/Kashida
                       """, re.VERBOSE)
    Tweet = re.sub(noise, '', Tweet)
    return Tweet
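As a sanity check on the normalization, here is a tiny example I would run (the sample tweet and the expected result are my own, not from the dataset):

# Example: hamza/alef variants are unified and diacritics are stripped
print(noramlize("إنَّ الخوفَ شعورٌ طبيعيٌّ"))
# expected: ان الخوف شعور طبيعي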
def stemming(Tweet):
    # Reduce each token to its root with NLTK's ISRI Arabic stemmer
    st = ISRIStemmer()
    stemmed_words = []
    words = word_tokenize(Tweet)
    for w in words:
        stemmed_words.append(st.stem(w))
    stemmed_sentence = " ".join(stemmed_words)
    return stemmed_sentence
def prepareDataSets(df):
    sentences = []
    for index, r in df.iterrows():
        # Chain the three preprocessing steps on the same text and
        # collect the result (the original never appended to sentences)
        text = stopWordRmove(r['Tweet'])
        text = noramlize(text)
        text = stemming(text)
        sentences.append([text, r['Affect Dimension']])
    df_sentences = DataFrame(sentences, columns=['Tweet', 'Affect Dimension'])
    return df_sentences
preprocessed_df = prepareDataSets(df)
Running this raises:

FileNotFoundError: [Errno 2] No such file or directory: 'ar_stop_word_list.txt'
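I understand the immediate cause: open() resolves ar_stop_word_list.txt against the current working directory. This is the workaround I would try first, assuming the list file actually exists somewhere, e.g. on the Desktop (the path is hypothetical, from my own setup):

from nltk.tokenize import word_tokenize

# Hypothetical absolute path; adjust to wherever the list actually lives
STOP_FILE = "C:/Users/User/Desktop/ar_stop_word_list.txt"

def stopWordRmove(text):
    # Read the list with an explicit path and encoding; the with-block
    # also closes the file handle, which the original code never did
    with open(STOP_FILE, "r", encoding="utf-8") as f:
        stop_words = set(f.read().split('\n'))
    words = word_tokenize(text)
    return " ".join(w for w in words if w not in stop_words)

But I would rather not maintain my own list file at all.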
How can I remove stopwords from Arabic tweets?
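From what I can tell, recent versions of NLTK's stopwords corpus include an Arabic list, so something like this sketch should work without any external file (treat the 'arabic' fileid as an assumption to verify after downloading the corpus):

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')  # one-time download of NLTK's stopword lists
nltk.download('punkt')      # tokenizer models used by word_tokenize

# Build the stopword set once instead of re-reading a file per tweet
arabic_stops = set(stopwords.words('arabic'))

def stopWordRmove(text):
    # Drop every token that appears in the Arabic stopword set
    words = word_tokenize(text)
    return " ".join(w for w in words if w not in arabic_stops)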