I'm attempting to remove all the stop words from text input. The code below removes all the stop words, except one that begin a sentence.
How do I remove those words?
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
stopwords_nltk_en = set(stopwords.words('english'))
from string import punctuation
exclude_punctuation = set(punctuation)
stoplist_combined = set.union(stopwords_nltk_en, exclude_punctuation)
def normalized_text(text):
lemma = WordNetLemmatizer()
stopwords_punctuations_free = ' '.join([i for i in text.lower().split() if i not in stoplist_combined])
normalized = ' '.join(lemma.lemmatize(word) for word in stopwords_punctuations_free.split())
return normalized
sentence = [['The birds are always in their house.'], ['In the hills the birds nest.']]
for item in sentence:
print (normalized_text(str(item)))
OUTPUT:
the bird always house
in hill bird nest