I am new to both, python and NLTK. I have to extract noun phrase from the corpus and then remove the stop words by using NLTK. I already do my coding but still have error. Can anyone help me to fix this problem? Or please also recommend if there is any better solution. Thank you
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
docid='19509'
title='Example noun-phrase and stop words'
print('Document id:'),docid
print('Title:'),title
#list noun phrase
content='This is a sample sentence, showing off the stop words filtration.'
is_noun = lambda pos: pos[:2] == 'NN'
tokenized = nltk.word_tokenize(content)
nouns = [word for (word,pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
print('All Noun Phrase:'),nouns
#remove stop words
stop_words = set(stopwords.words("english"))
example_words = word_tokenize(nouns)
filtered_sentence = []
for w in example_words:
if w not in stop_words:
filtered_sentence.append(w)
print('Without stop words:'),filtered_sentence
And I got the following error
Traceback (most recent call last):
File "C:\Users\User\Desktop\NLP\stop_word.py", line 20, in <module>
example_words = word_tokenize(nouns)
File "C:\Python27\lib\site-packages\nltk\tokenize\__init__.py", line 109,in
word_tokenize
return [token for sent in sent_tokenize(text, language)
File "C:\Python27\lib\site-packages\nltk\tokenize\__init__.py", line 94, in
sent_tokenize
return tokenizer.tokenize(text)
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1237, in
tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1285, in
sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text,realign_boundaries)]
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1276, in
span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1316, in
_realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 310, in
_pair_iter
prev = next(it)
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1289, in
_slices_from_text
for match in self._lang_vars.period_context_re().finditer(text):
TypeError: expected string or buffer