I am new to nltk. I was trying some codes. i know so many are asked this type of questions still i don't understand. i also tried nltk.download('all') except panlex.lite package everything is up-to date.
Here is my code,
short_pos = open("sr/positive.txt","r").read()
short_neg = open("sr/negative.txt","r").read()
documents = []
for r in short_pos.split('\n'):
documents.append( (r, "pos") )
for r in short_neg.split('\n'):
documents.append( (r, "neg") )
all_words = []
short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)
It gives me this following error:
Traceback (most recent call last):
File "S:/7th sem/minor_project/hotel_recommended/testing/sentiment_mod.py", line 52, in <module>
short_pos_words = word_tokenize(short_pos)
File "C:\Python27\lib\site-packages\nltk\tokenize\__init__.py", line 106, in word_tokenize
return [token for sent in sent_tokenize(text, language)
File "C:\Python27\lib\site-packages\nltk\tokenize\__init__.py", line 91, in sent_tokenize
return tokenizer.tokenize(text)
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1226, in tokenize
return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1274, in sentences_from_text
return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1265, in span_tokenize
return [(sl.start, sl.stop) for sl in slices]
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1304, in _realign_boundaries
for sl1, sl2 in _pair_iter(slices):
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 311, in _pair_iter
for el in it:
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1280, in _slices_from_text
if self.text_contains_sentbreak(context):
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1325, in text_contains_sentbreak
for t in self._annotate_tokens(self._tokenize_words(text)):
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 1460, in _annotate_second_pass
for t1, t2 in _pair_iter(tokens):
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 310, in _pair_iter
prev = next(it)
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 577, in _annotate_first_pass
for aug_tok in tokens:
File "C:\Python27\lib\site-packages\nltk\tokenize\punkt.py", line 542, in _tokenize_words
for line in plaintext.split('\n'):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xed in position 6: ordinal not in range(128)
Thanks in advance