I'm new to Python and I'm trying to get the top 10 most frequent words in every text file under a certain path, but I'm getting this error:
Traceback (most recent call last):
File "C:/Python27/cluster.py", line 51, in <module>
scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
File "C:\Python27\lib\site-packages\textblob\decorators.py", line 24, in
__get__
value = obj.__dict__[self.func.__name__] = self.func(obj)
File "C:\Python27\lib\site-packages\textblob\blob.py", line 643, in words
return WordList(word_tokenize(self.raw, include_punc=False))
File "C:\Python27\lib\site-packages\textblob\blob.py", line 218, in __init__
self._collection = [Word(w) for w in collection]
File "C:\Python27\lib\site-packages\textblob\blob.py", line 74, in __new__
return super(Word, cls).__new__(cls, string)
UnicodeDecodeError: 'ascii' codec can't decode byte 0xef in position 0:
ordinal not in range(128)
I'm using Python 2.7 and I can't really fix it on my own.
Here is my code:
from __future__ import division, unicode_literals

import getopt
import glob
import io
import math
import os
import sys
from cStringIO import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from textblob import TextBlob as tb
# Add the pdfminer egg directory to the module search path.
# The literal is written as a single raw string: a quoted string cannot span
# two physical lines, and the raw prefix keeps the Windows backslashes literal.
# NOTE(review): this runs after the pdfminer imports above, so it cannot help
# those imports resolve -- confirm whether it is still needed.
sys.path.append(r"C:\Python27\Lib\site-packages\pdfminer.six-20160614-py2.7.egg\pdfminer")
def tf(word, blob):
    """Return the term frequency of ``word`` in ``blob``.

    The result is the number of occurrences of ``word`` in ``blob.words``
    divided by the total number of words in ``blob.words``.

    Args:
        word: The term to count.
        blob: An object exposing a ``words`` sequence (e.g. a TextBlob).

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when the document has no words.
    """
    words = blob.words
    # An empty document would otherwise raise ZeroDivisionError.
    if not words:
        return 0.0
    # float() keeps this a true division even without
    # ``from __future__ import division`` on Python 2.
    return words.count(word) / float(len(words))
def n_containing(word, bloblist):
    """Return how many documents in ``bloblist`` contain ``word`` as a token.

    Args:
        word: The term to look for.
        bloblist: An iterable of objects exposing a ``words`` sequence
            (e.g. TextBlob instances).

    Returns:
        The number of documents whose ``words`` include ``word``.
    """
    # Test membership against the tokenized words, not the blob itself:
    # ``word in blob`` on a TextBlob is a substring search over the raw text,
    # so "cat" would also be counted for a document that only has "category".
    return sum(1 for blob in bloblist if word in blob.words)
def idf(word, bloblist):
    """Return the inverse document frequency of ``word`` over ``bloblist``.

    Computed as the natural log of ``len(bloblist) / (1 + n)``, where ``n`` is
    the number of documents reported by ``n_containing(word, bloblist)``.

    Args:
        word: The term to score.
        bloblist: A sized collection of documents.

    Returns:
        The natural logarithm of the ratio described above.
    """
    total_docs = len(bloblist)
    docs_with_word = n_containing(word, bloblist)
    # The +1 keeps the denominator non-zero when no document has the word.
    # Relies on the file's ``from __future__ import division`` for a true
    # (non-truncating) division on Python 2.
    ratio = total_docs / (1 + docs_with_word)
    return math.log(ratio)
def tfidf(word, blob, bloblist):
    """Return the TF-IDF score of ``word`` for ``blob`` within ``bloblist``.

    Args:
        word: The term to score.
        blob: The document the term frequency is measured in.
        bloblist: The collection of documents used for the inverse
            document frequency.

    Returns:
        The product ``tf(word, blob) * idf(word, bloblist)``.
    """
    term_frequency = tf(word, blob)
    inverse_doc_frequency = idf(word, bloblist)
    return term_frequency * inverse_doc_frequency
# Collect every .txt file in the corpus directory. A raw string keeps the
# Windows backslashes literal; the resulting path is the same as before.
file_names = glob.glob(r"C:\Python27\PDF2text\Text\*.txt")

# Read each file as decoded text. The plain built-in open() on Python 2
# returns undecoded bytes, which TextBlob later tries to convert with the
# ASCII codec -- this is what raises
# "UnicodeDecodeError: 'ascii' codec can't decode byte 0xef in position 0".
# 0xEF is the first byte of a UTF-8 byte-order mark, so "utf-8-sig" is used:
# it decodes UTF-8 and drops a leading BOM if one is present.
# NOTE(review): assumes every input file is UTF-8 encoded -- confirm.
corpus = []
for file_path in file_names:
    with io.open(file_path, encoding="utf-8-sig") as f_input:
        corpus.append(f_input.read())
print(corpus)

# One TextBlob per document; a list is required because tfidf() needs
# len(bloblist) and iterates the collection repeatedly.
bloblist = [tb(text) for text in corpus]

# Print the ten highest-scoring TF-IDF words of each document.
for i, blob in enumerate(bloblist):
    print("Document {}".format(i + 1))
    scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:10]:
        print("Word: {}, TF-IDF: {}".format(word, round(score, 5)))
How can I fix this? Thank you so much!