I seem to be having a bit of an issue stripping punctuation from a string in Python. Here, I'm given a text file (specifically a book from Project Gutenberg) and a list of stopwords. I want to return a dictionary of the 10 most commonly used words. Unfortunately, I keep getting one hiccup in my returned dictionary.
import sys
import collections
from string import punctuation
import operator
#should return a string without punctuation
def strip_punc(s):
return ''.join(c for c in s if c not in punctuation)
def word_cloud(infile, stopwordsfile):
wordcount = {}
#Reads the stopwords into a list
stopwords = [x.strip() for x in open(stopwordsfile, 'r').readlines()]
#reads data from the text file into a list
lines = []
with open(infile) as f:
lines = f.readlines()
lines = [line.split() for line in lines]
#does the wordcount
for line in lines:
for word in line:
word = strip_punc(word).lower()
if word not in stopwords:
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
#sorts the dictionary, grabs 10 most common words
output = dict(sorted(wordcount.items(),
key=operator.itemgetter(1), reverse=True)[:10])
print(output)
if __name__=='__main__':
try:
word_cloud(sys.argv[1], sys.argv[2])
except Exception as e:
print('An exception has occured:')
print(e)
print('Try running as python3 word_cloud.py <input-text> <stopwords>')
This will print out
{'said': 659, 'mr': 606, 'one': 418, '“i': 416, 'lorry': 322, 'upon': 288, 'will': 276, 'defarge': 268, 'man': 264, 'little': 263}
The "i shouldn't be there. I don't understand why it isn't eliminated in my helper function.
Thanks in advance.