I just finished a program that reads text from books and graphs their word count with the x-axis being the count of one book and the y-axis being the count of the second book. It works, but it's surprisingly slow and I'm hoping to get some tips on how to optimize it. I think my biggest concern is creating a dictionary for similar words between the books and a dictionary for words that are in one book but not the other. This implementation added a lot of runtime to the program and I'd like to find a pythonic way to improve this. Below is the code:
import re # regular expressions
import io
import collections
from matplotlib import pyplot as plt
# xs=[x1,x2,...,xn]
# Number of occurences of the word in book 1
# use
# ys=[y1.y2,...,yn]
# Number of occurences of the word in book 2
# plt.plot(xs,ys)
# save as svg or pdf files
word_pattern = re.compile(r'\w+')
# with version ensures closing even if there are failures
with io.open("swannsway.txt") as f:
text = f.read() # read as a single large string
book1 = word_pattern.findall(text) # pull out words
book1 = [w.lower() for w in book1 if len(w)>=3]
with io.open("moby_dick.txt") as f:
text = f.read() # read as a single large string
book2 = word_pattern.findall(text) # pull out words
book2 = [w.lower() for w in book2 if len(w)>=3]
#Convert these into relative percentages/total book length
wordcount_book1 = {}
for word in book1:
if word in wordcount_book1:
wordcount_book1[word]+=1
else:
wordcount_book1[word]=1
'''
for word in wordcount_book1:
wordcount_book1[word] /= len(wordcount_book1)
for word in wordcount_book2:
wordcount_book2[word] /= len(wordcount_book2)
'''
wordcount_book2 = {}
for word in book2:
if word in wordcount_book2:
wordcount_book2[word]+=1
else:
wordcount_book2[word]=1
common_words = {}
for i in wordcount_book1:
for j in wordcount_book2:
if i == j:
common_words[i] = [wordcount_book1[i], wordcount_book2[j]]
break
book_singles= {}
for i in wordcount_book1:
if i not in common_words:
book_singles[i] = [wordcount_book1[i], 0]
for i in wordcount_book2:
if i not in common_words:
book_singles[i] = [0, wordcount_book2[i]]
wordcount_book1 = collections.Counter(book1)
wordcount_book2 = collections.Counter(book2)
# how many words of different lengths?
word_length_book1 = collections.Counter([len(word) for word in book1])
word_length_book2 = collections.Counter([len(word) for word in book2])
print(wordcount_book1)
#plt.plot(list(word_length_book1.keys()),list(word_length_book1.values()), list(word_length_book2.keys()), list(word_length_book2.values()), 'bo')
for i in range(len(common_words)):
plt.plot(list(common_words.values())[i][0], list(common_words.values())[i][1], 'bo', alpha = 0.2)
for i in range(len(book_singles)):
plt.plot(list(book_singles.values())[i][0], list(book_singles.values())[i][1], 'ro', alpha = 0.2)
plt.ylabel('Swannsway')
plt.xlabel('Moby Dick')
plt.show()
#key:value