I am getting a KeyError: when performing bag of words on a large text file. This had worked a few years back, but I dug it back up and redid it in python 3.7 using pycharm instead of emacs and python 2.7. Looking to get an old NLP example running to compare to newer techniques eventually.
KeyError: 'learning'
line 12, in get_bag_of_words
bag_of_words[word] += course_bag_of_words[word]
How do I track this down? truly not sure what to do been reading posts most of the day?
If its the key or entry do I do something like delete or pop it? How to remove a key from a Python dictionary?
Or some wrong with the bag of word function.
def get_bag_of_words(titles_lines):
bag_of_words = {}
for line in titles_lines[1:]:
courseid, course_bag_of_words = get_course_bag_of_words(line)
for word in course_bag_of_words:
if word not in course_bag_of_words:
bag_of_words[word] = course_bag_of_words[word]
else:
bag_of_words[word] += course_bag_of_words[word]
return bag_of_words
full code on my github simple recommender and search full run.py
from myfuncs import *
# import myfuncs
# get keywords, inverted index and titles
f = open('s2-titles.txt', encoding = "utf8")
titles_lines = f.readlines()
f.close()
bag_of_words = get_bag_of_words(titles_lines)
keywords = get_keywords(titles_lines, bag_of_words)
inverted_index = get_inverted_index(keywords)
titles = get_titles(titles_lines)
# run search query
query = input('Input your search query: ')
while query != '':
query_terms = query.split()
sorted_results = get_search_results(query_terms,
keywords,
inverted_index)
print('==> search results for query:', query)
for result in sorted_results:
print(result, titles[result])
query = input('Input your search query [hit return to finish]: ')
# get unit vectors
f = open('s2-categories.tsv', encoding = "utf8")
categories_lines = f.readlines()
f.close()
unit_vectors = get_dot_product(keywords, categories_lines)
# run recommendation algorithm
seed_courseid = input('Input your seed courseid: ')
while seed_courseid != '':
sorted_results = get_recommendation_results(seed_courseid,
keywords,
inverted_index,
unit_vectors)
print('==> recommendation results:')
for result in sorted_results:
print(result, titles[result])
print(get_dot_product(seed_courseid, result, unit_vectors))
seed_courseid = input('Input seed courseid [hit return to finish]:')
full myfuncs.py
#!/usr/bin/env python
# coding: utf-8
def get_bag_of_words(titles_lines):
bag_of_words = {}
for line in titles_lines[1:]:
courseid, course_bag_of_words = get_course_bag_of_words(line)
for word in course_bag_of_words:
if word not in course_bag_of_words:
bag_of_words[word] = course_bag_of_words[word]
else:
bag_of_words[word] += course_bag_of_words[word]
return bag_of_words
def get_course_bag_of_words(line):
course_bag_of_words = {}
# split by weirdcombo to prevent weird splits
courseid, title, description = line.split('XXXYYYZZZ')
title = title.lower()
description = description.lower()
wordlist = title.split() + description.split()
if len(wordlist) >= 10:
for word in wordlist:
if word not in course_bag_of_words:
course_bag_of_words[word] = 1
else:
course_bag_of_words[word] += 1
return courseid, course_bag_of_words
def get_sorted_results(d):
kv_list = d.items()
vk_list = []
for kv in kv_list:
k, v = kv
vk = v, k
vk_list.append(vk)
vk_list.sort()
vk_list.reverse()
k_list = []
for vk in vk_list[:10]:
v, k = vk
k_list.append(k)
return k_list
def get_keywords(titles_lines, bag_of_words):
n = sum(bag_of_words.values())
keywords = {}
for line in titles_lines[1:]:
courseid, course_bag_of_words = get_course_bag_of_words(line)
term_importance = {}
for word in course_bag_of_words:
tf_course = (float(course_bag_of_words[word]) /
sum(course_bag_of_words.values())
)
tf_overall = float(bag_of_words[word]) / n
term_importance[word] = tf_course / tf_overall
keywords[courseid] = get_sorted_results(term_importance)
return keywords
def get_inverted_index(keywords):
inverted_index = {}
for courseid in keywords:
for keyword in keywords[courseid]:
if keyword not in inverted_index:
inverted_index[keyword] = []
inverted_index[keyword].append(courseid)
return inverted_index
def get_search_results(query_terms, keywords, inverted_index):
search_results = {}
for term in query_terms:
if term in inverted_index:
for courseid in inverted_index[term]:
if courseid not in search_results:
search_results[courseid] = 0.0
search_results[courseid] += (
1 / float(keywords[courseid].index(term) + 1) *
1 / float(query_terms(term) + 1)
)
sorted_results = get_sorted_results(search_results)
return sorted_results
def get_titles(titles_lines):
titles = {}
for line in titles_lines[1:]:
courseid, title, description = line.split('XXXYYYZZZ')
titles[courseid] = title[:60] # take first 60 characters
return titles
def get_unit_vectors(keywords, categories_lines):
norm = 1.884
cat = {}
subcat = {}
for line in categories_lines[1:]:
courseid, category, subcategory = line.split('\t')
cat[courseid] = category.strip()
subcat[courseid] = subcategory.strip()
unit_vectors = {}
for courseid in keywords:
u = {}
if courseid in cat:
u[cat[courseid]] = 1 / norm
u[subcat[courseid]] = 1 / norm
for keyword in keywords[courseid]:
u[keyword] = (
1 /
float(keywords[courseid].index(keyword) + 1) /
norm
)
unit_vectors[courseid] = u
return unit_vectors
def get_dot_product(courseid1, courseid2, unit_vectors):
u1 = unit_vectors[courseid1]
u2 = unit_vectors[courseid2]
dot_product = 0.0
for dimension in u1:
if dimension in u2:
dot_product += u1[dimension] * u2[dimension]
return dot_product
def get_recommendation_results(seed_courseid,
keywords,
inverted_index,
unit_vectors):
courseids = []
for keyword in keywords[seed_courseid]:
for courseid in inverted_index[keyword]:
if courseid not in courseids and courseid != seed_courseid:
courseids.append(courseid)
dot_products = {}
for courseid in courseids:
dot_products[courseids] = get_dot_product(seed_courseid,
courseid,
unit_vectors)
sorted_results = get_sorted_results(dot_products)
return sorted_results