I am testing with words that the dictionary does not contain, such as "Polytechnic" and "Diploma", and the similarity comes out as 0%. I have searched for a way to add new words to the dictionary, but I have not been able to find one.
Here is the function that I am calling:
def similarityChecker(txt1, txt2):
    """Return a 0-100 TF-IDF similarity score of ``txt2`` against ``txt1``.

    ``txt1`` is split into sentences that form the reference corpus; a gensim
    dictionary, TF-IDF model and similarity index are built from it. Each
    sentence of ``txt2`` is then queried against that index.

    Scoring: for every query sentence, its similarities to all reference
    sentences are summed and divided by the number of reference sentences.
    Those per-sentence averages are summed (not averaged) over all query
    sentences, multiplied by 100, rounded, and capped at 100.

    Args:
        txt1: Reference text, passed to ``sentence`` for splitting.
        txt2: Query text, passed to ``sentence`` for splitting.

    Returns:
        An integer percentage in the range 0-100. Returns 0 when the
        reference text yields no vocabulary at all.

    Note:
        ``sentence`` and ``removestop`` are helpers defined elsewhere in this
        file (presumably nltk-based sentence splitting and stop-word removal
        -- confirm against their definitions).
    """
    file_docs = list(sentence(txt1))
    print("Number of sentence:", len(file_docs))

    gen_docs = [[w.lower() for w in removestop(text)] for text in file_docs]
    dictionary = gensim.corpora.Dictionary(gen_docs)
    if len(dictionary) == 0:
        # No reference vocabulary (empty text or only stop words): nothing can
        # match, and building an index with num_features=0 or dividing by an
        # empty sentence count below would fail.
        return 0

    corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    tf_idf = gensim.models.TfidfModel(corpus)
    # Debug output: the TF-IDF weight of every term in each reference sentence.
    for doc in tf_idf[corpus]:
        print([[dictionary[token_id], np.around(freq, decimals=2)] for token_id, freq in doc])

    # Build the index. An output_prefix of None lets gensim choose a random
    # path in the temp directory instead of writing shard files to the
    # filesystem root ('/').
    sims = gensim.similarities.Similarity(None, tf_idf[corpus], num_features=len(dictionary))

    file2_docs = list(sentence(txt2))
    print("Number of sentence:", len(file2_docs))

    avg_sims = []
    for line in file2_docs:
        query_doc = [w.lower() for w in removestop(line)]
        # doc2bow silently drops tokens that are not in the dictionary built
        # from txt1, so a query sentence sharing no words with the reference
        # text becomes an empty vector and scores 0.
        query_doc_bow = dictionary.doc2bow(query_doc)
        query_doc_tf_idf = tf_idf[query_doc_bow]
        # Query the index once and reuse the result for printing and summing.
        similarities = sims[query_doc_tf_idf]
        print('Comparing Result:', similarities)
        sum_of_sims = np.sum(similarities, dtype=np.float32)
        avg = sum_of_sims / len(file_docs)
        print(f'avg: {avg}')
        avg_sims.append(avg)

    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    total_avg = np.sum(avg_sims, dtype=np.float64)
    result = round(float(total_avg) * 100)
    # The summed averages can exceed 1.0, so clamp the percentage.
    return min(result, 100)
Some of the helper functions I added call nltk, and those are working. I am new to gensim and would really appreciate some help.