I have book.csv file which consists some bibliographic lists of books. I also have user table in database which consists user information need. my aim is to do tf-idf, cosine similarity between user information need from database table as query and book.csv row as document and print out most similar rows with user information need when user_Id is inserted. so I've some problem on setting csv raws as document. any help please with this errorIndexError: list index out of range
. the other problem is even when I insert right User_Id it replays error messages until I reached on number of that user. i.e if user is on 3rd in database table I've to try three times like this
insert User_Id
JU/MF3024/04
no such User exist
insert User_Id
JU/MF3024/04
insert User_Id
JU/MF3024/04
no such User exist
Fit Vectorizer to train set [[0 1]
[1 0]]
Transform Vectorizer to test set [[0 0]
[0 0]
here is my implementation code in python 2.7.11. I used some codes from Python: tf-idf-cosine: to find document similarity
from sklearn.feature_extraction. text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
import numpy.linalg as LA
import pandas as pd
from nltk.corpus import stopwords
from collections import defaultdict
import csv
import mysql.connector as sql
from mysql.connector import connection
with open("Book.csv", "rb") as books:
reader = csv.reader(books, delimiter =',')
reader. next()
count = 0
docs = {}
for row in reader:
docs = row[1].split()#I want to consider each row as document similar to train set on the above linked post
query = "" # like test_set on the above post
config = {'user': 'root', 'password': '929255@Tenth', 'host': '127.0.0.1','database': 'juls', 'raise_on_warnings': True,}
db = ql.connect(**config)
cursor = db.cursor()
query = "SELECT * FROM user"
cursor.execute(query)
result = cursor.fetchall()
for r in result:
User_Id = r[0]
First_Name = r[1]
Last_Name = r[2]
College = r[3]
Department = r[4]
Info_need = r[5]
email = r[6]
print "insert User_Id"
Id = str(raw_input())
if Id not in User_Id:
print "no such User exist"
pass
elif Id =="":
print "User ID is blank"
pass
else:
query = "SELECT Info_need from user WHERE User_Id = '%s'" % Id
cursor.execute(query)
stopWords = set(stopwords.words('english'))
vectorizer = CountVectorizer(stop_words = stopWords)
transformer = TfidfTransformer()
trainVectorizerArray = vectorizer.fit_transform(docs).toarray()
testVectorizerArray = vectorizer.transform(query).toarray()
print 'Fit Vectorizer to train set', trainVectorizerArray
print 'Transform Vectorizer to test set', testVectorizerArray
cx = lambda a, b : round(np.inner(a, b)/(LA.norm(a)*LA.norm(b)), 3)
for vector in trainVectorizerArray:
for testV in testVectorizerArray:
cosine = cx(vector, testV)
transformer.fit(trainVectorizerArray)
transformer.fit(testVectorizerArray)
tfidf = transformer.transform(testVectorizerArray)
print tfidf.todense()
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(docs)
print "RANKED TF-IDF"
print tfidf[0:1]
cosine_similarities = linear_kernel( tfidf[ 0: 1], tfidf). flatten()
print cosine_similarities
related_docs_indices = cosine_similarities.argsort()[:-5:-1]
print related_docs_indices
print cosine_similarities[related_docs_indices]
print docs[14]