I am trying to calculate the cosine similarity between a title and a search query stored in a pandas DataFrame, but I am struggling to find an efficient way to do it. My current approach runs very slowly, and I feel there must be a better way.
My code looks like:
import tensorflow_hub as hub
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow_text  # registers the ops required by the multilingual USE model
# Load Google universal sentence encoder for semantic similarity
USE = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
# Load title and search queries from a test file
data = pd.read_json('1.json')
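# For illustration only (made-up values, not my real data): each row of 1.json
# has a title together with the search queries that led to it, roughly like
#   title: "red running shoes", query: ["red shoes", "running sneakers"]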
def use_filter(data, min_sim=0.80):
    """Use the Google universal sentence encoder to compare the similarity between
    title and query, then filter the dataframe based on cosine similarity.

    Args:
        data (DataFrame): Dataframe of clicklog data, pre-filtered for low similarity
        min_sim (float): minimum level of similarity required to keep a row

    Returns:
        data (DataFrame): Dataframe of clicklog data filtered for low similarity
    """
    x = []
    compare = dict(zip(data['title'], data['query']))
    for key, value in compare.items():
        for i in value:
            # Embed the title and the query one pair at a time, then take their cosine similarity
            cos_sim = cosine_similarity(USE([key]), USE([i])).flatten()[0]
            x.append(cos_sim)
    data['cos'] = x
    data = data[data['cos'] > min_sim]
    return data
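For reference, this is the direction I was imagining to avoid calling the encoder once per row, but I am not sure it is the right approach. It is an untested sketch (the name use_filter_batched is just for illustration): it assumes one query string per row rather than a list of queries per title, and it reuses the USE model and np imported above.
def use_filter_batched(data, min_sim=0.80):
    """Untested sketch: embed all titles and all queries in one call each,
    then compute the row-wise cosine similarity with numpy."""
    title_emb = USE(data['title'].tolist()).numpy()
    query_emb = USE(data['query'].tolist()).numpy()
    # Cosine similarity of each title with its own query (row-wise, not pairwise)
    norms = np.linalg.norm(title_emb, axis=1) * np.linalg.norm(query_emb, axis=1)
    data = data.copy()
    data['cos'] = np.sum(title_emb * query_emb, axis=1) / norms
    return data[data['cos'] > min_sim]
Would batching the embeddings like this be the standard way to speed this up, or is there a better pattern for pandas?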