I'm trying to use google/flan-t5-large to create embeddings for a simple semantic search engine, but the cosine similarity between my query embedding and the corpus embeddings is way off. Is there something I'm doing wrong?
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean

tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-large')
model = AutoModel.from_pretrained('google/flan-t5-large')
# Encode each text and mean-pool the model's last hidden state into one vector
def generate_embeddings(text):
    all_embeddings = []
    for i in text:
        input_ids = tokenizer.encode(i, return_tensors='pt')
        with torch.no_grad():
            embeddings = model(input_ids, decoder_input_ids=input_ids).last_hidden_state.mean(dim=1)
        all_embeddings.append((embeddings, i))
    return all_embeddings
# Embed the query the same way and score it against every corpus embedding
def run_query(query, corpus):
    input_ids = tokenizer.encode(query, return_tensors='pt')
    with torch.no_grad():
        query_embedding = model(input_ids, decoder_input_ids=input_ids).last_hidden_state.mean(dim=1)
    similarity = []
    for embeds in corpus:
        sim = euclidean(embeds[0].flatten(), query_embedding.flatten())
        similarity.append((embeds[1], float(sim)))
    return similarity
text = ['some sad song', 'a very happy song']
corpus = generate_embeddings(text)
query = "I'm feeling so sad rn"
similarity = run_query(query, corpus)
for i in similarity:
    print(i[0], i[1])  # text, distance to the query
I've tried different pooling techniques as well as other distance metrics, but the rankings still don't match my intuition.
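For example, one pooling variant looked roughly like this: take only the encoder's hidden states, mean-pool them over the non-padding tokens, and compare with cosine similarity. It reuses the tokenizer and model loaded above; the helper name encoder_mean_pool is just for illustration.

# Sketch of one variant: encoder-only hidden states, attention-mask-weighted
# mean pooling, cosine similarity (encoder_mean_pool is an illustrative name)
def encoder_mean_pool(text):
    inputs = tokenizer(text, return_tensors='pt')
    with torch.no_grad():
        # run only the encoder; no decoder_input_ids needed
        hidden = model.encoder(**inputs).last_hidden_state   # (1, seq_len, d_model)
    mask = inputs['attention_mask'].unsqueeze(-1)             # (1, seq_len, 1)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1)       # (1, d_model)

query_vec = encoder_mean_pool("I'm feeling so sad rn")
for song in ['some sad song', 'a very happy song']:
    song_vec = encoder_mean_pool(song)
    sim = torch.nn.functional.cosine_similarity(query_vec, song_vec)
    print(song, float(sim))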