I have text data on which I want to perform sentiment analysis. With three classes (-1, 0, 1), I would like to create embeddings and compute the centroid of each class, so that new data can be assigned to a class by its cosine similarity to those centroids. Any ideas?
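For reference, the assignment rule I have in mind is (my own notation): the centroid of class $c$ is the mean of that class's sentence embeddings, and a new sentence $x^*$ gets the label of the most similar centroid:

$$\mu_c = \frac{1}{|S_c|} \sum_{x \in S_c} e(x), \qquad \hat{y}(x^*) = \arg\max_{c \in \{-1, 0, 1\}} \cos\big(e(x^*), \mu_c\big)$$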
I am trying to create the embeddings using MPNet (sentence-transformers/all-mpnet-base-v2).
This is the code I tried:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
# Mean pooling - take the attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # first element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
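# Quick sanity check I added (dummy tensors, not real data): a batch of 2
# sequences, 4 tokens, and 768-dim hidden states to match all-mpnet-base-v2.
# Masked positions should not contribute to the mean.
_dummy_tokens = torch.randn(2, 4, 768)
_dummy_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
print(mean_pooling((_dummy_tokens,), _dummy_mask).shape)  # torch.Size([2, 768])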
# Classify data by assigning each embedding to its most similar centroid
# (NOTE: the threshold argument is currently unused; argmax alone decides)
def classify_data(embeddings, centroids, threshold):
    similarity_scores = torch.cosine_similarity(embeddings.unsqueeze(1), centroids.unsqueeze(0), dim=2)
    return similarity_scores.argmax(dim=1)
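# Toy check of classify_data (my own 2-D vectors, not real embeddings):
# each row should be assigned the index of its nearest centroid.
_emb = F.normalize(torch.tensor([[1.0, 0.0], [0.0, 1.0]]), dim=1)
_cents = F.normalize(torch.tensor([[1.0, 0.1], [0.1, 1.0], [-1.0, 0.0]]), dim=1)
print(classify_data(_emb, _cents, threshold=0.33))  # expected: tensor([0, 1])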
# Load model from HuggingFace Hub
tokenizer_mpnet = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model_mpnet = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')
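# As far as I know from_pretrained already returns the model in eval mode,
# but I set it explicitly to be safe (avoids dropout noise while encoding)
model_mpnet.eval()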
# Example DataFrame with 'sentence' and 'label' columns
data = {
    'sentence': [
        'This is a positive sentence',
        'Each sentence is neutral',
        'Another example negative sentence',
        'More sentences to test',
    ],
    'label': [1, 0, -1, 0],  # sentiment labels: 1 = positive, 0 = neutral, -1 = negative
}
df = pd.DataFrame(data)
# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
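# NOTE: with only 4 example rows this split is purely illustrative; on real
# data I would pass stratify=df['label'] so that every class appears in the
# training set (a class with no training rows would make its centroid the
# mean of an empty slice, i.e. NaN)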
# Tokenize sentences using MPNet
encoded_input_mpnet = tokenizer_mpnet(train_df['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
# Compute token embeddings using MPNet
with torch.no_grad():
    model_output_mpnet = model_mpnet(**encoded_input_mpnet)
# Perform pooling for embeddings using MPNet
sentence_embeddings_mpnet = mean_pooling(model_output_mpnet, encoded_input_mpnet['attention_mask'])
# Normalize embeddings
sentence_embeddings_mpnet = F.normalize(sentence_embeddings_mpnet, p=2, dim=1)
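# After normalization (p=2) every sentence embedding has unit length. The
# class centroids below (means of unit vectors) are generally not unit
# length themselves, but torch.cosine_similarity re-normalizes internally,
# so the comparison is still a true cosine similarity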
# Compute the centroids of each class for MPNet
centroids_mpnet = []
for label in [-1, 0, 1]:
    # Indexing the tensor with the raw pandas Series is what raises the
    # KeyError (the Series keeps the shuffled DataFrame index), so convert
    # it to a plain boolean tensor mask first
    mask = torch.tensor((train_df['label'] == label).to_numpy())
    centroids_mpnet.append(sentence_embeddings_mpnet[mask].mean(dim=0))
# Tokenize testing sentences using MPNet
encoded_input_test = tokenizer_mpnet(test_df['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')
# Compute token embeddings for testing sentences using MPNet
with torch.no_grad():
    model_output_test = model_mpnet(**encoded_input_test)
# Perform pooling for testing embeddings using MPNet
sentence_embeddings_test = mean_pooling(model_output_test, encoded_input_test['attention_mask'])
# Normalize testing embeddings
sentence_embeddings_test = F.normalize(sentence_embeddings_test, p=2, dim=1)
# Classify the test data by nearest centroid for MPNet
threshold_mpnet = 0.33
predicted_indices = classify_data(sentence_embeddings_test, torch.stack(centroids_mpnet), threshold_mpnet)
# argmax returns centroid positions 0/1/2, so map them back to the original
# labels -1/0/1 before scoring against test_df['label']
label_order = [-1, 0, 1]
predicted_labels_mpnet = [label_order[i] for i in predicted_indices]
# Calculate precision, recall, and F1-score for each class for MPNet
precision_mpnet, recall_mpnet, f1_score_mpnet, _ = precision_recall_fscore_support(test_df['label'], predicted_labels_mpnet, labels=label_order, average=None, zero_division=0)
print("MPNet Precision:")
print(precision_mpnet)
print("MPNet Recall:")
print(recall_mpnet)
print("MPNet F1-score:")
print(f1_score_mpnet)
The error I was getting was a KeyError inside the for label in [-1, 0, 1]: loop, at this line:
centroid = sentence_embeddings_mpnet[train_df['label'] == label].mean(dim=0)
Should I index with the labels like this or not?
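As an aside, if the sentence-transformers package is installed, I believe the SentenceTransformer wrapper performs this same mean pooling and normalization in a single call (the model card for all-mpnet-base-v2 documents the identical pooling), so it could replace the manual steps above:

from sentence_transformers import SentenceTransformer

st_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
train_embeddings = st_model.encode(train_df['sentence'].tolist(), normalize_embeddings=True, convert_to_tensor=True)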