
I have text data on which I want to perform sentiment analysis. With three classes (-1, 0, 1), I would like to create embeddings, compute the centroid of each class, and then assign new data to a class based on cosine similarity to those centroids.

Any ideas?
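To make the intended pipeline concrete, here is roughly the shape I am aiming for (a minimal sketch using the sentence-transformers wrapper; train_sentences, train_labels, and new_sentences are placeholders, not variables from my code below):

from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Embed the labelled sentences; normalized vectors make a dot product equal cosine similarity
emb = model.encode(train_sentences, normalize_embeddings=True)

# One centroid per class in a fixed label order, re-normalized so that
# dot products with the centroids are true cosine similarities
labels = np.array(train_labels)
centroids = np.stack([emb[labels == c].mean(axis=0) for c in (-1, 0, 1)])
centroids /= np.linalg.norm(centroids, axis=1, keepdims=True)

# Assign new sentences to the nearest centroid by cosine similarity
new_emb = model.encode(new_sentences, normalize_embeddings=True)
pred = np.array([-1, 0, 1])[(new_emb @ centroids.T).argmax(axis=1)]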

In my actual attempt I create the embeddings with the raw transformers API and MPNet (all-mpnet-base-v2). This is the code I tried:

import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

# Classify data by assigning each embedding to the nearest centroid under
# cosine similarity (note: the threshold parameter is currently unused)
def classify_data(embeddings, centroids, threshold):
    similarity_scores = torch.cosine_similarity(embeddings.unsqueeze(1), centroids.unsqueeze(0), dim=2)
    return similarity_scores.argmax(dim=1)

# Load model from HuggingFace Hub
tokenizer_mpnet = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model_mpnet = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')

# Example DataFrame with 'sentence' and 'label' columns
data = {
    'sentence': [
        'This is a positive sentence',
        'Each sentence is neutral',
        'Another example negative sentence',
        'More sentences to test',
    ],
    'label': [1, 0, -1, 0],  # Assuming sentiment labels: 1 for positive, 0 for neutral, -1 for negative
}
df = pd.DataFrame(data)

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenize sentences using MPNet
encoded_input_mpnet = tokenizer_mpnet(train_df['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings using MPNet
with torch.no_grad():
    model_output_mpnet = model_mpnet(**encoded_input_mpnet)

# Perform pooling for embeddings using MPNet
sentence_embeddings_mpnet = mean_pooling(model_output_mpnet, encoded_input_mpnet['attention_mask'])

# Normalize embeddings
sentence_embeddings_mpnet = F.normalize(sentence_embeddings_mpnet, p=2, dim=1)

# Compute the centroids of each class for MPNet
centroids_mpnet = []
for label in [-1, 0, 1]:
    centroid = sentence_embeddings_mpnet[train_df['label'] == label].mean(dim=0)
    centroids_mpnet.append(centroid)

# Tokenize testing sentences using MPNet
encoded_input_test = tokenizer_mpnet(test_df['sentence'].tolist(), padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings for testing sentences using MPNet
with torch.no_grad():
    model_output_test = model_mpnet(**encoded_input_test)

# Perform pooling for testing embeddings using MPNet
sentence_embeddings_test = mean_pooling(model_output_test, encoded_input_test['attention_mask'])

# Normalize testing embeddings
sentence_embeddings_test = F.normalize(sentence_embeddings_test, p=2, dim=1)

# Classify new data based on cosine similarity and threshold for MPNet
threshold_mpnet = 0.33
predicted_labels_mpnet = classify_data(sentence_embeddings_test, torch.stack(centroids_mpnet), threshold_mpnet)

# Calculate precision, recall, and F1-score for each class for MPNet
precision_mpnet, recall_mpnet, f1_score_mpnet, _ = precision_recall_fscore_support(test_df['label'], predicted_labels_mpnet, average=None)

print("MPNet Precision:")
print(precision_mpnet)
print("MPNet Recall:")
print(recall_mpnet)
print("MPNet F1-score:")
print(f1_score_mpnet)

The error I was getting: a KeyError, raised inside the loop for label in [-1, 0, 1]: at this line:

    centroid = sentence_embeddings_mpnet[train_df['label'] == label].mean(dim=0)
Should I be indexing the embeddings with the labels like this, or not?
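I suspect the cause is that train_test_split keeps the original DataFrame index, so train_df['label'] == label is a pandas Series whose index no longer lines up with the tensor's positional indexing. The workaround I am experimenting with (converting the mask to a plain numpy array, and also mapping the argmax indices back to the labels -1/0/1 before scoring):

label_order = [-1, 0, 1]  # order in which the centroids are stacked

centroids_mpnet = []
for label in label_order:
    mask = (train_df['label'] == label).to_numpy()  # positional boolean mask
    if mask.any():
        centroids_mpnet.append(sentence_embeddings_mpnet[mask].mean(dim=0))
    else:
        # with this tiny example set a class can be missing from the train
        # split entirely; a zero vector is only a placeholder in that case
        centroids_mpnet.append(torch.zeros(sentence_embeddings_mpnet.size(1)))

# classify_data returns centroid indices 0/1/2, not the labels themselves,
# so map them back before calling precision_recall_fscore_support
pred_idx = classify_data(sentence_embeddings_test, torch.stack(centroids_mpnet), threshold_mpnet)
predicted_labels_mpnet = torch.tensor(label_order)[pred_idx]

Is that the right way to handle the index mismatch, or should I reset the index after the split instead?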
