
I'm trying to fine-tune RoBERTa and integrate external knowledge via a BiGRU block, but the model is not learning: the training loss hovers around 0.8 and does not decrease. The data itself is not the problem; I have trained other RoBERTa-based models on the same dataset and they worked fine.
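
For reference, a classifier that predicts my 3 labels uniformly at random would sit at a cross-entropy of about ln(3) ≈ 1.10, so a plateau at 0.8 is only slightly better than chance:

import math
# cross-entropy of a uniform prediction over 3 classes
print(math.log(3))  # ≈ 1.0986; my training loss plateaus at ~0.8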

Here is the architecture:

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedModel, RobertaModel


class CustomRoberta(PreTrainedModel):

    def __init__(self, config, num_labels, max_em_len_1, max_em_len_2, no_bert_layers=2):
        super(CustomRoberta, self).__init__(config)
        self.num_labels = num_labels
        self.bert = RobertaModel.from_pretrained("roberta-large")
        self.hidden_size = self.config.hidden_size
        self.emotion_embeddings = nn.Embedding(max_em_len_1, self.config.hidden_size)
        self.opinion_embeddings = nn.Embedding(max_em_len_2, self.config.hidden_size)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.config.hidden_size * 3 * 3 * 2, num_labels)  # avg_pool, max_pool, last hidden state (bidirectional)
        self.apply(self.init_bert_weights)
        self.linear = nn.Linear(self.config.hidden_size, num_labels)

        self.gru = nn.GRU(self.config.hidden_size * 3, self.config.hidden_size * 3, bidirectional=True, batch_first=True)


    def init_bert_weights(self, module):
        """ Initialize the weights."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            nn.init.xavier_uniform_(module.weight)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def get_att(self, hiddes, emotion_embd):
        # attention over the hidden states using the emotion embedding (not called in forward below)
        concat = torch.cat([hiddes, emotion_embd], -1)
        g = self.att_lin(concat)
        alpha = F.softmax(g, dim=0)
        att_hidden = alpha * hiddes
        return att_hidden

    def forward(self, input_ids, opinion_ids, attention_mask, emotion_ids, return_indices=False):
        bert_encoded_layers_raw = self.bert(input_ids, attention_mask).last_hidden_state 
        
        bert_encoded_layers = self.dropout(bert_encoded_layers_raw)

        
        emotion_embeddings = self.emotion_embeddings(emotion_ids)
        opinion_embeddings = self.opinion_embeddings(opinion_ids)
        eks = torch.cat((opinion_embeddings,  emotion_embeddings), -1)
        concat = torch.cat((bert_encoded_layers, eks), -1)
        gru_all_hidden, gru_last_hidden = self.gru(concat, torch.zeros(2, concat.shape[0], self.config.hidden_size * 3).to(device))

        gru_last_hidden_dir0 = gru_last_hidden[0, :, :]
        gru_last_hidden_dir1 = gru_last_hidden[1, :, :]


        gru_last_hidden_stacked = torch.cat(
            (gru_last_hidden_dir0, gru_last_hidden_dir1), dim=1
        )

        gru_avg = torch.mean(gru_all_hidden, dim=1)
        gru_max, _ = torch.max(gru_all_hidden, dim=1)
        gru_complete_concatted = torch.cat(
            (gru_last_hidden_stacked, gru_avg, gru_max), dim=1
        )
        logits = self.dense(gru_complete_concatted)
        

        return logits
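
To make the concatenation explicit, these are the tensor shapes I expect through forward (assuming roberta-large with hidden size H = 1024, batch size B, sequence length T, and per-token opinion_ids/emotion_ids of shape (B, T)):

# bert_encoded_layers:             (B, T, H)
# opinion/emotion embeddings:      (B, T, H) each
# concat fed to the GRU:           (B, T, 3H)
# gru_all_hidden (bidirectional):  (B, T, 6H)
# gru_last_hidden_stacked:         (B, 6H)
# gru_avg, gru_max:                (B, 6H) each
# gru_complete_concatted:          (B, 18H)  -> matches nn.Linear(hidden_size * 3 * 3 * 2, num_labels)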

Here is the training code:

roberta = CustomRoberta(config=config, num_labels=3, max_em_len_1=3, max_em_len_2=4)
roberta = roberta.to(device)
param_optimizer = list(roberta.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.1},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]


batch_size = 16
epochs = 5
total_steps = len(train_dataloader) * epochs  # one optimizer step per batch
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-6)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps)
loss_fn = torch.nn.CrossEntropyLoss()


for epoch in range(epochs):
    loss_epoch = []
    for step, batch in enumerate(train_dataloader):
        roberta.train()
        input_ids, opinion_ids, input_mask, emotion_ids, label_ids = batch
        logits = roberta(input_ids=input_ids.to(device), opinion_ids=opinion_ids.to(device), attention_mask=input_mask.to(device), emotion_ids=emotion_ids.to(device))
        loss = loss_fn(logits, label_ids.to(device))
        loss.backward()
        nn.utils.clip_grad_norm_(roberta.parameters(), max_norm=1.0)
        loss_epoch.append(loss.item())

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    print(f"epoch {epoch}: mean train loss {sum(loss_epoch) / len(loss_epoch):.4f}")

What I've already checked and double-checked:

  • whether the weights are changing during backpropagation (they are)
  • whether the DataLoader works as expected and that shuffle=True
  • trying to overfit the model on a tiny subset for many epochs (it can't overfit at all; see the sketch below)
  • decreasing and increasing the learning rate
  • deleting the entire GRU part and training only RoBERTa plus the final dense layer
None of these worked. Please help!!
