I'm trying to fine-tune RoBERTa and integrate external knowledge via a BiGRU block, but the model is not learning: the training loss hovers around 0.8 and never decreases. The data itself is fine; I have fine-tuned other RoBERTa-based models on the same dataset and they trained without any problems.
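For reference, that baseline check looked roughly like this (a simplified sketch, not my exact script): a stock RobertaForSequenceClassification fine-tuned on the same input_ids / attention_mask / labels learns without issues.

# Simplified baseline check (sketch): a stock classifier on the same tensors trains fine,
# so the data pipeline itself appears to be OK.
from transformers import RobertaForSequenceClassification

baseline = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=3).to(device)
baseline_optimizer = torch.optim.AdamW(baseline.parameters(), lr=1e-5)
baseline.train()
for batch in train_dataloader:
    input_ids, opinion_ids, input_mask, emotion_ids, label_ids = batch
    out = baseline(input_ids=input_ids.to(device),
                   attention_mask=input_mask.to(device),
                   labels=label_ids.to(device))
    out.loss.backward()
    baseline_optimizer.step()
    baseline_optimizer.zero_grad()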
Here is the architecture:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedModel, RobertaModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class CustomRoberta(PreTrainedModel):
    def __init__(self, config, num_labels, max_em_len_1, max_em_len_2, no_bert_layers=2):
        super(CustomRoberta, self).__init__(config)
        self.num_labels = num_labels
        self.bert = RobertaModel.from_pretrained("roberta-large")
        self.hidden_size = self.config.hidden_size
        # external-knowledge embeddings, one vector per emotion/opinion id
        self.emotion_embeddings = nn.Embedding(max_em_len_1, self.config.hidden_size)
        self.opinion_embeddings = nn.Embedding(max_em_len_2, self.config.hidden_size)
        self.dropout = nn.Dropout(0.1)
        # BiGRU hidden size is 3 * hidden_size, two directions, three poolings (avg_pool, max_pool, last hidden state)
        self.dense = nn.Linear(self.config.hidden_size * 3 * 3 * 2, num_labels)
        self.apply(self.init_bert_weights)
        self.linear = nn.Linear(self.config.hidden_size, num_labels)
        self.gru = nn.GRU(self.config.hidden_size * 3, self.config.hidden_size * 3,
                          bidirectional=True, batch_first=True)
    def init_bert_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            nn.init.xavier_uniform_(module.weight)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
    def get_att(self, hidden_states, emotion_embd):
        # attention over the hidden states conditioned on the emotion embedding
        # (note: self.att_lin is not defined in __init__ and this method is not called in forward)
        concat = torch.cat([hidden_states, emotion_embd], -1)
        g = self.att_lin(concat)
        alpha = F.softmax(g, dim=0)
        att_hidden = alpha * hidden_states
        return att_hidden
    def forward(self, input_ids, opinion_ids, attention_mask, emotion_ids, return_indices=False):
        bert_encoded_layers_raw = self.bert(input_ids, attention_mask).last_hidden_state
        bert_encoded_layers = self.dropout(bert_encoded_layers_raw)
        # look up the external-knowledge embeddings and concatenate them onto the RoBERTa states
        emotion_embeddings = self.emotion_embeddings(emotion_ids)
        opinion_embeddings = self.opinion_embeddings(opinion_ids)
        eks = torch.cat((opinion_embeddings, emotion_embeddings), -1)
        concat = torch.cat((bert_encoded_layers, eks), -1)
        gru_all_hidden, gru_last_hidden = self.gru(
            concat, torch.zeros(2, concat.shape[0], self.config.hidden_size * 3).to(device)
        )
        # pool the BiGRU outputs three ways: last hidden state of both directions, mean, and max
        gru_last_hidden_dir0 = gru_last_hidden[0, :, :]
        gru_last_hidden_dir1 = gru_last_hidden[1, :, :]
        gru_last_hidden_stacked = torch.cat(
            (gru_last_hidden_dir0, gru_last_hidden_dir1), dim=1
        )
        gru_avg = torch.mean(gru_all_hidden, dim=1)
        gru_max, _ = torch.max(gru_all_hidden, dim=1)
        gru_complete_concatted = torch.cat(
            (gru_last_hidden_stacked, gru_avg, gru_max), dim=1
        )
        logits = self.dense(gru_complete_concatted)
        return logits
Here is the training code:
model = CustomRoberta(config=config, num_labels=3, max_em_len_1=3, max_em_len_2=4)
model = model.to(device)

# weight decay on everything except biases and LayerNorm parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.1},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

batch_size = 16
epochs = 5
total_steps = len(train_dataloader) * epochs

optimizer = AdamW(optimizer_grouped_parameters, lr=5e-6)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps)
loss_fn = torch.nn.CrossEntropyLoss()
for epoch in range(epochs):
    loss_epoch = []
    for step, batch in enumerate(train_dataloader):
        model.train()
        input_ids, opinion_ids, input_mask, emotion_ids, label_ids = batch
        logits = model(input_ids=input_ids.to(device), opinion_ids=opinion_ids.to(device),
                       attention_mask=input_mask.to(device), emotion_ids=emotion_ids.to(device))
        loss = loss_fn(logits, label_ids.to(device))
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        loss_epoch.append(loss.item())
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
What I've already checked and tried:
- whether the weights actually change during backpropagation (they do)
- whether the DataLoader works as expected, with shuffle=True
- overfitting the model on a tiny subset for many epochs (it cannot overfit at all; see the sketch at the end of this post)
- decreasing and increasing the learning rate
- deleting the entire GRU part and training only RoBERTa plus the final dense layer
None of these worked. Please help!!
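For reference, the tiny-subset overfitting check mentioned above was roughly this (a simplified sketch, hyperparameters approximate):

# Take a handful of batches and train on them repeatedly; a healthy model should drive
# the loss toward zero on such a small set, but this one stays stuck around 0.8.
tiny_batches = [batch for _, batch in zip(range(4), train_dataloader)]
model.train()
for _ in range(200):
    for batch in tiny_batches:
        input_ids, opinion_ids, input_mask, emotion_ids, label_ids = batch
        logits = model(input_ids=input_ids.to(device), opinion_ids=opinion_ids.to(device),
                       attention_mask=input_mask.to(device), emotion_ids=emotion_ids.to(device))
        loss = loss_fn(logits, label_ids.to(device))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(loss.item())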