I'm trying to fine-tune RoBERTa and integrate external knowledge via a BiGRU block, but the model is not learning: the training loss hovers around 0.8 and never decreases. The data itself is fine; I have fine-tuned other RoBERTa-based models on the same dataset and they trained without any problems.
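For reference, that baseline check looked roughly like this (a simplified sketch, not my exact script): a stock RobertaForSequenceClassification fine-tuned on the same input_ids / attention_mask / labels learns without issues.

# Simplified baseline check (sketch): a stock classifier on the same tensors trains fine,
# so the data pipeline itself appears to be OK.
from transformers import RobertaForSequenceClassification

baseline = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=3).to(device)
baseline_optimizer = torch.optim.AdamW(baseline.parameters(), lr=1e-5)
baseline.train()
for batch in train_dataloader:
    input_ids, opinion_ids, input_mask, emotion_ids, label_ids = batch
    out = baseline(input_ids=input_ids.to(device),
                   attention_mask=input_mask.to(device),
                   labels=label_ids.to(device))
    out.loss.backward()
    baseline_optimizer.step()
    baseline_optimizer.zero_grad()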
Here is the architecture:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import PreTrainedModel, RobertaModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class CustomRoberta(PreTrainedModel):
    def __init__(self, config, num_labels, max_em_len_1, max_em_len_2, no_bert_layers=2):
        super(CustomRoberta, self).__init__(config)
        self.num_labels = num_labels
        self.bert = RobertaModel.from_pretrained("roberta-large")
        self.hidden_size = self.config.hidden_size
        # external-knowledge embeddings, one vector per emotion/opinion id
        self.emotion_embeddings = nn.Embedding(max_em_len_1, self.config.hidden_size)
        self.opinion_embeddings = nn.Embedding(max_em_len_2, self.config.hidden_size)
        self.dropout = nn.Dropout(0.1)
        # BiGRU hidden size is 3 * hidden_size, two directions, three poolings (avg_pool, max_pool, last hidden state)
        self.dense = nn.Linear(self.config.hidden_size * 3 * 3 * 2, num_labels)
        self.apply(self.init_bert_weights)
        self.linear = nn.Linear(self.config.hidden_size, num_labels)
        self.gru = nn.GRU(self.config.hidden_size * 3, self.config.hidden_size * 3,
                          bidirectional=True, batch_first=True)
    def init_bert_weights(self, module):
        """Initialize the weights."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            nn.init.xavier_uniform_(module.weight)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()
    def get_att(self, hidden_states, emotion_embd):
        # attention over the hidden states conditioned on the emotion embedding
        # (note: self.att_lin is not defined in __init__ and this method is not called in forward)
        concat = torch.cat([hidden_states, emotion_embd], -1)
        g = self.att_lin(concat)
        alpha = F.softmax(g, dim=0)
        att_hidden = alpha * hidden_states
        return att_hidden
    def forward(self, input_ids, opinion_ids, attention_mask, emotion_ids, return_indices=False):
        bert_encoded_layers_raw = self.bert(input_ids, attention_mask).last_hidden_state
        bert_encoded_layers = self.dropout(bert_encoded_layers_raw)
        # look up the external-knowledge embeddings and concatenate them onto the RoBERTa states
        emotion_embeddings = self.emotion_embeddings(emotion_ids)
        opinion_embeddings = self.opinion_embeddings(opinion_ids)
        eks = torch.cat((opinion_embeddings, emotion_embeddings), -1)
        concat = torch.cat((bert_encoded_layers, eks), -1)
        gru_all_hidden, gru_last_hidden = self.gru(
            concat, torch.zeros(2, concat.shape[0], self.config.hidden_size * 3).to(device)
        )
        # pool the BiGRU outputs three ways: last hidden state of both directions, mean, and max
        gru_last_hidden_dir0 = gru_last_hidden[0, :, :]
        gru_last_hidden_dir1 = gru_last_hidden[1, :, :]
        gru_last_hidden_stacked = torch.cat(
            (gru_last_hidden_dir0, gru_last_hidden_dir1), dim=1
        )
        gru_avg = torch.mean(gru_all_hidden, dim=1)
        gru_max, _ = torch.max(gru_all_hidden, dim=1)
        gru_complete_concatted = torch.cat(
            (gru_last_hidden_stacked, gru_avg, gru_max), dim=1
        )
        logits = self.dense(gru_complete_concatted)
        return logits
Here is the training code:
model = CustomRoberta(config=config, num_labels=3, max_em_len_1=3, max_em_len_2=4)
model = model.to(device)

# weight decay on everything except biases and LayerNorm parameters
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.1},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

batch_size = 16
epochs = 5
total_steps = len(train_dataloader) * epochs

optimizer = AdamW(optimizer_grouped_parameters, lr=5e-6)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps)
loss_fn = torch.nn.CrossEntropyLoss()
for epoch in range(epochs):
    loss_epoch = []
    for step, batch in enumerate(train_dataloader):
        model.train()
        input_ids, opinion_ids, input_mask, emotion_ids, label_ids = batch
        logits = model(input_ids=input_ids.to(device), opinion_ids=opinion_ids.to(device),
                       attention_mask=input_mask.to(device), emotion_ids=emotion_ids.to(device))
        loss = loss_fn(logits, label_ids.to(device))
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        loss_epoch.append(loss.item())
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
What I've already checked and tried:
- whether the weights actually change during backpropagation (they do)
- whether the DataLoader works as expected, with shuffle=True
- overfitting the model on a tiny subset for many epochs (it cannot overfit at all; see the sketch at the end of this post)
- decreasing and increasing the learning rate
- deleting the entire GRU part and training only RoBERTa plus the final dense layer
None of these worked. Please help!!
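For reference, the tiny-subset overfitting check mentioned above was roughly this (a simplified sketch, hyperparameters approximate):

# Take a handful of batches and train on them repeatedly; a healthy model should drive
# the loss toward zero on such a small set, but this one stays stuck around 0.8.
tiny_batches = [batch for _, batch in zip(range(4), train_dataloader)]
model.train()
for _ in range(200):
    for batch in tiny_batches:
        input_ids, opinion_ids, input_mask, emotion_ids, label_ids = batch
        logits = model(input_ids=input_ids.to(device), opinion_ids=opinion_ids.to(device),
                       attention_mask=input_mask.to(device), emotion_ids=emotion_ids.to(device))
        loss = loss_fn(logits, label_ids.to(device))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(loss.item())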