I am using a fairly large GPU with around 80 GB of memory. The training epochs run fine, but for some reason during evaluation (the training and validation sets have more or less the same length), I run out of memory and get this error:
File "/home.../transformers/trainer_pt_utils.py", line 75, in torch_pad_and_concatenate
return torch.cat((tensor1, tensor2), dim=0)
RuntimeError: CUDA out of memory. Tried to allocate 33.84 GiB (GPU 0; 79.35 GiB total
capacity; 36.51 GiB already allocated; 32.48 GiB free; 44.82 GiB reserved in total by
PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to
avoid fragmentation. See documentation for Memory Management and
PYTORCH_CUDA_ALLOC_CONF
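The message suggests trying max_split_size_mb; as far as I understand, that would be set through the PYTORCH_CUDA_ALLOC_CONF environment variable before the first CUDA allocation, roughly like this (512 is just an illustrative value):

import os

# Illustrative value only: cap the size of splits in the caching allocator to reduce
# fragmentation; this must run before anything is placed on the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"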
The training and validation data was created like this:
train_texts, train_labels = read_dataset('basic_train.tsv')
val_texts, val_labels = read_dataset('basic_val.tsv')
train_encodings = tokenizer(train_texts, truncation=False, padding=True)
val_encodings = tokenizer(val_texts, truncation=False, padding=True)
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        ...
        return item
train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)
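The body of __getitem__ is omitted above; it is essentially the standard pattern from the Hugging Face fine-tuning tutorial, i.e. something like:

    def __getitem__(self, idx):
        # every encoding already has the same (maximum) length, since the tokenizer was
        # called with padding=True and truncation=False
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)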
My training code looks like this:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=5e-5,
    logging_dir='./logs',
    logging_steps=10,
    learning_rate=2e-5,
    eval_steps=100,
    save_steps=30000,
    evaluation_strategy='steps'
)
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
metric = load_metric('accuracy')
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)
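# Note: with AutoModelForSeq2SeqLM, the predictions handed to compute_metrics are most
# likely the raw logits of shape (batch, seq_len, vocab_size), so the tensors the
# Trainer pads and concatenates during evaluation are very large.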
def collate_fn_t5(batch):
    input_ids = torch.stack([example['input_ids'] for example in batch])
    attention_mask = torch.stack([example['attention_mask'] for example in batch])
    labels = torch.stack([example['input_ids'] for example in batch])
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # evaluation dataset
    compute_metrics=compute_metrics,
    data_collator=collate_fn_t5,
)
trainer.train()
eval_results = trainer.evaluate()
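From the traceback, the error is raised in torch_pad_and_concatenate, i.e. while the Trainer concatenates the prediction tensors it accumulates over the evaluation batches. Two options that look relevant are eval_accumulation_steps in TrainingArguments (flush accumulated predictions to the CPU every N steps) and the Trainer's preprocess_logits_for_metrics argument (shrink the logits before they are accumulated). A rough sketch of how they would be wired in, with 20 as an arbitrary value and nothing here confirmed to fix the problem:

training_args = TrainingArguments(
    output_dir='./results',
    per_device_eval_batch_size=8,
    evaluation_strategy='steps',
    eval_steps=100,
    eval_accumulation_steps=20,  # move accumulated predictions to the CPU every 20 eval steps
)

def preprocess_logits_for_metrics(logits, labels):
    # reduce the (batch, seq_len, vocab_size) logits to token ids on the GPU, so only the
    # small id tensors are accumulated; compute_metrics then receives ids, not logits
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=collate_fn_t5,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)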