10

I am trying to train a CNN in PyTorch, but I have run into a problem during training. The RuntimeError:

RuntimeError: CUDA out of memory. Tried to allocate 512.00 MiB (GPU 0; 2.00 GiB total capacity; 584.97 MiB already allocated; 13.81 MiB free; 590.00 MiB reserved in total by PyTorch)

This is my code:

import os
import numpy as np
import cv2
import torch as t
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader,Dataset
import time
import matplotlib.pyplot as plt
%matplotlib inline
# Location of the Lee ML20 HW3 "food-11" classification dataset and its
# three splits (training/validation carry labels, testing does not).
root_path='C:/Users/60960/Desktop/recet-task/course_LeeML20/course_LeeML20-datasets/hw3/food-11'
training_path = f'{root_path}/training'
testing_path = f'{root_path}/testing'
validation_path = f'{root_path}/validation'
def readfile(path,has_label):
    """Load every image under *path*, resized to 128x128 (BGR, uint8).

    The food-11 filenames follow "<class>_<index>.jpg", so when
    has_label is True the class id is parsed from the name and the
    function returns (images, labels); otherwise only the image array
    is returned.
    """
    filenames = sorted(os.listdir(path))
    images = np.zeros((len(filenames), 128, 128, 3), dtype=np.uint8)
    labels = np.zeros(len(filenames), dtype=np.uint8)
    for index, name in enumerate(filenames):
        raw = cv2.imread(path + '/' + name)  # cv2 loads as BGR
        images[index] = cv2.resize(raw, (128, 128))
        if has_label:
            labels[index] = int(name.split('_')[0])
    if not has_label:
        return images
    return images, labels
def show_img(img_from_cv2):
    """Display an OpenCV image with matplotlib.

    cv2 stores channels as BGR while matplotlib expects RGB, so the
    channel axis is reversed before plotting.
    """
    rgb = img_from_cv2[:, :, ::-1]  # BGR -> RGB
    plt.imshow(rgb)
    plt.show()
# Read all three splits from disk; only training/validation have labels.
x_train,y_train=readfile(training_path,True)
x_val,y_val=readfile(validation_path,True)
x_test=readfile(testing_path,False)
# Training-time augmentation: random horizontal flip plus up to +/-15
# degrees of rotation, then conversion to a [0,1] float tensor.
train_transform=transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor()
])
# Validation/test pipeline: deterministic tensor conversion only.
test_transform=transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor()
])
class ImgDataset(Dataset):
    """Wrap numpy image arrays (and optional labels) as a torch Dataset.

    x: array of HxWxC uint8 images; y: optional integer labels, stored
    as a LongTensor; transform: optional callable applied per sample.
    Items are (image, label) pairs when labels exist, else just images.
    """

    def __init__(self, x, y=None, transform=None):
        self.x = x
        self.y = None if y is None else t.LongTensor(y)
        self.transform = transform

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        sample = self.x[idx]
        if self.transform is not None:
            sample = self.transform(sample)
        if self.y is None:
            return sample
        return sample, self.y[idx]
# Mini-batch size shared by all DataLoaders.
batch_size=128
# Training data is augmented and reshuffled each epoch; validation data
# uses the deterministic transform and a fixed order.
train_set=ImgDataset(x_train,y_train,transform=train_transform)
val_set=ImgDataset(x_val,y_val,transform=test_transform)
train_loader=DataLoader(train_set,batch_size=batch_size,shuffle=True)
val_loader=DataLoader(val_set,batch_size=batch_size,shuffle=False)
class Classifier(nn.Module):
    """VGG-style CNN for 11-way classification of 128x128 images.

    Five Conv-BN-ReLU-MaxPool blocks each halve the spatial size,
    reducing a 3x128x128 input to 512x4x4, followed by a three-layer
    fully connected head producing the 11 class logits.
    """

    def __init__(self):
        super(Classifier, self).__init__()
        # Build the conv stack from the channel progression; each block
        # is Conv(3x3, stride 1, pad 1) -> BatchNorm -> ReLU -> MaxPool(2).
        channels = [3, 64, 128, 256, 512, 512]
        blocks = []
        for c_in, c_out in zip(channels[:-1], channels[1:]):
            blocks += [
                nn.Conv2d(c_in, c_out, 3, 1, 1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                nn.MaxPool2d(2, 2, 0),
            ]
        self.cnn = nn.Sequential(*blocks)
        self.fc = nn.Sequential(
            nn.Linear(512 * 4 * 4, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 11),
        )

    def forward(self, x):
        features = self.cnn(x)
        flat = features.view(features.size(0), -1)
        return self.fc(flat)
# Train on the training split for 30 epochs, reporting train/val
# accuracy and average loss after every epoch.
model=Classifier().cuda()
loss_fn=nn.CrossEntropyLoss()
optim=t.optim.Adam(model.parameters(),lr=0.001)
epochs=30
for epoch in range(epochs):
    tic = time.time()
    train_acc, train_loss = 0.0, 0.0
    val_acc, val_loss = 0.0, 0.0
    model.train()
    for batch in train_loader:
        optim.zero_grad()
        logits = model(batch[0].cuda())
        loss = loss_fn(logits, batch[1].cuda())
        loss.backward()
        optim.step()
        # Accuracy is accumulated on CPU from the argmax of the logits.
        train_acc += np.sum(np.argmax(logits.cpu().data.numpy(), axis=1) == batch[1].numpy())
        train_loss += loss.item()
    model.eval()
    with t.no_grad():
        for batch in val_loader:
            logits = model(batch[0].cuda())
            loss = loss_fn(logits, batch[1].cuda())
            val_acc += np.sum(np.argmax(logits.cpu().data.numpy(), axis=1) == batch[1].numpy())
            val_loss += loss.item()
        print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % (epoch + 1, epochs, time.time()-tic, train_acc/len(train_set), train_loss/len(train_set), val_acc/len(val_set), val_loss/len(val_set)))
# Retrain a fresh model on the combined train+validation data for the
# final predictions.
x_train_val=np.concatenate((x_train,x_val),axis=0)
y_train_val=np.concatenate((y_train,y_val),axis=0)
# BUG FIX: x_train_val was passed twice, so the images were used as the
# labels; the label argument must be y_train_val.
train_val_set=ImgDataset(x_train_val,y_train_val,train_transform)
train_val_loader=DataLoader(train_val_set,batch_size=batch_size,shuffle=True)
model_final=Classifier().cuda()
# BUG FIX: nn.CrossEntropy does not exist (AttributeError at runtime);
# the loss class is nn.CrossEntropyLoss.
loss_fn=nn.CrossEntropyLoss()
optim=t.optim.Adam(model_final.parameters(),lr=0.001)
epochs=30
for epoch in range(epochs):
    epoch_start_time=time.time()
    train_acc=0.0
    train_loss=0.0
    model_final.train()
    for i,data in enumerate(train_val_loader):
        optim.zero_grad()
        train_pred=model_final(data[0].cuda())
        batch_loss=loss_fn(train_pred,data[1].cuda())
        batch_loss.backward()
        optim.step()
        # Accumulate correct-prediction count and summed batch loss.
        train_acc+=np.sum(np.argmax(train_pred.cpu().data.numpy(),axis=1)==data[1].numpy())
        train_loss+=batch_loss.item()
    print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f' % (epoch + 1, epochs, time.time()-epoch_start_time,train_acc/len(train_val_set), train_loss/len(train_val_set)))
# Run the final model over the unlabeled test split and write the
# predicted class for each image to predict.csv.
test_set=ImgDataset(x_test,transform=test_transform)
test_loader=DataLoader(test_set,batch_size=batch_size,shuffle=False)
model_final.eval()
prediction=[]
with t.no_grad():
    for i,data in enumerate(test_loader):
        test_pred=model_final(data.cuda())
        test_label=np.argmax(test_pred.cpu().data.numpy(),axis=1)
        prediction.extend(test_label)
with open('predict.csv','w') as f:
    f.write('Id,Category\n')
    for i,y in enumerate(prediction):
        # BUG FIX: the format string was '{},{}\n,' — the trailing comma
        # after the newline corrupted every following CSV row.
        f.write('{},{}\n'.format(i,y))

PyTorch version is 1.4.0; OpenCV version is 4.2.0.
The training dataset consists of pictures like these: training set

The error happens at this line:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-1-770be67177f4> in <module>
    119     for i,data in enumerate(train_loader):
    120         optim.zero_grad()
--> 121         train_pred=model(data[0].cuda())
    122         batch_loss=loss_fn(train_pred,data[1].cuda())
    123         batch_loss.backward()

I have already installed: some information.
GPU utilization is low,close to zero: GPU utilization.
Error message says:

RuntimeError: CUDA out of memory. Tried to allocate 512.00 MiB.

So I want to know how to allocate more memory.
What's more, I have tried to reduce the batch size to 1, but this doesn't work. HELP!!!

ilke444
  • 2,641
  • 1
  • 17
  • 31
Wargrave Justice
  • 101
  • 1
  • 1
  • 3
  • It means you don't have enough GPU RAM to hold your model in memory. What type of GPU do you have? – jodag Apr 15 '20 at 18:35
  • My GPU's information is here: https://i.loli.net/2020/04/16/1i8whHmfkxV3S9p.png – Wargrave Justice Apr 16 '20 at 01:24
  • Your GPU only has 2GB of GPU RAM which is simply not enough to train modern deep 2d conv nets. To reduce the memory footprint I would advise reducing the number of channels in your linear layers since these tend to take a lot of memory. – jodag Apr 16 '20 at 05:26

2 Answers2

3

Try reducing your batch_size (e.g. 32). This can happen because your GPU memory can't hold all the tensors needed for a single batch (inputs, activations, and gradients).

shaw2thefloor
  • 600
  • 5
  • 20
Surya Mahadi
  • 274
  • 1
  • 4
3

Before reducing the batch size, check the status of GPU memory:

nvidia-smi

Then check which process is eating up the memory, choose its PID, and kill that process with

sudo kill -9 PID

or

sudo fuser -v /dev/nvidia*

sudo kill -9 PID