I'm trying to train a DNN in Google Colab because it offers a "Tesla K80" GPU, which should make training much faster than the rather weak GPU in my laptop.
I ran my code and the runtime crashed. There was no error output, just a notification from Colab saying:
"Your session crashed after using all available RAM."
I looked this up and found this question:
Google Colaboratory: misleading information about its GPU (only 5% RAM available to some users)
I checked the available GPU memory using:
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " | Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()
and I got the output:
Gen RAM Free: 11.6 GB | Proc size: 150.0 MB
GPU RAM Free: 11439MB | Used: 0MB | Util 0% | Total 11439MB
which suggests that I have enough GPU RAM (and general RAM) free to train my model.
I tried restarting and resetting the runtime, and I restarted the browser too, but nothing helped: the session kept crashing.
Note: my training data file is 1.4 GB, so I thought loading it might be using up the RAM. I split up my code, loaded the training data first, and then ran the memory check above again; its output was:
Gen RAM Free: 11.6 GB | Proc size: 1.50 GB
GPU RAM Free: 11439MB | Used: 0MB | Util 0% | Total 11439MB
There is still plenty of RAM left, yet the runtime crashed again.
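To make the numbers concrete, here is a small sanity check I could run (just a sketch reusing the same numpy and psutil calls as above) to see how large the data actually becomes in memory once the frames are stacked into a single dense array, since the 1.4 GB on-disk file is not necessarily its in-memory size:
import os
import numpy as np
import psutil

proc = psutil.Process(os.getpid())
print("RSS before load: %.2f GB" % (proc.memory_info().rss / 1e9))

# same file that is loaded in the full code below
training_data = np.load("/drive/My Drive/Colab Notebooks/GTAV/training_data-v0.3.npy",
                        allow_pickle=True)
print("RSS after load:  %.2f GB" % (proc.memory_info().rss / 1e9))

# stack the frames into one dense array, the same way my train_data() does
# (the reshape there does not change the byte count, so it is omitted here)
X = np.array([x[0] for x in training_data])
print("dense X: %.2f GB (dtype %s)" % (X.nbytes / 1e9, X.dtype))

# if the frames are stored as uint8, X/255.0 later produces a float64 copy
print("X as float64 would be about %.2f GB" % (X.size * 8 / 1e9))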
How can I run my model training?
(My full code is below.)
import os
from google.colab import drive
drive.mount("/drive")
os.chdir("/drive/My Drive/Colab Notebooks/GTAV/model")
# checking available GPU memory
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU is visible on Colab, and even that isn't guaranteed
gpu = GPUs[0]
def printm():
    process = psutil.Process(os.getpid())
    print("Gen RAM Free: " + humanize.naturalsize(psutil.virtual_memory().available), " | Proc size: " + humanize.naturalsize(process.memory_info().rss))
    print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm()
# my model and train function
import tensorflow as tf
import tflearn
from tflearn.layers.core import fully_connected, dropout, input_data
from tflearn.layers.conv import conv_2d, max_pool_2d
from tflearn.layers.estimator import regression
from tflearn.layers.normalization import local_response_normalization
###################################################
RUN_ID = "model_alexnetv2-v0.9"
EPOCHS = 10
VS = 2000 #Validation Samples
WIDTH = 160
HEIGHT = 120
CHANNELS = 1
LR = 1e-3
###################################################
def alexnetv2(output=3):
    network = input_data(shape=[None, WIDTH, HEIGHT, CHANNELS], name='input')
    network = conv_2d(network, 96, 11, strides=4, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = conv_2d(network, 256, 5, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 256, 3, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = conv_2d(network, 256, 5, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 384, 3, activation='relu')
    network = conv_2d(network, 256, 3, activation='relu')
    network = max_pool_2d(network, 3, strides=2)
    network = local_response_normalization(network)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, 4096, activation='tanh')
    network = dropout(network, 0.5)
    network = fully_connected(network, output, activation='softmax')
    model = regression(network, optimizer='momentum',
                       loss='categorical_crossentropy',
                       learning_rate=LR, name='targets')
    model = tflearn.DNN(model, checkpoint_path="model-alexnetv2", max_checkpoints=1,
                        tensorboard_verbose=2, tensorboard_dir="model_training_log")
    return model
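# note on train_data() below: np.array(...) stacks every frame into one dense
# array in host RAM, and X/255.0 then makes a floating-point copy of it
# (float64 if the frames are integer-typed), so peak memory use can be several
# times the size of the 1.4 GB .npy file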
def train_data(training_data, model=False):
    X = np.array([x[0] for x in training_data]).reshape(-1, WIDTH, HEIGHT, CHANNELS)
    y = [x[1] for x in training_data]
    print(" >> Samples and Labels created.!")
    train_X = X[-VS:]
    train_y = y[-VS:]
    print(" >> Validation Set created.!")
    X = X[:-VS]
    y = y[:-VS]
    print(" >> Training Set created.!")
    train_X = train_X/255.0
    X = X/255.0
    if not model:
        model = alexnetv2()
    model.fit(X, y, n_epoch=EPOCHS, show_metric=True, snapshot_step=500,
              validation_set=(train_X, train_y), run_id=RUN_ID)
    return model
# loading training data
import numpy as np
import time
start = time.time()
print("Loading data...//")
training_data = np.load("/drive/My Drive/Colab Notebooks/GTAV/training_data-v0.3.npy", allow_pickle=True)
print(f"{len(training_data)} training samples loaded in {np.round(time.time()-start, 2)} seconds.")
# training model
print("-------------------------------")
print("Training model...//")
model = train_data(training_data)
#
os.mkdir(f"/drive/My Drive/Colab Notebooks/GTAV/model{RUN_ID}")
os.chdir(f"/drive/My Drive/Colab Notebooks/GTAV/model/{RUN_ID}")
#
#
print("Saving model...//")
model.save(RUN_ID)