How do I reduce memory usage for deep reinforcement learning algorithms?

Question

I wrote a script of DQN to play BreakoutDeterministic and ran it on my school GPU server. However, it seems that the code is taking up 97% of the total RAM memory (more than 100GB)!

I would like to know which part of the script is demanding this high usage of RAM? I used memory-profiler for 3 episodes and it seems that the memory requirement increases linearly with each time step on my laptop.

I wrote the script in PyCharm, python 3.6. My laptop 12GB RAM with no GPU but the school server is using Ubuntu, p100 GPU.

import gym
import numpy as np
import random
from collections import deque
from keras.layers import Dense, Input, Lambda, convolutional, core
from keras.models import Model
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import os
import time as dt
plt.switch_backend('agg')

def preprocess(state):
    process_state = np.mean(state, axis=2).astype(np.uint8) 
    process_state = process_state[::2, ::2] 
    process_state_size = list(process_state.shape)
    process_state_size.append(1)
    process_state = np.reshape(process_state, process_state_size)
    return process_state


class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.action_size = env.action_space.n
        self.state_size = self.select_state_size()

        self.memory = deque(maxlen=1000000)  # specify memory size
        self.gamma = 0.99
        self.eps = 1.0
        self.eps_min = 0.01
        self.decay = 0.95
        self.lr = 0.00025
        self.start_life = 5 # get from environment

        self.tau = 0.125  # special since 2 models to be trained


        self.model = self.create_cnnmodel()
        self.target_model = self.create_cnnmodel() 

    def select_state_size(self):
        process_state = preprocess(self.env.reset())
        state_size = process_state.shape
        return state_size

    def create_cnnmodel(self):

        data_input = Input(shape=self.state_size, name='data_input', dtype='int32')
        normalized = Lambda(lambda x: x/255)(data_input)  
        conv1 = convolutional.Convolution2D(32, 8, strides=(4, 4), activation='relu')(normalized)  
        conv2 = convolutional.Convolution2D(64, 4, strides=(2,2), activation='relu')(conv1)
        conv3 = convolutional.Convolution2D(64, 3, strides=(1,1), activation='relu')(conv2)
        conv_flatten = core.Flatten()(conv3)  # flatten to feed cnn to fc
        h4 = Dense(512, activation='relu')(conv_flatten)
        prediction_output = Dense(self.action_size, name='prediction_output', activation='linear')(h4)

        model = Model(inputs=data_input, outputs=prediction_output)
        model.compile(optimizer=Adam(lr=self.lr),
                  loss='mean_squared_error') # 'mean_squared_error') keras.losses.logcosh(y_true, y_pred)

        return model

    def remember(self, state, action, reward, new_state, done): # store past experience as a pre-defined table
        self.memory.append([state, action, reward, new_state, done])

    def replay(self, batch_size):
        if batch_size > len(self.memory):
            return

        all_states = []
        all_targets = []
        samples = random.sample(self.memory, batch_size)  
        for sample in samples:
            state, action, reward, new_state, done = sample
            target = self.target_model.predict(state)  
            if done:
                target[0][action] = reward  
            else:                   
                target[0][action] = reward + self.gamma*np.max(self.target_model.predict(new_state)[0])
            all_states.append(state)
            all_targets.append(target)
        history = self.model.fit(np.vstack(all_states), np.vstack(all_targets), epochs=1, verbose=0) 
        return history

    def act(self, state):
        self.eps *= self.decay  
        self.eps = max(self.eps_min, self.eps)
        if np.random.random() < self.eps:
            return self.env.action_space.sample()  
        return np.argmax(self.model.predict(state)[0])  

    def train_target(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = (1-self.tau)*target_weights[i] + self.tau*weights[i] 
        self.target_model.set_weights(target_weights) #


def main(episodes):

    env = gym.make('BreakoutDeterministic-v4')

    agent = DQNAgent(env, cnn)  
    time = env._max_episode_steps
    batch_size = 32

    save_model = 'y'

    filepath = os.getcwd() 
    date = dt.strftime('%d%m%Y')
    clock = dt.strftime('%H.%M.%S')

    print('++ Training started on {} at {} ++'.format(date, clock))
    start_time = dt.time()

    tot_r = []
    tot_loss = []
    it_r = []
    it_loss = []
    tot_frames = 0

    for e in range(episodes):
        r = []
        loss = []

        state = env.reset()
        state = preprocess(state)
        state = state[None,:]

        current_life = agent.start_life

        for t in range(time):
            if rend_env == 'y':

            action = agent.act(state)
            new_state, reward, terminal_life, life = env.step(action)
            new_state = preprocess(new_state)
            new_state = new_state[None,:]

            if life['ale.lives'] < current_life:
                reward = -1
               current_life = life['ale.lives']

            agent.remember(state, action, reward, new_state, terminal_life)

            hist = agent.replay(batch_size)
            agent.train_target()

            state = new_state

            r.append(reward)
            tot_frames += 1

            if hist is None:
                loss.append(0.0)
            else:
                loss.append(hist.history['loss'][0])

            if t%20 == 0:
                print('Frame : {}, Cum Reward = {}, Avg Loss = {}, Curr Life: {}'.format(t,
                                                                                       np.sum(r),
                                                                                       round(np.mean(loss[-20:-1]),3),
                                                                                       current_life))

                agent.model.save('{}/Mod_Fig/DQN_BO_model_{}.h5'.format(filepath, date))
                agent.model.save_weights('{}/Mod_Fig/DQN_BO_weights_{}.h5'.format(filepath, date))

            if current_life == 0 or terminal_life:
                print('Episode {} of {}, Cum Reward = {}, Avg Loss = {}'.format(e, episodes, np.sum(r), np.mean(loss)))
                break

        tot_r.append(np.sum(r))
        tot_loss.append(np.mean(loss))
        it_r.append(r)
        it_loss.append(loss)

    print('Training ended on {} at {}'.format(date, clock))
    run_time = dt.time() - start_time
    print('Total Training time: %d Hrs %d Mins $d s' % (run_time // 3600, (run_time % 3600) // 60),
      (run_time % 3600) % 60 // 1)

    if save_model == 'y':
        agent.model.save('{}/Mod_Fig/DQN_BO_finalmodel_{}_{}.h5'.format(filepath, date, clock))
        agent.model.save_weights('{}/Mod_Fig/DQN_BO_finalweights_{}_{}.h5'.format(filepath, date, clock))

    agent.model.summary()

    return tot_r, tot_loss, it_r, it_loss, tot_frames


if __name__ == '__main__':
    episodes = 3
    total_reward, total_loss, rewards_iter, loss_iter, frames_epi = main(episodes=episodes)

Would really appreciate your comments and help on writing memory and speed efficient deep RL codes! I hope to train my DQN on breakout for 5000 episodes but the remote server only allows maximum of 48 hours of training. Thanks in advance!

score 2 · Answer 1 · answered Jan 26 '19 at 09:01

It sounds like you have a memory leak.

This line

agent.remember(state, action, reward, new_state, terminal_life)

gets called 5000 * env._max_episode_steps times, and each state is a (210, 160, 3) array. The first thing to try would be to reduce the size of self.memory = deque(maxlen=1000000) # specify memory size to verify that this is the sole cause.

If you really believe you need that much capacity, you should dump self.memory to disk and keep a only a small subsample in memory.

Additionally: subsampling from deque is very slow, deque is implemented as a linked list so each subsample is O(N*M). You should consider implementing your own ring buffer for self.memory.

Alternatively: you might consider a probabilistic buffer (I don't know the proper name), where each time you would append to a full buffer, remove an element at random and append the new element. This means any (state, action, reward, ...) tuple that is encountered has a nonzero probability of being contained in the buffer, with recent tuples being more likely than older ones.

score 2 · Answer 2 · answered Jan 30 '19 at 03:41

I had similar problems with memory and I still do.

The main cause of the large memory consumption are the states. But here's what I did to make it better:

Step 1: Resize them to a 84 x 84 sample using openCV. Some people instead downsample the images to 84 x 84. This results in each state having the shape (84,84,3).

Step 2: Convert these frames to grayscale (basically, black and white). This should change the shape to (84,84,1).

Step 3: Use dtype=np.uint8 for storing states. They consume minimal memory and are perfect for the pixel intensity values ranged 0-255.

Additional Info

I run my code on free Google Collab notebooks (K80 Tesla GPU and 13GB RAM), periodically saving the replay buffer to my drive.

For steps 1 and 2, consider using the OpenAI baseline Atari wrappers, as there is no point in reinventing the wheel.

You could also this snippet to check the amount of RAM used by your own program at each step, like I did:

import os
import psutil

def show_RAM_usage(self):
    py = psutil.Process(os.getpid())
    print('RAM usage: {} GB'.format(py.memory_info()[0]/2. ** 30))

This snippet is modified to use in my own program from the original answer

How do I reduce memory usage for deep reinforcement learning algorithms?

2 Answers2