
I am trying to make training my DQN agent faster. At the moment each training loop takes over 3 s per step, and with 300 steps per episode that is about 15 minutes per episode, i.e. roughly 10 days in total to finish the 1000 episodes needed for convergence.

I asked a similar question a while back here, which improved the training speed, but it still takes too long to train the DQN agent. I also tried using MirroredStrategy with 4 CPU cores (I cannot use a GPU on my machine), as suggested here and here, but that made training even slower (over 11 s per training loop).

I stumbled upon this which suggests that using a for loop makes the training slower.
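If I understand that post correctly, the issue is that calling model.fit once per sample inside a Python loop pays the per-call overhead (tracing, callback setup, logging) 32 times per training step, while a single call on the whole minibatch pays it once. A toy illustration of the two patterns (dummy data and a dummy single-input model, not my actual network):

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Dummy model and data, only to compare looped vs. batched fit calls
toy_model = tf.keras.Sequential([tf.keras.Input(shape=(4,)),
                                 layers.Dense(16, activation="relu"),
                                 layers.Dense(2)])
toy_model.compile(loss="mse", optimizer="adam")

X = np.random.rand(32, 4).astype("float32")
y = np.random.rand(32, 2).astype("float32")

# Slow: 32 separate fit calls (what my first training function below does)
for i in range(32):
    toy_model.fit(X[i:i+1], y[i:i+1], epochs=1, verbose=0)

# Faster: one fit call on the whole minibatch
toy_model.fit(X, y, batch_size=32, epochs=1, verbose=0)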

This is my code:

import random
import time
from collections import deque

import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

n_possible_movements = 9
MINIBATCH_SIZE = 32

class DQNAgent(object):
    def __init__(self):
        self.epsilon = 1.0
        self.epsilon_decay = 0.8
        self.epsilon_min = 0.1
        self.learning_rate = 10e-4
        self.tau = 1e-3

        # Main models
        self.model_uav_pos = self._build_pos_model()

        # Target networks
        self.target_model_uav_pos = self._build_pos_model()
        # Copy weights
        self.target_model_uav_pos.set_weights(self.model_uav_pos.get_weights())

        # An array with last n steps for training
        self.replay_memory_pos_nn = deque(maxlen=REPLAY_MEMORY_SIZE)
        
    def _build_pos_model(self): # compile the DNN
        # create the DNN model
        dnn = self.create_pos_dnn()
        
        opt = Adam(learning_rate=self.learning_rate) #, decay=self.epsilon_decay)
        dnn.compile(loss="mse", optimizer=opt)
        dnn.call = tf.function(dnn.call, jit_compile=True)
        
        return dnn
    
    def create_pos_dnn(self): 
        # initialize the input shape
        pos_input_shape = (2,)
        requests_input_shape = (len(env.ues),)
        number_of_satisfied_ues_input_shape = (1,)
        # How many possible outputs we can have
        output_nodes = n_possible_movements
        
        # Initialize the inputs
        uav_current_position = Input(shape=pos_input_shape, name='pos')
        ues_requests = Input(shape=requests_input_shape, name='requests')
        number_of_satisfied_ues = Input(shape=number_of_satisfied_ues_input_shape, name='number_of_satisfied_ues')
        
        # Put them in a list
        list_inputs = [uav_current_position, ues_requests, number_of_satisfied_ues]
        
        # Merge all input features into a single large vector
        x = layers.concatenate(list_inputs)
        
        # Add a 1st Hidden (Dense) Layer
        dense_layer_1 = Dense(512, activation="relu")(x)
        
        # Add a 2nd Hidden (Dense) Layer
        dense_layer_2 = Dense(512, activation="relu")(dense_layer_1)
        
        # Add a 3rd Hidden (Dense) Layer
        dense_layer_3 = Dense(256, activation="relu")(dense_layer_2)
        
        # Output layer
        output_layer = Dense(output_nodes, activation="linear")(dense_layer_3)

        model = Model(inputs=list_inputs, outputs=output_layer)
                        
        # return the DNN
        return model
    
    def remember_pos_nn(self, state, action, reward, next_state, done):
        self.replay_memory_pos_nn.append((state, action, reward, next_state, done)) # list of previous experiences, enabling re-training later
        
    def act_upon_choosing_a_new_position(self, state): # state is a tuple (uav_position, requests_array, number_satisfaction)
        if np.random.rand() <= self.epsilon: # if acting randomly, take random action
            return random.randrange(n_possible_movements)
        pos =  np.array([state[0]])
        reqs =  np.array([state[1]])
        number_satisfaction = np.array([state[2]])
        act_values = self.model_uav_pos([pos, reqs, number_satisfaction]) # if not acting randomly, predict reward value based on current state
        return np.argmax(act_values[0]) 

    def target_train(self):
        weights = self.model_uav_pos.get_weights()
        target_weights = self.target_model_uav_pos.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
        self.target_model_uav_pos.set_weights(target_weights)

This is the training function before I tried the solution in the link:

def train_pos_nn(self):
        print("In Training..")

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory_pos_nn) < MIN_REPLAY_MEMORY_SIZE:
            print("Exiting Training: Replay Memory Not Full Enough...")
            return

        # Get a minibatch of random samples from memory replay table
        list_memory = list(self.replay_memory_pos_nn)
        random.shuffle(list_memory)
        minibatch = random.sample(list_memory, MINIBATCH_SIZE)

        start_time = time.time()
        # Enumerate our batches
        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
            print('...Starting Training...')
            target = 0
            pos =  np.array([current_state[0]])
            reqs =  np.array([current_state[1]])
            number_satisfaction = np.array([current_state[2]])
            pos_next = np.array([new_current_state[0]])
            reqs_next = np.array([new_current_state[1]])
            number_satisfaction_next = np.array([new_current_state[2]])
    
            # If not a terminal state, get new q from future states, otherwise set it to 0
            # almost like with Q Learning, but we use just part of equation here
            if not done:
                print("Predict Next State")
                target = reward + DISCOUNT * np.amax(self.target_model_uav_pos([pos_next, reqs_next, number_satisfaction_next]))
            else:
                target = reward

            # Update Q value for given state
            print("Predict State")
            target_f = self.model_uav_pos([pos, reqs, number_satisfaction])
            target_f = np.array(target_f)
            target_f[0][action] = target

            self.model_uav_pos.fit([pos, reqs, number_satisfaction], \
                                   target_f, \
                                   verbose=2, \
                                   shuffle=False, \
                                   callbacks=None, \
                                   epochs=1 \
                                  )  
        end_time = time.time()
        print("Time", end_time - start_time)
        # Soft-update the target network weights
        self.target_train()

This is the training function after I tried the solution suggested in the link:

def train_pos_nn(self):
        print("In Training..")

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory_pos_nn) < MIN_REPLAY_MEMORY_SIZE:
            print("Exiting Training: Replay Memory Not Full Enough...")
            return

        # Get a minibatch of random samples from memory replay table
        list_memory = list(self.replay_memory_pos_nn)
        random.shuffle(list_memory)
        
        # Draw a sample
        samples = random.sample(list_memory, MINIBATCH_SIZE)
        
        start_time = time.time()
        # Prepare the batch
        state, action, reward, new_state, done = zip(*samples)
        nstate = []
        for n_state in new_state:
            pos_next = np.array([n_state[0]])
            reqs_next = np.array([n_state[1]])
            number_satisfaction_next = np.array([n_state[2]])
            nstate.append([pos_next,reqs_next,number_satisfaction_next])
        done = np.array(done)[:,None]
        state = np.concatenate(state)
        reward = np.array(reward)[:,None]
        q_future = self.target_model_uav_pos(nstate)
        targets = reward + self.gamma*np.max(q_future, axis=1, keepdims=True)
        
        # Fit the model
        self.model.fit(state, targets, epochs=1, verbose=2)
        
        end_time = time.time()
        print("Time", end_time - start_time)
        # Soft-update the target network weights
        self.target_train()

The prediction call q_future = self.target_model_uav_pos(nstate) gives me the following error: ValueError: Layer "model_19" expects 3 input(s), but it received 96 input tensors.
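My guess is that nstate ends up as a flat Python list of 32 × 3 = 96 separate arrays, while the functional model expects exactly three inputs, each stacked along a batch dimension of 32. This is a rough, untested sketch of how I think the minibatch would have to be assembled instead (variable names follow my code above; DISCOUNT is my discount factor, and I added a (1 - done) mask to handle terminal states the way my original loop did):

# Untested sketch: stack each of the three inputs across the minibatch so the
# model receives 3 arrays of shape (32, ...) instead of a flat list of 96 arrays
pos_next = np.array([s[0] for s in new_state])                          # (32, 2)
reqs_next = np.array([s[1] for s in new_state])                         # (32, len(env.ues))
satisfaction_next = np.array([s[2] for s in new_state]).reshape(-1, 1)  # (32, 1)

q_future = self.target_model_uav_pos([pos_next, reqs_next, satisfaction_next])
q_target = reward + DISCOUNT * np.max(q_future, axis=1, keepdims=True) * (1 - done)

# The current states need the same stacking before a single fit call
pos = np.array([s[0] for s in state])
reqs = np.array([s[1] for s in state])
satisfaction = np.array([s[2] for s in state]).reshape(-1, 1)

# Only overwrite the Q-value of the action that was actually taken
target_f = np.array(self.model_uav_pos([pos, reqs, satisfaction]))
target_f[np.arange(MINIBATCH_SIZE), np.array(action)] = q_target.squeeze()

self.model_uav_pos.fit([pos, reqs, satisfaction], target_f, epochs=1, verbose=2)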

How do I do it correctly? And is there any other way to speed up the training process?
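On the second part, one other change I am considering (untested) is doing the soft update of the target network in place on the variables, instead of going through get_weights/set_weights on every call:

# Untested alternative to my target_train(): update the target variables in place
def target_train(self):
    for target_var, var in zip(self.target_model_uav_pos.variables,
                               self.model_uav_pos.variables):
        target_var.assign(self.tau * var + (1.0 - self.tau) * target_var)

My thinking is that this avoids building and copying full weight lists in Python on every update, but I have not measured whether it actually matters.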

Ness

0 Answers