I am trying to create a custom PyEnvironment to make an agent learn the optimum hour to send a notification to a user, based on the rewards received from clicks on the notifications sent over the previous 7 days.
After training is complete, the agent always picks the same hour as its action, even when different states are fed in.
State structure: the last 7 (hour, response) pairs for the user, flattened into a single array.
E.g. 7,0,14,1,5,0,3,0,23,0,9,1,12,0 [reward 0 at hour 7, reward 1 at hour 14, and so on]
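To make the state layout concrete, here is a tiny sketch (the decode_state helper is only illustrative, not part of the code below) that splits such a flattened state back into (hour, response) pairs:

def decode_state(state):
  # Split the flat state [h1, r1, h2, r2, ...] back into (hour, response) pairs.
  return list(zip(state[0::2], state[1::2]))

decode_state([7, 0, 14, 1, 5, 0, 3, 0, 23, 0, 9, 1, 12, 0])
# -> [(7, 0), (14, 1), (5, 0), (3, 0), (23, 0), (9, 1), (12, 0)]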
Hyperparameters:
duration = 1 # @param {type:"integer"}
discount = 0.99 # @param {type:"number"}
state_size = 7 # @param {type:"integer"}
num_episodes = 1 # @param {type:"integer"}
learning_rate = 1e-5 # @param {type:"number"}
num_eval_episodes = 100 # @param {type:"integer"}
replay_buffer_max_length = 100000 # @param {type:"integer"}
initial_collect_steps = 100 # @param {type:"integer"}
batch_size = 10 # @param {type:"integer"}
num_iterations = 1000 # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
log_interval = 500 # @param {type:"integer"}
eval_interval = 500 # @param {type:"integer"}
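(The imports assumed throughout are the standard TF-Agents / TensorFlow ones:)

import numpy as np
import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import py_environment, tf_py_environment
from tf_agents.networks import sequential
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import array_spec, tensor_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.utils import common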
Below is the code of the NotifEnv class:
class NotifEnv(py_environment.PyEnvironment):

  def __init__(self, duration, discount, state_size):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=23, name='action')
    self._state_spec = array_spec.BoundedArraySpec(
        shape=(2*state_size, ), dtype=np.float32, minimum=0.0, maximum=1.0, name='state')
    self._discount_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.float32, minimum=0.0, maximum=1.0, name='discount')
    self.step_count = 0
    self.duration = duration
    self.state_size = state_size
    self.discount = discount
    self._episode_ended = False
    self.state = np.array([0]*2*self.state_size, dtype=np.float32)

  def observation_spec(self):
    """Return state_spec."""
    return self._state_spec

  def action_spec(self):
    """Return action_spec."""
    return self._action_spec

  def _reset(self):
    """Return initial_time_step."""
    self.state = np.array([0]*2*self.state_size, dtype=np.float32)
    self._episode_ended = False
    self.step_count = 0
    return ts.restart(np.array(self.state, dtype=np.float32))
  def _step(self, action):
    """Apply action and return new time_step."""
    if action < 0 or action > 23:
      raise ValueError('`action` should be between 0 and 23.')
    self.step_count += 1
    curr_reward = get_reward(action)
    # Shift the history left by one (hour, response) pair and append the new one.
    self.state[:-2] = self.state[2:]
    self.state[-2:] = [action+1, curr_reward]
    if self.step_count >= self.duration:
      self._episode_ended = True
      self.step_count = 0
      return ts.termination(np.array(self.state, dtype=np.float32), reward=curr_reward)
    else:
      return ts.transition(np.array(self.state, dtype=np.float32), discount=self.discount, reward=curr_reward)
Reward function (a reward is given only if the chosen hour of the day is < 12, otherwise the agent is penalised):

def get_reward(action):
  if action < 12:
    return 1
  else:
    return -0.3
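(Optional sanity check, not part of the training code: TF-Agents ships a spec validator that steps the environment with random actions and raises if any observation or action falls outside the declared specs.)

from tf_agents.environments import utils

check_env = NotifEnv(duration=duration, discount=discount, state_size=state_size)
utils.validate_py_environment(check_env, episodes=5)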
Model Architecture :
train_env = tf_py_environment.TFPyEnvironment(NotifEnv(duration=duration, discount=discount, state_size=state_size))
eval_env = tf_py_environment.TFPyEnvironment(NotifEnv(duration=duration, discount=discount, state_size=state_size))
def compute_avg_return(environment, policy, num_episodes=10):
  total_return = 0.0
  for _ in range(num_episodes):
    time_step = environment.reset()
    episode_return = 0.0
    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return
  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]
Training :
fc_layer_params = (6,)
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
num_actions = 24  # one Q-value per hour of the day (0-23)
# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))
# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
train_step_counter = tf.Variable(0)
print("train_step_counter = ",train_step_counter)
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)
agent.initialize()
Initialising Replay Buffer :
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=20)  # replay_buffer_max_length
def collect_step(environment, policy, buffer):
  time_step = environment.current_time_step()
  print("time_step = "+str(time_step))
  action_step = policy.action(time_step)
  print("action_step = "+str(action_step))
  next_time_step = environment.step(action_step.action)
  print("next_time_step = "+str(next_time_step))
  traj = trajectory.from_transition(time_step, action_step, next_time_step)
  print("traj = "+str(traj))
  print("\n \n")
  # Add trajectory to the replay buffer
  buffer.add_batch(traj)

def collect_data(env, policy, buffer, steps):
  for _ in range(steps):
    print("NUM : "+str(_))
    collect_step(env, policy, buffer)
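(random_policy is not shown above; presumably the standard TF-Agents random policy used for the initial collection, e.g.:)

random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())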
collect_data(train_env, random_policy, replay_buffer, initial_collect_steps)
Training Run :
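(dataset is likewise not shown above; presumably the usual TF-Agents pattern of sampling two-step sub-trajectories from the replay buffer for DQN, e.g.:)

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)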
iterator = iter(dataset)

agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

print("log_interval = ", log_interval)
print("eval_interval = ", eval_interval)
print("returns = ", returns)

for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  # collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))
    print(np.max(q_net.get_weights()[0]))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)
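A quick way to see the behaviour described at the top (the same hour chosen regardless of the state) is to roll the trained greedy policy through the eval environment and print each state/action pair; this is only an illustrative check, not part of the training code:

time_step = eval_env.reset()
for _ in range(10):
  action_step = agent.policy.action(time_step)  # greedy policy after training
  print("state =", time_step.observation.numpy(),
        " -> chosen hour =", action_step.action.numpy())
  time_step = eval_env.step(action_step.action)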