I am trying to create a custom PyEnvironment to make an agent learn the optimum hour to send a notification to a user, based on the rewards received from clicks on the notifications sent over the previous 7 days.
After training is complete, the agent always picks the same hour as its action, even when different states are fed in.
State structure: the last 7 (hour, response) pairs for the user, flattened into a single array.
E.g. 7,0,14,1,5,0,3,0,23,0,9,1,12,0 [reward 0 at hour 7, reward 1 at hour 14, and so on]
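To make the state layout concrete, here is a tiny sketch (the decode_state helper is only illustrative, not part of the code below) that splits such a flattened state back into (hour, response) pairs:

def decode_state(state):
  # Split the flat state [h1, r1, h2, r2, ...] back into (hour, response) pairs.
  return list(zip(state[0::2], state[1::2]))

decode_state([7, 0, 14, 1, 5, 0, 3, 0, 23, 0, 9, 1, 12, 0])
# -> [(7, 0), (14, 1), (5, 0), (3, 0), (23, 0), (9, 1), (12, 0)]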
Hyperparameters:
duration = 1 # @param {type:"integer"}
discount = 0.99 # @param {type:"number"}
state_size = 7 # @param {type:"integer"}
num_episodes = 1 # @param {type:"integer"}
learning_rate = 1e-5 # @param {type:"number"}
num_eval_episodes = 100 # @param {type:"integer"}
replay_buffer_max_length = 100000 # @param {type:"integer"}
initial_collect_steps = 100 # @param {type:"integer"}
batch_size = 10 # @param {type:"integer"}
num_iterations = 1000 # @param {type:"integer"}
collect_steps_per_iteration = 1 # @param {type:"integer"}
log_interval = 500 # @param {type:"integer"}
eval_interval = 500 # @param {type:"integer"}
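(The imports assumed throughout are the standard TF-Agents / TensorFlow ones:)

import numpy as np
import tensorflow as tf

from tf_agents.agents.dqn import dqn_agent
from tf_agents.environments import py_environment, tf_py_environment
from tf_agents.networks import sequential
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.specs import array_spec, tensor_spec
from tf_agents.trajectories import time_step as ts
from tf_agents.trajectories import trajectory
from tf_agents.utils import common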
Below is the code of the NotifEnv class:
class NotifEnv(py_environment.PyEnvironment):

  def __init__(self, duration, discount, state_size):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=23, name='action')
    self._state_spec = array_spec.BoundedArraySpec(
        shape=(2*state_size, ), dtype=np.float32, minimum=0.0, maximum=1.0, name='state')
    self._discount_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.float32, minimum=0.0, maximum=1.0, name='discount')
    self.step_count = 0
    self.duration = duration
    self.state_size = state_size
    self.discount = discount
    self._episode_ended = False
    self.state = np.array([0]*2*self.state_size, dtype=np.float32)

  def observation_spec(self):
    """Return state_spec."""
    return self._state_spec

  def action_spec(self):
    """Return action_spec."""
    return self._action_spec

  def _reset(self):
    """Return initial_time_step."""
    self.state = np.array([0]*2*self.state_size, dtype=np.float32)
    self._episode_ended = False
    self.step_count = 0
    return ts.restart(np.array(self.state, dtype=np.float32))
  def _step(self, action):
    """Apply action and return new time_step."""
    if action < 0 or action > 23:
      raise ValueError('`action` should be between 0 and 23.')
    self.step_count += 1
    curr_reward = get_reward(action)
    # Shift the history left by one (hour, response) pair and append the new one.
    self.state[:-2] = self.state[2:]
    self.state[-2:] = [action+1, curr_reward]
    if self.step_count >= self.duration:
      self._episode_ended = True
      self.step_count = 0
      return ts.termination(np.array(self.state, dtype=np.float32), reward=curr_reward)
    else:
      return ts.transition(np.array(self.state, dtype=np.float32), discount=self.discount, reward=curr_reward)
Reward function (a reward is given only if the chosen hour of the day is < 12, otherwise the agent is penalised):

def get_reward(action):
  if action < 12:
    return 1
  else:
    return -0.3
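(Optional sanity check, not part of the training code: TF-Agents ships a spec validator that steps the environment with random actions and raises if any observation or action falls outside the declared specs.)

from tf_agents.environments import utils

check_env = NotifEnv(duration=duration, discount=discount, state_size=state_size)
utils.validate_py_environment(check_env, episodes=5)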
Model Architecture :
train_env = tf_py_environment.TFPyEnvironment(NotifEnv(duration=duration, discount=discount, state_size=state_size))
eval_env = tf_py_environment.TFPyEnvironment(NotifEnv(duration=duration, discount=discount, state_size=state_size))
def compute_avg_return(environment, policy, num_episodes=10):
  total_return = 0.0
  for _ in range(num_episodes):
    time_step = environment.reset()
    episode_return = 0.0
    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment.step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return
  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]
Training :
fc_layer_params = (6,)
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
num_actions = 24  # one Q-value per hour of the day (0-23)
# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))
# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
train_step_counter = tf.Variable(0)
print("train_step_counter = ",train_step_counter)
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)
agent.initialize()
Initialising Replay Buffer :
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=20)  # replay_buffer_max_length
def collect_step(environment, policy, buffer):
  time_step = environment.current_time_step()
  print("time_step = "+str(time_step))
  action_step = policy.action(time_step)
  print("action_step = "+str(action_step))
  next_time_step = environment.step(action_step.action)
  print("next_time_step = "+str(next_time_step))
  traj = trajectory.from_transition(time_step, action_step, next_time_step)
  print("traj = "+str(traj))
  print("\n \n")
  # Add trajectory to the replay buffer
  buffer.add_batch(traj)

def collect_data(env, policy, buffer, steps):
  for _ in range(steps):
    print("NUM : "+str(_))
    collect_step(env, policy, buffer)
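(random_policy is not shown above; presumably the standard TF-Agents random policy used for the initial collection, e.g.:)

random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                train_env.action_spec())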
collect_data(train_env, random_policy, replay_buffer, initial_collect_steps)
Training Run :
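(dataset is likewise not shown above; presumably the usual TF-Agents pattern of sampling two-step sub-trajectories from the replay buffer for DQN, e.g.:)

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=batch_size,
    num_steps=2).prefetch(3)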
iterator = iter(dataset)

agent.train = common.function(agent.train)

# Reset the train step
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
returns = [avg_return]

print("log_interval = ", log_interval)
print("eval_interval = ", eval_interval)
print("returns = ", returns)

for _ in range(num_iterations):

  # Collect a few steps using collect_policy and save to the replay buffer.
  # collect_data(train_env, agent.collect_policy, replay_buffer, collect_steps_per_iteration)

  # Sample a batch of data from the buffer and update the agent's network.
  experience, unused_info = next(iterator)
  train_loss = agent.train(experience).loss

  step = agent.train_step_counter.numpy()

  if step % log_interval == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))
    print(np.max(q_net.get_weights()[0]))

  if step % eval_interval == 0:
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    print('step = {0}: Average Return = {1}'.format(step, avg_return))
    returns.append(avg_return)
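A quick way to see the behaviour described at the top (the same hour chosen regardless of the state) is to roll the trained greedy policy through the eval environment and print each state/action pair; this is only an illustrative check, not part of the training code:

time_step = eval_env.reset()
for _ in range(10):
  action_step = agent.policy.action(time_step)  # greedy policy after training
  print("state =", time_step.observation.numpy(),
        " -> chosen hour =", action_step.action.numpy())
  time_step = eval_env.step(action_step.action)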