
Below is my implementation of A2C using PyTorch. From what I have learned about backpropagation in PyTorch, I know to call zero_grad() on the optimizer after each update iteration. However, I still get a RuntimeError about backpropagating through the graph a second time.

def torchworker(number, model):
  worker_env = gym.make("Taxi-v3").env
  max_steps_per_episode = 2000
  worker_opt = optim.Adam(lr=5e-4, params=model.parameters())
  
  p_history = []
  val_history = []
  r_history = []
  
  running_reward = 0
  episode_count = 0
  under = 0

  start = time.time()

  for i in range(2):
    state = worker_env.reset()
    episode_reward = 0
    penalties = 0
    drop = 0
    print("Episode {} begins ({})".format(episode_count, number))
    worker_env.render()
    criterion = nn.SmoothL1Loss()
    
    time_solve = 0

    for _ in range(1, max_steps_per_episode):
      #worker_env.render()
      state = torch.tensor(state, dtype=torch.long)
      action_probs = model.forward(state)[0]
      critic_value = model.forward(state)[1]
      val_history.append((state, critic_value[0]))

      # Choose action
      action = np.random.choice(6, p=action_probs.detach().numpy())
      p_history.append(torch.log(action_probs[action]))

      # Apply chosen action
      state, reward, done, _ = worker_env.step(action)
      r_history.append(reward)
      episode_reward += reward
      time_solve += 1

      if reward == -10:
        penalties += 1
      
      elif reward == 20:
        drop += 1
      
      if done:
        break

    # Update running reward to check condition for solving
    running_reward = (running_reward * (episode_count) + episode_reward) / (episode_count + 1)

    # Calculate discounted returns
    returns = deque(maxlen=3500)
    discounted_sum = 0
    for r in r_history[::-1]:
      discounted_sum = r + gamma * discounted_sum
      returns.appendleft(discounted_sum)

    # Calculate actor losses and critic losses
    loss_actor_value = 0
    loss_critic_value = 0
    history = zip(p_history, val_history, returns)
    for log_prob, value, ret in history:
      diff = ret - value[1]
      loss_actor_value += -log_prob * diff
      ret_tensor = torch.tensor(ret, dtype=torch.float32)
      loss_critic_value += criterion(value[1], ret_tensor)
 
    loss = loss_actor_value + 0.1 * loss_critic_value
    print(loss)
    
    # Update params
    loss.backward()
    worker_opt.step()
    worker_opt.zero_grad()


    # Log details
    end = time.time()
    episode_count += 1
    if episode_count % 1 == 0:
        worker_env.render()

    if running_reward > -50:  # Condition to consider the task solved
        under += 1
    
    if under > 5:
        print("Solved at episode {} !".format(episode_count))
        break

I believe it may have something to do with the architecture of my AC model, so I include it here for reference as well.

class ActorCriticNetwork(nn.Module):
  def __init__(self, num_inputs, num_hidden, num_actions):
    super(ActorCriticNetwork, self).__init__()
    self.embed = nn.Embedding(500, 10)
    self.fc1 = nn.Linear(10, num_hidden * 2)
    self.fc2 = nn.Linear(num_hidden * 2, num_hidden)
    self.c = nn.Linear(num_hidden, 1)
    self.fc3 = nn.Linear(num_hidden, num_hidden)
    self.a = nn.Linear(num_hidden, num_actions)
  
  def forward(self, x):
    out = F.relu(self.embed(x))
    out = F.relu(self.fc1(out))
    out = F.relu(self.fc2(out))
    critic = self.c(out)
    out = F.relu(self.fc3(out.detach()))
    actor = F.softmax(self.a(out), dim=-1)
    return actor, critic

Could you please tell me where the mistake is? Thank you in advance.

1 Answer


SOLVED: I forgot to clear the history of log-probabilities, critic values, and rewards after each iteration. It is clear why that caused the issue: the stale entries made backward() propagate through the old dynamic computation graphs (DCGs), which had already been freed by the previous backward() call.
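
A minimal sketch of that fix, assuming the same p_history, val_history, and r_history lists from the worker above: clear them right after the parameter update so the next call to backward() only walks the graph built during the current episode.

    # Update params
    loss.backward()
    worker_opt.step()
    worker_opt.zero_grad()

    # Clear the episode buffers so the next backward() does not try to
    # walk graphs that were already freed by this backward() call
    p_history.clear()
    val_history.clear()
    r_history.clear()

Re-creating the three lists at the top of each episode would have the same effect; the important point is that no tensor from a previous episode's graph survives into the next loss computation.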