1

I am trying to adapt this tutorial code : https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html into different enviroment, however I cannot learn the model because it gives me two different crashes so far:

This error usually occurs when BATCH_SIZE is lowered to 4(or other lower number) after the agent makes 4 actions:

    in optimize_model
    non_final_next_states = torch.cat([s for s in batch.next_state
NotImplementedError: There were no tensor arguments to this function (e.g., you passed an empty list of Tensors), but no fallback function is registered for schema aten::_cat.  This usually means that this function requires a non-empty list of Tensors, or that you (the operator writer) forgot to register a fallback function.  Available functions are [CPU, QuantizedCPU, BackendSelect, Named, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, UNKNOWN_TENSOR_TYPE_ID, AutogradMLC, AutogradHPU, AutogradNestedTensor, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, Tracer, Autocast, Batched, VmapMode].

Other error occurs, when I leave BATCH_SIZE>=128 and it gives me after few actions:

action = select_action(state)
  File main.py in select_action
    return policy_net(state).max(1)[1].view(1, 1)
  File "\torch\nn\modules\module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File main.py, line 175, in forward
    x = F.relu(self.bn2(self.conv2(x)))
  File torch\nn\modules\module.py, line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File torch\nn\modules\conv.py, line 443, in forward
    return self._conv_forward(input, self.weight, self.bias)
  File torch\nn\modules\conv.py, line 439, in _conv_forward
    return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Given groups=1, weight of size [32, 16, 5, 5], expected input[1, 32, 118, 183] to have 16 channels, but got 32 channels instead

Most of the code I took from the tutorial.

BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10

_, _, screen_height, screen_width = None,None,None,None
target_net=None
device="cpu"
steps_done = 0
def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            return policy_net(state).max(1)[1].view(1, 1)
    else:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)


episode_durations = []

def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

main function:

  global screen_height, screen_width,n_actions,policy_net ,target_net,optimizer,memory
    n_actions = len(gym.actions)
    screen_height, screen_width , _ = gym.take_screen("screen/shot.jpg",True).shape
    policy_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(50)


    num_episodes = 50
    for i_episode in range(num_episodes):
        # Initialize the environment and state

        last_screen = gym.take_screen("screen/shot.jpg")
        current_screen = gym.take_screen("screen/shot.jpg")
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(state)
            print(action)
            #_, reward, done, _ = getattr(gym, gym.actions[ action.item()])() # get score
            getattr(gym, gym.actions[action.item()])()

            reward = torch.tensor([gym.last_update_on_score], device=device)
            done = True
            # Observe new state
            last_screen = current_screen
            current_screen = gym.take_screen("screen/shot.jpg")
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            if (gym.get_score() and gym.last_score()):
                print("gameover")
                #time.sleep(5)
                gym.game_quit()
                break
            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            optimize_model()
            if done:
                episode_durations.append(t + 1)
                #plot_durations()
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

take_screen function

def take_screen(self, fname):

self.driver.save_screenshot(fname)
# load the image

# Resize, and add a batch dimension (BCHW)
image = Image.open(fname)
data = asarray(image).transpose((2, 0, 1))[:3:]
print(data.shape)
screen = np.ascontiguousarray(data, dtype=np.float32) / 255
screen = torch.from_numpy(screen)

return resize(screen).unsqueeze(0)

DQN class:

class DQN(nn.Module):

def __init__(self, h, w, outputs):
    super(DQN, self).__init__()
    self.conv1 = nn.Conv2d(3, 32, kernel_size=5,stride=2 )
    self.bn1 = nn.BatchNorm2d(32)
    self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
    self.bn2 = nn.BatchNorm2d(32)
    self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
    self.bn3 = nn.BatchNorm2d(32)

    # Number of Linear input connections depends on output of conv2d layers
    # and therefore the input image size, so compute it.
    def conv2d_size_out(size, kernel_size = 5, stride = 2):
        return (size - (kernel_size - 1) - 1) // stride  + 1
    convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
    convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
    linear_input_size = convw * convh * 1
    self.head = nn.Linear(linear_input_size, outputs)

# Called with either one element to determine next action, or a batch
# during optimization. Returns tensor([[left0exp,right0exp]...]).
def forward(self, x):
    #x = x.unsqueeze(0)
    x = F.relu(self.bn1(self.conv1(x)))
    x = F.relu(self.bn2(self.conv2(x)))
    x = F.relu(self.bn3(self.conv3(x)))
    return self.head(x.view(x.size(0), -1))

Rest of the code remained unchanged or is not imho related to the error(mostly the GYM enviroment). Can I ask you for helping me out what is wrong with my code? Thank you!!!

  • First mistake is in the convolution layer self.bn2, where it should be self.conv2 = nn.Conv2d(32, 32, kernel_size=5, stride=2). – SombreroMickey Oct 11 '21 at 12:43
  • Second mistake linear_input_size is wrong, should be 37152 in my case. However the first error, when batch reaches BATCH_SIZE persists. – SombreroMickey Oct 11 '21 at 13:19

0 Answers0