I am trying to adapt this tutorial code : https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html into different enviroment, however I cannot learn the model because it gives me two different crashes so far:
This error usually occurs when BATCH_SIZE is lowered to 4(or other lower number) after the agent makes 4 actions:
in optimize_model
non_final_next_states = torch.cat([s for s in batch.next_state
NotImplementedError: There were no tensor arguments to this function (e.g., you passed an empty list of Tensors), but no fallback function is registered for schema aten::_cat. This usually means that this function requires a non-empty list of Tensors, or that you (the operator writer) forgot to register a fallback function. Available functions are [CPU, QuantizedCPU, BackendSelect, Named, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, UNKNOWN_TENSOR_TYPE_ID, AutogradMLC, AutogradHPU, AutogradNestedTensor, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, Tracer, Autocast, Batched, VmapMode].
Other error occurs, when I leave BATCH_SIZE>=128 and it gives me after few actions:
action = select_action(state)
File main.py in select_action
return policy_net(state).max(1)[1].view(1, 1)
File "\torch\nn\modules\module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File main.py, line 175, in forward
x = F.relu(self.bn2(self.conv2(x)))
File torch\nn\modules\module.py, line 1051, in _call_impl
return forward_call(*input, **kwargs)
File torch\nn\modules\conv.py, line 443, in forward
return self._conv_forward(input, self.weight, self.bias)
File torch\nn\modules\conv.py, line 439, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
RuntimeError: Given groups=1, weight of size [32, 16, 5, 5], expected input[1, 32, 118, 183] to have 16 channels, but got 32 channels instead
Most of the code I took from the tutorial.
BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10
_, _, screen_height, screen_width = None,None,None,None
target_net=None
device="cpu"
steps_done = 0
def select_action(state):
global steps_done
sample = random.random()
eps_threshold = EPS_END + (EPS_START - EPS_END) * \
math.exp(-1. * steps_done / EPS_DECAY)
steps_done += 1
if sample > eps_threshold:
with torch.no_grad():
# t.max(1) will return largest column value of each row.
# second column on max result is index of where max element was
# found, so we pick action with the larger expected reward.
return policy_net(state).max(1)[1].view(1, 1)
else:
return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)
episode_durations = []
def optimize_model():
if len(memory) < BATCH_SIZE:
return
transitions = memory.sample(BATCH_SIZE)
# Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
# detailed explanation). This converts batch-array of Transitions
# to Transition of batch-arrays.
batch = Transition(*zip(*transitions))
# Compute a mask of non-final states and concatenate the batch elements
# (a final state would've been the one after which simulation ended)
non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
batch.next_state)), device=device, dtype=torch.bool)
non_final_next_states = torch.cat([s for s in batch.next_state
if s is not None])
state_batch = torch.cat(batch.state)
action_batch = torch.cat(batch.action)
reward_batch = torch.cat(batch.reward)
# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
# columns of actions taken. These are the actions which would've been taken
# for each batch state according to policy_net
state_action_values = policy_net(state_batch).gather(1, action_batch)
# Compute V(s_{t+1}) for all next states.
# Expected values of actions for non_final_next_states are computed based
# on the "older" target_net; selecting their best reward with max(1)[0].
# This is merged based on the mask, such that we'll have either the expected
# state value or 0 in case the state was final.
next_state_values = torch.zeros(BATCH_SIZE, device=device)
next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
# Compute the expected Q values
expected_state_action_values = (next_state_values * GAMMA) + reward_batch
# Compute Huber loss
criterion = nn.SmoothL1Loss()
loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
# Optimize the model
optimizer.zero_grad()
loss.backward()
for param in policy_net.parameters():
param.grad.data.clamp_(-1, 1)
optimizer.step()
main function:
global screen_height, screen_width,n_actions,policy_net ,target_net,optimizer,memory
n_actions = len(gym.actions)
screen_height, screen_width , _ = gym.take_screen("screen/shot.jpg",True).shape
policy_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net = DQN(screen_height, screen_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()
optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(50)
num_episodes = 50
for i_episode in range(num_episodes):
# Initialize the environment and state
last_screen = gym.take_screen("screen/shot.jpg")
current_screen = gym.take_screen("screen/shot.jpg")
state = current_screen - last_screen
for t in count():
# Select and perform an action
action = select_action(state)
print(action)
#_, reward, done, _ = getattr(gym, gym.actions[ action.item()])() # get score
getattr(gym, gym.actions[action.item()])()
reward = torch.tensor([gym.last_update_on_score], device=device)
done = True
# Observe new state
last_screen = current_screen
current_screen = gym.take_screen("screen/shot.jpg")
if not done:
next_state = current_screen - last_screen
else:
next_state = None
if (gym.get_score() and gym.last_score()):
print("gameover")
#time.sleep(5)
gym.game_quit()
break
# Store the transition in memory
memory.push(state, action, next_state, reward)
# Move to the next state
state = next_state
# Perform one step of the optimization (on the policy network)
optimize_model()
if done:
episode_durations.append(t + 1)
#plot_durations()
break
# Update the target network, copying all weights and biases in DQN
if i_episode % TARGET_UPDATE == 0:
target_net.load_state_dict(policy_net.state_dict())
take_screen function
def take_screen(self, fname):
self.driver.save_screenshot(fname)
# load the image
# Resize, and add a batch dimension (BCHW)
image = Image.open(fname)
data = asarray(image).transpose((2, 0, 1))[:3:]
print(data.shape)
screen = np.ascontiguousarray(data, dtype=np.float32) / 255
screen = torch.from_numpy(screen)
return resize(screen).unsqueeze(0)
DQN class:
class DQN(nn.Module):
def __init__(self, h, w, outputs):
super(DQN, self).__init__()
self.conv1 = nn.Conv2d(3, 32, kernel_size=5,stride=2 )
self.bn1 = nn.BatchNorm2d(32)
self.conv2 = nn.Conv2d(16, 32, kernel_size=5, stride=2)
self.bn2 = nn.BatchNorm2d(32)
self.conv3 = nn.Conv2d(32, 32, kernel_size=5, stride=2)
self.bn3 = nn.BatchNorm2d(32)
# Number of Linear input connections depends on output of conv2d layers
# and therefore the input image size, so compute it.
def conv2d_size_out(size, kernel_size = 5, stride = 2):
return (size - (kernel_size - 1) - 1) // stride + 1
convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
linear_input_size = convw * convh * 1
self.head = nn.Linear(linear_input_size, outputs)
# Called with either one element to determine next action, or a batch
# during optimization. Returns tensor([[left0exp,right0exp]...]).
def forward(self, x):
#x = x.unsqueeze(0)
x = F.relu(self.bn1(self.conv1(x)))
x = F.relu(self.bn2(self.conv2(x)))
x = F.relu(self.bn3(self.conv3(x)))
return self.head(x.view(x.size(0), -1))
Rest of the code remained unchanged or is not imho related to the error(mostly the GYM enviroment). Can I ask you for helping me out what is wrong with my code? Thank you!!!