I am currently studying dynamic programming in reinforcement learning, where I came across the two concepts value iteration and policy iteration. To understand them, I am implementing the gridworld example from Sutton and Barto, which says:
The nonterminal states are S = {1, 2, . . . , 14}. There are four actions possible in each state, A = {up, down, right, left}, which deterministically cause the corresponding state transitions, except that actions that would take the agent off the grid in fact leave the state unchanged. Thus, for instance, p(6, -1|5, right) = 1, p(7, -1|7, right) = 1, and p(10, r|5, right) = 0 for all r ∈ R. This is an undiscounted, episodic task. The reward is -1 on all transitions until the terminal state is reached. The terminal state is shaded in the figure (although it is shown in two places, it is formally one state). The expected reward function is thus r(s, a, s') = -1 for all states s, s' and actions a. Suppose the agent follows the equiprobable random policy (all actions equally likely).
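For reference, the two backups from the book that I am trying to implement are iterative policy evaluation (here under the equiprobable random policy \pi) and the value-iteration update:

v_{k+1}(s) = \sum_a \pi(a \mid s) \sum_{s', r} p(s', r \mid s, a) \left[ r + \gamma \, v_k(s') \right]   (policy evaluation)

v_{k+1}(s) = \max_a \sum_{s', r} p(s', r \mid s, a) \left[ r + \gamma \, v_k(s') \right]   (value iteration)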
Below is my implementation of the two methods:
import numpy as np
import time


class GridWorld:
    def __init__(self, grid_size=5, gamma=0.9, penalty=-1.0, theta=1e-2):
        self.grid_size = grid_size
        self.discount = gamma
        self.actions = [np.array([0, -1]),
                        np.array([-1, 0]),
                        np.array([0, 1]),
                        np.array([1, 0])]
        self.action_prob = 1 / len(self.actions)
        self.theta = theta
        print('action prob : ', self.action_prob)
        self.penalty_reward = penalty
        self.re_init()

    def re_init(self):
        self.values = np.zeros((self.grid_size, self.grid_size))
        self.policy = np.zeros(self.values.shape, dtype=np.int)
    def checkTerminal(self, state):
        x, y = state
        if x == 0 and y == 0:
            return 1
        elif (x == self.grid_size - 1 and y == self.grid_size - 1):
            return 1
        else:
            return 0

    def step(self, state, action):
        #print(state)
        if self.checkTerminal(state):
            next_state = state
            reward = 0
        else:
            next_state = (np.array(state) + action).tolist()
            x, y = next_state
            if x < 0 or x >= self.grid_size or y < 0 or y >= self.grid_size:
                next_state = state
            reward = self.penalty_reward
        return next_state, reward
    def compValueIteration(self):
        new_state_values = np.zeros((self.grid_size, self.grid_size))
        policy = np.zeros((self.grid_size, self.grid_size))
        iter_cnt = 0
        while True:
            #delta = 0
            state_values = new_state_values.copy()
            old_state_values = state_values.copy()
            for i in range(self.grid_size):
                for j in range(self.grid_size):
                    values = []
                    for action in self.actions:
                        (next_i, next_j), reward = self.step([i, j], action)
                        values.append(reward + self.discount * state_values[next_i, next_j])
                    new_state_values[i, j] = np.max(values)
                    policy[i, j] = np.argmax(values)
                    #delta = max(delta, np.abs(old_state_values[i, j] - new_state_values[i, j]))
            delta = np.abs(old_state_values - new_state_values).max()
            print(f'Difference: {delta}')
            if delta < self.theta:
                break
            iter_cnt += 1
        return new_state_values, policy, iter_cnt
    def policyEvaluation(self, policy, new_state_values):
        #new_state_values = np.zeros((self.grid_size, self.grid_size))
        iter_cnt = 0
        while True:
            delta = 0
            state_values = new_state_values.copy()
            old_state_values = state_values.copy()
            for i in range(self.grid_size):
                for j in range(self.grid_size):
                    action = policy[i, j]
                    (next_i, next_j), reward = self.step([i, j], action)
                    value = self.action_prob * (reward + self.discount * state_values[next_i, next_j])
                    new_state_values[i, j] = value
                    delta = max(delta, np.abs(old_state_values[i, j] - new_state_values[i, j]))
            print(f'Difference: {delta}')
            if delta < self.theta:
                break
            iter_cnt += 1
        return new_state_values
    def policyImprovement(self, policy, values, actions):
        #expected_action_returns = np.zeros((self.grid_size, self.grid_size, np.size(actions)))
        policy_stable = True
        for i in range(self.grid_size):
            for j in range(self.grid_size):
                old_action = policy[i, j]
                act_cnt = 0
                expected_rewards = []
                for action in self.actions:
                    (next_i, next_j), reward = self.step([i, j], action)
                    expected_rewards.append(self.action_prob * (reward + self.discount * values[next_i, next_j]))
                #max_reward = np.max(expected_rewards)
                #new_action = np.random.choice(np.where(expected_rewards == max_reward)[0])
                new_action = np.argmax(expected_rewards)
                #print('new_action : ', new_action)
                #print('old_action : ', old_action)
                if old_action != new_action:
                    policy_stable = False
                policy[i, j] = new_action
        return policy, policy_stable
    def compPolicyIteration(self):
        iterations = 0
        total_start_time = time.time()
        while True:
            start_time = time.time()
            self.values = self.policyEvaluation(self.policy, self.values)
            elapsed_time = time.time() - start_time
            print(f'PE => Elapsed time {elapsed_time} seconds')

            start_time = time.time()
            self.policy, policy_stable = self.policyImprovement(self.policy, self.values, self.actions)
            elapsed_time = time.time() - start_time
            print(f'PI => Elapsed time {elapsed_time} seconds')

            if policy_stable:
                break
            iterations += 1
        total_elapsed_time = time.time() - total_start_time
        print(f'Optimal policy is reached after {iterations} iterations in {total_elapsed_time} seconds')
        return self.values, self.policy
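For completeness, this is roughly how I run the two methods (a minimal driver; I use grid_size=4 to match the 4x4 example and gamma=1.0 since the task is undiscounted):

# Minimal driver: 4x4 grid to match the example, gamma=1.0 for the undiscounted task
world = GridWorld(grid_size=4, gamma=1.0)

# Value iteration: returns the values, the greedy policy and the iteration count
vi_values, vi_policy, vi_iters = world.compValueIteration()

# Reset the stored values/policy, then run policy iteration
world.re_init()
pi_values, pi_policy = world.compPolicyIteration()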
However, my two implementations produce different optimal policies and different optimal values, even though I have followed the algorithms given in the book exactly.
Result with Policy Iteration:
values :
[[ 0. -0.33300781 -0.33300781 -0.33300781]
[-0.33300781 -0.33300781 -0.33300781 -0.33300781]
[-0.33300781 -0.33300781 -0.33300781 -0.33300781]
[-0.33300781 -0.33300781 -0.33300781 0. ]]
****************************************************************************************************
****************************************************************************************************
policy :
[[0 0 0 0]
[1 0 0 0]
[0 0 0 3]
[0 0 2 0]]
Result with Value Iteration:
values :
[[ 0.0 -1.0 -2.0 -3.0]
[-1.0 -2.0 -3.0 -2.0]
[-2.0 -3.0 -2.0 -1.0]
[-3.0 -2.0 -1.0 0.0]]
****************************************************************************************************
****************************************************************************************************
policy :
[[0. 0. 0. 0.]
[1. 0. 0. 3.]
[1. 0. 2. 3.]
[1. 2. 2. 0.]]
Also, value iteration converges after 4 iterations, whereas policy iteration converges after 2 iterations.
Where did I go wrong? Can the two methods give different optimal policies? My understanding from the book is that the greedy policy with respect to the values obtained after the third iteration is already optimal, so there must be some issue with the policy iteration part of my code that I am unable to see. Basically, how should I fix the policy iteration?