I'm trying to set up a custom PyEnvironment and I get Given 'time_step' does not match expected 'time_step_spec
error. I don't see where the the dtype specification is missing.
Here's the environment:
class TicTacToe(py_environment.PyEnvironment):
def __init__(self):
self._action_spec = array_spec.BoundedArraySpec(
shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
self._observation_spec = array_spec.BoundedArraySpec(
shape=(1,), dtype=np.int32, minimum=0, name='observation')
self._board = np.zeros((3,3), dtype=np.int32)
self._state = 0
self._episode_ended = False
self._moves = [(i,r) for i in range(3) for r in range(3)]
self._possiblemoves = self._moves
self._turn = 0
self._winner = 0
def action_spec(self):
return self._action_spec
def observation_spec(self):
return self._observation_spec
def _reset(self):
self._state = self._board
self._episode_ended = False
return ts.restart(np.array([self._state], dtype=np.int32))
def _step(self, action):
if turn % 2 == 0:
self._player = 1
else:
self._player = 2
if action not in moves:
raise ValueError("faulty input")
if self._board[action] != 0:
raise ValueError("spot already taken")
self._board[action] = self._player
self._possiblemoves = []
for i in range(3):
for j in range(3):
if self._board[i,j] == 0:
self._possiblemoves.append((i,j))
for r in range(1,3):
if self._board[0,0] == self._board[1,1] == self._board[2,2] == r:
self._winner = r
break
elif self._board[0,2] == self._board[1,1] == self._board[2,0] == r:
self._winner = r
break
for j in range(0,3):
if self._board[0,j] == self._board[1,j] == self._board[2,j] == r:
self._winner = r
break
elif self._board[j,0] == self._board[j,1] == self._board[j,2] == r:
self._winner = r
if self._winner == self._player:
reward = 1
return ts.termination(np.array([self._state], dtype=np.int32), reward)
elif self._board.all() != 0:
reward = 0.5
return ts.termination(np.array([self._state], dtype=np.int32), reward)
else:
return ts.transition(
np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)
def render(self, mode: "human") -> np.ndarray:
if mode != "human":
raise ValueError(
"Only rendering mode supported is 'human', got {} instead.".format(
mode))
return self._board
Then when I run:
environment = TicTacToe()
utils.validate_py_environment(environment, episodes=1)
I get the error:
ValueError Traceback (most recent call last)
c:\Users\Student\Desktop\scripts\tictactoe.ipynb Cell 5' in <cell line: 2>()
1 environment = TicTacToe()
----> 2 utils.validate_py_environment(environment, episodes=1)
File c:\Users\Student\AppData\Local\Programs\Python\Python310\lib\site-packages\tf_agents\environments\utils.py:78, in validate_py_environment(environment, episodes, observation_and_action_constraint_splitter)
76 while episode_count < episodes:
77 if not array_spec.check_arrays_nest(time_step, batched_time_step_spec):
---> 78 raise ValueError(
79 'Given `time_step`: %r does not match expected '
80 '`time_step_spec`: %r' % (time_step, batched_time_step_spec))
82 action = random_policy.action(time_step).action
83 time_step = environment.step(action)
ValueError: Given `time_step`: TimeStep(
{'discount': array(1., dtype=float32),
'observation': array([[[0, 0, 0],
[0, 0, 0],
[0, 0, 0]]]),
'reward': array(0., dtype=float32),
'step_type': array(0)}) does not match expected `time_step_spec`: TimeStep(
{'discount': BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0),
'observation': BoundedArraySpec(shape=(), dtype=dtype('int32'), name='observation', minimum=0, maximum=2147483647),
'reward': ArraySpec(shape=(), dtype=dtype('float32'), name='reward'),
'step_type': ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')})
Where, If I see correctly, the only difference is that
'observation': array([[[0, 0, 0],
[0, 0, 0],
[0, 0, 0]]]),
Does not have dtype('int32')
specified, but I can't see why it does not. I sppecified it in observation_spec and I also specified it in board just in case.