
1) It's unclear how to make action masking in RLlib any more complex than what the examples show. Masking works fine when I follow the example action_mask_model.py with class TorchActionMaskModel(TorchModelV2, nn.Module):

self.observation_space = Dict({
    "action_mask": Box(0, 1, shape=(self.actions,)),
    "actual_obs": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
})

Now I want to make it more complex:

self.observation_space = Dict({
    "action_mask": Box(0, 1, shape=(self.actions,)),
    "actual_obs": Dict({
        "obs1": Discrete(10),
        "obs2": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
    }),
})

This raises the following error:

prev_layer_size = int(np.product(obs_space.shape))
TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'
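
If I read the trace right, TorchFC derives its input layer size from obs_space.shape, and a nested Dict space has shape=None, hence np.product(None) blowing up. RLlib does ship a ComplexInputNetwork (ray.rllib.models.torch.complex_input_net) that is built for Dict/Tuple observation spaces; here is a minimal sketch of swapping it in for TorchFC inside the custom model, assuming it composes cleanly when nested like this (I have not verified that):

from ray.rllib.models.torch.complex_input_net import ComplexInputNetwork

# Sketch: replace TorchFC with RLlib's Dict/Tuple-capable default model.
# It takes the same ModelV2 constructor arguments as TorchFC.
self.internal_model = ComplexInputNetwork(
    orig_space["actual_obs"],
    action_space,
    num_outputs,
    model_config,
    name + "_internal",
)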

Does anyone know the proper way to fix it? Full reproducible code (python=3.8, rllib=1.12.0):

import numpy as np
import ray
import ray.rllib.agents.ppo as ppo
from ray.tune.registry import register_env
import gym
from gym.spaces import Box, Dict, Discrete

from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.framework import try_import_torch
from ray.rllib.utils.torch_utils import FLOAT_MIN

torch, nn = try_import_torch()


# copy pasted from rllib/examples/models/action_mask_model.py
class TorchActionMaskModel(TorchModelV2, nn.Module):
    """PyTorch version of above ActionMaskingModel."""

    def __init__(
        self,
        obs_space,
        action_space,
        num_outputs,
        model_config,
        name,
        **kwargs,
    ):
        orig_space = getattr(obs_space, "original_space", obs_space)
        assert (
            isinstance(orig_space, Dict)
            and "action_mask" in orig_space.spaces
            and "actual_obs" in orig_space.spaces
        )

        TorchModelV2.__init__(
            self, obs_space, action_space, num_outputs, model_config, name, **kwargs
        )
        nn.Module.__init__(self)

        self.internal_model = TorchFC(
            orig_space["actual_obs"],
            action_space,
            num_outputs,
            model_config,
            name + "_internal",
        )

    def forward(self, input_dict, state, seq_lens):
        # Extract the available actions tensor from the observation.
        action_mask = input_dict["obs"]["action_mask"]

        # Compute the unmasked logits.
        logits, _ = self.internal_model({"obs": input_dict["obs"]["actual_obs"]})

        # Convert action_mask into a [0.0 || -inf]-type mask.
        inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)

        # Return masked logits.
        return logits + inf_mask, state

    def value_function(self):
        return self.internal_model.value_function()


class MyEnv(gym.Env):

    metadata = {"render.modes": ["human"]}

    def __init__(self):
        super(MyEnv, self).__init__()

        self.actions = 4

        self.action_space = Discrete(self.actions)
        self.observation_space = Dict({
            "action_mask": Box(0, 1, shape=(self.actions,)),
            #"actual_obs": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
            "actual_obs": Dict({
                "obs1": Discrete(10),
                "obs2": Box(low=-np.inf, high=np.inf, shape=(10, 10), dtype=np.float32),
            }),
        })
    
    def reset(self):
        return self._make_obs()
    
    def step(self, action):
        return self._make_obs(), 0, False, {}

    def _make_obs(self):
        return {
            "action_mask": np.array([1.0] * self.actions),
            #"actual_obs": np.zeros((10, 10), dtype=np.float32),
            "actual_obs": {"obs1": 1, "obs2": np.zeros((10, 10), dtype=np.float32)},
        }


def main():

    ray.init()

    select_env = "env-v1"
    register_env(select_env, lambda config: MyEnv())
    config = ppo.DEFAULT_CONFIG.copy()
    config.update({
        "env": select_env,
        "framework": 'torch',
        "log_level": 'DEBUG',
        "model": {
            "custom_model": TorchActionMaskModel,
            # "no_final_linear": False,
        },
    })

    agent = ppo.PPOTrainer(config, env=select_env)
    for _ in range(5):
        agent.train()


if __name__ == "__main__":
    main()
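
The other direction I sketched was flattening the nested space myself, so TorchFC only ever sees a plain Box. This assumes RLlib hands forward() the restored dict, with the Discrete sub-space arriving one-hot encoded (a sketch, not verified; meant to slot into the model above):

from gym.spaces import utils as space_utils

# In __init__: flatten_space collapses the nested Dict into a single Box.
flat_space = space_utils.flatten_space(orig_space["actual_obs"])
self.internal_model = TorchFC(
    flat_space, action_space, num_outputs, model_config, name + "_internal"
)

# In forward(): concatenate all sub-observations into one flat float tensor.
actual_obs = input_dict["obs"]["actual_obs"]
flat_obs = torch.cat(
    [t.float().reshape(t.shape[0], -1) for t in actual_obs.values()], dim=1
)
logits, _ = self.internal_model({"obs": flat_obs})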

2) There is also parametric_actions_model.py with class TorchParametricActionsModel(DQNTorchModel), which also does action masking; the docs describe it under Variable-length / Parametric Action Spaces:

class MyParamActionEnv(gym.Env):
    def __init__(self, max_avail_actions):
        self.action_space = Discrete(max_avail_actions)
        self.observation_space = Dict({
            "action_mask": Box(0, 1, shape=(max_avail_actions, )),
            "avail_actions": Box(-1, 1, shape=(max_avail_actions, action_embedding_sz)),
            "real_obs": ...,
        })
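
From reading parametric_actions_model.py, the main difference seems to be that the parametric model does not just mask the logits of a fixed-size head; it scores each action by dot-producting a learned "intent" embedding with that action's embedding. Roughly paraphrasing the example's forward() (variable names follow the snippet above):

avail_actions = input_dict["obs"]["avail_actions"]
action_mask = input_dict["obs"]["action_mask"]

# Embed the real observation into an "intent" vector of size action_embedding_sz.
action_embed, _ = self.action_embed_model({"obs": input_dict["obs"]["real_obs"]})
intent_vector = torch.unsqueeze(action_embed, 1)

# Score each action: (B, 1, E) * (B, A, E), summed over E -> (B, A) logits.
action_logits = torch.sum(avail_actions * intent_vector, dim=2)

# Then mask exactly like the action-mask model does.
inf_mask = torch.clamp(torch.log(action_mask), min=FLOAT_MIN)
return action_logits + inf_mask, state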

Does anyone know the difference between TorchParametricActionsModel(DQNTorchModel) and TorchActionMaskModel(TorchModelV2, nn.Module)? Which class should I use for simple action masking with the PPO algorithm?
