Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/rl/algorithms/ac.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from maro.rl.model import ContinuousACBasedNet, VNet
from maro.rl.policy import ContinuousRLPolicy
from maro.rl.training.algorithms import ActorCriticParams, ActorCriticTrainer

from .utils import mlp

actor_net_conf = {
Expand Down
1 change: 1 addition & 0 deletions tests/rl/algorithms/ppo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT license.

from maro.rl.training.algorithms import PPOParams, PPOTrainer

from .ac import MyVCriticNet, get_ac_policy

get_ppo_policy = get_ac_policy
Expand Down
2 changes: 2 additions & 0 deletions tests/rl/algorithms/sac.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Tuple

import numpy as np
Expand All @@ -11,6 +12,7 @@
from maro.rl.model import ContinuousSACNet, QNet
from maro.rl.policy import ContinuousRLPolicy
from maro.rl.training.algorithms import SoftActorCriticParams, SoftActorCriticTrainer

from tests.rl.algorithms.utils import mlp

actor_net_conf = {
Expand Down
9 changes: 9 additions & 0 deletions tests/rl/gym_wrapper/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,12 @@
# Licensed under the MIT license.

# Name of the RL algorithm under test (must match a module in tests/rl/algorithms).
algorithm = "ppo"

# Shared keyword arguments for constructing the MARO `Env`.
# NOTE: the business-engine class is intentionally NOT included here — the
# caller supplies it (e.g. `Env(business_engine_cls=GymBusinessEngine, **env_conf)`).
env_conf = {
"topology": "Walker2d-v4",  # Gym environment id, passed as the MARO topology name
"start_tick": 0,  # episode starts at tick 0
"durations": 5000,  # number of ticks per episode
"options": {
"random_seed": None,  # None — presumably unseeded/nondeterministic resets; confirm against Env
},
}
1 change: 1 addition & 0 deletions tests/rl/gym_wrapper/env_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import numpy as np

from maro.rl.rollout import AbsEnvSampler, CacheElement

from tests.rl.gym_wrapper.simulator.business_engine import GymBusinessEngine
from tests.rl.gym_wrapper.simulator.common import Action, DecisionEvent

Expand Down
14 changes: 3 additions & 11 deletions tests/rl/gym_wrapper/rl_component_bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,14 @@

from maro.rl.rl_component.rl_component_bundle import RLComponentBundle
from maro.simulator import Env

from tests.rl.gym_wrapper.simulator.business_engine import GymBusinessEngine
from .config import algorithm

from .config import algorithm, env_conf
from .env_sampler import GymEnvSampler

env_conf = {
"business_engine_cls": GymBusinessEngine,
"topology": "Walker2d-v4",
"start_tick": 0,
"durations": 5000,
"options": {
"random_seed": None,
},
}

learn_env = Env(**env_conf)
learn_env = Env(business_engine_cls=GymBusinessEngine, **env_conf)
test_env = learn_env
num_agents = len(learn_env.agent_idx_list)

Expand Down
8 changes: 5 additions & 3 deletions tests/rl/gym_wrapper/simulator/business_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
from typing import List, Optional, cast

import gym
from maro.backends.frame import FrameBase, SnapshotList

from maro.backends.frame import FrameBase, SnapshotList
from maro.event_buffer import CascadeEvent, EventBuffer, MaroEvents
from maro.simulator.scenarios import AbsBusinessEngine

from .common import Action, DecisionEvent


Expand Down Expand Up @@ -62,7 +63,7 @@ def _register_events(self) -> None:
def _on_action_received(self, event: CascadeEvent) -> None:
action = cast(Action, cast(list, event.payload)[0]).action

self._last_obs, reward, self._is_done, _, info = self._gym_env.step(action)
self._last_obs, reward, self._is_done, self._truncated, info = self._gym_env.step(action)
self._reward_record[event.tick] = reward
self._info_record[event.tick] = info

Expand All @@ -82,11 +83,12 @@ def get_info_at_tick(self, tick: int) -> object: # TODO
def reset(self, keep_seed: bool = False) -> None:
    """Reset the wrapped gym environment and clear all per-episode state.

    Args:
        keep_seed (bool): Accepted for interface compatibility; not used here.
    """
    # gym's reset() returns a tuple whose first element is the initial
    # observation; the accompanying info dict is discarded.
    self._last_obs = self._gym_env.reset()[0]
    self._is_done = False  # "terminated" flag, updated from gym_env.step()
    self._truncated = False  # "truncated" flag (e.g. time limit), updated from gym_env.step()
    self._reward_record = {}  # tick -> reward received at that tick
    self._info_record = {}  # tick -> info dict returned at that tick

def post_step(self, tick: int) -> bool:
return self._is_done or tick + 1 == self._max_tick
return self._is_done or self._truncated or tick + 1 == self._max_tick

def get_agent_idx_list(self) -> List[int]:
return [0]
Expand Down
30 changes: 30 additions & 0 deletions tests/rl/performance.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Performance for Gym Task Suite

We benchmarked the MARO RL Toolkit implementation on the Gym task suite.
Some results are compared with the benchmarks in [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/spinningup/bench.html#).
Because of differences in environment versions (and possibly other factors),
there may be some gaps between the performance reported here and that in the Spinning Up benchmarks.

The hyper-parameters are set to align with those used in [Spinning Up](https://spinningup.openai.com/en/latest/spinningup/bench.html#experiment-details):

- Network of on-policy algorithms: size (64, 32) with tanh units for both policy and value function;
- Network of off-policy algorithms: size (256, 256) with relu units;
- Batch size for on-policy algorithms: 4000 steps of interaction per batch update;
- Batch size for off-policy algorithms: size 100 for each gradient descent step;

## Walker2d

### Benchmark in Spinning Up - PyTorch Version

- Environment version: Walker2d-v3
- 3M timesteps

![Walker2d: PyTorch Version](https://spinningup.openai.com/en/latest/_images/pytorch_walker2d_performance.svg)

### Performance with MARO RL Toolkit

- Environment version: Walker2d-v4
- Training Mode: simple
- Rollout Mode: single
- Environment duration: 5000 ticks
- Num of episodes: 600