Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/rl/algorithms/ac.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from maro.rl.model import ContinuousACBasedNet, VNet
from maro.rl.policy import ContinuousRLPolicy
from maro.rl.training.algorithms import ActorCriticParams, ActorCriticTrainer

from .utils import mlp

actor_net_conf = {
Expand Down
1 change: 1 addition & 0 deletions tests/rl/algorithms/ppo.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT license.

from maro.rl.training.algorithms import PPOParams, PPOTrainer

from .ac import MyVCriticNet, get_ac_policy

get_ppo_policy = get_ac_policy
Expand Down
2 changes: 2 additions & 0 deletions tests/rl/algorithms/sac.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from typing import Tuple

import numpy as np
Expand All @@ -11,6 +12,7 @@
from maro.rl.model import ContinuousSACNet, QNet
from maro.rl.policy import ContinuousRLPolicy
from maro.rl.training.algorithms import SoftActorCriticParams, SoftActorCriticTrainer

from tests.rl.algorithms.utils import mlp

actor_net_conf = {
Expand Down
9 changes: 9 additions & 0 deletions tests/rl/gym_wrapper/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,12 @@
# Licensed under the MIT license.

# Name of the RL algorithm under test (must match a module in tests/rl/algorithms).
algorithm = "ppo"

# Shared keyword arguments for constructing the MARO `Env`.
# NOTE: the business-engine class is intentionally NOT included here — the
# caller supplies it (e.g. `Env(business_engine_cls=GymBusinessEngine, **env_conf)`).
env_conf = {
"topology": "Walker2d-v4",  # Gym environment id, passed as the MARO topology name
"start_tick": 0,  # episode starts at tick 0
"durations": 5000,  # number of ticks per episode
"options": {
"random_seed": None,  # None — presumably unseeded/nondeterministic resets; confirm against Env
},
}
1 change: 1 addition & 0 deletions tests/rl/gym_wrapper/env_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import numpy as np

from maro.rl.rollout import AbsEnvSampler, CacheElement

from tests.rl.gym_wrapper.simulator.business_engine import GymBusinessEngine
from tests.rl.gym_wrapper.simulator.common import Action, DecisionEvent

Expand Down
14 changes: 3 additions & 11 deletions tests/rl/gym_wrapper/rl_component_bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,14 @@

from maro.rl.rl_component.rl_component_bundle import RLComponentBundle
from maro.simulator import Env

from tests.rl.gym_wrapper.simulator.business_engine import GymBusinessEngine
from .config import algorithm

from .config import algorithm, env_conf
from .env_sampler import GymEnvSampler

env_conf = {
"business_engine_cls": GymBusinessEngine,
"topology": "Walker2d-v4",
"start_tick": 0,
"durations": 5000,
"options": {
"random_seed": None,
},
}

learn_env = Env(**env_conf)
learn_env = Env(business_engine_cls=GymBusinessEngine, **env_conf)
test_env = learn_env
num_agents = len(learn_env.agent_idx_list)

Expand Down
8 changes: 5 additions & 3 deletions tests/rl/gym_wrapper/simulator/business_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
from typing import List, Optional, cast

import gym
from maro.backends.frame import FrameBase, SnapshotList

from maro.backends.frame import FrameBase, SnapshotList
from maro.event_buffer import CascadeEvent, EventBuffer, MaroEvents
from maro.simulator.scenarios import AbsBusinessEngine

from .common import Action, DecisionEvent


Expand Down Expand Up @@ -62,7 +63,7 @@ def _register_events(self) -> None:
def _on_action_received(self, event: CascadeEvent) -> None:
action = cast(Action, cast(list, event.payload)[0]).action

self._last_obs, reward, self._is_done, _, info = self._gym_env.step(action)
self._last_obs, reward, self._is_done, self._truncated, info = self._gym_env.step(action)
self._reward_record[event.tick] = reward
self._info_record[event.tick] = info

Expand All @@ -82,11 +83,12 @@ def get_info_at_tick(self, tick: int) -> object: # TODO
def reset(self, keep_seed: bool = False) -> None:
    """Reset the wrapped gym environment and clear all per-episode state.

    Args:
        keep_seed (bool): Accepted for interface compatibility; not used here.
    """
    # gym's reset() returns a tuple whose first element is the initial
    # observation; the accompanying info dict is discarded.
    self._last_obs = self._gym_env.reset()[0]
    self._is_done = False  # "terminated" flag, updated from gym_env.step()
    self._truncated = False  # "truncated" flag (e.g. time limit), updated from gym_env.step()
    self._reward_record = {}  # tick -> reward received at that tick
    self._info_record = {}  # tick -> info dict returned at that tick

def post_step(self, tick: int) -> bool:
return self._is_done or tick + 1 == self._max_tick
return self._is_done or self._truncated or tick + 1 == self._max_tick

def get_agent_idx_list(self) -> List[int]:
return [0]
Expand Down
30 changes: 30 additions & 0 deletions tests/rl/performance.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Performance for Gym Task Suite

We benchmarked the MARO RL Toolkit implementation on the Gym task suite.
Some results are compared with the benchmarks in [OpenAI Spinning Up](https://spinningup.openai.com/en/latest/spinningup/bench.html#).
Because of differences in environment versions (and possibly other factors),
there may be some gaps between the performance reported here and that in the Spinning Up benchmarks.

The hyper-parameters are set to align with those used in [Spinning Up](https://spinningup.openai.com/en/latest/spinningup/bench.html#experiment-details):

- Network of on-policy algorithms: size (64, 32) with tanh units for both policy and value function;
- Network of off-policy algorithms: size (256, 256) with relu units;
- Batch size for on-policy algorithms: 4000 steps of interaction per batch update;
- Batch size for off-policy algorithms: size 100 for each gradient descent step;

## Walker2d

### Benchmark in Spinning Up - PyTorch Version

- Environment version: Walker2d-v3
- 3M timesteps

![Walker2d: PyTorch Version](https://spinningup.openai.com/en/latest/_images/pytorch_walker2d_performance.svg)

### Performance with MARO RL Toolkit

- Environment version: Walker2d-v4
- Training Mode: simple
- Rollout Mode: single
- Environment duration: 5000 ticks
- Num of episodes: 600