Merge branch 'Farama-Foundation:main' into py313

Farama-Foundation · Dec 4, 2024 · c20bb25 · c20bb25
2 parents 37240a9 + f949331
commit c20bb25
Show file tree

Hide file tree

Showing 26 changed files with 869 additions and 74 deletions.
diff --git a/docs/introduction/record_agent.md b/docs/introduction/record_agent.md
@@ -53,7 +53,7 @@ print(f'Episode lengths: {env.length_queue}')
 
 In the script above, for the :class:`RecordVideo` wrapper, we specify three different variables: ``video_folder`` to specify the folder that the videos should be saved (change for your problem), ``name_prefix`` for the prefix of videos themselves and finally an ``episode_trigger`` such that every episode is recorded. This means that for every episode of the environment, a video will be recorded and saved in the style "cartpole-agent/eval-episode-x.mp4".
 
-For the :class:`RecordEpisodicStatistics`, we only need to specify the buffer lengths, this is the max length of the internal ``time_queue``, ``return_queue`` and ``length_queue``. Rather than collect the data for each episode individually, we can use the data queues to print the information at the end of the evaluation.
+For the :class:`RecordEpisodeStatistics`, we only need to specify the buffer length, which is the max length of the internal ``time_queue``, ``return_queue`` and ``length_queue``. Rather than collecting the data for each episode individually, we can use the data queues to print the information at the end of the evaluation.
 
 For speed ups in evaluating environments, it is possible to implement this with vector environments in order to evaluate ``N`` episodes at the same time in parallel rather than series.
 ```

diff --git a/docs/introduction/train_agent.md b/docs/introduction/train_agent.md
@@ -160,17 +160,17 @@ fig, axs = plt.subplots(1, 3, figsize=(20, 8))
 
 # np.convolve will compute the rolling mean for 100 episodes
 
-axs[0].plot(np.convolve(env.return_queue, np.ones(100)))
+axs[0].plot(np.convolve(env.return_queue, np.ones(100)/100))
 axs[0].set_title("Episode Rewards")
 axs[0].set_xlabel("Episode")
 axs[0].set_ylabel("Reward")
 
-axs[1].plot(np.convolve(env.length_queue, np.ones(100)))
+axs[1].plot(np.convolve(env.length_queue, np.ones(100)/100))
 axs[1].set_title("Episode Lengths")
 axs[1].set_xlabel("Episode")
 axs[1].set_ylabel("Length")
 
-axs[2].plot(np.convolve(agent.training_error, np.ones(100)))
+axs[2].plot(np.convolve(agent.training_error, np.ones(100)/100))
 axs[2].set_title("Training Error")
 axs[2].set_xlabel("Episode")
 axs[2].set_ylabel("Temporal Difference")

diff --git a/docs/tutorials/training_agents/blackjack_tutorial.py b/docs/tutorials/training_agents/blackjack_tutorial.py
@@ -275,7 +275,7 @@ def decay_epsilon(self):
 #
 
 
-env = gym.wrappers.RecordEpisodeStatistics(env, buffer_length=n_episodes)
+env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=n_episodes)
 for episode in tqdm(range(n_episodes)):
     obs, info = env.reset()
     done = False

diff --git a/gymnasium/core.py b/gymnasium/core.py
@@ -307,7 +307,9 @@ def __init__(self, env: Env[ObsType, ActType]):
             env: The environment to wrap
         """
         self.env = env
-        assert isinstance(env, Env)
+        assert isinstance(
+            env, Env
+        ), f"Expected env to be a `gymnasium.Env` but got {type(env)}"
 
         self._action_space: spaces.Space[WrapperActType] | None = None
         self._observation_space: spaces.Space[WrapperObsType] | None = None

diff --git a/gymnasium/envs/classic_control/cartpole.py b/gymnasium/envs/classic_control/cartpole.py
@@ -13,7 +13,7 @@
 from gymnasium import logger, spaces
 from gymnasium.envs.classic_control import utils
 from gymnasium.error import DependencyNotInstalled
-from gymnasium.vector import VectorEnv
+from gymnasium.vector import AutoresetMode, VectorEnv
 from gymnasium.vector.utils import batch_space
 
 
@@ -355,6 +355,7 @@ class CartPoleVectorEnv(VectorEnv):
     metadata = {
         "render_modes": ["rgb_array"],
         "render_fps": 50,
+        "autoreset_mode": AutoresetMode.NEXT_STEP,
     }
 
     def __init__(

diff --git a/gymnasium/envs/functional_jax_env.py b/gymnasium/envs/functional_jax_env.py
@@ -12,6 +12,7 @@
 from gymnasium.envs.registration import EnvSpec
 from gymnasium.experimental.functional import ActType, FuncEnv, StateType
 from gymnasium.utils import seeding
+from gymnasium.vector import AutoresetMode
 from gymnasium.vector.utils import batch_space
 
 
@@ -115,7 +116,7 @@ def __init__(
         """Initialize the environment from a FuncEnv."""
         super().__init__()
         if metadata is None:
-            metadata = {}
+            metadata = {"autoreset_mode": AutoresetMode.NEXT_STEP}
         self.func_env = func_env
         self.num_envs = num_envs
 

diff --git a/gymnasium/envs/mujoco/mujoco_rendering.py b/gymnasium/envs/mujoco/mujoco_rendering.py
@@ -258,11 +258,13 @@ def render(
 
         # Process rendered images according to render_mode
         if render_mode in ["depth_array", "rgbd_tuple"]:
-            depth_img = depth_arr.reshape(self.viewport.height, self.viewport.width)
+            depth_img = depth_arr.reshape((self.viewport.height, self.viewport.width))
             # original image is upside-down, so flip it
             depth_img = depth_img[::-1, :]
         if render_mode in ["rgb_array", "rgbd_tuple"]:
-            rgb_img = rgb_arr.reshape(self.viewport.height, self.viewport.width, 3)
+            rgb_img = rgb_arr.reshape((self.viewport.height, self.viewport.width, 3))
+            # original image is upside-down, so flip it
+            rgb_img = rgb_img[::-1, :]
 
             if segmentation:
                 seg_img = (
@@ -281,8 +283,6 @@ def render(
                         seg_ids[geom.segid + 1, 0] = geom.objtype
                         seg_ids[geom.segid + 1, 1] = geom.objid
                 rgb_img = seg_ids[seg_img]
-                # original image is upside-down, so flip it
-                rgb_img = rgb_img[::-1, :, :]
 
         # Return processed images based on render_mode
         if render_mode == "rgb_array":

diff --git a/gymnasium/envs/phys2d/cartpole.py b/gymnasium/envs/phys2d/cartpole.py
@@ -15,6 +15,7 @@
 from gymnasium.error import DependencyNotInstalled
 from gymnasium.experimental.functional import ActType, FuncEnv, StateType
 from gymnasium.utils import EzPickle
+from gymnasium.vector import AutoresetMode
 
 
 RenderStateType = Tuple["pygame.Surface", "pygame.time.Clock"]  # type: ignore  # noqa: F821
@@ -272,7 +273,12 @@ def __init__(self, render_mode: str | None = None, **kwargs: Any):
 class CartPoleJaxVectorEnv(FunctionalJaxVectorEnv, EzPickle):
     """Jax-based implementation of the vectorized CartPole environment."""
 
-    metadata = {"render_modes": ["rgb_array"], "render_fps": 50, "jax": True}
+    metadata = {
+        "render_modes": ["rgb_array"],
+        "render_fps": 50,
+        "jax": True,
+        "autoreset_mode": AutoresetMode.NEXT_STEP,
+    }
 
     def __init__(
         self,

diff --git a/gymnasium/envs/phys2d/pendulum.py b/gymnasium/envs/phys2d/pendulum.py
@@ -16,6 +16,7 @@
 from gymnasium.error import DependencyNotInstalled
 from gymnasium.experimental.functional import ActType, FuncEnv, StateType
 from gymnasium.utils import EzPickle
+from gymnasium.vector import AutoresetMode
 
 
 RenderStateType = Tuple["pygame.Surface", "pygame.time.Clock", Optional[float]]  # type: ignore  # noqa: F821
@@ -225,7 +226,12 @@ def get_default_params(self, **kwargs) -> PendulumParams:
 class PendulumJaxEnv(FunctionalJaxEnv, EzPickle):
     """Jax-based pendulum environment using the functional version as base."""
 
-    metadata = {"render_modes": ["rgb_array"], "render_fps": 30, "jax": True}
+    metadata = {
+        "render_modes": ["rgb_array"],
+        "render_fps": 30,
+        "jax": True,
+        "autoreset_mode": AutoresetMode.NEXT_STEP,
+    }
 
     def __init__(self, render_mode: str | None = None, **kwargs: Any):
         """Constructor where the kwargs are passed to the base environment to modify the parameters."""

diff --git a/gymnasium/envs/registration.py b/gymnasium/envs/registration.py
@@ -19,6 +19,8 @@
 
 import gymnasium as gym
 from gymnasium import Env, Wrapper, error, logger
+from gymnasium.logger import warn
+from gymnasium.vector import AutoresetMode
 
 
 if sys.version_info < (3, 10):
@@ -976,6 +978,15 @@ def create_single_env() -> Env:
         copied_id_spec.kwargs["wrappers"] = wrappers
     env.unwrapped.spec = copied_id_spec
 
+    if "autoreset_mode" not in env.metadata:
+        warn(
+            f"The VectorEnv ({env}) is missing AutoresetMode metadata, metadata={env.metadata}"
+        )
+    elif not isinstance(env.metadata["autoreset_mode"], AutoresetMode):
+        warn(
+            f"The VectorEnv ({env}) metadata['autoreset_mode'] is not an instance of AutoresetMode, {type(env.metadata['autoreset_mode'])}."
+        )
+
     return env
 
 

diff --git a/gymnasium/envs/tabular/blackjack.py b/gymnasium/envs/tabular/blackjack.py
@@ -16,6 +16,7 @@
 from gymnasium.error import DependencyNotInstalled
 from gymnasium.experimental.functional import ActType, FuncEnv, StateType
 from gymnasium.utils import EzPickle, seeding
+from gymnasium.vector import AutoresetMode
 from gymnasium.wrappers import HumanRendering
 
 
@@ -239,6 +240,7 @@ class BlackjackFunctional(
     metadata = {
         "render_modes": ["rgb_array"],
         "render_fps": 4,
+        "autoreset_mode": AutoresetMode.NEXT_STEP,
     }
 
     def transition(

diff --git a/gymnasium/envs/tabular/cliffwalking.py b/gymnasium/envs/tabular/cliffwalking.py
@@ -15,6 +15,7 @@
 from gymnasium.error import DependencyNotInstalled
 from gymnasium.experimental.functional import ActType, FuncEnv, StateType
 from gymnasium.utils import EzPickle
+from gymnasium.vector import AutoresetMode
 from gymnasium.wrappers import HumanRendering
 
 
@@ -136,6 +137,7 @@ class CliffWalkingFunctional(
     metadata = {
         "render_modes": ["rgb_array"],
         "render_fps": 4,
+        "autoreset_mode": AutoresetMode.NEXT_STEP,
     }
 
     def transition(

diff --git a/gymnasium/vector/__init__.py b/gymnasium/vector/__init__.py
@@ -4,6 +4,7 @@
 from gymnasium.vector.async_vector_env import AsyncVectorEnv
 from gymnasium.vector.sync_vector_env import SyncVectorEnv
 from gymnasium.vector.vector_env import (
+    AutoresetMode,
     VectorActionWrapper,
     VectorEnv,
     VectorObservationWrapper,
@@ -21,4 +22,5 @@
     "SyncVectorEnv",
     "AsyncVectorEnv",
     "utils",
+    "AutoresetMode",
 ]
diff --git a/gymnasium/vector/async_vector_env.py b/gymnasium/vector/async_vector_env.py
@@ -35,7 +35,7 @@
     read_from_shared_memory,
     write_to_shared_memory,
 )
-from gymnasium.vector.vector_env import ArrayType, VectorEnv
+from gymnasium.vector.vector_env import ArrayType, AutoresetMode, VectorEnv
 
 
 __all__ = ["AsyncVectorEnv", "AsyncState"]
@@ -101,6 +101,7 @@ def __init__(
             | None
         ) = None,
         observation_mode: str | Space = "same",
+        autoreset_mode: str | AutoresetMode = AutoresetMode.NEXT_STEP,
     ):
         """Vectorized environment that runs multiple environments in parallel.
 
@@ -120,6 +121,7 @@ def __init__(
                 'different' defines that there can be multiple observation spaces with different parameters though requires the same shape and dtype,
                 warning, may raise unexpected errors. Passing a ``Tuple[Space, Space]`` object allows defining a custom ``single_observation_space`` and
                 ``observation_space``, warning, may raise unexpected errors.
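+            autoreset_mode: The autoreset mode to use, given either as an ``AutoresetMode`` member or as a value accepted by the ``AutoresetMode`` constructor. Defaults to ``AutoresetMode.NEXT_STEP``.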
 
         Warnings:
             worker is an advanced mode option. It provides a high degree of flexibility and a high chance
@@ -135,7 +137,15 @@ def __init__(
         self.env_fns = env_fns
         self.shared_memory = shared_memory
         self.copy = copy
+        self.context = context
+        self.daemon = daemon
+        self.worker = worker
         self.observation_mode = observation_mode
+        self.autoreset_mode = (
+            autoreset_mode
+            if isinstance(autoreset_mode, AutoresetMode)
+            else AutoresetMode(autoreset_mode)
+        )
 
         self.num_envs = len(env_fns)
 
@@ -145,6 +155,7 @@ def __init__(
 
         # As we support `make_vec(spec)` then we can't include a `spec = dummy_env.spec` as this doesn't guarantee we can actual recreate the vector env.
         self.metadata = dummy_env.metadata
+        self.metadata["autoreset_mode"] = self.autoreset_mode
         self.render_mode = dummy_env.render_mode
 
         self.single_action_space = dummy_env.action_space
@@ -211,6 +222,7 @@ def __init__(
                         parent_pipe,
                         _obs_buffer,
                         self.error_queue,
+                        self.autoreset_mode,
                     ),
                 )
 
@@ -287,9 +299,32 @@ def reset_async(
                 str(self._state.value),
             )
 
-        for pipe, env_seed in zip(self.parent_pipes, seed):
-            env_kwargs = {"seed": env_seed, "options": options}
-            pipe.send(("reset", env_kwargs))
+        if options is not None and "reset_mask" in options:
+            reset_mask = options.pop("reset_mask")
+            assert isinstance(
+                reset_mask, np.ndarray
+            ), f"`options['reset_mask': mask]` must be a numpy array, got {type(reset_mask)}"
+            assert reset_mask.shape == (
+                self.num_envs,
+            ), f"`options['reset_mask': mask]` must have shape `({self.num_envs},)`, got {reset_mask.shape}"
+            assert (
+                reset_mask.dtype == np.bool_
+            ), f"`options['reset_mask': mask]` must have `dtype=np.bool_`, got {reset_mask.dtype}"
+            assert np.any(
+                reset_mask
+            ), f"`options['reset_mask': mask]` must contain at least one `True` value, got reset_mask={reset_mask}"
+
+            for pipe, env_seed, env_reset in zip(self.parent_pipes, seed, reset_mask):
+                if env_reset:
+                    env_kwargs = {"seed": env_seed, "options": options}
+                    pipe.send(("reset", env_kwargs))
+                else:
+                    pipe.send(("reset-noop", None))
+        else:
+            for pipe, env_seed in zip(self.parent_pipes, seed):
+                env_kwargs = {"seed": env_seed, "options": options}
+                pipe.send(("reset", env_kwargs))
+
         self._state = AsyncState.WAITING_RESET
 
     def reset_wait(
@@ -688,11 +723,13 @@ def _async_worker(
     parent_pipe: Connection,
     shared_memory: multiprocessing.Array | dict[str, Any] | tuple[Any, ...],
     error_queue: Queue,
+    autoreset_mode: AutoresetMode,
 ):
     env = env_fn()
     observation_space = env.observation_space
     action_space = env.action_space
     autoreset = False
+    observation = None
 
     parent_pipe.close()
 
@@ -709,19 +746,51 @@ def _async_worker(
                     observation = None
                     autoreset = False
                 pipe.send(((observation, info), True))
+            elif command == "reset-noop":
+                pipe.send(((observation, {}), True))
             elif command == "step":
-                if autoreset:
-                    observation, info = env.reset()
-                    reward, terminated, truncated = 0, False, False
-                else:
+                if autoreset_mode == AutoresetMode.NEXT_STEP:
+                    if autoreset:
+                        observation, info = env.reset()
+                        reward, terminated, truncated = 0, False, False
+                    else:
+                        (
+                            observation,
+                            reward,
+                            terminated,
+                            truncated,
+                            info,
+                        ) = env.step(data)
+                    autoreset = terminated or truncated
+                elif autoreset_mode == AutoresetMode.SAME_STEP:
                     (
                         observation,
                         reward,
                         terminated,
                         truncated,
                         info,
                     ) = env.step(data)
-                autoreset = terminated or truncated
+
+                    if terminated or truncated:
+                        reset_observation, reset_info = env.reset()
+
+                        info = {
+                            "final_info": info,
+                            "final_obs": observation,
+                            **reset_info,
+                        }
+                        observation = reset_observation
+                elif autoreset_mode == AutoresetMode.DISABLED:
+                    assert autoreset is False
+                    (
+                        observation,
+                        reward,
+                        terminated,
+                        truncated,
+                        info,
+                    ) = env.step(data)
+                else:
+                    raise ValueError(f"Unexpected autoreset_mode: {autoreset_mode}")
 
                 if shared_memory:
                     write_to_shared_memory(