From d38bd2314ed3311db444c4468eb75d0d5753c407 Mon Sep 17 00:00:00 2001
From: Brandon Trabucco <brandon@btrabucco.com>
Date: Mon, 29 Jan 2024 14:04:09 -0500
Subject: [PATCH] merging changes from Young into hopper-v1 task

---
 design_bench/__init__.py                      |  16 ++
 design_bench/oracles/exact/__init__.py        |   5 +
 .../oracles/exact/hopper_controller_oracle.py |  36 ++--
 .../hopper_controller_stochastic_oracle.py    | 193 ++++++++++++++++++
 setup.py                                      |   2 +-
 5 files changed, 231 insertions(+), 21 deletions(-)
 create mode 100644 design_bench/oracles/exact/hopper_controller_stochastic_oracle.py

diff --git a/design_bench/__init__.py b/design_bench/__init__.py
index adbad6f..c466f75 100644
--- a/design_bench/__init__.py
+++ b/design_bench/__init__.py
@@ -1209,6 +1209,22 @@
              noise_std=0.0))
 
 
+register('HopperController-Exact-v1',
+         'design_bench.datasets.continuous.hopper_controller_dataset:HopperControllerDataset',
+         'design_bench.oracles.exact:HopperControllerStochasticOracle',
+
+         # keyword arguments for building the dataset
+         dataset_kwargs=dict(
+             max_samples=None,
+             distribution=None,
+             max_percentile=100,
+             min_percentile=0),
+
+         # keyword arguments for building the exact oracle
+         oracle_kwargs=dict(
+             noise_std=0.0))
+
+
 register('HopperController-GP-v0',
          'design_bench.datasets.continuous.hopper_controller_dataset:HopperControllerDataset',
          'design_bench.oracles.sklearn:GaussianProcessOracle',
diff --git a/design_bench/oracles/exact/__init__.py b/design_bench/oracles/exact/__init__.py
index 5c103b9..d5df5ee 100644
--- a/design_bench/oracles/exact/__init__.py
+++ b/design_bench/oracles/exact/__init__.py
@@ -18,6 +18,11 @@
 except ImportError as e:
     print("Skipping HopperControllerOracle import:", e)
 
+try:
+    from .hopper_controller_stochastic_oracle import HopperControllerStochasticOracle
+except ImportError as e:
+    print("Skipping HopperControllerStochasticOracle import:", e)
+
 try:
     from .nas_bench_oracle import NASBenchOracle
 except ImportError as e:
diff --git a/design_bench/oracles/exact/hopper_controller_oracle.py b/design_bench/oracles/exact/hopper_controller_oracle.py
index e591134..f1e3485 100644
--- a/design_bench/oracles/exact/hopper_controller_oracle.py
+++ b/design_bench/oracles/exact/hopper_controller_oracle.py
@@ -117,43 +117,40 @@ def protected_predict(self, x, render=False, **render_kwargs):
             value 'x' in a model-based optimization problem
 
         """
+
         # extract weights from the vector design
         weights = []
         for s in ((self.obs_dim, self.hidden_dim),
-                  (1, self.hidden_dim,),
+                  (self.hidden_dim,),
                   (self.hidden_dim, self.hidden_dim),
-                  (1, self.hidden_dim,),
+                  (self.hidden_dim,),
                   (self.hidden_dim, self.action_dim),
-                  (1, self.action_dim,),
+                  (self.action_dim,),
                   (1, self.action_dim)):
             weights.append(x[0:np.prod(s)].reshape(s))
             x = x[np.prod(s):]
 
+        # the final weight is logstd and is not used
+        weights.pop(-1)
+
         # create a policy forward pass in numpy
         def mlp_policy(h):
-            h = h.reshape(1, -1)
             h = np.tanh(h @ weights[0] + weights[1])
             h = np.tanh(h @ weights[2] + weights[3])
-            h = h @ weights[4] + weights[5] + np.random.randn(1, self.action_dim) * np.exp(weights[6])
-            return h
+            return h @ weights[4] + weights[5]
 
         # make a copy of the policy and the environment
         env = gym.make(self.env_name)
 
         # perform a single rollout for quick evaluation
+        obs = env.reset()
+        done = False
         path_returns = np.zeros([1], dtype=np.float32)
-        total_return = 0.0
-        for _ in range(self.eval_n_trials):
-            obs = env.reset()
-            done = False
-            for step in range(1000):
-                obs, rew, done, info = env.step(mlp_policy(obs))
-                if render:
-                    env.render(**render_kwargs)
-                total_return += rew
-                if done:
-                    break
-        path_returns[0] = total_return / self.eval_n_trials
+        while not done:
+            obs, rew, done, info = env.step(mlp_policy(obs))
+            if render:
+                env.render(**render_kwargs)
+            path_returns += rew.astype(np.float32)
 
         # return the sum of rewards for a single trajectory
         return path_returns.astype(np.float32)
@@ -184,10 +181,9 @@ def __init__(self, dataset: ContinuousDataset, **kwargs):
         self.action_dim = 3
         self.hidden_dim = 64
         self.env_name = 'Hopper-v2'
-        self.eval_n_trials = 10
 
         # initialize the oracle using the super class
         super(HopperControllerOracle, self).__init__(
             dataset, internal_batch_size=1, is_batched=False,
             expect_normalized_y=False,
-            expect_normalized_x=False, expect_logits=None, **kwargs)
+            expect_normalized_x=False, expect_logits=None, **kwargs)
\ No newline at end of file
diff --git a/design_bench/oracles/exact/hopper_controller_stochastic_oracle.py b/design_bench/oracles/exact/hopper_controller_stochastic_oracle.py
new file mode 100644
index 0000000..92b40d4
--- /dev/null
+++ b/design_bench/oracles/exact/hopper_controller_stochastic_oracle.py
@@ -0,0 +1,193 @@
+from design_bench.oracles.exact_oracle import ExactOracle
+from design_bench.datasets.continuous_dataset import ContinuousDataset
+from design_bench.datasets.continuous.hopper_controller_dataset import HopperControllerDataset
+import numpy as np
+import gym
+
+
+class HopperControllerStochasticOracle(ExactOracle):
+    """An exact oracle giving a stochastic ground truth score function f(x)
+    for Hopper controller model-based optimization problems, where the
+    goal is to find a design 'x' that maximizes a prediction 'y':
+
+    max_x { y = f(x) }
+
+    Public Attributes:
+
+    external_dataset: DatasetBuilder
+        an instance of a subclass of the DatasetBuilder class which points to
+        the mutable task dataset for a model-based optimization problem
+
+    internal_dataset: DatasetBuilder
+        an instance of a subclass of the DatasetBuilder class which has frozen
+        statistics and is used for training the oracle
+
+    is_batched: bool
+        a boolean variable that indicates whether the evaluation function
+        implemented for a particular oracle is batched, which affects
+        the scaling coefficient of its computational cost
+
+    internal_batch_size: int
+        an integer representing the number of design values to process
+        internally at the same time, if None defaults to the entire
+        tensor given to the self.score method
+    internal_measurements: int
+        an integer representing the number of independent measurements of
+        the prediction made by the oracle, which are subsequently
+        averaged, and is useful when the oracle is stochastic
+
+    noise_std: float
+        the standard deviation of gaussian noise added to the prediction
+        values 'y' coming out of the ground truth score function f(x)
+        in order to make the optimization problem difficult
+
+    expect_normalized_y: bool
+        a boolean indicator that specifies whether the outputs of the oracle
+        score function are expected to be normalized
+    expect_normalized_x: bool
+        a boolean indicator that specifies whether the inputs to the oracle
+        score function are expected to be normalized
+    expect_logits: bool
+        a boolean that specifies whether the oracle score function is
+        expecting logits when the dataset is discrete
+
+    Public Methods:
+
+    predict(np.ndarray) -> np.ndarray:
+        a function that accepts a batch of design values 'x' as input and for
+        each design computes a prediction value 'y' which corresponds
+        to the score in a model-based optimization problem
+
+    check_input_format(DatasetBuilder) -> bool:
+        a function that accepts a list of integers as input and returns true
+        when design values 'x' with the shape specified by that list are
+        compatible with this class of exact oracle
+
+    """
+
+    name = "exact_average_return"
+
+    @classmethod
+    def supported_datasets(cls):
+        """An attribute that defines the set of dataset classes which this
+        oracle can be applied to forming a valid ground truth score
+        function for a model-based optimization problem
+
+        """
+
+        return {HopperControllerDataset}
+
+    @classmethod
+    def fully_characterized(cls):
+        """An attribute that defines whether all possible inputs to the
+        model-based optimization problem have been evaluated and
+        are returned via lookup in self.predict
+
+        """
+
+        return False
+
+    @classmethod
+    def is_simulated(cls):
+        """An attribute that defines whether the values returned by the oracle
+        were obtained by running a computer simulation rather than
+        performing physical experiments with real data
+
+        """
+
+        return True
+
+    def protected_predict(self, x, render=False, **render_kwargs):
+        """Score function to be implemented by oracle subclasses, where x is
+        either a batch of designs if self.is_batched is True or is a
+        single design when self.is_batched is False
+
+        Arguments:
+
+        x: np.ndarray
+            a batch or single design 'x' that will be given as input to the
+            oracle model in order to obtain a prediction value 'y' for
+            each 'x' which is then returned
+
+        Returns:
+
+        y_batch: np.ndarray
+            a batch or single prediction 'y' made by the oracle model,
+            corresponding to the ground truth score for each design
+            value 'x' in a model-based optimization problem
+
+        """
+        # extract weights from the vector design
+        weights = []
+        for s in ((self.obs_dim, self.hidden_dim),
+                  (1, self.hidden_dim,),
+                  (self.hidden_dim, self.hidden_dim),
+                  (1, self.hidden_dim,),
+                  (self.hidden_dim, self.action_dim),
+                  (1, self.action_dim,),
+                  (1, self.action_dim)):
+            weights.append(x[0:np.prod(s)].reshape(s))
+            x = x[np.prod(s):]
+
+        # create a policy forward pass in numpy
+        def mlp_policy(h):
+            h = h.reshape(1, -1)
+            h = np.tanh(h @ weights[0] + weights[1])
+            h = np.tanh(h @ weights[2] + weights[3])
+            h = h @ weights[4] + weights[5] + np.random.randn(1, self.action_dim) * np.exp(weights[6])
+            return h
+
+        # create a new instance of the gym environment
+        env = gym.make(self.env_name)
+
+        # average the return over self.eval_n_trials rollouts (max 1000 steps)
+        path_returns = np.zeros([1], dtype=np.float32)
+        total_return = 0.0
+        for _ in range(self.eval_n_trials):
+            obs = env.reset()
+            done = False
+            for step in range(1000):
+                obs, rew, done, info = env.step(mlp_policy(obs))
+                if render:
+                    env.render(**render_kwargs)
+                total_return += rew
+                if done:
+                    break
+        path_returns[0] = total_return / self.eval_n_trials
+
+        # return the total reward averaged over the evaluation rollouts
+        return path_returns.astype(np.float32)
+
+    def __init__(self, dataset: ContinuousDataset, **kwargs):
+        """Initialize the ground truth score function f(x) for a model-based
+        optimization problem, which involves loading the parameters of an
+        oracle model and estimating its computational cost
+
+        Arguments:
+
+        dataset: ContinuousDataset
+            an instance of a subclass of the DatasetBuilder class which has
+            a set of design values 'x' and prediction values 'y', and defines
+            batching and sampling methods for those attributes
+        noise_std: float
+            the standard deviation of gaussian noise added to the prediction
+            values 'y' coming out of the ground truth score function f(x)
+            in order to make the optimization problem difficult
+        internal_measurements: int
+            an integer representing the number of independent measurements of
+            the prediction made by the oracle, which are subsequently
+            averaged, and is useful when the oracle is stochastic
+
+        """
+
+        self.obs_dim = 11
+        self.action_dim = 3
+        self.hidden_dim = 64
+        self.env_name = 'Hopper-v2'
+        self.eval_n_trials = 10
+
+        # initialize the oracle using the super class
+        super(HopperControllerStochasticOracle, self).__init__(
+            dataset, internal_batch_size=1, is_batched=False,
+            expect_normalized_y=False,
+            expect_normalized_x=False, expect_logits=None, **kwargs)
diff --git a/setup.py b/setup.py
index 7767add..40881b6 100644
--- a/setup.py
+++ b/setup.py
@@ -20,7 +20,7 @@
                    'brandontrabucco/design-bench/archive/v2_0_20.tar.gz',
       keywords=['Deep Learning', 'Neural Networks',
                 'Benchmark', 'Model-Based Optimization'],
-      extras_require={'all': ['gym[mujoco]'], 'cma': ['cma']},
+      extras_require={'all': ['gym[mujoco]<0.26.0'], 'cma': ['cma']},
       install_requires=['pandas', 'requests', 'scikit-learn',
                         'torch', 'torchvision', 'numpy',
                         'tensorflow>=2.2', 'transformers',