Commit

add seeds
jduerholt committed Dec 18, 2023
1 parent d4c5c49 commit 379a7c7
Showing 11 changed files with 71 additions and 34 deletions.
40 changes: 25 additions & 15 deletions bofire/data_models/domain/features.py
@@ -104,7 +104,7 @@ def get_by_keys(self, keys: Sequence[str]) -> Features:
def get(
self,
includes: Union[Type, List[Type]] = AnyFeature,
excludes: Union[Type, List[Type]] = None,
excludes: Union[Type, List[Type]] = None, # type: ignore
exact: bool = False,
) -> Features:
"""get features of the domain
@@ -132,7 +132,7 @@ def get(
def get_keys(
self,
includes: Union[Type, List[Type]] = AnyFeature,
excludes: Union[Type, List[Type]] = None,
excludes: Union[Type, List[Type]] = None, # type: ignore
exact: bool = False,
) -> List[str]:
"""Method to get feature keys of the domain
@@ -186,6 +186,7 @@ def sample(
self,
n: int = 1,
method: SamplingMethodEnum = SamplingMethodEnum.UNIFORM,
seed: Optional[int] = None,
) -> pd.DataFrame:
"""Draw sobol samples
@@ -199,15 +200,18 @@
"""
if method == SamplingMethodEnum.UNIFORM:
return self.validate_candidates(
pd.concat([feat.sample(n) for feat in self.get(Input)], axis=1) # type: ignore
pd.concat(
[feat.sample(n, seed=seed) for feat in self.get(Input)], # type: ignore
axis=1,
)
)
free_features = self.get_free()
if method == SamplingMethodEnum.SOBOL:
with warnings.catch_warnings():
warnings.simplefilter("ignore")
X = Sobol(len(free_features)).random(n)
X = Sobol(len(free_features), seed=seed).random(n)
else:
X = LatinHypercube(len(free_features)).random(n)
X = LatinHypercube(len(free_features), seed=seed).random(n)
res = []
for i, feat in enumerate(free_features):
if isinstance(feat, ContinuousInput):
@@ -247,7 +251,9 @@ def validate_candidates(self, candidates: pd.DataFrame) -> pd.DataFrame:
for feature in self:
if feature.key not in candidates:
raise ValueError(f"no col for input feature `{feature.key}`")
candidates[feature.key] = feature.validate_candidental(candidates[feature.key]) # type: ignore
candidates[feature.key] = feature.validate_candidental( # type: ignore
candidates[feature.key]
)
if candidates[self.get_keys()].isnull().to_numpy().any():
raise ValueError("there are null values")
if candidates[self.get_keys()].isna().to_numpy().any():
@@ -260,7 +266,10 @@ def validate_experiments(
for feature in self:
if feature.key not in experiments:
raise ValueError(f"no col for input feature `{feature.key}`")
experiments[feature.key] = feature.validate_experimental(experiments[feature.key], strict=strict) # type: ignore
experiments[feature.key] = feature.validate_experimental(
experiments[feature.key],
strict=strict, # type: ignore
)
if experiments[self.get_keys()].isnull().to_numpy().any():
raise ValueError("there are null values")
if experiments[self.get_keys()].isna().to_numpy().any():
@@ -270,7 +279,7 @@
def get_categorical_combinations(
self,
include: Union[Type, List[Type]] = Input,
exclude: Union[Type, List[Type]] = None,
exclude: Union[Type, List[Type]] = None, # type: ignore
):
"""get a list of tuples pairing the feature keys with a list of valid categories
@@ -361,9 +370,7 @@ def _get_transform_info(
counter += len(feat.descriptors)
elif isinstance(specs[feat.key], MolFeatures):
assert isinstance(feat, MolecularInput)
descriptor_names = specs[
feat.key
].get_descriptor_names() # type: ignore
descriptor_names = specs[feat.key].get_descriptor_names() # type: ignore
features2idx[feat.key] = tuple(
(np.array(range(len(descriptor_names))) + counter).tolist()
)
@@ -450,7 +457,9 @@ def inverse_transform(
transformed.append(feat.from_descriptor_encoding(experiments))
elif isinstance(specs[feat.key], MolFeatures):
assert isinstance(feat, CategoricalMolecularInput)
transformed.append(feat.from_descriptor_encoding(specs[feat.key], experiments)) # type: ignore
transformed.append(
feat.from_descriptor_encoding(specs[feat.key], experiments) # type: ignore
)

return pd.concat(transformed, axis=1)

@@ -574,9 +583,9 @@ def get_by_objective(
features=sorted(
filter_by_attribute(
self.get(ContinuousOutput).features,
lambda of: of.objective,
lambda of: of.objective, # type: ignore
includes,
excludes,
excludes, # type: ignore
exact,
)
)
@@ -682,7 +691,8 @@ def validate_candidates(self, candidates: pd.DataFrame) -> pd.DataFrame:
+ [
[f"{key}_pred", f"{key}_sd"]
for key in self.get_keys_by_objective(
excludes=Objective, includes=None # type: ignore
excludes=Objective,
includes=None, # type: ignore
)
]
)
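For context, a minimal sketch (not part of the commit) of the seeding pattern that the `Inputs.sample` hunk above relies on: scipy's QMC engines accept a `seed` argument, so Sobol and Latin hypercube designs become reproducible. The dimensions and feature bounds below are illustrative only.

```python
import numpy as np
from scipy.stats.qmc import LatinHypercube, Sobol, scale

d, n, seed = 2, 8, 42  # two illustrative features, eight samples

# same seed -> identical Sobol design (drawn on the unit cube)
X_sobol = Sobol(d, seed=seed).random(n)
assert np.allclose(Sobol(d, seed=seed).random(n), X_sobol)

# Latin hypercube works the same way; scale maps unit-cube samples onto
# feature bounds, e.g. x1 in [0, 1] and x2 in [0, 10]
X_lhs = LatinHypercube(d, seed=seed).random(n)
X_scaled = scale(X_lhs, l_bounds=[0.0, 0.0], u_bounds=[1.0, 10.0])
```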
7 changes: 5 additions & 2 deletions bofire/data_models/features/categorical.py
@@ -298,7 +298,7 @@ def from_ordinal_encoding(self, values: pd.Series) -> pd.Series:
enc = np.array(self.categories)
return pd.Series(enc[values], index=values.index, name=self.key)

def sample(self, n: int) -> pd.Series:
def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
"""Draw random samples from the feature.
Args:
@@ -308,7 +308,10 @@ def sample(self, n: int) -> pd.Series:
pd.Series: drawn samples.
"""
return pd.Series(
name=self.key, data=np.random.choice(self.get_allowed_categories(), n)
name=self.key,
data=np.random.default_rng(seed=seed).choice(
self.get_allowed_categories(), n
),
)

def get_bounds(
6 changes: 4 additions & 2 deletions bofire/data_models/features/continuous.py
@@ -118,7 +118,7 @@ def validate_candidental(self, values: pd.Series) -> pd.Series:
)
return values

def sample(self, n: int) -> pd.Series:
def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
"""Draw random samples from the feature.
Args:
@@ -129,7 +129,9 @@ def sample(self, n: int) -> pd.Series:
"""
return pd.Series(
name=self.key,
data=np.random.uniform(self.lower_bound, self.upper_bound, n),
data=np.random.default_rng(seed=seed).uniform(
self.lower_bound, self.upper_bound, n
),
)

def __str__(self) -> str:
8 changes: 5 additions & 3 deletions bofire/data_models/features/discrete.py
@@ -1,4 +1,4 @@
from typing import ClassVar, Literal
from typing import ClassVar, Literal, Optional

import numpy as np
import pandas as pd
@@ -75,7 +75,7 @@ def validate_candidental(self, values: pd.Series) -> pd.Series:
)
return values

def sample(self, n: int) -> pd.Series:
def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
"""Draw random samples from the feature.
Args:
@@ -84,7 +84,9 @@
Returns:
pd.Series: drawn samples.
"""
return pd.Series(name=self.key, data=np.random.choice(self.values, n))
return pd.Series(
name=self.key, data=np.random.default_rng(seed=seed).choice(self.values, n)
)

def from_continuous(self, values: pd.DataFrame) -> pd.Series:
"""Rounds continuous values to the closest discrete ones.
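The categorical, continuous, and discrete `sample` methods above all switch to the same NumPy idiom: build a local generator with `np.random.default_rng(seed)` instead of drawing from the global random state. A small illustrative sketch (values are made up, not from the commit):

```python
import numpy as np
import pandas as pd

seed = 7

# categorical/discrete style: seeded choice from the allowed values
cats = pd.Series(np.random.default_rng(seed).choice(["a", "b", "c"], 5), name="cat")

# continuous style: seeded uniform draw between bounds
cont = pd.Series(np.random.default_rng(seed).uniform(0.0, 10.0, 5), name="x")

# the same seed reproduces the draw without touching np.random's global state
assert cats.equals(pd.Series(np.random.default_rng(seed).choice(["a", "b", "c"], 5), name="cat"))
assert np.allclose(cont, np.random.default_rng(seed).uniform(0.0, 10.0, 5))
```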
2 changes: 1 addition & 1 deletion bofire/data_models/features/feature.py
@@ -95,7 +95,7 @@ def validate_candidental(self, values: pd.Series) -> pd.Series:
pass

@abstractmethod
def sample(self, n: int) -> pd.Series:
def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
"""Sample a series of allowed values.
Args:
2 changes: 1 addition & 1 deletion bofire/data_models/features/molecular.py
@@ -47,7 +47,7 @@ def is_fixed(self) -> bool:
def fixed_value(self, transform_type: Optional[AnyMolFeatures] = None) -> None:
return None

def sample(self, n: int) -> pd.Series:
def sample(self, n: int, seed: Optional[int] = None) -> pd.Series:
raise ValueError("Sampling not supported for `MolecularInput`")

def get_bounds(
8 changes: 5 additions & 3 deletions bofire/strategies/samplers/polytope.py
@@ -47,7 +47,9 @@ def __init__(

def _ask(self, n: int) -> pd.DataFrame:
if len(self.domain.constraints) == 0:
return self.domain.inputs.sample(n, self.fallback_sampling_method)
return self.domain.inputs.sample(
n, self.fallback_sampling_method, seed=self._get_seed()
)

# check if we have pseudo fixed features in the linear equality constraints
# a pseudo fixed is a linear euquality constraint with only one feature included
@@ -142,7 +144,7 @@ def _ask(self, n: int) -> pd.DataFrame:
equality_constraints=combined_eqs if len(combined_eqs) > 0 else None,
n_burnin=self.n_burnin,
thinning=self.n_thinning,
seed=self.rng.integers(1, 1000),
seed=self._get_seed(),
).squeeze(dim=0)

# check that the random generated candidates are not always the same
Expand All @@ -163,7 +165,7 @@ def _ask(self, n: int) -> pd.DataFrame:

# setup the categoricals and discrete ones as uniform sampled vals
for feat in self.domain.get_features([CategoricalInput, DiscreteInput]):
samples[feat.key] = feat.sample(n) # type: ignore
samples[feat.key] = feat.sample(n, seed=self._get_seed()) # type: ignore

# setup the fixed continuous ones
for key, value in fixed_features.items():
8 changes: 6 additions & 2 deletions bofire/strategies/samplers/rejection.py
@@ -28,15 +28,19 @@

def _ask(self, n: int) -> pd.DataFrame:
if len(self.domain.constraints) == 0:
return self.domain.inputs.sample(n, self.sampling_method)
return self.domain.inputs.sample(
n, self.sampling_method, seed=self._get_seed()
)
n_iters = 0
n_found = 0
valid_samples = []
while n_found < n:
if n_iters > self.max_iters:
raise ValueError("Maximum iterations exceeded in rejection sampling.")
samples = self.domain.inputs.sample(
self.num_base_samples, method=self.sampling_method
self.num_base_samples,
method=self.sampling_method,
seed=self._get_seed(),
)
valid = self.domain.constraints.is_fulfilled(samples)
n_found += np.sum(valid)
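A self-contained sketch of the seeded rejection loop above, written against plain NumPy/pandas rather than the BoFire classes (the helper, constraint, and column names are illustrative): each batch of base samples gets a fresh child seed drawn from a parent generator, mirroring `_get_seed`, so the whole loop is reproducible.

```python
import numpy as np
import pandas as pd

def rejection_sample(n, is_fulfilled, num_base_samples=128, max_iters=1000, seed=None):
    parent = np.random.default_rng(seed)
    batches, n_found, n_iters = [], 0, 0
    while n_found < n:
        if n_iters > max_iters:
            raise ValueError("Maximum iterations exceeded in rejection sampling.")
        child_seed = int(parent.integers(1, 100000))  # one fresh seed per batch
        rng = np.random.default_rng(child_seed)
        batch = pd.DataFrame(
            {"x1": rng.uniform(0, 1, num_base_samples),
             "x2": rng.uniform(0, 1, num_base_samples)}
        )
        valid = is_fulfilled(batch)          # boolean mask of feasible rows
        n_found += int(valid.sum())
        batches.append(batch[valid])
        n_iters += 1
    return pd.concat(batches, ignore_index=True).iloc[:n]

# keep only points fulfilling x1 + x2 <= 1; same seed -> same result
s1 = rejection_sample(10, lambda df: df["x1"] + df["x2"] <= 1.0, seed=42)
s2 = rejection_sample(10, lambda df: df["x1"] + df["x2"] <= 1.0, seed=42)
assert s1.equals(s2)
```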
7 changes: 6 additions & 1 deletion bofire/strategies/samplers/sampler.py
@@ -92,7 +92,12 @@ def ask(
raise_validation_error=raise_validation_error,
)
return self.domain.validate_candidates(
samples.sample(n=candidate_count, replace=False, ignore_index=True),
samples.sample(
n=candidate_count,
replace=False,
ignore_index=True,
random_state=self._get_seed(),
),
only_inputs=True,
raise_validation_error=raise_validation_error,
)
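The subsampling above leans on pandas: `DataFrame.sample` accepts a `random_state`, so picking `candidate_count` rows without replacement is deterministic for a fixed seed. A tiny sketch with made-up data:

```python
import pandas as pd

samples = pd.DataFrame({"x1": [0.1, 0.4, 0.7, 0.9], "x2": [1.0, 2.0, 3.0, 4.0]})

picked = samples.sample(n=2, replace=False, ignore_index=True, random_state=7)
picked_again = samples.sample(n=2, replace=False, ignore_index=True, random_state=7)
assert picked.equals(picked_again)  # same seed -> same subsample
```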
9 changes: 9 additions & 0 deletions bofire/strategies/strategy.py
@@ -26,6 +26,15 @@ def __init__(
self._experiments = None
self._candidates = None

def _get_seed(self) -> int:
"""Returns an integer sampled from the strategies random number generator,
that can be used to seed dependent generators.
Returns:
int: random seed.
"""
return int(self.rng.integers(1, 100000))

@classmethod
def from_spec(cls, data_model: DataModel) -> "Strategy":
"""Used by the mapper to map from data model to functional strategy."""
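A minimal sketch of the pattern `_get_seed` enables (illustrative names, not the BoFire `Strategy` class itself): one parent generator, seeded once, hands out integer seeds, and every dependent component builds its own generator from one of them, so nested randomness stays reproducible end to end.

```python
import numpy as np

class SeededSampler:
    """Stand-in for any component that needs its own random stream."""

    def __init__(self, seed: int):
        self.rng = np.random.default_rng(seed)

    def draw(self, n: int) -> np.ndarray:
        return self.rng.uniform(size=n)

parent_rng = np.random.default_rng(2023)  # seeded once for the whole strategy

def get_seed() -> int:
    # mirrors Strategy._get_seed: derive a child seed from the parent generator
    return int(parent_rng.integers(1, 100000))

first = SeededSampler(get_seed()).draw(3)
second = SeededSampler(get_seed()).draw(3)  # different child seed -> independent stream
```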
8 changes: 4 additions & 4 deletions tests/bofire/data_models/test_features.py
@@ -825,7 +825,6 @@ def test_categorical_descriptor_from_descriptor_encoding(key, categories, descri
data=[[1.05, 2.5, 6], [4, 4.5, 9]],
)
samples = c1.from_descriptor_encoding(descriptor_values)
print(samples)
assert np.all(samples == pd.Series([categories[0], categories[1]]))

c2 = CategoricalDescriptorInput(
@@ -837,7 +836,6 @@ def test_categorical_descriptor_from_descriptor_encoding(key, categories, descri
)

samples = c2.from_descriptor_encoding(descriptor_values)
print(samples)
assert np.all(samples == pd.Series([categories[1], categories[1]]))


@@ -1426,14 +1424,16 @@ def test_inputs_get_free(features, expected):
inputs,
Inputs(features=[if1, if2, if3, if4, if5, if7]),
]
for num_samples in [1, 2, 1024]
for num_samples in [1, 2, 64]
for method in ["UNIFORM", "SOBOL", "LHS"]
],
)
def test_inputs_sample(features: Inputs, num_samples, method):
samples = features.sample(num_samples, method=method)
samples = features.sample(num_samples, method=method, seed=42)
assert samples.shape == (num_samples, len(features))
assert list(samples.columns) == features.get_keys()
samples2 = features.sample(num_samples, method=method, seed=42)
assert_frame_equal(samples2, samples)


@pytest.mark.parametrize(
