diff --git a/.github/workflows/py.test.yml b/.github/workflows/py.test.yml index 0fb2e7cbd..0f11a5c24 100644 --- a/.github/workflows/py.test.yml +++ b/.github/workflows/py.test.yml @@ -24,5 +24,5 @@ jobs: with: python-version: ${{ matrix.python-version }} cache: "poetry" - - run: poetry install --with test + - run: poetry install - run: poetry run pytest diff --git a/autora/cycle/__init__.py b/autora/cycle/__init__.py index 6d5ebaa75..5de7141e9 100644 --- a/autora/cycle/__init__.py +++ b/autora/cycle/__init__.py @@ -1 +1 @@ -from ._simple_cycle import _SimpleCycle as Cycle +from .simple import SimpleCycle as Cycle diff --git a/autora/cycle/_simple_cycle.py b/autora/cycle/_simple_cycle.py deleted file mode 100644 index 03172b7dd..000000000 --- a/autora/cycle/_simple_cycle.py +++ /dev/null @@ -1,269 +0,0 @@ -import copy -from dataclasses import dataclass, replace -from typing import Callable, Iterable, List, Optional - -import numpy as np -from sklearn.base import BaseEstimator - -from autora.experimentalist.pipeline import Pipeline -from autora.variable import VariableCollection - - -@dataclass(frozen=True) -class _SimpleCycleData: - """An object passed between processing steps in the _SimpleCycle which holds all the - data which can be updated.""" - - # Static - metadata: VariableCollection - - # Aggregates each cycle from the: - # ... Experimentalist - conditions: List[np.ndarray] - # ... Experiment Runner - observations: List[np.ndarray] - # ... Theorist - theories: List[BaseEstimator] - - -class _SimpleCycle: - """ - - Args: - metadata: - theorist: - experimentalist: - experiment_runner: - - Examples: - - Aim: Use the Cycle to recover a simple ground truth theory from noisy data. - - >>> def ground_truth(x): - ... return x + 1 - - The space of allowed x values is the integers between 0 and 10 inclusive, - and we record the allowed output values as well. - >>> from autora.variable import VariableCollection, Variable - >>> study_metadata = VariableCollection( - ... independent_variables=[Variable(name="x1", allowed_values=range(11))], - ... dependent_variables=[Variable(name="y", value_range=(-20, 20))], - ... ) - - The experimentalist is used to propose experiments. - Since the space of values is so restricted, we can just sample them all each time. - >>> from autora.experimentalist.pipeline import make_pipeline - >>> example_experimentalist = make_pipeline( - ... [study_metadata.independent_variables[0].allowed_values]) - - When we run a synthetic experiment, we get a reproducible noisy result: - >>> import numpy as np - >>> def get_example_synthetic_experiment_runner(): - ... rng = np.random.default_rng(seed=180) - ... def runner(x): - ... return ground_truth(x) + rng.normal(0, 0.1, x.shape) - ... return runner - >>> example_synthetic_experiment_runner = get_example_synthetic_experiment_runner() - >>> example_synthetic_experiment_runner(np.ndarray([1])) - array([2.04339546]) - - The theorist "tries" to work out the best theory. - We use a trivial scikit-learn regressor. - >>> from sklearn.linear_model import LinearRegression - >>> example_theorist = LinearRegression() - - We initialize the Cycle with the metadata describing the domain of the theory, - the theorist, experimentalist and experiment runner, - as well as a monitor which will let us know which cycle we're currently on. - >>> cycle = _SimpleCycle( - ... metadata=study_metadata, - ... theorist=example_theorist, - ... experimentalist=example_experimentalist, - ... experiment_runner=example_synthetic_experiment_runner, - ... 
monitor=lambda data: print(f"Generated {len(data.theories)} theories"), - ... ) - >>> cycle # doctest: +ELLIPSIS - <_simple_cycle._SimpleCycle object at 0x...> - - We can run the cycle by calling the run method: - >>> cycle.run(num_cycles=3) # doctest: +ELLIPSIS - Generated 1 theories - Generated 2 theories - Generated 3 theories - <_simple_cycle._SimpleCycle object at 0x...> - - We can now interrogate the results. The first set of conditions which went into the - experiment runner were: - >>> cycle.data.conditions[0] - array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) - - The observations include the conditions and the results: - >>> cycle.data.observations[0] - array([[ 0. , 0.92675345], - [ 1. , 1.89519928], - [ 2. , 3.08746571], - [ 3. , 3.93023943], - [ 4. , 4.95429102], - [ 5. , 6.04763988], - [ 6. , 7.20770574], - [ 7. , 7.85681519], - [ 8. , 9.05735823], - [ 9. , 10.18713406], - [10. , 10.88517906]]) - - In the third cycle (index = 2) the first and last values are different again: - >>> cycle.data.observations[2][[0,-1]] - array([[ 0. , 1.08559827], - [10. , 11.08179553]]) - - The best fit theory after the first cycle is: - >>> cycle.data.theories[0] - LinearRegression() - - >>> def report_linear_fit(m: LinearRegression, precision=4): - ... s = f"y = {np.round(m.coef_[0].item(), precision)} x " \\ - ... f"+ {np.round(m.intercept_.item(), 4)}" - ... return s - >>> report_linear_fit(cycle.data.theories[0]) - 'y = 1.0089 x + 0.9589' - - The best fit theory after all the cycles, including all the data, is: - >>> report_linear_fit(cycle.data.theories[-1]) - 'y = 0.9989 x + 1.0292' - - This is close to the ground truth theory of x -> (x + 1) - - We can also run the cycle with more control over the execution flow: - >>> next(cycle) # doctest: +ELLIPSIS - Generated 4 theories - <_simple_cycle._SimpleCycle object at 0x...> - - >>> next(cycle) # doctest: +ELLIPSIS - Generated 5 theories - <_simple_cycle._SimpleCycle object at 0x...> - - >>> next(cycle) # doctest: +ELLIPSIS - Generated 6 theories - <_simple_cycle._SimpleCycle object at 0x...> - - We can continue to run the cycle as long as we like, - with a simple arbitrary stopping condition like the number of theories generated: - >>> from itertools import takewhile - >>> _ = list(takewhile(lambda c: len(c.data.theories) < 9, cycle)) - Generated 7 theories - Generated 8 theories - Generated 9 theories - - ... or the precision (here we keep iterating while the difference between the gradients - between the second-last and last cycle is larger than 1x10^-3). - >>> _ = list( - ... takewhile( - ... lambda c: np.abs(c.data.theories[-1].coef_.item() - - ... c.data.theories[-2].coef_.item()) > 1e-3, - ... cycle - ... ) - ... ) - Generated 10 theories - Generated 11 theories - - ... or continue to run as long as we like: - >>> _ = cycle.run(num_cycles=100) # doctest: +ELLIPSIS - Generated 12 theories - ... 
- Generated 111 theories - - - - - """ - - def __init__( - self, - metadata: VariableCollection, - theorist, - experimentalist, - experiment_runner, - monitor: Optional[Callable[[_SimpleCycleData], None]] = None, - ): - - self.theorist = theorist - self.experimentalist = experimentalist - self.experiment_runner = experiment_runner - self.monitor = monitor - - self.data = _SimpleCycleData( - metadata=metadata, - conditions=[], - observations=[], - theories=[], - ) - - def run(self, num_cycles: int = 1): - for i in range(num_cycles): - next(self) - return self - - def __next__(self): - data = self.data - data = self._experimentalist_callback(self.experimentalist, data) - data = self._experiment_runner_callback(self.experiment_runner, data) - data = self._theorist_callback(self.theorist, data) - self._monitor_callback(data) - self.data = data - return self - - def __iter__(self): - return self - - @staticmethod - def _experimentalist_callback(experimentalist: Pipeline, data_in: _SimpleCycleData): - new_conditions = experimentalist() - if isinstance(new_conditions, Iterable): - # If the pipeline gives us an iterable, we need to make it into a concrete array. - # We can't move this logic to the Pipeline, because the pipeline doesn't know whether - # it's within another pipeline and whether it should convert the iterable to a - # concrete array. - new_conditions_values = list(new_conditions) - new_conditions_array = np.array(new_conditions_values) - else: - raise NotImplementedError(f"Object {new_conditions} can't be handled yet.") - - assert isinstance( - new_conditions_array, np.ndarray - ) # Check the object is bounded - data_out = replace( - data_in, - conditions=data_in.conditions + [new_conditions_array], - ) - return data_out - - @staticmethod - def _experiment_runner_callback( - experiment_runner: Callable, data_in: _SimpleCycleData - ): - x = data_in.conditions[-1] - y = experiment_runner(x) - new_observations = np.column_stack([x, y]) - data_out = replace( - data_in, observations=data_in.observations + [new_observations] - ) - return data_out - - @staticmethod - def _theorist_callback(theorist, data_in: _SimpleCycleData): - all_observations = np.row_stack(data_in.observations) - n_xs = len( - data_in.metadata.independent_variables - ) # The number of independent variables - x, y = all_observations[:, :n_xs], all_observations[:, n_xs:] - new_theorist = copy.copy(theorist) - new_theorist.fit(x, y) - data_out = replace( - data_in, - theories=data_in.theories + [new_theorist], - ) - return data_out - - def _monitor_callback(self, data: _SimpleCycleData): - if self.monitor is not None: - self.monitor(data) diff --git a/autora/cycle/simple.py b/autora/cycle/simple.py new file mode 100644 index 000000000..15ec46a6e --- /dev/null +++ b/autora/cycle/simple.py @@ -0,0 +1,525 @@ +import copy +from collections.abc import Mapping +from dataclasses import dataclass, replace +from typing import Callable, Dict, Iterable, List, Optional + +import numpy as np +from sklearn.base import BaseEstimator + +from autora.experimentalist.pipeline import Pipeline +from autora.utils.dictionary import LazyDict +from autora.variable import VariableCollection + + +@dataclass(frozen=True) +class SimpleCycleData: + """An object passed between and updated by processing steps in the SimpleCycle.""" + + # Static + metadata: VariableCollection + + # Aggregates each cycle from the: + # ... Experimentalist + conditions: List[np.ndarray] + # ... Experiment Runner + observations: List[np.ndarray] + # ... 
Theorist + theories: List[BaseEstimator] + + +def _get_cycle_properties(data: SimpleCycleData): + """ + Examples: + Even with an empty data object, we can initialize the dictionary, + >>> cycle_properties = _get_cycle_properties(SimpleCycleData(metadata=VariableCollection(), + ... conditions=[], observations=[], theories=[])) + + ... but it will raise an exception if a value isn't yet available when we try to use it + >>> cycle_properties["%theories[-1]%"] # doctest: +ELLIPSIS + Traceback (most recent call last): + ... + IndexError: list index out of range + + Nevertheless, we can iterate through its keys no problem: + >>> [key for key in cycle_properties.keys()] # doctest: +NORMALIZE_WHITESPACE + ['%observations.ivs[-1]%', '%observations.dvs[-1]%', '%observations.ivs%', + '%observations.dvs%', '%theories[-1]%', '%theories%'] + + """ + + n_ivs = len(data.metadata.independent_variables) + n_dvs = len(data.metadata.dependent_variables) + cycle_property_dict = LazyDict( + { + "%observations.ivs[-1]%": lambda: data.observations[-1][:, 0:n_ivs], + "%observations.dvs[-1]%": lambda: data.observations[-1][:, n_ivs:], + "%observations.ivs%": lambda: np.row_stack( + [np.empty([0, n_ivs + n_dvs])] + data.observations + )[:, 0:n_ivs], + "%observations.dvs%": lambda: np.row_stack(data.observations)[:, n_ivs:], + "%theories[-1]%": lambda: data.theories[-1], + "%theories%": lambda: data.theories, + } + ) + return cycle_property_dict + + +class SimpleCycle: + """ + Runs an experimentalist, theorist and experiment runner in a loop. + + Once initialized, the `cycle` can be started using the `cycle.run` method + or by calling `next(cycle)`. + + The `.data` attribute is updated with the results. + + Attributes: + data (dataclass): an object which is updated during the cycle and has the following + properties: + + - `metadata` + - `conditions`: a list of np.ndarrays representing all of the IVs proposed by the + experimentalist + - `observations`: a list of np.ndarrays representing all of the IVs and DVs returned by + the experiment runner + - `theories`: a list of all the fitted theories (scikit-learn compatible estimators) + + params (dict): a nested dictionary with parameters for the cycle parts. + + `{ + "experimentalist": {}, + "theorist": {}, + "experiment_runner": {} + }` + + + Examples: + + ### Basic Usage + + Aim: Use the SimpleCycle to recover a simple ground truth theory from noisy data. + + >>> def ground_truth(x): + ... return x + 1 + + The space of allowed x values is the integers between 0 and 10 inclusive, + and we record the allowed output values as well. + >>> from autora.variable import VariableCollection, Variable + >>> metadata_0 = VariableCollection( + ... independent_variables=[Variable(name="x1", allowed_values=range(11))], + ... dependent_variables=[Variable(name="y", value_range=(-20, 20))], + ... ) + + The experimentalist is used to propose experiments. + Since the space of values is so restricted, we can just sample them all each time. + >>> from autora.experimentalist.pipeline import make_pipeline + >>> example_experimentalist = make_pipeline( + ... [metadata_0.independent_variables[0].allowed_values]) + + When we run a synthetic experiment, we get a reproducible noisy result: + >>> import numpy as np + >>> def get_example_synthetic_experiment_runner(): + ... rng = np.random.default_rng(seed=180) + ... def runner(x): + ... return ground_truth(x) + rng.normal(0, 0.1, x.shape) + ... 
return runner
+ >>> example_synthetic_experiment_runner = get_example_synthetic_experiment_runner()
+ >>> example_synthetic_experiment_runner(np.ndarray([1]))
+ array([2.04339546])
+
+ The theorist "tries" to work out the best theory.
+ We use a trivial scikit-learn regressor.
+ >>> from sklearn.linear_model import LinearRegression
+ >>> example_theorist = LinearRegression()
+
+ We initialize the SimpleCycle with the metadata describing the domain of the theory,
+ the theorist, experimentalist and experiment runner,
+ as well as a monitor which will let us know which cycle we're currently on.
+ >>> cycle = SimpleCycle(
+ ...     metadata=metadata_0,
+ ...     theorist=example_theorist,
+ ...     experimentalist=example_experimentalist,
+ ...     experiment_runner=example_synthetic_experiment_runner,
+ ...     monitor=lambda data: print(f"Generated {len(data.theories)} theories"),
+ ... )
+ >>> cycle # doctest: +ELLIPSIS
+ <simple.SimpleCycle object at 0x...>
+
+ We can run the cycle by calling the run method:
+ >>> cycle.run(num_cycles=3) # doctest: +ELLIPSIS
+ Generated 1 theories
+ Generated 2 theories
+ Generated 3 theories
+ <simple.SimpleCycle object at 0x...>
+
+ We can now interrogate the results. The first set of conditions which went into the
+ experiment runner were:
+ >>> cycle.data.conditions[0]
+ array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
+
+ The observations include the conditions and the results:
+ >>> cycle.data.observations[0]
+ array([[ 0. , 0.92675345],
+        [ 1. , 1.89519928],
+        [ 2. , 3.08746571],
+        [ 3. , 3.93023943],
+        [ 4. , 4.95429102],
+        [ 5. , 6.04763988],
+        [ 6. , 7.20770574],
+        [ 7. , 7.85681519],
+        [ 8. , 9.05735823],
+        [ 9. , 10.18713406],
+        [10. , 10.88517906]])
+
+ In the third cycle (index = 2) the first and last values are different again:
+ >>> cycle.data.observations[2][[0,-1]]
+ array([[ 0. , 1.08559827],
+        [10. , 11.08179553]])
+
+ The best fit theory after the first cycle is:
+ >>> cycle.data.theories[0]
+ LinearRegression()
+
+ >>> def report_linear_fit(m: LinearRegression, precision=4):
+ ...     s = f"y = {np.round(m.coef_[0].item(), precision)} x " \\
+ ...         f"+ {np.round(m.intercept_.item(), 4)}"
+ ...     return s
+ >>> report_linear_fit(cycle.data.theories[0])
+ 'y = 1.0089 x + 0.9589'
+
+ The best fit theory after all the cycles, including all the data, is:
+ >>> report_linear_fit(cycle.data.theories[-1])
+ 'y = 0.9989 x + 1.0292'
+
+ This is close to the ground truth theory of x -> (x + 1)
+
+ We can also run the cycle with more control over the execution flow:
+ >>> next(cycle) # doctest: +ELLIPSIS
+ Generated 4 theories
+ <simple.SimpleCycle object at 0x...>
+
+ >>> next(cycle) # doctest: +ELLIPSIS
+ Generated 5 theories
+ <simple.SimpleCycle object at 0x...>
+
+ >>> next(cycle) # doctest: +ELLIPSIS
+ Generated 6 theories
+ <simple.SimpleCycle object at 0x...>
+
+ We can continue to run the cycle as long as we like,
+ with a simple arbitrary stopping condition like the number of theories generated:
+ >>> from itertools import takewhile
+ >>> _ = list(takewhile(lambda c: len(c.data.theories) < 9, cycle))
+ Generated 7 theories
+ Generated 8 theories
+ Generated 9 theories
+
+ ... or the precision (here we keep iterating while the difference between the gradients
+ of the second-last and last cycle is larger than 1x10^-3).
+ >>> _ = list(
+ ...     takewhile(
+ ...         lambda c: np.abs(c.data.theories[-1].coef_.item() -
+ ...                          c.data.theories[-2].coef_.item()) > 1e-3,
+ ...         cycle
+ ...     )
+ ... )
+ Generated 10 theories
+ Generated 11 theories
+
+ ... or continue to run as long as we like:
+ >>> _ = cycle.run(num_cycles=100) # doctest: +ELLIPSIS
+ Generated 12 theories
+ ...
+ Generated 111 theories + + ### Passing Static Parameters + + It's easy to pass parameters to the cycle components, if there are any needed. + Here we have an experimentalist which takes a parameter: + >>> uniform_random_rng = np.random.default_rng(180) + >>> def uniform_random_sampler(n): + ... return uniform_random_rng.uniform(low=0, high=11, size=n) + >>> example_experimentalist_with_parameters = make_pipeline([uniform_random_sampler]) + + The cycle can handle that using the `params` keyword: + >>> cycle_with_parameters = SimpleCycle( + ... metadata=metadata_0, + ... theorist=example_theorist, + ... experimentalist=example_experimentalist_with_parameters, + ... experiment_runner=example_synthetic_experiment_runner, + ... params={"experimentalist": {"uniform_random_sampler": {"n": 7}}} + ... ) + >>> _ = cycle_with_parameters.run() + >>> cycle_with_parameters.data.conditions[-1].flatten() + array([6.33661987, 7.34916618, 6.08596494, 2.28566582, 1.9553974 , + 5.80023149, 3.27007909]) + + For the next cycle, if we wish, we can change the parameter value: + >>> cycle_with_parameters.params["experimentalist"]["uniform_random_sampler"]\\ + ... ["n"] = 2 + >>> _ = cycle_with_parameters.run() + >>> cycle_with_parameters.data.conditions[-1].flatten() + array([10.5838232 , 9.45666031]) + + ### Accessing "Cycle Properties" + + Some experimentalists, experiment runners and theorists require access to the values + created during the cycle execution, e.g. experimentalists which require access + to the current best theory or the observed data. These data update each cycle, and + so cannot easily be set using simple `params`. + + For this case, it is possible to use "cycle properties" in the `params` dictionary. These + are the following strings, which will be replaced during execution by their respective + current values: + + - `"%observations.ivs[-1]%"`: the last observed independent variables + - `"%observations.dvs[-1]%"`: the last observed dependent variables + - `"%observations.ivs%"`: all the observed independent variables, + concatenated into a single array + - `"%observations.dvs%"`: all the observed dependent variables, + concatenated into a single array + - `"%theories[-1]%"`: the last fitted theorist + - `"%theories%"`: all the fitted theorists + + In the following example, we use the `"observations.ivs"` cycle property for an + experimentalist which excludes those conditions which have + already been seen. + + >>> metadata_1 = VariableCollection( + ... independent_variables=[Variable(name="x1", allowed_values=range(10))], + ... dependent_variables=[Variable(name="y")], + ... ) + >>> random_sampler_rng = np.random.default_rng(seed=180) + >>> def custom_random_sampler(conditions, n): + ... sampled_conditions = random_sampler_rng.choice(conditions, size=n, replace=False) + ... return sampled_conditions + >>> def exclude_conditions(conditions, excluded_conditions): + ... remaining_conditions = list(set(conditions) - set(excluded_conditions.flatten())) + ... return remaining_conditions + >>> unobserved_data_experimentalist = make_pipeline([ + ... metadata_1.independent_variables[0].allowed_values, + ... exclude_conditions, + ... custom_random_sampler + ... ] + ... ) + >>> cycle_with_cycle_properties = SimpleCycle( + ... metadata=metadata_1, + ... theorist=example_theorist, + ... experimentalist=unobserved_data_experimentalist, + ... experiment_runner=example_synthetic_experiment_runner, + ... params={ + ... "experimentalist": { + ... 
"exclude_conditions": {"excluded_conditions": "%observations.ivs%"}, + ... "custom_random_sampler": {"n": 1} + ... } + ... } + ... ) + + Now we can run the cycler to generate conditions and run experiments. The first time round, + we have the full set of 10 possible conditions to select from, and we select "2" at random: + >>> _ = cycle_with_cycle_properties.run() + >>> cycle_with_cycle_properties.data.conditions[-1] + array([2]) + + We can continue to run the cycler, each time we add more to the list of "excluded" options: + >>> _ = cycle_with_cycle_properties.run(num_cycles=5) + >>> cycle_with_cycle_properties.data.conditions + [array([2]), array([6]), array([5]), array([7]), array([3]), array([4])] + + By using the monitor callback, we can investigate what's going on with the cycle properties: + >>> cycle_with_cycle_properties.monitor = lambda data: print( + ... _get_cycle_properties(data)["%observations.ivs%"].flatten() + ... ) + + The monitor evaluates at the end of each cycle + and shows that we've added a new observed IV each step + >>> _ = cycle_with_cycle_properties.run() + [2. 6. 5. 7. 3. 4. 9.] + >>> _ = cycle_with_cycle_properties.run() + [2. 6. 5. 7. 3. 4. 9. 0.] + + We deactivate the monitor by making it "None" again. + >>> cycle_with_cycle_properties.monitor = None + + We can continue until we've sampled all of the options: + >>> _ = cycle_with_cycle_properties.run(num_cycles=2) + >>> cycle_with_cycle_properties.data.conditions # doctest: +NORMALIZE_WHITESPACE + [array([2]), array([6]), array([5]), array([7]), array([3]), \ + array([4]), array([9]), array([0]), array([8]), array([1])] + + If we try to evaluate it again, the experimentalist fails, as there aren't any more + conditions which are available: + >>> cycle_with_cycle_properties.run() # doctest: +ELLIPSIS + Traceback (most recent call last): + ... + ValueError: a cannot be empty unless no samples are taken + + """ + + def __init__( + self, + metadata: VariableCollection, + theorist, + experimentalist, + experiment_runner, + monitor: Optional[Callable[[SimpleCycleData], None]] = None, + params: Optional[Dict] = None, + ): + """ + Args: + metadata: a description of the dependent and independent variables + theorist: a scikit-learn-compatible estimator + experimentalist: an autora.experimentalist.Pipeline + experiment_runner: a function to map independent variables onto observed dependent + variables + monitor: a function which gets read-only access to the `data` attribute at the end of + each cycle. + params: a nested dictionary with parameters to be passed to the parts of the cycle. + E.g. 
if the experimentalist had a step named "pool" which took an argument "n", + which you wanted to set to the value 30, then params would be set to this: + `{"experimentalist": {"pool": {"n": 30}}}` + """ + + self.theorist = theorist + self.experimentalist = experimentalist + self.experiment_runner = experiment_runner + self.monitor = monitor + if params is None: + params = dict() + self.params = params + + self.data = SimpleCycleData( + metadata=metadata, + conditions=[], + observations=[], + theories=[], + ) + + def run(self, num_cycles: int = 1): + for i in range(num_cycles): + next(self) + return self + + def __next__(self): + assert ( + "experiment_runner" not in self.params + ), "experiment_runner cannot yet accept cycle properties" + assert ( + "theorist" not in self.params + ), "theorist cannot yet accept cycle properties" + + data = self.data + params_with_cycle_properties = _resolve_cycle_properties( + self.params, _get_cycle_properties(self.data) + ) + + data = self._experimentalist_callback( + self.experimentalist, + data, + params_with_cycle_properties.get("experimentalist", dict()), + ) + data = self._experiment_runner_callback(self.experiment_runner, data) + data = self._theorist_callback(self.theorist, data) + self._monitor_callback(data) + self.data = data + + return self + + def __iter__(self): + return self + + @staticmethod + def _experimentalist_callback( + experimentalist: Pipeline, data_in: SimpleCycleData, params: dict + ): + new_conditions = experimentalist(**params) + if isinstance(new_conditions, Iterable): + # If the pipeline gives us an iterable, we need to make it into a concrete array. + # We can't move this logic to the Pipeline, because the pipeline doesn't know whether + # it's within another pipeline and whether it should convert the iterable to a + # concrete array. + new_conditions_values = list(new_conditions) + new_conditions_array = np.array(new_conditions_values) + else: + raise NotImplementedError(f"Object {new_conditions} can't be handled yet.") + + assert isinstance( + new_conditions_array, np.ndarray + ) # Check the object is bounded + data_out = replace( + data_in, + conditions=data_in.conditions + [new_conditions_array], + ) + return data_out + + @staticmethod + def _experiment_runner_callback( + experiment_runner: Callable, data_in: SimpleCycleData + ): + x = data_in.conditions[-1] + y = experiment_runner(x) + new_observations = np.column_stack([x, y]) + data_out = replace( + data_in, observations=data_in.observations + [new_observations] + ) + return data_out + + @staticmethod + def _theorist_callback(theorist, data_in: SimpleCycleData): + all_observations = np.row_stack(data_in.observations) + n_xs = len( + data_in.metadata.independent_variables + ) # The number of independent variables + x, y = all_observations[:, :n_xs], all_observations[:, n_xs:] + new_theorist = copy.copy(theorist) + new_theorist.fit(x, y) + data_out = replace( + data_in, + theories=data_in.theories + [new_theorist], + ) + return data_out + + def _monitor_callback(self, data: SimpleCycleData): + if self.monitor is not None: + self.monitor(data) + + +def _resolve_cycle_properties(params: Dict, cycle_properties: Mapping): + """ + Resolve "cycle properties" inside a nested dictionary. + + In this context, a "cycle property" is a string which is meant to be replaced by a + different value before the dictionary is used. 
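+    For instance, the value "%theories[-1]%" stands in for the most recently
+    fitted theory. A minimal sketch of the substitution (hypothetical parameter
+    names, for illustration only):
+
+    >>> _resolve_cycle_properties(
+    ...     {"sampler": {"model": "%theories[-1]%"}},
+    ...     {"%theories[-1]%": "a_fitted_theory"},
+    ... )
+    {'sampler': {'model': 'a_fitted_theory'}}
+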
+ + Args: + params: a (nested) dictionary of keys and values, where some values might be + "cycle property names" + cycle_properties: a dictionary of "cycle property names" and their "real values" + + Returns: a (nested) dictionary where "cycle property names" are replaced by the "real values" + + Examples: + + >>> params_0 = {"key": "%foo%"} + >>> cycle_properties_0 = {"%foo%": 180} + >>> _resolve_cycle_properties(params_0, cycle_properties_0) + {'key': 180} + + >>> params_1 = {"key": "%bar%", "nested_dict": {"inner_key": "%foobar%"}} + >>> cycle_properties_1 = {"%bar%": 1, "%foobar%": 2} + >>> _resolve_cycle_properties(params_1, cycle_properties_1) + {'key': 1, 'nested_dict': {'inner_key': 2}} + + """ + params_ = copy.copy(params) + for key, value in params_.items(): + if isinstance(value, dict): + params_[key] = _resolve_cycle_properties(value, cycle_properties) + elif ( + isinstance(value, str) and value in cycle_properties + ): # value is a key in the cycle_properties dictionary + params_[key] = cycle_properties[value] + else: + pass # no change needed + + return params_ diff --git a/autora/experimentalist/pooler/__init__.py b/autora/experimentalist/pooler/__init__.py new file mode 100644 index 000000000..54d836a1d --- /dev/null +++ b/autora/experimentalist/pooler/__init__.py @@ -0,0 +1,2 @@ +from .general_pool import grid_pool, random_pool +from .poppernet import poppernet_pool diff --git a/autora/experimentalist/pool.py b/autora/experimentalist/pooler/general_pool.py similarity index 100% rename from autora/experimentalist/pool.py rename to autora/experimentalist/pooler/general_pool.py diff --git a/autora/experimentalist/sampler/poppernet.py b/autora/experimentalist/pooler/poppernet.py similarity index 78% rename from autora/experimentalist/sampler/poppernet.py rename to autora/experimentalist/pooler/poppernet.py index 9508deaad..2995405ce 100644 --- a/autora/experimentalist/sampler/poppernet.py +++ b/autora/experimentalist/pooler/poppernet.py @@ -1,26 +1,28 @@ -from typing import Iterable, Optional, Tuple, cast +from typing import Optional, Tuple, cast import numpy as np import torch +from sklearn.preprocessing import StandardScaler from torch import nn from torch.autograd import Variable from autora.variable import ValueType, VariableCollection -def poppernet_pooler( +def poppernet_pool( model, x_train: np.ndarray, y_train: np.ndarray, metadata: VariableCollection, - num_samples: int = 100, + n: int = 100, training_epochs: int = 1000, optimization_epochs: int = 1000, training_lr: float = 1e-3, optimization_lr: float = 1e-3, mse_scale: float = 1, - limit_offset: float = 10**-10, + limit_offset: float = 0, # 10**-10, limit_repulsion: float = 0, + plot: bool = False, ): """ A pooler that generates samples for independent variables with the objective of maximizing the @@ -36,7 +38,7 @@ def poppernet_pooler( x_train: data that the model was trained on y_train: labels that the model was trained on metadata: Meta-data about the dependent and independent variables - num_samples: number of samples to return + n: number of samples to return training_epochs: number of epochs to train the popper network for approximating the error fo the model optimization_epochs: number of epochs to optimize the samples based on the trained @@ -48,7 +50,7 @@ def poppernet_pooler( boundaries limit_repulsion: a limited repulsion to prevent the samples from being too close to the allowed value boundaries - verbose: print out the prediction of the popper network as well as its training loss + plot: print out the 
prediction of the popper network as well as its training loss Returns: Sampled pool @@ -60,7 +62,7 @@ def poppernet_pooler( if len(x_train.shape) == 1: x_train = x_train.reshape(-1, 1) - x = np.empty([num_samples, x_train.shape[1]]) + x = np.empty([n, x_train.shape[1]]) y_train = np.array(y_train) if len(y_train.shape) == 1: @@ -99,10 +101,22 @@ def poppernet_pooler( raise Exception("Model must have `predict` or `predict_proba` method.") model_prediction = model_predict(x_train) + if isinstance(model_prediction, np.ndarray) is False: + try: + model_prediction = np.array(model_prediction) + except Exception: + raise Exception("Model prediction must be convertable to numpy array.") + if model_prediction.ndim == 1: + model_prediction = model_prediction.reshape(-1, 1) criterion = nn.MSELoss() model_loss = (model_prediction - y_train) ** 2 * mse_scale model_loss = np.mean(model_loss, axis=1) + + # standardize the loss + scaler = StandardScaler() + model_loss = scaler.fit_transform(model_loss.reshape(-1, 1)).flatten() + model_loss = torch.from_numpy(model_loss).float() popper_target = Variable(model_loss, requires_grad=False) @@ -127,6 +141,24 @@ def poppernet_pooler( popper_optimizer.step() losses.append(loss.item()) + if plot: + popper_input_full = np.linspace( + iv_limit_list[0][0], iv_limit_list[0][1], 1000 + ).reshape(-1, 1) + popper_input_full = Variable( + torch.from_numpy(popper_input_full), requires_grad=False + ).float() + popper_prediction = popper_net(popper_input_full) + plot_popper_diagnostics( + losses, + popper_input, + popper_input_full, + popper_prediction, + popper_target, + model_prediction, + y_train, + ) + # now that the popper network is trained we can sample new data points # to sample data points we need to provide the popper network with an initial condition # we will sample those initial conditions proportional to the loss of the current model @@ -140,7 +172,7 @@ def poppernet_pooler( popper_net.freeze_weights() - for condition in range(num_samples): + for condition in range(n): index = transform_category.sample() input_sample = torch.flatten(x_train_tensor[index, :]) @@ -165,10 +197,13 @@ def poppernet_pooler( # first add repulsion from variable limits for idx in range(len(input_sample)): - iv_value = input_sample[idx] + iv_value = popper_input[idx] iv_limits = iv_limit_list[idx] dist_to_min = np.abs(iv_value - np.min(iv_limits)) dist_to_max = np.abs(iv_value - np.max(iv_limits)) + # deal with boundary case where distance is 0 or very small + dist_to_min = np.max([dist_to_min, 0.00000001]) + dist_to_max = np.max([dist_to_max, 0.00000001]) repulsion_from_min = limit_repulsion / (dist_to_min**2) repulsion_from_max = limit_repulsion / (dist_to_max**2) iv_value_repulsed = ( @@ -179,7 +214,6 @@ def poppernet_pooler( # now add gradient for theory loss maximization delta = -optimization_lr * popper_input.grad popper_input += delta - popper_input.grad.zero_() # finally, clip input variable from its limits for idx in range(len(input_sample)): @@ -195,6 +229,7 @@ def poppernet_pooler( ] ) popper_input[idx] = iv_clipped_value + popper_input.grad.zero_() # add condition to new experiment sequence for idx in range(len(input_sample)): @@ -210,69 +245,63 @@ def poppernet_pooler( x[condition, idx] = iv_clipped_scaled_value - return x + return iter(x) -def plot_popper_diagnostics(losses, popper_input, popper_prediction, popper_target): +def plot_popper_diagnostics( + losses, + popper_input, + popper_input_full, + popper_prediction, + popper_target, + model_prediction, + target, +): 
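+    """Plot diagnostics for a trained popper network: the network's predicted
+    model error against its training target, the training-loss convergence
+    curve, and the underlying model's prediction against the observed data."""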
print("Finished training Popper Network...") import matplotlib.pyplot as plt if popper_input.shape[1] > 1: plot_input = popper_input[:, 0] - plt.scatter(plot_input, popper_target.detach().numpy(), label="target") - plt.scatter(plot_input, popper_prediction.detach().numpy(), label="prediction") else: plot_input = popper_input - plt.plot(plot_input, popper_target.detach().numpy(), label="target") - plt.plot(plot_input, popper_prediction.detach().numpy(), label="prediction") + + if model_prediction.ndim > 1: + if model_prediction.shape[1] > 1: + model_prediction = model_prediction[:, 0] + target = target[:, 0] + + # PREDICTED MODEL ERROR PLOT + plot_input_order = np.argsort(np.array(plot_input).flatten()) + plot_input = plot_input[plot_input_order] + popper_target = popper_target[plot_input_order] + # popper_prediction = popper_prediction[plot_input_order] + plt.plot(popper_input_full, popper_prediction.detach().numpy(), label="prediction") + plt.scatter( + plot_input, popper_target.detach().numpy(), s=20, c="red", label="target" + ) plt.xlabel("x") - plt.ylabel("y") + plt.ylabel("model MSE") + plt.title("popper network prediction") plt.legend() plt.show() + + # CONVERGENCE PLOT plt.plot(losses) plt.xlabel("epoch") plt.ylabel("loss") + plt.title("loss for popper network") plt.show() - -def nearest_values_sampler( - samples, - allowed_values, -): - """ - A sampler which returns the nearest values between the input samples and the allowed values, - without replacement. - - Args: - samples: input conditions - allowed_samples: allowed conditions to sample from - - Returns: - the nearest values from `allowed_samples` to the `samples` - - """ - - if isinstance(allowed_values, Iterable): - allowed_values = np.array(list(allowed_values)) - - if len(allowed_values.shape) == 1: - allowed_values = allowed_values.reshape(-1, 1) - - num_samples = samples.shape[0] - - if allowed_values.shape[0] <= num_samples: - raise Exception("More samples requested than samples available in the pool x.") - - x_new = np.empty((num_samples, allowed_values.shape[1])) - - # get index of row in x that is closest to each sample - for row, sample in enumerate(samples): - dist = np.linalg.norm(allowed_values - sample, axis=1) - idx = np.argmin(dist) - x_new[row, :] = allowed_values[idx, :] - allowed_values = np.delete(allowed_values, idx, axis=0) - - return x_new + # MODEL PREDICTION PLOT + model_prediction = model_prediction[plot_input_order] + target = target[plot_input_order] + plt.plot(plot_input, model_prediction, label="model prediction") + plt.scatter(plot_input, target, s=20, c="red", label="target") + plt.xlabel("x") + plt.ylabel("y") + plt.title("model prediction vs. 
target") + plt.legend() + plt.show() # define the network diff --git a/autora/experimentalist/sampler/__init__.py b/autora/experimentalist/sampler/__init__.py index b7c574623..d2c337081 100644 --- a/autora/experimentalist/sampler/__init__.py +++ b/autora/experimentalist/sampler/__init__.py @@ -1,2 +1,4 @@ +from .model_disagreement import model_disagreement_sampler +from .nearest_value import nearest_values_sampler from .random import random_sampler from .uncertainty import uncertainty_sampler diff --git a/autora/experimentalist/sampler/dissimilarity.py b/autora/experimentalist/sampler/dissimilarity.py new file mode 100644 index 000000000..8b8b112ac --- /dev/null +++ b/autora/experimentalist/sampler/dissimilarity.py @@ -0,0 +1,96 @@ +from typing import Iterable, Literal + +import numpy as np +from sklearn.metrics import DistanceMetric + +AllowedMetrics = Literal[ + "euclidean", + "manhattan", + "chebyshev", + "minkowski", + "wminkowski", + "seuclidean", + "mahalanobis", + "haversine", + "hamming", + "canberra", + "braycurtis", + "matching", + "jaccard", + "dice", + "kulsinski", + "rogerstanimoto", + "russellrao", + "sokalmichener", + "sokalsneath", + "yule", +] + + +def summed_dissimilarity_sampler( + X: np.ndarray, X_ref: np.ndarray, n: int = 1, metric: AllowedMetrics = "euclidean" +) -> np.ndarray: + """ + This dissimilarity samples re-arranges the pool of IV conditions according to their + dissimilarity with respect to a reference pool X_ref. The default dissimilarity is calculated + as the average of the pairwise distances between the conditions in X and X_ref. + + Args: + X: pool of IV conditions to evaluate dissimilarity + X_ref: reference pool of IV conditions + n: number of samples to select + metric (str): dissimilarity measure. Options: 'euclidean', 'manhattan', 'chebyshev', + 'minkowski', 'wminkowski', 'seuclidean', 'mahalanobis', 'haversine', + 'hamming', 'canberra', 'braycurtis', 'matching', 'jaccard', 'dice', + 'kulsinski', 'rogerstanimoto', 'russellrao', 'sokalmichener', + 'sokalsneath', 'yule'. See [sklearn.metrics.DistanceMetric][] for more details. + + Returns: + Sampled pool + """ + + if isinstance(X, Iterable): + X = np.array(list(X)) + + if isinstance(X_ref, Iterable): + X_ref = np.array(list(X_ref)) + + if X.ndim == 1: + X = X.reshape(-1, 1) + + if X_ref.ndim == 1: + X_ref = X_ref.reshape(-1, 1) + + if X.shape[1] != X_ref.shape[1]: + raise ValueError( + f"X and X_ref must have the same number of columns.\n" + f"X has {X.shape[1]} columns, while X_ref has {X_ref.shape[1]} columns." + ) + + if X.shape[0] < n: + raise ValueError( + f"X must have at least {n} rows matching the number of requested samples." 
+        )
+
+    dist = DistanceMetric.get_metric(metric)
+
+    # collect, for each row of X, its summed distance to all rows of X_ref
+    summed_distances = []
+
+    # loop over each row in X
+    for row in X:
+        # sum the distances between the current row and every row in X_ref
+        summed_distance = 0
+
+        for X_ref_row in X_ref:
+            distance = dist.pairwise([row, X_ref_row])[0, 1]
+            summed_distance += distance
+
+        # store the summed distance for the current row
+        summed_distances.append(summed_distance)
+
+    # sort the rows of X from most to least dissimilar to X_ref
+    sorted_X = X[np.argsort(summed_distances)[::-1]]
+
+    return sorted_X[:n]
diff --git a/autora/experimentalist/sampler/nearest_value.py b/autora/experimentalist/sampler/nearest_value.py
new file mode 100644
index 000000000..61f2713d7
--- /dev/null
+++ b/autora/experimentalist/sampler/nearest_value.py
@@ -0,0 +1,60 @@
+from typing import Iterable, Sequence, Union
+
+import numpy as np
+
+
+def nearest_values_sampler(
+    samples: Union[Iterable, Sequence],
+    allowed_values: np.ndarray,
+    n: int,
+):
+    """
+    A sampler which returns the nearest values between the input samples and the allowed values,
+    without replacement.
+
+    Args:
+        samples: input conditions
+        allowed_values: allowed conditions to sample from
+        n: number of samples to return
+
+    Returns:
+        the nearest values from `allowed_values` to the `samples`
+
+    """
+
+    if isinstance(allowed_values, Iterable):
+        allowed_values = np.array(list(allowed_values))
+
+    if len(allowed_values.shape) == 1:
+        allowed_values = allowed_values.reshape(-1, 1)
+
+    if isinstance(samples, Iterable) or isinstance(samples, Sequence):
+        samples = np.array(list(samples))
+
+    if allowed_values.shape[0] < n:
+        raise Exception(
+            "More samples requested than samples available in the set of allowed values."
+        )
+
+    if hasattr(samples, "shape"):
+        if samples.shape[0] < n:
+            raise Exception(
+                "More samples requested than samples available in the pool."
+            )
+
+    x_new = np.empty((n, allowed_values.shape[1]))
+
+    # get the index of the row in allowed_values that is closest to each sample
+    for row, sample in enumerate(samples):
+
+        if row >= n:
+            break
+
+        dist = np.linalg.norm(allowed_values - sample, axis=1)
+        idx = np.argmin(dist)
+        x_new[row, :] = allowed_values[idx, :]
+        allowed_values = np.delete(allowed_values, idx, axis=0)
+
+    return x_new
diff --git a/autora/theorist/bms/mcmc.py b/autora/theorist/bms/mcmc.py
index 1e80aea2b..3a12dde1e 100644
--- a/autora/theorist/bms/mcmc.py
+++ b/autora/theorist/bms/mcmc.py
@@ -1302,6 +1302,12 @@ def predict(self, x):
         Returns: predicted y values
         """
+        if isinstance(x, np.ndarray):
+            columns = list()
+            for col in range(x.shape[1]):
+                columns.append("X" + str(col))
+            x = pd.DataFrame(x, columns=columns)
+
         if isinstance(x, pd.DataFrame):
             this_x = {"d0": x}
             input_type = "df"
diff --git a/autora/utils/__init__.py b/autora/utils/__init__.py
new file mode 100644
index 000000000..6fda29471
--- /dev/null
+++ b/autora/utils/__init__.py
@@ -0,0 +1 @@
+from . 
import dictionary diff --git a/autora/utils/dictionary.py b/autora/utils/dictionary.py new file mode 100644 index 000000000..b45b5b927 --- /dev/null +++ b/autora/utils/dictionary.py @@ -0,0 +1,18 @@ +from typing import Mapping + + +class LazyDict(Mapping): + """Inspired by https://gist.github.com/gyli/9b50bb8537069b4e154fec41a4b5995a""" + + def __init__(self, *args, **kw): + self._raw_dict = dict(*args, **kw) + + def __getitem__(self, key): + func = self._raw_dict.__getitem__(key) + return func() + + def __iter__(self): + return iter(self._raw_dict) + + def __len__(self): + return len(self._raw_dict) diff --git a/autora/variable/__init__.py b/autora/variable/__init__.py index 00f0c8039..4cdd8ff99 100644 --- a/autora/variable/__init__.py +++ b/autora/variable/__init__.py @@ -21,7 +21,7 @@ class Variable: """Describes an experimental variable: name, type, range, units, and value of a variable.""" name: str = "" - value_range: Optional[Tuple[Any, Any]] = (0, 1) + value_range: Optional[Tuple[Any, Any]] = None allowed_values: Optional[Sequence] = None units: str = "" type: ValueType = ValueType.REAL diff --git a/docs/theorist/bms/example.md b/docs/theorist/bms/example.md index 4cbabf99d..c891f9c96 100644 --- a/docs/theorist/bms/example.md +++ b/docs/theorist/bms/example.md @@ -1,4 +1,4 @@ -# +# Bayesian Machine Scientist ## Example @@ -21,21 +21,27 @@ Now let us choose a prior over the primitives. In this case, we will use priors prior = "Guimera2020" ``` -## Set up the BMS Regresssor +## Set up the BMS Regressor -We will use the BMS Regresssor to predict the outcomes. There are a number of parameters that determine how the architecture search is performed. The most important ones are listed below: +We will use the BMS Regressor to predict the outcomes. There are a number of parameters that determine how the architecture search is performed. The most important ones are listed below: - **`epochs`**: The number of epochs to run BMS. This corresponds to the total number of equation mutations - one mcmc step for each parallel-tempered equation and one tree swap between a pair of parallel-tempered equations. - **`prior_par`**: A dictionary of priors for each operation. The keys correspond to operations and the respective values correspond to prior probabilities of those operations. The model comes with a default. - **`ts`**: A list of temperature values. The machine scientist creates an equation tree for each of these values. Higher temperature trees are harder to fit, and thus they help prevent overfitting of the model. -Let's set up the BMS regressor with default parameters. +Let's use the same priors over primitives that we specified on the previous page as well as an illustrative set of temperatures to set up the BMS regressor with default parameters. ```python from autora.skl.bms import BMSRegressor -bms_estimator = BMSRegressor() +temperatures = [1.0] + [1.04**k for k in range(1, 20)] + +bms_estimator = BMSRegressor( + epochs=1500, + prior_par=primitives, + ts=temperatures, +) ``` Now we have everything to fit and verify the model. diff --git a/docs/theorist/bms/how_it_works.md b/docs/theorist/bms/how_it_works.md index 74bab64a0..40ab6a6d5 100644 --- a/docs/theorist/bms/how_it_works.md +++ b/docs/theorist/bms/how_it_works.md @@ -46,7 +46,7 @@ Bayesian inference via MCMC is then applied to navigate the search space efficie The search space is very rugged, and local minima are difficult to escape, so BMS employs parallel tempering to overcome this. 
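To make the swap step concrete, here is a minimal sketch of the Metropolis rule for exchanging two parallel-tempered equation trees (an illustration only, not the BMS implementation; `energy` and `T` are assumed attribute names):

```python
import numpy as np

def maybe_swap_temperatures(tree_a, tree_b, rng=np.random.default_rng()):
    # Accept the exchange with probability min(1, exp(dE * d(1/T))):
    # hot chains roam the rugged space while cold chains refine good candidates.
    d_energy = tree_a.energy - tree_b.energy
    d_beta = 1.0 / tree_a.T - 1.0 / tree_b.T
    if rng.random() < np.exp(min(0.0, d_energy * d_beta)):
        tree_a.T, tree_b.T = tree_b.T, tree_a.T  # swap temperatures, keep the trees
```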
-![Comptuation Graph](img/BMSTempering.png) +![Parallel_Tempering](img/BMSTempering.png) One incremental unit of search in this approach involves two steps: @@ -57,7 +57,7 @@ I) Markov chain Monte Carlo Sampling: c) Choosing a specific variable or parameter value is random. d) Accepting or rejecting the mutation depends on Metropolis' rule. -![Comptuation Graph](img/BMSEquationTreeOps.png) +![Tree_Mutations](img/BMSEquationTreeOps.png) II) Parallel Tree Swap: diff --git a/docs/theorist/bms/search_space.md b/docs/theorist/bms/search_space.md index 0d12f2d17..524b1b528 100644 --- a/docs/theorist/bms/search_space.md +++ b/docs/theorist/bms/search_space.md @@ -46,6 +46,6 @@ We can then pass these primitives directly to the BMS regressor as follows: from autora.skl.bms import BMSRegressor bms_estimator = BMSRegressor( - prior=primitives + prior_par=primitives ) ``` diff --git a/docs/theorist/darts/example.md b/docs/theorist/darts/example.md index 8f11b1791..60546967c 100644 --- a/docs/theorist/darts/example.md +++ b/docs/theorist/darts/example.md @@ -29,15 +29,15 @@ primitives = [ ] ``` -## Set up the DARTS Regresssor +## Set up the DARTS Regressor -We will use the DARTS Regresssor to predict the outcomes. There are a number of parameters that determine how the architecture search is performed. The most important ones are listed below: +We will use the DARTS Regressor to predict the outcomes. There are a number of parameters that determine how the architecture search is performed. The most important ones are listed below: - **num_graph_nodes**: The number of latent variables used to represent the model. - **arch_updates_per_epoch**: The number of architecture updates per training epoch. These updates affect the architecture weights $\alpha$ indicating the relative contribution of each operation for a given computation step. -- **arch_learning_rate_max**: The initial learning rate of the architecture weight optimizier. +- **arch_learning_rate_max**: The initial learning rate of the architecture weight optimizer. - **param_updates_per_epoch**: The number of parameter updates per epoch. Once the architecture updates are complete, the parameters associated with each operation are updated. -- **param_momentum**: The momentum of the parameter optimizier. +- **param_momentum**: The momentum of the parameter optimizer. - **max_epochs**: The maximum number of epochs to run DARTS. - **output_type**: The type of output to produce. In our case, we treat the outcome as a real variable, i.e., "real". diff --git a/docs/theorist/darts/how_it_works.md b/docs/theorist/darts/how_it_works.md index 7f548e886..b703f69fe 100644 --- a/docs/theorist/darts/how_it_works.md +++ b/docs/theorist/darts/how_it_works.md @@ -4,7 +4,7 @@ Regular DARTS treats the architecture of a neural network as a directed acyclic computation graph (DAG), containing $N$ nodes in sequential order. -![Comptuation Graph](img/darts_computation_graph.jpg) +![Computation Graph](img/darts_computation_graph.jpg) Each node $x_i$ corresponds to a latent representation of the input space. Each directed edge $e_{i, j}$ is associated with some operation $o_{i,j}$ that transforms the representation of the preceding node $i$, and feeds it to node $j$. 
Each intermediate node is computed by integrating over its transformed predecessors: diff --git a/docs/theorist/darts/search_space.md b/docs/theorist/darts/search_space.md index 8ded8b94e..2cdf9af47 100644 --- a/docs/theorist/darts/search_space.md +++ b/docs/theorist/darts/search_space.md @@ -28,7 +28,7 @@ Some of the primitives above may also be preceded by a linear transformation, al Note that the following functions are available but currently not identifiable by DARTS (please use the following functions with caution): - **reciprocal**: The output of the computation $x_j$ is the multiplicative inverse of its input $x_i$: $x_j = \frac{1}{x_i}$. -- **ln**: The output of the computation $x_j$ is the natural logaritm of its input $x_i$: $x_j = \ln(x_i)$. +- **ln**: The output of the computation $x_j$ is the natural logarithm of its input $x_i$: $x_j = \ln(x_i)$. - **softplus**: The output of the computation $x_j$ is a softplus function of its input $x_i$: $x_j = \log(1 + \exp(a * x_i)) / a$. - **softminus**: The output of the computation $x_j$ is a softminus function of its input $x_i$: $x_j = x_j - \log(1 + \exp(a * x_i)) / a$. diff --git a/example/cycle/simple_cycle_bms_model_poppernet.py b/example/cycle/simple_cycle_bms_model_poppernet.py new file mode 100644 index 000000000..a0027625d --- /dev/null +++ b/example/cycle/simple_cycle_bms_model_poppernet.py @@ -0,0 +1,134 @@ +import matplotlib.pyplot as plt +import numpy as np + +from autora.cycle import Cycle +from autora.experimentalist.pipeline import Pipeline +from autora.experimentalist.pooler import grid_pool, poppernet_pool +from autora.experimentalist.sampler import nearest_values_sampler +from autora.skl.bms import BMSRegressor +from autora.variable import Variable, VariableCollection + +# meta parameters +ground_truth_resolution = 1000 +samples_per_cycle = 7 +value_range = (-1, 5) +allowed_values = np.linspace(value_range[0], value_range[1], ground_truth_resolution) + + +# define ground truth +def ground_truth(xs): + # return (xs ** 2.) + xs + 1. 
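+    # piecewise-linear ground truth: y = x for x >= 0 and y = 0 otherwise (a ReLU)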
+ y = xs * 1.0 + y[xs < 0] = 0 + return y + + +# define variables +study_metadata = VariableCollection( + independent_variables=[ + Variable(name="x1", allowed_values=allowed_values, value_range=value_range) + ], + dependent_variables=[Variable(name="y", value_range=(-20, 20))], +) + + +# define experiment platform +def get_synthetic_experiment_runner(): + rng = np.random.default_rng(seed=180) + + def runner(xs): + return ground_truth(xs) + rng.normal(0, 0.5, xs.shape) + + return runner + + +synthetic_experiment_runner = get_synthetic_experiment_runner() + +# Initialize the experimentalist +random_experimentalist = Pipeline( + [ + ("grid_pool", grid_pool), # type: ignore + ("nearest_values_sampler", nearest_values_sampler), # type: ignore + ], + { + "grid_pool": {"ivs": study_metadata.independent_variables}, + "nearest_values_sampler": { + "allowed_values": np.linspace( + value_range[0], value_range[1], samples_per_cycle + ), + "n": samples_per_cycle, + }, + }, +) + +# define theorist +bms_theorist = BMSRegressor(epochs=100) + +# define seed cycle +# we will use this cycle to collect initial data and initialize the BMS model +seed_cycle = Cycle( + metadata=study_metadata, + theorist=bms_theorist, + experimentalist=random_experimentalist, + experiment_runner=synthetic_experiment_runner, +) + +# run seed cycle +seed_cycle.run(num_cycles=1) + +seed_model = seed_cycle.data.theories[0].model_ +seed_x = seed_cycle.data.conditions[0] +seed_y = seed_cycle.data.observations[0][:, 1] + + +# now we define the poppernet experimentalist which takes into account +# the seed data and the seed model +popper_experimentalist = Pipeline( + [ + ("popper_pool", poppernet_pool), # type: ignore + ("nearest_values_sampler", nearest_values_sampler), # type: ignore + ], + { + "popper_pool": { + "metadata": study_metadata, + "model": seed_model, + "x_train": seed_x, + "y_train": seed_y, + "n": samples_per_cycle, + "plot": True, + }, + "nearest_values_sampler": { + "allowed_values": allowed_values, + "n": samples_per_cycle, + }, + }, +) + +# running a new cycle taking into account the seed data and model +# TODO: need to find a way to incorporate the seed data into the cycle +cycle = Cycle( + metadata=study_metadata, + theorist=bms_theorist, + experimentalist=popper_experimentalist, + experiment_runner=synthetic_experiment_runner, +) +cycle.run(num_cycles=1) + +# plot output of architecture search +all_obs = np.row_stack(seed_cycle.data.observations) +x_obs, y_obs = all_obs[:, 0], all_obs[:, 1] +plt.scatter(x_obs, y_obs, s=10, label="seed data") + +all_obs = np.row_stack(cycle.data.observations) +x_obs, y_obs = all_obs[:, 0], all_obs[:, 1] +plt.scatter(x_obs, y_obs, s=10, label="collected data") + +x_pred = np.array(study_metadata.independent_variables[0].allowed_values).reshape( + ground_truth_resolution, 1 +) +y_pred_seed = seed_cycle.data.theories[0].predict(x_pred) +y_pred_final = cycle.data.theories[0].predict(x_pred) +plt.plot(x_pred, y_pred_seed, color="blue", label="seed model") +plt.plot(x_pred, y_pred_final, color="red", label="final model") +plt.legend() +plt.show() diff --git a/example/cycle/simple_cycle_uncertainty_experimentalist.ipynb b/example/cycle/simple_cycle_uncertainty_experimentalist.ipynb new file mode 100644 index 000000000..e324c9425 --- /dev/null +++ b/example/cycle/simple_cycle_uncertainty_experimentalist.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Simple Cycle Examples with Uncertainty vs. 
Random Experimentalist\n",
+ "The aim of this example notebook is to use the AutoRA `Cycle` to recover a ground truth theory from some noisy data using BMS.\n",
+ "It compares the default \"random\" experimentalist with the \"Popper\" (poppernet) sampler."
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "from sklearn.dummy import DummyRegressor\n",
+ "\n",
+ "from autora.cycle import Cycle\n",
+ "from autora.experimentalist.sampler import random_sampler, nearest_values_sampler\n",
+ "from autora.experimentalist.pooler import poppernet_pool\n",
+ "from autora.experimentalist.pipeline import make_pipeline\n",
+ "from autora.variable import VariableCollection, Variable"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "outputs": [],
+ "source": [
+ "def ground_truth(xs):\n",
+ "    oscillating_component = np.sin((4. * xs) - 3.)\n",
+ "    parabolic_component = (-0.1 * xs ** 2.) + (2.5 * xs) + 1.\n",
+ "    ys = oscillating_component + parabolic_component\n",
+ "    return ys"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "The space of allowed x values is reals between -10 and 10 inclusive. We discretize them as we don't currently have a sampler which can sample from the uniform distribution."
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "outputs": [],
+ "source": [
+ "study_metadata = VariableCollection(\n",
+ "    independent_variables=[Variable(name=\"x1\", allowed_values=np.linspace(-10, 10, 500))],\n",
+ "    dependent_variables=[Variable(name=\"y\")],\n",
+ "    )"
+ ],
+ "metadata": {
+ "collapsed": false
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "So that we can compare the effectiveness of the two strategies, we fix the number of observations per cycle to be 100."
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "observations_per_cycle = 100" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "When we run a synthetic experiment, we get a reproducible noisy result:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def get_example_synthetic_experiment_runner():\n", + " rng = np.random.default_rng(seed=180)\n", + " def runner(xs):\n", + " return ground_truth(xs) + rng.normal(0, 1.0, xs.shape)\n", + " return runner\n", + "\n", + "example_synthetic_experiment_runner = get_example_synthetic_experiment_runner()\n", + "x = np.array([1.])\n", + "example_synthetic_experiment_runner(x)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "plt.scatter(study_metadata.independent_variables[0].allowed_values[::5,], example_synthetic_experiment_runner(study_metadata.independent_variables[0].allowed_values[::5,]), alpha=1, s=0.1, c='r', label=\"samples\")\n", + "plt.plot(study_metadata.independent_variables[0].allowed_values, ground_truth(study_metadata.independent_variables[0].allowed_values), c=\"black\", label=\"ground truth\")\n", + "plt.legend()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "We use a common BMS regressor with a common parametrization to test the two methods." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from autora.skl.bms import BMSRegressor\n", + "bms_theorist = BMSRegressor(epochs=100)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "We also define a helper function to plot the results" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "def run_and_plot_cycle(cycle, study_metadata):\n", + " cycle.run(num_cycles=1)\n", + "\n", + " all_obs = np.row_stack(cycle.data.observations)\n", + " x_obs, y_obs = all_obs[:,0], all_obs[:,1]\n", + " x_obs_new, y_obs_new = cycle.data.observations[-1][:,0], cycle.data.observations[-1][:,1]\n", + "\n", + " x_pred = np.array(study_metadata.independent_variables[0].allowed_values).reshape(-1, 1)\n", + " y_pred = cycle.data.theories[-1].predict(x_pred)\n", + "\n", + " plt.plot(study_metadata.independent_variables[0].allowed_values, ground_truth(study_metadata.independent_variables[0].allowed_values), c=\"black\", label=\"ground truth\")\n", + " plt.scatter(x_obs, y_obs, s=1, c='r', label=\"samples\")\n", + " plt.scatter(x_obs_new, y_obs_new, s=1, c='green', facecolors=\"none\", label=\"new samples\")\n", + " plt.plot(x_pred, y_pred, c=\"blue\", label=\"theorist result\")\n", + "\n", + " plt.legend()\n", + "\n", + " plt.show()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Random Sampler" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "random_experimentalist = make_pipeline(\n", + " [study_metadata.independent_variables[0].allowed_values, random_sampler],\n", + " params={\"random_sampler\": {\"n\": observations_per_cycle}}\n", + ")\n", + 
"random_experimentalist_cycle = Cycle(\n", + " metadata=study_metadata,\n", + " theorist=bms_theorist,\n", + " experimentalist=random_experimentalist,\n", + " experiment_runner=example_synthetic_experiment_runner\n", + ")\n", + "\n", + "for _ in range(10):\n", + " run_and_plot_cycle(cycle=random_experimentalist_cycle, study_metadata=study_metadata)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "## Popper Sampler" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "poppernet_experimentalist = make_pipeline(\n", + " [poppernet_pooler, nearest_values_sampler],\n", + ")\n", + "\n", + "poppernet_experimentalist_cycle = Cycle(\n", + " metadata=study_metadata,\n", + " theorist=bms_theorist,\n", + " experimentalist=poppernet_experimentalist,\n", + " experiment_runner=example_synthetic_experiment_runner,\n", + " params={\"experimentalist\" : {\n", + " \"poppernet_pooler\": {\n", + " \"model\": \"%theories[-1]%\",\n", + " \"x_train\": \"%observations.ivs%\",\n", + " \"y_train\": \"%observations.dvs%\",\n", + " \"metadata\": study_metadata,\n", + " \"num_samples\": observations_per_cycle,\n", + " },\n", + " \"nearest_values_sampler\": {\n", + " \"allowed_values\": study_metadata.independent_variables[0].allowed_values\n", + " }\n", + " }\n", + " }\n", + " )" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "The Popper sampler depends on having a first guess for the theory, so we add an appropriate model and an initial datapoint to the cycle's data:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "x_seed = np.linspace(-10, 10, 20)\n", + "y_seed = example_synthetic_experiment_runner(x_seed)\n", + "\n", + "theory_seed = DummyRegressor(strategy=\"constant\", constant=y_seed[1])\n", + "theory_seed.fit(x_seed, y_seed)\n", + "\n", + "poppernet_experimentalist_cycle.data.theories.append(theory_seed)\n", + "poppernet_experimentalist_cycle.data.observations.append(np.column_stack([x_seed, y_seed]))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "Now we can run the cycle and check the results." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "for _ in range(10):\n", + " run_and_plot_cycle(cycle=poppernet_experimentalist_cycle, study_metadata=study_metadata)\n" + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/mkdocs.yml b/mkdocs.yml index 3f825fd23..e71bb8b03 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -5,6 +5,9 @@ site_name: Autonomous Empirical Research theme: name: material +watch: + - autora/ + plugins: - search - gen-files: @@ -14,8 +17,11 @@ plugins: nav_file: SUMMARY.md - section-index - mkdocstrings: - watch: - - autora/ + handlers: + python: + import: + - https://scikit-learn.org/stable/objects.inv + markdown_extensions: - pymdownx.arithmatex: diff --git a/poetry.lock b/poetry.lock index 78a5a763a..3d317d3ad 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1635,23 +1635,23 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" [[package]] name = "scikit-learn" -version = "1.1.3" +version = "1.2.0" description = "A set of python modules for machine learning and data mining" category = "main" optional = false python-versions = ">=3.8" [package.dependencies] -joblib = ">=1.0.0" +joblib = ">=1.1.1" numpy = ">=1.17.3" scipy = ">=1.3.2" threadpoolctl = ">=2.0.0" [package.extras] -benchmark = ["matplotlib (>=3.1.2)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"] -docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.2)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=4.0.1)", "sphinx-gallery (>=0.7.0)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] -examples = ["matplotlib (>=3.1.2)", "pandas (>=1.0.5)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"] -tests = ["black (>=22.3.0)", "flake8 (>=3.8.2)", "matplotlib (>=3.1.2)", "mypy (>=0.961)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pyamg (>=4.0.0)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "scikit-image (>=0.16.2)"] +benchmark = ["matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "plotly (>=5.10.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=4.0.1)", "sphinx-gallery (>=0.7.0)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] +examples = ["matplotlib (>=3.1.3)", "pandas (>=1.0.5)", "plotly (>=5.10.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"] +tests = ["black (>=22.3.0)", "flake8 (>=3.8.2)", "matplotlib (>=3.1.3)", "mypy (>=0.961)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pytest (>=5.3.1)", "pytest-cov (>=2.9.0)", "scikit-image (>=0.16.2)"] [[package]] name = "scipy" @@ -2848,6 +2848,8 @@ psutil = [ {file = "psutil-5.9.4-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:54d5b184728298f2ca8567bf83c422b706200bcbbfafdc06718264f9393cfeb7"}, {file = 
"psutil-5.9.4-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:16653106f3b59386ffe10e0bad3bb6299e169d5327d3f187614b1cb8f24cf2e1"}, {file = "psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54c0d3d8e0078b7666984e11b12b88af2db11d11249a8ac8920dd5ef68a66e08"}, + {file = "psutil-5.9.4-cp36-abi3-win32.whl", hash = "sha256:149555f59a69b33f056ba1c4eb22bb7bf24332ce631c44a319cec09f876aaeff"}, + {file = "psutil-5.9.4-cp36-abi3-win_amd64.whl", hash = "sha256:fd8522436a6ada7b4aad6638662966de0d61d241cb821239b2ae7013d41a43d4"}, {file = "psutil-5.9.4-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6001c809253a29599bc0dfd5179d9f8a5779f9dffea1da0f13c53ee568115e1e"}, {file = "psutil-5.9.4.tar.gz", hash = "sha256:3d7f9739eb435d4b1338944abe23f49584bde5395f27487d2ee25ad9a8774a62"}, ] @@ -3094,27 +3096,27 @@ rfc3986-validator = [ {file = "rfc3986_validator-0.1.1.tar.gz", hash = "sha256:3d44bde7921b3b9ec3ae4e3adca370438eccebc676456449b145d533b240d055"}, ] scikit-learn = [ - {file = "scikit-learn-1.1.3.tar.gz", hash = "sha256:bef51978a51ec19977700fe7b86aecea49c825884f3811756b74a3b152bb4e35"}, - {file = "scikit_learn-1.1.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8e9dd76c7274055d1acf4526b8efb16a3531c26dcda714a0c16da99bf9d41900"}, - {file = "scikit_learn-1.1.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:ee47f68d973cee7009f06edb956f2f5588a0f230f24a2a70175fd0ecf36e2653"}, - {file = "scikit_learn-1.1.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da5a2e95fef9805b1750e4abda4e834bf8835d26fc709a391543b53feee7bd0e"}, - {file = "scikit_learn-1.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:701181792a28c82fecae12adb5d15d0ecf57bffab7cf4bdbb52c7b3fd428d540"}, - {file = "scikit_learn-1.1.3-cp310-cp310-win_amd64.whl", hash = "sha256:30e27721adc308e8fd9f419f43068e43490005f911edf4476a9e585059fa8a83"}, - {file = "scikit_learn-1.1.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5699cded6c0685426433c7e5afe0fecad80ec831ec7fa264940e50c796775cc5"}, - {file = "scikit_learn-1.1.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:2ee2c649f2231b68511aabb0dc827edd8936aad682acc6263c34aed11bc95dac"}, - {file = "scikit_learn-1.1.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6d1c1394e38a3319ace620381f6f23cc807d8780e9915c152449a86fc8f1db21"}, - {file = "scikit_learn-1.1.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:250da993701da88bf475e7c5746abf1285ea0ae47e4d0917cd13afd6600bb162"}, - {file = "scikit_learn-1.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:fd3ee69d36d42a7dcbb17e355a5653af5fd241a7dfd9133080b3dde8d9e2aafb"}, - {file = "scikit_learn-1.1.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f5644663987ee221f5d1f47a593271b966c271c236fe05634e6bdc06041b5a2b"}, - {file = "scikit_learn-1.1.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:748f2bd632d6993e8918d43f1a26c380aeda4e122a88840d4c3a9af99d4239fe"}, - {file = "scikit_learn-1.1.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cd55c6fbef7608dbce1f22baf289dfcc6eb323247daa3c3542f73d389c724786"}, - {file = "scikit_learn-1.1.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38814f66285318f2e241305cca545eaa9b4126c65aa5dd78c69371f235f78e2b"}, - {file = "scikit_learn-1.1.3-cp38-cp38-win_amd64.whl", hash = 
"sha256:f4931f2a6c06e02c6c17a05f8ae397e2545965bc7a0a6cb38c8cd7d4fba8624d"}, - {file = "scikit_learn-1.1.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6785b8a3093329bf90ac01801be5525551728ae73edb11baa175df660820add4"}, - {file = "scikit_learn-1.1.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:28b2bd6a1419acd522ff45d282c8ba23dbccb5338802ab0ee12baa4ade0aba4c"}, - {file = "scikit_learn-1.1.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23fb9e74b813cc2528b5167d82ed08950b11106ccf50297161875e45152fb311"}, - {file = "scikit_learn-1.1.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5d4231af7199531e77da1b78a4cc6b3d960a00b1ec672578ac818aae2b9c35d"}, - {file = "scikit_learn-1.1.3-cp39-cp39-win_amd64.whl", hash = "sha256:4d3a19166d4e1cdfcab975c68f471e046ce01e74c42a9a33fa89a14c2fcedf60"}, + {file = "scikit-learn-1.2.0.tar.gz", hash = "sha256:680b65b3caee469541385d2ca5b03ff70408f6c618c583948312f0d2125df680"}, + {file = "scikit_learn-1.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1beaa631434d1f17a20b1eef5d842e58c195875d2bc11901a1a70b5fe544745b"}, + {file = "scikit_learn-1.2.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:d395730f26d8fc752321f1953ddf72647c892d8bed74fad4d7c816ec9b602dfa"}, + {file = "scikit_learn-1.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd3480c982b9e616b9f76ad8587804d3f4e91b4e2a6752e7dafb8a2e1f541098"}, + {file = "scikit_learn-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:184a42842a4e698ffa4d849b6019de50a77a0aa24d26afa28fa49c9190bb144b"}, + {file = "scikit_learn-1.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:867023a044fdfe59e5014a7fec7a3086a8928f10b5dce9382eedf4135f6709a2"}, + {file = "scikit_learn-1.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5546a8894a0616e92489ef995b39a0715829f3df96e801bb55cbf196be0d9649"}, + {file = "scikit_learn-1.2.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:bc7073e025b62c1067cbfb76e69d08650c6b9d7a0e7afdfa20cb92d4afe516f6"}, + {file = "scikit_learn-1.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fc0a72237f0c56780cf550df87201a702d3bdcbbb23c6ef7d54c19326fa23f19"}, + {file = "scikit_learn-1.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e1ea0bc1706da45589bcf2490cde6276490a1b88f9af208dbb396fdc3a0babf"}, + {file = "scikit_learn-1.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:f17420a8e3f40129aeb7e0f5ee35822d6178617007bb8f69521a2cefc20d5f00"}, + {file = "scikit_learn-1.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25ba705ee1600ffc5df1dccd8fae129d7c6836e44ffcbb52d78536c9eaf8fcf9"}, + {file = "scikit_learn-1.2.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:6b63ca2b0643d30fbf9d25d93017ed3fb8351f31175d82d104bfec60cba7bb87"}, + {file = "scikit_learn-1.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83c772fa8c64776ad769fd764752c8452844307adcf10dee3adcc43988260f21"}, + {file = "scikit_learn-1.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0834e4cec2a2e0d8978f39cb8fe1cad3be6c27a47927e1774bf5737ea65ec228"}, + {file = "scikit_learn-1.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:da29d2e379c396a63af5ed4b671ad2005cd690ac373a23bee5a0f66504e05272"}, + {file = "scikit_learn-1.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:23a88883ca60c571a06278e4726b3b51b3709cfa4c93cacbf5568b22ba960899"}, + {file = "scikit_learn-1.2.0-cp39-cp39-macosx_12_0_arm64.whl", hash 
= "sha256:40f3ff68c505cb9d1f3693397c73991875d609da905087e00e7b4477645ec67b"}, + {file = "scikit_learn-1.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9535e867281ae6987bb80620ba14cf1649e936bfe45f48727b978b7a2dbe835"}, + {file = "scikit_learn-1.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de897720173b26842e21bed54362f5294e282422116b61cd931d4f5d870b9855"}, + {file = "scikit_learn-1.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:ceb0008f345188aa236e49c973dc160b9ed504a3abd7b321a0ecabcb669be0bd"}, ] scipy = [ {file = "scipy-1.9.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1884b66a54887e21addf9c16fb588720a8309a57b2e258ae1c7986d4444d3bc0"}, diff --git a/tests/test_experimentalist_random.py b/tests/test_experimentalist_random.py index 0a1f9f1ec..e85f0a72a 100644 --- a/tests/test_experimentalist_random.py +++ b/tests/test_experimentalist_random.py @@ -5,7 +5,7 @@ from autora.experimentalist.filter import weber_filter from autora.experimentalist.pipeline import make_pipeline -from autora.experimentalist.pool import grid_pool +from autora.experimentalist.pooler.general_pool import grid_pool from autora.experimentalist.sampler import random_sampler from autora.variable import DV, IV, ValueType, VariableCollection diff --git a/tests/test_experimentalist_uncertainty.py b/tests/test_experimentalist_uncertainty.py index dbaea8c83..aa25f63c6 100644 --- a/tests/test_experimentalist_uncertainty.py +++ b/tests/test_experimentalist_uncertainty.py @@ -7,7 +7,7 @@ from autora.experimentalist.filter import weber_filter from autora.experimentalist.pipeline import make_pipeline -from autora.experimentalist.pool import grid_pool +from autora.experimentalist.pooler.general_pool import grid_pool from autora.experimentalist.sampler import uncertainty_sampler from autora.variable import DV, IV, ValueType, VariableCollection diff --git a/tests/test_poppernet_sampler.py b/tests/test_poppernet_pooler.py similarity index 83% rename from tests/test_poppernet_sampler.py rename to tests/test_poppernet_pooler.py index 7f5c0a5bd..83a013815 100644 --- a/tests/test_poppernet_sampler.py +++ b/tests/test_poppernet_pooler.py @@ -3,10 +3,8 @@ from sklearn.linear_model import LinearRegression, LogisticRegression from autora.experimentalist.pipeline import Pipeline -from autora.experimentalist.sampler.poppernet import ( - nearest_values_sampler, - poppernet_pooler, -) +from autora.experimentalist.pooler import poppernet_pool +from autora.experimentalist.sampler import nearest_values_sampler from autora.variable import DV, IV, ValueType, VariableCollection @@ -95,14 +93,14 @@ def test_poppernet_classification(synthetic_logr_model, classification_data_to_t # Run popper net sampler poppernet_pipeline = Pipeline( - [("pool", poppernet_pooler), ("sampler", nearest_values_sampler)], + [("pool", poppernet_pool), ("sampler", nearest_values_sampler)], params={ "pool": dict( model=model, x_train=X_train, y_train=Y_train, metadata=metadata, - num_samples=2, + n=2, training_epochs=1000, optimization_epochs=1000, training_lr=1e-3, @@ -111,7 +109,7 @@ def test_poppernet_classification(synthetic_logr_model, classification_data_to_t limit_offset=10**-10, limit_repulsion=0, ), - "sampler": {"allowed_values": X}, + "sampler": {"allowed_values": X, "n": 2}, }, ) @@ -157,23 +155,23 @@ def test_poppernet_regression(synthetic_linr_model, regression_data_to_test): ) poppernet_pipeline = Pipeline( - [("pool", poppernet_pooler), ("sampler", nearest_values_sampler)], + [("pool", 
poppernet_pool), ("sampler", nearest_values_sampler)], params={ "pool": dict( model=model, x_train=X_train, y_train=Y_train, metadata=metadata, - num_samples=5, + n=5, training_epochs=1000, - optimization_epochs=1000, + optimization_epochs=5000, training_lr=1e-3, optimization_lr=1e-3, mse_scale=1, - limit_offset=10**-10, - limit_repulsion=0, + limit_offset=0, + limit_repulsion=0.01, ), - "sampler": {"allowed_values": X}, + "sampler": {"allowed_values": X, "n": 5}, }, ) @@ -181,4 +179,12 @@ def test_poppernet_regression(synthetic_linr_model, regression_data_to_test): # the first value should be close to one of the local maxima of the # sine function - assert sample[0] == 1.5 or sample[0] == 4.5 + assert ( + sample[0] == 1.5 + or sample[0] == 4.5 + or sample[0] == 6 + or sample[0] == 0 + or sample[0] == 3 + ) + if sample[0] == 6 or sample[0] == 0 or sample[0] == 3: + assert sample[1] == 4.5 or sample[1] == 1.5 diff --git a/tests/test_summed_dissimilarity_sampler.py b/tests/test_summed_dissimilarity_sampler.py new file mode 100644 index 000000000..d67c0d52f --- /dev/null +++ b/tests/test_summed_dissimilarity_sampler.py @@ -0,0 +1,32 @@ +import numpy as np + +from autora.experimentalist.sampler.dissimilarity import summed_dissimilarity_sampler + + +def test_dissimilarity_sampler_1D(): + + num_samples = 2 + + # define two matrices + matrix1 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) + matrix2 = np.array([1, 2, 3]) + + # select the rows of matrix1 most dissimilar to matrix2, by summed distance + reordered_matrix1 = summed_dissimilarity_sampler(matrix1, matrix2, n=num_samples) + + assert reordered_matrix1.shape[0] == num_samples + assert reordered_matrix1.shape[1] == 1 + assert np.array_equal(reordered_matrix1, np.array([[10], [9]])) + + +def test_dissimilarity_sampler_ND(): + # define two matrices + matrix1 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) + matrix2 = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]]) + + # select the rows of matrix1 most dissimilar to matrix2, by summed distance + reordered_matrix1 = summed_dissimilarity_sampler(matrix1, matrix2, n=2) + + assert reordered_matrix1.shape[0] == 2 + assert reordered_matrix1.shape[1] == 3 + assert np.array_equal(reordered_matrix1, np.array([[10, 11, 12], [7, 8, 9]]))
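+ +# Note: these expectations assume summed_dissimilarity_sampler scores each row of +# the first array by the sum of its distances to every row of the second array, +# and returns the n highest-scoring (most dissimilar) rows, most dissimilar first.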