diff --git a/.github/workflows/python-tox-testing-rmf-env.yml b/.github/workflows/python-tox-testing-rmf-env.yml new file mode 100644 index 00000000..3d26cdc1 --- /dev/null +++ b/.github/workflows/python-tox-testing-rmf-env.yml @@ -0,0 +1,29 @@ +name: poli rmf (conda, py3.9) + +on: + push: + schedule: + - cron: '0 0 * * 0' + +jobs: + build-linux: + runs-on: ubuntu-latest + strategy: + max-parallel: 5 + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.9 + uses: actions/setup-python@v3 + with: + python-version: '3.9' + - name: Add conda to system path + run: | + # $CONDA is an environment variable pointing to the root of the miniconda directory + echo $CONDA/bin >> $GITHUB_PATH + - name: Install dependencies + run: | + python -m pip install tox + - name: Test rmf-related black boxes with tox and pytest + run: | + tox -c tox.ini -e poli-rmf-py39 diff --git a/pyproject.toml b/pyproject.toml index ded40ea2..e48aff29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ markers = [ "poli__tdc: marks tests that run in the poli__tdc environment", "poli__protein: marks tests that run in the poli__protein environment", "poli__rasp: marks tests that run in the poli__rasp environment", + "poli__rmf: marks tests that run in poli__rmf environment", "unmarked: All other tests, which usually run in the base environment", ] diff --git a/src/poli/objective_factory.py b/src/poli/objective_factory.py index fba11cce..2777c403 100644 --- a/src/poli/objective_factory.py +++ b/src/poli/objective_factory.py @@ -435,6 +435,9 @@ def _instantiate_observer(observer_name: str, quiet: bool = False) -> AbstractOb The black-box function, initial value, and related information. """ + if _OBSERVER not in registry.config[_DEFAULT]: + registry.config[_DEFAULT][_OBSERVER] = _DEFAULT_OBSERVER_RUN_SCRIPT + observer_script: str = registry.config[_DEFAULT][_OBSERVER] if observer_name is not None: if observer_name != DEFAULT_OBSERVER_NAME: diff --git a/src/poli/objective_repository/__init__.py b/src/poli/objective_repository/__init__.py index 256c66e6..b9d4e7e5 100644 --- a/src/poli/objective_repository/__init__.py +++ b/src/poli/objective_repository/__init__.py @@ -75,6 +75,7 @@ from .rfp_foldx_stability_and_sasa.register import ( RFPFoldXStabilityAndSASAProblemFactory, ) +from .rmf_landscape.register import RMFBlackBox, RMFProblemFactory from .sa_tdc.register import SABlackBox, SAProblemFactory from .scaffold_hop.register import ScaffoldHopBlackBox, ScaffoldHopProblemFactory from .sitagliptin_mpo.register import ( @@ -138,6 +139,7 @@ "rdkit_logp": LogPProblemFactory, "rdkit_qed": QEDProblemFactory, "rfp_foldx_stability_and_sasa": RFPFoldXStabilityAndSASAProblemFactory, + "rmf_landscape": RMFProblemFactory, "sa_tdc": SAProblemFactory, "super_mario_bros": SuperMarioBrosProblemFactory, "white_noise": WhiteNoiseProblemFactory, @@ -182,6 +184,7 @@ "rdkit_logp": LogPBlackBox, "rdkit_qed": QEDBlackBox, "rfp_foldx_stability_and_sasa": FoldXStabilityAndSASABlackBox, + "rmf_landscape": RMFBlackBox, "sa_tdc": SABlackBox, "super_mario_bros": SuperMarioBrosBlackBox, "white_noise": WhiteNoiseBlackBox, diff --git a/src/poli/objective_repository/rmf_landscape/__init__.py b/src/poli/objective_repository/rmf_landscape/__init__.py new file mode 100644 index 00000000..70d8b601 --- /dev/null +++ b/src/poli/objective_repository/rmf_landscape/__init__.py @@ -0,0 +1,10 @@ +"""Rough Mount Fuji (RMF) fitness landscapes w/ tunable ruggedness using Numpy. +See J Neidhart, IG Szendro, J Krug + Adaptation in Tunably Rugged Fitness Landscapes: The Rough Mount Fuji Model. + Genetics 2014 . + DOI: https://doi.org/10.1534/genetics.114.167668 +See Aita et al. + Analysis of a local fitness landscape with a model of the rough Mt. Fuji-type landscape: Application to prolyl endopeptidase and thermolysin. + Biopolymers 2000 . + DOI: https://doi.org/10.1002/(SICI)1097-0282(200007)54:1<64::AID-BIP70>3.0.CO;2-R +""" diff --git a/src/poli/objective_repository/rmf_landscape/environment.yml b/src/poli/objective_repository/rmf_landscape/environment.yml new file mode 100644 index 00000000..66359bce --- /dev/null +++ b/src/poli/objective_repository/rmf_landscape/environment.yml @@ -0,0 +1,11 @@ +name: poli__rmf +channels: + - conda-forge + - defaults +dependencies: + - python=3.9 + - pip=23.2.1 + - pip: + - numpy + - "git+https://github.com/MachineLearningLifeScience/poli.git@dev" + - scipy \ No newline at end of file diff --git a/src/poli/objective_repository/rmf_landscape/information.py b/src/poli/objective_repository/rmf_landscape/information.py new file mode 100644 index 00000000..17c0d804 --- /dev/null +++ b/src/poli/objective_repository/rmf_landscape/information.py @@ -0,0 +1,15 @@ +import numpy as np + +from poli.core.black_box_information import BlackBoxInformation +from poli.core.util.proteins.defaults import AMINO_ACIDS + +rmf_info = BlackBoxInformation( + name="rmf_landscape", + max_sequence_length=np.inf, + aligned=True, + fixed_length=True, + deterministic=False, + alphabet=AMINO_ACIDS, # TODO: differentiate between AA and NA inputs? + log_transform_recommended=False, + discrete=True, +) diff --git a/src/poli/objective_repository/rmf_landscape/isolated_function.py b/src/poli/objective_repository/rmf_landscape/isolated_function.py new file mode 100644 index 00000000..6ec63d1d --- /dev/null +++ b/src/poli/objective_repository/rmf_landscape/isolated_function.py @@ -0,0 +1,132 @@ +from __future__ import annotations + +import logging +from random import seed +from typing import List, Optional + +import numpy as np +from scipy.spatial.distance import hamming +from scipy.stats import genpareto + +from poli.core.abstract_isolated_function import AbstractIsolatedFunction +from poli.core.util.proteins.defaults import AMINO_ACIDS, ENCODING + + +class RMFIsolatedLogic(AbstractIsolatedFunction): + """ + RMF internal logic. + + Parameters + ---------- + wildtype : List[str] + String sequence of the reference, default: None. + c : float, optional + + alphabet : List[str] + Alphabet for the problem, by default AA list provided from poli.core.util.proteins.defaults + stochasticity: str, optional + Methods + ------- + _black_box(x, context=None) + Main black box method to compute the fitness value of x relative to the WT. + + Raises + ------ + AssertionError + If no wildtype sequence is provided. + """ + + def __init__( + self, + wildtype: List[str], + wt_val: float | None = 0.0, + c: float | None = None, + kappa: float | None = 0.1, + alphabet: List[str] | None = None, + seed: int | None = 0, + ) -> None: + """ + Initialize the RMFBlackBox object. + """ + assert wildtype is not None, ( + "Missing reference input sequence. " + "Did you forget to pass it to the create of the black box?" + ) + oracle_name = "RMF" + if not isinstance(wildtype, np.ndarray): + wildtype = np.array(list(wildtype)) + self.wildtype = wildtype + self.seed = seed + if alphabet is None: + logging.info("using default alphabet AAs.") + alphabet = AMINO_ACIDS + assert all( + [aa in ENCODING.keys() for aa in wildtype] + ), "Input wildtype elements not in encoding alphabet." + self.wt_int = np.array([ENCODING.get(aa) for aa in wildtype]) + if c is None: + c = 1 / (len(alphabet) - 1) + else: + c = c + assert c >= 0, "Invalid c : c > 0 required!" + logging.info(f"setting c={c}") + # if c == 0 : uncorrelated HoC landscape (?) + self.c = c + self.kappa = kappa + self.f_0 = ( + wt_val # in case of standardized observations (around WT) assume w.l.o.g. + ) + self.alphabet = alphabet + eta_var = genpareto.stats(c, moments="v") + self.theta = c / np.sqrt(eta_var) + self.rng = np.random.default_rng(seed) + logging.info(f"landscape theta={self.theta}") + super().__init__() + + @staticmethod + def f( + f0: float, + sigma: np.ndarray, + sigma_star: np.ndarray, + c: float, + kappa: float, + rand_state, + ) -> float: + L = len(sigma) + # from [1] (2) additive term via Hamming distance and constant + # hamm_dist = hamming(sigma.flatten(), sigma_star.flatten()) # NOTE scipy HD is normalized, DON't USE + hamm_dist = np.sum(sigma != sigma_star) + # from [2] nonadd. term is single small value accroding to RV, we use [1]gen.Pareto RV instead of Gaussian + eta = genpareto.rvs(kappa, size=1, random_state=rand_state) + # NOTE [1] describes eta as 2^L i.i.d. RV vector, which does not yield a single function value + f_p = f0 + -c * hamm_dist + f_val = f_p + eta + return f_val + + def __call__(self, x: np.ndarray, context=None) -> np.ndarray: + values = [] + for sequence in x: + L = len(sequence) + assert L == self.wildtype.shape[-1], "Inconsistent length: undefined." + x_int = np.array([ENCODING.get(aa) for aa in sequence]) + val = self.f( + f0=self.f_0, + sigma=x_int, + sigma_star=self.wt_int, + c=self.c, + kappa=self.kappa, + rand_state=self.rng, + ) + values.append(val) + return np.array(values).reshape(-1, 1) + + +if __name__ == "__main__": + from poli.core.registry import register_isolated_function + + register_isolated_function( + RMFIsolatedLogic, + name="rmf_landscape__isolated", + conda_environment_name="poli__rmf", + force=True, + ) diff --git a/src/poli/objective_repository/rmf_landscape/register.py b/src/poli/objective_repository/rmf_landscape/register.py new file mode 100644 index 00000000..8c5c6e34 --- /dev/null +++ b/src/poli/objective_repository/rmf_landscape/register.py @@ -0,0 +1,257 @@ +""" +Implements a (tunable) fitness landscape of the type RMF [1] for NA [1], AA [2] inputs. + +References +---------- +[1] Adaptation in Tunably Rugged Fitness Landscapes: The Rough Mount Fuji Model. + Neidhart J., Szendro I.G., and Krug, J. Genetics 198, 699-721 (2014). https://doi.org/10.1534/genetics.114.167668 +[2] Analysis of a local fitness landscape with a model of the rough Mt. Fuji-type landscape: Application to prolyl endopeptidase and thermolysin. + Aita T., Uchiyama H., et al. Biopolymers 54, 64-79 (2000). https://doi.org/10.1002/(SICI)1097-0282(200007)54:1<64::AID-BIP70>3.0.CO;2-R +""" + +from __future__ import annotations + +from typing import List, Optional, Union + +import numpy as np + +from poli.core.abstract_black_box import AbstractBlackBox +from poli.core.abstract_problem_factory import AbstractProblemFactory +from poli.core.black_box_information import BlackBoxInformation +from poli.core.problem import Problem +from poli.core.util.isolation.instancing import ( + get_inner_function, + instance_function_as_isolated_process, +) +from poli.core.util.seeding import seed_python_numpy_and_torch +from poli.objective_repository.rmf_landscape.information import rmf_info + + +class RMFBlackBox(AbstractBlackBox): + """ + RMF Black Box implementation. + + Parameters + ---------- + wildtype : str + The wildtype amino-acid sequence (aka reference sequence) against which all RMF values are computed against. + wt_val : float , optional + The reference value for the WT, zero if observations are standardized, else float value e.g. ddGs + c : float, optional + Constant scalar used in RMF computation, by default is the normalizing constant relative to alphabet size + kappa : float, optional + Parameterizes the generalized Pareto distribution, by default 0.1 . + Determines what type of distribution will be sampled from exponential family, Weibull, etc. + seed : int, optional + Random seed for replicability of results, by default None. + alphabet : List[str], optional + Type of alphabet of the sequences, by default Amino Acids. + Nucleic Acids possible. + batch_size : int, optional + The batch size for parallel evaluation, by default None. + parallelize : bool, optional + Flag to parallelize evaluation, by default False. + num_workers : int, optional + The number of workers for parallel evaluation, by default None. + evaluation_budget : int, optional + The evaluation budget, by default float("inf"). + force_isolation : bool, optional + Run in an isolated environment and process, by default False. + """ + + def __init__( + self, + wildtype: str, + wt_val: float = 0.0, + c: float | None = None, + kappa: float = 0.1, + seed: int | None = None, + alphabet: List[str] | None = None, + batch_size: int | None = None, + parallelize: bool | None = False, + num_workers: int | None = None, + evaluation_budget: int | None = float("inf"), + force_isolation: bool = False, + ) -> None: + """ + Initialize the RMFBlackBox object. + + Parameters + ---------- + batch_size : int, optional + The batch-size for parallel evaluation, default: None. + parallelize : bool, optional + Flag to parallelize the evaluation, default: False. + num_workers : int, optional + Number of workers for parallel evaluation, default: None. + evaluation_budget : int, optional + Maximum number of evaluations, default: float("inf"). + force_isolation: bool + Run the blackbox in an isolated environment, default: False. + """ + super().__init__( + batch_size=batch_size, + parallelize=parallelize, + num_workers=num_workers, + evaluation_budget=evaluation_budget, + ) + self.wildtype = wildtype + self.wt_val = wt_val + self.c = c + self.kappa = kappa + self.alphabet = alphabet + self.seed = seed + self.force_isolation = force_isolation + inner_function = get_inner_function( # NOTE: this implicitly registers + isolated_function_name="rmf_landscape__isolated", + class_name="RMFIsolatedLogic", + module_to_import="poli.objective_repository.rmf_landscape.isolated_function", + wildtype=self.wildtype, + wt_val=self.wt_val, + c=self.c, + kappa=self.kappa, + alphabet=self.alphabet, + seed=self.seed, + force_isolation=self.force_isolation, + ) + + def _black_box(self, x: np.ndarray, context: None) -> np.ndarray: + """ + Runs the given input x provided + in the context with the RMF function and returns the + total fitness score. + + Parameters + ----------- + x : np.ndarray + The input array of strings containing mutations. + context : None + The context for the black box computation. + + Returns + -------- + y: np.ndarray + The computed fitness score(s) as a numpy array. + """ + inner_function = get_inner_function( + isolated_function_name="rmf_landscape__isolated", + class_name="RMFIsolatedLogic", + module_to_import="poli.objective_repository.rmf_landscape.isolated_function", + wildtype=self.wildtype, + wt_val=self.wt_val, + c=self.c, + kappa=self.kappa, + alphabet=self.alphabet, + seed=self.seed, + force_isolation=self.force_isolation, + ) + return inner_function(x, context) + + @staticmethod + def get_black_box_info() -> BlackBoxInformation: + return rmf_info + + +class RMFProblemFactory(AbstractProblemFactory): + """ + Problem factory for the rough Mt Fuji model problem. + + Methods + ------- + get_setup_information() + returns problem setup information. + create(...) + Creates RMF problem instance with specified parameters. + """ + + def get_setup_information(self) -> BlackBoxInformation: + return rmf_info + + def create( + self, + wildtype: List[str] | str, + wt_val: float | None = 0.0, + c: float | None = None, + kappa: float = 0.1, + alphabet: List[str] | None = None, + seed: int = None, + batch_size: int = None, + parallelize: bool = False, + num_workers: int = None, + evaluation_budget: int = float("inf"), + force_isolation: bool = False, + ) -> Problem: + """ + Create a RMFBlackBox object. + + Parameters + ---------- + wildtype : List[str] | str + Reference (wild-type) sequence is pseudo-optimum on start. + wt_val : float, optional + Reference function value (standardized observations) of WT. + c : float, optional + Constant value for function value computation. + If None passed default value is regularizing 1/(len(alphabet)-1) . + kappa: float + Determines generalized Pareto continuous RV. + alphabet: List[str], optional + Problem alphabet used, if None is passed default: AMINO_ACIDS. + seed : int, optional + Seed for random number generators. If None is passed, + the seeding doesn't take place. + batch_size : int, optional + Number of samples per batch for parallel computation. + parallelize : bool, optional + Flag indicating whether to parallelize the computation. + num_workers : int, optional + Number of worker processes for parallel computation. + evaluation_budget: int, optional + The maximum number of function evaluations. Default is infinity. + + Returns + ------- + problem : Problem + A problem instance containing a RMFBlackBox + function, and initial wildtypes x0. + + Raises + ------ + ValueError + If wildtype reference sequence is missing. + """ + if seed is not None: + seed_python_numpy_and_torch(seed) + + if wildtype is None: + raise ValueError("Missing reference sequence!") + + if isinstance(wildtype, str): + wildtype = list(wildtype) + + f = RMFBlackBox( + wildtype=wildtype, + wt_val=wt_val, + c=c, + kappa=kappa, + alphabet=alphabet, + seed=seed, + batch_size=batch_size, + parallelize=parallelize, + num_workers=num_workers, + evaluation_budget=evaluation_budget, + force_isolation=force_isolation, + ) + x0 = np.array(wildtype).reshape(1, len(wildtype)) + problem = Problem(f, x0) + return problem + + +if __name__ == "__main__": + from poli.core.registry import register_problem + + rmf_problem_factory = RMFProblemFactory() + register_problem( + rmf_problem_factory, + conda_environment_name="poli__rmf", + ) diff --git a/src/poli/tests/registry/proteins/test_rmf.py b/src/poli/tests/registry/proteins/test_rmf.py new file mode 100644 index 00000000..28440054 --- /dev/null +++ b/src/poli/tests/registry/proteins/test_rmf.py @@ -0,0 +1,118 @@ +import numpy as np +import pytest + +from poli import objective_factory +from poli.objective_repository import AVAILABLE_PROBLEM_FACTORIES + +ref_aa_seq = "HPETLVKVKDAEDQLGARVGYIELDLNSGKILESFRPEERFPMMSTFKVLLCGAVLSRVDAGQEQLGRRIHYSQNDLVEYSPVTEKHLTDGMTVRELCSAAITMSDNTAANLLLTTIGGPKELTAFLHNMGDHVTRLDRWNPELNEAIPNDERDTTMPAAMATTLRKLLTGELLTLASRQQLIDWMEADKVAGPLLRSALPAGWFIADKSGAGERGSRGIIAALGPDGKPSRIVVIYTTGSQATMDERNRQIAEIGASLIKHW" + + +@pytest.mark.poli__rmf +def test_force_isolation_rmf_landscape(): + """ + Test if we can force-register the rmf_landscape problem. + """ + problem = objective_factory.create( + name="rmf_landscape", + wildtype=ref_aa_seq, + kappa=-100, # keep noise low + force_isolation=True, + ) + f, x0 = problem.black_box, problem.x0 + y0 = f(x0) + assert np.isclose(np.round(y0), 0.0) + f.terminate() + + +@pytest.mark.poli__rmf +def test_rmf_landscape_init(): + problem = objective_factory.create( + name="rmf_landscape", + wildtype=ref_aa_seq, + kappa=-100, + ) + f, x0 = problem.black_box, problem.x0 + y0 = f(x0) + assert np.isclose(np.round(y0), 0.0) + f.terminate() + + +@pytest.mark.poli__rmf +def test_rmf_landscape_batch_eval(): + problem = objective_factory.create( + name="rmf_landscape", + wildtype=ref_aa_seq, + ) + N = 10 + f, x0 = problem.black_box, problem.x0 + y0 = f(x0) + x_t = [] + seq_b = x0.copy() + seq_b[0, 1] = "Y" + x_t = np.vstack([seq_b for _ in range(N)]) + assert x_t.shape[0] == N + yt = f(x_t) + assert yt.shape[0] == N + f.terminate() + + +@pytest.mark.poli__rmf +@pytest.mark.parametrize("seed", [1, 2, 3]) +def test_rmf_seed_consistent(seed: int): + mutation_seq = list(ref_aa_seq) + mutation_seq[int(len(mutation_seq) / 2)] = "A" + mutation_seq[int(len(mutation_seq) / 4)] = "H" + mutation_seq = np.array(mutation_seq)[None, :] + problem_a = objective_factory.create( + name="rmf_landscape", + wildtype=ref_aa_seq, + seed=seed, + ) + problem_b = objective_factory.create( + name="rmf_landscape", wildtype=ref_aa_seq, seed=seed + ) + f_a, x0_a = problem_a.black_box, problem_a.x0 + y0_a = f_a(x0_a) + f_b, x0_b = problem_b.black_box, problem_b.x0 + y0_b = f_b(x0_b) + + y1_a = f_a(mutation_seq) + y1_b = f_b(mutation_seq) + # test equalities + assert all([x_a == x_b for x_a, x_b in zip(x0_a[0], x0_b[0])]) + assert y0_a == y0_b + assert y1_a == y1_b # value for mutated sequences equal + f_a.terminate() + f_b.terminate() + + +@pytest.mark.poli__rmf +@pytest.mark.parametrize("n_mutations", [1, 2, 3]) +def test_rmf_num_mutations_expected_val(n_mutations: int): + from scipy.stats import genpareto + + SEED = 1 + mutation_seq = list(ref_aa_seq) + for m in range(n_mutations): + mutation_seq[int(len(mutation_seq) / 2) - m] = "Y" + mutation_seq = np.array(mutation_seq)[None, :] + problem = objective_factory.create( + name="rmf_landscape", + kappa=-100, # set kappa <0 for sampling values close to zero + c=1, # set constant to one s.t. number mutations negative additive + wildtype=ref_aa_seq, + seed=SEED, + ) + + f, x0 = problem.black_box, problem.x0 + y0 = f(x0) + y1 = f(mutation_seq) + + rnd_state = np.random.default_rng(SEED) + ref_noise_0 = genpareto.rvs(f.kappa, size=1, random_state=rnd_state) + ref_noise_1 = genpareto.rvs(f.kappa, size=1, random_state=rnd_state) + + # black-box value minus noisy component should be approximately mutational distance if c==1 + assert np.isclose(np.round(y0 - ref_noise_0), 0) + assert np.isclose(np.round(y1 - ref_noise_1), -n_mutations) + f.terminate() diff --git a/tox.ini b/tox.ini index 833cef24..3cfab74c 100644 --- a/tox.ini +++ b/tox.ini @@ -98,7 +98,7 @@ commands= pytest {tty:--color=yes} -v -m "not slow and poli__dockstring" {posargs} [testenv:poli-rasp-py39] -description = run the tests with pytest on the dockstring environment for poli +description = run the tests with pytest on the RaSP environment for poli basepython = python3.9 wheel_build_env = .pkg deps= @@ -111,3 +111,17 @@ commands= sh -c "conda run -n poli__rasp python -m pip install -e ." pytest {tty:--color=yes} -v -m "not slow and poli__rasp" {posargs} +[testenv:poli-rmf-py39] +description = run the tests with pytest on the dockstring environment for poli +basepython = python3.9 +wheel_build_env = .pkg +deps= + {[testenv]deps} + -r requirements.txt + -e. +commands= + sh -c 'if conda info --envs | grep -q poli__rmf; then echo "poli__rmf already exists"; else conda env create -f ./src/poli/objective_repository/rmf_landscape/environment.yml; fi' + sh -c "conda run -n poli__rmf python -m pip uninstall -y poli" + sh -c "conda run -n poli__rmf python -m pip install -e ." + sh -c "conda run -n poli__rmf pip install pytest" + sh -c "conda run -n poli__rmf pytest -v -m 'not slow and poli__rmf'" \ No newline at end of file