MachineLearningLifeScience · RMichae1 · Jul 25, 2024 · Jul 9, 2024 · Jul 9, 2024 · Jul 9, 2024
diff --git a/.github/workflows/python-tox-testing-rmf-env.yml b/.github/workflows/python-tox-testing-rmf-env.yml
@@ -0,0 +1,29 @@
+name: poli rmf (conda, py3.9)
+# Runs the tox environment for the rmf-related black boxes on Python 3.9.
+on:
+  push:  # every push; no branch or path filter is configured
+  schedule:
+    - cron: '0 0 * * 0'  # weekly, 00:00 on Sundays (GitHub schedules use UTC)
+
+jobs:
+  build-linux:
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 5  # NOTE(review): no matrix is defined in this job, so this looks unused — confirm
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.9
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.9'
+    - name: Add conda to system path
+      run: |
+        # $CONDA is an environment variable pointing to the root of the miniconda directory
+        echo $CONDA/bin >> $GITHUB_PATH
+    - name: Install dependencies  # only tox is installed here; presumably the tox env pulls in the rest — confirm
+      run: |
+        python -m pip install tox
+    - name: Test rmf-related black boxes with tox and pytest  # env poli-rmf-py39 is expected in tox.ini (not visible here)
+      run: |
+        tox -c tox.ini -e poli-rmf-py39
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,6 +42,7 @@ markers = [
     "poli__tdc: marks tests that run in the poli__tdc environment",
     "poli__protein: marks tests that run in the poli__protein environment",
     "poli__rasp: marks tests that run in the poli__rasp environment",
+    "poli__rmf: marks tests that run in the poli__rmf environment",
     "unmarked: All other tests, which usually run in the base environment",
 ]
 

diff --git a/src/poli/objective_factory.py b/src/poli/objective_factory.py
@@ -435,6 +435,9 @@ def _instantiate_observer(observer_name: str, quiet: bool = False) -> AbstractOb
         The black-box function, initial value, and related information.
 
     """
+    if _OBSERVER not in registry.config[_DEFAULT]:
+        registry.config[_DEFAULT][_OBSERVER] = _DEFAULT_OBSERVER_RUN_SCRIPT
+
     observer_script: str = registry.config[_DEFAULT][_OBSERVER]
     if observer_name is not None:
         if observer_name != DEFAULT_OBSERVER_NAME:

diff --git a/src/poli/objective_repository/__init__.py b/src/poli/objective_repository/__init__.py
@@ -75,6 +75,7 @@
 from .rfp_foldx_stability_and_sasa.register import (
     RFPFoldXStabilityAndSASAProblemFactory,
 )
+from .rmf_landscape.register import RMFBlackBox, RMFProblemFactory
 from .sa_tdc.register import SABlackBox, SAProblemFactory
 from .scaffold_hop.register import ScaffoldHopBlackBox, ScaffoldHopProblemFactory
 from .sitagliptin_mpo.register import (
@@ -138,6 +139,7 @@
     "rdkit_logp": LogPProblemFactory,
     "rdkit_qed": QEDProblemFactory,
     "rfp_foldx_stability_and_sasa": RFPFoldXStabilityAndSASAProblemFactory,
+    "rmf_landscape": RMFProblemFactory,
     "sa_tdc": SAProblemFactory,
     "super_mario_bros": SuperMarioBrosProblemFactory,
     "white_noise": WhiteNoiseProblemFactory,
@@ -182,6 +184,7 @@
     "rdkit_logp": LogPBlackBox,
     "rdkit_qed": QEDBlackBox,
     "rfp_foldx_stability_and_sasa": FoldXStabilityAndSASABlackBox,
+    "rmf_landscape": RMFBlackBox,
     "sa_tdc": SABlackBox,
     "super_mario_bros": SuperMarioBrosBlackBox,
     "white_noise": WhiteNoiseBlackBox,

diff --git a/src/poli/objective_repository/rmf_landscape/__init__.py b/src/poli/objective_repository/rmf_landscape/__init__.py
@@ -0,0 +1,10 @@
+"""Rough Mount Fuji (RMF) fitness landscapes w/ tunable ruggedness using Numpy.
+See J Neidhart, IG Szendro, J Krug
+    Adaptation in Tunably Rugged Fitness Landscapes: The Rough Mount Fuji Model.
+    Genetics 2014.
+    DOI: https://doi.org/10.1534/genetics.114.167668
+See Aita et al.
+    Analysis of a local fitness landscape with a model of the rough Mt. Fuji-type landscape: Application to prolyl endopeptidase and thermolysin.
+    Biopolymers 2000.
+    DOI: https://doi.org/10.1002/(SICI)1097-0282(200007)54:1<64::AID-BIP70>3.0.CO;2-R
+"""
diff --git a/src/poli/objective_repository/rmf_landscape/environment.yml b/src/poli/objective_repository/rmf_landscape/environment.yml
@@ -0,0 +1,11 @@
+name: poli__rmf  # conda environment name; the isolated function is registered under the same name
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.9
+  - pip=23.2.1
+  - pip:  # the entries below are installed through pip rather than conda
+    - numpy
+    - "git+https://github.com/MachineLearningLifeScience/poli.git@dev"  # poli itself, from the dev branch
+    - scipy  # scipy.stats.genpareto supplies the landscape noise
diff --git a/src/poli/objective_repository/rmf_landscape/information.py b/src/poli/objective_repository/rmf_landscape/information.py
@@ -0,0 +1,15 @@
+import numpy as np
+
+from poli.core.black_box_information import BlackBoxInformation
+from poli.core.util.proteins.defaults import AMINO_ACIDS
+
+rmf_info = BlackBoxInformation(  # static metadata describing the RMF landscape black box
+    name="rmf_landscape",
+    max_sequence_length=np.inf,  # no upper bound on the sequence length
+    aligned=True,
+    fixed_length=True,  # the isolated function asserts inputs match the wildtype length
+    deterministic=False,  # every evaluation adds a fresh random noise draw
+    alphabet=AMINO_ACIDS,  # TODO: differentiate between AA and NA inputs?
+    log_transform_recommended=False,
+    discrete=True,
+)
diff --git a/src/poli/objective_repository/rmf_landscape/isolated_function.py b/src/poli/objective_repository/rmf_landscape/isolated_function.py
@@ -0,0 +1,132 @@
+from __future__ import annotations
+
+import logging
+from random import seed
+from typing import List, Optional
+
+import numpy as np
+from scipy.spatial.distance import hamming
+from scipy.stats import genpareto
+
+from poli.core.abstract_isolated_function import AbstractIsolatedFunction
+from poli.core.util.proteins.defaults import AMINO_ACIDS, ENCODING
+
+
+class RMFIsolatedLogic(AbstractIsolatedFunction):
+    """
+    Rough Mount Fuji (RMF) fitness landscape evaluated relative to a wildtype.
+
+    Each evaluation returns ``wt_val - c * d + eta``, where ``d`` counts the positions
+    that differ from the wildtype and ``eta`` is a fresh generalized-Pareto draw.
+
+    Parameters
+    ----------
+    wildtype : List[str]
+        Reference sequence; every element has to be a key of ENCODING.
+    wt_val : float, optional
+        Fitness offset of the wildtype (stored as ``f_0``), by default 0.0.
+    c : float, optional
+        Non-negative penalty per mismatching position,
+        by default ``1 / (len(alphabet) - 1)``.
+    kappa : float, optional
+        Shape parameter of the generalized Pareto noise, by default 0.1.
+    alphabet : List[str], optional
+        Alphabet of the problem, by default AMINO_ACIDS.
+    seed : int, optional
+        Seed of the numpy random generator that draws the noise, by default 0.
+
+    Raises
+    ------
+    AssertionError
+        If the wildtype is missing or not covered by ENCODING, or if ``c`` is negative.
+    """
+
+    def __init__(
+        self,
+        wildtype: List[str],
+        wt_val: float | None = 0.0,
+        c: float | None = None,
+        kappa: float | None = 0.1,
+        alphabet: List[str] | None = None,
+        seed: int | None = 0,
+    ) -> None:
+        """Validate the inputs, then set up the landscape parameters and the RNG."""
+        assert wildtype is not None, (
+            "Missing reference input sequence. "
+            "Did you forget to pass it to the create of the black box?"
+        )
+        if not isinstance(wildtype, np.ndarray):
+            wildtype = np.array(list(wildtype))
+        self.wildtype = wildtype
+        self.seed = seed
+        if alphabet is None:
+            logging.info("using default alphabet AAs.")
+            alphabet = AMINO_ACIDS
+        assert all(
+            aa in ENCODING for aa in wildtype
+        ), "Input wildtype elements not in encoding alphabet."
+        self.wt_int = np.array([ENCODING.get(aa) for aa in wildtype])
+        if c is None:
+            c = 1 / (len(alphabet) - 1)
+        # c == 0 is allowed (uncorrelated HoC landscape?), hence >= in the message
+        assert c >= 0, "Invalid c : c >= 0 required!"
+        logging.info(f"setting c={c}")
+        self.c = c
+        self.kappa = kappa
+        # in case of standardized observations (around WT) assume 0 w.l.o.g.
+        self.f_0 = wt_val
+        self.alphabet = alphabet
+        # eta is drawn with shape kappa (see f), so its variance must use kappa as well
+        eta_var = genpareto.stats(kappa, moments="v")
+        self.theta = c / np.sqrt(eta_var)
+        self.rng = np.random.default_rng(seed)
+        logging.info(f"landscape theta={self.theta}")
+        super().__init__()
+
+    @staticmethod
+    def f(
+        f0: float,
+        sigma: np.ndarray,
+        sigma_star: np.ndarray,
+        c: float,
+        kappa: float,
+        rand_state,
+    ) -> np.ndarray:
+        """Return ``f0 - c * d(sigma, sigma_star) + eta`` as an array of shape (1,)."""
+        # from [1] (2): additive term via Hamming distance and constant.
+        # NOTE: scipy's hamming() is normalized, so the mismatches are counted directly.
+        hamm_dist = np.sum(sigma != sigma_star)
+        # from [2]: the non-additive term is a single small random value; following [1]
+        # it is a generalized Pareto RV instead of a Gaussian.
+        # NOTE: [1] describes eta as 2^L i.i.d. RVs, which does not yield a single value.
+        eta = genpareto.rvs(kappa, size=1, random_state=rand_state)
+        return f0 - c * hamm_dist + eta
+
+    def __call__(self, x: np.ndarray, context=None) -> np.ndarray:
+        """Evaluate every sequence in ``x``; the result has shape (len(x), 1)."""
+        values = []
+        for sequence in x:
+            L = len(sequence)
+            assert L == self.wildtype.shape[-1], "Inconsistent length: undefined."
+            x_int = np.array([ENCODING.get(aa) for aa in sequence])
+            val = self.f(
+                f0=self.f_0,
+                sigma=x_int,
+                sigma_star=self.wt_int,
+                c=self.c,
+                kappa=self.kappa,
+                rand_state=self.rng,
+            )
+            values.append(val)
+        return np.array(values).reshape(-1, 1)
+
+
+if __name__ == "__main__":  # script entry point: registers the isolated function with poli
+    from poli.core.registry import register_isolated_function
+
+    register_isolated_function(
+        RMFIsolatedLogic,
+        name="rmf_landscape__isolated",
+        conda_environment_name="poli__rmf",  # same name as declared in this package's environment.yml
+        force=True,  # presumably overwrites an existing registration — confirm
+    )