Merge pull request #79 from MachineLearningLifeScience/dev

Better testing, and more objective functions
MachineLearningLifeScience · Nov 1, 2023 · 3543b70 · 3543b70
2 parents bbd116a + 7074477
commit 3543b70
Show file tree

Hide file tree

Showing 130 changed files with 91,788 additions and 1,124 deletions.
diff --git a/.github/workflows/python-tox-testing-including-conda-on-master.yml b/.github/workflows/python-tox-testing-including-conda-on-master.yml
@@ -0,0 +1,38 @@
+name: Test (master, conda, python 3.9)
+
+on:
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  build-linux:
+    runs-on: ubuntu-latest
+    strategy:
+      max-parallel: 5
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.9
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.9'
+    - name: Add conda to system path
+      run: |
+        # $CONDA is an environment variable pointing to the root of the miniconda directory
+        echo $CONDA/bin >> $GITHUB_PATH
+    - name: Install dependencies
+      run: |
+        python -m pip install tox
+    - name: Test linting with tox
+      run: |
+        tox -c tox.master.ini -e lint
+    - name: Test poli-base with tox
+      run: |
+        tox -c tox.master.ini -e poli-base-py39
+    - name: Test poli-chem with tox
+      run: |
+        tox -c tox.master.ini -e poli-chem-py39
+    - name: Test poli-protein with tox
+      run: |
+        tox -c tox.master.ini -e poli-protein-py39
diff --git a/.github/workflows/python-tox-testing-including-conda.yml b/.github/workflows/python-tox-testing-including-conda.yml
@@ -1,4 +1,4 @@
-name: Test (conda, python 3.9)
+name: Test (dev, conda, python 3.9)
 
 on: [push]
 
@@ -23,13 +23,7 @@ jobs:
         python -m pip install tox
     - name: Test linting with tox
       run: |
-        tox -e lint
-    - name: Test poli-base with tox
+        tox -c tox.dev.ini -e lint
+    - name: Test poli-base with tox (ignoring RaSP)
       run: |
-        tox -e poli-base-py39
-    - name: Test poli-chem with tox
-      run: |
-        tox -e poli-chem-py39
-    - name: Test poli-protein with tox
-      run: |
-        tox -e poli-protein-py39
+        tox -c tox.dev.ini -e poli-base-py39 -- --ignore=src/poli/tests/registry/proteins/test_rasp.py
diff --git a/.gitignore b/.gitignore
@@ -12,3 +12,12 @@ src/poli/registered_objectives/*.sh
 
 .vscode/
 .DS_Store
+oracle/
+
+# Ignores MLFlow and WandB runs
+mlruns/
+wandb/
+
+# Ignore temporary files
+tmp/
+src/poli/objective_repository/rasp/101m.pdb
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -0,0 +1,47 @@
+# Contributing to `poli`
+
+![Linting: black](https://img.shields.io/badge/Linting-black-black)
+![Testing: pytest](https://img.shields.io/badge/Testing-pytest-blue)
+![Testing: tox](https://img.shields.io/badge/Testing-tox-blue)
+![Pull request to: dev](https://img.shields.io/badge/Pull_request_to-dev-blue)
+
+This note details how to contribute to `poli`.
+
+## Forking and making pull requests
+
+The main development branch is called `dev`. To contribute, we recommend creating a fork of this repository and making changes on your version. Once you are ready to contribute, we expect you to lint and test.
+
+## Linting your changes
+
+We expect you to lint the code you write or modify using `black`.
+
+```bash
+pip install black
+black ./path/to/files
+```
+
+## Testing your changes for `dev`
+
+Since we are testing multiple conda environments, we settled for using a combination of `tox` and `pytest`.
+
+```bash
+pip install tox
+
+# To test linting (from the root of the project)
+tox -c tox.dev.ini -e lint
+
+# To test in the base environment for poli
+tox -c tox.dev.ini -e poli-base-py39
+```
+
+If you want to run tests in all environments, remove `-e poli-base-py39` and just run `tox -c tox.dev.ini`.
+
+## More thorough testing
+
+In many cases, testing with the instructions above should be enough. However, since we are dealing with creating conda environments, the definitive test is to build the Docker image specified in `Dockerfile.test` and run it.
+
+When contributing to the `@master` branch (i.e. to release), we will run these tests.
+
+## Create a pull request to dev
+
+Once all tests pass and you are ready to share your changes, create a pull request to the `dev` branch.
diff --git a/Dockerfile.test b/Dockerfile.test
@@ -0,0 +1,34 @@
+# This dockerfile allows us to run the tests in a container
+FROM --platform=linux/amd64 continuumio/anaconda3:latest
+
+# Set working directory
+WORKDIR /app
+
+# Copying the files from the host to the container
+COPY ./src /app/src
+COPY ./pyproject.toml /app/
+COPY ./setup.cfg /app/
+COPY ./requirements.txt /app/
+COPY ./requirements-dev.txt /app/
+COPY ./tox.ini /app/
+
+# Installing distutils
+RUN apt-get update && \
+    apt-get install build-essential -y && \
+    apt-get install -y python3.9-distutils
+
+# Installing python dependencies
+RUN conda --version
+RUN pip install -r requirements.txt
+RUN pip install -r requirements-dev.txt
+
+# Creating the relevant conda environments
+# For chem
+RUN conda env create --file src/poli/objective_repository/rdkit_qed/environment.yml
+
+# For proteins
+RUN conda env create --file src/poli/objective_repository/foldx_stability/environment.yml
+RUN conda env create --file src/poli/objective_repository/rasp/environment.yml
+
+# Running the tests
+CMD ["tox"]
diff --git a/README.MD b/README.MD
@@ -1,22 +1,18 @@
-# Protein Objective Library (POLi)
+# `poli`, a library for discrete objective functions
 
 [![Testing (conda, python 3.9)](https://github.com/MachineLearningLifeScience/poli/actions/workflows/python-tox-testing-including-conda.yml/badge.svg)](https://github.com/MachineLearningLifeScience/poli/actions/workflows/python-tox-testing-including-conda.yml)
 
-An easy-to-use, plug-and-play library to benchmark protein-related discrete optimization algorithms.
-Primarily, this library provides a way to encapsulate objective functions and their dependencies.
-The main benefit is that this allows to develop optimization algorithms that use (say) tensorflow without having to worry that the objective was written in (say) torch.
+`poli` is an easy-to-use, plug-and-play library to query black-box functions in biology and cheminformatics. Examples include:
+- Computing the **stability** of mutations from a wildtype protein (using `foldx`).
+- Computing the **docking scores** of ligands to proteins (using [`pyscreener`](https://github.com/coleygroup/pyscreener) and [`pytdc`](https://tdcommons.ai/functions/oracles/)).
 
-For any code written by other authors (whether objective function or algorithm) this library allows to benchmark and analyse it without too much interaction.
-
-On purpose, logging is kept at the objective function side.
-This allows easier benchmarking of algorithms from other authors.
-Algorithm-specific logging can be done internally, on the site of the algorithm if necessary.
+When dependencies get tough, this library provides a way to encapsulate objective functions into isolated `conda` environments. The main benefit is that this allows you to develop optimization algorithms that use (say) tensorflow without having to worry about the specific dependencies of the objective function. Moreover, `poli` provides a way to inject logging into the objective function evaluations using observers.
 
 ## Basic usage
 
 ### Installation
 
-Run the following from the main directory (where this README file is also located) to install the package in development mode (that is, modifications to the source code is directly visible to file importing it without the need for reinstallation).
+Run the following from the main directory (where this README file is also located) to install the package in development mode (this way you can modify its contents and have the changes take effect without having to re-install).
 ```
 pip install -e .
 ```
@@ -52,16 +48,40 @@ for _ in range(5):
 
 ```
 
-### Calling objective functions from the repository
+### When you have the right dependencies...
+
+If you have enough dependencies to run an objective function, it will become available. For example, try running `pip install rdkit selfies` followed by the `get_problems()` statement from above:
+
+```bash
+$ pip install rdkit selfies
+$ python -c "from poli.core.registry import get_problems ; print(get_problems())"
+['aloha', 'rdkit_logp', 'rdkit_qed', 'white_noise']
+```
+
+Now that both `rdkit` and `selfies` are in the current environment, problems like computing `logp` and `qed` of SELFIES or SMILES strings become available.
+
+### Calling objective functions in isolated environments
 
-As you might have noticed, you can get a list of the registered problems using the `get_problems` method inside `poli.core.registry`. You can also get a list of objective functions available for installing/registration using `from poli.objective_repository import AVAILABLE_PROBLEM_FACTORIES`:
+To get a list of all available objective functions, you can pass the `include_repository=True` flag to `get_problems`:
 
 ```bash
-$ python -c "from poli.objective_repository import AVAILABLE_PROBLEM_FACTORIES ; print(AVAILABLE_PROBLEM_FACTORIES)"
-'{"white_noise": <WhiteNoiseProblemFactory(L=inf)>, ...}'
+$ python -c "from poli.core.registry import get_problems ; print(get_problems(include_repository=True))"
+['aloha', 'drd3_docking', 'foldx_sasa', 'foldx_stability', ..., 'white_noise']
 ```
 
-If the function isn't there, you may:
-- Install all the required dependencies for running the file. Check the relevant environment under `poli/objective_repository/problem_name/environment.yml`.
-- Implement the problem yourself! An example of how to do this can be found in `poli/examples/a_simple_objective_function_registration`.
+**Most of these objective functions can be run out-of-the-box** in isolated environments. For example, consider computing the synthetic accessibility of a molecule using `pytdc`. This problem is called `sa_tdc` in `poli`, and can easily be run without having the right dependencies installed:
 
+```python
+from poli import objective_factory
+import numpy as np
+
+problem_info, f, x0, y0, run_info = objective_factory.create(
+    name="sa_tdc",
+    force_register=True,
+    string_representation="SELFIES",
+)
+
+x = np.array([["[C]", "[C]", "[C]"]])
+print(f"f({x}) = {f(x)}")
+
+```
diff --git a/examples/a_simple_objective_function_registration/registering_aloha.py b/examples/a_simple_objective_function_registration/registering_aloha.py
@@ -36,7 +36,9 @@ def get_setup_information(self) -> ProblemSetupInformation:
             alphabet=alphabet,
         )
 
-    def create(self, seed: int = 0) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]:
+    def create(
+        self, seed: int = None, **kwargs
+    ) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]:
         problem_info = self.get_setup_information()
         f = OurAlohaBlackBox(info=problem_info)
         x0 = np.array([["A", "L", "O", "O", "F"]])

diff --git a/examples/adding_a_wandb_observer/example_logging_rdkit_qed_using_wandb.py b/examples/adding_a_wandb_observer/example_logging_rdkit_qed_using_wandb.py
@@ -0,0 +1,41 @@
+"""
+This script uses the wandb observer to log some examples of the rdkit_qed objective function.
+
+To run this example, you will need to install wandb:
+
+    pip install wandb
+"""
+
+from pathlib import Path
+
+import numpy as np
+
+from poli import objective_factory
+
+from wandb_observer import WandbObserver
+
+THIS_DIR = Path(__file__).parent.resolve()
+
+if __name__ == "__main__":
+    # Defining the observer
+    observer = WandbObserver()
+
+    # Initializing a QED objective function.
+    alphabet = ["", "[C]", "..."]
+    problem_info, f, x0, y0, run_info = objective_factory.create(
+        name="rdkit_qed",
+        observer=observer,
+        alphabet=alphabet,
+        string_representation="SELFIES",
+        caller_info={"run_id": None, "experiment_id": None},
+    )
+
+    # Logging some examples
+    # The observer will register each call to f.
+    f(np.array([["[C]", "[C]"]]))
+    f(np.array([["[C]", "[C]", "[C]"]]))
+    f(np.array([["[C]", "[C]", "[C]", "[C]"]]))
+
+    # Finishing the observer, which closes the wandb run (the table of
+    # sequences is logged on every call to f).
+    observer.finish()
diff --git a/examples/adding_a_wandb_observer/wandb_observer.py b/examples/adding_a_wandb_observer/wandb_observer.py
@@ -0,0 +1,54 @@
+"""A simple example of how to log objective function calls using wandb.
+
+To run this example, you will need to install wandb:
+
+    pip install wandb
+"""
+
+import numpy as np
+from poli.core.problem_setup_information import ProblemSetupInformation
+import wandb
+
+from poli.core.util.abstract_observer import AbstractObserver
+
+
+class WandbObserver(AbstractObserver):
+    def __init__(self) -> None:
+        # Log into wandb
+        wandb.login()
+
+        # Some variables to keep track of the run
+        self.step = 0
+        self.x_table = wandb.Table(columns=["step", "x", "y"])
+        super().__init__()
+
+    def initialize_observer(
+        self,
+        problem_setup_info: ProblemSetupInformation,
+        caller_info: object,
+        x0: np.ndarray,
+        y0: np.ndarray,
+        seed: int,
+    ) -> object:
+        wandb.init(
+            config={
+                "name": problem_setup_info.name,
+                "max_sequence_length": problem_setup_info.max_sequence_length,
+                "alphabet": problem_setup_info.alphabet,
+                "x0": x0,
+                "y0": y0,
+                "seed": seed,
+            },
+        )
+
+    def observe(self, x: np.ndarray, y: np.ndarray, context=None) -> None:
+        for x_i, y_i in zip(x.tolist(), y.tolist()):
+            self.x_table.add_data(self.step, "".join(x_i), y_i)
+
+        wandb.log({"table of sequences": self.x_table})
+        wandb.log({"y": y}, step=self.step)
+
+        self.step += 1
+
+    def finish(self) -> None:
+        wandb.finish()
diff --git a/examples/adding_an_mlflow_observer/example_logging_rdkit_logp_using_mlflow.py b/examples/adding_an_mlflow_observer/example_logging_rdkit_logp_using_mlflow.py
@@ -0,0 +1,47 @@
+"""
+This script implements an example of how to use a simple
+MLFlow observer (implemented in ./mlflow_observer.py). Running
+this script will create a new experiment in ./mlruns.
+
+To run this example, you will need to install mlflow:
+
+    pip install mlflow
+
+To check its results, you will need to start a ui:
+
+    mlflow ui --backend-store-uri ./mlruns
+"""
+
+from pathlib import Path
+
+import numpy as np
+
+from poli import objective_factory
+
+from mlflow_observer import MlFlowObserver
+
+THIS_DIR = Path(__file__).parent.resolve()
+
+if __name__ == "__main__":
+    # Defining the observer
+    TRACKING_URI = THIS_DIR / "mlruns"
+    observer = MlFlowObserver(tracking_uri=TRACKING_URI)
+
+    # Initializing a logP objective function.
+    alphabet = ["", "[C]", "..."]
+    problem_info, f, x0, y0, run_info = objective_factory.create(
+        name="rdkit_logp",
+        observer=observer,
+        alphabet=alphabet,
+        string_representation="SELFIES",
+        caller_info={"run_id": None, "experiment_id": None},
+    )
+
+    # Logging some examples
+    # The observer will register each call to f.
+    f(np.array([["[C]", "[C]"]]))
+    f(np.array([["[C]", "[C]", "[C]"]]))
+    f(np.array([["[C]", "[C]", "[C]", "[C]"]]))
+
+    # Finishing the observer, which will close the MLFlow run.
+    observer.finish()