diff --git a/README.MD b/README.MD index 8b84bbc8..4e7d8509 100644 --- a/README.MD +++ b/README.MD @@ -1,16 +1,12 @@ -# Protein Objective Library (POLi) +# `poli`, a library for discrete sequence optimization [![Testing (conda, python 3.9)](https://github.com/MachineLearningLifeScience/poli/actions/workflows/python-tox-testing-including-conda.yml/badge.svg)](https://github.com/MachineLearningLifeScience/poli/actions/workflows/python-tox-testing-including-conda.yml) -An easy-to-use, plug-and-play library to benchmark protein-related discrete optimization algorithms. -Primarily, this library provides a way to encapsulate objective functions and their dependencies. -The main benefit is that this allows to develop optimization algorithms that use (say) tensorflow without having to worry that the objective was written in (say) torch. +`poli` is an easy-to-use, plug-and-play library to query black-box functions in biology and cheminformatics. Examples include: +- Computing the **stability** of mutations from a wildtype protein (using `foldx`). +- Computing the **docking scores** of ligands to proteins (using [`pyscreener`]() and [`pytdc`]()). -For any code written by other authors (whether objective function or algorithm) this library allows to benchmark and analyse it without too much interaction. - -On purpose, logging is kept at the objective function side. -This allows easier benchmarking of algorithms from other authors. -Algorithm-specific logging can be done internally, on the site of the algorithm if necessary. +When dependencies get tough, this library provides a way to encapsulate objective functions into isolated `conda` environments. The main benefit is that this allows you to develop optimization algorithms that use (say) tensorflow without having to worry about the specific dependencies of the objective function. Moreover, `poli` provides a way to inject logging into the objective function evaluations using observers. 
## Basic usage @@ -52,16 +48,40 @@ for _ in range(5): ``` -### Calling objective functions from the repository +### When you have the right dependencies... + +If you have enough dependencies to run an objective function, it will become available. For example, try running `pip install rdkit selfies` followed by the `get_problems()` statement from above: + +```bash +$ pip install rdkit selfies +$ python -c "from poli.core.registry import get_problems ; print(get_problems())" +['aloha', 'rdkit_logp', 'rdkit_qed', 'white_noise'] +``` + +Now that both `rdkit` and `selfies` are in the current environment, problems like computing `logp` and `qed` of SELFIES or SMILES strings become available. + +### Calling objective functions in isolated environments -As you might have noticed, you can get a list of the registered problems using the `get_problems` method inside `poli.core.registry`. You can also get a list of objective functions available for installing/registration using `from poli.objective_repository import AVAILABLE_PROBLEM_FACTORIES`: +To get a list of all available objective functions, you can pass the `include_repository=True` flag to `get_problems`: ```bash -$ python -c "from poli.objective_repository import AVAILABLE_PROBLEM_FACTORIES ; print(AVAILABLE_PROBLEM_FACTORIES)" -'{"white_noise": , ...}' +$ python -c "from poli.core.registry import get_problems ; print(get_problems(include_repository=True))" +['aloha', 'drd3_docking', 'foldx_sasa', 'foldx_stability', ..., 'white_noise'] ``` -If the function isn't there, you may: -- Install all the required dependencies for running the file. Check the relevant environment under `poli/objective_repository/problem_name/environment.yml`. -- Implement the problem yourself! An example of how to do this can be found in `poli/examples/a_simple_objective_function_registration`. +**Most of these objective functions can be run out-of-the-box** in isolated environments. 
For example, consider computing the synthetic accessibility of a molecule using `pytdc`. This problem is called `sa_tdc` in `poli`, and can easily be run without having the right dependencies installed: +```python +from poli import objective_factory +import numpy as np + +problem_info, f, x0, y0, run_info = objective_factory.create( + name="sa_tdc", + force_register=True, + string_representation="SELFIES", +) + +x = np.array([["[C]", "[C]", "[C]"]]) +print(f"f({x}) = {f(x)}") + +``` diff --git a/src/poli/core/chemistry/tdc_black_box.py b/src/poli/core/chemistry/tdc_black_box.py index d47fd2da..df369595 100644 --- a/src/poli/core/chemistry/tdc_black_box.py +++ b/src/poli/core/chemistry/tdc_black_box.py @@ -21,9 +21,11 @@ def __init__( oracle_name: str, info: ProblemSetupInformation, batch_size: int = None, + parallelize: bool = False, + num_workers: int = None, from_smiles: bool = True, ): - super().__init__(info, batch_size) + super().__init__(info, batch_size, parallelize, num_workers) self.oracle = Oracle(name=oracle_name) self.from_smiles = from_smiles diff --git a/src/poli/objective_repository/drd3_docking/environment.yml b/src/poli/objective_repository/drd3_docking/environment.yml index a6d5b64f..493671ba 100644 --- a/src/poli/objective_repository/drd3_docking/environment.yml +++ b/src/poli/objective_repository/drd3_docking/environment.yml @@ -1,4 +1,4 @@ -name: poli__lambo +name: poli__tdc channels: - conda-forge - defaults @@ -13,12 +13,7 @@ dependencies: - pip: - "git+https://github.com/MachineLearningLifeScience/poli.git@dev" - biopython==1.81 - - botorch==0.8.5 - - gpytorch==1.10 - - hydra-core==1.1.0.dev6 - python-levenshtein==0.12.2 - - pymoo==0.6.0.1 - - torch==2.0.1 - pandas==2.0.3 - cachetools==5.3.1 - rdkit @@ -29,4 +24,5 @@ dependencies: - configparse - h5py - tqdm - - scikit-learn \ No newline at end of file + - scikit-learn + - networkx \ No newline at end of file diff --git a/src/poli/objective_repository/drd3_docking/register.py 
b/src/poli/objective_repository/drd3_docking/register.py index abdb5c30..dc86d1dd 100644 --- a/src/poli/objective_repository/drd3_docking/register.py +++ b/src/poli/objective_repository/drd3_docking/register.py @@ -14,7 +14,7 @@ from poli.core.abstract_problem_factory import AbstractProblemFactory from poli.core.problem_setup_information import ProblemSetupInformation -from poli.core.util.chemistry.string_to_molecule import translate_selfies_to_smiles +from poli.core.util.chemistry.string_to_molecule import translate_smiles_to_selfies from poli.core.util.seeding import seed_numpy, seed_python @@ -24,6 +24,8 @@ def __init__( self, info: ProblemSetupInformation, batch_size: int = None, + parallelize: bool = False, + num_workers: int = None, from_smiles: bool = True, ): oracle_name = "3pbl_docking" @@ -31,6 +33,8 @@ def __init__( oracle_name=oracle_name, info=info, batch_size=batch_size, + parallelize=parallelize, + num_workers=num_workers, from_smiles=from_smiles, ) @@ -48,6 +52,8 @@ def create( self, seed: int = None, batch_size: int = None, + parallelize: bool = False, + num_workers: int = None, string_representation: str = "SMILES", ) -> Tuple[TDCBlackBox, np.ndarray, np.ndarray]: """ @@ -68,12 +74,14 @@ def create( f = DRD3BlackBox( info=problem_info, batch_size=batch_size, + parallelize=parallelize, + num_workers=num_workers, from_smiles=string_representation.upper() == "SMILES", ) # Initial example (from the TDC docs) x0_smiles = "c1ccccc1" - x0_selfies = translate_selfies_to_smiles([x0_smiles])[0] + x0_selfies = translate_smiles_to_selfies([x0_smiles])[0] if string_representation.upper() == "SMILES": x0 = np.array([list(x0_smiles)]) diff --git a/src/poli/objective_repository/sa_tdc/environment.yml b/src/poli/objective_repository/sa_tdc/environment.yml index a6d5b64f..493671ba 100644 --- a/src/poli/objective_repository/sa_tdc/environment.yml +++ b/src/poli/objective_repository/sa_tdc/environment.yml @@ -1,4 +1,4 @@ -name: poli__lambo +name: poli__tdc channels: 
- conda-forge - defaults @@ -13,12 +13,7 @@ dependencies: - pip: - "git+https://github.com/MachineLearningLifeScience/poli.git@dev" - biopython==1.81 - - botorch==0.8.5 - - gpytorch==1.10 - - hydra-core==1.1.0.dev6 - python-levenshtein==0.12.2 - - pymoo==0.6.0.1 - - torch==2.0.1 - pandas==2.0.3 - cachetools==5.3.1 - rdkit @@ -29,4 +24,5 @@ dependencies: - configparse - h5py - tqdm - - scikit-learn \ No newline at end of file + - scikit-learn + - networkx \ No newline at end of file diff --git a/src/poli/objective_repository/sa_tdc/register.py b/src/poli/objective_repository/sa_tdc/register.py index 182e0874..183c3009 100644 --- a/src/poli/objective_repository/sa_tdc/register.py +++ b/src/poli/objective_repository/sa_tdc/register.py @@ -14,7 +14,7 @@ from poli.core.abstract_problem_factory import AbstractProblemFactory from poli.core.problem_setup_information import ProblemSetupInformation -from poli.core.util.chemistry.string_to_molecule import translate_selfies_to_smiles +from poli.core.util.chemistry.string_to_molecule import translate_smiles_to_selfies from poli.core.util.seeding import seed_numpy, seed_python @@ -24,10 +24,19 @@ def __init__( self, info: ProblemSetupInformation, batch_size: int = None, + parallelize: bool = False, + num_workers: int = None, from_smiles: bool = True, ): oracle_name = "SA" - super().__init__(oracle_name, info, batch_size, from_smiles) + super().__init__( + oracle_name=oracle_name, + info=info, + batch_size=batch_size, + parallelize=parallelize, + num_workers=num_workers, + from_smiles=from_smiles, + ) class SAProblemFactory(AbstractProblemFactory): @@ -43,6 +52,8 @@ def create( self, seed: int = None, batch_size: int = None, + parallelize: bool = False, + num_workers: int = None, string_representation: str = "SMILES", ) -> Tuple[SABlackBox, np.ndarray, np.ndarray]: """ @@ -61,12 +72,14 @@ def create( f = SABlackBox( info=problem_info, batch_size=batch_size, + parallelize=parallelize, + num_workers=num_workers, 
from_smiles=string_representation.upper() == "SMILES", ) # Initial example (from the TDC docs) x0_smiles = "CCNC(=O)c1ccc(NC(=O)N2CC[C@H](C)[C@H](O)C2)c(C)c1" - x0_selfies = translate_selfies_to_smiles([x0_smiles])[0] + x0_selfies = translate_smiles_to_selfies([x0_smiles])[0] # TODO: change for proper tokenization in the SMILES case. if string_representation.upper() == "SMILES": @@ -82,6 +95,6 @@ def create( register_problem( SAProblemFactory(), - conda_environment_name="poli__lambo", + conda_environment_name="poli__tdc", force=True, ) diff --git a/src/poli/tests/test_sa_tdc_registration_on_readme.py b/src/poli/tests/test_sa_tdc_registration_on_readme.py new file mode 100644 index 00000000..ff44323d --- /dev/null +++ b/src/poli/tests/test_sa_tdc_registration_on_readme.py @@ -0,0 +1,19 @@ +def test_minimal_isolation_example(): + """ + Tests the minimal working example from the readme, verbatim. + """ + from poli import objective_factory + import numpy as np + + problem_info, f, x0, y0, run_info = objective_factory.create( + name="sa_tdc", + force_register=True, + string_representation="SELFIES", + ) + + x = np.array([["[C]", "[C]", "[C]"]]) + print(f"f({x}) = {f(x)}") + + +if __name__ == "__main__": + test_minimal_isolation_example() diff --git a/tox.ini b/tox.ini index e19a615a..59c6c010 100644 --- a/tox.ini +++ b/tox.ini @@ -22,6 +22,9 @@ commands = sh -c 'if conda info --envs | grep -q poli__protein; then echo "poli__protein already exists"; else conda env create -f ./src/poli/objective_repository/foldx_stability/environment.yml; fi' sh -c "conda run -n poli__protein python -m pip uninstall -y poli" sh -c "conda run -n poli__protein python -m pip install -e ." + sh -c 'if conda info --envs | grep -q poli__tdc; then echo "poli__tdc already exists"; else conda env create -f ./src/poli/objective_repository/sa_tdc/environment.yml; fi' + sh -c "conda run -n poli__tdc python -m pip uninstall -y poli" + sh -c "conda run -n poli__tdc python -m pip install -e ." 
pytest {tty:--color=yes} -v {posargs} sh -c "rm -rf ~/.poli_objectives"