Skip to content

Commit

Permalink
73 add in domain gfp task (#74)
Browse files Browse the repository at this point in the history
* init gfp task , add assets

* add environment requirements

* add GFPblackbox and problemfactory, add main

* add module import

* lint

---------
  • Loading branch information
RMichae1 authored Oct 3, 2023
1 parent 7ee3fd7 commit 57862c6
Show file tree
Hide file tree
Showing 7 changed files with 58,540 additions and 0 deletions.
8 changes: 8 additions & 0 deletions src/poli/objective_repository/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,11 @@
AVAILABLE_PROBLEM_FACTORIES["drd3_docking"] = DDR3ProblemFactory
except (ImportError, FileNotFoundError):
pass


# Optional registration of the "gfp_select" problem: the factory is added to
# AVAILABLE_PROBLEM_FACTORIES only when its module imports successfully.
try:
    from .gfp_select.register import GFPSelectionProblemFactory

    AVAILABLE_PROBLEM_FACTORIES["gfp_select"] = GFPSelectionProblemFactory
except (ImportError, FileNotFoundError):
    # Deliberate best-effort: if the import fails (ImportError) or a required
    # file is missing (FileNotFoundError), the problem is simply left
    # unregistered instead of breaking the package import.
    pass
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
>avGFP_reference_sequence
AGCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGTCGTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACACTAGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCACGGCATGGACGAGCTGTACAAGTGA
58,418 changes: 58,418 additions & 0 deletions src/poli/objective_repository/gfp_select/assets/gfp_data.csv

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions src/poli/objective_repository/gfp_select/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Conda environment for the gfp_select objective.
# The name must match the `conda_environment_name` passed to
# `register_problem` in gfp_select/register.py ("poli__protein").
name: poli__protein
channels:
  - defaults
dependencies:
  - python=3.9
  - pip
  # Packages below are installed with pip inside the environment.
  - pip:
    - biopython
    - python-levenshtein
    - numpy
    - pandas
    # poli itself, installed from the tip of the master branch (unpinned).
    - "git+https://github.com/MachineLearningLifeScience/poli.git@master"
Empty file.
100 changes: 100 additions & 0 deletions src/poli/objective_repository/gfp_select/register.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from pathlib import Path
from typing import List, Tuple
import numpy as np
import pandas as pd

from poli.core.abstract_black_box import AbstractBlackBox
from poli.core.abstract_problem_factory import AbstractProblemFactory
from poli.core.problem_setup_information import ProblemSetupInformation
from poli.core.util import batch
from poli.core.util.proteins.defaults import AMINO_ACIDS
from poli.core.util.seeding import seed_numpy, seed_python


class GFPBlackBox(AbstractBlackBox):
    """Look-up black box over the GFP data set shipped in ``assets``.

    Evaluating a sequence does not run a model: the sequence is searched
    (case-insensitively) in the ``aaSequence`` column of ``gfp_data.csv`` and
    the ``medianBrightness`` of one matching row is returned.
    """

    def __init__(
        self,
        info: ProblemSetupInformation,
        batch_size: int = None,
        parallelize: bool = False,
        num_workers: int = None,
        seed: int = None,
    ):
        """Load the look-up table and initialise the base black box.

        Parameters
        ----------
        info:
            Problem setup information forwarded to ``AbstractBlackBox``.
        batch_size, parallelize, num_workers:
            Forwarded unchanged to ``AbstractBlackBox.__init__``.
        seed:
            Random state used when one row has to be drawn among several rows
            sharing the same sequence. ``None`` leaves the draw unseeded.
        """
        gfp_df_path = Path(__file__).parent.resolve() / "assets" / "gfp_data.csv"
        self.batch_size = batch_size
        self.seed = seed
        # Only the measured value and the sequence it belongs to are needed.
        self.gfp_lookup_df = pd.read_csv(gfp_df_path)[
            ["medianBrightness", "aaSequence"]
        ]
        super().__init__(info, batch_size, parallelize, num_workers)

    def _black_box(self, x: np.ndarray, context=None) -> np.ndarray:
        """Return the median brightness recorded for each sequence in ``x``.

        Parameters
        ----------
        x:
            Either an array whose rows are sequences of characters (each row
            is joined into one string) or an iterable of sequence strings.
        context:
            Unused.

        Returns
        -------
        np.ndarray
            Array of shape ``(len(x), 1)`` with one brightness per sequence.

        Raises
        ------
        ValueError
            If a sequence is not present in the look-up table.
        """
        if isinstance(x, np.ndarray):
            # Joining does not mutate the input, so no defensive copy is needed.
            x = ["".join(_seq) for _seq in x]

        # Lower-case the reference column once per call instead of once per
        # queried sequence.
        known_sequences = self.gfp_lookup_df.aaSequence.str.lower()

        ys = []
        for _x in x:
            seq_subset = self.gfp_lookup_df[known_sequences == _x.lower()]
            if seq_subset.empty:
                # Fail with an actionable message rather than the opaque
                # sampling error pandas raises on an empty frame.
                raise ValueError(
                    f"Sequence not found in the GFP look-up table: {_x!r}"
                )
            # Multiple matches are possible: draw one of them.
            candidate = seq_subset.sample(n=1, random_state=self.seed)
            ys.append(candidate.medianBrightness.iloc[0])
        # One column per evaluation, i.e. shape (n, 1).
        return np.array(ys).reshape(-1, 1)


class GFPSelectionProblemFactory(AbstractProblemFactory):
    """Factory for the ``gfp_select`` look-up problem."""

    def get_setup_information(self) -> ProblemSetupInformation:
        """
        The problem is set up such that all available sequences
        are provided in x0, however only batch_size amount of observations are known.
        I.e. f(x0[:batch_size]) is returned as f_0 .
        The task is to find the minimum, given that only limited inquiries (batch_size) can be done.
        Given that all X are known it is recommended to use an acquisition function to rank
        and inquire the highest rated sequences with the _black_box.

        NOTE(review): the text above says "minimum" while the values are
        brightness measurements -- confirm the intended optimisation direction.
        """
        problem_setup_info = ProblemSetupInformation(
            name="gfp_select",
            max_sequence_length=237,  # max len of aaSequence
            alphabet=AMINO_ACIDS,
            aligned=True,  # TODO: perhaps add the fact that there is a random state here?
        )
        return problem_setup_info

    def create(
        self,
        seed: int = None,
        batch_size: int = None,
        parallelize: bool = False,
        num_workers: int = None,
    ) -> Tuple[AbstractBlackBox, np.ndarray, np.ndarray]:
        """Build the black box together with its initial data.

        Parameters
        ----------
        seed:
            Seeds numpy and python, the shuffling of the data set, and the
            black box's own row sampling.
        batch_size:
            Number of leading rows of ``x0`` that are evaluated to produce
            ``f_0``. ``None`` slices the whole array, i.e. every sequence is
            evaluated.
        parallelize, num_workers:
            Forwarded unchanged to ``GFPBlackBox``.

        Returns
        -------
        f:
            The ``GFPBlackBox`` instance.
        x0:
            All sequences of the data set in shuffled order, one character per
            cell.
        f_0:
            ``f(x0[:batch_size])``.
        """
        seed_numpy(seed)
        seed_python(seed)
        problem_info = self.get_setup_information()
        f = GFPBlackBox(
            info=problem_info,
            batch_size=batch_size,
            parallelize=parallelize,
            num_workers=num_workers,
            seed=seed,
        )

        # drop=True: the old index is of no use and would otherwise be kept as
        # an extra "index" column.
        randomized_df = f.gfp_lookup_df.sample(frac=1, random_state=seed).reset_index(
            drop=True
        )
        sequences = randomized_df.aaSequence.to_numpy()
        # Create a 2D array for black box evaluation. Shorter sequences are
        # right-padded with "" so that rows of unequal length still form a
        # rectangular array; "".join() over a padded row gives back the
        # original string. With equal-length sequences nothing is padded.
        longest = max((len(_s) for _s in sequences), default=0)
        x0 = np.array([list(_s) + [""] * (longest - len(_s)) for _s in sequences])
        f_0 = f(x0[:batch_size])

        return f, x0, f_0


if __name__ == "__main__":
    # Running this module as a script registers the GFP selection problem
    # under the conda environment it is meant to be executed in.
    from poli.core.registry import register_problem

    register_problem(
        GFPSelectionProblemFactory(),
        conda_environment_name="poli__protein",
    )

0 comments on commit 57862c6

Please sign in to comment.