Skip to content

Commit

Permalink
Merge pull request #12 from Jonas-Verhellen/development
Browse files Browse the repository at this point in the history
  • Loading branch information
jeriek authored Aug 5, 2020
2 parents dfb2f2a + 975351b commit 5777d8a
Show file tree
Hide file tree
Showing 108 changed files with 292 additions and 8,165 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

## Description

Argenomic is an open-source implementation of an illumination algorithm for optimization of small organic molecules. Argenomic provides a holistic overview of how high-performing molecules are distributed throughout a search space. This novel approach produces potent but qualitatively different molecules, illuminates the distribution of optimal solutions, and improves search efficiency compared to both machine learning and traditional genetic algorithm approaches. This implementation is based on an open-source, [graph-based genetic algorithm](https://github.com/jensengroup/GB-GA) for molecular optimisation, and influenced by state-of-the-art concepts from [soft robot design](https://github.com/resibots/pymap_elites). For more information, see the accompanying [blog post](https://jonas-verhellen.github.io/posts/2020/07/argenomic/).
Argenomic is an open-source implementation of an illumination algorithm for optimization of small organic molecules. Argenomic provides a holistic overview of how high-performing molecules are distributed throughout a search space. This novel approach produces potent but qualitatively different molecules, illuminates the distribution of optimal solutions, and improves search efficiency compared to both machine learning and traditional genetic algorithm approaches. This implementation is based on an open-source [graph-based genetic algorithm](https://github.com/jensengroup/GB-GA) for molecular optimization, and influenced by state-of-the-art concepts from [soft robot design](https://github.com/resibots/pymap_elites). For more information, see the accompanying [blog post](https://jonas-verhellen.github.io/posts/2020/07/argenomic/).

<p align="center">
<img src="https://github.com/Jonas-Verhellen/jonas-verhellen.github.io/blob/master/images/video.gif" />
Expand All @@ -17,7 +17,7 @@ Argenomic is an open-source implementation of an illumination algorithm for opti

After installing the software and running the tests, a basic usage example of argenomic (i.e. the rediscovery of Thiotixene) can be run as follows:
```
python3 illuminate.py configuration_file=./configuration/config.yaml generations=100
python3 illuminate.py generations=100
```

### Installing
Expand Down
136 changes: 72 additions & 64 deletions argenomic/infrastructure.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import csv
import hydra
import random
import itertools

Expand Down Expand Up @@ -34,18 +35,20 @@ def update(self, fitness, molecule, descriptor):

class archive:
# Constructor of the CVT archive, shown here as a rendered diff hunk: removed
# and added lines appear together without +/- markers and without indentation.
# NOTE(review): going by the hunk header (-34,18 +35,20), the archive_name line,
# the unconditional KMeans fit and the directory/statistics.csv setup look like
# the removed side, and the cached-centres branch like the added side -- confirm
# against the repository before relying on this.
def __init__(self, archive_config, descriptor_config) -> None:
self.archive_name = archive_config.name
# Number of niches; used below as n_clusters for KMeans.
self.archive_size = archive_config.size
kmeans = KMeans(n_clusters=self.archive_size)
kmeans = kmeans.fit(np.random.rand(archive_config.accuracy, len(descriptor_config.properties)))
self.cvt_centers = kmeans.cluster_centers_
# accuracy = number of uniform random sample points fed to KMeans;
# dimensions = number of descriptor properties (one axis per property).
self.archive_accuracy = archive_config.accuracy
self.archive_dimensions = len(descriptor_config.properties)
# The cache file name is keyed on dimensions and accuracy only.
# NOTE(review): archive_size is not part of the key although it sets n_clusters;
# a cache written for a different size would be loaded unchanged -- confirm intended.
self.cache_string = "cache_{}_{}.csv".format(self.archive_dimensions, self.archive_accuracy)
# presumably resolves the path against the original launch directory rather than
# Hydra's per-run working directory -- verify against the Hydra documentation.
self.cvt_location = hydra.utils.to_absolute_path("data/cvt/" + self.cache_string)
if os.path.isfile(self.cvt_location):
# Reuse previously computed centres instead of re-running KMeans.
self.cvt_centers = np.loadtxt(self.cvt_location)
else:
kmeans = KMeans(n_clusters=self.archive_size)
kmeans = kmeans.fit(np.random.rand(archive_config.accuracy, self.archive_dimensions))
self.cvt_centers = kmeans.cluster_centers_
# NOTE(review): assumes the "data/cvt/" directory already exists; nothing here creates it.
np.savetxt(self.cvt_location, self.cvt_centers)
# KD-tree over the centres; one `elite` slot is created per centre, indexed from 0.
self.cvt = KDTree(self.cvt_centers, metric='euclidean')
self.elites = [elite(index, cvt_center) for index, cvt_center in enumerate(self.cvt_centers, start=0)]
if not os.path.isdir(self.archive_name):
os.mkdir(self.archive_name)
# Mode 'w' truncates any existing statistics file and writes a timestamped header line.
with open('{}/statistics.csv'.format(self.archive_name), 'w') as file:
file.write("## Argenomic Statistics File: {} \n".format(datetime.now()))
# Redundant: the with-block already closes the file on exit.
file.close()
return None

def cvt_index(self, descriptor: List[float]) -> int:
Expand All @@ -72,15 +75,20 @@ def sample_pairs(self, size: int) -> List[Tuple[Chem.Mol, Chem.Mol]]:
# Dumps the current elites (SMILES, descriptors, fitnesses as returned by
# self.elites_data()) to a per-generation CSV file without the index column.
# Two to_csv lines are shown because this is a rendered diff: one writes under
# self.archive_name, the other to a bare relative file name.
# NOTE(review): presumably the first is the removed line and the second the added
# one, with the output directory now chosen by the process working directory -- confirm.
def store_archive(self, generation: float) -> None:
elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
# Column names of the resulting CSV: elites, descriptors, fitnesses.
data = {'elites': elites_smiles, 'descriptors': elites_descriptors, 'fitnesses': elites_fitnesses}
pd.DataFrame(data=data).to_csv("{}/archive_{}.csv".format(self.archive_name, generation), index=False)
pd.DataFrame(data=data).to_csv("archive_{}.csv".format(generation), index=False)
return None

# Computes summary statistics of the current elites, appends them as one CSV row
# to a statistics file, and prints them. Rendered diff hunk: the writer targeting
# '{archive_name}/statistics.csv' and the isfile/else variant targeting
# 'statistics.csv' are shown together (presumably old and new side -- confirm).
def store_statistics(self, generation: float) -> None:
elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
# Fraction of the archive's niches that are reported by elites_data().
fractional_size = len(elites_smiles)/self.archive_size
# Row layout: [generation, max fitness, mean fitness, std of fitness, fractional size].
# NOTE(review): np.max raises on an empty sequence; assumes at least one elite exists.
statistics = [generation, np.max(elites_fitnesses), np.mean(elites_fitnesses), np.std(elites_fitnesses), fractional_size]
with open('{}/statistics.csv'.format(self.archive_name), 'a') as file:
csv.writer(file).writerow(statistics)
if os.path.isfile('statistics.csv'):
with open('statistics.csv', 'a') as file:
csv.writer(file).writerow(statistics)
# Redundant: the with-block already closes the file on exit.
file.close()
else:
# NOTE(review): this branch only creates an empty file; the row computed above is
# never written, so the statistics of the first call are missing from the CSV.
with open('statistics.csv', 'w') as file:
file.close()
# Console summary uses the same values by position in `statistics`.
print('Generation: {}, Size: {:.2f}'.format(statistics[0], statistics[4]))
print('Fitness Max: {:.7f}, Mean: {:.7f}, Std: {:.7f}'.format(statistics[1], statistics[2], statistics[3]))
return None
Expand All @@ -99,71 +107,71 @@ class arbiter:
Includes the option to run the structural filters from ChEMBL.
"""
# Loads the structural-alert table, keeps only the rule sets named in
# arbiter_config.rules, and pre-compiles their SMARTS patterns.
# Rendered diff hunk: the five statements appear twice; as shown, the two copies
# differ only in how the CSV path is built ("./data/..." relative path versus
# hydra.utils.to_absolute_path("data/...")).
def __init__(self, arbiter_config) -> None:
self.rules_dict = pd.read_csv("./data/smarts/alert_collection.csv")
# Despite the name, rules_dict is a pandas DataFrame filtered on its rule_set_name column.
self.rules_dict= self.rules_dict[self.rules_dict.rule_set_name.isin(arbiter_config.rules)]
self.rules_list = self.rules_dict["smarts"].values.tolist()
# The "max" column is the number of matches tolerated per pattern (compared with `>` in toxicity()).
self.tolerance_list = pd.to_numeric(self.rules_dict["max"]).values.tolist()
# Compiled once here so toxicity() does not re-parse SMARTS for every molecule.
self.pattern_list = [Chem.MolFromSmarts(smarts) for smarts in self.rules_list]
self.rules_dict = pd.read_csv(hydra.utils.to_absolute_path("data/smarts/alert_collection.csv"))
self.rules_dict= self.rules_dict[self.rules_dict.rule_set_name.isin(arbiter_config.rules)]
self.rules_list = self.rules_dict["smarts"].values.tolist()
self.tolerance_list = pd.to_numeric(self.rules_dict["max"]).values.tolist()
self.pattern_list = [Chem.MolFromSmarts(smarts) for smarts in self.rules_list]

# Filters a list of molecules, keeping (in input order) those for which
# molecule_validity() returns True. Rendered diff hunk: the body is shown twice
# and the two copies are textually identical as rendered.
# NOTE(review): presumably an indentation-only change -- the indentation is not visible here.
def __call__(self, molecules:List[Chem.Mol]) -> List[Chem.Mol]:
"""
Applies the chosen filters (hologenicity, veber_infractions,
ChEMBL structural alerts, ...) to a list of molecules.
"""
filtered_molecules = []
for molecule in molecules:
if self.molecule_validity(molecule):
filtered_molecules.append(molecule)
return filtered_molecules
"""
Applies the chosen filters (hologenicity, veber_infractions,
ChEMBL structural alerts, ...) to a list of molecules.
"""
filtered_molecules = []
for molecule in molecules:
if self.molecule_validity(molecule):
filtered_molecules.append(molecule)
return filtered_molecules

# Returns True when the molecule triggers none of the filters. Rendered diff
# hunk: the body is shown twice and the copies are textually identical as rendered.
def molecule_validity(self, molecule: Chem.Mol) -> bool:
"""
Checks if a given molecule passes through the chosen filters (hologenicity,
veber_infractions, ChEMBL structural alerts, ...).
"""
# All three checks are always evaluated; any single hit invalidates the molecule.
toxicity = self.toxicity(molecule)
hologenicity = self.hologenicity(molecule)
veber_infraction = self.veber_infraction(molecule)
validity = not (toxicity or hologenicity or veber_infraction)
# '[R]' is the SMARTS for an atom in a ring. The guard matters because
# ring_infraction() takes max() over the molecule's rings, which would raise
# on a molecule without any ring.
if molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]')):
ring_infraction = self.ring_infraction(molecule)
validity = validity and not (ring_infraction)
return validity
"""
Checks if a given molecule passes through the chosen filters (hologenicity,
veber_infractions, ChEMBL structural alerts, ...).
"""
toxicity = self.toxicity(molecule)
hologenicity = self.hologenicity(molecule)
veber_infraction = self.veber_infraction(molecule)
validity = not (toxicity or hologenicity or veber_infraction)
if molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]')):
ring_infraction = self.ring_infraction(molecule)
validity = validity and not (ring_infraction)
return validity

# Returns True as soon as one structural-alert pattern matches the molecule more
# often than its tolerance (strictly greater), and False if no pattern does.
# Rendered diff hunk: docstring, loop header and return lines appear twice, while
# the `if` line appears once.
def toxicity(self, molecule: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the structural filters.
"""
# pattern_list and tolerance_list are built together in __init__ from the same table rows.
for (pattern, tolerance) in zip(self.pattern_list, self.tolerance_list):
"""
Checks if a given molecule fails the structural filters.
"""
for (pattern, tolerance) in zip(self.pattern_list, self.tolerance_list):
if len(molecule.GetSubstructMatches(pattern)) > tolerance:
return True
return False
return True
return False

# Flags molecules carrying too many halogen atoms: more than 6 F, more than 3 Br,
# or more than 3 Cl. Rendered diff hunk: the body is shown twice and the copies
# are textually identical as rendered.
# NOTE(review): the name is presumably a misspelling of "halogenicity"; the
# identifier is used by molecule_validity() and is left unchanged.
@staticmethod
def hologenicity(molecule: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the hologenicity filters.
"""
# The SMARTS patterns are re-compiled on every call.
fluorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6
bromide_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3
chlorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3
return chlorine_saturation or bromide_saturation or fluorine_saturation
"""
Checks if a given molecule fails the hologenicity filters.
"""
fluorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6
bromide_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3
chlorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3
return chlorine_saturation or bromide_saturation or fluorine_saturation

# Flags undesirable ring features: two consecutive double bonds between ring
# atoms ('[R]=[R]=[R]'), any ring with more than 6 atoms, or a double bond between
# atoms in 3- or 4-membered rings. Rendered diff hunk: the body is shown twice and
# the copies are textually identical as rendered.
@staticmethod
def ring_infraction(molecule: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the ring infraction filters.
"""
ring_allene = molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]'))
# max() over an empty list raises ValueError, so this requires at least one ring;
# molecule_validity() only calls this method after a '[R]' substructure match.
macro_cycle = max([len(j) for j in molecule.GetRingInfo().AtomRings()]) > 6
double_bond_in_small_ring = molecule.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]'))
return ring_allene or macro_cycle or double_bond_in_small_ring
"""
Checks if a given molecule fails the ring infraction filters.
"""
ring_allene = molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]'))
macro_cycle = max([len(j) for j in molecule.GetRingInfo().AtomRings()]) > 6
double_bond_in_small_ring = molecule.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]'))
return ring_allene or macro_cycle or double_bond_in_small_ring

# Flags molecules with more than 10 rotatable bonds, or whose hydrogen-bond
# acceptor and donor counts sum to more than 10. Rendered diff hunk: the body is
# shown twice and the copies are textually identical as rendered.
@staticmethod
def veber_infraction(molecule: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the veber infraction filters.
"""
rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecule) > 10
# `+` binds tighter than `>`: the sum of acceptors and donors is compared with 10.
hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecule) + Lipinski.NumHDonors(molecule) > 10
return rotatable_bond_saturation or hydrogen_bond_saturation
"""
Checks if a given molecule fails the veber infraction filters.
"""
rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecule) > 10
hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecule) + Lipinski.NumHDonors(molecule) > 10
return rotatable_bond_saturation or hydrogen_bond_saturation
3 changes: 2 additions & 1 deletion argenomic/operations.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import hydra
import random
import logging
import numpy as np
Expand All @@ -17,7 +18,7 @@ class mutator:
according to the principles of positional analogue scanning.
"""
# Loads the tab-separated mutation table into self.mutation_data (the visible
# __call__ samples one row from it, weighted by its 'probability' column).
# Rendered diff hunk: two read_csv lines are shown, one with a "./data/..."
# relative path and one going through hydra.utils.to_absolute_path.
# NOTE(review): presumably the first is the removed line and the second the added one -- confirm.
def __init__(self) -> None:
self.mutation_data = pd.read_csv("./data/smarts/mutation_collection.tsv", sep='\t')
self.mutation_data = pd.read_csv(hydra.utils.to_absolute_path("data/smarts/mutation_collection.tsv"), sep='\t')

def __call__(self, molecule:Chem.Mol) -> List[Chem.Mol]:
sampled_mutation = self.mutation_data.sample(n=1, weights='probability').iloc[0]
Expand Down
7 changes: 5 additions & 2 deletions configuration/config.yaml
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
---
data_file: ./data/smiles/guacamol_initial_rediscovery_thiotixene.smi
data_file: data/smiles/guacamol_initial_rediscovery_thiotixene.smi
batch_size: 40
initial_size: 100
workers: 1
threads: 2
generations: 75
archive:
name: ./results/thiotixene
name: thiotixene
size: 150
accuracy: 25000
descriptor:
Expand Down
Loading

0 comments on commit 5777d8a

Please sign in to comment.