Skip to content

Commit

Permalink
Merge pull request #13 from Jonas-Verhellen/major_update_spring_2021
Browse files Browse the repository at this point in the history
Major update spring 2021
  • Loading branch information
Jonas-Verhellen authored Mar 15, 2021
2 parents 5777d8a + 3855702 commit cbb564c
Show file tree
Hide file tree
Showing 37 changed files with 433 additions and 251 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ Argenomic is an open-source implementation of an illumination algorithm for opti

## Getting Started

After installing the software and running the tests, a basic usage example of argenomic (i.e. the rediscovery of Thiotixene) can be called upon in the following manner:
After installing the software and running the tests, a basic usage example of argenomic (i.e. the rediscovery of Troglitazone) can be called upon in the following manner:
```
python3 illuminate.py generations=100
```
Expand Down Expand Up @@ -61,7 +61,7 @@ Important dependencies of the Argenomic software environment and where to find t

* Jan Jensen for his work in developing and open-sourcing a graph-based genetic algorithm for molecular optimisation, which served as impetus for this project.

* Jean-Baptiste Mouret and Jeff Clune for their breakthrough invention of illumination algorithms, providing a holistic view of high-performing solutions throughout a search space.
* Jean-Baptiste Mouret and Jeff Clune for their breakthrough invention of illumination algorithms, providing a holistic view of high-performing solutions throughout a search space.

* Pat Walters for his scripts indicating how to run structural alerts using the RDKit and ChEMBL, and for his many enlightening medicinal chemistry blog posts.

Expand Down
Binary file added __pycache__/cynosure.cpython-37.pyc
Binary file not shown.
1 change: 1 addition & 0 deletions argenomic/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Binary file added argenomic/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file added argenomic/__pycache__/base.cpython-37.pyc
Binary file not shown.
Binary file modified argenomic/__pycache__/infrastructure.cpython-37.pyc
Binary file not shown.
Binary file modified argenomic/__pycache__/mechanism.cpython-37.pyc
Binary file not shown.
Binary file modified argenomic/__pycache__/operations.cpython-37.pyc
Binary file not shown.
19 changes: 19 additions & 0 deletions argenomic/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from typing import List, Tuple
from dataclasses import dataclass

class Elite:
def __init__(self, index):
self.index = index
self.molecule = None

def update(self, molecule):
if self.molecule is None or (molecule.fitness - self.molecule.fitness) > 0.0:
self.molecule = molecule
return None

@dataclass
class Molecule:
smiles: str
pedigree: Tuple[str, str ,str]
fitness: float = None
descriptor: List[float] = None
161 changes: 92 additions & 69 deletions argenomic/infrastructure.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import numpy as np
import pandas as pd
from typing import List, Tuple
from dataclasses import dataclass

from datetime import datetime
from sklearn.cluster import KMeans
Expand All @@ -18,22 +19,14 @@
rdBase.DisableLog('rdApp.error')
from rdkit.Chem import Lipinski

from argenomic.base import Molecule, Elite

class elite():
def __init__(self, index, descriptor):
self.index = index
self.fitness = 0.0
self.molecule = None
self.descriptor = descriptor

def update(self, fitness, molecule, descriptor):
if self.fitness < fitness:
self.fitness = fitness
self.molecule = molecule
self.descriptor = descriptor
return None

class archive:
class Archive:
"""
A composite class containing the current elite molecules in a CVT tree structure. Allows for processing of
new molecules, sampling of the existing elite molecules, and disk storage of the current state of the archive.
The CVT centers are either loaded from or deposited to cache disk storage.
"""
def __init__(self, archive_config, descriptor_config) -> None:
self.archive_size = archive_config.size
self.archive_accuracy = archive_config.accuracy
Expand All @@ -46,132 +39,162 @@ def __init__(self, archive_config, descriptor_config) -> None:
kmeans = KMeans(n_clusters=self.archive_size)
kmeans = kmeans.fit(np.random.rand(archive_config.accuracy, self.archive_dimensions))
self.cvt_centers = kmeans.cluster_centers_
np.savetxt(self.cvt_location, self.cvt_centers)
np.savetxt(self.cvt_location, self.cvt_centers)
self.cvt = KDTree(self.cvt_centers, metric='euclidean')
self.elites = [elite(index, cvt_center) for index, cvt_center in enumerate(self.cvt_centers, start=0)]
self.elites = [Elite(index) for index, _ in enumerate(self.cvt_centers, start=0)]
return None

def cvt_index(self, descriptor: List[float]) -> int:
"""
Returns CVT index for the niche nearest to the given discriptor.
"""
return self.cvt.query([descriptor], k=1)[1][0][0]

def add_to_archive(self, molecules: List[Chem.Mol], descriptors: List[List[float]], fitnesses: List[float]) -> None:
for molecule, descriptor, fitness in zip(molecules, descriptors, fitnesses):
self.elites[self.cvt_index(descriptor)].update(fitness, molecule, descriptor)
def add_to_archive(self, molecules) -> None:
"""
Takes in a list of molecules and adds them to the archive as prescribed by the MAP-Elites algorithm,
i.e. each niche only contains the most fit molecule. Other molecules are discarded.
"""
for molecule in molecules:
self.elites[self.cvt_index(molecule.descriptor)].update(molecule)
return None

def sample(self, size: int) -> List[Chem.Mol]:
pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0]
"""
Returns a list of elite molecules of the requisted length.
The elite molecules are randomly drawn, weighted by their fitness.
"""
pairs = [(elite.molecule, elite.molecule.fitness) for elite in self.elites if elite.molecule]
molecules, weights = map(list, zip(*pairs))
return random.choices(molecules, k=size, weights=weights)

def sample_pairs(self, size: int) -> List[Tuple[Chem.Mol, Chem.Mol]]:
pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0]
"""
Returns a list of pairs of elite molecules of the requisted length.
The elite molecules are randomly drawn, weighted by their fitness.
"""
pairs = [(elite.molecule, elite.molecule.fitness) for elite in self.elites if elite.molecule]
molecules, weights = map(list, zip(*pairs))
sample_molecules = random.choices(molecules, k=size, weights=weights)
sample_pairs = np.random.choice(list(filter(None, sample_molecules)), size=(size, 2), replace=True)
sample_pairs = [tuple(sample_pair) for sample_pair in sample_pairs]
sample_pairs = [tuple(sample_pair) for sample_pair in sample_pairs]
return sample_pairs

def store_archive(self, generation: float) -> None:
elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
data = {'elites': elites_smiles, 'descriptors': elites_descriptors, 'fitnesses': elites_fitnesses}
pd.DataFrame(data=data).to_csv("archive_{}.csv".format(generation), index=False)
return None

def store_statistics(self, generation: float) -> None:
elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
fractional_size = len(elites_smiles)/self.archive_size
statistics = [generation, np.max(elites_fitnesses), np.mean(elites_fitnesses), np.std(elites_fitnesses), fractional_size]
def store_data(self, generation: float) -> None:
"""
Creates a dataframe representing the archive and writes it to disk. In addtion, basic statistics about
the state of the archive are saved to disk and printed to the IO stream.
"""
archive_data = self.get_archive_data()
fractional_size = len(archive_data["smiles"])/self.archive_size
max_fitness, mean_fitness = np.max(archive_data["fitnesses"]), np.mean(archive_data["fitnesses"])
if os.path.isfile('statistics.csv'):
with open('statistics.csv', 'a') as file:
csv.writer(file).writerow(statistics)
csv.writer(file).writerow([generation, max_fitness, mean_fitness, fractional_size])
file.close()
else:
with open('statistics.csv', 'w') as file:
file.close()
print('Generation: {}, Size: {:.2f}'.format(statistics[0], statistics[4]))
print('Fitness Max: {:.7f}, Mean: {:.7f}, Std: {:.7f}'.format(statistics[1], statistics[2], statistics[3]))
pd.DataFrame(data=archive_data).to_csv("archive_{}.csv".format(generation), index=False)
print('Generation: {}, Size: {:.2f}'.format(generation, fractional_size))
print('Fitness Max: {:.5f}, Fitness Mean: {:.5f}'.format(max_fitness, mean_fitness))
return None

def elites_data(self) -> Tuple[List[str], List[float], List[float]]:
elites_list = [elite for elite in self.elites if elite.molecule]
elites_smiles = [Chem.MolToSmiles(elite.molecule) for elite in elites_list]
elites_descriptors = [elite.descriptor for elite in elites_list]
elites_fitnesses = [elite.fitness for elite in elites_list]
return elites_smiles, elites_descriptors, elites_fitnesses


class arbiter:

def get_archive_data(self) -> None:
elite_indices = [elite.index for elite in self.elites if elite.molecule]
elite_molecules = [elite.molecule for elite in self.elites if elite.molecule]
elites_smiles = [molecule.smiles for molecule in elite_molecules]
elites_pedigree = [molecule.pedigree for molecule in elite_molecules]
elites_descriptors = [molecule.descriptor for molecule in elite_molecules]
elites_fitnesses = [molecule.fitness for molecule in elite_molecules]
archive_data = {'index': elite_indices, 'smiles': elites_smiles, 'pedigree': elites_pedigree, 'descriptors': elites_descriptors, 'fitnesses': elites_fitnesses}
return archive_data

class Arbiter:
"""
A catalog class containing different druglike filters for small molecules.
Includes the option to run the structural filters from ChEMBL.
"""
def __init__(self, arbiter_config) -> None:
self.cache_smiles = []
self.rules_dict = pd.read_csv(hydra.utils.to_absolute_path("data/smarts/alert_collection.csv"))
self.rules_dict= self.rules_dict[self.rules_dict.rule_set_name.isin(arbiter_config.rules)]
self.rules_list = self.rules_dict["smarts"].values.tolist()
self.tolerance_list = pd.to_numeric(self.rules_dict["max"]).values.tolist()
self.pattern_list = [Chem.MolFromSmarts(smarts) for smarts in self.rules_list]

def __call__(self, molecules:List[Chem.Mol]) -> List[Chem.Mol]:
def __call__(self, molecules):
"""
Applies the chosen filters (hologenicity, veber_infractions,
ChEMBL structural alerts, ...) to a list of molecules.
ChEMBL structural alerts, ...) to a list of molecules and removes duplicates.
"""
filtered_molecules = []
molecules = self.unique_molecules(molecules)
for molecule in molecules:
if self.molecule_validity(molecule):
molecular_graph = Chem.MolFromSmiles(molecule.smiles)
if self.molecule_filter(molecular_graph):
filtered_molecules.append(molecule)
return filtered_molecules

def molecule_validity(self, molecule: Chem.Mol) -> bool:
def unique_molecules(self, molecules: List[Molecule]) -> List[Molecule]:
"""
Checks if a molecule in a lost of molcules is duplicated, either in this batch or before.
"""
unique_molecules = []
for molecule in molecules:
if molecule.smiles not in self.cache_smiles:
unique_molecules.append(molecule)
self.cache_smiles.append(molecule.smiles)
return unique_molecules

def molecule_filter(self, molecular_graph: Chem.Mol) -> bool:
"""
Checks if a given molecule passes through the chosen filters (hologenicity,
Checks if a given molecular structure passes through the chosen filters (hologenicity,
veber_infractions, ChEMBL structural alerts, ...).
"""
toxicity = self.toxicity(molecule)
hologenicity = self.hologenicity(molecule)
veber_infraction = self.veber_infraction(molecule)
toxicity = self.toxicity(molecular_graph)
hologenicity = self.hologenicity(molecular_graph)
veber_infraction = self.veber_infraction(molecular_graph)
validity = not (toxicity or hologenicity or veber_infraction)
if molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]')):
ring_infraction = self.ring_infraction(molecule)
if molecular_graph.HasSubstructMatch(Chem.MolFromSmarts('[R]')):
ring_infraction = self.ring_infraction(molecular_graph)
validity = validity and not (ring_infraction)
return validity

def toxicity(self, molecule: Chem.Mol) -> bool:
def toxicity(self, molecular_graph: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the structural filters.
"""
for (pattern, tolerance) in zip(self.pattern_list, self.tolerance_list):
if len(molecule.GetSubstructMatches(pattern)) > tolerance:
if len(molecular_graph.GetSubstructMatches(pattern)) > tolerance:
return True
return False

@staticmethod
def hologenicity(molecule: Chem.Mol) -> bool:
def hologenicity(molecular_graph: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the hologenicity filters.
"""
fluorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6
bromide_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3
chlorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3
fluorine_saturation = len(molecular_graph.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6
bromide_saturation = len(molecular_graph.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3
chlorine_saturation = len(molecular_graph.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3
return chlorine_saturation or bromide_saturation or fluorine_saturation

@staticmethod
def ring_infraction(molecule: Chem.Mol) -> bool:
def ring_infraction(molecular_graph: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the ring infraction filters.
"""
ring_allene = molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]'))
macro_cycle = max([len(j) for j in molecule.GetRingInfo().AtomRings()]) > 6
double_bond_in_small_ring = molecule.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]'))
ring_allene = molecular_graph.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]'))
macro_cycle = max([len(j) for j in molecular_graph.GetRingInfo().AtomRings()]) > 6
double_bond_in_small_ring = molecular_graph.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]'))
return ring_allene or macro_cycle or double_bond_in_small_ring

@staticmethod
def veber_infraction(molecule: Chem.Mol) -> bool:
def veber_infraction(molecular_graph: Chem.Mol) -> bool:
"""
Checks if a given molecule fails the veber infraction filters.
"""
rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecule) > 10
hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecule) + Lipinski.NumHDonors(molecule) > 10
rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecular_graph) > 10
hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecular_graph) + Lipinski.NumHDonors(molecular_graph) > 10
return rotatable_bond_saturation or hydrogen_bond_saturation
58 changes: 30 additions & 28 deletions argenomic/mechanism.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,73 +15,75 @@
from rdkit.Chem import rdMolDescriptors
from rdkit.DataStructs.cDataStructs import TanimotoSimilarity

class descriptor:
class Descriptor:
"""
A strategy class for calculating the descriptor vector of a molecule.
"""
def __init__(self, config_descriptor) -> None:
self.properties = []
self.ranges = config_descriptor.ranges
self.ranges = config_descriptor.ranges
self.property_names = config_descriptor.properties
for name in self.property_names:
module, fuction = name.split(".")
module = getattr(sys.modules[__name__], module)
self.properties.append(getattr(module, fuction))
return None

def __call__(self, molecule: Chem.Mol) -> List[float]:
def __call__(self, molecule) -> None:
"""
Calculating the descriptor vector of a molecule.
Updates the descriptor vector of a molecule.
"""
descriptor = []
molecular_graph = Chem.MolFromSmiles(molecule.smiles)
for property, range in zip(self.properties, self.ranges):
descriptor.append(self.rescale(property(molecule), range))
return descriptor
descriptor.append(self.rescale(property(molecular_graph), range))
molecule.descriptor = descriptor
return molecule

@staticmethod
def rescale(feature: List[float], range: List[float]) -> List[float]:
"""
Rescaling the feature to the unit range.
Rescales the feature to the unit range.
"""
rescaled_feature = (feature - range[0])/(range[1] - range[0])
return rescaled_feature

class fitness:
class Fitness:
"""
A strategy class for calculating the fitness of a molecule.
"""
def __init__(self, config_fitness) -> None:
self.memoized_cache = dict()
self.fingerprint_type = config_fitness.type
self.target = Chem.MolFromSmiles(config_fitness.target)
self.target_fingerprint = self.get_fingerprint(self.target, self.fingerprint_type)
return None

def __call__(self, molecule: Chem.Mol) -> float:
smiles = Chem.MolToSmiles(molecule)
if smiles in self.memoized_cache:
fitness = self.memoized_cache[smiles]
else:
molecule_fingerprint = self.get_fingerprint(molecule, self.fingerprint_type)
fitness = TanimotoSimilarity(self.target_fingerprint, molecule_fingerprint)
self.memoized_cache[smiles] = fitness
return fitness
def __call__(self, molecule) -> None:
"""
Updates the fitness value of a molecule.
"""
molecular_graph = Chem.MolFromSmiles(Chem.CanonSmiles(molecule.smiles))
molecule_fingerprint = self.get_fingerprint(molecular_graph, self.fingerprint_type)
fitness = TanimotoSimilarity(self.target_fingerprint, molecule_fingerprint)
molecule.fitness = fitness
return molecule

def get_fingerprint(self, molecule: Chem.Mol, fingerprint_type: str):
def get_fingerprint(self, molecular_graph: Chem.Mol, fingerprint_type: str):
method_name = 'get_' + fingerprint_type
method = getattr(self, method_name)
if method is None:
raise Exception('{} is not a supported fingerprint type.'.format(fingerprint_type))
return method(molecule)
return method(molecular_graph)

def get_ECFP4(self, molecular_graph: Chem.Mol):
return AllChem.GetMorganFingerprint(molecular_graph, 2)

def get_ECFP4(self, molecule: Chem.Mol):
return AllChem.GetMorganFingerprint(molecule, 2)
def get_ECFP6(self, molecular_graph: Chem.Mol):
return AllChem.GetMorganFingerprint(molecular_graph, 3)

def get_ECFP6(self, molecule: Chem.Mol):
return AllChem.GetMorganFingerprint(molecule, 3)
def get_FCFP4(self, molecular_graph: Chem.Mol):
return AllChem.GetMorganFingerprint(molecular_graph, 2, useFeatures=True)

def get_FCFP4(self, molecule: Chem.Mol):
return AllChem.GetMorganFingerprint(molecule, 2, useFeatures=True)
def get_FCFP6(self, molecular_graph: Chem.Mol):
return AllChem.GetMorganFingerprint(molecular_graph, 3, useFeatures=True)

def get_FCFP6(self, molecule: Chem.Mol):
return AllChem.GetMorganFingerprint(molecule, 3, useFeatures=True)
Loading

0 comments on commit cbb564c

Please sign in to comment.