Merge pull request #13 from Jonas-Verhellen/major_update_spring_2021

Major update spring 2021
Jonas-Verhellen · Mar 15, 2021 · cbb564c · cbb564c
2 parents 5777d8a + 3855702
commit cbb564c
Show file tree

Hide file tree

Showing 37 changed files with 433 additions and 251 deletions.
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ Argenomic is an open-source implementation of an illumination algorithm for opti
 
 ## Getting Started
 
-After installing the software and running the tests, a basic usage example of argenomic (i.e. the rediscovery of Thiotixene) can be called upon in the following manner:
+After installing the software and running the tests, a basic usage example of argenomic (i.e. the rediscovery of Troglitazone) can be called upon in the following manner:
 ```
 python3 illuminate.py generations=100
 ```
@@ -61,7 +61,7 @@ Important dependencies of the Argenomic software environment and where to find t
 
 * Jan Jensen for his work in developing and open-sourcing a graph-based genetic algorithm for molecular optimisation, which served as impetus for this project.
 
-* Jean-Baptiste Mouret and Jeff Clune for their breakthrough invention of illumination algorithms, providing a holistic view of high-performing solutions throughout a search space.  
+* Jean-Baptiste Mouret and Jeff Clune for their breakthrough invention of illumination algorithms, providing a holistic view of high-performing solutions throughout a search space.
 
 * Pat Walters for his scripts indicating how to run structural alerts using the RDKit and ChEMBL, and for his many enlightening medicinal chemistry blog posts.
 

diff --git a/__pycache__/cynosure.cpython-37.pyc b/__pycache__/cynosure.cpython-37.pyc
diff --git a/argenomic/__init__.py b/argenomic/__init__.py
@@ -0,0 +1 @@
+
diff --git a/argenomic/__pycache__/__init__.cpython-37.pyc b/argenomic/__pycache__/__init__.cpython-37.pyc
diff --git a/argenomic/__pycache__/base.cpython-37.pyc b/argenomic/__pycache__/base.cpython-37.pyc
diff --git a/argenomic/__pycache__/infrastructure.cpython-37.pyc b/argenomic/__pycache__/infrastructure.cpython-37.pyc
diff --git a/argenomic/__pycache__/mechanism.cpython-37.pyc b/argenomic/__pycache__/mechanism.cpython-37.pyc
diff --git a/argenomic/__pycache__/operations.cpython-37.pyc b/argenomic/__pycache__/operations.cpython-37.pyc
diff --git a/argenomic/base.py b/argenomic/base.py
@@ -0,0 +1,19 @@
+from typing import List, Tuple
+from dataclasses import dataclass
+
+class Elite:
+    def __init__(self, index):
+        self.index = index
+        self.molecule = None
+
+    def update(self, molecule):
+        if self.molecule is None or (molecule.fitness - self.molecule.fitness) > 0.0:
+            self.molecule = molecule
+        return None
+
+@dataclass
+class Molecule:
+    smiles: str
+    pedigree: Tuple[str, str ,str] 
+    fitness: float = None
+    descriptor: List[float] = None
diff --git a/argenomic/infrastructure.py b/argenomic/infrastructure.py
@@ -7,6 +7,7 @@
 import numpy as np
 import pandas as pd
 from typing import List, Tuple
+from dataclasses import dataclass
 
 from datetime import datetime
 from sklearn.cluster import KMeans
@@ -18,22 +19,14 @@
 rdBase.DisableLog('rdApp.error')
 from rdkit.Chem import Lipinski
 
+from argenomic.base import Molecule, Elite
 
-class elite():
-    def __init__(self, index, descriptor):
-        self.index = index
-        self.fitness = 0.0
-        self.molecule = None
-        self.descriptor = descriptor
-
-    def update(self, fitness, molecule, descriptor):
-        if self.fitness < fitness:
-            self.fitness = fitness
-            self.molecule = molecule
-            self.descriptor = descriptor
-        return None
-
-class archive:
+class Archive:
+    """
+    A composite class containing the current elite molecules in a CVT tree structure. Allows for processing of 
+    new molecules, sampling of the existing elite molecules, and disk storage of the current state of the archive. 
+    The CVT centers are either loaded from or deposited to cache disk storage. 
+    """
     def __init__(self, archive_config, descriptor_config) -> None:
         self.archive_size = archive_config.size
         self.archive_accuracy = archive_config.accuracy
@@ -46,132 +39,162 @@ def __init__(self, archive_config, descriptor_config) -> None:
             kmeans = KMeans(n_clusters=self.archive_size)
             kmeans = kmeans.fit(np.random.rand(archive_config.accuracy, self.archive_dimensions))
             self.cvt_centers = kmeans.cluster_centers_
-            np.savetxt(self.cvt_location, self.cvt_centers)
+            np.savetxt(self.cvt_location, self.cvt_centers)        
         self.cvt = KDTree(self.cvt_centers, metric='euclidean')
-        self.elites = [elite(index, cvt_center) for index, cvt_center in enumerate(self.cvt_centers, start=0)]
+        self.elites = [Elite(index) for index, _ in enumerate(self.cvt_centers, start=0)]
         return None
 
     def cvt_index(self, descriptor: List[float]) -> int:
+        """
+        Returns CVT index for the niche nearest to the given discriptor. 
+        """
         return self.cvt.query([descriptor], k=1)[1][0][0]
 
-    def add_to_archive(self, molecules: List[Chem.Mol], descriptors: List[List[float]], fitnesses: List[float]) -> None:
-        for molecule, descriptor, fitness in zip(molecules, descriptors, fitnesses):
-            self.elites[self.cvt_index(descriptor)].update(fitness, molecule, descriptor)
+    def add_to_archive(self, molecules) -> None:
+        """
+        Takes in a list of molecules and adds them to the archive as prescribed by the MAP-Elites algorithm, 
+        i.e. each niche only contains the most fit molecule. Other molecules are discarded. 
+        """
+        for molecule in molecules:
+            self.elites[self.cvt_index(molecule.descriptor)].update(molecule)
         return None
 
     def sample(self, size: int) -> List[Chem.Mol]:
-        pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0]
+        """
+        Returns a list of elite molecules of the requisted length. 
+        The elite molecules are randomly drawn, weighted by their fitness. 
+        """
+        pairs = [(elite.molecule, elite.molecule.fitness) for elite in self.elites if elite.molecule]
         molecules, weights = map(list, zip(*pairs))
         return random.choices(molecules, k=size, weights=weights)
 
     def sample_pairs(self, size: int) -> List[Tuple[Chem.Mol, Chem.Mol]]:
-        pairs = [(elite.molecule, elite.fitness) for elite in self.elites if elite.fitness > 0.0]
+        """
+        Returns a list of pairs of elite molecules of the requisted length. 
+        The elite molecules are randomly drawn, weighted by their fitness. 
+        """
+        pairs = [(elite.molecule, elite.molecule.fitness) for elite in self.elites if elite.molecule]
         molecules, weights = map(list, zip(*pairs))
         sample_molecules = random.choices(molecules, k=size, weights=weights)
         sample_pairs = np.random.choice(list(filter(None, sample_molecules)), size=(size, 2), replace=True)
-        sample_pairs = [tuple(sample_pair) for sample_pair in sample_pairs]
+        sample_pairs = [tuple(sample_pair) for sample_pair in sample_pairs]       
         return sample_pairs
 
-    def store_archive(self, generation: float) -> None:
-        elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
-        data = {'elites': elites_smiles, 'descriptors': elites_descriptors, 'fitnesses': elites_fitnesses}
-        pd.DataFrame(data=data).to_csv("archive_{}.csv".format(generation), index=False)
-        return None
-
-    def store_statistics(self, generation: float) -> None:
-        elites_smiles, elites_descriptors, elites_fitnesses = self.elites_data()
-        fractional_size = len(elites_smiles)/self.archive_size
-        statistics = [generation, np.max(elites_fitnesses), np.mean(elites_fitnesses), np.std(elites_fitnesses), fractional_size]
+    def store_data(self, generation: float) -> None:
+        """
+        Creates a dataframe representing the archive and writes it to disk. In addtion, basic statistics about 
+        the state of the archive are saved to disk and printed to the IO stream.           
+        """
+        archive_data = self.get_archive_data()
+        fractional_size = len(archive_data["smiles"])/self.archive_size
+        max_fitness, mean_fitness = np.max(archive_data["fitnesses"]), np.mean(archive_data["fitnesses"])
         if os.path.isfile('statistics.csv'):
             with open('statistics.csv', 'a') as file:
-                csv.writer(file).writerow(statistics)
+                csv.writer(file).writerow([generation, max_fitness, mean_fitness, fractional_size])
                 file.close()
         else:
             with open('statistics.csv', 'w') as file:
                 file.close()
-        print('Generation: {}, Size: {:.2f}'.format(statistics[0], statistics[4]))
-        print('Fitness Max: {:.7f}, Mean: {:.7f}, Std: {:.7f}'.format(statistics[1], statistics[2], statistics[3]))
+        pd.DataFrame(data=archive_data).to_csv("archive_{}.csv".format(generation), index=False)
+        print('Generation: {}, Size: {:.2f}'.format(generation, fractional_size))
+        print('Fitness Max: {:.5f}, Fitness Mean: {:.5f}'.format(max_fitness, mean_fitness))
         return None
-
-    def elites_data(self) -> Tuple[List[str], List[float], List[float]]:
-        elites_list = [elite for elite in self.elites if elite.molecule]
-        elites_smiles = [Chem.MolToSmiles(elite.molecule) for elite in elites_list]
-        elites_descriptors = [elite.descriptor for elite in elites_list]
-        elites_fitnesses = [elite.fitness for elite in elites_list]
-        return elites_smiles, elites_descriptors, elites_fitnesses
-
-
-class arbiter:
+
+    def get_archive_data(self) -> None:
+        elite_indices = [elite.index for elite in self.elites if elite.molecule]
+        elite_molecules = [elite.molecule for elite in self.elites if elite.molecule]
+        elites_smiles = [molecule.smiles for molecule in elite_molecules]
+        elites_pedigree = [molecule.pedigree for molecule in elite_molecules]
+        elites_descriptors = [molecule.descriptor for molecule in elite_molecules]
+        elites_fitnesses = [molecule.fitness for molecule in elite_molecules]
+        archive_data = {'index': elite_indices, 'smiles': elites_smiles, 'pedigree': elites_pedigree, 'descriptors': elites_descriptors, 'fitnesses': elites_fitnesses}
+        return archive_data
+
+class Arbiter:
     """
     A catalog class containing different druglike filters for small molecules.
     Includes the option to run the structural filters from ChEMBL.
     """
     def __init__(self, arbiter_config) -> None:
+        self.cache_smiles = []
         self.rules_dict = pd.read_csv(hydra.utils.to_absolute_path("data/smarts/alert_collection.csv"))
         self.rules_dict= self.rules_dict[self.rules_dict.rule_set_name.isin(arbiter_config.rules)]
         self.rules_list = self.rules_dict["smarts"].values.tolist()
         self.tolerance_list = pd.to_numeric(self.rules_dict["max"]).values.tolist()
         self.pattern_list = [Chem.MolFromSmarts(smarts) for smarts in self.rules_list]
 
-    def __call__(self, molecules:List[Chem.Mol]) -> List[Chem.Mol]:
+    def __call__(self, molecules):
         """
         Applies the chosen filters (hologenicity, veber_infractions,
-        ChEMBL structural alerts, ...) to a list of molecules.
+        ChEMBL structural alerts, ...) to a list of molecules and removes duplicates.
         """
         filtered_molecules = []
+        molecules = self.unique_molecules(molecules)
         for molecule in molecules:
-            if self.molecule_validity(molecule):
+            molecular_graph = Chem.MolFromSmiles(molecule.smiles)
+            if self.molecule_filter(molecular_graph):
                 filtered_molecules.append(molecule)
         return filtered_molecules
 
-    def molecule_validity(self, molecule: Chem.Mol) -> bool:
+    def unique_molecules(self, molecules: List[Molecule]) -> List[Molecule]:
+        """
+        Checks if a molecule in a lost of molcules is duplicated, either in this batch or before.
+        """
+        unique_molecules = []
+        for molecule in molecules:
+            if molecule.smiles not in self.cache_smiles:
+                unique_molecules.append(molecule)
+                self.cache_smiles.append(molecule.smiles)
+        return unique_molecules
+
+    def molecule_filter(self, molecular_graph: Chem.Mol) -> bool:
         """
-        Checks if a given molecule passes through the chosen filters (hologenicity,
+        Checks if a given molecular structure passes through the chosen filters (hologenicity,
         veber_infractions, ChEMBL structural alerts, ...).
         """
-        toxicity = self.toxicity(molecule)
-        hologenicity = self.hologenicity(molecule)
-        veber_infraction = self.veber_infraction(molecule)
+        toxicity = self.toxicity(molecular_graph)
+        hologenicity = self.hologenicity(molecular_graph)
+        veber_infraction = self.veber_infraction(molecular_graph)
         validity = not (toxicity or hologenicity or veber_infraction)
-        if molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]')):
-            ring_infraction = self.ring_infraction(molecule)
+        if molecular_graph.HasSubstructMatch(Chem.MolFromSmarts('[R]')):
+            ring_infraction = self.ring_infraction(molecular_graph)
             validity = validity and not (ring_infraction)
         return validity
 
-    def toxicity(self, molecule: Chem.Mol) -> bool:
+    def toxicity(self, molecular_graph: Chem.Mol) -> bool:
         """
         Checks if a given molecule fails the structural filters.
         """
         for (pattern, tolerance) in zip(self.pattern_list, self.tolerance_list):
-            if len(molecule.GetSubstructMatches(pattern)) > tolerance:
+            if len(molecular_graph.GetSubstructMatches(pattern)) > tolerance:
                 return True
         return False
 
     @staticmethod
-    def hologenicity(molecule: Chem.Mol) -> bool:
+    def hologenicity(molecular_graph: Chem.Mol) -> bool:
         """
         Checks if a given molecule fails the hologenicity filters.
         """
-        fluorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6
-        bromide_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3
-        chlorine_saturation = len(molecule.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3
+        fluorine_saturation = len(molecular_graph.GetSubstructMatches(Chem.MolFromSmarts('[F]'))) > 6
+        bromide_saturation = len(molecular_graph.GetSubstructMatches(Chem.MolFromSmarts('[Br]'))) > 3
+        chlorine_saturation = len(molecular_graph.GetSubstructMatches(Chem.MolFromSmarts('[Cl]'))) > 3
         return chlorine_saturation or bromide_saturation or fluorine_saturation
 
     @staticmethod
-    def ring_infraction(molecule: Chem.Mol) -> bool:
+    def ring_infraction(molecular_graph: Chem.Mol) -> bool:
         """
         Checks if a given molecule fails the ring infraction filters.
         """
-        ring_allene = molecule.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]'))
-        macro_cycle = max([len(j) for j in molecule.GetRingInfo().AtomRings()]) > 6
-        double_bond_in_small_ring = molecule.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]'))
+        ring_allene = molecular_graph.HasSubstructMatch(Chem.MolFromSmarts('[R]=[R]=[R]'))
+        macro_cycle = max([len(j) for j in molecular_graph.GetRingInfo().AtomRings()]) > 6
+        double_bond_in_small_ring = molecular_graph.HasSubstructMatch(Chem.MolFromSmarts('[r3,r4]=[r3,r4]'))
         return ring_allene or macro_cycle or double_bond_in_small_ring
 
     @staticmethod
-    def veber_infraction(molecule: Chem.Mol) -> bool:
+    def veber_infraction(molecular_graph: Chem.Mol) -> bool:
         """
         Checks if a given molecule fails the veber infraction filters.
         """
-        rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecule) > 10
-        hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecule) + Lipinski.NumHDonors(molecule) > 10
+        rotatable_bond_saturation = Lipinski.NumRotatableBonds(molecular_graph) > 10
+        hydrogen_bond_saturation = Lipinski.NumHAcceptors(molecular_graph) + Lipinski.NumHDonors(molecular_graph) > 10
         return rotatable_bond_saturation or hydrogen_bond_saturation
diff --git a/argenomic/mechanism.py b/argenomic/mechanism.py
@@ -15,73 +15,75 @@
 from rdkit.Chem import rdMolDescriptors
 from rdkit.DataStructs.cDataStructs import TanimotoSimilarity
 
-class descriptor:
+class Descriptor:
     """
     A strategy class for calculating the descriptor vector of a molecule.
     """
     def __init__(self, config_descriptor) -> None:
         self.properties = []
-        self.ranges = config_descriptor.ranges
+        self.ranges = config_descriptor.ranges   
         self.property_names = config_descriptor.properties
         for name in self.property_names:
             module, fuction = name.split(".")
             module = getattr(sys.modules[__name__], module)
             self.properties.append(getattr(module, fuction))
         return None
 
-    def __call__(self, molecule: Chem.Mol) -> List[float]:
+    def __call__(self, molecule) -> None:
         """
-        Calculating the descriptor vector of a molecule.
+        Updates the descriptor vector of a molecule.
         """
         descriptor = []
+        molecular_graph = Chem.MolFromSmiles(molecule.smiles)
         for property, range in zip(self.properties, self.ranges):
-            descriptor.append(self.rescale(property(molecule), range))
-        return descriptor
+            descriptor.append(self.rescale(property(molecular_graph), range))
+        molecule.descriptor = descriptor
+        return molecule
 
     @staticmethod
     def rescale(feature: List[float], range: List[float]) -> List[float]:
         """
-        Rescaling the feature to the unit range.
+        Rescales the feature to the unit range.
         """
         rescaled_feature = (feature - range[0])/(range[1] - range[0])
         return rescaled_feature
 
-class fitness:
+class Fitness:
     """
     A strategy class for calculating the fitness of a molecule.
     """
     def __init__(self, config_fitness) -> None:
-        self.memoized_cache = dict()
         self.fingerprint_type = config_fitness.type
         self.target = Chem.MolFromSmiles(config_fitness.target)
         self.target_fingerprint = self.get_fingerprint(self.target, self.fingerprint_type)
         return None
 
-    def __call__(self, molecule: Chem.Mol) -> float:
-        smiles = Chem.MolToSmiles(molecule)
-        if smiles in self.memoized_cache:
-            fitness = self.memoized_cache[smiles]
-        else:
-            molecule_fingerprint = self.get_fingerprint(molecule, self.fingerprint_type)
-            fitness = TanimotoSimilarity(self.target_fingerprint, molecule_fingerprint)
-            self.memoized_cache[smiles] = fitness
-        return fitness
+    def __call__(self, molecule) -> None:
+        """
+        Updates the fitness value of a molecule.
+        """
+        molecular_graph = Chem.MolFromSmiles(Chem.CanonSmiles(molecule.smiles))
+        molecule_fingerprint = self.get_fingerprint(molecular_graph, self.fingerprint_type)
+        fitness = TanimotoSimilarity(self.target_fingerprint, molecule_fingerprint)
+        molecule.fitness = fitness
+        return molecule
 
-    def get_fingerprint(self, molecule: Chem.Mol, fingerprint_type: str):
+    def get_fingerprint(self, molecular_graph: Chem.Mol, fingerprint_type: str):
         method_name = 'get_' + fingerprint_type
         method = getattr(self, method_name)
         if method is None:
             raise Exception('{} is not a supported fingerprint type.'.format(fingerprint_type))
-        return method(molecule)
+        return method(molecular_graph)
+
+    def get_ECFP4(self, molecular_graph: Chem.Mol):
+        return AllChem.GetMorganFingerprint(molecular_graph, 2)
 
-    def get_ECFP4(self, molecule: Chem.Mol):
-        return AllChem.GetMorganFingerprint(molecule, 2)
+    def get_ECFP6(self, molecular_graph: Chem.Mol):
+        return AllChem.GetMorganFingerprint(molecular_graph, 3)
 
-    def get_ECFP6(self, molecule: Chem.Mol):
-        return AllChem.GetMorganFingerprint(molecule, 3)
+    def get_FCFP4(self, molecular_graph: Chem.Mol):
+        return AllChem.GetMorganFingerprint(molecular_graph, 2, useFeatures=True)
 
-    def get_FCFP4(self, molecule: Chem.Mol):
-        return AllChem.GetMorganFingerprint(molecule, 2, useFeatures=True)
+    def get_FCFP6(self, molecular_graph: Chem.Mol):
+        return AllChem.GetMorganFingerprint(molecular_graph, 3, useFeatures=True)
 
-    def get_FCFP6(self, molecule: Chem.Mol):
-        return AllChem.GetMorganFingerprint(molecule, 3, useFeatures=True)