Add initial examples
SimonBoothroyd committed Dec 3, 2021
1 parent d86bf03 commit 080b424
Showing 9 changed files with 558 additions and 1 deletion.
6 changes: 5 additions & 1 deletion README.md
@@ -20,6 +20,10 @@ conda env create --name nagl --file devtools/conda-envs/test_env.yaml
python setup.py develop
```

## Getting Started

Examples for using this framework can be found in the [`examples`](examples) directory.

## Copyright

- Copyright (c) 2020, Simon Boothroyd
+ Copyright (c) 2021, Simon Boothroyd
42 changes: 42 additions & 0 deletions examples/data-sets/label.sh
@@ -0,0 +1,42 @@
#!/bin/bash
#
# Set the job name and wall time limit
#BSUB -J nagl
#BSUB -W 168:00
#
# Set the output and error output paths.
#BSUB -o %J.o
#BSUB -e %J.e
#
# Set any cpu options.
#BSUB -n 1 -R "span[ptile=1]"
#BSUB -M 16

# Enable conda
. ~/.bashrc

# Use the right conda environment
conda activate nagl

rm -rf labelled && mkdir labelled

# Compute the AM1 partial charges and multi-conformer WBO for each molecule.
for name in "enamine-10240.sdf.gz" \
"enamine-50240.sdf.gz" \
"NCI-Open_2012-05-01.sdf.gz" \
"ChEMBL_eps_78.sdf.gz" \
"ZINC_eps_78.sdf.gz" \
"OpenFF-Industry-Benchmark-Season-1-v1-1.smi"
do

nagl label --input "processed/${name}" \
--output "labelled/${name%%.*}.sqlite" \
--n-workers 250 \
--batch-size 250 \
--worker-type lsf \
--lsf-memory 4 \
--lsf-walltime "32:00" \
--lsf-queue "cpuqueue" \
--lsf-env "nagl"

done
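
Each labelled output is a plain SQLite database, so its contents can be sanity-checked with nothing but the Python standard library. A minimal sketch (the path is illustrative, and the table names will depend on the nagl version):

```python
import sqlite3

# Open one of the labelled stores produced above and list its tables.
connection = sqlite3.connect("labelled/enamine-10240.sqlite")

for (table_name,) in connection.execute(
    "SELECT name FROM sqlite_master WHERE type = 'table'"
):
    print(table_name)

connection.close()
```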
44 changes: 44 additions & 0 deletions examples/data-sets/prepare.sh
@@ -0,0 +1,44 @@
#!/bin/bash
#
# Set the job name and wall time limit
#BSUB -J nagl
#BSUB -W 24:00
#
# Set the output and error output paths.
#BSUB -o %J.o
#BSUB -e %J.e
#
# Set any cpu options.
#BSUB -n 20 -R "span[ptile=20]"
#BSUB -M 2

# Enable conda
. ~/.bashrc

# Use the right conda environment
conda activate nagl

# Filter the NCI and Enamine sets according to the criteria proposed by
# Bleiziffer, Schaller and Riniker (see 10.1021/acs.jcim.7b00663)
rm -rf processed && mkdir processed

for name in "enamine-10240" "enamine-50240" "NCI-Open_2012-05-01"
do

nagl prepare filter --input "raw/${name}.sdf.gz" \
--output "processed/${name}.sdf.gz" \
--strip-ions \
--n-processes 20

done

# We don't need to filter the Riniker sets, as they are provided in their
# processed form, or the OpenFF data set, as it was curated by hand.
for name in "ChEMBL_eps_78.sdf.gz" \
"ZINC_eps_78.sdf.gz" \
"OpenFF-Industry-Benchmark-Season-1-v1-1.smi"
do

cp "raw/${name}" "processed/${name}"

done
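
A quick way to sanity-check the filtered outputs is to count the molecules in each gzipped SDF; every record in an SD file ends with a `$$$$` terminator line. A minimal sketch (the file name is illustrative):

```python
import gzip

# Count molecule records in a gzipped SDF by counting "$$$$" terminators.
with gzip.open("processed/enamine-10240.sdf.gz", "rt") as sdf_file:
    n_molecules = sum(1 for line in sdf_file if line.strip() == "$$$$")

print(f"{n_molecules} molecules")
```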
56 changes: 56 additions & 0 deletions examples/submit-am1-hparams.sh
@@ -0,0 +1,56 @@
#!/bin/bash
#
# Set the job name and wall time limit
#BSUB -J am1[1-486]%60
#BSUB -W 02:00
#
# Set the output and error output paths.
#BSUB -o %J.o
#BSUB -e %J.e
#
# Set any gpu options.
#BSUB -q gpuqueue
#BSUB -gpu num=1:j_exclusive=yes:mode=shared:mps=no:
#
#BSUB -M 5

# Enable conda
. ~/.bashrc

conda activate nagl
conda env export > conda-env-h-params.yml

# Launch my program.
module load cuda/11.0

export batch_size=(256 512)

export n_gcn_layers=(3 4 5)
export n_gcn_hidden_features=(32 64 128)

export n_am1_layers=(2 3 4)
export n_am1_hidden_features=(32 64 128)

export learning_rate=(0.001 0.0001 0.00001)

export indices=( $(
python utilities/job-to-matrix-index.py $LSB_JOBINDEX \
${#batch_size[@]} \
${#n_gcn_layers[@]} \
${#n_gcn_hidden_features[@]} \
${#n_am1_layers[@]} \
${#n_am1_hidden_features[@]} \
${#learning_rate[@]}
) )

echo "MATRIX INDICES=${indices[*]}"

python train-am1-q-model.py --train-set "data-sets/labelled/enamine-50240.sqlite" \
--train-batch-size ${batch_size[${indices[0]}]} \
--val-set "data-sets/labelled/OpenFF-Industry-Benchmark-Season-1-v1-1.sqlite" \
--n-gcn-layers ${n_gcn_layers[${indices[1]}]} \
--n-gcn-hidden-features ${n_gcn_hidden_features[${indices[2]}]} \
--n-am1-layers ${n_am1_layers[${indices[3]}]} \
--n-am1-hidden-features ${n_am1_hidden_features[${indices[4]}]} \
--learning-rate ${learning_rate[${indices[5]}]} \
--n-epochs 175
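
The array size matches the grid exactly: 2 × 3 × 3 × 3 × 3 × 3 = 486 combinations, run at most 60 at a time (`%60`). The `utilities/job-to-matrix-index.py` helper is not included in this commit; a minimal sketch of what it presumably does, unravelling the 1-based `$LSB_JOBINDEX` into one 0-based index per grid dimension (a mixed-radix decomposition):

```python
import sys


def job_to_matrix_index(job_index, dims):
    """Unravel a 1-based job array index into one 0-based index per dimension."""
    flat_index = job_index - 1

    indices = []

    for dim in dims:
        indices.append(flat_index % dim)
        flat_index //= dim

    return indices


if __name__ == "__main__":
    job_index, *dims = (int(value) for value in sys.argv[1:])
    # Print the indices space-separated so the shell can capture them into an array.
    print(" ".join(str(index) for index in job_to_matrix_index(job_index, dims)))
```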
36 changes: 36 additions & 0 deletions examples/submit-am1-prod.sh
@@ -0,0 +1,36 @@
#!/bin/bash
#
# Set the job name and wall time limit
#BSUB -J am1
#BSUB -W 02:00
#
# Set the output and error output paths.
#BSUB -o %J.o
#BSUB -e %J.e
#
# Set any gpu options.
#BSUB -q gpuqueue
#BSUB -gpu num=1:j_exclusive=yes:mode=shared:mps=no:
#
#BSUB -M 5

# Enable conda
. ~/.bashrc

conda activate nagl
conda env export > conda-env.yml

# Launch my program.
module load cuda/11.0

python train-am1-q-model.py --train-set "data-sets/labelled/ChEMBL_eps_78.sqlite" \
--train-set "data-sets/labelled/ZINC_eps_78.sqlite" \
--train-batch-size 256 \
--val-set "data-sets/labelled/enamine-10240.sqlite" \
--test-set "data-sets/labelled/OpenFF-Industry-Benchmark-Season-1-v1-1.sqlite" \
--n-gcn-layers 5 \
--n-gcn-hidden-features 128 \
--n-am1-layers 2 \
--n-am1-hidden-features 64 \
--learning-rate 0.001 \
--n-epochs 400
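
Note that the `#BSUB` directives in these scripts are only honoured when the script is passed to LSF on standard input, e.g. `bsub < examples/submit-am1-prod.sh`.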
131 changes: 131 additions & 0 deletions examples/train-am1-q-model-simple.py
@@ -0,0 +1,131 @@
from typing import Dict

import numpy
import pytorch_lightning as pl
import torch
from openff.toolkit.topology import Molecule

from nagl.datasets import DGLMoleculeDataLoader, DGLMoleculeDataset
from nagl.features import AtomConnectivity, AtomFormalCharge, AtomicElement, BondOrder
from nagl.lightning import DGLMoleculeLightningModel
from nagl.nn import SequentialLayers
from nagl.nn.modules import ConvolutionModule, ReadoutModule
from nagl.nn.pooling import PoolAtomFeatures
from nagl.nn.postprocess import ComputePartialCharges


def label_function(molecule: Molecule) -> Dict[str, torch.Tensor]:
    """Generates a set of train / val / test labels for a given molecule."""
    from simtk import unit

    # Generate a set of ELF10 conformers.
    molecule.generate_conformers(n_conformers=800, rms_cutoff=0.05 * unit.angstrom)
    molecule.apply_elf_conformer_selection()
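    # AM1 Mulliken charges vary with the conformer, so below we compute them
    # for each ELF-selected conformer and average to reduce that dependence.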

    partial_charges = []

    for conformer in molecule.conformers:

        molecule.assign_partial_charges("am1-mulliken", use_conformers=[conformer])

        partial_charges.append(
            molecule.partial_charges.value_in_unit(unit.elementary_charge)
        )

    return {
        "am1-charges": torch.from_numpy(numpy.mean(partial_charges, axis=0)).float()
    }


def main():

    print(torch.seed())

    # Define the atom / bond features of interest.
    atom_features = [
        AtomicElement(["C", "O", "H"]),
        AtomConnectivity(),
        AtomFormalCharge([-1, 0, 1]),
    ]
    bond_features = [
        BondOrder(),
    ]

    # Compute the total length of the input atomic feature vector.
    n_atom_features = sum(len(feature) for feature in atom_features)

    # Load in the training and test data.
    training_smiles = ["CO", "CCO", "CCCO", "CCCCO"]
    training_data = DGLMoleculeDataset.from_smiles(
        training_smiles,
        atom_features,
        bond_features,
        label_function,
        enumerate_resonance=True,
    )
    training_loader = DGLMoleculeDataLoader(
        training_data, batch_size=len(training_smiles), shuffle=False
    )

    test_smiles = [
        "CCCCCCCCCO",
    ]
    test_loader = DGLMoleculeDataLoader(
        DGLMoleculeDataset.from_smiles(
            test_smiles,
            atom_features,
            bond_features,
            label_function,
            enumerate_resonance=True,
        ),
        batch_size=len(test_smiles),
        shuffle=False,
    )

    # Define the model.
    n_gcn_layers = 5
    n_gcn_hidden_features = 128

    n_am1_layers = 2
    n_am1_hidden_features = 64

    learning_rate = 0.001

    model = DGLMoleculeLightningModel(
        convolution_module=ConvolutionModule(
            architecture="SAGEConv",
            in_feats=n_atom_features,
            hidden_feats=[n_gcn_hidden_features] * n_gcn_layers,
        ),
        readout_modules={
            # The keys of the readout modules should correspond to keys in the
            # label dictionary.
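            # The readout MLP below ends in two outputs per atom: the
            # electronegativity- and hardness-style parameters that
            # ComputePartialCharges converts into charges constrained to sum
            # to the molecule's net charge.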
"am1-charges": ReadoutModule(
pooling_layer=PoolAtomFeatures(),
readout_layers=SequentialLayers(
in_feats=n_gcn_hidden_features,
hidden_feats=[n_am1_hidden_features] * n_am1_layers + [2],
activation=["ReLU"] * n_am1_layers + ["Identity"],
),
postprocess_layer=ComputePartialCharges(),
)
},
learning_rate=learning_rate,
)

    print(model)

    # Train the model.
    n_epochs = 100

    n_gpus = 0 if not torch.cuda.is_available() else 1
    print(f"Using {n_gpus} GPUs")

    trainer = pl.Trainer(gpus=n_gpus, min_epochs=n_epochs, max_epochs=n_epochs)

    trainer.fit(model, train_dataloaders=training_loader)
    trainer.test(model, test_dataloaders=test_loader)


if __name__ == "__main__":
    main()
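
The example stops after testing; if the trained weights are wanted for later use, Lightning's standard checkpointing call could be added at the end of `main` (the file name is illustrative):

```python
    # Persist the trained weights alongside the example (file name illustrative).
    trainer.save_checkpoint("am1-q-model-simple.ckpt")
```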