Add initial examples
SimonBoothroyd committed Dec 3, 2021
1 parent d86bf03 commit 080b424
Showing 9 changed files with 558 additions and 1 deletion.
6 changes: 5 additions & 1 deletion README.md
@@ -20,6 +20,10 @@ conda env create --name nagl --file devtools/conda-envs/test_env.yaml
python setup.py develop
```

## Getting Started

Examples for using this framework can be found in the [`examples`](examples) directory.

## Copyright

- Copyright (c) 2020, Simon Boothroyd
+ Copyright (c) 2021, Simon Boothroyd
42 changes: 42 additions & 0 deletions examples/data-sets/label.sh
@@ -0,0 +1,42 @@
#!/bin/bash
#
# Set the job name and wall time limit
#BSUB -J nagl
#BSUB -W 168:00
#
# Set the output and error output paths.
#BSUB -o %J.o
#BSUB -e %J.e
#
# Set any cpu options.
#BSUB -n 1 -R "span[ptile=1]"
#BSUB -M 16

# Enable conda
. ~/.bashrc

# Use the right conda environment
conda activate nagl

rm -rf labelled && mkdir labelled

# Compute the AM1 partial charges and multi-conformer WBO for each molecule.
for name in "enamine-10240.sdf.gz" \
"enamine-50240.sdf.gz" \
"NCI-Open_2012-05-01.sdf.gz" \
"ChEMBL_eps_78.sdf.gz" \
"ZINC_eps_78.sdf.gz" \
"OpenFF-Industry-Benchmark-Season-1-v1-1.smi"
do

nagl label --input "processed/${name}" \
--output "labelled/${name%%.*}.sqlite" \
--n-workers 250 \
--batch-size 250 \
--worker-type lsf \
--lsf-memory 4 \
--lsf-walltime "32:00" \
--lsf-queue "cpuqueue" \
--lsf-env "nagl"

done
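
Each labelled output is a plain SQLite database, so its contents can be sanity-checked with nothing but the Python standard library. A minimal sketch (the path is illustrative, and the table names will depend on the nagl version):

```python
import sqlite3

# Open one of the labelled stores produced above and list its tables.
connection = sqlite3.connect("labelled/enamine-10240.sqlite")

for (table_name,) in connection.execute(
    "SELECT name FROM sqlite_master WHERE type = 'table'"
):
    print(table_name)

connection.close()
```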
44 changes: 44 additions & 0 deletions examples/data-sets/prepare.sh
@@ -0,0 +1,44 @@
#!/bin/bash
#
# Set the job name and wall time limit
#BSUB -J nagl
#BSUB -W 24:00
#
# Set the output and error output paths.
#BSUB -o %J.o
#BSUB -e %J.e
#
# Set any cpu options.
#BSUB -n 20 -R "span[ptile=20]"
#BSUB -M 2

# Enable conda
. ~/.bashrc

# Use the right conda environment
conda activate nagl

# Filter the NCI and Enamine sets according to the criteria proposed by
# Bleiziffer, Schaller and Riniker (see 10.1021/acs.jcim.7b00663)
rm -rf processed && mkdir processed

for name in "enamine-10240" "enamine-50240" "NCI-Open_2012-05-01"
do

nagl prepare filter --input "raw/${name}.sdf.gz" \
--output "processed/${name}.sdf.gz" \
--strip-ions \
--n-processes 20

done

# We don't need to filter the Riniker sets, as they are provided in their
# processed form, or the OpenFF data set, as it was curated by hand.
for name in "ChEMBL_eps_78.sdf.gz" \
"ZINC_eps_78.sdf.gz" \
"OpenFF-Industry-Benchmark-Season-1-v1-1.smi"
do

cp "raw/${name}" "processed/${name}"

done
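
A quick way to sanity-check the filtered outputs is to count the molecules in each gzipped SDF; every record in an SD file ends with a `$$$$` terminator line. A minimal sketch (the file name is illustrative):

```python
import gzip

# Count molecule records in a gzipped SDF by counting "$$$$" terminators.
with gzip.open("processed/enamine-10240.sdf.gz", "rt") as sdf_file:
    n_molecules = sum(1 for line in sdf_file if line.strip() == "$$$$")

print(f"{n_molecules} molecules")
```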
56 changes: 56 additions & 0 deletions examples/submit-am1-hparams.sh
@@ -0,0 +1,56 @@
#!/bin/bash
#
# Set the job name and wall time limit
#BSUB -J am1[1-486]%60
#BSUB -W 02:00
#
# Set the output and error output paths.
#BSUB -o %J.o
#BSUB -e %J.e
#
# Set any gpu options.
#BSUB -q gpuqueue
#BSUB -gpu num=1:j_exclusive=yes:mode=shared:mps=no:
#
#BSUB -M 5

# Enable conda
. ~/.bashrc

conda activate nagl
conda env export > conda-env-h-params.yml

# Launch my program.
module load cuda/11.0

export batch_size=(256 512)

export n_gcn_layers=(3 4 5)
export n_gcn_hidden_features=(32 64 128)

export n_am1_layers=(2 3 4)
export n_am1_hidden_features=(32 64 128)

export learning_rate=(0.001 0.0001 0.00001)

export indices=( $(
python utilities/job-to-matrix-index.py $LSB_JOBINDEX \
${#batch_size[@]} \
${#n_gcn_layers[@]} \
${#n_gcn_hidden_features[@]} \
${#n_am1_layers[@]} \
${#n_am1_hidden_features[@]} \
${#learning_rate[@]}
) )

echo "MATRIX INDICES=${indices[*]}"

python train-am1-q-model.py --train-set "data-sets/labelled/enamine-50240.sqlite" \
--train-batch-size ${batch_size[${indices[0]}]} \
--val-set "data-sets/labelled/OpenFF-Industry-Benchmark-Season-1-v1-1.sqlite" \
--n-gcn-layers ${n_gcn_layers[${indices[1]}]} \
--n-gcn-hidden-features ${n_gcn_hidden_features[${indices[2]}]} \
--n-am1-layers ${n_am1_layers[${indices[3]}]} \
--n-am1-hidden-features ${n_am1_hidden_features[${indices[4]}]} \
--learning-rate ${learning_rate[${indices[5]}]} \
--n-epochs 175
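
The array size matches the grid exactly: 2 × 3 × 3 × 3 × 3 × 3 = 486 combinations, run at most 60 at a time (`%60`). The `utilities/job-to-matrix-index.py` helper is not included in this commit; a minimal sketch of what it presumably does, unravelling the 1-based `$LSB_JOBINDEX` into one 0-based index per grid dimension (a mixed-radix decomposition):

```python
import sys


def job_to_matrix_index(job_index, dims):
    """Unravel a 1-based job array index into one 0-based index per dimension."""
    flat_index = job_index - 1

    indices = []

    for dim in dims:
        indices.append(flat_index % dim)
        flat_index //= dim

    return indices


if __name__ == "__main__":
    job_index, *dims = (int(value) for value in sys.argv[1:])
    # Print the indices space-separated so the shell can capture them into an array.
    print(" ".join(str(index) for index in job_to_matrix_index(job_index, dims)))
```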
36 changes: 36 additions & 0 deletions examples/submit-am1-prod.sh
@@ -0,0 +1,36 @@
#!/bin/bash
#
# Set the job name and wall time limit
#BSUB -J am1
#BSUB -W 02:00
#
# Set the output and error output paths.
#BSUB -o %J.o
#BSUB -e %J.e
#
# Set any gpu options.
#BSUB -q gpuqueue
#BSUB -gpu num=1:j_exclusive=yes:mode=shared:mps=no:
#
#BSUB -M 5

# Enable conda
. ~/.bashrc

conda activate nagl
conda env export > conda-env.yml

# Launch my program.
module load cuda/11.0

python train-am1-q-model.py --train-set "data-sets/labelled/ChEMBL_eps_78.sqlite" \
--train-set "data-sets/labelled/ZINC_eps_78.sqlite" \
--train-batch-size 256 \
--val-set "data-sets/labelled/enamine-10240.sqlite" \
--test-set "data-sets/labelled/OpenFF-Industry-Benchmark-Season-1-v1-1.sqlite" \
--n-gcn-layers 5 \
--n-gcn-hidden-features 128 \
--n-am1-layers 2 \
--n-am1-hidden-features 64 \
--learning-rate 0.001 \
--n-epochs 400
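
Note that the `#BSUB` directives in these scripts are only honoured when the script is passed to LSF on standard input, e.g. `bsub < examples/submit-am1-prod.sh`.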
131 changes: 131 additions & 0 deletions examples/train-am1-q-model-simple.py
@@ -0,0 +1,131 @@
from typing import Dict

import numpy
import pytorch_lightning as pl
import torch
from openff.toolkit.topology import Molecule

from nagl.datasets import DGLMoleculeDataLoader, DGLMoleculeDataset
from nagl.features import AtomConnectivity, AtomFormalCharge, AtomicElement, BondOrder
from nagl.lightning import DGLMoleculeLightningModel
from nagl.nn import SequentialLayers
from nagl.nn.modules import ConvolutionModule, ReadoutModule
from nagl.nn.pooling import PoolAtomFeatures
from nagl.nn.postprocess import ComputePartialCharges


def label_function(molecule: Molecule) -> Dict[str, torch.Tensor]:
    """Generates a set of train / val / test labels for a given molecule."""
    from simtk import unit

    # Generate a set of ELF10 conformers.
    molecule.generate_conformers(n_conformers=800, rms_cutoff=0.05 * unit.angstrom)
    molecule.apply_elf_conformer_selection()
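    # AM1 Mulliken charges vary with the conformer, so below we compute them
    # for each ELF-selected conformer and average to reduce that dependence.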

    partial_charges = []

    for conformer in molecule.conformers:

        molecule.assign_partial_charges("am1-mulliken", use_conformers=[conformer])

        partial_charges.append(
            molecule.partial_charges.value_in_unit(unit.elementary_charge)
        )

    return {
        "am1-charges": torch.from_numpy(numpy.mean(partial_charges, axis=0)).float()
    }


def main():

    print(torch.seed())

    # Define the atom / bond features of interest.
    atom_features = [
        AtomicElement(["C", "O", "H"]),
        AtomConnectivity(),
        AtomFormalCharge([-1, 0, 1]),
    ]
    bond_features = [
        BondOrder(),
    ]

    # Compute the total length of the input atomic feature vector.
    n_atom_features = sum(len(feature) for feature in atom_features)

    # Load in the training and test data.
    training_smiles = ["CO", "CCO", "CCCO", "CCCCO"]
    training_data = DGLMoleculeDataset.from_smiles(
        training_smiles,
        atom_features,
        bond_features,
        label_function,
        enumerate_resonance=True,
    )
    training_loader = DGLMoleculeDataLoader(
        training_data, batch_size=len(training_smiles), shuffle=False
    )

    test_smiles = [
        "CCCCCCCCCO",
    ]
    test_loader = DGLMoleculeDataLoader(
        DGLMoleculeDataset.from_smiles(
            test_smiles,
            atom_features,
            bond_features,
            label_function,
            enumerate_resonance=True,
        ),
        batch_size=len(test_smiles),
        shuffle=False,
    )

    # Define the model.
    n_gcn_layers = 5
    n_gcn_hidden_features = 128

    n_am1_layers = 2
    n_am1_hidden_features = 64

    learning_rate = 0.001

    model = DGLMoleculeLightningModel(
        convolution_module=ConvolutionModule(
            architecture="SAGEConv",
            in_feats=n_atom_features,
            hidden_feats=[n_gcn_hidden_features] * n_gcn_layers,
        ),
        readout_modules={
            # The keys of the readout modules should correspond to keys in the
            # label dictionary.
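            # The readout MLP below ends in two outputs per atom: the
            # electronegativity- and hardness-style parameters that
            # ComputePartialCharges converts into charges constrained to sum
            # to the molecule's net charge.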
"am1-charges": ReadoutModule(
pooling_layer=PoolAtomFeatures(),
readout_layers=SequentialLayers(
in_feats=n_gcn_hidden_features,
hidden_feats=[n_am1_hidden_features] * n_am1_layers + [2],
activation=["ReLU"] * n_am1_layers + ["Identity"],
),
postprocess_layer=ComputePartialCharges(),
)
},
learning_rate=learning_rate,
)

    print(model)

    # Train the model.
    n_epochs = 100

    n_gpus = 0 if not torch.cuda.is_available() else 1
    print(f"Using {n_gpus} GPUs")

    trainer = pl.Trainer(gpus=n_gpus, min_epochs=n_epochs, max_epochs=n_epochs)

    trainer.fit(model, train_dataloaders=training_loader)
    trainer.test(model, test_dataloaders=test_loader)


if __name__ == "__main__":
    main()
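
The example stops after testing; if the trained weights are wanted for later use, Lightning's standard checkpointing call could be added at the end of `main` (the file name is illustrative):

```python
    # Persist the trained weights alongside the example (file name illustrative).
    trainer.save_checkpoint("am1-q-model-simple.ckpt")
```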