diff --git a/MSMBuilder/assigning.py b/MSMBuilder/assigning.py index d08cb8cd..18a7d0d3 100644 --- a/MSMBuilder/assigning.py +++ b/MSMBuilder/assigning.py @@ -7,6 +7,7 @@ import tables import warnings from mdtraj import io +from msmbuilder import MSMLib as msmlib import logging logger = logging.getLogger(__name__) @@ -240,3 +241,53 @@ def streaming_assign_with_checkpoint(metric, project, generators, assignments_pa "-- this function is deprecated"), DeprecationWarning) assign_with_checkpoint(metric, project, generators, assignments_path, distances_path, chunk_size, atom_indices_to_load) + + +def outer_product_assignment(assign1, assign2): + """Combine the results of two clusterings. + + Specifically, if a conformation is in state i after clustering + with metric 1, and it is in state j after clustering with metric 2, + we assign it to a new state (ij) + + This is a naive way of combining two metrics to give a singular + assignment of states. + + Parameters + ---------- + assign1 : np.ndarray + Assignment array 1 + assign2 : np.ndarray + Assignment array 2 + + Returns + ---------- + new_assign : np.ndarray + New assignments array, renumbered from 0 ... N-1, N <= n1 * n2 + """ + + assert assign1.shape == assign2.shape, """Assignments must be + for the same set of trajectories.""" + new_assign = -1 * np.ones_like(assign1, dtype=np.int) + + nstates1 = np.max(assign1) + 1 + nstates2 = np.max(assign2) + 1 + + translations = np.reshape(np.arange(nstates1 * nstates2), + (nstates1, nstates2)) + + ass_shape = assign1.shape + for i in xrange(ass_shape[0]): + for j in xrange(ass_shape[1]): + # TODO: vectorize this loop + if assign1[i, j] == -1: + # No assignment here + assert assign2[i, j] == -1, """Assignments must be for + the same set of trajectories.""" + else: + new_assign[i, j] = translations[assign1[i, j], assign2[i, j]] + + # Renumber states in place + msmlib.renumber_states(new_assign) + return new_assign + diff --git a/MSMBuilder/metrics/__init__.py b/MSMBuilder/metrics/__init__.py index dc097d5f..cd0ed733 100644 --- a/MSMBuilder/metrics/__init__.py +++ b/MSMBuilder/metrics/__init__.py @@ -19,3 +19,4 @@ from .hybrid import Hybrid, HybridPNorm from .projection import RedDimPNorm from .positions import Positions +from .solvent import SolventFp diff --git a/MSMBuilder/metrics/parsers.py b/MSMBuilder/metrics/parsers.py index 1d6e1254..f1eb8a61 100644 --- a/MSMBuilder/metrics/parsers.py +++ b/MSMBuilder/metrics/parsers.py @@ -10,7 +10,7 @@ from msmbuilder.metrics import (RMSD, Dihedral, BooleanContact, AtomPairs, ContinuousContact, AbstractDistanceMetric, Vectorized, - RedDimPNorm, Positions) + RedDimPNorm, Positions, SolventFp) def add_argument(group, *args, **kwargs): @@ -22,8 +22,6 @@ def add_argument(group, *args, **kwargs): kwargs['help'] = d group.add_argument(*args, **kwargs) -################################################################################ - def locate_metric_plugins(name): if not name in ['add_metric_parser', 'construct_metric', 'metric_class']: @@ -39,7 +37,7 @@ def add_metric_parsers(parser): metric_subparser = parser.add_subparsers(dest='metric', description='Available metrics to use.') - #metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric') + # metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric') rmsd = metric_subparser.add_parser('rmsd', description='''RMSD: Root mean square deviation over a set of user defined atoms (typically backbone heavy atoms or alpha carbons). To evaluate the distance @@ -80,7 +78,7 @@ def add_metric_parsers(parser): Furthermore, the sense in which the distance between two residues is computed can be either specified as "CA", "closest", or "closest-heavy", which will respectively compute ("CA") the distance between the residues' alpha carbons, ("closest"), the closest distance between any pair of - atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"), + atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"), or the closest distance between any pair of NON-HYDROGEN atoms i and j such that i belongs to one residue and j to the other residue. This code is executed in parallel on multiple cores (but not multiple boxes) using OMP.''') @@ -106,6 +104,21 @@ def add_metric_parsers(parser): help='which distance metric', choices=AtomPairs.allowable_scipy_metrics) metric_parser_list.append(atompairs) + solventfp = metric_subparser.add_parser('solventfp', description="""SOLVENTFP: For each frame + calculate a 'solvent fingerprint' by considering the weighted pairwise distances + between solute and solvent atoms as in as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8.""") + add_argument(solventfp, '-w', dest='solventfp_wateri', default='WaterIndices.dat', + help='Indices of the solvent (water) atoms') + add_argument(solventfp, '-v', dest='solventfp_proti', default='ProteinIndices.dat', + help='Indices of the solute (protein) atoms') + add_argument(solventfp, '-s', dest='solventfp_sigma', default=0.5, + type=float, help='std. dev. of gaussian kernel') + add_argument(solventfp, '-p', dest='solventfp_p', default=2, + help='p used for metric=minkowski (otherwise ignored)') + add_argument(solventfp, '-m', dest='solventfp_metric', default='euclidean', + help='which distance metric', choices=SolventFp.allowable_scipy_metrics) + metric_parser_list.append(solventfp) + positions = metric_subparser.add_parser('positions', description="""POSITIONS: For each frame we represent the conformation as a vector of atom positions, where the atoms have been aligned to a target structure.""") @@ -121,10 +134,10 @@ def add_metric_parsers(parser): help='which distance metric', choices=Positions.allowable_scipy_metrics) metric_parser_list.append(positions) - tica = metric_subparser.add_parser( 'tica', description=''' + tica = metric_subparser.add_parser('tica', description=''' tICA: This metric is based on a variation of PCA which looks for the slowest d.o.f. in the simulation data. See (Schwantes, C.R., Pande, V.S. JCTC 2013, 9 (4), 2000-09.) - for more details. In addition to these options, you must provide an additional + for more details. In addition to these options, you must provide an additional metric you used to prepare the trajectories in the training step.''') add_argument(tica, '-p', dest='p', help='p value for p-norm') @@ -213,6 +226,15 @@ def construct_metric(args): metric = Positions(target, atom_indices=atom_indices, align_indices=align_indices, metric=args.positions_metric, p=args.positions_p) + elif metric_name == 'solventfp': + proti = np.loadtxt(args.solventfp_proti, np.int) + wateri = np.loadtxt(args.solventfp_wateri, np.int) + metric = SolventFp(solute_indices=proti, + solvent_indices=wateri, + sigma=args.solventfp_sigma, + metric=args.solventfp_metric, + p=args.solventfp_p) + elif metric_name == "tica": tica_obj = tICA.load(args.tica_fn) diff --git a/MSMBuilder/metrics/solvent.py b/MSMBuilder/metrics/solvent.py new file mode 100644 index 00000000..25001712 --- /dev/null +++ b/MSMBuilder/metrics/solvent.py @@ -0,0 +1,106 @@ +from __future__ import print_function, absolute_import, division + +import logging + +from .baseclasses import Vectorized +import mdtraj as md +import numpy as np + + +logger = logging.getLogger(__name__) + + +class SolventFp(Vectorized): + """Distance metric for calculating distances between frames based on their + solvent signature as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8. + """ + + allowable_scipy_metrics = ['braycurtis', 'canberra', 'chebyshev', + 'cityblock', 'correlation', 'cosine', + 'euclidean', 'minkowski', 'sqeuclidean', + 'seuclidean', 'mahalanobis', 'sqmahalanobis'] + + def __init__(self, solute_indices, solvent_indices, sigma, + metric='euclidean', p=2, V=None, VI=None): + """Create a distance metric to capture solvent degrees of freedom + + Parameters + ---------- + solute_indices : ndarray + atom indices of the solute atoms + solvent_indices : ndarray + atom indices of the solvent atoms + sigma : float + width of gaussian kernel + metric : {'braycurtis', 'canberra', 'chebyshev', 'cityblock', + 'correlation', 'cosine', 'euclidean', 'minkowski', + 'sqeuclidean', 'seuclidean', 'mahalanobis', 'sqmahalanobis'} + Distance metric to equip the vector space with. + p : int, optional + p-norm order, used for metric='minkowski' + V : ndarray, optional + variances, used for metric='seuclidean' + VI : ndarray, optional + inverse covariance matrix, used for metric='mahalanobi' + + """ + + # Check input indices + md.utils.ensure_type(solute_indices, dtype=np.int, ndim=1, + name='solute', can_be_none=False) + md.utils.ensure_type(solvent_indices, dtype=np.int, ndim=1, + name='solvent', can_be_none=False) + + super(SolventFp, self).__init__(metric, p, V, VI) + self.solute_indices = solute_indices + self.solvent_indices = solvent_indices + self.sigma = sigma + + def __repr__(self): + "String representation of the object" + return ('metrics.SolventFp(metric=%s, p=%s, sigma=%s)' + % (self.metric, self.p, self.sigma)) + + def prepare_trajectory(self, trajectory): + """Calculate solvent fingerprints + Parameters + ---------- + trajectory : msmbuilder.Trajectory + An MSMBuilder trajectory to prepare + + Returns + ------- + fingerprints : ndarray + A 2D array of fingerprint vectors of + shape (traj_length, protein_atom) + """ + + # Give shorter names to these things + prot_indices = self.solute_indices + water_indices = self.solvent_indices + sigma = self.sigma + + # The result vector + fingerprints = np.zeros((trajectory.n_frames, len(prot_indices))) + + # Check for periodic information + if trajectory.unitcell_lengths is None: + logging.warn('No periodic information found for computing solventfp.') + + for i, prot_i in enumerate(prot_indices): + # For each protein atom, calculate distance to all water + # molecules + atom_pairs = np.empty((len(water_indices), 2)) + atom_pairs[:, 0] = prot_i + atom_pairs[:, 1] = water_indices + # Get a traj_length x n_water_indices vector of distances + distances = md.compute_distances(trajectory, + atom_pairs, + periodic=True) + # Calculate guassian kernel + distances = np.exp(-distances / (2 * sigma * sigma)) + + # Sum over water atoms for all frames + fingerprints[:, i] = np.sum(distances, axis=1) + + return fingerprints diff --git a/scripts/AssignOuter.py b/scripts/AssignOuter.py new file mode 100755 index 00000000..fb3e593a --- /dev/null +++ b/scripts/AssignOuter.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python +# This file is part of MSMBuilder. +# +# Copyright 2014 Stanford University +# +# MSMBuilder is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +#=============================================================================== +# Imports +#=============================================================================== + +import logging + +from mdtraj import io + +from msmbuilder import arglib, assigning +from msmbuilder.metrics import solvent + +#=============================================================================== +# Globals +#=============================================================================== + +logger = logging.getLogger('msmbuilder.scripts.AssignOuter') +parser = arglib.ArgumentParser(description= + """Combine the results of two clusterings. Specifically, if a conformation + is in state i after clustering with metric 1, and it is in state j + after clustering with metric 2, we assign it to a new state (ij) + + This is a naive way of combining two metrics to give a singular + assignment of states.""") +parser.add_argument('assignment1', default='./Data1/Assignments.h5', + help='First assignment file') +parser.add_argument('assignment2', default='./Data2/Assignments.h5', + help='Second assignment file') +parser.add_argument('assignment_out', default='OuterProductAssignments.h5', + help='Output file') + + +def run(assign1_fn, assign2_fn, out_fn): + assign1 = io.loadh(assign1_fn, 'arr_0') + assign2 = io.loadh(assign2_fn, 'arr_0') + + new_assignments = assigning.outer_product_assignment(assign1, assign2) + io.saveh(out_fn, new_assignments) + logger.info('Saved outer product assignments to %s', out_fn) + + +def entry_point(): + args = parser.parse_args() + arglib.die_if_path_exists(args.assignment_out) + + run(args.assignment1, args.assignment2, args.assignment_out) + + +if __name__ == "__main__": + entry_point() diff --git a/scripts/CreateAtomIndices.py b/scripts/CreateAtomIndices.py index adb19022..4f6971a2 100755 --- a/scripts/CreateAtomIndices.py +++ b/scripts/CreateAtomIndices.py @@ -17,7 +17,6 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -from __future__ import print_function import sys import logging import numpy as np @@ -26,21 +25,26 @@ logger = logging.getLogger('msmbuilder.scripts.CreateAtomIndices') + parser = arglib.ArgumentParser( description="Creates an atom indices file for RMSD from a PDB.") parser.add_argument('pdb') parser.add_argument('output', default='AtomIndices.dat') parser.add_argument('atom_type', help='''Atoms to include in index file. + One of four options: (1) minimal (CA, CB, C, N, O, recommended), (2) heavy, - (3) alpha (carbons), or (4) all. Use "all" in cases where protein - nomenclature may be inapproprate, although you may want to define your own - indices in such situations. Note that "heavy" keeps all heavy atoms that + (3) alpha (carbons), (4) water (water oxygens), or (5) all. Use "all" + in cases where protein nomenclature may be inapproprate, although you + may want to define your own indices in such situations. + Note that "heavy" keeps all heavy atoms that are not symmetry equivalent. By symmetry equivalent, we mean atoms identical under an exchange of labels. For example, heavy will exclude - the two pairs of equivalent carbons (CD, CE) in a PHE ring. - Note that AtomIndices.dat should be zero-indexed--that is, a 0 + the two pairs of equivalent carbons (CD, CE) in a PHE ring. + Note that AtomIndices.dat should be zero-indexed--that is, a 0 in AtomIndices.dat corresponds to the first atom in your PDB''', - choices=['minimal', 'heavy', 'alpha', 'all'], default='minimal') + choices=['minimal', 'heavy', 'alpha', 'all', 'water'], + default='minimal') + def run(PDBfn, atomtype): @@ -123,6 +127,7 @@ def run(PDBfn, atomtype): "CNLE": ["N", "CA", "CB", "C", "O", "CG", "CD", "CE"], "NNLE": ["N", "CA", "CB", "C", "O", "CG", "CD", "CE"], "SOL": [], + "HOH": [], "Cl-": [], "Na+": [] } @@ -136,6 +141,13 @@ def run(PDBfn, atomtype): elif atomtype == 'alpha': for key in toKeepDict.keys(): toKeepDict[key] = ["CA"] + elif atomtype == 'water': + # Note: we only keep water oxygens + for key in toKeepDict.keys(): + if key == "SOL" or key == 'HOH': + toKeepDict[key] = ["O"] + else: + toKeepDict[key] = [] elif atomtype == "all": pass else: @@ -152,6 +164,7 @@ def run(PDBfn, atomtype): indices = [a.index for a in pdb.topology.atoms if selector(a)] return np.array(indices) + def entry_point(): args = parser.parse_args() arglib.die_if_path_exists(args.output) @@ -161,4 +174,3 @@ def entry_point(): if __name__ == "__main__": entry_point() - diff --git a/scripts/__init__.py b/scripts/__init__.py index 4336fa5e..610f9992 100644 --- a/scripts/__init__.py +++ b/scripts/__init__.py @@ -20,6 +20,7 @@ __all__ = [ 'Assign', 'AssignHierarchical', + 'AssignOuter', 'BACE_Coarse_Graining', 'BuildMSM', 'CalculateClusterRadii',