From b62c543ca1d99e3cb74ecbd331148356d0ff3188 Mon Sep 17 00:00:00 2001
From: Matthew Harrigan <harrigan@stanford.edu>
Date: Fri, 4 Apr 2014 14:20:55 -0700
Subject: [PATCH 1/4] script 1

---
 scripts/AssignOuter.py | 69 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 scripts/AssignOuter.py

diff --git a/scripts/AssignOuter.py b/scripts/AssignOuter.py
new file mode 100644
index 00000000..fb3e593a
--- /dev/null
+++ b/scripts/AssignOuter.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+# This file is part of MSMBuilder.
+#
+# Copyright 2014 Stanford University
+#
+# MSMBuilder is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+#===============================================================================
+# Imports
+#===============================================================================
+
+import logging
+
+from mdtraj import io
+
+from msmbuilder import arglib, assigning
+from msmbuilder.metrics import solvent
+
+#===============================================================================
+# Globals
+#===============================================================================
+
+logger = logging.getLogger('msmbuilder.scripts.AssignOuter')
+parser = arglib.ArgumentParser(description=
+    """Combine the results of two clusterings. Specifically, if a conformation
+    is in state i after clustering with metric 1, and it is in state j
+    after clustering with metric 2, we assign it to a new state (ij)
+
+    This is a naive way of combining two metrics to give a singular
+    assignment of states.""")
+parser.add_argument('assignment1', default='./Data1/Assignments.h5',
+                    help='First assignment file')
+parser.add_argument('assignment2', default='./Data2/Assignments.h5',
+                    help='Second assignment file')
+parser.add_argument('assignment_out', default='OuterProductAssignments.h5',
+                    help='Output file')
+
+
+def run(assign1_fn, assign2_fn, out_fn):
+    assign1 = io.loadh(assign1_fn, 'arr_0')
+    assign2 = io.loadh(assign2_fn, 'arr_0')
+
+    new_assignments = assigning.outer_product_assignment(assign1, assign2)
+    io.saveh(out_fn, new_assignments)
+    logger.info('Saved outer product assignments to %s', out_fn)
+
+
+def entry_point():
+    args = parser.parse_args()
+    arglib.die_if_path_exists(args.assignment_out)
+
+    run(args.assignment1, args.assignment2, args.assignment_out)
+
+
+if __name__ == "__main__":
+    entry_point()

From 435efd12becf0d599e0f62358464b2a83c109731 Mon Sep 17 00:00:00 2001
From: Matthew Harrigan <harrigan@stanford.edu>
Date: Fri, 4 Apr 2014 14:23:34 -0700
Subject: [PATCH 2/4] script 2

---
 scripts/CreateAtomIndices.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/scripts/CreateAtomIndices.py b/scripts/CreateAtomIndices.py
index adb19022..4f6971a2 100755
--- a/scripts/CreateAtomIndices.py
+++ b/scripts/CreateAtomIndices.py
@@ -17,7 +17,6 @@
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
-from __future__ import print_function
 import sys
 import logging
 import numpy as np
@@ -26,21 +25,26 @@
 logger = logging.getLogger('msmbuilder.scripts.CreateAtomIndices')
 
 
+
 parser = arglib.ArgumentParser(
     description="Creates an atom indices file for RMSD from a PDB.")
 parser.add_argument('pdb')
 parser.add_argument('output', default='AtomIndices.dat')
 parser.add_argument('atom_type', help='''Atoms to include in index file.
+
     One of four options: (1) minimal (CA, CB, C, N, O, recommended), (2) heavy,
-    (3) alpha (carbons), or (4) all.  Use "all" in cases where protein
-    nomenclature may be inapproprate, although you may want to define your own
-    indices in such situations.  Note that "heavy" keeps all heavy atoms that
+    (3) alpha (carbons), (4) water (water oxygens), or (5) all.  Use "all"
+    in cases where protein nomenclature may be inapproprate, although you
+    may want to define your own indices in such situations.
+    Note that "heavy" keeps all heavy atoms that
     are not symmetry equivalent.  By symmetry equivalent, we mean atoms
     identical under an exchange of labels.  For example, heavy will exclude
-    the two pairs of equivalent carbons (CD, CE) in a PHE ring.    
-    Note that AtomIndices.dat should be zero-indexed--that is, a 0 
+    the two pairs of equivalent carbons (CD, CE) in a PHE ring.
+    Note that AtomIndices.dat should be zero-indexed--that is, a 0
     in AtomIndices.dat corresponds to the first atom in your PDB''',
-                    choices=['minimal', 'heavy', 'alpha', 'all'], default='minimal')
+                    choices=['minimal', 'heavy', 'alpha', 'all', 'water'],
+                    default='minimal')
+
 
 
 def run(PDBfn, atomtype):
@@ -123,6 +127,7 @@ def run(PDBfn, atomtype):
         "CNLE": ["N", "CA", "CB", "C", "O", "CG", "CD", "CE"],
         "NNLE": ["N", "CA", "CB", "C", "O", "CG", "CD", "CE"],
         "SOL": [],
+        "HOH": [],
         "Cl-": [],
         "Na+": []
     }
@@ -136,6 +141,13 @@ def run(PDBfn, atomtype):
     elif atomtype == 'alpha':
         for key in toKeepDict.keys():
             toKeepDict[key] = ["CA"]
+    elif atomtype == 'water':
+        # Note: we only keep water oxygens
+        for key in toKeepDict.keys():
+            if key == "SOL" or key == 'HOH':
+                toKeepDict[key] = ["O"]
+            else:
+                toKeepDict[key] = []
     elif atomtype == "all":
         pass
     else:
@@ -152,6 +164,7 @@ def run(PDBfn, atomtype):
     indices = [a.index for a in pdb.topology.atoms if selector(a)]
     return np.array(indices)
 
+
 def entry_point():
     args = parser.parse_args()
     arglib.die_if_path_exists(args.output)
@@ -161,4 +174,3 @@ def entry_point():
 
 if __name__ == "__main__":
     entry_point()
-

From 6fc41d3f7e93aa685f401217b865634ab5a5f887 Mon Sep 17 00:00:00 2001
From: Matthew Harrigan <harrigan@stanford.edu>
Date: Fri, 4 Apr 2014 14:34:08 -0700
Subject: [PATCH 3/4] The rest

---
 MSMBuilder/assigning.py       |  51 ++++++++++++++++
 MSMBuilder/metrics/parsers.py |  36 +++++++++---
 MSMBuilder/metrics/solvent.py | 106 ++++++++++++++++++++++++++++++++++
 3 files changed, 186 insertions(+), 7 deletions(-)
 create mode 100644 MSMBuilder/metrics/solvent.py

diff --git a/MSMBuilder/assigning.py b/MSMBuilder/assigning.py
index d08cb8cd..18a7d0d3 100644
--- a/MSMBuilder/assigning.py
+++ b/MSMBuilder/assigning.py
@@ -7,6 +7,7 @@
 import tables
 import warnings
 from mdtraj import io
+from msmbuilder import MSMLib as msmlib
 import logging
 logger = logging.getLogger(__name__)
 
@@ -240,3 +241,53 @@ def streaming_assign_with_checkpoint(metric, project, generators, assignments_pa
                    "-- this function is deprecated"), DeprecationWarning)
     assign_with_checkpoint(metric, project, generators, assignments_path,
                            distances_path, chunk_size, atom_indices_to_load)
+
+
+def outer_product_assignment(assign1, assign2):
+    """Combine the results of two clusterings.
+
+    Specifically, if a conformation is in state i after clustering
+    with metric 1, and it is in state j after clustering with metric 2,
+    we assign it to a new state (ij)
+
+    This is a naive way of combining two metrics to give a singular
+    assignment of states.
+
+    Parameters
+    ----------
+    assign1 : np.ndarray
+        Assignment array 1
+    assign2 : np.ndarray
+        Assignment array 2
+
+    Returns
+    ----------
+    new_assign : np.ndarray
+        New assignments array, renumbered from 0 ... N-1, N <= n1 * n2
+    """
+
+    assert assign1.shape == assign2.shape, """Assignments must be
+        for the same set of trajectories."""
+    new_assign = -1 * np.ones_like(assign1, dtype=np.int)
+
+    nstates1 = np.max(assign1) + 1
+    nstates2 = np.max(assign2) + 1
+
+    translations = np.reshape(np.arange(nstates1 * nstates2),
+                              (nstates1, nstates2))
+
+    ass_shape = assign1.shape
+    for i in xrange(ass_shape[0]):
+        for j in xrange(ass_shape[1]):
+            # TODO: vectorize this loop
+            if assign1[i, j] == -1:
+                # No assignment here
+                assert assign2[i, j] == -1, """Assignments must be for
+                    the same set of trajectories."""
+            else:
+                new_assign[i, j] = translations[assign1[i, j], assign2[i, j]]
+
+    # Renumber states in place
+    msmlib.renumber_states(new_assign)
+    return new_assign
+
diff --git a/MSMBuilder/metrics/parsers.py b/MSMBuilder/metrics/parsers.py
index 1d6e1254..f1eb8a61 100644
--- a/MSMBuilder/metrics/parsers.py
+++ b/MSMBuilder/metrics/parsers.py
@@ -10,7 +10,7 @@
 from msmbuilder.metrics import (RMSD, Dihedral, BooleanContact,
                                 AtomPairs, ContinuousContact,
                                 AbstractDistanceMetric, Vectorized,
-                                RedDimPNorm, Positions)
+                                RedDimPNorm, Positions, SolventFp)
 
 
 def add_argument(group, *args, **kwargs):
@@ -22,8 +22,6 @@ def add_argument(group, *args, **kwargs):
             kwargs['help'] = d
     group.add_argument(*args, **kwargs)
 
-################################################################################
-
 
 def locate_metric_plugins(name):
     if not name in ['add_metric_parser', 'construct_metric', 'metric_class']:
@@ -39,7 +37,7 @@ def add_metric_parsers(parser):
 
     metric_subparser = parser.add_subparsers(dest='metric', description='Available metrics to use.')
 
-    #metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric')
+    # metrics_parsers = parser.add_subparsers(description='Available metrics to use.',dest='metric')
     rmsd = metric_subparser.add_parser('rmsd',
                                        description='''RMSD: Root mean square deviation over a set of user defined atoms
         (typically backbone heavy atoms or alpha carbons). To evaluate the distance
@@ -80,7 +78,7 @@ def add_metric_parsers(parser):
         Furthermore, the sense in which the distance between two residues is computed can be
         either specified as "CA", "closest", or "closest-heavy", which will respectively compute
         ("CA") the distance between the residues' alpha carbons, ("closest"), the closest distance between any pair of
-        atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"), 
+        atoms i and j such that i belongs to one residue and j to the other residue, ("closest-heavy"),
         or the closest distance between any pair of NON-HYDROGEN atoms i and j such that i belongs to
         one residue and j to the other residue. This code is executed in parallel on multiple cores (but
         not multiple boxes) using OMP.''')
@@ -106,6 +104,21 @@ def add_metric_parsers(parser):
                  help='which distance metric', choices=AtomPairs.allowable_scipy_metrics)
     metric_parser_list.append(atompairs)
 
+    solventfp = metric_subparser.add_parser('solventfp', description="""SOLVENTFP: For each frame
+        calculate a 'solvent fingerprint' by considering the weighted pairwise distances
+        between solute and solvent atoms as in as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8.""")
+    add_argument(solventfp, '-w', dest='solventfp_wateri', default='WaterIndices.dat',
+                help='Indices of the solvent (water) atoms')
+    add_argument(solventfp, '-v', dest='solventfp_proti', default='ProteinIndices.dat',
+                help='Indices of the solute (protein) atoms')
+    add_argument(solventfp, '-s', dest='solventfp_sigma', default=0.5,
+                 type=float, help='std. dev. of gaussian kernel')
+    add_argument(solventfp, '-p', dest='solventfp_p', default=2,
+                 help='p used for metric=minkowski (otherwise ignored)')
+    add_argument(solventfp, '-m', dest='solventfp_metric', default='euclidean',
+        help='which distance metric', choices=SolventFp.allowable_scipy_metrics)
+    metric_parser_list.append(solventfp)
+
     positions = metric_subparser.add_parser('positions', description="""POSITIONS: For each frame
         we represent the conformation as a vector of atom positions, where the atoms have been
         aligned to a target structure.""")
@@ -121,10 +134,10 @@ def add_metric_parsers(parser):
                  help='which distance metric', choices=Positions.allowable_scipy_metrics)
     metric_parser_list.append(positions)
 
-    tica = metric_subparser.add_parser( 'tica', description='''
+    tica = metric_subparser.add_parser('tica', description='''
         tICA: This metric is based on a variation of PCA which looks for the slowest d.o.f.
         in the simulation data. See (Schwantes, C.R., Pande, V.S. JCTC 2013, 9 (4), 2000-09.)
-        for more details. In addition to these options, you must provide an additional 
+        for more details. In addition to these options, you must provide an additional
         metric you used to prepare the trajectories in the training step.''')
 
     add_argument(tica, '-p', dest='p', help='p value for p-norm')
@@ -213,6 +226,15 @@ def construct_metric(args):
         metric = Positions(target, atom_indices=atom_indices, align_indices=align_indices,
                            metric=args.positions_metric, p=args.positions_p)
 
+    elif metric_name == 'solventfp':
+        proti = np.loadtxt(args.solventfp_proti, np.int)
+        wateri = np.loadtxt(args.solventfp_wateri, np.int)
+        metric = SolventFp(solute_indices=proti,
+                           solvent_indices=wateri,
+                           sigma=args.solventfp_sigma,
+                           metric=args.solventfp_metric,
+                           p=args.solventfp_p)
+
     elif metric_name == "tica":
         tica_obj = tICA.load(args.tica_fn)
 
diff --git a/MSMBuilder/metrics/solvent.py b/MSMBuilder/metrics/solvent.py
new file mode 100644
index 00000000..c31f8d7d
--- /dev/null
+++ b/MSMBuilder/metrics/solvent.py
@@ -0,0 +1,106 @@
+from __future__ import print_function, absolute_import, division
+
+import logging
+
+from baseclasses import Vectorized
+import mdtraj as md
+import numpy as np
+
+
+logger = logging.getLogger(__name__)
+
+
+class SolventFp(Vectorized):
+    """Distance metric for calculating distances between frames based on their
+    solvent signature as in Gu et al. BMC Bioinformatics 2013, 14(Suppl 2):S8.
+    """
+
+    allowable_scipy_metrics = ['braycurtis', 'canberra', 'chebyshev',
+                               'cityblock', 'correlation', 'cosine',
+                               'euclidean', 'minkowski', 'sqeuclidean',
+                               'seuclidean', 'mahalanobis', 'sqmahalanobis']
+
+    def __init__(self, solute_indices, solvent_indices, sigma,
+                 metric='euclidean', p=2, V=None, VI=None):
+        """Create a distance metric to capture solvent degrees of freedom
+
+        Parameters
+        ----------
+        solute_indices : ndarray
+            atom indices of the solute atoms
+        solvent_indices : ndarray
+            atom indices of the solvent atoms
+        sigma : float
+                width of gaussian kernel
+        metric : {'braycurtis', 'canberra', 'chebyshev', 'cityblock',
+                  'correlation', 'cosine', 'euclidean', 'minkowski',
+                  'sqeuclidean', 'seuclidean', 'mahalanobis', 'sqmahalanobis'}
+            Distance metric to equip the vector space with.
+        p : int, optional
+            p-norm order, used for metric='minkowski'
+        V : ndarray, optional
+            variances, used for metric='seuclidean'
+        VI : ndarray, optional
+            inverse covariance matrix, used for metric='mahalanobi'
+
+        """
+
+        # Check input indices
+        md.utils.ensure_type(solute_indices, dtype=np.int, ndim=1,
+                             name='solute', can_be_none=False)
+        md.utils.ensure_type(solvent_indices, dtype=np.int, ndim=1,
+                             name='solvent', can_be_none=False)
+
+        super(SolventFp, self).__init__(metric, p, V, VI)
+        self.solute_indices = solute_indices
+        self.solvent_indices = solvent_indices
+        self.sigma = sigma
+
+    def __repr__(self):
+        "String representation of the object"
+        return ('metrics.SolventFp(metric=%s, p=%s, sigma=%s)'
+                % (self.metric, self.p, self.sigma))
+
+    def prepare_trajectory(self, trajectory):
+        """Calculate solvent fingerprints
+        Parameters
+        ----------
+        trajectory : msmbuilder.Trajectory
+            An MSMBuilder trajectory to prepare
+
+        Returns
+        -------
+        fingerprints : ndarray
+            A 2D array of fingerprint vectors of
+            shape (traj_length, protein_atom)
+        """
+
+        # Give shorter names to these things
+        prot_indices = self.solute_indices
+        water_indices = self.solvent_indices
+        sigma = self.sigma
+
+        # The result vector
+        fingerprints = np.zeros((trajectory.n_frames, len(prot_indices)))
+
+        # Check for periodic information
+        if trajectory.unitcell_lengths is None:
+            logging.warn('No periodic information found for computing solventfp.')
+
+        for i, prot_i in enumerate(prot_indices):
+            # For each protein atom, calculate distance to all water
+            # molecules
+            atom_pairs = np.empty((len(water_indices), 2))
+            atom_pairs[:, 0] = prot_i
+            atom_pairs[:, 1] = water_indices
+            # Get a traj_length x n_water_indices vector of distances
+            distances = md.compute_distances(trajectory,
+                                             atom_pairs,
+                                             periodic=True)
+            # Calculate guassian kernel
+            distances = np.exp(-distances / (2 * sigma * sigma))
+
+            # Sum over water atoms for all frames
+            fingerprints[:, i] = np.sum(distances, axis=1)
+
+        return fingerprints

From 1af603b4f7d3020c2bc09cf078328e0f4458ce83 Mon Sep 17 00:00:00 2001
From: Matthew Harrigan <harrigan@stanford.edu>
Date: Fri, 4 Apr 2014 14:48:39 -0700
Subject: [PATCH 4/4] bugfixes

---
 MSMBuilder/metrics/__init__.py | 1 +
 MSMBuilder/metrics/solvent.py  | 2 +-
 scripts/AssignOuter.py         | 0
 scripts/__init__.py            | 1 +
 4 files changed, 3 insertions(+), 1 deletion(-)
 mode change 100644 => 100755 scripts/AssignOuter.py

diff --git a/MSMBuilder/metrics/__init__.py b/MSMBuilder/metrics/__init__.py
index dc097d5f..cd0ed733 100644
--- a/MSMBuilder/metrics/__init__.py
+++ b/MSMBuilder/metrics/__init__.py
@@ -19,3 +19,4 @@
 from .hybrid import Hybrid, HybridPNorm
 from .projection import RedDimPNorm
 from .positions import Positions
+from .solvent import SolventFp
diff --git a/MSMBuilder/metrics/solvent.py b/MSMBuilder/metrics/solvent.py
index c31f8d7d..25001712 100644
--- a/MSMBuilder/metrics/solvent.py
+++ b/MSMBuilder/metrics/solvent.py
@@ -2,7 +2,7 @@
 
 import logging
 
-from baseclasses import Vectorized
+from .baseclasses import Vectorized
 import mdtraj as md
 import numpy as np
 
diff --git a/scripts/AssignOuter.py b/scripts/AssignOuter.py
old mode 100644
new mode 100755
diff --git a/scripts/__init__.py b/scripts/__init__.py
index 4336fa5e..610f9992 100644
--- a/scripts/__init__.py
+++ b/scripts/__init__.py
@@ -20,6 +20,7 @@
 __all__ = [
  'Assign',
  'AssignHierarchical',
+ 'AssignOuter',
  'BACE_Coarse_Graining',
  'BuildMSM',
  'CalculateClusterRadii',