diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index 2f21ff8e..cfe752d7 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -27,4 +27,4 @@ jobs: run: | pip install pytest pip install -e . - pytest --ignore pensa/diffnets/tests/test_api.py --ignore pensa/diffnets/tests/test_cli.py --ignore pensa/diffnets/tests/test_diffnets.py + pytest diff --git a/pensa/diffnets/__init__.py b/pensa/diffnets/__init__.py deleted file mode 100644 index 27240f60..00000000 --- a/pensa/diffnets/__init__.py +++ /dev/null @@ -1,14 +0,0 @@ -""" -diffnets -Supervised and self-supervised autoencoders to identify the mechanistic basis for biochemical differences between protein variants. -""" - -# Add imports here -from . import training, data_processing, analysis, utils, exmax, nnutils - -# Handle versioneer -from ._version import get_versions -versions = get_versions() -__version__ = versions['version'] -__git_revision__ = versions['full-revisionid'] -del get_versions, versions diff --git a/pensa/diffnets/_version.py b/pensa/diffnets/_version.py deleted file mode 100644 index f31bf120..00000000 --- a/pensa/diffnets/_version.py +++ /dev/null @@ -1,520 +0,0 @@ - -# This file helps to compute a version number in source trees obtained from -# git-archive tarball (such as those provided by githubs download-from-tag -# feature). Distribution tarballs (built by setup.py sdist) and build -# directories (produced by setup.py build) will contain a much shorter file -# that just contains the computed version number. - -# This file is released into the public domain. Generated by -# versioneer-0.18 (https://github.com/warner/python-versioneer) - -"""Git implementation of _version.py.""" - -import errno -import os -import re -import subprocess -import sys - - -def get_keywords(): - """Get the keywords needed to look up the version information.""" - # these strings will be replaced by git during git-archive. - # setup.py/versioneer.py will grep for the variable names, so they must - # each be defined on a line of their own. _version.py will just call - # get_keywords(). 
- git_refnames = "$Format:%d$" - git_full = "$Format:%H$" - git_date = "$Format:%ci$" - keywords = {"refnames": git_refnames, "full": git_full, "date": git_date} - return keywords - - -class VersioneerConfig: - """Container for Versioneer configuration parameters.""" - - -def get_config(): - """Create, populate and return the VersioneerConfig() object.""" - # these strings are filled in when 'setup.py versioneer' creates - # _version.py - cfg = VersioneerConfig() - cfg.VCS = "git" - cfg.style = "pep440" - cfg.tag_prefix = "" - cfg.parentdir_prefix = "None" - cfg.versionfile_source = "diffnets/_version.py" - cfg.verbose = False - return cfg - - -class NotThisMethod(Exception): - """Exception raised if a method is not valid for the current scenario.""" - - -LONG_VERSION_PY = {} -HANDLERS = {} - - -def register_vcs_handler(vcs, method): # decorator - """Decorator to mark a method as the handler for a particular VCS.""" - def decorate(f): - """Store f in HANDLERS[vcs][method].""" - if vcs not in HANDLERS: - HANDLERS[vcs] = {} - HANDLERS[vcs][method] = f - return f - return decorate - - -def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False, - env=None): - """Call the given command(s).""" - assert isinstance(commands, list) - p = None - for c in commands: - try: - dispcmd = str([c] + args) - # remember shell=False, so use git.cmd on windows, not just git - p = subprocess.Popen([c] + args, cwd=cwd, env=env, - stdout=subprocess.PIPE, - stderr=(subprocess.PIPE if hide_stderr - else None)) - break - except EnvironmentError: - e = sys.exc_info()[1] - if e.errno == errno.ENOENT: - continue - if verbose: - print("unable to run %s" % dispcmd) - print(e) - return None, None - else: - if verbose: - print("unable to find command, tried %s" % (commands,)) - return None, None - stdout = p.communicate()[0].strip() - if sys.version_info[0] >= 3: - stdout = stdout.decode() - if p.returncode != 0: - if verbose: - print("unable to run %s (error)" % dispcmd) - print("stdout was %s" % stdout) - return None, p.returncode - return stdout, p.returncode - - -def versions_from_parentdir(parentdir_prefix, root, verbose): - """Try to determine the version from the parent directory name. - - Source tarballs conventionally unpack into a directory that includes both - the project name and a version string. We will also support searching up - two directory levels for an appropriately named parent directory - """ - rootdirs = [] - - for i in range(3): - dirname = os.path.basename(root) - if dirname.startswith(parentdir_prefix): - return {"version": dirname[len(parentdir_prefix):], - "full-revisionid": None, - "dirty": False, "error": None, "date": None} - else: - rootdirs.append(root) - root = os.path.dirname(root) # up a level - - if verbose: - print("Tried directories %s but none started with prefix %s" % - (str(rootdirs), parentdir_prefix)) - raise NotThisMethod("rootdir doesn't start with parentdir_prefix") - - -@register_vcs_handler("git", "get_keywords") -def git_get_keywords(versionfile_abs): - """Extract version information from the given file.""" - # the code embedded in _version.py can just fetch the value of these - # keywords. When used from setup.py, we don't want to import _version.py, - # so we do it with a regexp instead. This function is not used from - # _version.py. 
- keywords = {} - try: - f = open(versionfile_abs, "r") - for line in f.readlines(): - if line.strip().startswith("git_refnames ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["refnames"] = mo.group(1) - if line.strip().startswith("git_full ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["full"] = mo.group(1) - if line.strip().startswith("git_date ="): - mo = re.search(r'=\s*"(.*)"', line) - if mo: - keywords["date"] = mo.group(1) - f.close() - except EnvironmentError: - pass - return keywords - - -@register_vcs_handler("git", "keywords") -def git_versions_from_keywords(keywords, tag_prefix, verbose): - """Get version information from git keywords.""" - if not keywords: - raise NotThisMethod("no keywords at all, weird") - date = keywords.get("date") - if date is not None: - # git-2.2.0 added "%cI", which expands to an ISO-8601 -compliant - # datestamp. However we prefer "%ci" (which expands to an "ISO-8601 - # -like" string, which we must then edit to make compliant), because - # it's been around since git-1.5.3, and it's too difficult to - # discover which version we're using, or to work around using an - # older one. - date = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - refnames = keywords["refnames"].strip() - if refnames.startswith("$Format"): - if verbose: - print("keywords are unexpanded, not using") - raise NotThisMethod("unexpanded keywords, not a git-archive tarball") - refs = set([r.strip() for r in refnames.strip("()").split(",")]) - # starting in git-1.8.3, tags are listed as "tag: foo-1.0" instead of - # just "foo-1.0". If we see a "tag: " prefix, prefer those. - TAG = "tag: " - tags = set([r[len(TAG):] for r in refs if r.startswith(TAG)]) - if not tags: - # Either we're using git < 1.8.3, or there really are no tags. We use - # a heuristic: assume all version tags have a digit. The old git %d - # expansion behaves like git log --decorate=short and strips out the - # refs/heads/ and refs/tags/ prefixes that would let us distinguish - # between branches and tags. By ignoring refnames without digits, we - # filter out many common branch names like "release" and - # "stabilization", as well as "HEAD" and "master". - tags = set([r for r in refs if re.search(r'\d', r)]) - if verbose: - print("discarding '%s', no digits" % ",".join(refs - tags)) - if verbose: - print("likely tags: %s" % ",".join(sorted(tags))) - for ref in sorted(tags): - # sorting will prefer e.g. "2.0" over "2.0rc1" - if ref.startswith(tag_prefix): - r = ref[len(tag_prefix):] - if verbose: - print("picking %s" % r) - return {"version": r, - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": None, - "date": date} - # no suitable tags, so version is "0+unknown", but full hex is still there - if verbose: - print("no suitable tags, using unknown + full revision id") - return {"version": "0+unknown", - "full-revisionid": keywords["full"].strip(), - "dirty": False, "error": "no suitable tags", "date": None} - - -@register_vcs_handler("git", "pieces_from_vcs") -def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): - """Get version from 'git describe' in the root of the source tree. - - This only gets called if the git-archive 'subst' keywords were *not* - expanded, and _version.py hasn't already been rewritten with a short - version string, meaning we're inside a checked out source tree. 
- """ - GITS = ["git"] - if sys.platform == "win32": - GITS = ["git.cmd", "git.exe"] - - out, rc = run_command(GITS, ["rev-parse", "--git-dir"], cwd=root, - hide_stderr=True) - if rc != 0: - if verbose: - print("Directory %s not under git control" % root) - raise NotThisMethod("'git rev-parse --git-dir' returned error") - - # if there is a tag matching tag_prefix, this yields TAG-NUM-gHEX[-dirty] - # if there isn't one, this yields HEX[-dirty] (no NUM) - describe_out, rc = run_command(GITS, ["describe", "--tags", "--dirty", - "--always", "--long", - "--match", "%s*" % tag_prefix], - cwd=root) - # --long was added in git-1.5.5 - if describe_out is None: - raise NotThisMethod("'git describe' failed") - describe_out = describe_out.strip() - full_out, rc = run_command(GITS, ["rev-parse", "HEAD"], cwd=root) - if full_out is None: - raise NotThisMethod("'git rev-parse' failed") - full_out = full_out.strip() - - pieces = {} - pieces["long"] = full_out - pieces["short"] = full_out[:7] # maybe improved later - pieces["error"] = None - - # parse describe_out. It will be like TAG-NUM-gHEX[-dirty] or HEX[-dirty] - # TAG might have hyphens. - git_describe = describe_out - - # look for -dirty suffix - dirty = git_describe.endswith("-dirty") - pieces["dirty"] = dirty - if dirty: - git_describe = git_describe[:git_describe.rindex("-dirty")] - - # now we have TAG-NUM-gHEX or HEX - - if "-" in git_describe: - # TAG-NUM-gHEX - mo = re.search(r'^(.+)-(\d+)-g([0-9a-f]+)$', git_describe) - if not mo: - # unparseable. Maybe git-describe is misbehaving? - pieces["error"] = ("unable to parse git-describe output: '%s'" - % describe_out) - return pieces - - # tag - full_tag = mo.group(1) - if not full_tag.startswith(tag_prefix): - if verbose: - fmt = "tag '%s' doesn't start with prefix '%s'" - print(fmt % (full_tag, tag_prefix)) - pieces["error"] = ("tag '%s' doesn't start with prefix '%s'" - % (full_tag, tag_prefix)) - return pieces - pieces["closest-tag"] = full_tag[len(tag_prefix):] - - # distance: number of commits since tag - pieces["distance"] = int(mo.group(2)) - - # commit: short hex revision ID - pieces["short"] = mo.group(3) - - else: - # HEX: no tags - pieces["closest-tag"] = None - count_out, rc = run_command(GITS, ["rev-list", "HEAD", "--count"], - cwd=root) - pieces["distance"] = int(count_out) # total number of commits - - # commit date: see ISO-8601 comment in git_versions_from_keywords() - date = run_command(GITS, ["show", "-s", "--format=%ci", "HEAD"], - cwd=root)[0].strip() - pieces["date"] = date.strip().replace(" ", "T", 1).replace(" ", "", 1) - - return pieces - - -def plus_or_dot(pieces): - """Return a + if we don't already have one, else return a .""" - if "+" in pieces.get("closest-tag", ""): - return "." - return "+" - - -def render_pep440(pieces): - """Build up version string, with post-release "local version identifier". - - Our goal: TAG[+DISTANCE.gHEX[.dirty]] . Note that if you - get a tagged build and then dirty it, you'll get TAG+0.gHEX.dirty - - Exceptions: - 1: no tags. git_describe was just HEX. 
0+untagged.DISTANCE.gHEX[.dirty] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += plus_or_dot(pieces) - rendered += "%d.g%s" % (pieces["distance"], pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - else: - # exception #1 - rendered = "0+untagged.%d.g%s" % (pieces["distance"], - pieces["short"]) - if pieces["dirty"]: - rendered += ".dirty" - return rendered - - -def render_pep440_pre(pieces): - """TAG[.post.devDISTANCE] -- No -dirty. - - Exceptions: - 1: no tags. 0.post.devDISTANCE - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += ".post.dev%d" % pieces["distance"] - else: - # exception #1 - rendered = "0.post.dev%d" % pieces["distance"] - return rendered - - -def render_pep440_post(pieces): - """TAG[.postDISTANCE[.dev0]+gHEX] . - - The ".dev0" means dirty. Note that .dev0 sorts backwards - (a dirty tree will appear "older" than the corresponding clean one), - but you shouldn't be releasing software with -dirty anyways. - - Exceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += plus_or_dot(pieces) - rendered += "g%s" % pieces["short"] - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - rendered += "+g%s" % pieces["short"] - return rendered - - -def render_pep440_old(pieces): - """TAG[.postDISTANCE[.dev0]] . - - The ".dev0" means dirty. - - Eexceptions: - 1: no tags. 0.postDISTANCE[.dev0] - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"] or pieces["dirty"]: - rendered += ".post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - else: - # exception #1 - rendered = "0.post%d" % pieces["distance"] - if pieces["dirty"]: - rendered += ".dev0" - return rendered - - -def render_git_describe(pieces): - """TAG[-DISTANCE-gHEX][-dirty]. - - Like 'git describe --tags --dirty --always'. - - Exceptions: - 1: no tags. HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - if pieces["distance"]: - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render_git_describe_long(pieces): - """TAG-DISTANCE-gHEX[-dirty]. - - Like 'git describe --tags --dirty --always -long'. - The distance/hash is unconditional. - - Exceptions: - 1: no tags. 
HEX[-dirty] (note: no 'g' prefix) - """ - if pieces["closest-tag"]: - rendered = pieces["closest-tag"] - rendered += "-%d-g%s" % (pieces["distance"], pieces["short"]) - else: - # exception #1 - rendered = pieces["short"] - if pieces["dirty"]: - rendered += "-dirty" - return rendered - - -def render(pieces, style): - """Render the given version pieces into the requested style.""" - if pieces["error"]: - return {"version": "unknown", - "full-revisionid": pieces.get("long"), - "dirty": None, - "error": pieces["error"], - "date": None} - - if not style or style == "default": - style = "pep440" # the default - - if style == "pep440": - rendered = render_pep440(pieces) - elif style == "pep440-pre": - rendered = render_pep440_pre(pieces) - elif style == "pep440-post": - rendered = render_pep440_post(pieces) - elif style == "pep440-old": - rendered = render_pep440_old(pieces) - elif style == "git-describe": - rendered = render_git_describe(pieces) - elif style == "git-describe-long": - rendered = render_git_describe_long(pieces) - else: - raise ValueError("unknown style '%s'" % style) - - return {"version": rendered, "full-revisionid": pieces["long"], - "dirty": pieces["dirty"], "error": None, - "date": pieces.get("date")} - - -def get_versions(): - """Get version information or return default if unable to do so.""" - # I am in _version.py, which lives at ROOT/VERSIONFILE_SOURCE. If we have - # __file__, we can work backwards from there to the root. Some - # py2exe/bbfreeze/non-CPython implementations don't do __file__, in which - # case we can only use expanded keywords. - - cfg = get_config() - verbose = cfg.verbose - - try: - return git_versions_from_keywords(get_keywords(), cfg.tag_prefix, - verbose) - except NotThisMethod: - pass - - try: - root = os.path.realpath(__file__) - # versionfile_source is the relative path from the top of the source - # tree (where the .git directory might live) to this file. Invert - # this to find the root from __file__. - for i in cfg.versionfile_source.split('/'): - root = os.path.dirname(root) - except NameError: - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to find root of source tree", - "date": None} - - try: - pieces = git_pieces_from_vcs(cfg.tag_prefix, root, verbose) - return render(pieces, cfg.style) - except NotThisMethod: - pass - - try: - if cfg.parentdir_prefix: - return versions_from_parentdir(cfg.parentdir_prefix, root, verbose) - except NotThisMethod: - pass - - return {"version": "0+unknown", "full-revisionid": None, - "dirty": None, - "error": "unable to compute version", "date": None} diff --git a/pensa/diffnets/analysis.py b/pensa/diffnets/analysis.py deleted file mode 100644 index 7de8ce32..00000000 --- a/pensa/diffnets/analysis.py +++ /dev/null @@ -1,881 +0,0 @@ -import mdtraj as md -import numpy as np -import itertools -from . import utils -import multiprocessing as mp -import os -import functools -from torch.autograd import Variable -import torch -from scipy import stats -from scipy.stats import pearsonr -from sklearn.metrics import roc_auc_score, roc_curve -import matplotlib.pyplot as plt -import enspara -import enspara.cluster as cluster -import enspara.info_theory as infotheor -import enspara.msm as msm -import enspara.cluster as cluster -import enspara.info_theory as infotheor -import pickle -import scipy.sparse -import sys -from pylab import * -from torch.autograd import Variable -from collections import defaultdict - -class Analysis: - """Core object for running analysis. 
- - Parameters - ---------- - net : nnutils object - Neural network to perform analysis with - netdir : str - path to directory with neural network results - datadir : str - path to directory with data required to train the data. Includes - cm.npy, wm.npy, uwm.npy, master.pdb, an aligned_xtcs dir, and an - indicators dir. - """ - def __init__(self, net, netdir, datadir): - self.net = net - self.netdir = netdir - self.datadir = datadir - self.top = md.load(os.path.join( - self.datadir, "master.pdb")) - self.cm = np.load(os.path.join(self.datadir, "cm.npy")) - self.n_cores = mp.cpu_count() - - def encode_data(self): - """Calculate the latent space for all trajectory frames. - """ - enc_dir = os.path.join(self.netdir, "encodings") - utils.mkdir(enc_dir) - xtc_dir = os.path.join(self.datadir, "aligned_xtcs") - encode_dir(self.net, xtc_dir, enc_dir, self.top, self.n_cores, self.cm) - - def recon_traj(self): - """Reconstruct all trajectory frames using the trained neural - network""" - recon_dir = os.path.join(self.netdir, "recon_trajs") - utils.mkdir(recon_dir) - enc_dir = os.path.join(self.netdir, "encodings") - recon_traj_dir(self.net, enc_dir, recon_dir, self.top.top, - self.cm, self.n_cores) - print("trajectories reconstructed") - - def get_labels(self): - """Calculate the classification score for all trajectory frames - """ - label_dir = os.path.join(self.netdir, "labels") - utils.mkdir(label_dir) - enc_dir = os.path.join(self.netdir, "encodings") - calc_labels(self.net, enc_dir, label_dir, self.n_cores) - print("labels calculated for all states") - - def get_rmsd(self): - """Calculate RMSD between actual trajectory frames and autoencoder - reconstructed frames""" - rmsd_fn = os.path.join(self.netdir, "rmsd.npy") - recon_dir = os.path.join(self.netdir, "recon_trajs") - orig_xtc_dir = os.path.join(self.datadir, "aligned_xtcs") - rmsd = rmsd_dists_dir(recon_dir, orig_xtc_dir, self.top, self.n_cores) - np.save(rmsd_fn, rmsd) - - def morph(self,n_frames=10): - """Get representative structures for classification scores - from 0 to 1. - - Parameters - ---------- - n_frames : int - How many representative structures to output. Bins between - 0 and 1 will be calculated with this number. - """ - morph_label(self.net,self.netdir,self.datadir,n_frames=n_frames) - - def assign_labels_to_variants(self,plot_labels=False): - """Map DiffNet labels to each variant with option to plot - a histogram of the labels. - - Parameters - ---------- - plot_labels : optional, boolean - Save a matplotlob figure of the label histogram. - - Returns - ------- - lab_v : dictionary - Dictionary mapping labels to their respective variants. 
- """ - - lab_fns = utils.get_fns(os.path.join(self.netdir,"labels"),"*.npy") - traj_d_path = os.path.join(self.datadir,"traj_dict.pkl") - traj_d = pickle.load(open(traj_d_path, 'rb')) - lab_v = defaultdict(list) - for key,item in traj_d.items(): - for traj_ind in range(item[0],item[1]): - lab = np.load(lab_fns[traj_ind]) - lab_v[key].append(lab) - - if plot_labels: - plt.figure(figsize=(16,16)) - axes = plt.gca() - lw = 8 - - for k in traj_d.keys(): - t = np.concatenate(lab_v[k]) - n, x = np.histogram(t, range=(0, 1), bins=50) - plt.plot(x[:-1],n,label=k,linewidth=lw) - - - plt.xticks(fontsize=36) - plt.yticks(fontsize=36) - axes.set_xlabel('DiffNet Label',labelpad=40, fontsize=36) - axes.set_ylabel('# of Simulation Frames',labelpad=40,fontsize=36) - axes.tick_params(direction='out', length=20, width=5, - grid_color='r', grid_alpha=0.5) - plt.legend(fontsize=36) - - for axis in ['top','bottom','left','right']: - axes.spines[axis].set_linewidth(5) - - plt.savefig(os.path.join(self.netdir,"label_plot.png")) - - return lab_v - - def find_feats(self,inds,out_fn,n_states=2000,num2plot=100,clusters=None): - """Generate a .pml file that will show the distances that change - in a way that is most with changes in the classifications score. - - Parameters - ---------- - inds : np.ndarray, - Indices of the topology file that are to be included in - calculating what distances are most correlated with classification - score. - out_fn : str - Name of the output file. - n_states : int (default=2000) - How many cluster centers to calculate and use for correlation - measurement. - num2plot : int (default=100) - Number of distances to be shown. - clusters : enspara cluster object - Cluster object with center_indices attribute - """ - if not clusters: - cc_dir = os.path.join(self.netdir, "cluster_%d" % n_states) - utils.mkdir(cc_dir) - - enc = utils.load_npy_dir(os.path.join(self.netdir, "encodings"), "*npy") - if hasattr(self.net,"split_inds"): - x = self.net.encoder1[-1].out_features - enc = enc[:,:x] - clusters = cluster.hybrid.hybrid(enc, euc_dist, - n_clusters=n_states, n_iters=1) - cluster_fn = os.path.join(cc_dir, "clusters.pkl") - pickle.dump(clusters, open(cluster_fn, 'wb')) - - find_features(self.net,self.datadir,self.netdir, - clusters.center_indices,inds,out_fn,num2plot=num2plot) - - def run_core(self): - """Wrapper to run the analysis functions that should be - run after training. 
- """ - self.encode_data() - - self.recon_traj() - - self.get_labels() - - self.get_rmsd() - -def euc_dist(trj, frame): - diff = np.abs(trj - frame) - try: - d = np.sqrt(np.sum(diff * diff, axis=1)) - except: - d = np.array([np.sqrt(np.sum(diff * diff))]) - return d - -def recon_traj(enc, net, top, cm): - n = len(enc) - n_atoms = top.n_atoms - x = Variable(torch.from_numpy(enc).type(torch.FloatTensor)) - coords = net.decode(x) - coords = coords.detach().numpy() - coords += cm - coords = coords.reshape((n, n_atoms, 3)) - traj = md.Trajectory(coords, top) - return traj - -def _recon_traj_dir(enc_fn, net, recon_dir, top, cm): - enc = np.load(enc_fn) - traj = recon_traj(enc, net, top, cm) - - new_fn = os.path.split(enc_fn)[1] - base_fn = os.path.splitext(new_fn)[0] - new_fn = base_fn + ".xtc" - new_fn = os.path.join(recon_dir, new_fn) - traj.save(new_fn) - -def recon_traj_dir(net, enc_dir, recon_dir, top, cm, n_cores): - enc_fns = utils.get_fns(enc_dir, "*.npy") - - pool = mp.Pool(processes=n_cores) - f = functools.partial(_recon_traj_dir, net=net, recon_dir=recon_dir, top=top, cm=cm) - pool.map(f, enc_fns) - pool.close() - -def _calc_labels(enc_fn, net, label_dir): - enc = np.load(enc_fn) - if hasattr(net,"split_inds"): - x = net.encoder1[-1].out_features - enc = enc[:,:x] - enc = Variable(torch.from_numpy(enc).type(torch.FloatTensor)) - labels = net.classify(enc) - labels = labels.detach().numpy() - - new_fn = os.path.split(enc_fn)[1] - new_fn = os.path.join(label_dir, "lab" + new_fn) - np.save(new_fn, labels) - -def calc_labels(net, enc_dir, label_dir, n_cores): - enc_fns = utils.get_fns(enc_dir, "*npy") - - pool = mp.Pool(processes=n_cores) - f = functools.partial(_calc_labels, net=net, label_dir=label_dir) - pool.map(f, enc_fns) - pool.close() - -def get_rmsd_dists(orig_traj, recon_traj): - n_frames = len(recon_traj) - if n_frames != len(orig_traj): - # should raise exception - print("Can't get rmsds between trajectories of different lengths") - return - pairwise_rmsd = [] - for i in range(0, n_frames, 10): - r = md.rmsd(recon_traj[i], orig_traj[i], parallel=False)[0] - pairwise_rmsd.append(r) - pairwise_rmsd = np.array(pairwise_rmsd) - return pairwise_rmsd - -def _rmsd_dists_dir(recon_fn, orig_xtc_dir, ref_pdb): - recon_traj = md.load(recon_fn, top=ref_pdb.top) - base_fn = os.path.split(recon_fn)[1] - orig_fn = os.path.join(orig_xtc_dir, base_fn) - orig_traj = md.load(orig_fn, top=ref_pdb.top) - pairwise_rmsd = get_rmsd_dists(orig_traj, recon_traj) - return pairwise_rmsd - -def rmsd_dists_dir(recon_dir, orig_xtc_dir, ref_pdb, n_cores): - recon_fns = utils.get_fns(recon_dir, "*.xtc") - - pool = mp.Pool(processes=n_cores) - f = functools.partial(_rmsd_dists_dir, orig_xtc_dir=orig_xtc_dir, ref_pdb=ref_pdb) - res = pool.map(f, recon_fns) - pool.close() - - pairwise_rmsd = np.concatenate(res) - return pairwise_rmsd - -def _encode_dir(xtc_fn, net, outdir, top, cm): - traj = md.load(xtc_fn, top=top) - n = len(traj) - n_atoms = traj.top.n_atoms - x = traj.xyz.reshape((n, 3*n_atoms))-cm - x = Variable(torch.from_numpy(x).type(torch.FloatTensor)) - if hasattr(net, 'split_inds'): - lat1, lat2 = net.encode(x) - output = torch.cat((lat1,lat2),1) - else: - output = net.encode(x) - output = output.detach().numpy() - new_fn = os.path.split(xtc_fn)[1] - new_fn = os.path.splitext(new_fn)[0] + ".npy" - new_fn = os.path.join(outdir, new_fn) - np.save(new_fn, output) - -def encode_dir(net, xtc_dir, outdir, top, n_cores, cm): - xtc_fns = utils.get_fns(xtc_dir, "*.xtc") - - pool = mp.Pool(processes=n_cores) - 
f = functools.partial(_encode_dir, net=net, outdir=outdir, top=top, cm=cm) - pool.map(f, xtc_fns) - pool.close() - -def morph_label(net,nn_dir,data_dir,n_frames=10): - pdb_fn = os.path.join(data_dir, "master.pdb") - ref_s = md.load(pdb_fn) - n_atoms = ref_s.top.n_atoms - uwm_fn = os.path.join(data_dir, "uwm.npy") - uwm = np.load(uwm_fn) - cm_fn = os.path.join(data_dir, "cm.npy") - cm = np.load(cm_fn) - enc = utils.load_npy_dir(os.path.join(nn_dir, "encodings"), "*npy") - n_latent = int(enc.shape[1]) - morph_dir = os.path.join(nn_dir, "morph_label") - if not os.path.exists(morph_dir): - os.mkdir(morph_dir) - - labels_dir = os.path.join(nn_dir,"labels") - labels = utils.load_npy_dir(labels_dir,"*.npy") - labels = labels.flatten() - - my_min = np.min(labels) - my_max = np.max(labels) - morph_enc = np.zeros((n_frames,n_latent)) - vals = np.linspace(my_min, my_max, n_frames) - delta = (vals[1] - vals[0]) * 0.5 - for i in range(n_frames): - val = vals[i] - inds = np.where(np.logical_and(labels>=val-delta, - labels<=val+delta))[0] - - for j in range(n_latent): - x = np.mean(enc[inds,j]) - morph_enc[i, j] = x - - #morph_enc = Variable(torch.from_numpy(morph_enc).type(torch.FloatTensor)) - morph_enc = np.array(morph_enc) - traj = recon_traj(morph_enc,net,ref_s.top,cm) - rmsf = get_rmsf(traj) - - out_fn = os.path.join(morph_dir, "morph_0-1.pdb") - traj.save_pdb(out_fn, bfactors=rmsf) - -def get_rmsf(traj): - x_mean = traj.xyz.mean(axis=0) - delta = traj.xyz - x_mean - d2 = np.einsum('ijk,ijk->ij', delta, delta) - p = 1.0*np.ones(len(traj)) / len(traj) - msf = np.einsum('ij,i->j', d2, p) - return np.sqrt(msf) - -def find_features(net,data_dir,nn_dir,clust_cents,inds,out_fn,num2plot=100): - #Need to atom custom indices - encs_dir = os.path.join(nn_dir,"encodings") - encs = utils.load_npy_dir(encs_dir,"*.npy") - encs = encs[clust_cents] - - cm = np.load(os.path.join(data_dir,"cm.npy")) - top = md.load(os.path.join(data_dir,"master.pdb")) - traj = recon_traj(encs,net,top.top,cm) - print("trajectory calculated") - all_pairs = list(itertools.product(inds, repeat=2)) - distances = md.compute_distances(traj,all_pairs) - - labels_dir = os.path.join(nn_dir,"labels") - labels = utils.load_npy_dir(labels_dir,"*.npy") - labels = labels[clust_cents] - - n = len(inds) - print(n, " distances being calculated") - r_values = [] - slopes = [] - for i in np.arange(n*n): - slope, intercept, r_value, p_value, std_err = stats.linregress(labels.flatten(),distances[:,i]) - r_values.append(r_value) - slopes.append(slope) - - r2_values = np.array(r_values)**2 - corr_slopes = [] - count = 0 - print("Starting to write pymol file") - f = open(os.path.join(nn_dir,out_fn), "w") - for i in np.argsort(r2_values)[-num2plot:]: - corr_slopes.append(slopes[i]) - #print(slopes[i],r2_values[i],i) - j,k = np.array(all_pairs)[i,:] - jnum = top.top.atom(j).residue.resSeq - jname = top.top.atom(j).name - knum = top.top.atom(k).residue.resSeq - kname = top.top.atom(k).name - if slopes[i] < 0: - f.write("distance dc%s, master and resi %s and name %s, master and resi %s and name %s\n" % (count,jnum,jname,knum,kname)) - f.write("color red, dc%s\n" % count) - f.write("hide label\n") - else: - f.write("distance df%s, master and resi %s and name %s, master and resi %s and name %s\n" % (count,jnum,jname,knum,kname)) - f.write("color blue, df%s\n" % count) - f.write("hide label\n") - count+=1 - f.close() - -######################################################### -# # -# Extra analysis functions # -# # 
-######################################################### - - -def calc_auc(net_fn,out_fn,data,labels): - net = pickle.load(open(net_fn, 'rb')) - net.cpu() - full_x = torch.from_numpy(data).type(torch.FloatTensor) - if hasattr(net, "encode"): - full_x = Variable(full_x.view(-1, 784).float()) - pred_x, latents, pred_class = net(full_x) - preds = pred_class.detach().numpy() - else: - full_x = Variable(full_x.view(-1, 3,32,32).float()) - preds = net(full_x).detach().numpy() - fpr, tpr, thresh = roc_curve(labels,preds) - auc = roc_auc_score(labels,preds.flatten()) - print("AUC: %f" % auc) - #plt.figure() - #lw = 2 - #plt.plot(fpr, tpr, color='darkorange', - # lw=lw, label='ROC curve (area = %f)' % auc) - #plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') - #plt.xlim([0.0, 1.0]) - #plt.ylim([0.0, 1.05]) - #plt.xlabel('False Positive Rate') - #plt.ylabel('True Positive Rate') - #plt.title('Receiver operating characteristic example') - #plt.legend(loc="lower right") - #plt.savefig(out_fn) - #plt.close() - return auc, fpr, tpr - -def split_vars(d, vars): - n = len(d) - n_vars = len(vars) - n_per_var = int(len(d)/n_vars) - lst = {} - for i in range(n_vars): - v = vars[i] - lst[v] = d[i*n_per_var:(i+1)*n_per_var] - return lst - - -def get_extrema(lst_lst): - my_min = np.inf - my_max = -np.inf - for lst in lst_lst: - my_min = np.min((my_min, np.min(lst))) - my_max = np.max((my_max, np.max(lst))) - return my_min, my_max - - -def common_hist(lst_lst, labels, bins): - my_min, my_max = get_extrema(lst_lst) - n_lst = len(lst_lst) - all_h = {} - for i in range(n_lst): - h, x = np.histogram(lst_lst[i], bins=bins, range=(my_min, my_max)) - all_h[labels[i]] = h - return all_h, x - -def calc_overlap(d1, d2, bins): - n_feat = d1.shape[1] - js = np.zeros(n_feat) - ent1 = np.zeros(n_feat) - ent2 = np.zeros(n_feat) - for i in range(n_feat): - h, x = common_hist([d1[:, i], d2[:, i]], ["d1", "d2"], bins) - h1 = h["d1"] - h2 = h["d2"] - p1 = np.array(h1) / h1.sum() - p2 = np.array(h2) / h2.sum() - js[i] = infotheor.js_divergence(p1, p2) - ent1[i] = infotheor.shannon_entropy(p1) - ent2[i] = infotheor.shannon_entropy(p2) - return js, ent1, ent2 - -def project(enc, lab, vars, i1, i2, bins, my_title, cutoff=0.8): - subsample = 100 - - all_act_inds = np.where(lab>cutoff)[0] - act_i1_mu = enc[all_act_inds, i1].mean() - act_i1_std = enc[all_act_inds, i1].std() - act_i2_mu = enc[all_act_inds, i2].mean() - act_i2_std = enc[all_act_inds, i2].std() - - n_vars = len(vars) - enc_dict = split_vars(enc, vars) - lab_dict = split_vars(lab, vars) - i1_dict = {} - i2_dict = {} - act_inds = {} - for v in vars: - i1_dict[v] = enc_dict[v][:, i1] - i2_dict[v] = enc_dict[v][:, i2] - act_inds[v] = np.where(lab_dict[v]>cutoff)[0] - # i1_min, i1_max = get_extrema(i1_dict.values()) - # i2_min, i2_max = get_extrema(i2_dict.values()) - - # drop outliers by only show data within const*std - const = 3 - i1_mu = enc[:, i1].mean() - i1_std = enc[:, i1].std() - i1_min = np.max((i1_mu-const*i1_std, enc[:, i1].min())) - i1_max = np.min((i1_mu+const*i1_std, enc[:, i1].max())) - i2_mu = enc[:, i2].mean() - i2_std = enc[:, i2].std() - i2_min = np.max((i2_mu-const*i2_std, enc[:, i2].min())) - i2_max = np.min((i2_mu+const*i2_std, enc[:, i2].max())) - - # get min/max of z dim - cmin = np.inf - cmax = -np.inf - for i in range(n_vars): - v = vars[i] - tmp, x, y = np.histogram2d(i1_dict[v], i2_dict[v], range=([i1_min, i1_max], [i2_min, i2_max]), bins=n_bins) - tmp /= tmp.sum() - h = np.zeros(tmp.shape) - inds = np.where(tmp>0) - h[inds] = 
-np.log(tmp[inds]) - #inds = np.where(np.isnan(h)) - #h[inds] = 0 - cmin = np.min((cmin, h[inds].min())) - cmax = np.max((cmax, h[inds].max())) - - height = 4 - width = height*n_vars - fig = figure(figsize=(width, height)) - fig.suptitle(my_title) - bins = 20 - dot_size = 0.1 - for i in range(n_vars): - v = vars[i] - ax = fig.add_subplot(1, n_vars, i+1, aspect='auto', xlim=x[[0, -1]], ylim=y[[0, -1]]) - #scatter(i1_dict[v], i2_dict[v], s=dot_size, c='b', alpha=0.1) - tmp, x, y = np.histogram2d(i1_dict[v], i2_dict[v], range=([i1_min, i1_max], [i2_min, i2_max]), bins=n_bins) - tmp /= tmp.sum() - h = cmax*np.ones(tmp.shape) - inds = np.where(tmp>0) - h[inds] = -np.log(tmp[inds]) - h -= cmax - delta_x = (x[1]-x[0])/2.0 - delta_y = (y[1]-y[0])/2.0 - #imshow(h, interpolation='bilinear', aspect='auto', origin='low', extent=[x[0]+delta_x, x[-1]+delta_x, y[0]+delta_y, y[-1]+delta_y], vmin=cmin-cmax, vmax=0, cmap=get_cmap('Blues_r')) - # transpose to put first dimension (i1) on x axis - #imshow(h.T, interpolation='bilinear', aspect='auto', origin='low', extent=[y[0]+delta_y, y[-1]+delta_y, x[0]+delta_x, x[-1]+delta_x], vmin=cmin-cmax, vmax=0, cmap=get_cmap('Blues_r')) - imshow(h.T, interpolation='bilinear', aspect='auto', origin='low', extent=[x[0]+delta_x, x[-1]+delta_x, y[0]+delta_y, y[-1]+delta_y], vmin=cmin-cmax, vmax=0, cmap=get_cmap('Blues_r')) - colorbar() - - lines = [] - line_labels = [] - for v2 in vars: - i1_mu = i1_dict[v2].mean() - i1_std = i1_dict[v2].std() - i2_mu = i2_dict[v2].mean() - i2_std = i2_dict[v2].std() - #print(v, "x", i1_mu, i1_std) - #print(v, "y", i2_mu, i2_std) - line, _, _ = errorbar([i1_mu], [i2_mu], xerr=[i1_std], yerr=[i2_std], label=v2) - lines.append(line) - line_labels.append(v2) - - # inds = act_inds[v2] - # if inds.shape[0] > subsample: - # inds = inds[::subsample] - # print(inds.shape) - # if inds.shape[0] > 0: - # scatter(i1_dict[v2][inds], i2_dict[v2][inds], s=dot_size, c='k') - - line, _, _ = errorbar([act_i1_mu], [act_i2_mu], xerr=[act_i1_std], yerr=[act_i2_std], label='act', ecolor='k', fmt='k') - lines.append(line) - line_labels.append('act') - #legend() - - title(v) - # scatter([0], [0], s=dot_size*10, c='k') - # scatter([6], [0], s=dot_size*10, c='k') - # scatter([6], [6], s=dot_size*10, c='k') - fig.legend(lines, line_labels) - show() - -def morph_conditional(nn_dir, data_dir, n_frames=10): - net = pickle.load(open("%s/nn_best_polish.pkl" % nn_dir, 'rb')) - net.cpu() - pdb_fn = os.path.join(nn_dir, "master.pdb") - ref_s = md.load(pdb_fn) - n_atoms = ref_s.top.n_atoms - uwm_fn = os.path.join(data_dir, "uwm.npy") - uwm = np.load(uwm_fn) - cm_fn = os.path.join(data_dir, "cm.npy") - cm = np.load(cm_fn) - enc = load_npy_dir(os.path.join(nn_dir, "encodings"), "*npy") - n_latent = int(enc.shape[1]) - morph_dir = os.path.join(nn_dir, "morph") - if not os.path.exists(morph_dir): - os.mkdir(morph_dir) - - for i in range(n_latent): - my_min, my_max = get_extrema([enc[:, i]]) - print(i, my_min, my_max) - morph_enc = np.zeros((n_frames, n_latent)) - vals = np.linspace(my_min, my_max, n_frames) - delta = (vals[1] - vals[0]) * 0.5 - for j in range(n_frames): - val = vals[j] - - # set each latent variable to most probable value given latent(ind) within delta of selected value - inds = np.where(np.logical_and(enc[:,i]>=val-delta, enc[:,i]<=val+delta))[0] - for k in range(n_latent): - n, x = np.histogram(enc[inds, k], bins=20) - offset = (x[1] - x[0]) * 0.5 - morph_enc[j, k] = x[n.argmax()] + offset - - # fix ref latent variable to val - morph_enc[j, i] = val - - 
morph_enc = Variable(torch.from_numpy(morph_enc).type(torch.FloatTensor)) - try: - outputs, labs = net.decode(morph_enc) - except: - print("single") - outputs = net.decode(morph_enc) - outputs = outputs.data.numpy() - coords = whiten.apply_unwhitening(outputs, uwm, cm) - print("shape", coords.shape) - recon_trj = md.Trajectory(coords.reshape((n_frames, n_atoms, 3)), ref_s.top) - out_fn = os.path.join(morph_dir, "m%d.pdb" % i) - recon_trj.save(out_fn) - -def morph_cond_mean(nn_dir,data_dir,n_frames=10): - net = pickle.load(open("%s/nn_best_polish.pkl" % nn_dir, 'rb')) - net.cpu() - pdb_fn = os.path.join(nn_dir, "master.pdb") - ref_s = md.load(pdb_fn) - n_atoms = ref_s.top.n_atoms - uwm_fn = os.path.join(data_dir, "uwm.npy") - uwm = np.load(uwm_fn) - cm_fn = os.path.join(data_dir, "cm.npy") - cm = np.load(cm_fn) - enc = load_npy_dir(os.path.join(nn_dir, "encodings"), "*npy") - n_latent = int(enc.shape[1]) - morph_dir = os.path.join(nn_dir, "morph_bin_mean") - if not os.path.exists(morph_dir): - os.mkdir(morph_dir) - - for i in range(n_latent): - my_min, my_max = get_extrema([enc[:, i]]) - print(i, my_min, my_max) - morph_enc = np.zeros((n_frames, n_latent)) - vals = np.linspace(my_min, my_max, n_frames) - delta = (vals[1] - vals[0]) * 0.5 - for j in range(n_frames): - val = vals[j] - - # set each latent variable to most probable value given latent(ind) within delta of selected value - inds = np.where(np.logical_and(enc[:,i]>=val-delta, enc[:,i]<=val+delta))[0] - for k in range(n_latent): - x = np.mean(enc[inds,k]) - morph_enc[j, k] = x - - # fix ref latent variable to val - morph_enc[j, i] = val - - morph_enc = Variable(torch.from_numpy(morph_enc).type(torch.FloatTensor)) - traj = utils.recon_traj(morph_enc,net,ref_s.top,cm) - rmsf = get_rmsf(traj) - - out_fn = os.path.join(outdir, "m%d.pdb" % i) - traj.save_pdb(out_fn, bfactors=rmsf) - -def morph_std(nn_dir, data_dir, enc): - outdir = os.path.join(nn_dir, "morph_std") - utils.mkdir(outdir) - n_frames = 10 - - net = pickle.load(open("%s/nn_best_polish.pkl" % nn_dir, 'rb')) - net.cpu() - pdb_fn = os.path.join(nn_dir, "master.pdb") - ref_s = md.load(pdb_fn) - n_atoms = ref_s.top.n_atoms - cm_fn = os.path.join(data_dir, "cm.npy") - cm = np.load(cm_fn) - - n_latent = int(enc.shape[1]) - ave_enc = enc.mean(axis=0) - std_enc = enc.std(axis=0) - max_enc = enc.max(axis=0) - min_enc = enc.min(axis=0) - - # want vary between mean +/- 2*std but not go out of range - for i in range(n_latent): - #my_min = np.max((ave_enc[i]-5*std_enc[i], min_enc[i])) - #my_max = np.min((ave_enc[i]+5*std_enc[i], max_enc[i])) - my_min = min_enc[i] - my_max = max_enc[i] - - morph_enc = np.zeros((n_frames, n_latent)) + ave_enc - morph_enc[:, i] = np.linspace(my_min, my_max, n_frames) - traj = utils.recon_traj(morph_enc, net, ref_s.top, cm) - - rmsf = get_rmsf(traj) - - out_fn = os.path.join(outdir, "m%d.pdb" % i) - traj.save_pdb(out_fn, bfactors=rmsf) - -def get_act_inact(nn_dir, data_dir, enc, labels): - """Save most active/inactive sturctures with RMSDs from target less than 2 Angstroms.""" - outdir = os.path.join(nn_dir, "act_and_inact") - utils.mkdir(outdir) - n_extreme = 1000 - - net = pickle.load(open("%s/nn_best_polish.pkl" % nn_dir, 'rb')) - net.cpu() - pdb_fn = os.path.join(nn_dir, "master.pdb") - ref_s = md.load(pdb_fn) - ca_inds = ref_s.top.select('name CA') - n_atoms = ref_s.top.n_atoms - cm_fn = os.path.join(data_dir, "cm.npy") - cm = np.load(cm_fn) - - rmsd_cutoff = 0.2 - rmsd_fn = os.path.join(nn_dir, "rmsd.npy") - rmsd = np.load(rmsd_fn) - good_inds = 
np.where(rmsdio', coords, coords) - #matmul is faster - c00 = np.matmul(coords.transpose(),coords) - assert isinstance(c00.flat[0], np.double) - np.save(os.path.join(self.xtc_dir, "cov"+traj_num+".npy"),c00) - - def _get_c00_xtc(self, xtc_fn, top, cm): - """Reshape MDTraj Trajectory item into an array of XYZ - coordinates and then call other function to calculate - covariance matrix, c00. - - Parameters - ---------- - xtc_fn : str - Path to trajectory - top : md.Trajectory object - Topology corresponding to the trajectory - cm : np.ndarray, shape=(3*n_atoms,) - Avg. center of mass of each atom across all trajectories. - - Returns - ------- - n : int - Number of data frames in the trajectory - """ - traj = md.load(xtc_fn, top=top) - traj_num = xtc_fn.split("/")[-1].split(".")[0] - n = len(traj) - n_atoms = traj.top.n_atoms - coords = traj.xyz.astype(np.double).reshape((n, 3 * n_atoms)) - self.get_c00(coords,cm,traj_num) - return n - - def get_c00_xtc_list(self, xtc_fns, top, cm, n_cores): - """Calculate the covariance matrix across all trajectories. - - Parameters - ---------- - xtc_fn : list of str's - Paths to trajectories. - top : md.Trajectory object - Topology corresponding to the trajectories - cm : np.ndarray, shape=(3*n_atoms,) - Avg. center of mass of each atom across all trajectories. - n_cores : int - Number of threads to parallelize task across. - - Returns - ------- - c00 : np.ndarray, shape=(n_atoms*3,n_atoms*3) - Covariance matrix across all trajectories - """ - pool = mp.Pool(processes=n_cores) - f = functools.partial(self._get_c00_xtc, top=top, cm=cm) - result = pool.map_async(f, xtc_fns) - result.wait() - r = result.get() - pool.close() - - c00_fns = np.sort(glob.glob(os.path.join(self.xtc_dir, "cov*.npy"))) - c00 = np.sum(np.load(c00_fn) for c00_fn in c00_fns) - c00 /= sum(r) - assert isinstance(c00.flat[0], np.double) - return c00 - - def get_wuw_mats(self, c00): - """Calculate whitening matrix and unwhitening matrix. - Method adapted from deeptime (https://github.com/markovmodel/deeptime/blob/master/time-lagged-autoencoder/tae/utils.py) - - Parameters - ---------- - c00 : np.ndarray, shape=(n_atoms*3,n_atoms*3) - Covariance matrix - - Returns - ------- - uwm : np.ndarray, shape=(n_atoms*3,n_atoms*3) - unwhitening matrix - wm : np.ndarray, shape=(n_atoms*3,n_atoms*3) - whitening matrix - """ - # Previous implementation - # uwm = sqrtm(c00).real - # wm = inv(uwm).real - # return uwm, wm - - # Updated implementation - e, v = torch.symeig(torch.from_numpy(c00).double(), eigenvectors=True) - # In valid covariance matrix the smallest eigenvalue should be positive - # because the covariance matrix is a positive semidefinite matrix - # https://stats.stackexchange.com/questions/52976/is-a-sample-covariance-matrix-always-symmetric-and-positive-definite - assert torch.min(e) > 0.0 - d = torch.diag(1.0 / torch.sqrt(e)) - wm = torch.mm(torch.mm(v, d), v.t()) - return inv(wm.numpy()), wm.numpy() - - def apply_unwhitening(self, whitened, uwm, cm): - """ Apply whitening to XYZ coordinates. - - Parameters - ---------- - whitened : np.ndarray, shape=(n_frames,3*n_atoms) - Whitened XYZ coordinates of a trajectory. - wm : np.ndarray, shape=(n_atoms*3,n_atoms*3) - whitening matrix - cm : np.ndarray, shape=(3*n_atoms,) - Avg. center of mass of each atom across all trajectories. - - Returns - ------- - coords : np.ndarray, shape=(n_frames,3*n_atoms) - XYZ coordinates of a trajectory. 
- """ - # multiply each row in whitened by c00_sqrt - coords = np.einsum('ij,aj->ai', uwm, whitened) - coords += cm - return coords - - def apply_whitening(self, coords, wm, cm): - """ Apply whitening to XYZ coordinates. - - Parameters - ---------- - coords : np.ndarray, shape=(n_frames,3*n_atoms) - XYZ coordinates of a trajectory. - wm : np.ndarray, shape=(n_atoms*3,n_atoms*3) - whitening matrix - cm : np.ndarray, shape=(3*n_atoms,) - Avg. center of mass of each atom across all trajectories. - - Returns - ------- - whitened : np.ndarray, shape=(n_frames,3*n_atoms) - Whitened XYZ coordinates of a trajectory. - """ - # multiply each row in coords by inv_c00 - whitened = np.einsum('ij,aj->ai', wm, coords) - return whitened - - def _apply_whitening_xtc_fn(self, xtc_fn, top, outdir, wm, cm): - """Apply data whitening to a trajectory file - - - Parameters - ---------- - xtc_fn : str - Path to trajectory - top : md.Trajectory object - Topology corresponding to the trajectories - outdir : str - Directory to output whitened trajectory - wm : np.ndarray, shape=(n_atoms*3,n_atoms*3) - whitening matrix - cm : np.ndarray, shape=(3*n_atoms,) - Avg. center of mass of each atom across all trajectories. - """ - print("whiten", xtc_fn) - traj = md.load(xtc_fn, top=top) - - n = len(traj) - n_atoms = traj.top.n_atoms - coords = traj.xyz.reshape((n, 3 * n_atoms)) - coords -= cm - whitened = self.apply_whitening(coords, wm, cm) - dir, fn = os.path.split(xtc_fn) - new_fn = os.path.join(outdir, fn) - traj = md.Trajectory(whitened.reshape((n, n_atoms, 3)), top) - traj.save(new_fn) - - def apply_whitening_xtc_dir(self,xtc_dir, top, wm, cm, n_cores, outdir): - """Apply data whitening parallelized across many trajectories - - Parameters - ---------- - xtc_fn : list of str's - Paths to trajectories. - top : md.Trajectory object - Topology corresponding to the trajectories - outdir : str - Directory to output whitened trajectory - wm : np.ndarray, shape=(n_atoms*3,n_atoms*3) - whitening matrix - cm : np.ndarray, shape=(3*n_atoms,) - Avg. center of mass of each atom across all trajectories. - n_cores : int - Number of threads to parallelize task across. - """ - xtc_fns = np.sort(glob.glob(os.path.join(xtc_dir, "*.xtc"))) - - pool = mp.Pool(processes=n_cores) - f = functools.partial(self._apply_whitening_xtc_fn, top=top, outdir=outdir, wm=wm, cm=cm) - pool.map(f, xtc_fns) - pool.close() - - def run(self): - """Whiten existing processed trajectory data in self.data_dir - to calculate and write out a covariance matrix (c00.npy), a - whitening matrix (wm.npy) and an unwhitening matrix (uwm.npy). 
- """ - outdir = self.data_dir - whitened_dir = os.path.join(outdir,"whitened_xtcs") - mkdir(whitened_dir) - n_cores = mp.cpu_count() - traj_fns = get_fns(self.xtc_dir, "*.xtc") - master = md.load(os.path.join(outdir,"master.pdb")) - c00 = self.get_c00_xtc_list(traj_fns, master.top, self.cm, n_cores) - c00_fn = os.path.join(outdir,"c00.npy") - np.save(c00_fn, c00) - c00_fns = np.sort(glob.glob(os.path.join(self.xtc_dir, "cov*.npy"))) - for fn in c00_fns: - os.remove(fn) - uwm, wm = self.get_wuw_mats(c00) - uwm_fn = os.path.join(outdir, "uwm.npy") - np.save(uwm_fn, uwm) - wm_fn = os.path.join(outdir, "wm.npy") - np.save(wm_fn, wm) - #self.apply_whitening_xtc_dir(self.myNav.xtc_dir, master.top, wm, self.cm, n_cores, whitened_dir) - - - diff --git a/pensa/diffnets/exmax.py b/pensa/diffnets/exmax.py deleted file mode 100755 index 6358dd13..00000000 --- a/pensa/diffnets/exmax.py +++ /dev/null @@ -1,234 +0,0 @@ -""" -Copyright 2015 by Washington University in Saint Louis. Authored by S. Joshua Swamidass. -A license to use for strictly non-commerical use is granted. -Derivative work is not permited without prior written authorization. -All other rights reserved. -""" - -from scipy import inf, asarray, array, rand, zeros, prod, where, allclose -from scipy.stats import pearsonr - -def distribution_of_sum(P, ignore_idx=set()): - """ - Given a set of binomial random variables parameterized by a vector P. - Ignoring variables in ignore_idx... - What is the distribution of their sum? - - Output is the discreet distribution D, where the probability of a - specific sum s is D[s]. - O(N^2) time in length of P - - For example, using this P... - >>> P = [0.5, 0.25, 0.5] - - >>> distribution_of_sum(P) - array([ 0.1875, 0.4375, 0.3125, 0.0625]) - - Ignoring the 2nd (1 in zero indexing) variable, we have... - >>> distribution_of_sum(P, [1]) - array([ 0.25, 0.5 , 0.25, 0. ]) - """ - P = asarray(P) - N = P.shape[0] - P1 = 1 - P #convenience variable that is 1 minus P - - #starting distribution of the sum (no variables in summation) is D[0] = 1 and D[>0] = 0 - #in otherwords, we know the sum is zero. - D = zeros(N + 1) - D[0] = 1 - - - - for n in range(N): - if n in ignore_idx: - continue #skip P[n] if n in ignore_idx - - #Convolution of the distribution of the nth variable with the summation distribution for the first n-1 variables. - C = D * P1[n] - C[1:] += D[:-1] * P[n] - - D = C #the distribution of the sum of the first n variables - - #D is now the distribution of sums for all variables - return D - - -def expectation_range_CUBIC(P, lower, upper): - """ - Given a set of binomial random variables parameterized by a vector P. - Conditioned on the number successes between lower and upper (inclusive). - What is the expectation of each random variable? - - Output is a vector E of expectations. - O(N^3) time in length of P - - >>> R = rand(10) # a random vector of probabilities 10 elements long. - >>> - >>> lower, upper = 3, 6 - >>> - >>> EC = expectation_range_CUBIC(R, lower, upper) - >>> EE = expectation_range_EXP(R, lower, upper) - - >>> correlation, pvalue = pearsonr(EE, EC) - >>> correlation > .99 and pvalue < .01 - True - - This shows that both versions yield results that are > 99% correlated. 
- """ - P = asarray(P) - N = P.shape[0] - E = zeros(N) - if upper == 0: - return E - - D = distribution_of_sum(P) - PROB_IN_RANGE = sum(D[max(0, lower): min(N, upper + 1)]) - - for i in range(N): - #Implementation described in paper - D = distribution_of_sum(P, [i]) - PROB_IN_RANGE_CONDITION_ON_I = sum(D[max(0, lower - 1): min(N, upper)]) - E[i] = P[i] * PROB_IN_RANGE_CONDITION_ON_I / PROB_IN_RANGE - - #Alternative, equivalent implementation - #POS = sum(D[max(0, lower - 1): min(N, upper)]) * P[i] - #NEG = sum(D[max(0, lower): min(N, upper + 1)]) * (1 - P[i]) - #E[i] = POS/ (POS + NEG) - return E - - -def expectation_range_EXP(P, lower, upper): - """ - Given a set of binomial random variables parameterized by a vector P. - Condition on the number successes between lower and upper (inclusive). - What is the expectation of each random variable? - - This version is slow, but more conceptually clear. - - Output is a vector E of expectations. - O(2^N) time in length of P - - This version suffers from floating point error, and should not be used - for anything other than testing. - """ - P = asarray(P) - N = P.shape[0] - P1 = 1 - P - E = zeros(N) - if upper == 0: - return E - - import itertools - - D = 0 - - for S in itertools.product(*tuple([[1, 0]] * N)): #iterate over all binary vectors of length N - SUM = sum(S) - if SUM >= upper or SUM <= lower: - continue #skip the vectors without the right sum - - S = array(S) - - p = prod(where(S, P, P1)) #probability of S according to P - - E += p * S #summing up this vector's contribution to the final expectation - D += p #sum up this contribution to the denominator - - E = E / D #normalize by dividing by $\sum Pr(S)$ - return E - -def expectation_or_LINEAR(P, E_or): - """ - Given a set of binomial random variables parameterized by a vector P. - Conditioned on E[at least one success] = E_or - What is the expectation of each random variable? - - Output is a vector E of expectations. - - All the implementations should produce the same results. - - >>> R = rand(10) - >>> - >>> EL = expectation_or_LINEAR(R, 1) - >>> EC = expectation_or_CUBIC(R, 1) - >>> EE = expectation_E_EXP(R, 1) - - >>> correlation, pvalue = pearsonr(EL, EC) - >>> correlation > .99 and pvalue < .01 - True - - >>> allclose(EL, EE) - True - >>> correlation, pvalue = pearsonr(EL, EE) - >>> correlation > .99 and pvalue < .01 - True - - This shows that all versions yield results that are > 99% correlated. - - And we know the results for some simple cases. - - >>> expectation_or([0.5, 0.5], 1) - array([ 0.66666667, 0.66666667]) - >>> expectation_or([0.5, 0.5], .75) - array([ 0.5, 0.5]) - """ - P = asarray(P) - # boundary case that is easy to compuate and would cause problems if we - # let it pass through - if any(P == 1): - return P * E_or - - # probability of success at this index, but failure or success at all - # others - P1 = P - # probability of failure at this index, but success at one or more others - P0 = (1 - prod(1 - P) / (1 - P)) * (1 - P) - # given that at least one is success, what is the probability of success - # for this index - PS = P1 / (P1 + P0) - # but the true probability is E_or, so adjust expectations down and return - return PS * E_or - - -def expectation_or_CUBIC(P, E_or): - """ - Given a set of binomial random variables parameterized by a vector P. - Conditioned on E[at least one success] = E_or - What is the expectation of each random variable? - - Output is a vector E of expectations. 
- - alternate, equivalent implementation for error checking - the problem with this implementation is that it is O(N^3) time - """ - return expectation_range(P, 1, inf) * E_or - - -def expectation_E_EXP(P, E_or): - """ - Given a set of binomial random variables parameterized by a vector P. - Conditioned on E[at least one success] = E_or - What is the expectation of each random variable? - - Output is a vector E of expectations. - - alternate, equivalent implementation for error checking - the problem with this implementation is that it is exponential time - """ - P = asarray(P) - P1 = 1 - P - N = P.shape[0] - E = zeros(N) - import itertools - for S in itertools.product(*tuple([[1, 0]] * N)): #iterate over all binary vectors of length N - S = array(S) - p = prod(where(S, P, P1)) #compute the probability according to P of vector - E += p * S #accumulate the probability-weighted average - E = E * E_or / (1 - prod(P1)) #divide by the probability of getting at least 1 success and multiply times E_or - return E - - -expectation_or = expectation_or_LINEAR -expectation_range = expectation_range_CUBIC - - diff --git a/pensa/diffnets/nnutils.py b/pensa/diffnets/nnutils.py deleted file mode 100755 index 7c93cd9c..00000000 --- a/pensa/diffnets/nnutils.py +++ /dev/null @@ -1,596 +0,0 @@ -# supervised autoencoders -import mdtraj as md -import multiprocessing as mp -import numpy as np -import os -import itertools -import torch -import torch.nn as nn -import torch.nn.functional as F - -from torch.autograd import Variable - -class split_ae(nn.Module): - """Unsupervised autoencoder with a split input (i.e. 2 encoders) - - Parameters - ---------- - layer_sizes : list - List of integers indicating the size of each layer in the - encoder including the latent layer. First two must be identical. - inds1 : np.ndarray - Indices in the training input array that go into encoder1. - inds2 : np.ndarray - Indices in the training input array that go into encoder2. 
- wm : np.ndarray, shape=(n_inputs,n_inputs) - Whitening matrix -- is applied to input data - uwm : np.ndarray, shape=(n_inputs,n_inputs) - unwhitening matrix - """ - - def __init__(self,layer_sizes,inds1,inds2,wm,uwm): - super(split_ae, self).__init__() - self.sizes = layer_sizes - self.n = len(self.sizes) - self.inds1 = inds1 - self.inds2 = inds2 - self.n_features = len(self.inds1)+len(self.inds2) - self.wm1 = wm[self.inds1[:,None],self.inds1] - self.wm2 = wm[self.inds2[:,None],self.inds2] - self.uwm = uwm - self.ratio = len(inds1)/(len(inds1)+len(inds2)) - - self.encoder1 = nn.ModuleList() - self.encoder2 = nn.ModuleList() - self.encoder1.append(nn.Linear(len(inds1),len(inds1))) - self.encoder2.append(nn.Linear(len(inds2),len(inds2))) - for i in range(1,self.n-1): - small_layer_in = int(np.ceil(self.sizes[i]*self.ratio)) - print(small_layer_in) - small_layer_out = int(np.ceil(self.sizes[i+1]*self.ratio)) - print(small_layer_out) - big_layer_in = int(np.floor(self.sizes[i] * (1-self.ratio))) - print(big_layer_in) - big_layer_out = int(np.floor(self.sizes[i+1] * (1-self.ratio))) - print(big_layer_out) - #if small_layer_in < 3: - # small_layer_in = 3 - # big_layer_in = self.sizes[i]-3 - #if small_layer_out < 3: - # small_layer_out = 3 - # big_layer_out = self.sizes[i]-3 - - self.encoder1.append(nn.Linear(small_layer_in, small_layer_out)) - self.encoder2.append(nn.Linear(big_layer_in,big_layer_out)) - - self.decoder = nn.ModuleList() - for i in range(self.n-1,0,-1): - self.decoder.append(nn.Linear(self.sizes[i], self.sizes[i-1])) - - @property - def split_inds(self): - return True - - def freeze_weights(self,old_net=None): - """Procedure to make the whitening matrix and unwhitening matrix - as untrainable layers. Additionally, freezes weights associated - with a previously learned encoder layer. - - Parameters - ---------- - old_net : split_ae object - Previously trained network with overlapping architecture. Weights - learned in this previous networks encoder will be frozen in the - new network. - """ - vwm = Variable(torch.from_numpy(self.wm1).type(torch.FloatTensor)) - self.encoder1[0].weight.data = vwm - vz = Variable(torch.from_numpy(np.zeros(len(self.inds1))).type(torch.FloatTensor)) - self.encoder1[0].bias.data = vz - vwm2 = Variable(torch.from_numpy(self.wm2).type(torch.FloatTensor)) - self.encoder2[0].weight.data = vwm2 - vz2 = Variable(torch.from_numpy(np.zeros(len(self.inds2))).type(torch.FloatTensor)) - self.encoder2[0].bias.data = vz2 - for p in self.encoder1[0].parameters(): - p.requires_grad = False - for p in self.encoder2[0].parameters(): - p.requires_grad = False - self.decoder[-1].weight.data = Variable(torch.from_numpy(self.uwm).type(torch.FloatTensor)) - self.decoder[-1].bias.data = Variable(torch.from_numpy(np.zeros(self.n_features)).type(torch.FloatTensor)) - for p in self.decoder[-1].parameters(): - p.requires_grad = False - - if old_net: - n_old = len(old_net.encoder1) - for i in range(1,n_old): - self.encoder1[i].weight.data = old_net.encoder1[i].weight.data - self.encoder1[i].bias.data = old_net.encoder1[i].bias.data - for p in self.encoder1[i].parameters(): - p.requires_grad = False - self.encoder2[i].weight.data = old_net.encoder2[i].weight.data - self.encoder2[i].bias.data = old_net.encoder2[i].bias.data - for p in self.encoder2[i].parameters(): - p.requires_grad = False - - def unfreeze_weights(self): - """Makes all encoders weights trainable. 
- """ - n_old = len(self.encoder1) - for i in range(1,n_old): - for p in self.encoder1[i].parameters(): - p.requires_grad = True - for p in self.encoder2[i].parameters(): - p.requires_grad = True - - def encode(self,x): - """Pass the data through the encoder to the latent layer. - - Parameters - ---------- - x : torch.cuda.FloatTensor or torch.FloatTensor - Input data for a given sample - - Returns - ------- - lat1 : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector associated with encoder1 - lat2 : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector associated with encoder2 - """ - x1 = x[:,self.inds1] - x2 = x[:,self.inds2] - lat1 = self.encoder1[0](x1) - lat2 = self.encoder2[0](x2) - for i in range(1, self.n-1): - lat1 = F.leaky_relu(self.encoder1[i](lat1)) - lat2 = F.leaky_relu(self.encoder2[i](lat2)) - return lat1, lat2 - - def decode(self,latent): - """Pass the latent space vector through the decoder - - Parameters - ---------- - latent : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector - - Returns - ------- - recon : torch.cuda.FloatTensor or torch.FloatTensor - Reconstruction of the original input data - """ - recon = latent - for i in range(self.n-2): - recon = F.leaky_relu(self.decoder[i](recon)) - recon = self.decoder[-1](recon) - return recon - - def forward(self,x): - """Pass data through the entire network - - Parameters - ---------- - x : torch.cuda.FloatTensor or torch.FloatTensor - Input data for a given sample - - Returns - ------- - recon : torch.cuda.FloatTensor or torch.FloatTensor - Reconstruction of the original input data - latent : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector - """ - lat1, lat2 = self.encode(x) - latent = torch.cat((lat1,lat2),1) - recon = self.decode(latent) - - return recon, latent, None - -class split_sae(split_ae): - """Supervised autoencoder with split architecture""" - def __init__(self, layer_sizes,inds1,inds2,wm,uwm): - super(split_sae, self).__init__(layer_sizes,inds1,inds2,wm,uwm) - - self.classifier = nn.Linear(self.encoder1[-1].weight.data.shape[0], 1) - - def classify(self, latent): - """Perfom classification task using latent space representation - - Parameters - ---------- - latent : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector - - Returns - ------- - Value between 0 and 1 - """ - return torch.sigmoid(self.classifier(latent)) - - def forward(self, x): - """Pass data through the entire network - - Parameters - ---------- - x : torch.cuda.FloatTensor or torch.FloatTensor - Input data for a given sample - - Returns - ------- - recon : torch.cuda.FloatTensor or torch.FloatTensor - Reconstruction of the original input data - latent : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector - label : - Value between 0 and 1 - """ - lat1, lat2 = self.encode(x) - - label = self.classify(lat1) - - latent = torch.cat((lat1,lat2),1) - - recon = self.decode(latent) - return recon, latent, label - - -class ae(nn.Module): - """Unsupervised autoencoder - - Parameters - ---------- - layer_sizes : list - List of integers indicating the size of each layer in the - encoder including the latent layer. First two must be identical. 
-    wm : np.ndarray, shape=(n_inputs,n_inputs)
-        Whitening matrix -- is applied to input data
-    uwm : np.ndarray, shape=(n_inputs,n_inputs)
-        Unwhitening matrix
-    """
-    def __init__(self, layer_sizes,wm,uwm):
-        super(ae, self).__init__()
-        self.sizes = layer_sizes
-        self.n = len(self.sizes)
-        self.wm = wm
-        self.uwm = uwm
-
-        self.encoder = nn.ModuleList()
-        for i in range(self.n-1):
-            self.encoder.append(nn.Linear(self.sizes[i], self.sizes[i+1]))
-
-        self.decoder = nn.ModuleList()
-        for i in range(self.n-1, 0, -1):
-            self.decoder.append(nn.Linear(self.sizes[i], self.sizes[i-1]))
-
-    def freeze_weights(self,old_net=None):
-        """Set the whitening and unwhitening matrices as untrainable layers.
-        Additionally, freezes weights associated with a previously learned
-        encoder layer.
-
-        Parameters
-        ----------
-        old_net : ae object
-            Previously trained network with overlapping architecture. Weights
-            learned in this previous network's encoder will be frozen in the
-            new network.
-        """
-        self.encoder[0].weight.data = Variable(torch.from_numpy(self.wm).type(torch.FloatTensor))
-        self.encoder[0].bias.data = Variable(torch.from_numpy(np.zeros(len(self.wm))).type(torch.FloatTensor))
-        for p in self.encoder[0].parameters():
-            p.requires_grad = False
-        self.decoder[-1].weight.data = Variable(torch.from_numpy(self.uwm).type(torch.FloatTensor))
-        self.decoder[-1].bias.data = Variable(torch.from_numpy(np.zeros(len(self.uwm))).type(torch.FloatTensor))
-        for p in self.decoder[-1].parameters():
-            p.requires_grad = False
-
-        if old_net:
-            n_old = len(old_net.encoder)
-            for i in range(1,n_old):
-                self.encoder[i].weight.data = old_net.encoder[i].weight.data
-                self.encoder[i].bias.data = old_net.encoder[i].bias.data
-                for p in self.encoder[i].parameters():
-                    p.requires_grad = False
-
-    def unfreeze_weights(self):
-        """Makes all encoder weights trainable.
-        """
-        n_old = len(self.encoder)
-        for i in range(1,n_old):
-            for p in self.encoder[i].parameters():
-                p.requires_grad = True
-
-    def encode(self, x):
-        """Pass the data through the encoder to the latent layer.
- - Parameters - ---------- - x : torch.cuda.FloatTensor or torch.FloatTensor - Input data for a given sample - - Returns - ------- - latent : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector associated with encoder1 - """ - # whiten, without applying non-linearity - latent = self.encoder[0](x) - - # do non-linear layers - for i in range(1, self.n-1): - latent = F.leaky_relu(self.encoder[i](latent)) - - return latent - - def decode(self, x): - """Pass the latent space vector through the decoder - - Parameters - ---------- - x : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector - - Returns - ------- - recon : torch.cuda.FloatTensor or torch.FloatTensor - Reconstruction of the original input data - """ - # do non-linear layers - recon = x - for i in range(self.n-2): - recon = F.leaky_relu(self.decoder[i](recon)) - - # unwhiten, without applying non-linearity - recon = self.decoder[-1](recon) - - return recon - - def forward(self, x): - """Pass data through the entire network - - Parameters - ---------- - x : torch.cuda.FloatTensor or torch.FloatTensor - Input data for a given sample - - Returns - ------- - recon : torch.cuda.FloatTensor or torch.FloatTensor - Reconstruction of the original input data - latent : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector - """ - latent = self.encode(x) - recon = self.decode(latent) - # None for labels, so number returns same as sae class - return recon, latent, None - - -# build classifier based on latent representation from an AE -class classify_ae(nn.Module): - """Logistic Regression model - - Parameters - ---------- - n_latent : int - Number of latent variables - """ - def __init__(self, n_latent): - super(classify_ae, self).__init__() - self.n_latent = n_latent - - self.fc1 = nn.Linear(self.n_latent, 1) - - def classify(self, x): - """Perfom classification task using latent space representation - - Parameters - ---------- - x : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector - - Returns - ------- - Value between 0 and 1 - """ - return torch.sigmoid(self.fc1(x)) - - def forward(self, x): - """Perfom classification task using latent space representation - - Parameters - ---------- - x : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector - - Returns - ------- - Value between 0 and 1 - """ - return self.classify(x) - - -class sae(ae): - """Supervised autoencoder - - Parameters - ---------- - layer_sizes : list - List of integers indicating the size of each layer in the - encoder including the latent layer. First two must be identical. 
- wm : np.ndarray, shape=(n_inputs,n_inputs) - Whitening matrix -- is applied to input data - uwm : np.ndarray, shape=(n_inputs,n_inputs) - unwhitening matrix - """ - def __init__(self, layer_sizes, wm, uwm): - super(sae, self).__init__(layer_sizes,wm,uwm) - - self.classifier = nn.Linear(self.sizes[-1], 1) - - def classify(self, latent): - """Perfom classification task using latent space representation - - Parameters - ---------- - latent : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector - - Returns - ------- - Value between 0 and 1 - """ - return torch.sigmoid(self.classifier(latent)) - - def forward(self, x): - """Pass through the entire network - - Parameters - ---------- - x : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector - - Returns - ------- - recon : torch.cuda.FloatTensor or torch.FloatTensor - Reconstruction of the original input data - latent : torch.cuda.FloatTensor or torch.FloatTensor - Latent space vector - label : - Value between 0 and 1 - """ - latent = self.encode(x) - label = self.classify(latent) - recon = self.decode(latent) - return recon, latent, label - - -class vae(ae): - def __init__(self, layer_sizes): - super(vae, self).__init__(layer_sizes) - - # last layer of encoder is mu, also need logvar of equal size - self.logvar = nn.Linear(self.sizes[-2], self.sizes[-1]) - - def encode(self, x): - # whiten, without applying non-linearity - latent = self.encoder[0](x) - - # do non-linear layers - for i in range(1, self.n-2): - latent = F.leaky_relu(self.encoder[i](latent)) - - mu = F.leaky_relu(self.encoder[-1](latent)) - logvar = F.leaky_relu(self.logvar(latent)) - return mu, logvar - - def reparameterize(self, mu, logvar): - std = torch.exp(0.5*logvar) - eps = torch.randn_like(std) - return eps.mul(std).add_(mu) - - def forward(self, x): - mu, logvar = self.encode(x) - z = self.reparameterize(mu, logvar) - recon = self.decode(z) - # None for labels, so number returns same as sae class - return recon, mu, logvar, None - - -class svae(vae): - def __init__(self, layer_sizes): - super(svae, self).__init__(layer_sizes) - - self.classifier = nn.Linear(self.sizes[-1], 1) - - def classify(self, latent): - return torch.sigmoid(self.classifier(latent)) - - def forward(self, x): - mu, logvar = self.encode(x) - z = self.reparameterize(mu, logvar) - label = self.classify(z) - recon = self.decode(z) - # None for labels, so number returns same as sae class - return recon, mu, logvar, label - - -def my_mse(x, x_recon): - """Calculate mean squared error loss - - Parameters - ---------- - x : torch.cuda.FloatTensor or torch.FloatTensor - Input data - x_recon : torch.cuda.FloatTensor or torch.FloatTensor - Reconstructed input - - Returns - ------- - torch.cuda.FloatTensor or torch.FloatTensor - """ - return torch.mean(torch.pow(x-x_recon, 2)) - - -def my_l1(x, x_recon): - """Calculate l1 loss - - Parameters - ---------- - x : torch.cuda.FloatTensor or torch.FloatTensor - Input data - x_recon : torch.cuda.FloatTensor or torch.FloatTensor - Reconstructed input - - Returns - ------- - torch.cuda.FloatTensor or torch.FloatTensor - """ - return torch.mean(torch.abs(x-x_recon)) - -def chunks(arr, chunk_size): - """Yield successive chunk_size chunks from arr.""" - for i in range(0, len(arr), chunk_size): - yield arr[i:i + chunk_size] - -def split_inds(pdb,resnum,focus_dist): - """Identify indices close and far from a residue of interest. - Each index corresponds to an X,Y, or Z coordinate of an atom - in the pdb. 
- - Parameters - ---------- - pdb : md.Trajectory object - Structure used to find close/far indices. - resnum : integer - The residue number of interest. - focus_dist : float (nannmeters) - All indices within this distance of resnum will be selected - as close indices. - - Returns - ------- - close_xyz_inds : np.ndarray - Indices of x,y,z positions of atoms in pdb that are close - to resnum. - non_close_xyz_inds : np.ndarray - Indices of x,y,z positions of atoms in pdb that are not - close to resnum. - """ - res_atoms = pdb.topology.select("resSeq %s" % resnum) - dist_combos = [res_atoms,np.arange(pdb.n_atoms)] - dist_combos = np.array(list(itertools.product(*dist_combos))) - - dpdb = md.compute_distances(pdb,dist_combos) - ind_loc = np.where(dpdb.flatten() 0.0 - - # Covariance matrix should be symmetric - assert np.allclose(cov, cov.T) - - uwm, wm = w.get_wuw_mats(cov.astype(np.double)) - - # ZCA whitening matrix should be symmteric - assert np.allclose(wm, wm.T) - - # assert that whitening and whitening produce the identity matrix - assert_matrix_is_identity(np.matmul(wm, uwm)) - - # assert that the covariance matrix multiplied by the transpose of - # the whitening matrix multiplied by the whitening matrix is identity - # for detailed explanation see page 3 of https://arxiv.org/pdf/1512.00809.pdf - test = np.matmul(cov, np.matmul(wm.transpose(), wm)) - assert_matrix_is_identity(test) - - diff --git a/pensa/diffnets/tests/test_cli.py b/pensa/diffnets/tests/test_cli.py deleted file mode 100644 index 4687ec4e..00000000 --- a/pensa/diffnets/tests/test_cli.py +++ /dev/null @@ -1,165 +0,0 @@ -import os -import shutil -import subprocess -import tempfile -import mdtraj as md -import numpy as np -from diffnets.utils import get_fns - -CURR_DIR = os.getcwd() -UP_DIR = CURR_DIR[:-len(CURR_DIR.split('/')[-1])] -CLI_DIR = UP_DIR + 'cli' - -def test_preprocess_default_inds(): - curr_dir = os.getcwd() - try: - td = tempfile.mkdtemp(dir=curr_dir) - ftmp = tempfile.NamedTemporaryFile(delete=False) - traj_dirs_tmp = ftmp.name + ".npy" - inp = np.array([os.path.join(curr_dir,"data/traj1"), - os.path.join(curr_dir,"data/traj2")]) - np.save(traj_dirs_tmp, inp, allow_pickle=False) - - ftmp2 = tempfile.NamedTemporaryFile(delete=False) - pdb_fns_tmp = ftmp2.name + ".npy" - inp = np.array([os.path.join(curr_dir,"data/beta-peptide1.pdb"), - os.path.join(curr_dir,"data/beta-peptide2.pdb")]) - np.save(pdb_fns_tmp, inp, allow_pickle=False) - - subprocess.call(['python', CLI_DIR + "/main.py", "process", - traj_dirs_tmp, pdb_fns_tmp, td]) - - assert os.path.exists(os.path.join(td,"wm.npy")) - assert os.path.exists(os.path.join(td,"uwm.npy")) - assert os.path.exists(os.path.join(td,"master.pdb")) - assert os.path.exists(os.path.join(td,"data")) - - xtc_fns = os.path.join(td,"aligned_xtcs") - data_fns = get_fns(xtc_fns,"*.xtc") - ind_fns = os.path.join(td,"indicators") - inds = get_fns(ind_fns,"*.npy") - - print(len(data_fns)) - assert len(data_fns) == len(inds) - - finally: - os.remove(traj_dirs_tmp) - os.remove(pdb_fns_tmp) - shutil.rmtree(td) - -def test_preprocess_custom_inds(): - curr_dir = os.getcwd() - try: - td = tempfile.mkdtemp(dir=curr_dir) - ftmp = tempfile.NamedTemporaryFile(delete=False) - traj_dirs_tmp = ftmp.name + ".npy" - inp = np.array([os.path.join(curr_dir,"data/traj1"), - os.path.join(curr_dir,"data/traj2")]) - np.save(traj_dirs_tmp, inp, allow_pickle=False) - - ftmp2 = tempfile.NamedTemporaryFile(delete=False) - pdb_fns_tmp = ftmp2.name + ".npy" - inp = 
np.array([os.path.join(curr_dir,"data/beta-peptide1.pdb"), - os.path.join(curr_dir,"data/beta-peptide2.pdb")]) - np.save(pdb_fns_tmp, inp, allow_pickle=False) - - ftmp3 = tempfile.NamedTemporaryFile(delete=False) - inds_fn_tmp = ftmp3.name + ".npy" - pdb = md.load(inp[0]) - inds = pdb.top.select("name CA or name N or name CB or name C") - both_inds = np.array([inds,inds]) - np.save(inds_fn_tmp, both_inds, allow_pickle=False) - - subprocess.call(['python', CLI_DIR + "/main.py", "process", - traj_dirs_tmp, pdb_fns_tmp, td, - "-a" + inds_fn_tmp]) - - assert os.path.exists(os.path.join(td,"wm.npy")) - assert os.path.exists(os.path.join(td,"uwm.npy")) - assert os.path.exists(os.path.join(td,"master.pdb")) - assert os.path.exists(os.path.join(td,"data")) - - xtc_fns = os.path.join(td,"aligned_xtcs") - data_fns = get_fns(xtc_fns,"*.xtc") - ind_fns = os.path.join(td,"indicators") - inds = get_fns(ind_fns,"*.npy") - - print(len(data_fns)) - assert len(data_fns) == len(inds) - - finally: - os.remove(traj_dirs_tmp) - os.remove(pdb_fns_tmp) - os.remove(inds_fn_tmp) - shutil.rmtree(td) - -def test_train(): - curr_dir = os.getcwd() - try: - td = tempfile.mkdtemp(dir=curr_dir) - ftmp = tempfile.NamedTemporaryFile(delete=False,mode="w+") - - params =["data_dir: '%s/data/whitened'" % curr_dir, - "n_epochs: 4", - "act_map: [0,1]", - "lr: 0.0001", - "n_latent: 10", - "hidden_layer_sizes: [50]", - "em_bounds: [[0.1,0.3],[0.6,0.9]]", - "do_em: True", - "em_batch_size: 50", - "nntype: 'nnutils.sae'", - "batch_size: 32", - "batch_output_freq: 50", - "epoch_output_freq: 2", - "test_batch_size: 50", - "frac_test: 0.1", - "subsample: 10", - "outdir: %s" % td, - "data_in_mem: False" - ] - - for line in params: - ftmp.write(line) - ftmp.write("\n") - ftmp.close() - - subprocess.call(['python', CLI_DIR + "/main.py", "train", - ftmp.name]) - - assert os.path.exists(os.path.join(td,"nn_best_polish.pkl")) - finally: - os.remove(ftmp.name) - shutil.rmtree(td) - -def test_analyze(): - curr_dir = os.getcwd() - try: - subprocess.call(['python', CLI_DIR + "/main.py", "analyze", - "%s/data/whitened" % curr_dir, - "%s/data/trained_output" % curr_dir, - "-c", "20"]) - - assert os.path.exists(os.path.join("%s/data/trained_output" % curr_dir, - "rescorr-100.pml")) - assert os.path.exists(os.path.join("%s/data/trained_output/rmsd.npy" - % curr_dir)) - assert os.path.exists(os.path.join("%s/data/trained_output/labels" - % curr_dir)) - assert os.path.exists(os.path.join("%s/data/trained_output/encodings" - % curr_dir)) - assert os.path.exists(os.path.join("%s/data/trained_output/cluster_20" - % curr_dir)) - assert os.path.exists(os.path.join("%s/data/trained_output/recon_trajs" - % curr_dir)) - assert os.path.exists(os.path.join("%s/data/trained_output/morph_label" - % curr_dir)) - - finally: - shutil.rmtree(os.path.join("%s/data/trained_output/encodings" % curr_dir)) - shutil.rmtree(os.path.join("%s/data/trained_output/labels" % curr_dir)) - shutil.rmtree(os.path.join("%s/data/trained_output/cluster_20" % curr_dir)) - shutil.rmtree(os.path.join("%s/data/trained_output/recon_trajs" % curr_dir)) - shutil.rmtree(os.path.join("%s/data/trained_output/morph_label" % curr_dir)) - os.remove(os.path.join("%s/data/trained_output/rmsd.npy" % curr_dir)) - os.remove(os.path.join("%s/data/trained_output/rescorr-100.pml" % curr_dir)) diff --git a/pensa/diffnets/tests/test_diffnets.py b/pensa/diffnets/tests/test_diffnets.py deleted file mode 100644 index 307572e5..00000000 --- a/pensa/diffnets/tests/test_diffnets.py +++ /dev/null @@ -1,12 +0,0 @@ 
-""" -Unit and regression test for the diffnets package. -""" - -# Import package, test suite, and other packages as needed -import diffnets -import pytest -import sys - -def test_diffnets_imported(): - """Sample test, will always pass so long as import statement worked""" - assert "diffnets" in sys.modules diff --git a/pensa/diffnets/training.py b/pensa/diffnets/training.py deleted file mode 100644 index bb6216e1..00000000 --- a/pensa/diffnets/training.py +++ /dev/null @@ -1,556 +0,0 @@ -import os -import pickle -import sys -import multiprocessing as mp -import mdtraj as md -import numpy as np -from . import exmax, nnutils, utils, data_processing -import copy -import pickle - -import torch -import torch.nn as nn -import torch.optim as optim -from torch.autograd import Variable -from torch.utils import data as torch_data - -class Dataset(torch_data.Dataset): - 'Characterizes a dataset for PyTorch' - def __init__(self, train_inds, labels, data): - 'Initialization' - self.labels = labels - self.train_inds = train_inds - self.data = data - - def __len__(self): - 'Denotes the total number of samples' - return len(self.train_inds) - - def __getitem__(self, index): - 'Generates one sample of data' - #If data needs to be loaded - ID = self.train_inds[index] - if type(self.data) is str: - # Load data and get label - X = torch.load(self.data + "/ID-%s" % ID + '.pt') - else: - X = torch.from_numpy(self.data[ID]).type(torch.FloatTensor) - y = self.labels[ID] - - return X, y, ID - - - -class Trainer: - - def __init__(self,job): - """Object to train your DiffNet. - - Parameters: - ----------- - job : dict - Dictionary with all training parameters. See training_dict.txt - for all keys. All keys are required. See train_submit.py for an - example. - """ - self.job = job - - def set_training_data(self, job, train_inds, test_inds, labels, data): - """Construct generators out of the dataset for training, validation, - and expectation maximization. - - Parameters - ---------- - job : dict - See training_dict.tx for all keys. 
-        train_inds : np.ndarray
-            Indices in data that are to be trained on
-        test_inds : np.ndarray
-            Indices in data that are to be validated on
-        labels : np.ndarray
-            Classification labels used for training
-        data : np.ndarray, shape=(n_frames,3*n_atoms) OR str to path
-            All data
-        """
-
-        batch_size = job['batch_size']
-        cpu_cores = job['em_n_cores']
-        test_batch_size = job['test_batch_size']
-        em_batch_size = job['em_batch_size']
-        subsample = job['subsample']
-        data_dir = job["data_dir"]
-
-        n_train_inds = len(train_inds)
-        random_inds = np.random.choice(np.arange(n_train_inds),int(n_train_inds/subsample),replace=False)
-        sampler=torch_data.SubsetRandomSampler(random_inds)
-
-        params_t = {'batch_size': batch_size,
-                    'shuffle':False,
-                    'num_workers': cpu_cores,
-                    'sampler': sampler}
-
-        params_v = {'batch_size': test_batch_size,
-                    'shuffle':True,
-                    'num_workers': cpu_cores}
-
-        params_e = {'batch_size': em_batch_size,
-                    'shuffle':True,
-                    'num_workers': cpu_cores}
-
-        n_snapshots = len(train_inds) + len(test_inds)
-
-        training_set = Dataset(train_inds, labels, data)
-        training_generator = torch_data.DataLoader(training_set, **params_t)
-
-        validation_set = Dataset(test_inds, labels, data)
-        validation_generator = torch_data.DataLoader(validation_set, **params_v)
-
-        em_set = Dataset(train_inds, labels, data)
-        em_generator = torch_data.DataLoader(em_set, **params_e)
-
-        return training_generator, validation_generator, em_generator
-
-    def em_parallel(self, net, em_generator, train_inds, em_batch_size,
-                    indicators, em_bounds, em_n_cores, label_str, epoch):
-        """Use expectation maximization to update all training classification
-        labels.
-
-        Parameters
-        ----------
-        net : nnutils neural network object
-            Neural network
-        em_generator : Dataset object
-            Training data
-        train_inds : np.ndarray
-            Indices in data that are to be trained on
-        em_batch_size : int
-            Number of examples that have their classification labels
-            updated in a single round of expectation maximization.
-        indicators : np.ndarray, shape=(len(data),)
-            Value to indicate which variant each data frame came from.
-        em_bounds : np.ndarray, shape=(n_variants,2)
-            A range that sets what fraction of conformations you
-            expect a variant to have the biochemical property. Rank order
-            of variants is more important than the ranges themselves.
-        em_n_cores : int
-            CPU cores to use for expectation maximization calculation
-        label_str : str or int
-            Label used for naming the temporary label files written to outdir.
-        epoch : int
-            Current training epoch, also used for file naming.
-
-        Returns
-        -------
-        new_labels : np.ndarray, shape=(len(data),)
-            Updated classification labels for all training examples
-        """
-        use_cuda = torch.cuda.is_available()
-        device = torch.device("cuda:0" if use_cuda else "cpu")
-
-        n_em = np.ceil(train_inds.shape[0]*1.0/em_batch_size)
-        freq_output = np.floor(n_em/10.0)
-        train_inds = []
-        inputs = []
-        i = 0
-        ##To save DiffNet labels before each EM update
-        pred_labels = -1 * np.ones(indicators.shape[0])
-        for local_batch, local_labels, t_inds in em_generator:
-            t_inds = np.array(t_inds)
-            local_batch, local_labels = local_batch.to(device), local_labels.to(device)
-            if hasattr(net, "decode"):
-                if hasattr(net, "reparameterize"):
-                    x_pred, latent, logvar, class_pred = net(local_batch)
-                else:
-                    x_pred, latent, class_pred = net(local_batch)
-            else:
-                class_pred = net(local_batch)
-            cur_labels = class_pred.cpu().detach().numpy()
-            pred_labels[t_inds] = cur_labels.flatten()
-            inputs.append([cur_labels, indicators[t_inds], em_bounds])
-            if i % freq_output == 0:
-                print(" %d/%d" % (i, n_em))
-            i += 1
-            train_inds.append(t_inds)
-
-        pred_label_fn = os.path.join(self.job['outdir'],"tmp_labels_%s_%s.npy" % (label_str,epoch))
-        np.save(pred_label_fn,pred_labels)
-        pool = mp.Pool(processes=em_n_cores)
-        res = pool.map(self.apply_exmax, inputs)
-        pool.close()
-
-        train_inds = np.concatenate(np.array(train_inds))
-        new_labels = -1 * np.ones((indicators.shape[0], 1))
-        new_labels[train_inds] = np.concatenate(res)
-        return new_labels
-
-    def apply_exmax(self, inputs):
-        """Apply expectation maximization to a batch of data.
-
-        Parameters
-        ----------
-        inputs : list
-            list where the 0th index is a list of current classification
-            labels of length == batch_size. 1st index is a corresponding
-            list of variant simulation indicators. 2nd index is em_bounds.
- - Returns - ------- - Updated labels -- length == batch size - """ - cur_labels, indicators, em_bounds = inputs - n_vars = em_bounds.shape[0] - - for i in range(n_vars): - inds = np.where(indicators == i)[0] - lower = np.int(np.floor(em_bounds[i, 0] * inds.shape[0])) - upper = np.int(np.ceil(em_bounds[i, 1] * inds.shape[0])) - cur_labels[inds] = exmax.expectation_range_CUBIC(cur_labels[inds], lower, upper).reshape(cur_labels[inds].shape) - - bad_inds = np.where(np.isnan(cur_labels)) - cur_labels[bad_inds] = 0 - try: - assert((cur_labels >= 0.).all() and (cur_labels <= 1.).all()) - except AssertionError: - neg_inds = np.where(cur_labels<0)[0] - pos_inds = np.where(cur_labels>1)[0] - bad_inds = neg_inds.tolist() + pos_inds.tolist() - for iis in bad_inds: - print(" ", indicators[iis], cur_labels[iis]) - print(" #bad neg, pos", len(neg_inds), len(pos_inds)) - #np.save("tmp.npy", tmp_labels) - cur_labels[neg_inds] = 0.0 - cur_labels[pos_inds] = 1.0 - #sys.exit(1) - return cur_labels.reshape((cur_labels.shape[0], 1)) - - def train(self, data, training_generator, validation_generator, em_generator, - targets, indicators, train_inds, test_inds,net, label_str, - job, lr_fact=1.0): - """Core method for training - - Parameters - ---------- - data : np.ndarray, shape=(n_frames,3*n_atoms) OR str to path - Training data - training_generator: Dataset object - Generator to sample training data - validation_generator: Dataset object - Generator to sample validation data - em_generator: Dataset object - Generator to sample training data in batches for expectation - maximization - targets : np.ndarray, shape=(len(data),) - classification labels used for training - indicators : np.ndarray, shape=(len(data),) - Value to indicate which variant each data frame came from. - train_inds : np.ndarray - Indices in data that are to be trained on - test_inds : np.ndarray - Indices in data that are to be validated on - net : nnutils neural network object - Neural network - label_str: int - For file naming. Indicates what iteration of training we're - on. Training goes through several iterations where neural net - architecture is progressively built deeper. - job : dict - See training_dict.tx for all keys. - lr_fact : float - Factor to multiply the learning rate by. - - Returns - ------- - best_nn : nnutils neural network object - Neural network that has the lowest reconstruction error - on the validation set. - targets : np.ndarry, shape=(len(data),) - Classification labels after training. 
- """ - job = self.job - do_em = job['do_em'] - n_epochs = job['n_epochs'] - lr = job['lr'] * lr_fact - subsample = job['subsample'] - batch_size = job['batch_size'] - batch_output_freq = job['batch_output_freq'] - epoch_output_freq = job['epoch_output_freq'] - test_batch_size = job['test_batch_size'] - em_bounds = job['em_bounds'] - nntype = job['nntype'] - em_batch_size = job['em_batch_size'] - em_n_cores = job['em_n_cores'] - outdir = job['outdir'] - - use_cuda = torch.cuda.is_available() - device = torch.device("cuda:0" if use_cuda else "cpu") - - n_test = test_inds.shape[0] - lam_cls = 1.0 - lam_corr = 1.0 - - n_batch = np.ceil(train_inds.shape[0]*1.0/subsample/batch_size) - - optimizer = optim.Adam(net.parameters(), lr=lr) - bce = nn.BCELoss() - training_loss_full = [] - test_loss_full = [] - epoch_test_loss = [] - best_loss = np.inf - best_nn = None - for epoch in range(n_epochs): - # go through mini batches - running_loss = 0 - i = 0 - for local_batch, local_labels, _ in training_generator: - if use_cuda: - local_labels = local_labels.type(torch.cuda.FloatTensor) - else: - local_labels = local_labels.type(torch.FloatTensor) - local_batch, local_labels = local_batch.to(device), local_labels.to(device) - - optimizer.zero_grad() - x_pred, latent, class_pred = net(local_batch) - loss = nnutils.my_mse(local_batch, x_pred) - loss += nnutils.my_l1(local_batch, x_pred) - if class_pred is not None: - loss += bce(class_pred, local_labels).mul_(lam_cls) - #Minimize correlation between latent variables - n_feat = net.sizes[-1] - my_c00 = torch.einsum('bi,bo->io', (latent, latent)).mul(1.0/local_batch.shape[0]) - my_mean = torch.mean(latent, 0) - my_mean = torch.einsum('i,o->io', (my_mean, my_mean)) - ide = np.identity(n_feat) - if use_cuda: - ide = torch.from_numpy(ide).type(torch.cuda.FloatTensor) - else: - ide = torch.from_numpy(ide).type(torch.FloatTensor) - #ide = Variable(ide) - #ide = torch.from_numpy(np.identity(n_feat)) - #ide = ide.to(device) - zero_inds = np.where(1-ide.cpu().numpy()>0) - corr_penalty = nnutils.my_mse(ide[zero_inds], my_c00[zero_inds]-my_mean[zero_inds]) - loss += corr_penalty - loss.backward() - optimizer.step() - running_loss += loss.item() - - if i%batch_output_freq == 0: - train_loss = running_loss - if i != 0: - train_loss /= batch_output_freq - training_loss_full.append(train_loss) - - test_loss = 0 - for local_batch, local_labels, _ in validation_generator: - local_batch, local_labels = local_batch.to(device), local_labels.to(device) - x_pred, latent, class_pred = net(local_batch) - loss = nnutils.my_mse(local_batch,x_pred) - test_loss += loss.item() * local_batch.shape[0] # mult for averaging across samples, as in train_loss - #print(" ", test_loss) - test_loss /= n_test # division averages across samples, as in train_loss - test_loss_full.append(test_loss) - print(" [%s %d, %5d/%d] train loss: %0.6f test loss: %0.6f" % (label_str, epoch, i, n_batch, train_loss, test_loss)) - running_loss = 0 - - if test_loss < best_loss: - best_loss = test_loss - best_nn = copy.deepcopy(net) - i += 1 - - if do_em and hasattr(nntype, "classify"): - print(" Doing EM") - targets = self.em_parallel(net, em_generator, train_inds, - em_batch_size, indicators, em_bounds, - em_n_cores, label_str, epoch) - training_generator, validation_generator, em_generator = \ - self.set_training_data(job, train_inds, test_inds, targets, data) - - if epoch % epoch_output_freq == 0: - print("my_l1", nnutils.my_l1(local_batch, x_pred)) - print("corr penalty",corr_penalty) - print("classify", 
bce(class_pred, local_labels).mul_(lam_cls)) - print("my_mse", nnutils.my_mse(local_batch, x_pred)) - epoch_test_loss.append(test_loss) - out_fn = os.path.join(outdir, "epoch_test_loss_%s.npy" % label_str) - np.save(out_fn, epoch_test_loss) - out_fn = os.path.join(outdir, "training_loss_%s.npy" % label_str) - np.save(out_fn, training_loss_full) - out_fn = os.path.join(outdir, "test_loss_%s.npy" % label_str) - np.save(out_fn, test_loss_full) - # nets need be on cpu to load multiple in parallel, e.g. with multiprocessing - net.cpu() - out_fn = os.path.join(outdir, "nn_%s_e%d.pkl" % (label_str, epoch)) - pickle.dump(net, open(out_fn, 'wb')) - if use_cuda: - net.cuda() - if hasattr(nntype, "classify"): - out_fn = os.path.join(outdir, "tmp_targets_%s_%s.npy" % (label_str,epoch)) - np.save(out_fn, targets) - - # save best net every epoch - best_nn.cpu() - out_fn = os.path.join(outdir, "nn_best_%s.pkl" % label_str) - pickle.dump(best_nn, open(out_fn, 'wb')) - if use_cuda: - best_nn.cuda() - return best_nn, targets - - def get_targets(self,act_map,indicators,label_spread=None): - """Convert variant indicators into classification labels. - - Parameters - ---------- - act_map : np.ndarray, shape=(n_variants,) - Initial classification labels to give each variant. - indicators : np.ndarray, shape=(len(data),) - Value to indicate which variant each data frame came from. - - Returns - ------- - targets : np.ndarry, shape=(len(data),) - Classification labels for training. - """ - targets = np.zeros((len(indicators), 1)) - print(targets.shape) - if label_spread == 'gaussian': - targets = np.array([np.random.normal(act_map[i],0.1) for i in indicators]) - zero_inds = np.where(targets < 0)[0] - targets[zero_inds] = 0 - one_inds = np.where(targets > 1)[0] - targets[one_inds] = 1 - elif label_spread == 'uniform': - targets = np.vstack([np.random.uniform() for i in targets]) - elif label_spread == 'bimodal': - targets = np.array([np.random.normal(0.8, 0.1) if np.random.uniform() < act_map[i] - else np.random.normal(0.2, 0.1) for i in indicators]) - zero_inds = np.where(targets < 0)[0] - targets[zero_inds] = 0 - one_inds = np.where(targets > 1)[0] - targets[one_inds] = 1 - else: - targets[:, 0] = act_map[indicators] - return targets - - def split_test_train(self,n,frac_test): - """Split data into training and validation sets. - - Parameters - ---------- - n : int - number of data points - frac_test : float between 0 and 1 - Fraction of dataset to reserve for validation set - - Returns - ------- - train_inds : np.ndarray - Indices in data that are to be trained on - test_inds : np.ndarray - Indices in data that are to be validated on - """ - n_test = int(n*frac_test) - - inds = np.arange(n) - np.random.shuffle(inds) - train_inds = inds[:-n_test] - test_inds = inds[-n_test:] - - return train_inds, test_inds - - def run(self, data_in_mem=False): - """Wrapper for running the training code - - Parameters - ---------- - data_in_mem: boolean - If true, load all training data into memory. Training faster this way. - - Returns - ------- - net : nnutils neural network object - Trained DiffNet - """ - job = self.job - data_dir = job['data_dir'] - outdir = job['outdir'] - n_latent = job['n_latent'] - layer_sizes = job['layer_sizes'] - nntype = job['nntype'] - frac_test = job['frac_test'] - act_map = job['act_map'] - - use_cuda = torch.cuda.is_available() - print("Using cuda? 
%s" % use_cuda) - - indicator_dir = os.path.join(data_dir, "indicators") - indicators = utils.load_npy_dir(indicator_dir, "*.npy") - indicators = np.array(indicators, dtype=int) - - if 'label_spreading' in job.keys(): - targets = self.get_targets(act_map,indicators, - label_spread=job['label_spreading']) - else: - targets = self.get_targets(act_map,indicators) - n_snapshots = len(indicators) - np.save(os.path.join(outdir, 'initial_targets.npy'), targets) - - train_inds, test_inds = self.split_test_train(n_snapshots,frac_test) - if data_in_mem: - xtc_dir = os.path.join(data_dir,"aligned_xtcs") - top_fn = os.path.join(data_dir, "master.pdb") - master = md.load(top_fn) - data = utils.load_traj_coords_dir(xtc_dir, "*.xtc", master.top) - else: - data = os.path.join(data_dir, "data") - - training_generator, validation_generator, em_generator = \ - self.set_training_data(job, train_inds, test_inds, targets, data) - - print(" data generators created") - - print("# of examples", targets.shape) - - wm_fn = os.path.join(data_dir, "wm.npy") - uwm_fn = os.path.join(data_dir, "uwm.npy") - cm_fn = os.path.join(data_dir, "cm.npy") - wm = np.load(wm_fn) - uwm = np.load(uwm_fn) - cm = np.load(cm_fn).flatten() - - n_train = train_inds.shape[0] - n_test = test_inds.shape[0] - out_fn = os.path.join(outdir, "train_inds.npy") - np.save(out_fn, train_inds) - out_fn = os.path.join(outdir, "test_inds.npy") - np.save(out_fn, test_inds) - print(" n train/test", n_train, n_test) - - if hasattr(nntype, 'split_inds'): - inds1 = job['inds1'] - inds2 = job['inds2'] - old_net = nntype(layer_sizes[0:2],inds1,inds2,wm,uwm) - else: - old_net = nntype(layer_sizes[0:2],wm,uwm) - old_net.freeze_weights() - - for cur_layer in range(2,len(layer_sizes)): - if hasattr(nntype, 'split_inds'): - net = nntype(layer_sizes[0:cur_layer+1],inds1,inds2,wm,uwm) - else: - net = nntype(layer_sizes[0:cur_layer+1],wm,uwm) - net.freeze_weights(old_net) - if use_cuda: - net.cuda() - net, targets = self.train(data, training_generator, - validation_generator, em_generator, - targets, indicators, train_inds, - test_inds, net, str(cur_layer), job) - #Might make sense to make this optional - training_generator, validation_generator, em_generator = \ - self.set_training_data(job, train_inds, test_inds, targets, data) - old_net = net - - #Polishing - net.unfreeze_weights() - if use_cuda: - net.cuda() - net, targets = self.train(data, training_generator, validation_generator, - em_generator, targets, indicators, train_inds, - test_inds, net, "polish", job, lr_fact=0.1) - return net diff --git a/pensa/diffnets/utils.py b/pensa/diffnets/utils.py deleted file mode 100644 index 26590b94..00000000 --- a/pensa/diffnets/utils.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -import numpy as np -import glob -import mdtraj as md - -def mkdir(dir_name): - if not os.path.exists(dir_name): - os.mkdir(dir_name) - -def get_fns(dir_name,pattern): - return np.sort(glob.glob(os.path.join(dir_name, pattern))) - -def load_traj_coords_dir(dir_name,pattern,top): - fns = get_fns(dir_name, pattern) - all_d = [] - for fn in fns: - t = md.load(fn, top=top) - d = t.xyz.reshape((len(t), 3*top.n_atoms)) - all_d.append(d) - all_d = np.vstack(all_d) - return all_d - -def load_npy_dir(dir_name,pattern): - fns = get_fns(dir_name, pattern) - all_d = [] - for fn in fns: - d = np.load(fn) - all_d.append(d) - if len(d.shape) == 1: - all_d = np.hstack(all_d) - else: - all_d = np.vstack(all_d) - return all_d