From 73c4702d34f73b7d683877e74a91fa26c726db75 Mon Sep 17 00:00:00 2001 From: martinvoegele Date: Wed, 4 Oct 2023 00:08:41 -0400 Subject: [PATCH] make imports explicit and improve style --- pensa/__init__.py | 7 -- pensa/clusters/__init__.py | 15 ++- pensa/comparison/__init__.py | 35 +++++-- pensa/comparison/metrics.py | 120 +++++++++++++++++------ pensa/comparison/relative_entropy.py | 3 +- pensa/comparison/statespecific.py | 79 +++++++++++---- pensa/comparison/uncertainty_analysis.py | 11 ++- pensa/dimensionality/__init__.py | 29 +++++- pensa/dimensionality/pca.py | 2 +- pensa/features/__init__.py | 61 ++++++++++-- pensa/features/atom_features.py | 9 +- pensa/features/hbond_features.py | 83 +++++++++++----- pensa/features/mda_combined.py | 4 +- pensa/features/mda_torsions.py | 3 +- pensa/features/pyemma_features.py | 3 +- pensa/features/water_features.py | 6 +- pensa/preprocessing/__init__.py | 26 ++++- pensa/statesinfo/__init__.py | 9 +- tests/test_workflow.py | 8 +- 19 files changed, 391 insertions(+), 122 deletions(-) diff --git a/pensa/__init__.py b/pensa/__init__.py index db8aa5b7..e69de29b 100644 --- a/pensa/__init__.py +++ b/pensa/__init__.py @@ -1,7 +0,0 @@ -from .preprocessing import * -from .features import * -from .statesinfo import * -from .clusters import * -from .comparison import * -from .dimensionality import * - diff --git a/pensa/clusters/__init__.py b/pensa/clusters/__init__.py index 4a4f9f30..25fcc3a2 100644 --- a/pensa/clusters/__init__.py +++ b/pensa/clusters/__init__.py @@ -1,3 +1,12 @@ -from .clustering import * -from .trajectory import * -from .wss import * +from .clustering import \ + obtain_clusters, \ + obtain_combined_clusters, \ + obtain_mult_combined_clusters, \ + find_closest_frames + +from .trajectory import \ + write_cluster_traj + +from .wss import \ + wss_over_number_of_clusters, \ + wss_over_number_of_combined_clusters diff --git a/pensa/comparison/__init__.py b/pensa/comparison/__init__.py index 9e34078a..694fb884 100644 --- a/pensa/comparison/__init__.py +++ b/pensa/comparison/__init__.py @@ -1,6 +1,29 @@ -from .statistics import * -from .relative_entropy import * -from .statespecific import * -from .visualization import * -from .metrics import * -from .uncertainty_analysis import * +from .statistics import \ + kolmogorov_smirnov_analysis, \ + mean_difference_analysis, \ + feature_correlation + +from .relative_entropy import \ + relative_entropy_analysis + +from .statespecific import \ + ssi_feature_analysis, \ + ssi_ensemble_analysis, \ + cossi_featens_analysis + +from .visualization import \ + residue_visualization, \ + distances_visualization, \ + pair_features_heatmap, \ + resnum_heatmap + +from .metrics import \ + pca_sampling_efficiency, \ + average_jsd, average_kld, average_ksp, average_kss, average_ssi, \ + max_jsd, max_kld, max_ksp, max_kss, max_ssi, min_ksp + +from .uncertainty_analysis import \ + relen_block_analysis, \ + relen_sem_analysis, \ + ssi_block_analysis, \ + ssi_sem_analysis diff --git a/pensa/comparison/metrics.py b/pensa/comparison/metrics.py index 3a41efbf..f1463933 100644 --- a/pensa/comparison/metrics.py +++ b/pensa/comparison/metrics.py @@ -8,7 +8,9 @@ """ - Calculates the average and maximum Jensen-Shannon distance and the Kullback-Leibler divergences for each feature from two ensembles. Each of four functions uses the relative_entropy_analysis function with the same parameters. + Calculates the average and maximum Jensen-Shannon distance and the Kullback-Leibler divergences + for each feature from two ensembles. + Each of the four functions uses the relative_entropy_analysis function with the same parameters. Parameters ---------- @@ -20,16 +22,19 @@ Can be obtained from features object via .describe(). Must be the same as features_a. Provided as a sanity check. all_data_a : float array - Trajectory data from the first ensemble. Format: [frames, frame_data]. + Trajectory data from the first ensemble. + Format: [frames, frame_data]. all_data_b : float array Trajectory data from the second ensemble. For kld functions, the second ensemble should be the reference ensemble. Format: [frames, frame_data]. bin_width : float, default=None Bin width for the axis to compare the distributions on. - If bin_width is None, bin_num (see below) bins are used and the width is determined from the common histogram. + If bin_width is None, bin_num (see below) bins are used + and the width is determined from the common histogram. bin_num : int, default=10 - Number of bins for the axis to compare the distributions on (only if bin_width=None). + Number of bins for the axis to compare the distributions on + (only if bin_width=None). verbose : bool, default=True Print intermediate results. override_name_check : bool, default=False @@ -50,28 +55,52 @@ """ -def average_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): - _, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) +def average_jsd(features_a, features_b, all_data_a, all_data_b, + bin_width=None, bin_num=10, verbose=True, + override_name_check=False): + _, data_jsdist, _, _ = relative_entropy_analysis( + features_a, features_b, all_data_a, all_data_b, + bin_width=bin_width, bin_num=bin_num, verbose=verbose, + override_name_check=override_name_check + ) return np.mean(data_jsdist) -def max_jsd(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): - _, data_jsdist, _, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) +def max_jsd(features_a, features_b, all_data_a, all_data_b, + bin_width=None, bin_num=10, verbose=True, + override_name_check=False): + _, data_jsdist, _, _ = relative_entropy_analysis( + features_a, features_b, all_data_a, all_data_b, + bin_width=bin_width, bin_num=bin_num, verbose=verbose, + override_name_check=override_name_check + ) return np.max(data_jsdist) -def average_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): - _, _, data_kld_ab, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) +def average_kld(features_a, features_b, all_data_a, all_data_b, + bin_width=None, bin_num=10, verbose=True, + override_name_check=False): + _, _, data_kld_ab, _ = relative_entropy_analysis( + features_a, features_b, all_data_a, all_data_b, + bin_width=bin_width, bin_num=bin_num, verbose=verbose, + override_name_check=override_name_check + ) return np.mean(data_kld_ab) -def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): - _, _, data_kld_ab, _ = relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=bin_width, bin_num=bin_num, verbose=verbose, override_name_check=override_name_check) +def max_kld(features_a, features_b, all_data_a, all_data_b, + bin_width=None, bin_num=10, verbose=True, + override_name_check=False): + _, _, data_kld_ab, _ = relative_entropy_analysis( + features_a, features_b, all_data_a, all_data_b, + bin_width=bin_width, bin_num=bin_num, verbose=verbose, + override_name_check=override_name_check) return np.max(data_kld_ab) """ - Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. Each of five functions uses the kolmogorov_smirnov_analysis function with the same parameters. + Calculates the average and maximum Kolmogorov-Smirnov statistic for two distributions. + Each of the five functions uses the kolmogorov_smirnov_analysis function with the same parameters. Parameters ---------- @@ -108,33 +137,54 @@ def max_kld(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_ """ -def average_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): - _, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) +def average_kss(features_a, features_b, all_data_a, all_data_b, + verbose=True, override_name_check=False): + _, data_kss, _ = kolmogorov_smirnov_analysis( + features_a, features_b, all_data_a, all_data_b, + verbose=verbose, override_name_check=override_name_check + ) return np.mean(data_kss) -def max_kss(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): - _, data_kss, _ = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) +def max_kss(features_a, features_b, all_data_a, all_data_b, + verbose=True, override_name_check=False): + _, data_kss, _ = kolmogorov_smirnov_analysis( + features_a, features_b, all_data_a, all_data_b, + verbose=verbose, override_name_check=override_name_check + ) return np.max(data_kss) -def average_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): - _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) +def average_ksp(features_a, features_b, all_data_a, all_data_b, + verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis( + features_a, features_b, all_data_a, all_data_b, + verbose=verbose, override_name_check=override_name_check + ) return np.mean(data_ksp) -def max_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): - _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) +def max_ksp(features_a, features_b, all_data_a, all_data_b, + verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis( + features_a, features_b, all_data_a, all_data_b, + verbose=verbose, override_name_check=override_name_check + ) return np.max(data_ksp) -def min_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, override_name_check=False): - _, _, data_ksp = kolmogorov_smirnov_analysis(features_a, features_b, all_data_a, all_data_b, verbose=verbose, override_name_check=override_name_check) +def min_ksp(features_a, features_b, all_data_a, all_data_b, + verbose=True, override_name_check=False): + _, _, data_ksp = kolmogorov_smirnov_analysis( + features_a, features_b, all_data_a, all_data_b, + verbose=verbose, override_name_check=override_name_check + ) return np.min(data_ksp) """ - Calculates average and maximum State Specific Information statistic for a feature across two ensembles. Each of two functions uses the ssi_ensemble_analysis function with the same parameters. + Calculates average and maximum State Specific Information statistic for a feature across two ensembles. + Each of two functions uses the ssi_ensemble_analysis function with the same parameters. Parameters ---------- @@ -174,13 +224,27 @@ def min_ksp(features_a, features_b, all_data_a, all_data_b, verbose=True, overri """ -def average_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbc=True, verbose=True, write_plots=None, override_name_check=False): - _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check) +def average_ssi(features_a, features_b, all_data_a, all_data_b, + torsions=None, pocket_occupancy=None, pbc=True, + verbose=True, write_plots=None, override_name_check=False): + _, data_ssi = ssi_ensemble_analysis( + features_a, features_b, all_data_a, all_data_b, + torsions=torsions, pocket_occupancy=pocket_occupancy, + pbc=pbc, verbose=verbose, write_plots=write_plots, + override_name_check=override_name_check + ) return np.mean(data_ssi) -def max_ssi(features_a, features_b, all_data_a, all_data_b, torsions=None, pocket_occupancy=None, pbc=True, verbose=True, write_plots=None, override_name_check=False): - _, data_ssi = ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, torsions=torsions, pocket_occupancy=pocket_occupancy, pbc=pbc, verbose=verbose, write_plots=write_plots, override_name_check=override_name_check) +def max_ssi(features_a, features_b, all_data_a, all_data_b, + torsions=None, pocket_occupancy=None, pbc=True, + verbose=True, write_plots=None, override_name_check=False): + _, data_ssi = ssi_ensemble_analysis( + features_a, features_b, all_data_a, all_data_b, + torsions=torsions, pocket_occupancy=pocket_occupancy, + pbc=pbc, verbose=verbose, write_plots=write_plots, + override_name_check=override_name_check + ) return np.max(data_ssi) diff --git a/pensa/comparison/relative_entropy.py b/pensa/comparison/relative_entropy.py index 213d62f2..f84eeb98 100644 --- a/pensa/comparison/relative_entropy.py +++ b/pensa/comparison/relative_entropy.py @@ -5,7 +5,8 @@ import scipy.spatial.distance -def relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, bin_width=None, bin_num=10, verbose=True, override_name_check=False): +def relative_entropy_analysis(features_a, features_b, all_data_a, all_data_b, + bin_width=None, bin_num=10, verbose=True, override_name_check=False): """ Calculates the Jensen-Shannon distance and the Kullback-Leibler divergences for each feature from two ensembles. diff --git a/pensa/comparison/statespecific.py b/pensa/comparison/statespecific.py index b27166bd..7166b788 100644 --- a/pensa/comparison/statespecific.py +++ b/pensa/comparison/statespecific.py @@ -14,7 +14,8 @@ # -- Functions to calculate SSI statistics across paired ensembles -- -def ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, discrete_states_ab, max_thread_no=1, pbc=True, h2o=False, +def ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, discrete_states_ab, + max_thread_no=1, pbc=True, h2o=False, verbose=True, write_plots=False, override_name_check=False): """ Calculates State Specific Information statistic for a feature across two ensembles. @@ -101,7 +102,8 @@ def ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, discre traj_1_fraction = traj1_len / (traj1_len + traj2_len) traj_2_fraction = 1 - traj_1_fraction - norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) - traj_2_fraction * math.log(traj_2_fraction, 2) + norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) + norm_factor -= traj_2_fraction * math.log(traj_2_fraction, 2) H_ens = norm_factor featens_joint_states = res_states + ens_states @@ -121,7 +123,9 @@ def ssi_ensemble_analysis(features_a, features_b, all_data_a, all_data_b, discre return data_names, data_ssi -def ssi_feature_analysis(features_a, features_b, all_data_a, all_data_b, discrete_states_ab, max_thread_no=1, pbc=True, h2o=False, verbose=True, override_name_check=False): +def ssi_feature_analysis(features_a, features_b, all_data_a, all_data_b, discrete_states_ab, + max_thread_no=1, pbc=True, h2o=False, + verbose=True, override_name_check=False): """ Calculates State Specific Information statistic between two features across two ensembles. @@ -241,7 +245,8 @@ def ssi_feature_analysis(features_a, features_b, all_data_a, all_data_b, discret traj_1_fraction = traj1_len / (traj1_len + traj2_len) traj_2_fraction = 1 - traj_1_fraction - norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) - traj_2_fraction * math.log(traj_2_fraction, 2) + norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) + norm_factor -= traj_2_fraction * math.log(traj_2_fraction, 2) SSI = ((H_a + H_b) - H_ab) / norm_factor @@ -267,7 +272,8 @@ def ssi_feature_analysis(features_a, features_b, all_data_a, all_data_b, discret def cossi_featens_analysis(features_a, features_b, features_c, features_d, all_data_a, all_data_b, all_data_c, all_data_d, discrete_states_ab, discrete_states_cd, - max_thread_no=1, pbca=True, pbcb=True, h2oa=False, h2ob=False, verbose=True, override_name_check=False): + max_thread_no=1, pbca=True, pbcb=True, h2oa=False, h2ob=False, + verbose=True, override_name_check=False): """ Calculates State Specific Information Co-SSI statistic between two features and the ensembles condition. @@ -390,7 +396,8 @@ def cossi_featens_analysis(features_a, features_b, features_c, features_d, if H_b != 0: traj_1_fraction = traj1_len / (traj1_len + traj2_len) traj_2_fraction = 1 - traj_1_fraction - norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) - traj_2_fraction * math.log(traj_2_fraction, 2) + norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) + norm_factor -= traj_2_fraction * math.log(traj_2_fraction, 2) set_distr_c = [[0.5] * traj1_len + [1.5] * traj2_len] set_c_states = [[0, 1, 2]] @@ -576,7 +583,8 @@ def _ssi_feat_feat_analysis(features_a, features_b, features_c, features_d, if H_b != 0: traj_1_fraction = traj1_len / (traj1_len + traj2_len) traj_2_fraction = 1 - traj_1_fraction - norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) - traj_2_fraction * math.log(traj_2_fraction, 2) + norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) + norm_factor -= traj_2_fraction * math.log(traj_2_fraction, 2) # ---------------- ab_joint_states = set_a_states + set_b_states @@ -669,7 +677,13 @@ def _calculate_ssi(distr_a_input, traj1_len, distr_b_input=None, else: plot_name = None try: - set_a_states.append(determine_state_limits(set_distr_a[dim_num], traj1_len, gauss_bins, gauss_smooth, write_plots, plot_name)) + set_a_states.append( + determine_state_limits( + set_distr_a[dim_num], traj1_len, + gauss_bins, gauss_smooth, + write_plots, plot_name + ) + ) except Exception: warnings.warn('Distribution A not clustering properly.\nTry altering Gaussian parameters or input custom states.') else: @@ -700,7 +714,13 @@ def _calculate_ssi(distr_a_input, traj1_len, distr_b_input=None, else: plot_name = None try: - set_b_states.append(determine_state_limits(set_distr_b[dim_num], traj1_len, gauss_bins, gauss_smooth, write_plots, plot_name)) + set_b_states.append( + determine_state_limits( + set_distr_b[dim_num], traj1_len, + gauss_bins, gauss_smooth, + write_plots, plot_name + ) + ) except Exception: warnings.warn('Distribution B not clustering properly.\nTry altering Gaussian parameters or input custom states.') @@ -898,7 +918,8 @@ def _calculate_cossi(distr_a_input, traj1_len, distr_b_input, distr_c_input=None traj_1_fraction = traj1_len / len(set_distr_a[0]) traj_2_fraction = 1 - traj_1_fraction - norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) - traj_2_fraction * math.log(traj_2_fraction, 2) + norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) + norm_factor -= traj_2_fraction * math.log(traj_2_fraction, 2) SSI = ((H_a + H_b) - H_ab) / norm_factor coSSI = ((H_a + H_b + H_c) - (H_ab + H_ac + H_bc) + H_abc) / norm_factor @@ -918,7 +939,8 @@ def _calculate_cossi(distr_a_input, traj1_len, distr_b_input, distr_c_input=None return round(SSI, 4), round(coSSI, 4) -def _cossi_featens_analysis(features_a, features_b, all_data_a, all_data_b, max_thread_no=1, torsions=None, verbose=True, override_name_check=False): +def _cossi_featens_analysis(features_a, features_b, all_data_a, all_data_b, + max_thread_no=1, torsions=None, verbose=True, override_name_check=False): """ Calculates State Specific Information Co-SSI statistic between two features and the ensembles condition. @@ -961,10 +983,16 @@ def _cossi_featens_analysis(features_a, features_b, all_data_a, all_data_b, max_ mv_res_feat_a, mv_res_data_a = features_a, all_data_a mv_res_feat_b, mv_res_data_b = features_b, all_data_b else: - mv_res_feat_a, mv_res_data_a = get_multivar_res_timeseries(features_a, all_data_a, torsions + '-torsions', write=False, out_name='') - mv_res_feat_b, mv_res_data_b = get_multivar_res_timeseries(features_b, all_data_b, torsions + '-torsions', write=False, out_name='') - mv_res_feat_a, mv_res_data_a = mv_res_feat_a[torsions + '-torsions'], mv_res_data_a[torsions + '-torsions'] - mv_res_feat_b, mv_res_data_b = mv_res_feat_b[torsions + '-torsions'], mv_res_data_b[torsions + '-torsions'] + mv_res_feat_a, mv_res_data_a = get_multivar_res_timeseries( + features_a, all_data_a, torsions + '-torsions', write=False, out_name='' + ) + mv_res_feat_b, mv_res_data_b = get_multivar_res_timeseries( + features_b, all_data_b, torsions + '-torsions', write=False, out_name='' + ) + mv_res_feat_a = mv_res_feat_a[torsions + '-torsions'] + mv_res_data_a = mv_res_data_a[torsions + '-torsions'] + mv_res_feat_b = mv_res_feat_b[torsions + '-torsions'] + mv_res_data_b = mv_res_data_b[torsions + '-torsions'] # Assert that the features are the same and data sets have same number of features if override_name_check: @@ -989,7 +1017,7 @@ def _cossi_featens_analysis(features_a, features_b, all_data_a, all_data_b, max_ res1_data_ens2 = mv_res_data_b[res1] res1_combined_dist = [] for dist_no_a in range(len(res1_data_ens1)): - # # # combine the ensembles into one distribution (condition_a + condition_b) + # Combine the ensembles into one distribution (condition_a + condition_b) res1_data_both = list(res1_data_ens1[dist_no_a]) + list(res1_data_ens2[dist_no_a]) res1_combined_dist.append(res1_data_both) @@ -1055,7 +1083,8 @@ def _cossi_featens_analysis(features_a, features_b, all_data_a, all_data_b, max_ traj_1_fraction = traj1_len / len(set_distr_a[0]) traj_2_fraction = 1 - traj_1_fraction - norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) - traj_2_fraction * math.log(traj_2_fraction, 2) + norm_factor = -traj_1_fraction * math.log(traj_1_fraction, 2) + norm_factor -= traj_2_fraction * math.log(traj_2_fraction, 2) set_distr_c = [[0.5] * traj1_len + [1.5] * int(len(set_distr_a[0]) - traj1_len)] set_c_states = [[0, 1, 2]] @@ -1065,22 +1094,30 @@ def _cossi_featens_analysis(features_a, features_b, all_data_a, all_data_b, max_ ab_joint_states = set_a_states + set_b_states ab_joint_distributions = set_distr_a + set_distr_b - H_ab = calculate_entropy_multthread(ab_joint_states, ab_joint_distributions, max_thread_no) + H_ab = calculate_entropy_multthread( + ab_joint_states, ab_joint_distributions, max_thread_no + ) # ---------------- ac_joint_states = set_a_states + set_c_states ac_joint_distributions = set_distr_a + set_distr_c - H_ac = calculate_entropy_multthread(ac_joint_states, ac_joint_distributions, max_thread_no) + H_ac = calculate_entropy_multthread( + ac_joint_states, ac_joint_distributions, max_thread_no + ) # ---------------- bc_joint_states = set_b_states + set_c_states bc_joint_distributions = set_distr_b + set_distr_c - H_bc = calculate_entropy_multthread(bc_joint_states, bc_joint_distributions, max_thread_no) + H_bc = calculate_entropy_multthread( + bc_joint_states, bc_joint_distributions, max_thread_no + ) # ---------------- abc_joint_states = set_a_states + set_b_states + set_c_states abc_joint_distributions = set_distr_a + set_distr_b + set_distr_c - H_abc = calculate_entropy_multthread(abc_joint_states, abc_joint_distributions, max_thread_no) + H_abc = calculate_entropy_multthread( + abc_joint_states, abc_joint_distributions, max_thread_no + ) SSI = ((H_a + H_b) - H_ab) / norm_factor coSSI = ((H_a + H_b + H_c) - (H_ab + H_ac + H_bc) + H_abc) / norm_factor diff --git a/pensa/comparison/uncertainty_analysis.py b/pensa/comparison/uncertainty_analysis.py index 997d45d8..adff9c03 100644 --- a/pensa/comparison/uncertainty_analysis.py +++ b/pensa/comparison/uncertainty_analysis.py @@ -75,11 +75,16 @@ def ssi_block_analysis(features_a, features_b, all_data_a, all_data_b, features_a, block_data_a = get_multivar_res(features_a, block_data_a) features_b, block_data_b = get_multivar_res(features_b, block_data_b) - discrete_states = get_discrete_states(block_data_a, block_data_b, - discretize=discretize, pbc=pbc) + discrete_states = get_discrete_states( + block_data_a, block_data_b, + discretize=discretize, pbc=pbc + ) print('block length = ', bl) - ssi_names, data_ssi = ssi_ensemble_analysis(features_a, features_b, block_data_a, block_data_b, discrete_states, verbose=verbose) + ssi_names, data_ssi = ssi_ensemble_analysis( + features_a, features_b, block_data_a, block_data_b, + discrete_states, verbose=verbose + ) ssi_blocks.append(data_ssi) ssi_names, ssi_blocks = np.transpose(ssi_names), np.transpose(ssi_blocks) diff --git a/pensa/dimensionality/__init__.py b/pensa/dimensionality/__init__.py index 5ed7bbe5..36c768ec 100644 --- a/pensa/dimensionality/__init__.py +++ b/pensa/dimensionality/__init__.py @@ -1,3 +1,26 @@ -from .visualization import * -from .pca import * -from .tica import * +from .pca import \ + calculate_pca, \ + pca_eigenvalues_plot, \ + pca_features, \ + project_on_pc, \ + get_components_pca, \ + sort_traj_along_pc, \ + sort_trajs_along_common_pc, \ + sort_mult_trajs_along_common_pc + +from .tica import \ + calculate_tica, \ + tica_eigenvalues_plot, \ + tica_features, \ + project_on_tic, \ + get_components_tica, \ + sort_traj_along_tic, \ + sort_trajs_along_common_tic, \ + sort_mult_trajs_along_common_tic + +from .visualization import \ + project_on_eigenvector_pca, \ + project_on_eigenvector_tica, \ + compare_mult_projections, \ + compare_projections, \ + sort_traj_along_projection diff --git a/pensa/dimensionality/pca.py b/pensa/dimensionality/pca.py index 8b61f33a..2e1126dd 100644 --- a/pensa/dimensionality/pca.py +++ b/pensa/dimensionality/pca.py @@ -86,7 +86,7 @@ def pca_features(pca, features, data, num, threshold, plot_file=None, add_labels """ - warnings.warn("The function pca_features in versions > 0.2.8 needs the data for the features, not only their names!") + warnings.warn("pca_features() in versions > 0.2.8 needs the data for the features, not only their names!") # Project the trajectory data on the principal components projection = get_components_pca(data, num, pca)[1] # Plot the highest PC correlations and print relevant features diff --git a/pensa/features/__init__.py b/pensa/features/__init__.py index b6f4f27c..1d27c520 100644 --- a/pensa/features/__init__.py +++ b/pensa/features/__init__.py @@ -4,13 +4,54 @@ """ -from .mda_distances import * -from .mda_torsions import * -from .mda_combined import * -from .processing import * -from .atom_features import * -from .water_features import * -from .txt_features import * -from .csv_features import * -from .hbond_features import * -# from .pyemma_features import * +from .mda_distances import \ + get_atom_group_distances, \ + get_atom_self_distances, \ + get_calpha_distances, \ + get_gpcr_calpha_distances, \ + select_gpcr_residues + +from .mda_torsions import \ + get_torsions, \ + get_protein_backbone_torsions, \ + get_protein_sidechain_torsions, \ + get_nucleicacid_backbone_torsions, \ + get_nucleicacid_pseudotorsions + +from .mda_combined import \ + get_structure_features, \ + sort_traj_along_combined_feature + +from .processing import \ + get_feature_subset, \ + get_feature_data, \ + get_common_features_data, \ + get_feature_timeseries, \ + get_multivar_res, \ + get_multivar_res_timeseries, \ + correct_angle_periodicity, \ + correct_spher_angle_periodicity, \ + select_common_features, match_sim_lengths, \ + sort_features, \ + sort_features_alphabetically, \ + sort_distances_by_resnum, \ + sort_sincos_torsions_by_resnum, \ + sort_torsions_by_resnum, \ + sort_traj_along_feature + +from .atom_features import \ + get_atom_features + +from .water_features import \ + get_water_features + +from .txt_features import \ + get_txt_features_ala2 + +from .csv_features import \ + get_drormd_features, \ + read_csv_features + +from .hbond_features import \ + get_h_bonds, \ + get_cavity_bonds diff --git a/pensa/features/atom_features.py b/pensa/features/atom_features.py index 2b64e4f9..da7b5e9e 100644 --- a/pensa/features/atom_features.py +++ b/pensa/features/atom_features.py @@ -167,16 +167,19 @@ def get_atom_features(structure_input, xtc_input, atomgroup, element, top_atoms= features_data[element + 'Pocket_Idx'] = np.array(atom_dists, dtype=object) # Add atom pocket frequencies + pocket_occup = [atinfo[2] for atinfo in atom_information] feature_names[element + 'Pocket_Occup'] = [atinfo[0] for atinfo in atom_information] - features_data[element + 'Pocket_Occup'] = np.array([atinfo[2] for atinfo in atom_information], dtype=object) + features_data[element + 'Pocket_Occup'] = np.array(pocket_occup, dtype=object) # Add atom pocket occupancy timeseries + pocket_occup_distr = [convert_to_occ(distr, -1, water=False) for distr in atom_dists] feature_names[element + 'Pocket_OccupDistr'] = [atinfo[0] for atinfo in atom_information] - features_data[element + 'Pocket_OccupDistr'] = np.array([convert_to_occ(distr, -1, water=False) for distr in atom_dists], dtype=object) + features_data[element + 'Pocket_OccupDistr'] = np.array(pocket_occup_distr, dtype=object) # Add atom pocket locations + pocket_xyz = [atinfo[1] for atinfo in atom_information] feature_names[element + 'Pocket_xyz'] = [atinfo[0] for atinfo in atom_information] - features_data[element + 'Pocket_xyz'] = np.array([atinfo[1] for atinfo in atom_information], dtype=object) + features_data[element + 'Pocket_xyz'] = np.array(pocket_xyz, dtype=object) # Return the dictionaries. return feature_names, features_data diff --git a/pensa/features/hbond_features.py b/pensa/features/hbond_features.py index 3650052d..086b76c0 100644 --- a/pensa/features/hbond_features.py +++ b/pensa/features/hbond_features.py @@ -121,7 +121,8 @@ def get_cavity_bonds(structure_input, xtc_input, atomgroups, site_IDs, for frame_no in tqdm(range(len(u.trajectory))): u.trajectory[frame_no] radius = ' 3.5' - atomgroup_IDS = list(u.select_atoms('byres (name ' + atomgroups[0] + ' and point ' + point_str + radius + ')').indices)[::3] + at_sel = 'byres (name ' + atomgroups[0] + ' and point ' + point_str + radius + ')' + atomgroup_IDS = list(u.select_atoms(at_sel).indices)[::3] counting.append(list(set(atomgroup_IDS))) # Water atom indices that appear in the water site @@ -136,9 +137,10 @@ def get_cavity_bonds(structure_input, xtc_input, atomgroups, site_IDs, if len(site_resid) == 1: # (x, y, z) positions for the water oxygen at trajectory frame_no - proxprotatg = 'protein and around 3.5 byres index ' + str(site_resid[0]) - O_site = 'index ' + str(site_resid[0]) - H_site = '((byres index ' + str(site_resid[0]) + ') and (name ' + atomgroups[1] + ' or name ' + atomgroups[2] + '))' + resid = str(site_resid[0]) + proxprotatg = 'protein and around 3.5 byres index ' + resid + O_site = 'index ' + resid + H_site = '((byres index ' + resid + ') and (name ' + atomgroups[1] + ' or name ' + atomgroups[2] + '))' hbond = HBA(universe=u) protein_hydrogens_sel = hbond.guess_hydrogens(proxprotatg) @@ -166,9 +168,10 @@ def get_cavity_bonds(structure_input, xtc_input, atomgroups, site_IDs, freq_count.append([flat_list.count(ID), ID]) freq_count.sort(key=lambda x: x[0]) - proxprotatg = 'protein and around 3.5 byres index ' + str(freq_count[-1][1]) - O_site = 'index ' + str(freq_count[-1][1]) - H_site = '((byres index ' + str(freq_count[-1][1]) + ') and (name ' + atomgroups[1] + ' or name ' + atomgroups[2] + '))' + fcstr = str(freq_count[-1][1]) + proxprotatg = 'protein and around 3.5 byres index ' + fcstr + O_site = 'index ' + fcstr + H_site = '((byres index ' + fcstr + ') and (name ' + atomgroups[1] + ' or name ' + atomgroups[2] + '))' hbond = HBA(universe=u) protein_hydrogens_sel = hbond.guess_hydrogens(proxprotatg) @@ -224,17 +227,35 @@ def get_cavity_bonds(structure_input, xtc_input, atomgroups, site_IDs, # Write data out and visualize water sites in pdb # "FIX OUTPUT UNIFORMITY, SINGLE BONDS NOT OUTPUT WITH ANY ARRAY DIMENSION" if write is True: - np.savetxt('h2o_hbonds/' + out_name + O_site_pdb_id + '_names.txt', np.array(bondouts[0][0], dtype=object), fmt='%s') - np.savetxt('h2o_hbonds/' + out_name + O_site_pdb_id + '_data.txt', np.array(bondouts[0][1], dtype=object), fmt='%s') - np.savetxt('h2o_hbonds/' + out_name + H_site_pdb_id + '_names.txt', np.array(bondouts[1][0], dtype=object), fmt='%s') - np.savetxt('h2o_hbonds/' + out_name + H_site_pdb_id + '_data.txt', np.array(bondouts[1][1], dtype=object), fmt='%s') - - feature_names['W' + str(site_no)]['acceptor_names'] = np.array(bondouts[0][0], dtype=object) - feature_names['W' + str(site_no)]['donor_names'] = np.array(bondouts[1][0], dtype=object) - features_data['W' + str(site_no)]['acceptor_timeseries'] = np.array(bondouts[0][1], dtype=object) - features_data['W' + str(site_no)]['donor_timeseries'] = np.array(bondouts[1][1], dtype=object) - features_data['W' + str(site_no)]['acceptor_frequencies'] = np.sum(np.array(bondouts[0][1], dtype=object), axis=1) / len(u.trajectory) - features_data['W' + str(site_no)]['donor_frequencies'] = np.sum(np.array(bondouts[1][1], dtype=object), axis=1) / len(u.trajectory) + np.savetxt( + 'h2o_hbonds/' + out_name + O_site_pdb_id + '_names.txt', + np.array(bondouts[0][0], dtype=object), fmt='%s' + ) + np.savetxt( + 'h2o_hbonds/' + out_name + O_site_pdb_id + '_data.txt', + np.array(bondouts[0][1], dtype=object), fmt='%s' + ) + np.savetxt( + 'h2o_hbonds/' + out_name + H_site_pdb_id + '_names.txt', + np.array(bondouts[1][0], dtype=object), fmt='%s' + ) + np.savetxt( + 'h2o_hbonds/' + out_name + H_site_pdb_id + '_data.txt', + np.array(bondouts[1][1], dtype=object), fmt='%s' + ) + + feature_names['W' + str(site_no)]['acceptor_names'] = \ + np.array(bondouts[0][0], dtype=object) + feature_names['W' + str(site_no)]['donor_names'] = \ + np.array(bondouts[1][0], dtype=object) + features_data['W' + str(site_no)]['acceptor_timeseries'] = \ + np.array(bondouts[0][1], dtype=object) + features_data['W' + str(site_no)]['donor_timeseries'] = \ + np.array(bondouts[1][1], dtype=object) + features_data['W' + str(site_no)]['acceptor_frequencies'] = \ + np.sum(np.array(bondouts[0][1], dtype=object), axis=1) / len(u.trajectory) + features_data['W' + str(site_no)]['donor_frequencies'] = \ + np.sum(np.array(bondouts[1][1], dtype=object), axis=1) / len(u.trajectory) return feature_names, features_data @@ -345,8 +366,10 @@ def get_h_bonds(structure_input, xtc_input, fixed_group, dyn_group, write=None, all_donor_pairs = Unique_bonding_pairs([y for subl in [x for sub in all_bonds[0] for x in sub] for y in subl]) all_acceptor_pairs = Unique_bonding_pairs([y for subl in [x for sub in all_bonds[1] for x in sub] for y in subl]) - all_donor_pair_names = [[atg_to_names(u.select_atoms('index ' + str(i[0])))[0], atg_to_names(u.select_atoms('index ' + str(i[1])))[0]] for i in all_donor_pairs] - all_acceptor_pair_names = [[atg_to_names(u.select_atoms('index ' + str(i[0])))[0], atg_to_names(u.select_atoms('index ' + str(i[1])))[0]] for i in all_acceptor_pairs] + all_donor_pair_names = [[atg_to_names(u.select_atoms('index ' + str(i[0])))[0], + atg_to_names(u.select_atoms('index ' + str(i[1])))[0]] for i in all_donor_pairs] + all_acceptor_pair_names = [[atg_to_names(u.select_atoms('index ' + str(i[0])))[0], + atg_to_names(u.select_atoms('index ' + str(i[1])))[0]] for i in all_acceptor_pairs] donor_dist = np.zeros((len(all_donor_pairs), len(u.trajectory))) acceptor_dist = np.zeros((len(all_acceptor_pairs), len(u.trajectory))) @@ -361,10 +384,22 @@ def get_h_bonds(structure_input, xtc_input, fixed_group, dyn_group, write=None, # Write data out and visualize water sites in pdb if write is True: - np.savetxt('lig_hbonds/' + out_name + 'all_donor_pair_names.txt', np.array(all_donor_pair_names, dtype=object), fmt='%s') - np.savetxt('lig_hbonds/' + out_name + 'all_acceptor_pair_names.txt', np.array(all_acceptor_pair_names, dtype=object), fmt='%s') - np.savetxt('lig_hbonds/' + out_name + 'all_donor_pair_data.txt', np.array(donor_dist, dtype=object), fmt='%s') - np.savetxt('lig_hbonds/' + out_name + 'all_acceptor_pair_data.txt', np.array(acceptor_dist, dtype=object), fmt='%s') + np.savetxt( + 'lig_hbonds/' + out_name + 'all_donor_pair_names.txt', + np.array(all_donor_pair_names, dtype=object), fmt='%s' + ) + np.savetxt( + 'lig_hbonds/' + out_name + 'all_acceptor_pair_names.txt', + np.array(all_acceptor_pair_names, dtype=object), fmt='%s' + ) + np.savetxt( + 'lig_hbonds/' + out_name + 'all_donor_pair_data.txt', + np.array(donor_dist, dtype=object), fmt='%s' + ) + np.savetxt( + 'lig_hbonds/' + out_name + 'all_acceptor_pair_data.txt', + np.array(acceptor_dist, dtype=object), fmt='%s' + ) feature_names['donor_names'] = np.array(all_donor_pair_names) feature_names['acceptor_names'] = np.array(all_acceptor_pair_names) diff --git a/pensa/features/mda_combined.py b/pensa/features/mda_combined.py index 1dfb2554..9a8b07bc 100644 --- a/pensa/features/mda_combined.py +++ b/pensa/features/mda_combined.py @@ -125,7 +125,9 @@ def _remove_resnum_offset(features, offset): return new_features -def sort_traj_along_combined_feature(feat, data, feature_name, feature_type, ref_name, trj_name, out_name, start_frame=0): +def sort_traj_along_combined_feature(feat, data, feature_name, feature_type, + ref_name, trj_name, out_name, + start_frame=0): """ Sort a trajectory along one feature in a combined set. diff --git a/pensa/features/mda_torsions.py b/pensa/features/mda_torsions.py index 84d7c3f2..5dac4d6d 100644 --- a/pensa/features/mda_torsions.py +++ b/pensa/features/mda_torsions.py @@ -278,7 +278,8 @@ def get_nucleicacid_backbone_torsions(pdb, xtc, selection='all', angles += ['ZETA'] * len(indices_zeta) angles += ['CHI'] * len(indices_chi) # Calculate the torsions - all_indices = indices_alpha + indices_beta + indices_gamma + indices_delta + indices_epsilon + indices_zeta + indices_chi + all_indices = indices_alpha + indices_beta + indices_gamma + indices_delta \ + + indices_epsilon + indices_zeta + indices_chi torsions = get_torsions( pdb, xtc, sel=all_indices, first_frame=0, last_frame=None, step=1, diff --git a/pensa/features/pyemma_features.py b/pensa/features/pyemma_features.py index 190d685d..a1f15a57 100644 --- a/pensa/features/pyemma_features.py +++ b/pensa/features/pyemma_features.py @@ -65,7 +65,8 @@ def get_pyemma_features(pdb, xtc, start_frame=0, step_width=1, cossin=False, # Add backbone C-alpha distances. if 'bb-distances' in features: bbdistances_feat = pyemma.coordinates.featurizer(pdb) - bbdistances_feat.add_distances(bbdistances_feat.pairs(bbdistances_feat.select_Ca(), excluded_neighbors=2), periodic=False) + bbdistances_pairs = bbdistances_feat.pairs(bbdistances_feat.select_Ca(), excluded_neighbors=2) + bbdistances_feat.add_distances(bbdistances_pairs, periodic=False) bbdistances_data = pyemma.coordinates.load(xtc, features=bbdistances_feat, stride=step_width)[start_frame:] feature_names['bb-distances'] = _describe_dist_without_atom_numbers(bbdistances_feat) features_data['bb-distances'] = bbdistances_data diff --git a/pensa/features/water_features.py b/pensa/features/water_features.py index c3939c3d..277f224a 100644 --- a/pensa/features/water_features.py +++ b/pensa/features/water_features.py @@ -179,7 +179,8 @@ def get_water_features(structure_input, xtc_input, atomgroup, top_waters=10, waters_resid = counting[frame_no] if len(waters_resid) == 1: # (x, y, z) positions for the water oxygen at trajectory frame_no - water_atom_positions = [list(pos) for pos in u.select_atoms('byres index ' + str(waters_resid[0])).positions] + _selection = 'byres index ' + str(waters_resid[0]) + water_atom_positions = [list(pos) for pos in u.select_atoms(_selection).positions] psi, theta = _convert_to_dipole(water_atom_positions) psilist.append(psi) thetalist.append(theta) @@ -189,7 +190,8 @@ def get_water_features(structure_input, xtc_input, atomgroup, top_waters=10, for ID in waters_resid: freq_count.append([flat_list.count(ID), ID]) freq_count.sort(key=lambda x: x[0]) - water_atom_positions = [list(pos) for pos in u.select_atoms('byres index ' + str(freq_count[-1][1])).positions] + _selection = 'byres index ' + str(freq_count[-1][1]) + water_atom_positions = [list(pos) for pos in u.select_atoms(_selection).positions] psi, theta = _convert_to_dipole(water_atom_positions) psilist.append(psi) thetalist.append(theta) diff --git a/pensa/preprocessing/__init__.py b/pensa/preprocessing/__init__.py index b864ce9a..2b7c3451 100644 --- a/pensa/preprocessing/__init__.py +++ b/pensa/preprocessing/__init__.py @@ -1,4 +1,22 @@ -from .coordinates import * -from .selection import * -from .download import * -from .density import * +from .coordinates import \ + extract_coordinates, \ + extract_coordinates_combined, \ + align_coordinates, \ + sort_coordinates, \ + merge_coordinates, \ + merge_and_sort_coordinates + +from .selection import \ + range_to_string, \ + load_selection + +from .download import \ + download_from_gpcrmd, \ + get_transmem_from_uniprot + +from .density import \ + extract_aligned_coords, \ + extract_combined_grid, \ + get_grid, \ + dens_grid_pdb, \ + local_maxima_3D diff --git a/pensa/statesinfo/__init__.py b/pensa/statesinfo/__init__.py index c06d1997..b250f11e 100644 --- a/pensa/statesinfo/__init__.py +++ b/pensa/statesinfo/__init__.py @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- """ Methods for state-specific information. @@ -9,4 +8,10 @@ | https://doi.org/10.1101/2020.08.28.271510 """ -from .discrete_states import * +from .discrete_states import \ + smart_gauss_fit, \ + get_intersects, \ + determine_state_limits, \ + get_discrete_states, \ + calculate_entropy, \ + calculate_entropy_multthread diff --git a/tests/test_workflow.py b/tests/test_workflow.py index 38e4c67f..49aec996 100644 --- a/tests/test_workflow.py +++ b/tests/test_workflow.py @@ -8,21 +8,27 @@ obtain_clusters, wss_over_number_of_clusters, \ obtain_combined_clusters, wss_over_number_of_combined_clusters, \ write_cluster_traj + +from pensa.statesinfo import \ + get_discrete_states + from pensa.comparison import \ - get_discrete_states, \ relative_entropy_analysis, relen_block_analysis, relen_sem_analysis, \ ssi_ensemble_analysis, ssi_block_analysis, ssi_sem_analysis, \ residue_visualization, distances_visualization + from pensa.features import \ get_structure_features, \ get_multivar_res_timeseries, \ sort_features + from pensa.dimensionality import \ calculate_pca, pca_features, pca_eigenvalues_plot, \ calculate_tica, tica_features, tica_eigenvalues_plot, \ sort_traj_along_pc, sort_trajs_along_common_pc, \ sort_traj_along_tic, sort_trajs_along_common_tic, \ compare_projections + from pensa.preprocessing import \ load_selection, extract_coordinates, extract_coordinates_combined