From a1c37e85ec1278055bde0d799146969ff4afff6e Mon Sep 17 00:00:00 2001
From: neilsyt <contactneilt@icloud.com>
Date: Tue, 26 Nov 2024 16:55:10 +0000
Subject: [PATCH] Updated and tested tutorials. Unified scripts.

---
 pensa/features/hbond_features.py              |  2 +-
 pensa/features/water_features.py              |  2 +-
 .../density_featurizer.py                     | 52 +++++++++----------
 .../hbond_featurizer.py                       | 12 ++---
 .../state_specific_info.py                    | 38 +++++++-------
 tutorial/0-download.sh                        |  0
 tutorial/1-preprocessing-gpcrmd.sh            |  0
 .../2-comparison-of-feature-distributions.sh  |  0
 tutorial/3-principal-component-analysis.sh    |  0
 tutorial/4-clustering.sh                      |  0
 tutorial/5-density-hbond-featurization.sh     |  8 +++
 tutorial/6-state-specific-information.sh      |  4 ++
 tutorial/runsh.sh                             | 16 ++++++
 13 files changed, 78 insertions(+), 56 deletions(-)
 rename tutorial/PENSA_Tutorial_density_featurizer.py => scripts/density_featurizer.py (54%)
 rename tutorial/PENSA_Tutorial_hbond_featurizer.py => scripts/hbond_featurizer.py (71%)
 rename tutorial/PENSA_Tutorial_SSI.py => scripts/state_specific_info.py (86%)
 mode change 100644 => 100755 tutorial/0-download.sh
 mode change 100644 => 100755 tutorial/1-preprocessing-gpcrmd.sh
 mode change 100644 => 100755 tutorial/2-comparison-of-feature-distributions.sh
 mode change 100644 => 100755 tutorial/3-principal-component-analysis.sh
 mode change 100644 => 100755 tutorial/4-clustering.sh
 create mode 100755 tutorial/5-density-hbond-featurization.sh
 create mode 100755 tutorial/6-state-specific-information.sh
 create mode 100755 tutorial/runsh.sh

diff --git a/pensa/features/hbond_features.py b/pensa/features/hbond_features.py
index 634aab3..fa7bfba 100644
--- a/pensa/features/hbond_features.py
+++ b/pensa/features/hbond_features.py
@@ -483,7 +483,7 @@ def read_water_site_h_bonds_quickly(structure_input, xtc_input, atomgroups, site
         else:
             g = Grid(grid_input)
     elif grid_input is None:
-        g = generate_grid(u, atomgroups)
+        g = generate_grid(u, atomgroups[0])
     else:
         g = Grid(grid_input)
 
diff --git a/pensa/features/water_features.py b/pensa/features/water_features.py
index 1add901..ced3470 100644
--- a/pensa/features/water_features.py
+++ b/pensa/features/water_features.py
@@ -202,7 +202,7 @@ def read_water_features(structure_input, xtc_input, atomgroup, top_waters=10,
         water_information.append([water_ID, list(atom_location), water_pocket_occupation_frequency])
 
         # Write data out and visualize water sites in pdb
-        if write is True:
+        if out_name is not None:
             data_out(out_name + '_' + water_ID + '.txt', water_out)
             data_out(out_name + '_WaterSummary.txt', water_information)
             write_atom_to_pdb(pdb_outname, atom_location, water_ID, atomgroup)
diff --git a/tutorial/PENSA_Tutorial_density_featurizer.py b/scripts/density_featurizer.py
similarity index 54%
rename from tutorial/PENSA_Tutorial_density_featurizer.py
rename to scripts/density_featurizer.py
index a726a77..64fc185 100755
--- a/tutorial/PENSA_Tutorial_density_featurizer.py
+++ b/scripts/density_featurizer.py
@@ -22,45 +22,43 @@
 
  """
 
-struc = "mor-data/11426_dyn_151.pdb"
-xtc = "mor-data/11423_trj_151.xtc"
-water_feat, water_data = read_water_features(
-    structure_input=struc,
-    xtc_input=xtc,
-    top_waters=1,
-    atomgroup="OH2",
-    write=True,
-    write_grid_as="TIP3P",
-    out_name="11426_dyn_151"
-)
+# struc = "mor-data/11426_dyn_151.pdb"
+# xtc = "mor-data/11423_trj_151.xtc"
+# water_feat, water_data = read_water_features(
+#     structure_input=struc,
+#     xtc_input=xtc,
+#     top_waters=1,
+#     atomgroup="OH2",
+#     write_grid_as="TIP3P",
+#     out_name="11426_dyn_151"
+# )
 
 
-# We can use the get_atom_features, which provides the same
-# functionality but ignores orientations as atoms are considered spherically symmetric.
+# # We can use the get_atom_features, which provides the same
+# # functionality but ignores orientations as atoms are considered spherically symmetric.
 
-struc = "mor-data/11426_dyn_151.pdb"
+# struc = "mor-data/11426_dyn_151.pdb"
 
-xtc = "mor-data/11423_trj_151.xtc"
+# xtc = "mor-data/11423_trj_151.xtc"
 
-# Here we locate the sodium site which has the highest probability
-# The density grid is written (write=True) using the default density conversion "Angstrom^{-3}" in MDAnalysis
+# # Here we locate the sodium site which has the highest probability
+# # The density grid is written (write=True) using the default density conversion "Angstrom^{-3}" in MDAnalysis
 
-atom_feat, atom_data = read_atom_features(
-    structure_input=struc,
-    xtc_input=xtc,
-    top_atoms=1,
-    atomgroup="SOD",
-    element="Na",
-    write=True,
-    out_name="11426_dyn_151"
-)
+# atom_feat, atom_data = read_atom_features(
+#     structure_input=struc,
+#     xtc_input=xtc,
+#     top_atoms=1,
+#     atomgroup="SOD",
+#     element="Na",
+#     out_name="11426_dyn_151"
+# )
 
 
 # If we have already obtained the grid, we can speed up featurization by reading it in.
 
 struc = "mor-data/11426_dyn_151.pdb"
 xtc = "mor-data/11423_trj_151.xtc"
-grid = "dens/11426_dyn_151OH2_density.dx"
+grid = "11426_dyn_151_OH2_density.dx"
 water_feat, water_data = read_water_features(
     structure_input=struc,
     xtc_input=xtc,
diff --git a/tutorial/PENSA_Tutorial_hbond_featurizer.py b/scripts/hbond_featurizer.py
similarity index 71%
rename from tutorial/PENSA_Tutorial_hbond_featurizer.py
rename to scripts/hbond_featurizer.py
index 2ddab20..e23f3ae 100644
--- a/tutorial/PENSA_Tutorial_hbond_featurizer.py
+++ b/scripts/hbond_featurizer.py
@@ -1,4 +1,4 @@
-from pensa.features import read_cavity_bonds, read_h_bonds
+from pensa.features import hbond_features, atom_features
 
 # Featurize water cavity hydrogen bonds
 
@@ -8,26 +8,24 @@
 pdb_file_a = root_dir + '/11426_dyn_151.pdb'
 trj_file_a = root_dir + '/11423_trj_151.xtc'
 
-names, data = read_cavity_bonds(
+names, data = hbond_features.read_water_site_h_bonds_quickly(
     ref_file_a, trj_file_a,
     atomgroups=['OH2', 'H1', 'H2'],
     site_IDs=[1, 2],
     grid_input=None,
-    write=True,
     write_grid_as='TIP3P',
-    out_name='11423_trj_169'
+    out_name='11423_trj_151'
 )
 
-# Faturize ligand-protein hydrogen bonds
+# Featurize ligand-protein hydrogen bonds
 
 ref_file_a = root_dir + '/11580_dyn_169.psf'
 pdb_file_a = root_dir + '/11579_dyn_169.pdb'
 trj_file_a = root_dir + '/11578_trj_169.xtc'
 
-names, data = read_h_bonds(
+names, data = hbond_features.read_h_bonds(
     ref_file_a, trj_file_a,
     fixed_group='resname 4VO',
     dyn_group='protein',
-    write=True,
     out_name='4VO_hbonds'
 )
diff --git a/tutorial/PENSA_Tutorial_SSI.py b/scripts/state_specific_info.py
similarity index 86%
rename from tutorial/PENSA_Tutorial_SSI.py
rename to scripts/state_specific_info.py
index f608c43..1485d3d 100755
--- a/tutorial/PENSA_Tutorial_SSI.py
+++ b/scripts/state_specific_info.py
@@ -52,7 +52,7 @@
     if not os.path.exists(subdir):
         os.makedirs(subdir)
 
-# Extract the coordinates of the receptor from the trajectory
+# # # Extract the coordinates of the receptor from the trajectory
 extract_coordinates(
     ref_file_a, pdb_file_a, trj_file_a,
     out_name_a + "_receptor", sel_base_a
@@ -62,7 +62,7 @@
     out_name_b + "_receptor", sel_base_b
 )
 
-# Extract the features from the beginning (start_frame) of the trajectory
+# # Extract the features from the beginning (start_frame) of the trajectory
 start_frame = 0
 a_rec = read_structure_features(
     out_name_a + "_receptor.gro",
@@ -81,9 +81,9 @@
 out_name_a = "condition-a"
 out_name_b = "condition-b"
 
-# Extract the multivariate torsion coordinates of each residue as a
-# timeseries from the trajectory and write into subdirectory
-# output = [[torsion 1 timeseries], [torsion 2 timeseries], ..., [torsion n timeseries]]
+# # Extract the multivariate torsion coordinates of each residue as a
+# # timeseries from the trajectory and write into subdirectory
+# # output = [[torsion 1 timeseries], [torsion 2 timeseries], ..., [torsion n timeseries]]
 sc_multivar_res_feat_a, sc_multivar_res_data_a = get_multivar_res_timeseries(
     a_rec_feat, a_rec_data, 'sc-torsions', write=True, out_name=out_name_a
 )
@@ -96,16 +96,16 @@
     discretize='gaussian', pbc=True
 )
 
-# We can calculate the State Specific Information (SSI) shared between the
-# ensemble switch and the combined ensemble residue conformations. As the ensemble
-# is a binary change, SSI can exist within the range [0, 1] units=bits.
-# 0 bits = no information, 1 bits = maximum information, i.e. you can predict the state of the ensemble
-# with certainty from the state of the residue.
-# Set write_plots = True to generate a folder with all the clustered states for each residue.
+# # We can calculate the State Specific Information (SSI) shared between the
+# # ensemble switch and the combined ensemble residue conformations. As the ensemble
+# # is a binary change, SSI can exist within the range [0, 1] units=bits.
+# # 0 bits = no information, 1 bits = maximum information, i.e. you can predict the state of the ensemble
+# # with certainty from the state of the residue.
+# # Set write_plots = True to generate a folder with all the clustered states for each residue.
 data_names, data_ssi = ssi_ensemble_analysis(
     sc_multivar_res_feat_a['sc-torsions'], sc_multivar_res_feat_b['sc-torsions'],
     sc_multivar_res_data_a['sc-torsions'], sc_multivar_res_data_b['sc-torsions'],
-    discrete_states_ab, verbose=True, write_plots=False
+    discrete_states_ab, verbose=True, write_plots=True
 )
 
 # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
@@ -149,7 +149,7 @@
 
 # Extract the combined density of the waters in both ensembles a and b
 extract_combined_grid(
-    out_name_a + ".gro", "dens/cond-a_wateraligned.xtc",
+    out_name_a + ".gro", "traj/cond-a_water_aligned.xtc",
     out_name_b + ".gro", out_name_b + ".xtc",
     atomgroup="OH2",
     write_grid_as="TIP3P",
@@ -157,18 +157,17 @@
     use_memmap=True
 )
 
-grid_combined = "dens/ab_grid_OH2_density.dx"
+grid_combined = "ab_grid_OH2_density.dx"
 
 # Then we featurize the waters common to both simulations
 # We can do the same analysis for ions using the get_atom_features featurizer.
 
 water_feat_a, water_data_a = read_water_features(
     structure_input=out_name_a + ".gro",
-    xtc_input="dens/cond-a_wateraligned.xtc",
+    xtc_input="traj/cond-a_water_aligned.xtc",
     top_waters=2,
     atomgroup="OH2",
     grid_input=grid_combined,
-    write=True,
     out_name="cond_a"
 )
 
@@ -178,15 +177,14 @@
     top_waters=2,
     atomgroup="OH2",
     grid_input=grid_combined,
-    write=True,
     out_name="cond_b"
 )
 
-# Calculating SSI is then exactly the same as for residues
+# Calculating SSI is then exactly the same as for residues, with the h2o argument set to True. 
 discrete_states_ab1 = get_discrete_states(
     water_data_a['WaterPocket_Distr'],
     water_data_b['WaterPocket_Distr'],
-    discretize='gaussian', pbc=True
+    discretize='gaussian', pbc=True, h2o=True, write_plots=True
 )
 
 # SSI shared between waters and the switch between ensemble conditions
@@ -198,7 +196,7 @@
 
 # Alternatively we can see if the pocket occupancy (the presence/absence of water at the site) shares SSI
 # Currently this is only enabled with ssi_ensemble_analysis. We need to turn off the periodic boundary conditions
-# as the distributions are no longer periodic.
+# as the distributions are no longer periodic. 
 
 discrete_states_ab2 = get_discrete_states(
     water_data_a['WaterPocket_OccupDistr'],
diff --git a/tutorial/0-download.sh b/tutorial/0-download.sh
old mode 100644
new mode 100755
diff --git a/tutorial/1-preprocessing-gpcrmd.sh b/tutorial/1-preprocessing-gpcrmd.sh
old mode 100644
new mode 100755
diff --git a/tutorial/2-comparison-of-feature-distributions.sh b/tutorial/2-comparison-of-feature-distributions.sh
old mode 100644
new mode 100755
diff --git a/tutorial/3-principal-component-analysis.sh b/tutorial/3-principal-component-analysis.sh
old mode 100644
new mode 100755
diff --git a/tutorial/4-clustering.sh b/tutorial/4-clustering.sh
old mode 100644
new mode 100755
diff --git a/tutorial/5-density-hbond-featurization.sh b/tutorial/5-density-hbond-featurization.sh
new file mode 100755
index 0000000..b2936be
--- /dev/null
+++ b/tutorial/5-density-hbond-featurization.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+
+
+#python ../scripts/hbond_featurizer.py
+
+python ../scripts/density_featurizer.py
+
diff --git a/tutorial/6-state-specific-information.sh b/tutorial/6-state-specific-information.sh
new file mode 100755
index 0000000..7a98c8e
--- /dev/null
+++ b/tutorial/6-state-specific-information.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+python ../scripts/state_specific_info.py
+
diff --git a/tutorial/runsh.sh b/tutorial/runsh.sh
new file mode 100755
index 0000000..bae5a4e
--- /dev/null
+++ b/tutorial/runsh.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+
+
+./1-preprocessing-gpcrmd.sh
+
+./2-comparison-of-feature-distributions.sh
+
+./3-principal-component-analysis.sh
+
+./4-clustering.sh
+
+./5-density-hbond-featurization.sh
+
+./6-state-specific-information.sh
+-