From a1c37e85ec1278055bde0d799146969ff4afff6e Mon Sep 17 00:00:00 2001 From: neilsyt Date: Tue, 26 Nov 2024 16:55:10 +0000 Subject: [PATCH] Updated and tested tutorials. Unified scripts. --- pensa/features/hbond_features.py | 2 +- pensa/features/water_features.py | 2 +- .../density_featurizer.py | 52 +++++++++---------- .../hbond_featurizer.py | 12 ++--- .../state_specific_info.py | 38 +++++++------- tutorial/0-download.sh | 0 tutorial/1-preprocessing-gpcrmd.sh | 0 .../2-comparison-of-feature-distributions.sh | 0 tutorial/3-principal-component-analysis.sh | 0 tutorial/4-clustering.sh | 0 tutorial/5-density-hbond-featurization.sh | 8 +++ tutorial/6-state-specific-information.sh | 4 ++ tutorial/runsh.sh | 16 ++++++ 13 files changed, 78 insertions(+), 56 deletions(-) rename tutorial/PENSA_Tutorial_density_featurizer.py => scripts/density_featurizer.py (54%) rename tutorial/PENSA_Tutorial_hbond_featurizer.py => scripts/hbond_featurizer.py (71%) rename tutorial/PENSA_Tutorial_SSI.py => scripts/state_specific_info.py (86%) mode change 100644 => 100755 tutorial/0-download.sh mode change 100644 => 100755 tutorial/1-preprocessing-gpcrmd.sh mode change 100644 => 100755 tutorial/2-comparison-of-feature-distributions.sh mode change 100644 => 100755 tutorial/3-principal-component-analysis.sh mode change 100644 => 100755 tutorial/4-clustering.sh create mode 100755 tutorial/5-density-hbond-featurization.sh create mode 100755 tutorial/6-state-specific-information.sh create mode 100755 tutorial/runsh.sh diff --git a/pensa/features/hbond_features.py b/pensa/features/hbond_features.py index 634aab3..fa7bfba 100644 --- a/pensa/features/hbond_features.py +++ b/pensa/features/hbond_features.py @@ -483,7 +483,7 @@ def read_water_site_h_bonds_quickly(structure_input, xtc_input, atomgroups, site else: g = Grid(grid_input) elif grid_input is None: - g = generate_grid(u, atomgroups) + g = generate_grid(u, atomgroups[0]) else: g = Grid(grid_input) diff --git a/pensa/features/water_features.py b/pensa/features/water_features.py index 1add901..ced3470 100644 --- a/pensa/features/water_features.py +++ b/pensa/features/water_features.py @@ -202,7 +202,7 @@ def read_water_features(structure_input, xtc_input, atomgroup, top_waters=10, water_information.append([water_ID, list(atom_location), water_pocket_occupation_frequency]) # Write data out and visualize water sites in pdb - if write is True: + if out_name is not None: data_out(out_name + '_' + water_ID + '.txt', water_out) data_out(out_name + '_WaterSummary.txt', water_information) write_atom_to_pdb(pdb_outname, atom_location, water_ID, atomgroup) diff --git a/tutorial/PENSA_Tutorial_density_featurizer.py b/scripts/density_featurizer.py similarity index 54% rename from tutorial/PENSA_Tutorial_density_featurizer.py rename to scripts/density_featurizer.py index a726a77..64fc185 100755 --- a/tutorial/PENSA_Tutorial_density_featurizer.py +++ b/scripts/density_featurizer.py @@ -22,45 +22,43 @@ """ -struc = "mor-data/11426_dyn_151.pdb" -xtc = "mor-data/11423_trj_151.xtc" -water_feat, water_data = read_water_features( - structure_input=struc, - xtc_input=xtc, - top_waters=1, - atomgroup="OH2", - write=True, - write_grid_as="TIP3P", - out_name="11426_dyn_151" -) +# struc = "mor-data/11426_dyn_151.pdb" +# xtc = "mor-data/11423_trj_151.xtc" +# water_feat, water_data = read_water_features( +# structure_input=struc, +# xtc_input=xtc, +# top_waters=1, +# atomgroup="OH2", +# write_grid_as="TIP3P", +# out_name="11426_dyn_151" +# ) -# We can use the get_atom_features, which provides the same -# functionality but ignores orientations as atoms are considered spherically symmetric. +# # We can use the get_atom_features, which provides the same +# # functionality but ignores orientations as atoms are considered spherically symmetric. -struc = "mor-data/11426_dyn_151.pdb" +# struc = "mor-data/11426_dyn_151.pdb" -xtc = "mor-data/11423_trj_151.xtc" +# xtc = "mor-data/11423_trj_151.xtc" -# Here we locate the sodium site which has the highest probability -# The density grid is written (write=True) using the default density conversion "Angstrom^{-3}" in MDAnalysis +# # Here we locate the sodium site which has the highest probability +# # The density grid is written (write=True) using the default density conversion "Angstrom^{-3}" in MDAnalysis -atom_feat, atom_data = read_atom_features( - structure_input=struc, - xtc_input=xtc, - top_atoms=1, - atomgroup="SOD", - element="Na", - write=True, - out_name="11426_dyn_151" -) +# atom_feat, atom_data = read_atom_features( +# structure_input=struc, +# xtc_input=xtc, +# top_atoms=1, +# atomgroup="SOD", +# element="Na", +# out_name="11426_dyn_151" +# ) # If we have already obtained the grid, we can speed up featurization by reading it in. struc = "mor-data/11426_dyn_151.pdb" xtc = "mor-data/11423_trj_151.xtc" -grid = "dens/11426_dyn_151OH2_density.dx" +grid = "11426_dyn_151_OH2_density.dx" water_feat, water_data = read_water_features( structure_input=struc, xtc_input=xtc, diff --git a/tutorial/PENSA_Tutorial_hbond_featurizer.py b/scripts/hbond_featurizer.py similarity index 71% rename from tutorial/PENSA_Tutorial_hbond_featurizer.py rename to scripts/hbond_featurizer.py index 2ddab20..e23f3ae 100644 --- a/tutorial/PENSA_Tutorial_hbond_featurizer.py +++ b/scripts/hbond_featurizer.py @@ -1,4 +1,4 @@ -from pensa.features import read_cavity_bonds, read_h_bonds +from pensa.features import hbond_features, atom_features # Featurize water cavity hydrogen bonds @@ -8,26 +8,24 @@ pdb_file_a = root_dir + '/11426_dyn_151.pdb' trj_file_a = root_dir + '/11423_trj_151.xtc' -names, data = read_cavity_bonds( +names, data = hbond_features.read_water_site_h_bonds_quickly( ref_file_a, trj_file_a, atomgroups=['OH2', 'H1', 'H2'], site_IDs=[1, 2], grid_input=None, - write=True, write_grid_as='TIP3P', - out_name='11423_trj_169' + out_name='11423_trj_151' ) -# Faturize ligand-protein hydrogen bonds +# Featurize ligand-protein hydrogen bonds ref_file_a = root_dir + '/11580_dyn_169.psf' pdb_file_a = root_dir + '/11579_dyn_169.pdb' trj_file_a = root_dir + '/11578_trj_169.xtc' -names, data = read_h_bonds( +names, data = hbond_features.read_h_bonds( ref_file_a, trj_file_a, fixed_group='resname 4VO', dyn_group='protein', - write=True, out_name='4VO_hbonds' ) diff --git a/tutorial/PENSA_Tutorial_SSI.py b/scripts/state_specific_info.py similarity index 86% rename from tutorial/PENSA_Tutorial_SSI.py rename to scripts/state_specific_info.py index f608c43..1485d3d 100755 --- a/tutorial/PENSA_Tutorial_SSI.py +++ b/scripts/state_specific_info.py @@ -52,7 +52,7 @@ if not os.path.exists(subdir): os.makedirs(subdir) -# Extract the coordinates of the receptor from the trajectory +# # # Extract the coordinates of the receptor from the trajectory extract_coordinates( ref_file_a, pdb_file_a, trj_file_a, out_name_a + "_receptor", sel_base_a @@ -62,7 +62,7 @@ out_name_b + "_receptor", sel_base_b ) -# Extract the features from the beginning (start_frame) of the trajectory +# # Extract the features from the beginning (start_frame) of the trajectory start_frame = 0 a_rec = read_structure_features( out_name_a + "_receptor.gro", @@ -81,9 +81,9 @@ out_name_a = "condition-a" out_name_b = "condition-b" -# Extract the multivariate torsion coordinates of each residue as a -# timeseries from the trajectory and write into subdirectory -# output = [[torsion 1 timeseries], [torsion 2 timeseries], ..., [torsion n timeseries]] +# # Extract the multivariate torsion coordinates of each residue as a +# # timeseries from the trajectory and write into subdirectory +# # output = [[torsion 1 timeseries], [torsion 2 timeseries], ..., [torsion n timeseries]] sc_multivar_res_feat_a, sc_multivar_res_data_a = get_multivar_res_timeseries( a_rec_feat, a_rec_data, 'sc-torsions', write=True, out_name=out_name_a ) @@ -96,16 +96,16 @@ discretize='gaussian', pbc=True ) -# We can calculate the State Specific Information (SSI) shared between the -# ensemble switch and the combined ensemble residue conformations. As the ensemble -# is a binary change, SSI can exist within the range [0, 1] units=bits. -# 0 bits = no information, 1 bits = maximum information, i.e. you can predict the state of the ensemble -# with certainty from the state of the residue. -# Set write_plots = True to generate a folder with all the clustered states for each residue. +# # We can calculate the State Specific Information (SSI) shared between the +# # ensemble switch and the combined ensemble residue conformations. As the ensemble +# # is a binary change, SSI can exist within the range [0, 1] units=bits. +# # 0 bits = no information, 1 bits = maximum information, i.e. you can predict the state of the ensemble +# # with certainty from the state of the residue. +# # Set write_plots = True to generate a folder with all the clustered states for each residue. data_names, data_ssi = ssi_ensemble_analysis( sc_multivar_res_feat_a['sc-torsions'], sc_multivar_res_feat_b['sc-torsions'], sc_multivar_res_data_a['sc-torsions'], sc_multivar_res_data_b['sc-torsions'], - discrete_states_ab, verbose=True, write_plots=False + discrete_states_ab, verbose=True, write_plots=True ) # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # @@ -149,7 +149,7 @@ # Extract the combined density of the waters in both ensembles a and b extract_combined_grid( - out_name_a + ".gro", "dens/cond-a_wateraligned.xtc", + out_name_a + ".gro", "traj/cond-a_water_aligned.xtc", out_name_b + ".gro", out_name_b + ".xtc", atomgroup="OH2", write_grid_as="TIP3P", @@ -157,18 +157,17 @@ use_memmap=True ) -grid_combined = "dens/ab_grid_OH2_density.dx" +grid_combined = "ab_grid_OH2_density.dx" # Then we featurize the waters common to both simulations # We can do the same analysis for ions using the get_atom_features featurizer. water_feat_a, water_data_a = read_water_features( structure_input=out_name_a + ".gro", - xtc_input="dens/cond-a_wateraligned.xtc", + xtc_input="traj/cond-a_water_aligned.xtc", top_waters=2, atomgroup="OH2", grid_input=grid_combined, - write=True, out_name="cond_a" ) @@ -178,15 +177,14 @@ top_waters=2, atomgroup="OH2", grid_input=grid_combined, - write=True, out_name="cond_b" ) -# Calculating SSI is then exactly the same as for residues +# Calculating SSI is then exactly the same as for residues, with the h2o argument set to True. discrete_states_ab1 = get_discrete_states( water_data_a['WaterPocket_Distr'], water_data_b['WaterPocket_Distr'], - discretize='gaussian', pbc=True + discretize='gaussian', pbc=True, h2o=True, write_plots=True ) # SSI shared between waters and the switch between ensemble conditions @@ -198,7 +196,7 @@ # Alternatively we can see if the pocket occupancy (the presence/absence of water at the site) shares SSI # Currently this is only enabled with ssi_ensemble_analysis. We need to turn off the periodic boundary conditions -# as the distributions are no longer periodic. +# as the distributions are no longer periodic. discrete_states_ab2 = get_discrete_states( water_data_a['WaterPocket_OccupDistr'], diff --git a/tutorial/0-download.sh b/tutorial/0-download.sh old mode 100644 new mode 100755 diff --git a/tutorial/1-preprocessing-gpcrmd.sh b/tutorial/1-preprocessing-gpcrmd.sh old mode 100644 new mode 100755 diff --git a/tutorial/2-comparison-of-feature-distributions.sh b/tutorial/2-comparison-of-feature-distributions.sh old mode 100644 new mode 100755 diff --git a/tutorial/3-principal-component-analysis.sh b/tutorial/3-principal-component-analysis.sh old mode 100644 new mode 100755 diff --git a/tutorial/4-clustering.sh b/tutorial/4-clustering.sh old mode 100644 new mode 100755 diff --git a/tutorial/5-density-hbond-featurization.sh b/tutorial/5-density-hbond-featurization.sh new file mode 100755 index 0000000..b2936be --- /dev/null +++ b/tutorial/5-density-hbond-featurization.sh @@ -0,0 +1,8 @@ +#!/bin/bash + + + +#python ../scripts/hbond_featurizer.py + +python ../scripts/density_featurizer.py + diff --git a/tutorial/6-state-specific-information.sh b/tutorial/6-state-specific-information.sh new file mode 100755 index 0000000..7a98c8e --- /dev/null +++ b/tutorial/6-state-specific-information.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +python ../scripts/state_specific_info.py + diff --git a/tutorial/runsh.sh b/tutorial/runsh.sh new file mode 100755 index 0000000..bae5a4e --- /dev/null +++ b/tutorial/runsh.sh @@ -0,0 +1,16 @@ +#!/bin/bash + + + +./1-preprocessing-gpcrmd.sh + +./2-comparison-of-feature-distributions.sh + +./3-principal-component-analysis.sh + +./4-clustering.sh + +./5-density-hbond-featurization.sh + +./6-state-specific-information.sh +-