added example for data preparation

Degiacomi-Lab · degiacom · Jun 3, 2024 · Jun 3, 2024 · Jun 3, 2024 · Jun 3, 2024
commit e2c55054c3756923fea6fb9fe758d4483da05a43
diff --git a/README.md b/README.md
@@ -27,6 +27,9 @@ The current version of molearn only supports Linux, and has verified to support
 
 #### Optional Packages
 
+To prepare a raw trajectory for training:
+* [mdtraj](https://mdtraj.org/1.9.4/index.html)
+
 To run energy evaluations with OpenMM:
 * [OpenMM](https://openmm.org/documentation)
 * [openmmtorchplugin](https://github.com/SCMusson/openmmtorchplugin)

diff --git a/environment.yml b/environment.yml
@@ -19,5 +19,6 @@ dependencies:
   - plotly
   - nglview
   - openmmtorchplugin
+  - mdtraj
   - pip:
     - geomloss
diff --git a/examples/data/preparation/MurDclosed.dcd b/examples/data/preparation/MurDclosed.dcd
diff --git a/examples/data/preparation/MurDopen.dcd b/examples/data/preparation/MurDopen.dcd
diff --git a/examples/data/preparation/topo_MurDclosed1F.pdb b/examples/data/preparation/topo_MurDclosed1F.pdb
diff --git a/examples/data/preparation/topo_MurDopen1F.pdb b/examples/data/preparation/topo_MurDopen1F.pdb
diff --git a/examples/prepare_example.py b/examples/prepare_example.py
@@ -0,0 +1,57 @@
+import os
+import sys
+
+sys.path.insert(0, os.path.join(os.path.abspath(os.pardir), "src"))
+from molearn.data import DataAssembler
+
+
+def main():
+    storage_path = "./clustered"
+    if not os.path.exists(storage_path):
+        os.mkdir(storage_path)
+    tm = DataAssembler(
+        # trajectories
+        [
+            "./data/preparation/MurDopen.dcd",
+            "./data/preparation/MurDclosed.dcd",
+        ],
+        # topologies
+        [
+            "./data/preparation/topo_MurDopen1F.pdb",
+            "./data/preparation/topo_MurDclosed1F.pdb",
+        ],
+        test_size=0.0,
+        n_cluster=5,
+        outpath=storage_path,
+        verbose=True,
+    )
+    # reading in the trajectories and removing of all atoms apart from protein atoms
+    tm.read_traj()
+    # using agglomerative clustering to sample the trajectories
+    tm.distance_cluster()
+    # creating the new trajectory as dcd file and a new topology as pdb file
+    tm.create_trajectories()
+    # using PCA and the first n components for KMeans clustering to sample the trajectories
+    tm.pca_cluster()
+    tm.create_trajectories()
+    # simply striding over the trajectories with a step size computed to result in n_cluster frames
+    tm.stride()
+    tm.create_trajectories()
+    """
+    the given example will create the following files in a new directory named 'clustered'
+    *   MurDopen_CLUSTER_aggl_train.dcd
+    *   MurDopen_CLUSTER_aggl_train_frames.txt
+    *   MurDopen_CLUSTER_pca_train.dcd
+    *   MurDopen_CLUSTER_pca_train_frames.txt
+    *   MurDopen_NEW_TOPO.pdb
+    *   MurDopen_STRIDE_5_train.dcd
+    *   MurDopen_STRIDE_5_train_frames.txt
+    the txt files contain the indices of frames of the original trajectory
+    the dcd files contain the new trajectory
+    the pdb file is the new topology for the new trajectory
+    all atoms apart from protein atoms will be removed
+    """
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/molearn/data/__init__.py b/src/molearn/data/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 Samuel C. Musson
+# Copyright (c) 2022 Samuel C. Musson, Gregor Wirnsberger
 #
 # Molearn is free software ;
 # you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation ;
-Original file line number
+Diff line change
@@ @@ -19,5 +19,6 @@ dependencies: @@
       - plotly
       - nglview
       - openmmtorchplugin
+      - mdtraj
       - pip:
         - geomloss