Added a few program files.

kumaranu · May 1, 2024 · 11773f9 · 11773f9
1 parent 76b0858
commit 11773f9
Show file tree

Hide file tree

Showing 9 changed files with 6,451 additions and 0 deletions.
diff --git a/3sxr_dasatinib_removed.pdb b/3sxr_dasatinib_removed.pdb
diff --git a/call_combine.tcsh b/call_combine.tcsh
@@ -0,0 +1,18 @@
+#!/bin/tcsh
+
+#########################################################################
+#                                                                       #
+# This script submits slurm jobs for ligand+protein and the ligand only #
+# calculations.                                                         #
+#                                                                       #
+#########################################################################
+
+
+#Call to the energy calculations for the ligand+protein and the ligand
+set nSplits = `ls -ltr moleculeLists/fiveSplits/file* | wc`
+
+foreach i(`seq -w 0 $nSplits`)
+  sed "s/XX/$i/g" run_template.slurm > run$i.slurm
+  sbatch run$i.slurm
+end
+
diff --git a/combine.py b/combine.py
@@ -0,0 +1,50 @@
+import os, glob, sys
+from rdkit import Chem
+from multiprocessing import Pool
+from rdkit import Chem
+from rdkit.Chem import AllChem
+import numpy as np
+
+###########################################################################
+#                                                                         #
+# This program takes in one argument for the file index for the file      #
+# inside the directory called moleculeLists.txt named something like      #
+# file000, file001 etc. that contains the names of five drugs. The energy #
+# calculations for these drugs is submitted together. This way, I         #
+# parallelize the calculations with brute force separate job submitions.  #
+#                                                                         #
+###########################################################################
+
+molList = open('moleculeLists/fiveSplits/file' + sys.argv[1] + '.txt', 'r').readlines()
+molList = [mol.strip() for mol in molList]
+enzymeFile = '../../3sxr_dasatinib_removed.pdb'
+
+os.chdir('rDock_inputs/')
+for ligandName in molList:
+    os.chdir(ligandName)
+    suppl = Chem.SDMolSupplier(ligandName + '_docking_out_sorted.sd')
+    mols = [x for x in suppl]
+    m1 = mols[-1]
+    m2 = Chem.rdmolfiles.MolFromPDBFile(enzymeFile)
+
+    #Generating the ligand + protein geometry
+    combo = Chem.CombineMols(m1, m2)
+    combo = Chem.AddHs(combo, addCoords=True)
+    Chem.rdmolfiles.MolToPDBFile(combo, 'combo.pdb')
+
+    #Calculating the energy for the (ligand + protein) geometry
+    #As I say in the other document, xtb did not work and I ended up using rdkit
+    #for this step
+    #os.system('xtb --gfnff combo.xyz > combo.log')
+    res = AllChem.MMFFOptimizeMoleculeConfs(combo, maxIters=0, numThreads=0)
+
+    #Calculating the energies for the ligand only geometry
+    m1 = Chem.AddHs(m1, addCoords=True)
+    res1 = AllChem.MMFFOptimizeMoleculeConfs(m1, maxIters=0, numThreads=0)
+
+    #Saving the energies for the (ligand + protein) and the ligand only geometry in a file
+    np.savetxt('energies.txt', [res[0][1], res1[0][1]])
+    os.chdir('../')
+os.chdir('../')
+
+
diff --git a/drugs.txt b/drugs.txt
diff --git a/getEnergy.py b/getEnergy.py
@@ -0,0 +1,32 @@
+import os, glob, re, sys
+import numpy as np
+
+#######################################################################
+#                                                                     #
+# This script extracts the energies for the three components required #
+# in the binding energy calculations, which are (protein + ligand),   #
+# protein, and ligand energies. It calculates the binding energies    #
+# for all the drug molecules and prints them in kcal/mol.             #
+#                                                                     #
+#######################################################################
+
+
+os.chdir('moleculeLists')
+molList = open('fileList.txt', 'r').readlines()
+molList = [mol.strip() for mol in molList]
+os.chdir('../')
+
+e1 = np.loadtxt('energy_protein/energies.txt')
+
+os.chdir('rDock_inputs')
+for mol in molList:
+    os.chdir(mol)
+    try:
+        eCombined = np.loadtxt('energies.txt')
+        print("%s, %3f, %3f, %3f, %3f, %3f" % (mol, eCombined[0], eCombined[1], e1, (eCombined[0] - eCombined[1] - e1), (eCombined[0] - eCombined[1] - e1)*627.503))
+        os.chdir('../') 
+    except:
+        os.chdir('../') 
+        continue
+        print(mol, 'skipped due to some error')
+os.chdir('../')
diff --git a/getScores.tcsh b/getScores.tcsh
@@ -0,0 +1,23 @@
+#!/bin/tcsh
+
+#################################################################
+#                                                               #
+# This script extracts the docking scores for all the drugs and #
+# sorts them.                                                   #
+#                                                               #
+#################################################################
+
+rm -f allScores.txt
+
+cd rDock_inputs/
+  foreach mol(`cat ../moleculeLists/fileList.txt | xargs`)
+    cd $mol
+      set score = `grep -iwA1 "<SCORE>" "$mol"_docking_out_sorted.sd | tail -1`
+      echo "$mol $score" >> ../../allScores.txt
+    cd ..
+  end
+cd ..
+
+sort -k2g allScores.txt > sorted_allScores.txt
+
+
diff --git a/getSmilesFromFile.py b/getSmilesFromFile.py
@@ -0,0 +1,29 @@
+#This script extracts the smiles from the file named as drugs.txt
+import os
+def getSmilesFromFile(fileName):
+    INF = open(fileName,'r').readlines()
+    tmp1  = [i.strip().split() for i in INF][1:]
+    molNames, smiles = [], []
+    for i in tmp1:
+        if (i[-1] == 'TRUE') or (i[-1] == 'FALSE'):
+            continue
+        else:
+            if len(i) > 3:
+                molNames.append('_'.join([j.replace('/','-') for j in i[:-2]]))
+                smiles.append(i[-1])
+            elif len(i) == 3:
+                molNames.append(i[0])
+                smiles.append(i[2])
+            else:
+                print('something wrong with ', i, '.')
+
+    #Writing the names of molecules in a file inside a separate directory
+    #for future calculations
+    if not os.path.exists('moleculeLists'):
+        os.mkdir('moleculeLists')
+    os.chdir('moleculeLists')
+    with open("fileList.txt", 'w') as file:
+        file.write('\n'.join(molNames))
+
+    return [molNames, smiles]
+
diff --git a/main.py b/main.py
@@ -0,0 +1,43 @@
+import getSmilesFromFile
+from rdkit import Chem
+from rdkit.Chem import AllChem
+import os
+import xyzFromSmiles
+
+############################################################
+#                                                          #
+# This program generates the xyz coordinates for the drugs #
+# molecules and the rdock inputs                           #
+#                                                          #
+############################################################
+
+#Extracting the names and the smiles for all the drug molecules
+molNames, smiles = getSmilesFromFile.getSmilesFromFile('drugs.txt')
+
+#Creating a separate directory for the docking calculations
+if not os.path.exists('rDock_inputs'):
+    os.mkdir('rDock_inputs')
+os.chdir('rDock_inputs')
+
+#Loading a prm file template to be edited later for each molecule
+f = open('prm-template.prm', 'r').read()
+
+for i in range(len(molNames)):
+    #Creating a separate directory for each drug's calculation
+    if not os.path.exists(molNames[i]):
+        os.mkdir(molNames[i])
+    os.chdir(molNames[i])
+
+    #Generating the xyz coordinates from smiles for the drug molecule
+    #and writing to a file in the sdf format
+    xyzFromSmiles.xyzFromSmiles(smiles[i], molNames[i])
+
+    #Generate prm file with the use of the template loaded earlier
+    f1 = f.replace('YYYYY', molNames[i])
+    file1 = open(molNames[i] + '_rdock.prm', "w") 
+    file1.write(f1)
+    file1.close() 
+
+    os.chdir('../')
+os.chdir('../')
+
diff --git a/make.bash b/make.bash
@@ -0,0 +1,76 @@
+#!/bin/bash
+
+#Activating a conda environment that contains rdock
+conda activate docking-rdock
+
+#######################################################################
+#                                                                     #
+# The python program main.py below does the preprocessing on the data #
+# provided inside the file drugs.txt. It extracts smiles format and   #
+# generats the xyz coordinates for each molecule. It also creates the #
+# prm files for each them and stores them in separate directories for #
+# the docking calculations.                                           #
+#                                                                     #
+#######################################################################
+python main.py
+
+source deactivate
+
+#Generating a mol2 file for the receptor from the user-provided pdb
+babel -ipdb 3sxr_dasatinib_removed.pdb -omol2 receptorFile.mol2
+
+########################################################################
+#                                                                      #
+# A serial implementation is written below and hence it should only be #
+# used for small number of input drugs molecules. I ended up splitting #
+# the calculations into multiple parts.                                #
+#                                                                      #
+########################################################################
+
+cd rDock_inputs
+  for mol in $( ls -d1 * ); do
+    cd $mol
+      echo "$mol"
+      rm *log *docking*sd *.as *.mol2
+      sed "s/YYYYY/$mol/g" ../../prm-template.prm > "$mol"_rdock.prm
+      cp ../../receptorFile.mol2 "$mol"_rdock.mol2
+      rbcavity -r "$mol"_rdock.prm -W > "$mol"_cavity.log
+      rbdock -r "$mol"_rdock.prm -p dock.prm -n 10 -i "$mol".sd -o "$mol"_docking_out > "$mol"_docking_out.log
+      sdsort -n -f'SCORE' "$mol"_docking_out.sd > "$mol"_docking_out_sorted.sd
+    cd ..
+  done
+cd ..
+
+#Extracting the docking scores for all the drug molecules
+./getScores.tcsh
+
+###############################################################
+#                                                             #
+# Calculating the binding energy for protein ligand complexes #
+#                                                             #
+###############################################################
+
+#Splitting the drugs into sets of 5 to parallelize the energy calculation process
+mkdir -p moleculeLists
+cd moleculeLists
+  mkdir -p fiveSplits
+  cd fiveSplits
+    split -n l/5 --suffix-length=3 --additional-suffix=.txt --numeric-suffixes ../fileList.txt file
+  cd ..
+cd ..
+
+#Call to the energy calculations for the ligand+protein and the ligand
+./call_combine.tcsh
+
+#Calling a job submission script for the energy calculations for the protein.
+#It requires the user to create a directory named energy_protein with the
+#script protein.py and a slurm script run.slurm to be present in that directory.
+#This step can obviously be done cleanly but this is how I did it.
+cd energy_protein
+  sbatch run.slurm
+cd ..
+
+#Call to energy grepping python script
+python getEnergy.py > all_energies.txt
+sort -k6rg all_energies.txt | column -t > sorted_energies.txt
+