Skip to content

Commit

Permalink
Added a few program files.
Browse files Browse the repository at this point in the history
  • Loading branch information
kumaranu committed May 1, 2024
1 parent 76b0858 commit 11773f9
Show file tree
Hide file tree
Showing 9 changed files with 6,451 additions and 0 deletions.
4,488 changes: 4,488 additions & 0 deletions 3sxr_dasatinib_removed.pdb

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions call_combine.tcsh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/tcsh

#########################################################################
# #
# This script submits slurm jobs for ligand+protein and the ligand only #
# calculations. #
# #
#########################################################################


#Call to the energy calculations for the ligand+protein and the ligand
set nSplits = `ls -ltr moleculeLists/fiveSplits/file* | wc`

foreach i(`seq -w 0 $nSplits`)
sed "s/XX/$i/g" run_template.slurm > run$i.slurm
sbatch run$i.slurm
end

50 changes: 50 additions & 0 deletions combine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import os, glob, sys
from rdkit import Chem
from multiprocessing import Pool
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

###########################################################################
# #
# This program takes in one argument for the file index for the file #
# inside the directory called moleculeLists.txt named something like #
# file000, file001 etc. that contains the names of five drugs. The energy #
# calculations for these drugs is submitted together. This way, I #
# parallelize the calculations with brute force separate job submitions. #
# #
###########################################################################

molList = open('moleculeLists/fiveSplits/file' + sys.argv[1] + '.txt', 'r').readlines()
molList = [mol.strip() for mol in molList]
enzymeFile = '../../3sxr_dasatinib_removed.pdb'

os.chdir('rDock_inputs/')
for ligandName in molList:
os.chdir(ligandName)
suppl = Chem.SDMolSupplier(ligandName + '_docking_out_sorted.sd')
mols = [x for x in suppl]
m1 = mols[-1]
m2 = Chem.rdmolfiles.MolFromPDBFile(enzymeFile)

#Generating the ligand + protein geometry
combo = Chem.CombineMols(m1, m2)
combo = Chem.AddHs(combo, addCoords=True)
Chem.rdmolfiles.MolToPDBFile(combo, 'combo.pdb')

#Calculating the energy for the (ligand + protein) geometry
#As I say in the other document, xtb did not work and I ended up using rdkit
#for this step
#os.system('xtb --gfnff combo.xyz > combo.log')
res = AllChem.MMFFOptimizeMoleculeConfs(combo, maxIters=0, numThreads=0)

#Calculating the energies for the ligand only geometry
m1 = Chem.AddHs(m1, addCoords=True)
res1 = AllChem.MMFFOptimizeMoleculeConfs(m1, maxIters=0, numThreads=0)

#Saving the energies for the (ligand + protein) and the ligand only geometry in a file
np.savetxt('energies.txt', [res[0][1], res1[0][1]])
os.chdir('../')
os.chdir('../')


1,692 changes: 1,692 additions & 0 deletions drugs.txt

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions getEnergy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import os, glob, re, sys
import numpy as np

#######################################################################
# #
# This script extracts the energies for the three components required #
# in the binding energy calculations, which are (protein + ligand), #
# protein, and ligand energies. It calculates the binding energies #
# for all the drug molecules and prints them in kcal/mol. #
# #
#######################################################################


os.chdir('moleculeLists')
molList = open('fileList.txt', 'r').readlines()
molList = [mol.strip() for mol in molList]
os.chdir('../')

e1 = np.loadtxt('energy_protein/energies.txt')

os.chdir('rDock_inputs')
for mol in molList:
os.chdir(mol)
try:
eCombined = np.loadtxt('energies.txt')
print("%s, %3f, %3f, %3f, %3f, %3f" % (mol, eCombined[0], eCombined[1], e1, (eCombined[0] - eCombined[1] - e1), (eCombined[0] - eCombined[1] - e1)*627.503))
os.chdir('../')
except:
os.chdir('../')
continue
print(mol, 'skipped due to some error')
os.chdir('../')
23 changes: 23 additions & 0 deletions getScores.tcsh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/tcsh

#################################################################
# #
# This script extracts the docking scores for all the drugs and #
# sorts them. #
# #
#################################################################

rm -f allScores.txt

cd rDock_inputs/
foreach mol(`cat ../moleculeLists/fileList.txt | xargs`)
cd $mol
set score = `grep -iwA1 "<SCORE>" "$mol"_docking_out_sorted.sd | tail -1`
echo "$mol $score" >> ../../allScores.txt
cd ..
end
cd ..

sort -k2g allScores.txt > sorted_allScores.txt


29 changes: 29 additions & 0 deletions getSmilesFromFile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#This script extracts the smiles from the file named as drugs.txt
import os
def getSmilesFromFile(fileName):
INF = open(fileName,'r').readlines()
tmp1 = [i.strip().split() for i in INF][1:]
molNames, smiles = [], []
for i in tmp1:
if (i[-1] == 'TRUE') or (i[-1] == 'FALSE'):
continue
else:
if len(i) > 3:
molNames.append('_'.join([j.replace('/','-') for j in i[:-2]]))
smiles.append(i[-1])
elif len(i) == 3:
molNames.append(i[0])
smiles.append(i[2])
else:
print('something wrong with ', i, '.')

#Writing the names of molecules in a file inside a separate directory
#for future calculations
if not os.path.exists('moleculeLists'):
os.mkdir('moleculeLists')
os.chdir('moleculeLists')
with open("fileList.txt", 'w') as file:
file.write('\n'.join(molNames))

return [molNames, smiles]

43 changes: 43 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import getSmilesFromFile
from rdkit import Chem
from rdkit.Chem import AllChem
import os
import xyzFromSmiles

############################################################
# #
# This program generates the xyz coordinates for the drugs #
# molecules and the rdock inputs #
# #
############################################################

#Extracting the names and the smiles for all the drug molecules
molNames, smiles = getSmilesFromFile.getSmilesFromFile('drugs.txt')

#Creating a separate directory for the docking calculations
if not os.path.exists('rDock_inputs'):
os.mkdir('rDock_inputs')
os.chdir('rDock_inputs')

#Loading a prm file template to be edited later for each molecule
f = open('prm-template.prm', 'r').read()

for i in range(len(molNames)):
#Creating a separate directory for each drug's calculation
if not os.path.exists(molNames[i]):
os.mkdir(molNames[i])
os.chdir(molNames[i])

#Generating the xyz coordinates from smiles for the drug molecule
#and writing to a file in the sdf format
xyzFromSmiles.xyzFromSmiles(smiles[i], molNames[i])

#Generate prm file with the use of the template loaded earlier
f1 = f.replace('YYYYY', molNames[i])
file1 = open(molNames[i] + '_rdock.prm', "w")
file1.write(f1)
file1.close()

os.chdir('../')
os.chdir('../')

76 changes: 76 additions & 0 deletions make.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/bin/bash

#Activating a conda environment that contains rdock
conda activate docking-rdock

#######################################################################
# #
# The python program main.py below does the preprocessing on the data #
# provided inside the file drugs.txt. It extracts smiles format and #
# generats the xyz coordinates for each molecule. It also creates the #
# prm files for each them and stores them in separate directories for #
# the docking calculations. #
# #
#######################################################################
python main.py

source deactivate

#Generating a mol2 file for the receptor from the user-provided pdb
babel -ipdb 3sxr_dasatinib_removed.pdb -omol2 receptorFile.mol2

########################################################################
# #
# A serial implementation is written below and hence it should only be #
# used for small number of input drugs molecules. I ended up splitting #
# the calculations into multiple parts. #
# #
########################################################################

cd rDock_inputs
for mol in $( ls -d1 * ); do
cd $mol
echo "$mol"
rm *log *docking*sd *.as *.mol2
sed "s/YYYYY/$mol/g" ../../prm-template.prm > "$mol"_rdock.prm
cp ../../receptorFile.mol2 "$mol"_rdock.mol2
rbcavity -r "$mol"_rdock.prm -W > "$mol"_cavity.log
rbdock -r "$mol"_rdock.prm -p dock.prm -n 10 -i "$mol".sd -o "$mol"_docking_out > "$mol"_docking_out.log
sdsort -n -f'SCORE' "$mol"_docking_out.sd > "$mol"_docking_out_sorted.sd
cd ..
done
cd ..

#Extracting the docking scores for all the drug molecules
./getScores.tcsh

###############################################################
# #
# Calculating the binding energy for protein ligand complexes #
# #
###############################################################

#Splitting the drugs into sets of 5 to parallelize the energy calculation process
mkdir -p moleculeLists
cd moleculeLists
mkdir -p fiveSplits
cd fiveSplits
split -n l/5 --suffix-length=3 --additional-suffix=.txt --numeric-suffixes ../fileList.txt file
cd ..
cd ..

#Call to the energy calculations for the ligand+protein and the ligand
./call_combine.tcsh

#Calling a job submission script for the energy calculations for the protein.
#It requires the user to create a directory named energy_protein with the
#script protein.py and a slurm script run.slurm to be present in that directory.
#This step can obviously be done cleanly but this is how I did it.
cd energy_protein
sbatch run.slurm
cd ..

#Call to energy grepping python script
python getEnergy.py > all_energies.txt
sort -k6rg all_energies.txt | column -t > sorted_energies.txt

0 comments on commit 11773f9

Please sign in to comment.