forked from Laboratoire-de-Chemoinformatique/Synt-On
-
Notifications
You must be signed in to change notification settings - Fork 0
/
SyntOn_BBScaffoldGeneration.py
70 lines (64 loc) · 3.94 KB
/
SyntOn_BBScaffoldGeneration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import sys,os
from rdkit import Chem
srcPath = os.path.split(os.path.realpath(__file__))[0]
sys.path.insert(1, srcPath)
from src.UsefulFunctions import *
from src.SyntOn_BBs import *
def main(args):
    """Generate scaffolds for the BBs in ``args.input`` and summarise them.

    The first whitespace-separated token of every non-empty input line is
    passed to ``generateScaffoldForBB(..., returnObjects=True)``, whose result
    is unpacked as ``(scaffold, mol)``. Three files are written, each named
    ``args.output`` plus a fixed ending:

    * ``_Scaffolds.smi`` - the input line, the SMILES of ``mol`` and the
      scaffold (or ``linearMolecule`` when no scaffold was returned);
    * ``_scaffoldsCounts.smi`` - ``scaffold count`` pairs, most frequent first;
    * ``_cumulativeprecentage.smi`` - per scaffold rank, the percentage of
      scaffolds covered so far and the cumulative percentage of counted BBs,
      both rounded to integers.

    Finally ``scaffoldPlot`` is called on the cumulative-percentage file.

    Args:
        args: object exposing ``input`` (path of the BB file) and ``output``
            (prefix of the generated files) attributes.
    """
    from collections import Counter

    scaffoldsCount = Counter()
    # The output is opened first so a missing input still leaves the (empty)
    # output file behind, as before; the input handle is now closed
    # deterministically instead of being left to the garbage collector.
    with open(args.output + "_Scaffolds.smi", "w") as out, open(args.input) as inp:
        for line in inp:
            sline = line.strip()
            if not sline:
                continue
            scaffold, mol = generateScaffoldForBB(sline.split()[0], returnObjects=True)
            # NOTE(review): the source lost its indentation; the scaffold
            # branch is assumed to be nested under `if mol:` so that nothing
            # is written for entries without a molecule - confirm upstream.
            if not mol:
                continue
            out.write(sline + " " + Chem.MolToSmiles(mol))
            if scaffold:
                scaffoldsCount[scaffold] += 1
                out.write(" " + scaffold + "\n")
            else:
                out.write(" linearMolecule\n")

    # Stable sort by descending count: ties keep first-seen order, exactly
    # like sorted(..., key=counts.get, reverse=True) on the plain dict.
    ranked = scaffoldsCount.most_common()
    scaffoldsCount.clear()

    with open(args.output + "_scaffoldsCounts.smi", "w") as outCounts:
        outCounts.writelines(f"{scaffold} {count}\n" for scaffold, count in ranked)

    # Only BBs that produced a scaffold are counted here.
    TotalCompNumb = sum(count for _, count in ranked)
    TotalScaffNumb = len(ranked)
    with open(args.output + "_cumulativeprecentage.smi", "w") as outCumPer:
        cumSum = 0
        # The loop body is skipped when there are no scaffolds, so the
        # divisions below never see a zero denominator.
        for rank, (_, count) in enumerate(ranked, start=1):
            cumSum += count
            scaffPercent = int(round(rank / TotalScaffNumb * 100))
            compPercent = int(round(cumSum / TotalCompNumb * 100))
            outCumPer.write(f"{scaffPercent} {compPercent}\n")

    scaffoldPlot(args.output + "_cumulativeprecentage.smi", args.output)
def scaffoldPlot(cumPercentageFile, outName):
    """Draw the cumulative scaffold frequency plot and save it as a PNG.

    Args:
        cumPercentageFile: path of a space-delimited two-column text file;
            the first column is plotted on the x axis, the second on the y
            axis.
        outName: output name; the image is saved as
            ``Scaffolds_FreqPlot_<name>.png``, placed in the directory part
            of ``outName`` when it has one.
    """
    import os

    from matplotlib import pyplot as plt
    from numpy import genfromtxt

    Data = genfromtxt(cumPercentageFile, delimiter=' ', names=['x', 'y'])
    fig, ax = plt.subplots()
    try:
        ax.tick_params(axis='both', which='major', labelsize=12)
        ax.plot(Data['x'], Data['y'], color="darkgreen")
        ax.set_ylim(0, 100)
        ax.set_xlim(0, 100)
        ax.set_ylabel("Fraction of BBs, %", fontweight='bold', fontsize=14)
        ax.set_xlabel("Fraction of scaffolds, %", fontweight='bold', fontsize=14)
        ax.set_title("Cumulative Scaffold Frequency Plot", fontweight='bold', fontsize=14)
        # Prefix only the file-name part: prepending the prefix to a value
        # such as "results/run1" would point into a non-existent directory.
        # For a plain name the resulting path is unchanged.
        outDir, outBase = os.path.split(outName)
        fig.savefig(os.path.join(outDir, "Scaffolds_FreqPlot_" + outBase + ".png"))
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
# Command-line entry point: parse the input/output options and run main().
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description="BBs Scaffold analysis. Generates meaningful BBs scaffolds after removing ring-containing leaving and protective groups. Count scaffolds occurrence in the provided collection of BBs, and construct cumulative scaffold frequency plot.",
                                     epilog="Code implementation: Yuliana Zabolotna, Alexandre Varnek\n"
                                            " Laboratoire de Chémoinformatique, Université de Strasbourg.\n\n"
                                            "Knowledge base (SMARTS library): Dmitriy M.Volochnyuk, Sergey V.Ryabukhin, Kostiantyn Gavrylenko, Olexandre Oksiuta\n"
                                            " Institute of Organic Chemistry, National Academy of Sciences of Ukraine\n"
                                            " Kyiv National Taras Shevchenko University\n"
                                            "2021 Strasbourg, Kiev",
                                     prog="SyntOn_BBScaffoldGeneration", formatter_class=argparse.RawTextHelpFormatter)
    # Both options are mandatory: main() concatenates args.output with file
    # endings and opens args.input, so a missing value used to surface as an
    # opaque TypeError instead of a usage message.
    parser.add_argument("-i", "--input", type=str, required=True, help="Input BBs file.")
    # The value is used as the leading part of the generated file names.
    parser.add_argument("-o", "--output", type=str, required=True, help="Output files prefix name.")
    args = parser.parse_args()
    main(args)