-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathClusterFinder.py
106 lines (76 loc) · 3.68 KB
/
ClusterFinder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
##############################################################################
# ClusterFinder - main script #
# Peter Cimermancic #
##############################################################################
PATH = './scr'
import sys
sys.path.append(PATH)
from Predict import *
from FindClusters import *
import re
organism_name = 'example_org'
input_file = 'example_input.txt'
# --- read in the HMM model data
PATH_TO_FREQUENCIES = './freqdata/' # link to where the frequencies are
PATH_TO_DATA = './data/'
out4 = open(PATH_TO_FREQUENCIES + 'TP_arr_A_latest.pkl','rb')
out5 = open(PATH_TO_FREQUENCIES + 'NewTS_all_B_reduced_6filter.pkl','rb')
out6 = open(PATH_TO_FREQUENCIES + 'SP_arr.pkl','rb')
out7 = open(PATH_TO_FREQUENCIES + 'NewTS_all_B_index.pkl','rb')
A=pickle.load(out4)
B=pickle.load(out5)
start_p=pickle.load(out6)
index_dict=pickle.load(out7)
out4.close()
out5.close()
out6.close()
out7.close()
# --- read in an input file
'''
Costumize according to your input.
'''
data = open(PATH_TO_DATA+input_file)
D = data.readlines()
data.close()
DATA = []
for d in D:
d = d.strip().split('\t')
gene,pfamID = d[5],d[13].replace('pfam','PF')
genstart,genestop = int(d[6]),int(d[7])
try: pfamstart,pfamstop = int(d[11]),int(d[12])
except ValueError: pfamstart,pfamstop = genstart,genestop
chain = d[3]
# - if gene has a Pfam annotation
if pfamID != 'n/a': DATA.append([ chain,gene,genstart,genestop,pfamstart,pfamstop,pfamID ])
# - else
else: DATA.append( [ chain,gene,genstart,genestop,pfamstart,pfamstop,'Q' ] )
# --- sort Pfams as they appear in the genome
PfamSorted = sorted(DATA,key = itemgetter(0,2,4))
# --- filter out the repeats (optional)
repeats = set(['PF07721', 'PF05593', 'PF07719', 'PF00515', 'PF00132', 'PF03130', 'PF01839', 'PF01816', 'PF07720', 'PF00400', 'PF05594', 'PF07661', 'PF02985', 'PF06049', 'PF08238', 'PF06696', 'PF00353', 'PF02412', 'PF00023', 'PF02071', 'PF03991', 'PF01469', 'PF07676', 'PF00514', 'PF00904', 'PF07634', 'PF02370', 'PF03335', 'PF01851', 'PF04728', 'PF06715', 'PF03373', 'PF04680', 'PF00805', 'PF04508', 'PF07918', 'PF01535', 'PF01011', 'PF05017', 'PF06671', 'PF00818', 'PF03406', 'PF00399', 'PF09373', 'PF01744', 'PF01436', 'PF01239', 'PF05906', 'PF03729', 'PF00404', 'PF04022', 'PF02363', 'PF02524', 'PF07981', 'PF02095', 'PF00414', 'PF00560', 'PF05001', 'PF02162', 'PF01473', 'PF05465', 'PF02493', 'PF03578', 'PF08043', 'PF06392', 'PF07142', 'PF08309', 'PF02184'])
PfamSorted = [i for i in PfamSorted if i[-1] not in repeats]
# - check that there are Pfam domain in the data
pfpatt = re.compile(r"""PF+\d+\d+\d+\d+\d""")
if len([re.match(pfpatt,p[-1]) for p in PfamSorted])==0:
print 'There is no Pfam domain in you dataset!'
exit()
# --- run the HMM
do_HMM([i[-1] for i in PfamSorted],A,B,index_dict,start_p,graph=False,report=True,outs=PfamSorted,\
method='HMM',path=PATH_TO_DATA,name=organism_name)
# --- filter for biosynthetic domains
data = open(PATH_TO_FREQUENCIES+'biosynthetic_pfams.txt','r')
BioPfams = set([i.strip().split('\t')[0] for i in data.readlines()])
data.close()
# --- run clustering method
Clusters, OrgName = get_clusters(PATH_TO_DATA+organism_name+'.out',X=2,TRSH=0.2)
CLUSTERS = [] # store the new clusters
for clstr in Clusters:
pfams = set([i[-4] for i in clstr])
if len(pfams - BioPfams)>=3: # - requires at least three biosynthetic domains
CLUSTERS.append(clstr)
# --- write the data (in ClusterFinder format)
output = open(PATH_TO_DATA+organism_name+'.clusters.out','w')
for x,clstr in enumerate(CLUSTERS):
for domain in clstr:
output.write('\t'.join([str(j) for j in [organism_name+'_'+str(x)]+domain])+'\n')
output.close()