-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbaseline_naive.py
executable file
·120 lines (85 loc) · 4.01 KB
/
baseline_naive.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import logging
import numpy as np
import copy
import argparse
from pathlib import Path
# project-local imports
from caid import set_logger
from vectorized_metrics import bvaluation
from baseline_random import get_reference
def tokenize(seq, n=7):
    """
    Tokenize a sequence in a series of adjacent, overlapping windows.

    Yield one window ("blob") per element of ``seq``. Each window has size
    ``n * 2 + 1`` with the actual element in the middle. Close to both ends,
    where there are not enough neighbours, the sequence is padded with its
    own mirror image (the terminal element itself is not repeated).

    If ``n`` is not smaller than ``len(seq)`` it is reduced to
    ``len(seq) - 1``, the widest half-window the mirroring can fill.
    An empty ``seq`` yields nothing.

    :param seq: sequence to tokenize; it must support ``len``, slicing and
        concatenation with ``+`` (e.g. list, tuple, str)
    :param n: n * 2 + 1 is the size of the window
    :type n: int
    :return: an iterator of blobs, each one a slice of the padded sequence
    :rtype: iterator
    :raises ValueError: on first iteration, if ``n`` is negative
    """
    if n < 0:
        raise ValueError('n must be non-negative, got {}'.format(n))

    lstates = len(seq)
    if lstates == 0:
        # Nothing to window; without this guard n would become -1 and the
        # loop below would emit two spurious empty blobs.
        return

    n = min(n, lstates - 1)
    # Mirror n neighbours around each end. Slicing already produces new
    # objects, so the input is never modified.
    padded = seq[1:n + 1][::-1] + seq + seq[-n - 1:-1][::-1]
    for pos in range(n, n + lstates):
        yield padded[pos - n:pos + n + 1]
def get_scores_movingwindow(sequence):
    """
    Smooth a numeric sequence with a mirrored moving-window average.

    Every element is replaced by the mean of the window that ``tokenize``
    (with its default half-width) builds around it.

    :param sequence: numeric values to smooth; must be accepted by
        ``tokenize`` (e.g. a list of 0/1 states)
    :return: one averaged score per element of ``sequence``
    :rtype: numpy.ndarray
    """
    if len(sequence) == 0:
        # No windows can be built; averaging along axis 1 of an empty array
        # would fail or produce NaNs, so return an empty score vector.
        return np.array([], dtype=float)
    windows = list(tokenize(sequence))
    return np.array(windows).mean(axis=1)
def save_prediction(ref_file, acc, seq, scores, states):
    """
    Write the prediction of one target to an already opened file.

    The record consists of a ``>acc`` header line followed by one
    tab-separated line per residue: 1-based position, residue, score, state.

    :param ref_file: writable text file-like object (anything with ``write``)
    :param acc: identifier written in the header line
    :param seq: residues of the target
    :param scores: one score per residue, or None to leave the column empty;
        floating-point values (Python or NumPy) are written with three
        decimals, any other value is written as an empty field
    :param states: one state per residue, or None to leave the column empty
    :raises AssertionError: if seq, scores and states differ in length
    """
    lseq = len(seq)
    if scores is None:
        scores = [''] * lseq
    if states is None:
        states = [''] * lseq

    if not lseq == len(scores) == len(states):
        # Raised explicitly instead of using ``assert`` so the check is not
        # stripped under ``python -O`` (zip below would silently truncate).
        # The exception type is kept for existing callers.
        raise AssertionError('sequence, states and scores must have the same len')

    ref_file.write('>{}\n'.format(acc))
    for i, (aa, sc, st) in enumerate(zip(seq, scores, states), 1):
        # np.floating also covers NumPy scalars such as float32, which do
        # not subclass the builtin float and would otherwise be dropped.
        score = '{:.3f}'.format(sc) if isinstance(sc, (float, np.floating)) else ''
        ref_file.write('{}\t{}\t{}\t{}\n'.format(i, aa, score, st))
def naive_prediction(loaded_ref, outbase, invert):
    """
    Write a naive prediction file derived from the states of a reference.

    For every group of ``loaded_ref`` (grouped on index level 0) the states
    are binarised and saved through ``save_prediction`` with an empty score
    column. No moving-window scores are computed here.

    :param loaded_ref: object supporting ``groupby(level=0)`` whose groups
        expose ``['ref']['states']`` and ``['ref']['seq']``; presumably the
        pandas DataFrame returned by ``get_reference`` -- verify with caller
    :param outbase: output path without extension; ``.txt`` is appended
    :param invert: when exactly ``True``, states equal to 0 become 1 and all
        others become 0; otherwise 0 stays 0 and all others become 1
    :return: name of the written file
    :rtype: str
    """
    logging.info('building naive baseline from reverse input reference')
    fname = '{}.txt'.format(outbase)

    # Decide once what a 0 state and a non-0 state are mapped to.
    if invert is True:
        for_zero, for_other = 1, 0
    else:
        for_zero, for_other = 0, 1

    with open(fname, 'w') as fhandle:
        for acc, data in loaded_ref.groupby(level=0):
            target = data['ref']
            states = [for_zero if r == 0 else for_other for r in target['states']]
            # Score column is intentionally left blank for the naive baseline.
            blank_scores = [''] * len(states)
            save_prediction(fhandle, acc, target['seq'], blank_scores, states)
    return fname
def build_args(options, predfiles, output_basename):
    """
    Assemble a command-line style argument list for an assessment run.

    :param options: object exposing ``reference``, ``log``, ``logLevel`` and
        ``replaceUndefined`` attributes
    :param predfiles: list of prediction file names, placed right after the
        reference
    :type predfiles: list
    :param output_basename: value passed to the ``-o`` flag
    :return: reference, prediction files, then the flag/value pairs
    :rtype: list
    """
    # A new list is built, so the caller's ``predfiles`` is left untouched.
    optns = [options.reference] + predfiles
    optns += ['-l', options.log]
    optns += ['-ll', options.logLevel]
    optns += ['-o', output_basename]
    optns += ['--suffix', 'cons']

    replacement = options.replaceUndefined
    if replacement is not None:
        # Only forwarded when a replacement value was actually supplied.
        optns += ['-r', replacement]
    return optns
def parse_args():
    """
    Parse the command line of the naive-baseline script.

    :return: namespace with ``reference``, ``pRef``, ``invert``, ``outdir``,
        ``log`` and ``logLevel``
    :rtype: argparse.Namespace
    """
    cli = argparse.ArgumentParser(
        prog='caid-assess',
        description="CAID: Critical Assessment of Intrinsic Disorder",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Positional: the reference the predictions are evaluated against.
    cli.add_argument(
        'reference',
        help='reference file to which predictions are to be compared',
    )
    # Mandatory option: the reference the naive prediction is built from.
    cli.add_argument(
        "--pRef",
        help='reference from which to derive naif predictions',
        required=True,
    )
    cli.add_argument('-i', '--invert', default=False, action='store_true')
    cli.add_argument('-o', '--outdir', default='.')

    # Logging controls.
    cli.add_argument('-l', '--log', type=str, default=None, help='log file')
    cli.add_argument(
        "-ll", "--logLevel",
        default="ERROR",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help='log level filter. All levels <= choice will be displayed',
    )

    return cli.parse_args()
if __name__ == '__main__':
    # Script entry point: build a naive prediction from --pRef and assess it
    # against the positional reference.
    args = parse_args()
    set_logger(args.log, args.logLevel)

    # The loaded reference itself is not used below, only its name.
    ref, refname = get_reference(args.reference)
    pred, predname = get_reference(args.pRef)

    # A prediction derived from the very same reference is tagged "self".
    if refname == predname:
        predname = "self"

    out_prefix = Path(args.outdir) / predname
    pred_fname = naive_prediction(pred, out_prefix, args.invert)

    run_tag = "naive-{}".format(predname)
    bvaluation(
        reference=args.reference,
        predictions=[pred_fname],
        outpath=args.outdir,
        run_tag=run_tag,
        dataset=True,
        target=True,
        bootstrap=True,
    )