-
Notifications
You must be signed in to change notification settings - Fork 0
/
grab.py
77 lines (51 loc) · 1.99 KB
/
grab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from data import Genome
import numpy as np
genome = Genome()
key = "CM004294.1"
nucleotide, annotation = genome.getContig(key, end_at=100000)
threshold = 1
minimum_basepairs = 0
clustering = list(genome.annotation_fields)
print(clustering)
def computeNucleotideScores (vector):
# [start, stop, % score]
contiguous_coordinates = list()
latest_contig = None
for i in range(len(vector)):
if vector[i] >= threshold:
# Count
# Add new entry
if latest_contig == None or i > (latest_contig + 1):
# Put to start of array
contiguous_coordinates = [[i + 1, i + 1, [vector[i]]]] + contiguous_coordinates
latest_contig = i
else:
contiguous_coordinates[0][1] = i + 1
contiguous_coordinates[0][2].append(vector[i])
latest_contig = i
# Normalise contiguous scores
qc_coordinates = list()
for contigs in contiguous_coordinates:
contigs[2] = sum(contigs[2])/(contigs[1] - contigs[0] + 1)
if (contigs[1] - contigs[0] + 1) >= minimum_basepairs:
qc_coordinates.append(contigs)
return qc_coordinates
annotation = np.swapaxes(annotation, 0, 1)
annotations = dict()
print(annotation)
for i in range(len(clustering)):
cluster = clustering[i]
annotations[cluster] = computeNucleotideScores(annotation[i])
all_annotations = dict()
all_annotations[key] = annotations
gff_annotations = list()
id_numeral = 0
for sequence in all_annotations.keys():
sequence_annotations = all_annotations[sequence]
for cluster in clustering:
annotations = sequence_annotations[cluster]
for annotation in annotations:
data = "\t".join([sequence, "Pneuma", cluster, str(annotation[0]), str(annotation[1]), str(annotation[2]), ".", ".", "ID={};".format(id_numeral)])
gff_annotations.append(data)
id_numeral += 1
open("test/ground_truth.gff", "w+").write("\n".join(gff_annotations))