#!/usr/bin/env python3
# build-ontie-from-iedb.py
# Build a 'phony' ONTIE.kn file from the IEDB database so it can be compared
# with the original ONTIE.kn file.
import os, collections
import cx_Oracle
# Path of the generated file; main() opens it for writing.
out_file = 'test/phony-ontie.kn'
# Query for the protein rows handed to add_protein(): one row per IEDB source
# entry that has both an organism ID and an organism name.
proteins = """
SELECT iri,
source_id,
name,
aliases,
synonyms,
organism_id,
organism_name
FROM source
WHERE database = 'IEDB'
AND organism_id IS NOT NULL
AND organism_name IS NOT NULL
ORDER BY source_id
"""
# Query for the taxon rows handed to add_organism(): each organism joined to
# its parent (o2) so the parent's name is available in the same row.
# NOTE(review): the >= 10000000 filter presumably selects IEDB-defined taxa
# rather than NCBI ones -- confirm.
organisms = """
SELECT o1.iri,
o1.organism_id,
o1.organism_name AS label,
o1.rank,
o1.parent_tax_id,
o1.parent_tax_id_string,
o2.organism_name AS parent
FROM organism o1, organism o2
WHERE o1.parent_tax_id = o2.organism_id
AND o1.organism_id >= 10000000
ORDER BY o1.organism_id
"""
# Query for (tax_id, synonym) pairs in the same ID range; main() groups the
# results into alternative_terms.
synonyms = """
SELECT tax_id, name_txt
FROM names
WHERE name_class = 'synonym'
AND tax_id >= 10000000
ORDER BY tax_id
"""
# Query template for a single organism; get_superclasses() fills the {0}
# slot with str.format().
parent_name = """
SELECT organism_id, organism_name
FROM organism
WHERE organism_id = {0}
"""
# IRIs starting with `base` are shortened to CURIEs by replacing it with
# `prefix`.
base = "https://ontology.iedb.org/ontology/ONTIE_"
prefix = "ONTIE:"
# Format for the placeholder CURIE given to rows that have no IRI.
fake_prefix = "TEMP:%d"
# A dictionary from tax_id to a list of synonyms.
alternative_terms = collections.defaultdict(list)
# Maps each CURIE to the text of its stanza; main() writes the values out in
# sorted key order.
ontie_map = {}
# Next number to use for a placeholder CURIE; incremented on each use.
fake_id = 9000000
def main():
    """Build the 'phony' ONTIE.kn file from the IEDB database.

    Connects to the Oracle database, collects taxon synonyms, IEDB taxa and
    IEDB source proteins into the module-level ``alternative_terms`` and
    ``ontie_map`` structures, then writes every stanza to ``out_file`` in
    CURIE order so the result can be compared to the original ONTIE.kn file.
    """
    # NOTE(review): the credentials are hard-coded, and the literal looks
    # mangled by e-mail obfuscation in this copy of the file; confirm the
    # real value and consider loading it from outside the source tree.
    connect_string = "newdb/[email protected]:1521/iedbprod"
    # Announce the target before connecting, and show only the part after
    # the last '@' so the password is not echoed to stdout.
    print("Connecting: {}".format(connect_string.rpartition("@")[2]))
    conn = cx_Oracle.connect(connect_string, encoding="UTF-8", nencoding="UTF-8")
    try:
        cur = conn.cursor()
        # add_organism() runs extra look-ups while `cur` is still being
        # iterated, so it gets a cursor of its own.
        second_cur = conn.cursor()
        try:
            # Collect alternative terms
            cur.execute(synonyms)
            for tax_id, name in cur:
                alternative_terms[tax_id].append(name.strip())

            # Collect organisms
            cur.execute(organisms)
            for row in cur:
                add_organism(row, second_cur)

            # Collect proteins
            cur.execute(proteins)
            for row in cur:
                add_protein(row)
        finally:
            # Release the cursors before the connection that owns them.
            second_cur.close()
            cur.close()
    finally:
        conn.close()

    # Build ONTIE in CURIE order. The database session is UTF-8, so write the
    # file as UTF-8 too instead of relying on the platform default encoding.
    with open(out_file, 'w', encoding='utf-8') as f:
        f.writelines(ontie_map[key] for key in sorted(ontie_map))
def add_organism(row, cur):
    """Build the stanza for one IEDB taxon and store it in ``ontie_map``.

    Args:
        row: One result row of the ``organisms`` query, in column order:
            iri, organism_id, label, rank, parent_tax_id,
            parent_tax_id_string, parent name.
        cur: A cursor used to look up the names of additional parents; it is
            passed straight to get_superclasses().

    Side effects:
        Sets ``ontie_map[curie]`` to the stanza text. When the row has no
        IRI, a placeholder CURIE is built from ``fake_id``, which is then
        incremented.
    """
    global fake_id
    (iri, tax_id, label, rank, _parent_tax_id, parent_tax_id_string, parent) = row
    if iri is not None:
        curie = iri.replace(base, prefix)
    else:
        curie = fake_prefix % fake_id
        fake_id += 1

    label = clean_name(label)
    parent = clean_name(parent)
    # Clean the rank first: a whitespace-only rank becomes '' and is then
    # skipped by the truthiness check further down.
    if rank:
        rank = clean_name(rank)

    lines = [
        ': %s\n' % curie,
        'apply template: taxon class\n',
        ' label: %s\n' % label,
        ' parent taxon: %s\n' % parent,
    ]
    # A NULL parent_tax_id_string arrives as None; guard before the substring
    # test so such rows do not raise TypeError. A comma means the taxon has
    # more than one parent.
    if parent_tax_id_string and ',' in parent_tax_id_string:
        for sc in get_superclasses(parent_tax_id_string, parent, cur):
            lines.append('subclass of: %s\n' % sc)
    # .get() avoids inserting an empty list for taxa that have no synonyms.
    for alternative_term in alternative_terms.get(tax_id, ()):
        lines.append('alternative term: %s\n' % alternative_term)
    if rank:
        lines.append('rank: %s\n' % rank)
    lines.append('\n')
    ontie_map[curie] = ''.join(lines)
def add_protein(row):
    """Build the stanza for one IEDB source protein and store it in ``ontie_map``.

    Args:
        row: One result row of the ``proteins`` query, in column order:
            iri, source_id, name, aliases, synonyms, organism_id,
            organism name.

    Side effects:
        Sets ``ontie_map[curie]`` to the stanza text. When the row has no
        IRI, a placeholder CURIE is built from ``fake_id``, which is then
        incremented.
    """
    global fake_id
    (iri, _source_id, name, aliases, synonym_value, _organism_id, organism) = row
    if iri is not None:
        curie = iri.replace(base, prefix)
    else:
        curie = fake_prefix % fake_id
        fake_id += 1

    name = clean_name(name)
    organism = clean_name(organism)
    # NOTE(review): an earlier version also computed '<name> (<organism>)'
    # as a label but never used it; the stanza label is the bare name.
    # Confirm which form is intended.

    alias_text = aliases or ''
    if not synonym_value:
        synonym_text = ''
    elif hasattr(synonym_value, 'read'):
        # The value exposes read() (presumably a database LOB -- confirm);
        # pull its full text.
        synonym_text = synonym_value.read()
    else:
        # Already a plain string.
        synonym_text = synonym_value

    # Both columns hold ', '-separated lists; drop blank entries.
    candidates = alias_text.split(', ') + synonym_text.split(', ')
    terms = [term.strip() for term in candidates if term.strip() != '']

    lines = [
        ': %s\n' % curie,
        'apply template: protein class\n',
        ' label: %s\n' % name,
        ' taxon: %s\n' % organism,
    ]
    lines.extend('alternative term: %s\n' % term for term in terms)
    lines.append('\n')
    ontie_map[curie] = ''.join(lines)
def clean_name(name):
    """Trim surrounding whitespace from name and turn each remaining tab into a space."""
    trimmed = name.strip()
    # Splitting on tabs and re-joining with spaces swaps every tab one-for-one.
    return ' '.join(trimmed.split('\t'))
def get_superclasses(parent_tax_id_string, parent, cur):
    """Return the names of the listed parent organisms, except ``parent``.

    Args:
        parent_tax_id_string: Comma-separated organism IDs.
        parent: The already-known parent name; it is left out of the result.
        cur: A cursor used to run one look-up query per ID.

    Returns:
        A list of cleaned organism names, in the order the IDs are listed.

    Raises:
        ValueError: If a non-blank entry in ``parent_tax_id_string`` is not
            an integer.
    """
    # Fill the template's single slot with a bind placeholder instead of the
    # literal ID, so the same statement text is executed for every ID and no
    # value is spliced into the SQL.
    query = parent_name.format(':tax_id')
    superclasses = []
    for token in parent_tax_id_string.split(','):
        token = token.strip()
        if not token:
            # A stray or trailing comma yields an empty entry; skip it rather
            # than run a malformed query.
            continue
        cur.execute(query, {'tax_id': int(token)})
        for _organism_id, organism_name in cur:
            organism_name = clean_name(organism_name)
            if organism_name != parent:
                superclasses.append(organism_name)
    return superclasses
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()