-
Notifications
You must be signed in to change notification settings - Fork 0
/
conv_mondo_obo2tsv.py
59 lines (45 loc) · 2.18 KB
/
conv_mondo_obo2tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from nltk.tokenize import word_tokenize
import sys
import re
import csv
from pronto import Ontology
# Synonym types whose synonyms are left out of the result.  The pattern is
# searched for anywhere in the string form of the synonym's type, so a single
# compiled alternation replaces one search per keyword.
_EXCLUDED_SYNONYM_TYPES = re.compile(
    r"ABBREVIATION|EXCLUDE|DEPRECATED|AMBIGUOUS|DUBIOUS|MISSPELLING"
)


def load_mondo_obo(filename):
    """Collect labels and usable synonyms of the non-obsolete ontology terms.

    Args:
        filename: Location of the ontology; passed unchanged to ``Ontology``.

    Returns:
        A ``(dict_mondo_label, dict_mondo_synonym)`` tuple of plain dicts.
        ``dict_mondo_label`` maps every non-obsolete term ID to a list
        holding the term's name (the list is empty when the term has no
        name).  ``dict_mondo_synonym`` maps a term ID to the list of its
        synonym descriptions, and only contains IDs that have at least one
        synonym whose type is not excluded.
    """
    dict_mondo_label = {}
    dict_mondo_synonym = {}

    for term in Ontology(filename).terms():
        # exclude obsolete classes
        if term.obsolete:
            continue

        id_mondo = term.id
        labels = dict_mondo_label.setdefault(id_mondo, [])
        # A term without a name would put None into the list and break the
        # string handling done by consumers; keep the ID, record no label.
        if term.name is not None:
            labels.append(term.name)

        # exclude the synonyms whose SynonymType is "ABBREVIATION", "EXCLUDE",
        # "DEPRECATED", "AMBIGUOUS", "DUBIOUS" or "MISSPELLING"
        for synonym in term.synonyms:
            if _EXCLUDED_SYNONYM_TYPES.search(str(synonym.type)):
                continue
            dict_mondo_synonym.setdefault(id_mondo, []).append(synonym.description)

    return dict_mondo_label, dict_mondo_synonym
def _print_rows(id_mondo, kind, names):
    """Print one tab-separated row per distinct name of a term.

    Each row is ``<id>\\t<kind>\\t<name>\\t<sorted tokens joined by spaces>``.
    Names are de-duplicated and emitted in sorted order so that repeated runs
    on the same input produce identical output (iterating a bare ``set`` of
    strings can change order between interpreter runs).
    """
    for name in sorted({n for n in names if n is not None}):
        # Commas are removed before tokenizing so they never become tokens.
        tokens = sorted(word_tokenize(name.replace(',', '')))
        print("\t".join([id_mondo, kind, name, " ".join(tokens)]))


def main():
    """Convert the ontology file named on the command line to TSV on stdout.

    For every term ID (in sorted order) the label rows are printed first,
    followed by the synonym rows when the term has any.

    Raises:
        SystemExit: If no input file was given on the command line.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: conv_mondo_obo2tsv.py <mondo.obo>")

    dict_mondo_label, dict_mondo_synonym = load_mondo_obo(sys.argv[1])

    for id_mondo in sorted(dict_mondo_label):
        _print_rows(id_mondo, "label", dict_mondo_label[id_mondo])
        if id_mondo in dict_mondo_synonym:
            _print_rows(id_mondo, "synonym", dict_mondo_synonym[id_mondo])
# Run the conversion only when executed as a script, not when imported.
if "__main__" == __name__:
    main()