-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse.py
167 lines (118 loc) · 4.56 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
from modgrammar import *
import tax
def iter_flatten(iterable):
    """Lazily yield the leaves of arbitrarily nested lists and tuples.

    Only ``list`` and ``tuple`` instances are descended into; every other
    item (strings included) is yielded as-is, in depth-first order.
    """
    for item in iterable:
        if not isinstance(item, (list, tuple)):
            yield item
            continue
        # nested container: splice its flattened contents in place
        yield from iter_flatten(item)
# a taxon name is upper and lower case letters plus space
# to allow for common names / binomials
class Name(Grammar):
    """A taxon or tree name: a word built from ASCII letters and spaces."""

    # the space is part of the character set so that common names and
    # binomials can be written as a single name
    grammar = WORD("A-Za-z ")
# a taxon name can be extended with one of the following suffixes:
# a relationship keyword (children, parent, siblings) or a rank name
class TaxonSuffix(Grammar):
    """Keyword allowed after the colon that follows a taxon name.

    ``children``, ``parent`` and ``siblings`` get dedicated handling in
    ``expand_taxon``; the remaining keywords are passed through to
    ``tax.get_named_children`` unchanged.
    """

    grammar = (
        L("children")
        | L("parent")
        | L("siblings")
        | L("species")
        | L("genus")
        | L("order")
        | L("family")
        | L("class")
        | L("phylum")
    )
class TaxonSuffixQuantifier(Grammar):
    """The run of decimal digits written between ``{`` and ``}`` after a suffix."""

    grammar = WORD("0123456789")
# not used
class TaxonPrefix(Grammar):
    """The literal keyword ``sister``; no other grammar in this file refers to it."""

    grammar = L("sister")
class Exclude(Grammar):
    """A leading ``-``; ``parse_rec`` treats a taxon carrying it as an exclusion."""

    grammar = L("-")
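# a full taxon name is an optional leading '-' (exclude), then a normal
# taxon name followed, optionally, by a colon then the extension, which
# may itself be followed by a quantifier in braces, e.g. children{2}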
class TaxonFull(Grammar):
    """One taxon entry: ``[-]Name[:suffix[{digits}]]``.

    Tagged ``"list element"`` so that it can be located with
    ``find_tag_all("list element")``.
    """

    grammar = (
        OPTIONAL(Exclude),
        Name,
        OPTIONAL(
            ":",
            TaxonSuffix,
            OPTIONAL("{", TaxonSuffixQuantifier, "}"),
        ),
    )
    grammar_tags = ("list element",)
# a TaxonList starts and ends with brackets
# inside the brackets is a list separated by commas
# each element of the list is either a full taxon name or another list
class TaxonList(Grammar):
    """A parenthesised, comma-separated list of taxa and/or nested lists.

    The nested case refers back to this class by name through ``REF``
    because the class does not exist yet while its own body is evaluated.
    Tagged ``"list element"`` just like ``TaxonFull``.
    """

    grammar = (
        "(",
        LIST_OF(OR(TaxonFull, REF("TaxonList")), sep=","),
        ")",
    )
    grammar_tags = ("list element",)
# a tree contains a top-level taxon list
class Tree(Grammar):
    """A named tree: ``Name`` ``:`` top-level ``TaxonList``."""

    grammar = (Name, ":", TaxonList)
class TreeList(Grammar):
    """Several ``Tree`` entries separated by ``;``.

    ``parse_trees`` in this file does not use it; it parses every input
    string with ``Tree`` directly.
    """

    grammar = LIST_OF(Tree, sep=";")
def expand_taxon(taxon):
    """Expand one parsed ``TaxonFull`` element into the taxa it stands for.

    Without a suffix the result is simply ``[name]``.  With a suffix the
    lookup is delegated to the ``tax`` module:

    * ``children`` -> ``tax.get_children_multiple(name, n)``
    * ``parent``   -> ``[tax.get_parent_multiple(name, n)]`` (wrapped in a list)
    * ``siblings`` -> ``tax.get_siblings_multiple(name, n)``
    * anything else (the rank keywords) ->
      ``tax.get_named_children(name, suffix)``; the quantifier is not
      passed on this path.

    ``n`` is the integer written inside ``{...}``, or 1 when no quantifier
    was given.  Whatever the ``tax`` helpers return is handed back as-is
    (presumably lists of names -- confirm against ``tax``).
    """
    taxon_name = taxon.find(Name).string
    suffix = taxon.find(TaxonSuffix)

    # a missing element is reported as None; test identity rather than
    # equality so the check cannot be affected by an overloaded __eq__
    if suffix is None:
        return [taxon_name]

    # turn the optional quantifier into an int, defaulting to 1
    quantifier = taxon.find(TaxonSuffixQuantifier)
    count = 1 if quantifier is None else int(quantifier.string)

    keyword = suffix.string
    if keyword == 'children':
        return tax.get_children_multiple(taxon_name, count)
    if keyword == 'parent':
        return [tax.get_parent_multiple(taxon_name, count)]
    if keyword == 'siblings':
        return tax.get_siblings_multiple(taxon_name, count)

    # remaining suffixes are rank names; the diagnostic print is kept
    # because it is part of the function's observable output
    print(taxon_name, keyword)
    return tax.get_named_children(taxon_name, keyword)
def list_subtrees(tree, level = 0):
    """Debug helper: print an indented outline of the tagged elements of *tree*.

    Every element returned by ``tree.find_tag_all("list element")`` is
    printed via ``repr`` with ``level`` leading spaces, then outlined
    itself one level deeper.  Does nothing when *tree* is ``None``.
    Always returns ``None``.
    """
    # identity test: an object with an overloaded __eq__ must not be
    # mistaken for "no tree"
    if tree is None:
        return
    for subtree in tree.find_tag_all("list element"):
        print(' ' * level + repr(subtree))
        list_subtrees(subtree, level + 1)
def parse_rec(tree, level = 0):
    """Convert a parsed taxon list into a nested Python list.

    Each ``"list element"`` found under *tree* is handled by type:

    * a ``TaxonFull`` is expanded with ``expand_taxon``; the expansion is
      added to the result, or to the exclusion list when the element
      contains an ``Exclude`` marker;
    * a ``TaxonList`` is converted recursively and appended as one nested
      list.

    Exclusions are applied only after every element has been collected, so
    their position in the list does not matter.  Each excluded entry
    removes at most one (the first) equal entry from this level's result.
    ``level`` is accepted but not used.
    """
    kept = []
    excluded = []
    for element in tree.find_tag_all("list element"):
        if isinstance(element, TaxonFull):
            # pick the destination first, then expand into it
            bucket = excluded if element.find(Exclude) else kept
            bucket.extend(expand_taxon(element))
        if isinstance(element, TaxonList):
            kept.append(parse_rec(element))

    for unwanted in excluded:
        if unwanted in kept:
            kept.remove(unwanted)
    return kept
def parse_trees(input_trees):
    """Parse tree strings into a ``{tree name: nested taxon list}`` dict.

    *input_trees* is iterated; every item is parsed with a fresh ``Tree``
    parser.  The tree's ``Name`` becomes the key and its top-level
    ``TaxonList``, converted by ``parse_rec``, the value.  A later tree
    with the same name replaces an earlier one.  The input is echoed to
    stdout before parsing starts.
    """
    print('about to parse ' , input_trees)
    trees_by_name = {}
    for source in input_trees:
        parsed = Tree.parser().parse_string(source)
        name = parsed.find(Name).string
        trees_by_name[name] = parse_rec(parsed.find(TaxonList))
    return trees_by_name
# playing with negation and multi-childrening
# Coleoptera and all siblings
# parse_trees('my tree:(Coleoptera, Coleoptera:siblings);')
# Coleoptera and all siblings excluding diptera
# parse_trees('my tree:(Coleoptera, Coleoptera:siblings, -Diptera);')
# Coleoptera is closer to Diptera than to any other sibling
# parse_trees('my tree:((Coleoptera, Diptera), Coleoptera:siblings, -Diptera);')
# compare two hypotheses; coleoptera+diptera vs coleoptera + hymenoptera
# parse_trees([
# 'hypa:((Coleoptera, Diptera), Coleoptera:siblings, -Diptera)',
# 'hypb:((Coleoptera, Hymenoptera), Coleoptera:siblings, -Hymenoptera)'
# ])
# parse_trees('my tree:(Mandibulata:children{1});')
# parse_trees('my tree:(Pancrustacea:children{1});')
# parse_trees('my tree:(Mandibulata:children{2});')
# parse_trees(['my tree:((Mandibulata:children{2}), Crustacea)'])
# parse_trees(['my tree:(Mandibulata:children{2}, -Mandibulata:children{2})'])
# parse_trees(['my tree:(Mandibulata:parent{1}, Mandibulata:parent{2}, Mandibulata:parent{3})'])
#parse_trees(['my tree:(Mandibulata:siblings{2})'])
#parse_trees(['my tree:(Arthropoda:class)'])
# TODO last common ancestors e.g. Diptera^Coleoptera