-
Notifications
You must be signed in to change notification settings - Fork 2
/
problems.py
158 lines (127 loc) · 3.87 KB
/
problems.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
'''
The various parameter estimation problems are defined here, as well as
several convenience functions that can be used to build up new problems. All
problems are stored in the global constant DEFINITIONS.
'''
from collections import OrderedDict
from data import kb
import structure
import fitting
# Datatypes that _DEFAULT_RULES pairs with a weight of 0.
_DEFAULT_EXCLUDE_DATATYPE = (
    # Pseudo-fitting data; not desired by default
    'upper_reactant_saturation_limit',
    'upper_product_saturation_limit',

    # SEOF data proved to be very inaccurate and was ultimately excluded
    'standard_energy_of_formation',
)
# Baseline (rule, weight) pairs placed at the front of the problems built
# below from _DEFAULT_RULES: the excluded datatypes are paired with weight 0.
_DEFAULT_RULES = (
    (fitting.field_value_rule(datatype=_DEFAULT_EXCLUDE_DATATYPE), 0),
)
def accept_all(value = 1e0):
    '''
    Build a (rule, weight) pair whose rule matches every entry.

    value: the weight to pair with the catch-all rule (defaults to 1.0).

    Returns a 2-tuple of a predicate that returns True for any entry, and
    the given value.
    '''
    def matches_everything(entry):
        # The entry is deliberately ignored: nothing is ever rejected.
        return True

    return (matches_everything, value)
# Registry of the named problems; an OrderedDict so that they stay in the
# order in which they are defined below.
DEFINITIONS = OrderedDict()

# The 'data agnostic' problem weights all data evenly.
DEFINITIONS['data_agnostic'] = _DEFAULT_RULES + (accept_all(),)

# The 'no data' problem excludes all training data.
DEFINITIONS['no_data'] = _DEFAULT_RULES + (accept_all(0),)
# Presumably the datatype names of the kinetic parameters (TODO confirm).
# Nothing else in this module references this constant.
_KINETICS_TYPES = (
    'forward_catalytic_rate',
    'reverse_catalytic_rate',
    'substrate_saturation',
)
from collections import defaultdict
def gather_by_fields(dataset, *fields):
    '''
    Group the entries of a dataset by the values of one or more attributes.

    dataset: an iterable of hashable entries that expose every named
    attribute.
    fields: the attribute names whose values, taken in order, make up the
    grouping key.

    Returns a defaultdict mapping each tuple of attribute values to the set
    of entries carrying those values.
    '''
    groups = defaultdict(set)

    for entry in dataset:
        key = tuple(getattr(entry, name) for name in fields)
        groups[key].add(entry)

    return groups
def gather_by_field(dataset, field):
    '''
    Group the entries of a dataset by the value of a single attribute.

    dataset: an iterable of hashable entries that expose the attribute.
    field: the name of the attribute to group on.

    Returns a plain dict mapping each distinct attribute value to the set of
    entries carrying that value.
    '''
    # gather_by_fields keys its groups on 1-tuples here; unwrap them so the
    # caller sees the bare attribute value.  .items() is used instead of the
    # Python-2-only .viewitems() so this runs unchanged on Python 2 and 3.
    return {
        key[0]: entries
        for key, entries in gather_by_fields(dataset, field).items()
    }
from itertools import izip
def normalize_by_number_of_observations(dataset, *fields):
    '''
    Creates rule-weight pairs that normalize the weighting on the data by the
    number of observations. This was found to give better fits, since data
    were often consistent and errors were usually the consequence of a
    systemic change rather than experimental error. E.g. chemical equilibrium
    constants were found to not be totally consistent for the cell interior,
    despite having many (sometimes 10+) observations on each constant.

    dataset: an iterable of hashable entries that expose every named
    attribute.
    fields: the attribute names used to group the entries.

    Returns a tuple with one (rule, weight) pair per distinct combination of
    field values found in the dataset.  The rule is built by
    fitting.field_value_rule from that combination, and the weight is
    1/(number of entries in the group).  Groups are sets, so entries that
    compare equal are counted once.  An empty dataset yields an empty tuple.
    '''
    rules = []

    # zip/.items() replace the Python-2-only izip/.viewitems(); both pairs
    # behave identically here and also run on Python 3.
    for values, entries in gather_by_fields(dataset, *fields).items():
        # Each field is constrained to the single value shared by the group.
        selector = {
            field: (value,)
            for field, value in zip(fields, values)
        }
        rules.append((
            fitting.field_value_rule(**selector),
            1.0/len(entries)
        ))

    return tuple(rules)
# The 'all scaled' ruleset is the default.
DEFINITIONS['all_scaled'] = (
    _DEFAULT_RULES
    + normalize_by_number_of_observations(
        kb.concentration, 'datatype', 'compound')
    + normalize_by_number_of_observations(
        kb.equilibrium, 'datatype', 'reaction')
    + normalize_by_number_of_observations(
        kb.forward_catalytic_rate, 'datatype', 'reaction')
    + normalize_by_number_of_observations(
        kb.reverse_catalytic_rate, 'datatype', 'reaction')
    + normalize_by_number_of_observations(
        kb.reactant_saturation, 'datatype', 'reaction', 'compound')
    + normalize_by_number_of_observations(
        kb.product_saturation, 'datatype', 'reaction', 'compound')
    + (
        # relative data hard to scale by # of obs; instead scaling by total
        # # of data sets (3)
        (
            fitting.field_value_rule(datatype=('relative_protein_count',)),
            1.0/3,  # currently using three sets of proteomics data
        ),
    )
)
# Variants on the 'all scaled' data set that include the saturation penalty,
# with increasing weight.
def _all_scaled_with_saturation_penalty(weight):
    '''
    Return the 'all_scaled' rules preceded by a single (rule, weight) pair
    whose rule is fitting.field_value_rule(source =
    ('custom_saturation_limits',)) and whose weight is the given value.
    '''
    penalty = (
        fitting.field_value_rule(source=('custom_saturation_limits',)),
        weight,
    )
    # NOTE(review): the penalty pair is placed ahead of the default
    # exclusions; presumably rule order matters to the fitter - confirm.
    return (penalty,) + DEFINITIONS['all_scaled']


DEFINITIONS['all_scaled_upper_sat_limits_1e-1'] = (
    _all_scaled_with_saturation_penalty(1e-1)
)
DEFINITIONS['all_scaled_upper_sat_limits_1e0'] = (
    _all_scaled_with_saturation_penalty(1e0)
)
DEFINITIONS['all_scaled_upper_sat_limits_1e1'] = (
    _all_scaled_with_saturation_penalty(1e1)
)
DEFINITIONS['all_scaled_upper_sat_limits_1e2'] = (
    _all_scaled_with_saturation_penalty(1e2)
)
if __name__ == '__main__':
    # This guard used to call test(), but no such function is defined or
    # imported in this module, so running the file as a script raised
    # NameError.  Print a short summary of the registered problems instead.
    for _name, _rules in DEFINITIONS.items():
        print('%s: %d rules' % (_name, len(_rules)))