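"""Evaluate submissions to the ViEWS Prediction Challenge.

For every submission, target ("pgm" or "cm") and test window, the script scores the
submitted samples against the actuals using CRPS, IGN (Ignorance Score) and MIS, and
stores the results as .parquet files under {submission}/eval/{target}/window={window}/.

Example usage: python evaluate_submissions.py -s ./submissions -a ./actuals -e 100
"""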
from pathlib import Path
from CompetitionEvaluation import structure_data, calculate_metrics
from utilities import list_submissions, get_target_data, TargetType
import os
import xarray
import numpy as np
import numpy.typing as npt
from scipy.signal import resample
import argparse
import pandas as pd
import pyarrow.compute
import logging
logging.getLogger(__name__)
logging.basicConfig(
filename="evaluate_submission.log", encoding="utf-8", level=logging.INFO
)


def evaluate_forecast(
forecast: pd.DataFrame,
actuals: pd.DataFrame,
target: TargetType,
expected_samples: int,
save_to: str | os.PathLike,
draw_column: str = "draw",
data_column: str = "outcome",
bins: list[float] = [
0,
0.5,
2.5,
5.5,
10.5,
25.5,
50.5,
100.5,
250.5,
500.5,
1000.5,
],
) -> None:
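    """Score one forecast against the actuals with CRPS, IGN and MIS per month and
    unit (no aggregation) and write the results as .parquet files under save_to."""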
if target == "pgm":
unit = "priogrid_gid"
elif target == "cm":
unit = "country_id"
else:
raise ValueError(f'Target {target} must be either "pgm" or "cm".')
# Cast to xarray
observed, predictions = structure_data(
actuals, forecast, draw_column_name=draw_column, data_column_name=data_column
)
if bool((predictions["outcome"] > 10e9).any()):
        logging.warning(
            "Found predictions larger than the population of the earth. These are censored at 10 billion."
        )
predictions["outcome"] = xarray.where(
predictions["outcome"] > 10e9, 10e9, predictions["outcome"]
)
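    # CRPS and MIS are computed per month and unit directly from the submitted samples.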
crps = calculate_metrics(
observed, predictions, metric="crps", aggregate_over="nothing"
)
mis = calculate_metrics(
observed,
predictions,
metric="mis",
prediction_interval_level=0.9,
aggregate_over="nothing",
)
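    # IGN is only comparable across models with the same number of samples, so the
    # draws are resampled to expected_samples before binning whenever the counts differ.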
    if predictions.dims["member"] != expected_samples:
        logging.warning(
            f'Number of samples ({predictions.dims["member"]}) is not {expected_samples}. '
            f"Using scipy.signal.resample to get {expected_samples} samples when calculating the Ignorance Score."
        )
np.random.seed(284975)
arr: npt.ArrayLike = resample(predictions.to_array(), expected_samples, axis=3)
arr = np.where(
arr < 0, 0, arr
) # For the time when resampling happens to go below zero.
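        # Rebuild an xarray Dataset around the resampled array, reusing the original
        # coordinates but with expected_samples members.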
new_container = predictions.sel(member=1)
new_container = (
new_container.expand_dims({"member": range(0, expected_samples)})
.to_array()
.transpose("variable", "month_id", unit, "member")
)
predictions: xarray.Dataset = xarray.DataArray(
data=arr, coords=new_container.coords
).to_dataset(dim="variable")
if bool((predictions["outcome"] < 0).any()):
        logging.warning(
            "Found negative predictions. These are censored at 0 before calculating the Ignorance Score."
        )
predictions["outcome"] = xarray.where(
predictions["outcome"] < 0, 0, predictions["outcome"]
)
ign = calculate_metrics(
observed, predictions, metric="ign", bins=bins, aggregate_over="nothing"
)
    # Save data in .parquet long-format (month_id, unit_id, metric, value)
    save_to = Path(save_to)
    dfs = {"crps": crps, "ign": ign, "mis": mis}
for metric in ["crps", "ign", "mis"]:
dfs[metric].rename(columns={metric: "value"}, inplace=True)
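        # metric=<name> subfolders follow the Hive partitioning convention, so the
        # whole eval folder can be read back as a single partitioned dataset.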
metric_dir = save_to / f"metric={metric}"
metric_dir.mkdir(exist_ok=True, parents=True)
dfs[metric].to_parquet(metric_dir / f"{metric}.parquet")


def match_forecast_with_actuals(
submission, actuals_folder, target: TargetType, window: str
) -> tuple[pd.DataFrame, pd.DataFrame]:
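    """Load the actuals and the submitted predictions for one target, filtered to a
    single test window, and drop the now-redundant window column from both."""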
    window_filter = pyarrow.compute.field("window") == window
    actuals = get_target_data(actuals_folder, target=target, filters=window_filter)
    predictions = get_target_data(submission, target=target, filters=window_filter)
predictions.drop(columns=["window"], inplace=True)
actuals.drop(columns=["window"], inplace=True)
return actuals, predictions


def evaluate_submission(
submission: str | os.PathLike,
acutals: str | os.PathLike,
targets: list[TargetType],
windows: list[str],
expected: int,
bins: list[float],
draw_column: str = "draw",
data_column: str = "outcome",
) -> None:
"""Loops over all targets and windows in a submission folder, match them with the correct test dataset, and estimates evaluation metrics.
Stores evaluation data as .parquet files in {submission}/eval/{target}/window={window}/.
Parameters
----------
submission : str | os.PathLike
Path to a folder structured like a submission_template
acutals : str | os.PathLike
Path to actuals folder structured like {actuals}/{target}/window={window}/data.parquet
targets : list[TargetType]
A list of strings, either ["pgm"] for PRIO-GRID-months, or ["cm"] for country-months, or both.
windows : list[str]
        A list of strings indicating the windows of the test datasets. The strings should match the windows in the data in the actuals folder.
expected : int
        The expected number of samples. Due to how the Ignorance Score is defined, all IGN metric comparisons must be across models with an equal number of samples.
bins : list[float]
The binning scheme used in the Ignorance Score.
draw_column : str
The name of the sample column. We assume samples are drawn independently from the model. Default = "draw"
data_column : str
The name of the data column. Default = "outcome"
"""
    submission = Path(submission)
    for target in targets:
for window in windows:
if any(
(submission / target).glob("**/*.parquet")
): # test if there are prediction files in the target
observed_df, pred_df = match_forecast_with_actuals(
submission, acutals, target, window
)
                save_to = submission / "eval" / target / f"window={window}"
evaluate_forecast(
forecast=pred_df,
actuals=observed_df,
target=target,
expected_samples=expected,
draw_column=draw_column,
data_column=data_column,
bins=bins,
save_to=save_to,
)


def evaluate_all_submissions(
submissions: str | os.PathLike,
acutals: str | os.PathLike,
targets: list[TargetType],
windows: list[str],
expected: int,
bins: list[float],
draw_column: str = "draw",
data_column: str = "outcome",
) -> None:
"""Loops over all submissions in the submissions folder, match them with the correct test dataset, and estimates evaluation metrics.
Stores evaluation data as .parquet files in {submissions}/{submission_name}/eval/{target}/window={window}/.
Parameters
----------
submissions : str | os.PathLike
Path to a folder only containing folders structured like a submission_template
acutals : str | os.PathLike
Path to actuals folder structured like {actuals}/{target}/window={window}/data.parquet
targets : list[TargetType]
A list of strings, either ["pgm"] for PRIO-GRID-months, or ["cm"] for country-months, or both.
windows : list[str]
        A list of strings indicating the windows of the test datasets. The strings should match the windows in the data in the actuals folder.
expected : int
        The expected number of samples. Due to how the Ignorance Score is defined, all IGN metric comparisons must be across models with an equal number of samples.
bins : list[float]
The binning scheme used in the Ignorance Score.
draw_column : str
The name of the sample column. We assume samples are drawn independently from the model. Default = "draw"
data_column : str
The name of the data column. Default = "outcome"
"""
submissions = Path(submissions)
submissions = list_submissions(submissions)
acutals = Path(acutals)
for submission in submissions:
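        # A failing submission is logged and skipped so the remaining submissions are still evaluated.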
try:
logging.info(f"Evaluating {submission.name}")
evaluate_submission(
submission,
acutals,
targets,
windows,
expected,
bins,
draw_column,
data_column,
)
        except Exception as e:
            logging.error(f"Evaluation of {submission.name} failed: {e}")


def main():
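    """Parse the command-line arguments and evaluate every submission in the folder."""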
parser = argparse.ArgumentParser(
description="Method for evaluation of submissions to the ViEWS Prediction Challenge",
epilog="Example usage: python evaluate_submissions.py -s ./submissions -a ./actuals -e 100",
)
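    # The CLI flags below mirror the parameters of evaluate_all_submissions.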
parser.add_argument(
"-s",
metavar="submissions",
type=str,
help="path to folder with submissions complying with submission_template",
)
parser.add_argument(
"-a", metavar="actuals", type=str, help="path to folder with actuals"
)
parser.add_argument(
"-t",
metavar="targets",
nargs="+",
type=str,
help="pgm or cm or both",
default=["pgm", "cm"],
)
parser.add_argument(
"-w",
metavar="windows",
nargs="+",
type=str,
help="windows to evaluate",
default=["Y2018", "Y2019", "Y2020", "Y2021"],
)
parser.add_argument(
"-e", metavar="expected", type=int, help="expected samples", default=1000
)
parser.add_argument(
"-sc",
metavar="draw_column",
type=str,
help="(Optional) name of column for the unique samples",
default="draw",
)
parser.add_argument(
"-dc",
metavar="data_column",
type=str,
help="(Optional) name of column with data, must be same in both observed and predictions data",
default="outcome",
)
parser.add_argument(
"-ib",
metavar="bins",
nargs="+",
type=float,
        help='Binning scheme for the Ignorance Score, given as a list of bin edges, e.g. "-ib 0 0.5 1 5 10 100 1000".',
default=[0, 0.5, 2.5, 5.5, 10.5, 25.5, 50.5, 100.5, 250.5, 500.5, 1000.5],
)
args = parser.parse_args()
submissions = Path(args.s)
acutals = Path(args.a)
expected = args.e
targets = args.t
windows = args.w
draw_column = args.sc
data_column = args.dc
bins = args.ib
    evaluate_all_submissions(
        submissions,
        acutals,
        targets,
        windows,
        expected,
        bins,
        draw_column,
        data_column,
    )


if __name__ == "__main__":
main()
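
# Reading the results back (a sketch, assuming pandas with the pyarrow engine;
# "<submission_name>" is a placeholder for an evaluated submission folder):
#   pd.read_parquet("submissions/<submission_name>/eval/cm/window=Y2018")
# returns the long-format evaluation data (month_id, unit_id, metric, value).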