-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkaggle_metric_utilities.py
85 lines (65 loc) · 3.26 KB
/
kaggle_metric_utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
'''
This script exists to reduce code duplication across metrics.
'''
import numpy as np
import pandas as pd
import pandas.api.types
from typing import Union
class ParticipantVisibleError(Exception):
    '''Error whose message may be shown to competition participants.'''
class HostVisibleError(Exception):
    '''Error aimed at the competition host (presumably not shown to participants; confirm with the platform).'''
def treat_as_participant_error(error_message: str, solution: Union[pd.DataFrame, np.ndarray]) -> bool:
    ''' Decide whether an unexpected error message is safe to show to participants.

    Many metrics can raise more errors than can be handled manually. This function attempts
    to identify errors that can be treated as ParticipantVisibleError without leaking any competition data.

    If the solution is purely numeric, and there are no numbers in the error message,
    then the error message is sufficiently unlikely to leak usable data and can be shown to participants.

    We expect this filter to reject many safe messages. It's intended only to reduce the number of errors we need to manage manually.

    Args:
        error_message: Text of the exception raised by the metric.
        solution: The solution data the metric was scored against.

    Returns:
        True if the message may be shown to participants, False otherwise.
        A solution that is neither a DataFrame nor an ndarray always yields False.
    '''
    # This check treats bools as numeric
    if isinstance(solution, pd.DataFrame):
        column_dtypes = solution.dtypes.values
        solution_is_all_numeric = all(pandas.api.types.is_numeric_dtype(x) for x in column_dtypes)
        solution_has_bools = any(pandas.api.types.is_bool_dtype(x) for x in column_dtypes)
    elif isinstance(solution, np.ndarray):
        solution_is_all_numeric = pandas.api.types.is_numeric_dtype(solution)
        solution_has_bools = pandas.api.types.is_bool_dtype(solution)
    else:
        # Unknown container: we cannot reason about its contents, so stay conservative
        # rather than failing with an unbound local inside the caller's error handler.
        return False

    if not solution_is_all_numeric:
        return False

    # Any digit in the message could be a leaked solution value.
    if any(char.isnumeric() for char in error_message):
        return False

    if solution_has_bools:
        # Boolean solution values would be printed as True/False rather than digits.
        lowered_message = error_message.lower()
        if 'true' in lowered_message or 'false' in lowered_message:
            return False
    return True
def safe_call_score(metric_function, solution, submission, **metric_func_kwargs):
    '''
    Call the metric and return its score.

    If the metric raises an error that has already been specifically handled (its class is named
    ParticipantVisibleError or HostVisibleError), re-raise it as this module's matching class.
    Otherwise make a conservative attempt to identify potential participant visible errors;
    anything that does not qualify is re-raised unchanged.
    '''
    try:
        return metric_function(solution, submission, **metric_func_kwargs)
    except Exception as err:
        message = str(err)
        # Matched by class name, not isinstance (presumably so same-named classes
        # defined in other modules are recognized too; confirm with callers).
        raised_class_name = err.__class__.__name__
        if raised_class_name == 'ParticipantVisibleError':
            raise ParticipantVisibleError(message)
        if raised_class_name == 'HostVisibleError':
            raise HostVisibleError(message)
        if treat_as_participant_error(message, solution):
            raise ParticipantVisibleError(message)
        raise err
def verify_valid_probabilities(df: pd.DataFrame, df_name: str) -> None:
    """ Verify that the dataframe contains valid probabilities.

    The dataframe must be limited to the target columns; do not pass in any ID columns.

    Args:
        df: Dataframe holding only the target columns.
        df_name: Name used to refer to the dataframe in error messages.

    Raises:
        ParticipantVisibleError: If any value is non-numeric, missing, below zero,
            above one, or if any row does not sum to one (within np.allclose tolerance).
    """
    if not pandas.api.types.is_numeric_dtype(df.values):
        raise ParticipantVisibleError(f'All target values in {df_name} must be numeric')

    # min/max/sum skip NaN by default, so a row such as [NaN, 1.0] would pass
    # every check below unless missing values are rejected explicitly.
    if df.isna().values.any():
        raise ParticipantVisibleError(f'All target values in {df_name} must be non-null')

    if df.min().min() < 0:
        raise ParticipantVisibleError(f'All target values in {df_name} must be at least zero')

    if df.max().max() > 1:
        raise ParticipantVisibleError(f'All target values in {df_name} must be no greater than one')

    if not np.allclose(df.sum(axis=1), 1):
        raise ParticipantVisibleError(f'Target values in {df_name} do not add to one within all rows')