-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathI4H_custom_functions.py
260 lines (211 loc) · 7.31 KB
/
I4H_custom_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
"""
Author: Philip Ciunkiewicz
Custom functions for the I4H data science workshop.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.regressor import ResidualsPlot
from yellowbrick.regressor import PredictionError
from yellowbrick.features import RFECV
from yellowbrick.features.importances import FeatureImportances
def load_data(dataset, simplify=True, missing=0):
    """ Load either the sample classification
    or regression dataset provided in the
    SKLearn library.

    Parameters:
    -----------
    dataset : str
        One of "classification" or "regression".
    simplify : bool
        If True removes some features for simplicity
        (see `_simplify_features`).
    missing : float
        Fraction of values to remove at random [0, 1).

    Returns:
    --------
    features : pd.DataFrame
        DataFrame containing the data features
        (independent variables).
    target : pd.DataFrame
        Single-column ('target') DataFrame containing
        the data target (dependent variable).
    description : str
        Descriptive text providing a summary of the data.

    Raises:
    -------
    ValueError
        If `dataset` is not one of the supported names.
    """
    # Resolve the loader first so that an unsupported name fails
    # immediately with a clear message instead of an unbound local.
    if dataset == 'classification':
        loader = load_breast_cancer
    elif dataset == 'regression':
        loader = fetch_california_housing
    else:
        raise ValueError(
            f"Unknown dataset {dataset!r}; "
            "expected 'classification' or 'regression'.")

    data = loader()
    features = pd.DataFrame(data['data'], columns=data['feature_names'])
    target = pd.DataFrame(data['target'], columns=['target'])
    description = data['DESCR']

    if simplify:
        features = _simplify_features(features)

    if missing:
        # Each cell is blanked independently with probability `missing`,
        # so the realised fraction of NaNs is only approximately `missing`.
        nulls = np.random.random(features.shape) < missing
        features = features.mask(nulls)

    return features, target, description
def _simplify_features(df):
    """ Return a copy of `df` without any column
    whose name contains 'error', and with spaces
    in the remaining column names replaced by
    underscores. The input frame is not modified.
    """
    kept, renamed = [], []
    for name in df.columns:
        if 'error' in name:
            continue
        kept.append(name)
        renamed.append(name.replace(' ', '_'))

    trimmed = df[kept]
    trimmed.columns = renamed
    return trimmed
def region_grid(X, n_pixels):
    """ Build a 2-D mesh covering the first two
    columns of X.

    Each axis spans the column's data range padded
    by 5% of that range on both sides, sampled with
    a step of (range / n_pixels); this gives about
    1.1 * n_pixels samples per axis.

    Returns the (xx, yy) coordinate matrices
    produced by np.meshgrid.
    """
    span = np.ptp(X.values, axis=0)
    margin = 0.05 * span
    lower = X.min().values - margin
    upper = X.max().values + margin
    step = span / n_pixels

    x_axis = np.arange(lower[0], upper[0], step[0])
    y_axis = np.arange(lower[1], upper[1], step[1])

    xx, yy = np.meshgrid(x_axis, y_axis)
    return xx, yy
def draw_decisions(model, X, y, features, n_pixels=1000):
    """ Plots the decision boundaries in
    two dimensions for a given classifier.

    The model is (re)fitted in place on the two
    selected features before plotting.

    Parameters:
    -----------
    model : sklearn.model
        SKLearn classification model.
    X : pd.DataFrame
        Input features for classification.
    y : pd.DataFrame
        Target labels for classification
        (read from its 'target' column).
    features : array_like
        Names of features for analysis.
    n_pixels : int
        Grid resolution forwarded to `region_grid`
        (default 1000, the previous fixed value).
    """
    assert len(features) == 2, 'Requires exactly two features.'

    # A list selects two columns; a tuple would be read as a single key.
    features = list(features)
    X, y = X[features], y.target
    model.fit(X, y)

    # Predict value probabilities across the full axis.
    # The grid keeps the training column names so estimators that
    # validate feature names see the same names at predict time.
    xx, yy = region_grid(X, n_pixels)
    grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=features)

    # The contour levels below span [0, 1], so prefer real probabilities.
    # Models exposing only decision_function return unbounded signed
    # scores; squash them with a logistic sigmoid (tanh form avoids
    # overflow) so the decision boundary sits at 0.5.
    if hasattr(model, "predict_proba"):
        Z = np.asarray(model.predict_proba(grid))[:, 1]
    else:
        scores = np.asarray(model.decision_function(grid))
        Z = 0.5 * (1.0 + np.tanh(0.5 * scores))
    zz = Z.reshape(xx.shape)

    # Put the result into a color plot
    fig, ax = plt.subplots(figsize=(8, 7))
    ax.set(
        xlabel=features[0],
        ylabel=features[1],
        title=model.__class__.__name__)

    CS = ax.contourf(
        xx, yy, zz,
        cmap='RdBu',
        extend='both',
        levels=np.linspace(0, 1, 6))

    # Score on the same data the model was fitted on (training accuracy).
    ax.text(
        0.95, 0.95,
        f'Acc={model.score(X, y)}',
        horizontalalignment='right',
        bbox={'facecolor': 'white', 'pad':5},
        transform=ax.transAxes)

    # Class 0 in red, class 1 in blue, matching the RdBu colormap ends.
    ax.scatter(*X[y == 0].values.T, alpha=0.75, c=[[1, 0, 0]])
    ax.scatter(*X[y == 1].values.T, alpha=0.75, c=[[0, 0, 1]])

    plt.colorbar(CS)
    plt.show()
def draw_confusion_matrix(model, X, y, classnames=None):
    """ Renders the confusion matrix in terms
    of % accuracy for a given classifier.

    The data is split into train/test partitions
    with a fixed random_state of 123; the model is
    fitted on the train part and scored on the
    test part.

    Parameters:
    -----------
    model : sklearn.model
        SKLearn classification model.
    X : pd.DataFrame
        Input features for classification.
    y : pd.DataFrame
        Target labels for classification.
    classnames : array_like
        Name mapping of class labels in
        ascending order (len = #classes).
    """
    # Hand the estimator a 1-D target, as the other helpers in this
    # module do via y['target']; a single-column frame would otherwise
    # be passed on as a column vector.
    if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
        y = y.iloc[:, 0]

    split = train_test_split(X, y, random_state=123)
    X_train, X_test, y_train, y_test = split

    visualizer = ConfusionMatrix(model, percent=True, cmap='Greens')
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)

    if classnames is not None:
        # Swap the displayed labels only after scoring, then re-render
        # so the replacement names appear on the axes.
        visualizer.classes_ = classnames
        visualizer.draw()

    visualizer.poof()
def draw_feature_importances(model, X, y):
    """ Plots the relative importance of each
    feature using a FeatureImportances visualizer
    fitted on the full data.

    Parameters:
    -----------
    model : sklearn.model
        SKLearn classification/regression model.
    X : pd.DataFrame
        Input features for classification/regression.
    y : pd.DataFrame
        Target data for classification/regression
        (read from its 'target' column).
    """
    target = y['target']

    importance_plot = FeatureImportances(model)
    importance_plot.fit(X, target)
    importance_plot.poof()
def draw_RFECV(model, X, y, cv=2):
    """ Displays recursive feature elimination
    cross-validation for most models.

    Parameters:
    -----------
    model : sklearn.model
        SKLearn classification/regression model.
    X : pd.DataFrame
        Input features for classification/regression.
    y : pd.DataFrame
        Target data for classification/regression
        (read from its 'target' column).
    cv : int or cross-validation generator
        Cross-validation setting forwarded to the
        RFECV visualizer. Defaults to 2, the value
        that was previously fixed.
    """
    visualizer = RFECV(model, cv=cv)
    visualizer.fit(X, y['target'])
    visualizer.poof()
def draw_residuals(model, X, y):
    """ Plots the residuals of a regression model
    on a held-out test partition.

    The data is split with a fixed random_state of
    123 and the features are standardised with a
    scaler fitted on the training partition only.

    Parameters:
    -----------
    model : sklearn.model
        SKLearn regression model.
    X : pd.DataFrame
        Unscaled input features for regression.
    y : pd.DataFrame
        Target data for regression
        (read from its 'target' column).
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        X, y, random_state=123)

    # Scaling statistics come from the training rows alone.
    standardizer = StandardScaler()
    X_fit_std = standardizer.fit_transform(X_fit)
    X_eval_std = standardizer.transform(X_eval)

    residual_plot = ResidualsPlot(model, alpha=0.15)
    residual_plot.fit(X_fit_std, y_fit['target'])
    residual_plot.score(X_eval_std, y_eval['target'])
    residual_plot.poof()
def draw_prediction_error(model, X, y):
    """ Plots predicted versus actual targets for
    a regression model on a held-out test partition.

    The data is split with a fixed random_state of
    123 and the features are standardised with a
    scaler fitted on the training partition only.

    Parameters:
    -----------
    model : sklearn.model
        SKLearn regression model.
    X : pd.DataFrame
        Unscaled input features for regression.
    y : pd.DataFrame
        Target data for regression
        (read from its 'target' column).
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        X, y, random_state=123)

    # Scaling statistics come from the training rows alone.
    standardizer = StandardScaler()
    X_fit_std = standardizer.fit_transform(X_fit)
    X_eval_std = standardizer.transform(X_eval)

    error_plot = PredictionError(model, alpha=0.25)
    error_plot.fit(X_fit_std, y_fit['target'])
    error_plot.score(X_eval_std, y_eval['target'])
    error_plot.poof()