# modeling.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import xgboost as xgb
from scipy.stats import ks_2samp
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import roc_curve, precision_recall_curve, auc
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict


class Model:
    """Stacking ensemble (random forest, XGBoost, GaussianNB, SVC, combined by a
    logistic-regression meta-learner) with cross-validated evaluation plots."""

    def __init__(self, xgb_params=None, rf_params=None, class_weights=None,
                 train_data_path=None, features=None, cv=None):
        self.xgb_params = xgb_params
        self.rf_params = rf_params
        self.class_weights = class_weights
        self.train_data_path = train_data_path
        self.features = features
        self.cv = cv
        self.classifier = None  # set by fit()

    def fit(self):
        # Load the training data and split features/target
        # (the target column in this dataset is named "86")
        df_train = pd.read_csv(self.train_data_path)
        X_train = df_train.drop(["ID", "86"], axis=1)[self.features]
        y_train = df_train["86"]

        # Create the base classifiers
        rf_classifier = RandomForestClassifier(**self.rf_params)
        xgb_classifier = xgb.XGBClassifier(scale_pos_weight=self.class_weights[1], **self.xgb_params)
        gaussian_nb = GaussianNB()
        svc_classifier = SVC(probability=True, class_weight=self.class_weights)
        final_estimator = LogisticRegression(class_weight=self.class_weights, random_state=42)
        estimators = [
            ("random_forest", rf_classifier),
            ("xgboost", xgb_classifier),
            ("GaussianNB", gaussian_nb),
            ("SVC", svc_classifier),
        ]

        # Create a StackingClassifier
        classifier = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

        # Evaluate with stratified cross-validated probabilities
        skf = StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=42)
        y_pred_prob_cv = cross_val_predict(classifier, X_train, y_train, cv=skf, method="predict_proba")[:, 1]
        self.metrics(y_train, y_pred_prob_cv)

        # Finalize the model on the entire training data and keep it
        classifier.fit(X_train, y_train)
        self.classifier = classifier

    def metrics(self, y_test, y_pred_prob):
        # Calculate ROC curve and AUC
        fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
        roc_auc = auc(fpr, tpr)
        # Calculate Precision-Recall curve and AUC
        precision, recall, _ = precision_recall_curve(y_test, y_pred_prob)
        auc_pr = auc(recall, precision)
        # Calculate KS statistic between the two score distributions
        ks_statistic, p_value = ks_2samp(y_pred_prob[y_test == 1], y_pred_prob[y_test == 0])
        # Plot metrics in a 2x2 grid
        fig, axs = plt.subplots(2, 2, figsize=(12, 10))
        axs[0, 0].hist(y_pred_prob[y_test == 1], bins=100, color="blue", edgecolor="black", label="class 1")
        axs[0, 0].hist(y_pred_prob[y_test == 0], bins=100, color="green", alpha=0.5, edgecolor="black", label="class 0")
        axs[0, 0].set_xlabel("Probability", color="white")
        axs[0, 0].set_ylabel("Frequency", color="white")
        axs[0, 0].set_title("Histogram of Predicted Probabilities", color="white")
        axs[0, 0].tick_params(axis="both", colors="white")
        axs[0, 0].legend()
        axs[0, 1].plot(fpr, tpr, color="darkorange", lw=2, label=f"AUC = {roc_auc:.2f}")
        axs[0, 1].plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
        axs[0, 1].set_xlim([0.0, 1.0])
        axs[0, 1].set_ylim([0.0, 1.05])
        axs[0, 1].set_xlabel("False Positive Rate", color="white")
        axs[0, 1].set_ylabel("True Positive Rate", color="white")
        axs[0, 1].set_title("Receiver Operating Characteristic", color="white")
        axs[0, 1].legend(loc="lower right")
        axs[0, 1].tick_params(axis="both", colors="white")
        axs[1, 0].plot(recall, precision, color="darkorange", lw=2, label=f"AUC = {auc_pr:.2f}")
        axs[1, 0].set_xlim([0.0, 1.0])
        axs[1, 0].set_ylim([0.0, 1.05])
        axs[1, 0].set_xlabel("Recall", color="white")
        axs[1, 0].set_ylabel("Precision", color="white")
        axs[1, 0].set_title("Precision-Recall Curve", color="white")
        axs[1, 0].legend(loc="lower left")
        axs[1, 0].tick_params(axis="both", colors="white")
        # Empirical CDFs of the predicted probabilities per class; the maximum
        # vertical gap between the two curves is the KS statistic
        y_0 = np.sort(y_pred_prob[y_test == 0])
        y_1 = np.sort(y_pred_prob[y_test == 1])
        axs[1, 1].plot(y_0, np.arange(1, len(y_0) + 1) / len(y_0), label="class 0")
        axs[1, 1].plot(y_1, np.arange(1, len(y_1) + 1) / len(y_1), label="class 1")
        axs[1, 1].set_xlabel("Predicted probability", color="white")
        axs[1, 1].set_ylabel("F(x)", color="white")
        axs[1, 1].set_title(f"KS CDF curves: KS = {ks_statistic:.2f}\np-value = {p_value:.2e}", color="white")
        axs[1, 1].legend()
        axs[1, 1].tick_params(axis="both", colors="white")
        plt.tight_layout()
        plt.savefig("metrics.png", dpi=400, bbox_inches="tight", transparent=False)
        plt.show()

    def predict(self, test_data_path, top_selection=1000):
        # Score the test set and flag the top_selection highest-probability IDs
        df_test = pd.read_csv(test_data_path)
        X_test = df_test[self.features]
        y_pred_proba = self.classifier.predict_proba(X_test)[:, 1]
        df_test["probability_prediction"] = y_pred_proba
        df_test = df_test.sort_values(by="probability_prediction", ascending=False)
        selected_id = df_test.iloc[:top_selection].ID.unique()
        df_test["promising"] = 0
        df_test.loc[df_test.ID.isin(selected_id), "promising"] = 1
        return df_test
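

# A minimal usage sketch, not part of the original module: the file paths,
# feature list, and hyperparameters below are placeholder assumptions, and the
# CSVs are assumed to contain an "ID" column plus a binary target column "86".
if __name__ == "__main__":
    model = Model(
        xgb_params={"n_estimators": 200, "max_depth": 4, "random_state": 42},
        rf_params={"n_estimators": 300, "random_state": 42},
        class_weights={0: 1, 1: 10},  # assumed imbalance weighting; tune to the data
        train_data_path="train.csv",  # placeholder path
        features=["1", "2", "3"],     # placeholder feature column names
        cv=5,
    )
    model.fit()  # cross-validates, writes metrics.png, then refits on all data
    predictions = model.predict("test.csv", top_selection=1000)  # placeholder path
    print(predictions[["ID", "probability_prediction", "promising"]].head())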