### This file contains my functions and other helpers that can be reused multiple times
import pickle
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re
import matplotlib.pyplot as plt
from sklearn.metrics import (roc_auc_score, roc_curve, classification_report,
                             accuracy_score, precision_score, recall_score, f1_score)
def save_model(model=None, features=[]):
    """Pickle a fitted model together with its feature list, named by model class and timestamp."""
    if model:
        name = str(model.__class__).split('.')[-1][:-2] + '_' + datetime.today().strftime("%d%m%Y_%H_%M") + '.pickle'
        with open(name, 'wb') as file:
            pickle.dump((model, features), file)
        print('Saved', name)
def save_table(table, file_path, table_name):
    """Pickle an arbitrary object (e.g. a DataFrame) to file_path\\table_name.pickle."""
    name = file_path + '\\' + table_name + '.pickle'
    with open(name, 'wb') as file:
        pickle.dump(table, file)
    print('Saved', name)
def load_pickle(file_name):
    """Load and return whatever was pickled into file_name (e.g. a (model, features) tuple)."""
    with open(file_name, 'rb') as file:
        model_tpl = pickle.load(file)
    return model_tpl
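# Illustrative save/load round-trip (a sketch, not part of the original workflow; LogisticRegression
# and the X_train / y_train frames are assumed to exist in the calling notebook):
#   clf = LogisticRegression().fit(X_train, y_train)
#   save_model(model=clf, features=list(X_train.columns))   # -> e.g. LogisticRegression_01012024_12_00.pickle
#   clf_loaded, feats = load_pickle('LogisticRegression_01012024_12_00.pickle')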
### reduce memory usage by downcasting numeric columns to the smallest dtype that fits their value range
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} MB ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
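# Minimal illustration of reduce_mem_usage (a sketch; the toy DataFrame below is hypothetical):
#   small = pd.DataFrame({'a': np.arange(100, dtype='int64'), 'b': np.random.rand(100)})
#   small = reduce_mem_usage(small)   # 'a' downcasts to int8, 'b' to float16 in this example
# Note: float16 can lose precision, so float32 is often a safer floor for model features.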
# Look at the categorical columns and how many of them there are.
# The names of the categorical columns are collected in cat_col_lst.
# The output is a DataFrame with the number of unique values and the number of missing values per column.
def categorical_col_info(df: pd.DataFrame) -> pd.DataFrame:
    """
    Takes a DataFrame as input, selects the object (categorical) columns
    and returns a DataFrame listing each column name, its number of unique
    values and its number of missing values, plus a 'Total' row.
    """
    unique_values_lst = []
    missing_values_lst = []
    cat_col_lst = []
    for col in df.select_dtypes(include='object').columns:
        unique_values = df[col].nunique()
        missing_values = df[col].isna().sum()
        cat_col_lst.append(col)
        unique_values_lst.append(unique_values)
        missing_values_lst.append(missing_values)
    column_summary_df = pd.DataFrame({
        'Column': cat_col_lst,
        'Unique values': unique_values_lst,
        'Missing values': missing_values_lst
    })
    column_summary_df.loc['Total'] = [len(cat_col_lst), sum(unique_values_lst), sum(missing_values_lst)]
    return column_summary_df
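# Illustrative call (a sketch; df_train is a hypothetical DataFrame from the calling notebook):
#   categorical_col_info(df_train)
#   # -> one row per object column with its unique/missing counts, plus a 'Total' row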
def count_missing_values(df):
    """Return a DataFrame of columns that contain missing values, sorted by missing share (descending)."""
    missing_values = df.isnull().sum()
    missing_values_percent = (missing_values / len(df)) * 100
    result_df = pd.DataFrame({
        'Column': missing_values.index,
        'Missing_Values': missing_values.values,
        'Missing_Values_%': missing_values_percent.values
    })
    result_df = result_df[result_df['Missing_Values'] > 0]  # Filter out columns with no missing values
    missing_values_df_sorted = result_df.sort_values(by='Missing_Values_%', ascending=False)
    return missing_values_df_sorted
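# Quick self-contained check of count_missing_values (illustrative sketch only):
#   demo = pd.DataFrame({'a': [1, None, 3], 'b': [1, 2, 3]})
#   count_missing_values(demo)   # -> a single row for 'a' with 1 missing value (33.33%)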
# model_pipeline = {'Model_name': 'Titanic',
#                   'preprocess': preprocess_string,
#                   'algorithm': 'do not know yet',
#                   'model': 'in process',
#                   'score': 0.9}
# #                 'features': df_train.columns}
def binary_classification_metrics(y_true_tr, y_pred_tr, y_true_val=None, y_pred_val=None, report=False):
    """Print ROC AUC, accuracy, precision, recall and F1 for train (and, if given, validation) predictions."""
    if y_true_val is not None:
        print("{:<15} {:<10} {:<10} {:<10}".format('Metrics', 'Train', 'Test', '\u0394'))
    else:
        print("{:<15} {:<10}".format('Metrics', 'Train'))
    metrics_dict = {}
    metrics_dict['roc_auc'] = np.round(roc_auc_score(y_true_tr, y_pred_tr), 4)
    metrics_dict['accuracy'] = np.round(accuracy_score(y_true_tr, y_pred_tr), 4)
    metrics_dict['precision'] = np.round(precision_score(y_true_tr, y_pred_tr), 4)
    metrics_dict['recall'] = np.round(recall_score(y_true_tr, y_pred_tr), 4)
    metrics_dict['f1_score'] = np.round(f1_score(y_true_tr, y_pred_tr), 4)
    if y_true_val is not None:
        metrics_dict_test = {}
        metrics_dict_test['roc_auc'] = np.round(roc_auc_score(y_true_val, y_pred_val), 4)
        metrics_dict_test['accuracy'] = np.round(accuracy_score(y_true_val, y_pred_val), 4)
        metrics_dict_test['precision'] = np.round(precision_score(y_true_val, y_pred_val), 4)
        metrics_dict_test['recall'] = np.round(recall_score(y_true_val, y_pred_val), 4)
        metrics_dict_test['f1_score'] = np.round(f1_score(y_true_val, y_pred_val), 4)
        for metrics, value in metrics_dict.items():
            value_test = metrics_dict_test[metrics]
            diff = np.round(value_test - value, 4)
            print("{:<15} {:<10} {:<10} {:<10}".format(metrics, value, value_test, diff))
    else:
        for metrics, value in metrics_dict.items():
            print("{:<15} {:<10}".format(metrics, value))
    if report:
        print('\n')
        print('Train:')
        print(classification_report(y_true_tr, y_pred_tr))
        if y_true_val is not None:
            print('Test:')
            print(classification_report(y_true_val, y_pred_val))
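# Typical call (illustrative sketch; clf, X_train/X_val and y_train/y_val are assumed from the calling notebook):
#   binary_classification_metrics(y_train, clf.predict(X_train),
#                                 y_val, clf.predict(X_val), report=True)
# Note: ROC AUC here is computed from hard class labels; passing clf.predict_proba(...)[:, 1] gives a more informative AUC.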
def plot_roc_auc_ensemble(y_true, y_pred, title):
    # Calculate the ROC curve and AUC
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    # Plot the ROC curve
    plt.plot(fpr, tpr, color='blue', label="ROC curve (AUC = {:.3f})".format(auc))
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')  # Plot the diagonal line
    # Set x-axis and y-axis labels
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    # Set the title and legend
    plt.title(title)
    plt.legend(loc="lower right")
    # Show the plot
    plt.show()
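# Illustrative call (a sketch; y_val and the fitted clf are assumed from the calling notebook):
#   plot_roc_auc_ensemble(y_val, clf.predict_proba(X_val)[:, 1], title='Validation ROC curve')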
def plot_roc_auc_train_test(y_train, y_train_pred, y_test, y_test_pred, title):
    fpr_train, tpr_train, _ = roc_curve(y_train, y_train_pred)
    auc_train = roc_auc_score(y_train, y_train_pred)
    fpr_test, tpr_test, _ = roc_curve(y_test, y_test_pred)
    auc_test = roc_auc_score(y_test, y_test_pred)
    plt.plot(fpr_train, tpr_train, color='blue', label="Train, AUC={:.3f}".format(auc_train), linestyle='-')
    plt.plot(fpr_test, tpr_test, color='black', label="Test, AUC={:.3f}".format(auc_test), linestyle='--')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xticks(np.arange(0.0, 1.1, step=0.1))
    plt.xlabel("False Positive Rate", fontsize=15)
    plt.yticks(np.arange(0.0, 1.1, step=0.1))
    plt.ylabel("True Positive Rate", fontsize=15)
    plt.title(title, fontweight='bold', fontsize=15)
    plt.legend(prop={'size': 13}, loc='lower right')
    plt.show()
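# Illustrative call comparing train vs. test ROC curves (a sketch; the fitted clf and the splits are assumed):
#   plot_roc_auc_train_test(y_train, clf.predict_proba(X_train)[:, 1],
#                           y_test, clf.predict_proba(X_test)[:, 1],
#                           title='ROC AUC: train vs test')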