# dashboard_CLOUD.py
# -*- coding: utf-8 -*-
import mlflow
import mlflow.pyfunc
from mlflow.exceptions import MlflowException
import pandas as pd
import streamlit as st
import numpy as np
import shap
import os
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import requests
from io import StringIO
import streamviz
import seaborn as sns
def preprocess_dataframe(df):
"""
Preprocesses the input DataFrame (df) for machine learning tasks.
Args:
df (pandas.DataFrame): The DataFrame to preprocess.
Returns:
tuple: A tuple containing the following elements:
- X_train (pandas.DataFrame): The preprocessed training features.
- y_train (pandas.Series): The training target variable.
- X_test (pandas.DataFrame): The preprocessed testing features.
- y_test (pandas.Series): The testing target variable.
"""
    # Define imputation strategies (median for numerical, most frequent for categorical)
    numerical_imputer = SimpleImputer(strategy='median')
    categorical_imputer = SimpleImputer(strategy='most_frequent')
# Create preprocessing pipelines for numerical and categorical features
numerical_transformer = Pipeline(steps=[
('imputer', numerical_imputer),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('imputer', categorical_imputer),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# Combine transformers into a single ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
('numerical', numerical_transformer, df.select_dtypes(include=['int64', 'float64']).columns),
('categorical', categorical_transformer, df.select_dtypes(include=['object']).columns)
])
# Apply preprocessing to the DataFrame
df_transformed = preprocessor.fit_transform(df)
    # Strip the 'numerical__' / 'categorical__' prefixes from the generated feature names
    new_feature_names = [name.split('__')[1] for name in preprocessor.get_feature_names_out()]
    # Convert the NumPy array back to a DataFrame with the cleaned column names
    df_transformed = pd.DataFrame(df_transformed, columns=new_feature_names)
    # Replace the scaled SK_ID_CURR with the original identifier values
    df_transformed.drop('SK_ID_CURR', axis=1, inplace=True)
    df_transformed['SK_ID_CURR'] = df['SK_ID_CURR'].values  # .values avoids index misalignment
# Reorder features to place SK_ID_CURR as the first one (efficient way)
reordered_cols = ['SK_ID_CURR'] + [col for col in df_transformed.columns if col != 'SK_ID_CURR']
df_transformed = df_transformed[reordered_cols]
return df_transformed
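# Illustrative usage sketch for preprocess_dataframe (hypothetical toy data,
# kept as a comment so the app itself does not execute it):
#     toy = pd.DataFrame({'SK_ID_CURR': [100001, 100002],
#                         'AMT_INCOME_TOTAL': [202500.0, None],
#                         'CODE_GENDER': ['M', 'F']})
#     toy_out = preprocess_dataframe(toy)
#     # -> SK_ID_CURR first and unscaled, AMT_INCOME_TOTAL median-imputed and
#     #    standardized, CODE_GENDER expanded to CODE_GENDER_F / CODE_GENDER_M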
def get_mlflow_artifact(run_id, artifact_path):
"""Fetches an artifact from the specified MLflow run."""
client = mlflow.tracking.MlflowClient()
try:
local_path = client.download_artifacts(run_id, artifact_path)
return local_path
    except (MlflowException, OSError) as e:
st.error(f"Failed to download artifact: {artifact_path}. Error: {e}")
return None
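# Illustrative usage sketch for get_mlflow_artifact (the artifact path below is
# hypothetical; the run id matches the one used in load_model):
#     local_path = get_mlflow_artifact('19e1265fed5543db8878f67479e4f60b', 'confusion_matrix.png')
#     if local_path is not None:
#         st.image(local_path)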
def shap_values_to_dataframe_instance(shap_values, feature_names, instance_index):
"""
Convert SHAP values to a dataframe indicating the feature and its percentage contribution for a specific instance.
Parameters:
- shap_values: SHAP values object.
- feature_names: List of feature names.
- instance_index: Index of the instance for which to calculate the SHAP values.
Returns:
- df_feature_importance: Dataframe with features and their percentage contributions for the specific instance.
"""
# Get SHAP values for the specific instance
instance_shap_values = shap_values[instance_index]
# Create a dataframe with feature names and their SHAP values for the instance
df_feature_importance = pd.DataFrame({
'Feature': feature_names,
'SHAPValue': instance_shap_values
})
    # Sort the dataframe by SHAP value in descending order
df_feature_importance = df_feature_importance.sort_values(by='SHAPValue', ascending=False).reset_index(drop=True)
return df_feature_importance
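# Illustrative usage sketch, assuming shap_values is the
# (n_instances, n_features) array returned by a TreeExplainer:
#     df_imp = shap_values_to_dataframe_instance(shap_values, feature_names, 0)
#     df_imp.head(10)   # features pushing the prediction towards default
#     df_imp.tail(10)   # features pushing the prediction away from default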
@st.cache_resource()
def load_model():
# Set the MLflow tracking URI (update with your server URI if necessary)
mlflow.set_tracking_uri("https://dagshub.com/Isdinval/OC_PROJET7.mlflow")
# Define the model URI from the provided information
model_uri = 'runs:/19e1265fed5543db8878f67479e4f60b/model'
# Load the model using the appropriate method
model = mlflow.sklearn.load_model(model_uri)
return model
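# Note: the 'runs:/<run_id>/model' URI points at the "model" artifact of that
# MLflow run, and mlflow.sklearn.load_model returns the original scikit-learn
# object (here, a Pipeline). A flavor-agnostic alternative, which only exposes
# predict() and not predict_proba(), would be:
#     pyfunc_model = mlflow.pyfunc.load_model('runs:/19e1265fed5543db8878f67479e4f60b/model')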
@st.cache_data() # Cache the feature names to avoid reloading
def retrieve_feature_names():
url = 'https://raw.githubusercontent.com/Isdinval/OC_PROJET7/main/feature_names.txt'
response = requests.get(url)
if response.status_code == 200:
# Read the text content of the response
text_data = response.text
# Split the text by line breaks to get individual feature names
feature_names = text_data.splitlines()
return feature_names
else:
st.error("Failed to load data from GitHub.")
return None
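# Expected format of feature_names.txt, assuming one model feature per line
# (splitlines() above relies on this), e.g.:
#     SK_ID_CURR
#     AMT_INCOME_TOTAL
#     AMT_CREDIT
#     ...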
# Load the test data
@st.cache_data() # Cache the test data to avoid reloading
def load_test_data():
url = 'https://raw.githubusercontent.com/Isdinval/OC_PROJET7/main/application_test.csv'
response = requests.get(url)
if response.status_code == 200:
return pd.read_csv(StringIO(response.text), delimiter=",")
else:
st.error("Failed to load data from GitHub.")
return None
# Load the test data
@st.cache_data() # Cache the test data to avoid reloading
def load_test_data_description():
url = 'https://raw.githubusercontent.com/Isdinval/OC_PROJET7/main/HomeCredit_columns_description.csv'
response = requests.get(url)
if response.status_code == 200:
return pd.read_csv(StringIO(response.text))
else:
st.error("Failed to load data from GitHub.")
return None
# Load the model
model = load_model()
# Load the feature names used by the model
feature_names_from_Model = retrieve_feature_names()
feature_names = feature_names_from_Model
# Load the test data and the column descriptions
customer_data = load_test_data()
customer_data_description = load_test_data_description()
# Optimal threshold from MLflow
optimal_threshold = 0.636364
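# Sketch of how such a threshold is typically selected (the project's actual
# procedure lives in the MLflow training run; the cost weights and the
# false_negatives/false_positives helpers below are hypothetical, counting
# validation-set errors at threshold t):
#     thresholds = np.linspace(0.01, 0.99, 99)
#     cost = lambda t: 10 * false_negatives(t) + false_positives(t)
#     optimal_threshold = min(thresholds, key=cost)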
# Define a function to make predictions
def make_prediction(input_data, model, optimal_threshold):
input_df = pd.DataFrame([input_data])
# Get the raw prediction score
probability_class1 = model.predict_proba(input_df)[:, 1]
# Extract scalar value if probability is an array
if isinstance(probability_class1, np.ndarray):
probability_class1 = probability_class1[0]
# Convert probability to human-readable format
prediction_label = "Refused" if probability_class1 >= optimal_threshold else "Accepted"
return probability_class1, prediction_label
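# Illustrative usage sketch for make_prediction, where input_data is a
# {feature_name: value} dict as built later in main():
#     proba, label = make_prediction(input_data, model, optimal_threshold)
#     # proba is a float in [0, 1]; label is "Refused" when proba >= threshold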
def get_final_estimator(pipeline):
"""
Extracts the final estimator from a scikit-learn pipeline.
Args:
pipeline: The scikit-learn pipeline object.
Returns:
The final estimator object from the pipeline.
"""
    # The final step of a scikit-learn pipeline is the estimator (common case)
    final_estimator = pipeline.steps[-1][1]
    return final_estimator
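# Illustrative usage sketch: pull the final classifier out of the loaded
# pipeline so it can be handed to shap.TreeExplainer (as done below in main()):
#     estimator = get_final_estimator(model)
#     explainer = shap.TreeExplainer(estimator)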
# Streamlit app code
def main():
# Set page title
st.title("Credit Scoring Dashboard")
st.write("Welcome to the Credit Scoring Dashboard! Use the form below to make predictions.")
    # Project banner image
    st.image("P7_Banner.png", width=700)  # Adjust width as desired
# =========================================================================
# EXPLAINABILITY SECTIONS
# =========================================================================
explainability_sections = """
    This loan approval prediction model is an XGBoost classifier. XGBoost (eXtreme Gradient Boosting) is a gradient-boosting algorithm that builds an ensemble of decision trees, each tree correcting the errors of the previous ones. It is known for its efficiency, scalability, and ability to capture complex relationships between features.
    The model analyzes customer attributes such as income, credit history, and debt-to-income ratio to estimate the probability of loan default. Its output is a probability score between 0% and 100%, where a lower score indicates a lower risk of the borrower defaulting on the loan.
"""
st.write(explainability_sections)
st.header("I. Choice of the Customer in the Dataset Test")
# Input field for SK_ID_CURR
sk_id_curr = st.number_input('Enter SK_ID_CURR (ex: 100001 or 101268):', min_value=customer_data['SK_ID_CURR'].min(), max_value=customer_data['SK_ID_CURR'].max())
    # Preprocess data (take a real copy so the cached DataFrame is not mutated in place)
    customer_data_copy = customer_data.copy()
    customer_data_preprocessed = preprocess_dataframe(customer_data)
    # Remove DAYS_EMPLOYED anomalies: 365243 is a placeholder value, so exclude
    # it from the mean calculation and then replace it with that mean
    filtered_data = customer_data_copy[customer_data_copy['DAYS_EMPLOYED'] != 365243]
    mean_days_employed = int(filtered_data['DAYS_EMPLOYED'].mean())
    customer_data_copy['DAYS_EMPLOYED'] = customer_data_copy['DAYS_EMPLOYED'].replace(365243, mean_days_employed)
    customer_data_preprocessed['DAYS_EMPLOYED'] = customer_data_preprocessed['DAYS_EMPLOYED'].replace(365243, mean_days_employed)
# =========================================================================
# ADD Missing features Manually (due to preprocess) then re-order features
# =========================================================================
    # Features expected by the model but missing from the preprocessed test set
    new_features = {
        "NAME_FAMILY_STATUS_Unknown": 0,
        "NAME_INCOME_TYPE_Maternity leave": 0,
        "CODE_GENDER_XNA": 0
    }
# Update customer_data_preprocessed
customer_data_preprocessed = customer_data_preprocessed.assign(**new_features)
# Reorder features using "feature_names_from_Model"
ordered_features = [col for col in feature_names_from_Model if col in customer_data_preprocessed.columns]
customer_data_preprocessed = customer_data_preprocessed[ordered_features]
# Check if SK_ID_CURR exists in the data
if sk_id_curr in customer_data_preprocessed['SK_ID_CURR'].values:
# =========================================================================
# CUSTOMERS DATA
# =========================================================================
# Get the index of the selected customer
customer_index = customer_data_preprocessed[customer_data_preprocessed['SK_ID_CURR'] == sk_id_curr].index[0]
# Get the data for the selected customer
input_data = customer_data_preprocessed[customer_data_preprocessed['SK_ID_CURR'] == sk_id_curr].iloc[0].to_dict()
input_data_copy = customer_data_copy[customer_data_copy['SK_ID_CURR'] == sk_id_curr].iloc[0].to_dict()
# =========================================================================
        # CUSTOMER BASIC INFORMATION
# =========================================================================
# Display customer information
st.header("II. Customer Information:")
age_years = -input_data_copy['DAYS_BIRTH'] // 365 # Calculate age from DAYS_BIRTH
employment_duration_years = -input_data_copy.get('DAYS_EMPLOYED', 0) // 365 # Calculate employment duration from DAYS_EMPLOYED
gender_pronoun = "He" if input_data_copy['CODE_GENDER'] == 'M' else "She"
gender2_pronoun = "his" if input_data_copy['CODE_GENDER'] == 'M' else "her"
education_level = input_data_copy['NAME_EDUCATION_TYPE'].lower()
family_status = input_data_copy['NAME_FAMILY_STATUS'].lower()
housing_type = input_data_copy['NAME_HOUSING_TYPE'].lower()
        customer_description = f"""
        **{gender_pronoun}** is **{age_years}** years old, has a **{education_level}** level of education, and works in the **{input_data_copy['NAME_INCOME_TYPE']}** sector. **{gender_pronoun}** lives in a **{housing_type}**, has been employed for **{employment_duration_years}** years, is **{family_status}**, and has applied for a **{input_data_copy['NAME_CONTRACT_TYPE'].lower()}** loan. **{gender2_pronoun.capitalize()}** income is **{input_data_copy['AMT_INCOME_TOTAL']}** €.
        """
st.write(customer_description)
        loan_description = f"""
        The requested loan amount is **{input_data_copy['AMT_CREDIT']}** €, with an annuity of **{input_data_copy['AMT_ANNUITY']}** €.
        """
st.write(loan_description)
# =========================================================================
# COMPARATIVE ANALYSIS USING GRAPHS
# ========================================================================
st.header('III. Comparative Analysis')
st.subheader('III.1. Univariate Analysis')
# Get all features (assuming numerical features)
all_features = customer_data_copy.select_dtypes(include=[np.number]) # Adjust for categorical features if needed
# Filter controls
selected_feature = st.selectbox('Select Feature:', all_features.columns, index=all_features.columns.get_loc('AMT_INCOME_TOTAL')) # Set default
        # Feature descriptions (customer_data_description has a "Row" column with feature names)
        feature_descriptions = customer_data_description
# Find description for the selected feature
feature_description = feature_descriptions[feature_descriptions["Row"] == selected_feature]["Description"].iloc[0]
# Print description
st.write(f"Feature : **{feature_description}**")
# Filter data based on selected feature
filtered_data = customer_data_copy.copy() # Avoid modifying original data
# Separate data for full dataset and current customer
full_data_values = np.array(customer_data_copy[selected_feature])
customer_value = customer_data_copy[selected_feature].iloc[customer_index]
# Create bins (adjust number of bins as needed)
bins = np.linspace(filtered_data[selected_feature].min(), filtered_data[selected_feature].max(), 10) # 10 bins
# Calculate bin width (assuming equally spaced bins)
bin_width = bins[1] - bins[0]
        # Count data points within each bin for all customers
        counts_all, bins_all = np.histogram(filtered_data[selected_feature], bins=bins)
        # Find the bin of the customer value (clamped so the maximum value falls in the last bin)
        customer_bin_index = np.digitize(customer_value, bins=bins) - 1  # Adjust for zero-based indexing
        customer_bin_index = min(max(customer_bin_index, 0), len(counts_all) - 1)
        # Create bar chart with bins and log scale on y-axis
        fig, ax = plt.subplots()
        ax.bar(bins_all[:-1] + bin_width/2, counts_all, width=bin_width, color='gray', alpha=0.7, label='All Clients')
        # Highlight only the customer's bin, using that bin's own count as the bar height
        ax.bar(bins_all[customer_bin_index] + bin_width/2, counts_all[customer_bin_index], width=bin_width, color='red', label='Current Customer')
ax.set_xlabel(selected_feature) # Adjust label based on feature
ax.set_ylabel('Count (Log Scale)') # Update label
ax.set_title(f'Distribution of {selected_feature} (Binned)')
ax.set_yscale('log') # Set log scale for y-axis
ax.legend()
plt.tight_layout()
st.pyplot(plt.gcf())
# =========================================================================
# BIVARIATE GRAPHS
# ========================================================================
st.subheader('III.2. Bivariate Analysis')
# Feature selection (assuming UI elements are already defined)
all_features = customer_data_copy.select_dtypes(include=[np.number]) # Adjust for categorical features if needed
feature1 = st.selectbox('Select Feature 1:', all_features.columns, index=all_features.columns.get_loc('AMT_INCOME_TOTAL'))
feature2 = st.selectbox('Select Feature 2:', all_features.columns, index=all_features.columns.get_loc('AMT_ANNUITY'))
feature1_description = feature_descriptions[feature_descriptions["Row"] == feature1]["Description"].iloc[0] # Find description for the selected feature
st.write(f"Feature 1 : **{feature1_description}**")
feature2_description = feature_descriptions[feature_descriptions["Row"] == feature2]["Description"].iloc[0] # Find description for the selected feature
st.write(f"Feature 2 : **{feature2_description}**")
# Data preparation for bivariate plot
def prepare_bivariate_data(customer_data, feature1, feature2):
# Select and prepare features
feature1_values = customer_data[feature1]
feature2_values = customer_data[feature2]
return feature1_values, feature2_values
        # Bivariate plot generation
        def generate_bivariate_plot(feature1_values, feature2_values, customer_data, sk_id_curr):
            # Clear the current figure before drawing the new one
            plt.clf()
            # Extract data for the current customer
            customer_data = customer_data[customer_data['SK_ID_CURR'] == sk_id_curr]
            customer_feature1 = customer_data[feature1].iloc[0]
            customer_feature2 = customer_data[feature2].iloc[0]
            # Create the plot
            sns.scatterplot(x=feature1_values, y=feature2_values, color='grey')
            plt.xlabel(feature1)
            plt.ylabel(feature2)
            plt.title(f"Bivariate Analysis: {feature1} vs. {feature2}")
            # Highlight the current customer with a red point
            plt.scatter(customer_feature1, customer_feature2, color='red', marker='o', label='Current Customer')
            plt.yscale('log')  # Set log scale for y-axis
            plt.xscale('log')  # Set log scale for x-axis
            plt.legend()
            # Display the plot
            st.pyplot(plt.gcf())
        # Generate the bivariate plot (rendered directly, without a button)
feature1_values, feature2_values = prepare_bivariate_data(customer_data_copy, feature1, feature2)
generate_bivariate_plot(feature1_values, feature2_values, customer_data, sk_id_curr)
# =========================================================================
# PREDICTION USING MODEL FOR SELECTED CUSTOMER
# =========================================================================
st.header("IV. Model Prediction - Probability of Default ")
st.subheader("IV.1. Probability of Loan Default")
input_df = pd.DataFrame([input_data])
probability_class1 = model.predict_proba(input_df)[:, 1] # Get the raw prediction score
# Extract scalar value if probability is an array
if isinstance(probability_class1, np.ndarray):
probability_class1 = probability_class1[0]
# Convert probability to human-readable format
prediction_label = "Refused" if probability_class1 >= optimal_threshold else "Accepted"
        # Display the probability as a gauge
        streamviz.gauge(
            probability_class1, gSize="LRG",
            gTitle=f"Probability of Loan Default (Threshold: {optimal_threshold:.2%})", sFix="%",
            grLow=0, grMid=optimal_threshold, gcLow="#1B8720",
            gcMid="#1B8720", gcHigh="#FF1708")
if prediction_label == "Accepted":
st.markdown("<p style='text-align: center; font-size: 40px; color: green;'>The loan application is approved.</p>", unsafe_allow_html=True)
else:
st.markdown("<p style='text-align: center; font-size: 40px; color: red;'>The loan application is declined.</p>", unsafe_allow_html=True)
if prediction_label == "Refused":
# =========================================================================
# SHAP VALUES FOR SELECTED CUSTOMER
# =========================================================================
# SHAP VALUES
st.subheader("IV.2. Importance Feature Analysis (SHAP)")
final_estimator = get_final_estimator(model)
explainer = shap.TreeExplainer(final_estimator)
shap_values = explainer.shap_values(input_df)
            instance_index = 0  # Always 0 since input_df contains a single customer
df_feature_importance_instance = shap_values_to_dataframe_instance(shap_values, feature_names, instance_index)
# =========================================================================
# TOP 10 POSITIVE OR NEGATIVE FEATURES
# TABLE and MODIFICATION OF VALUES
# =========================================================================
# Create two columns for the plots
col1, col2 = st.columns(2)
# Top 10 Positive Features
top_10_positive = df_feature_importance_instance.head(10) # Get the top 10 rows (positive SHAP values)
modified_input_data = input_data.copy() # Create a copy of the input data
# Plot for Top 10 Positive Features
with col1:
st.header("TOP 10 POSITIVE Features")
st.write(top_10_positive)
st.markdown("---")
st.header("Modify Top 10 Positive Features")
for feature in top_10_positive['Feature']:
try:
modified_value = st.number_input(f"POSITIVE - Modify {feature} - Default Value: {round(float(input_data[feature]), 2)}:", value=float(input_data[feature]))
modified_input_data[feature] = modified_value
                    except KeyError:
                        # Skip features that are not present in the raw input data
                        pass
# Top 10 Negative Features
top_10_negative = df_feature_importance_instance.tail(10) # Get the last 10 rows (negative SHAP values)
# Plot for Top 10 Negative Features
with col2:
st.header("TOP 10 NEGATIVE Features")
st.write(top_10_negative)
st.markdown("---")
st.header("Modify Top 10 Negative Features")
for feature in top_10_negative['Feature']:
try:
modified_value = st.number_input(f"NEGATIVE - Modify {feature} - Default Value: {round(float(input_data[feature]), 2)}:", value=float(input_data[feature]))
modified_input_data[feature] = modified_value
                    except KeyError:
                        # Skip features that are not present in the raw input data
                        pass
# =========================================================================
# UPDATE PREDICTION WITH THE MODIFIED VALUES
# =========================================================================
# Button to make a new prediction with modified data
if st.button("Re-Predict with Modified Features"):
probability_class1, prediction_label = make_prediction(modified_input_data, model, optimal_threshold)
# st.write(f"With the modified features, the probability of default on the loan is now estimated to be {probability_class1 * 100:.2f}% (Threshold: {optimal_threshold * 100:.2f}%).")
streamviz.gauge(
probability_class1, gSize="LRG", gTitle="Probability of Loan Default (Treshold: 63.64%)", sFix="%",
grLow=0, grMid=optimal_threshold, gcLow="#1B8720",
gcMid="#1B8720", gcHigh="#FF1708")
if prediction_label == "Accepted":
st.markdown("<p style='text-align: center; font-size: 40px; color: green;'>The loan application is approved.</p>", unsafe_allow_html=True)
else:
st.markdown("<p style='text-align: center; font-size: 40px; color: red;'>The loan application is still declined.</p>", unsafe_allow_html=True)
st.markdown("---")
else:
st.write("SK_ID_CURR not found.")
if __name__ == "__main__":
main()