ZhangKaede
/
DS-internship-project-Hands-on-Projects-on-Regression-and-Classification-Algorithms
Public
forked from EmAcq0812/Hands-on-Projects-on-Regression-and-Classification-Algorithms
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLogReg_LoanAppProj.py
96 lines (62 loc) · 2.5 KB
/
LogReg_LoanAppProj.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import pandas as pd
# Load the loan-application data and work on a copy, so the raw frame stays
# available for comparison while the cleaning steps below are applied.
loan_data = pd.read_csv("C:/Users/AcquahEmmanuel/Desktop/Udemy DS/Section 4/Logistic/01Exercise1.csv")
loan_prep = loan_data.copy()

# Per-column count of missing values before cleaning. The result is bound to a
# name because a bare expression is silently discarded when the file is run as
# a script.
missing_before = loan_prep.isnull().sum(axis=0)

# We simply drop every row that contains a missing value.
loan_prep = loan_prep.dropna()
missing_after = loan_prep.isnull().sum(axis=0)

# Drop irrelevant columns: "gender".
loan_prep = loan_prep.drop(columns=['gender'])

# Create dummy variables for the categorical features.
# No explicit cast to 'category' is needed: with its default `columns=None`,
# get_dummies encodes every object/string/category column on its own.
# drop_first=True keeps k-1 indicator columns per feature.
column_dtypes = loan_prep.dtypes
loan_prep = pd.get_dummies(loan_prep, drop_first=True)
# Split the data first, then normalize the numeric columns.
# The scaler must learn its mean/std from the training rows only; fitting it on
# the full data set before splitting lets test-set statistics leak into
# training and makes the evaluation optimistic.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target is the dummy-encoded loan status; every remaining column is a feature.
Y = loan_prep[['status_Y']]
X = loan_prep.drop(['status_Y'], axis=1)

# 70/30 split, stratified on the target so both parts keep the class balance;
# the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1234, stratify=Y
)

# Work on copies so the column assignments below do not write into X.
x_train = x_train.copy()
x_test = x_test.copy()

# Fit on the training rows, then apply the same transformation to the test rows.
numeric_cols = ['income', 'loanamt']
scalar_ = StandardScaler()
x_train[numeric_cols] = scalar_.fit_transform(x_train[numeric_cols])
x_test[numeric_cols] = scalar_.transform(x_test[numeric_cols])
# Build the logistic model.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

# y_train is a one-column DataFrame; flatten it to a 1-D array so fit() gets
# the (n_samples,) target it expects instead of a column vector (which makes
# scikit-learn emit a DataConversionWarning).
lr.fit(x_train, y_train.to_numpy().ravel())

# Class predictions for the held-out rows, wrapped in a DataFrame.
y_pred = pd.DataFrame(lr.predict(x_test))
# Evaluate the predictions returned by lr.predict on the held-out set.
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    accuracy_score,
)

# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Mean accuracy as reported by the estimator itself.
score = lr.score(X=x_test, y=y_test)

# Per-class precision, recall (sensitivity), F1 score and support.
cr = classification_report(y_true=y_test, y_pred=y_pred)

# Accuracy again, this time computed from the stored predictions.
score2 = accuracy_score(y_true=y_test, y_pred=y_pred)
# Getting the probabilities of predictions: one row per test sample, one
# column per class.
Y_prob = lr.predict_proba(x_test)

# We want to access the positive column (1) of Y_prob.
Y_prob_2 = Y_prob[:, 1]

# Make new predictions from Y_prob_2 with an increased threshold: a row is
# labelled positive (1) only when its probability is strictly above the
# threshold, otherwise negative (0).
threshold = 0.8
Y_new_pred = [1 if prob > threshold else 0 for prob in Y_prob_2]

# Same metrics as for the default predictions, now for the stricter labels.
cr_1 = classification_report(y_test, Y_new_pred)
score3 = accuracy_score(y_test, Y_new_pred)
cm_2 = confusion_matrix(y_test, Y_new_pred)
# Get the AUC and plot the ROC curve.
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# roc_curve returns the array of cut-offs it evaluated; keep it under its own
# name so it does not overwrite the scalar decision `threshold` used earlier.
fpr, tpr, roc_thresholds = roc_curve(y_test, Y_prob_2)
AUC = roc_auc_score(y_test, Y_prob_2)

plt.plot(fpr, tpr, linewidth=4)
plt.title('ROC curve for loan prediction')
plt.xlabel('False positive')
plt.ylabel('True positive')
plt.grid()
# Without an explicit show() the figure is never displayed when the file is
# run as a plain script.
plt.show()