# bbb_ind.py

# -*- coding: utf-8 -*-
"""bbb_IND.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1ry3pVlPNqudL5laXgl0jm-kFTfzWi2f7
"""

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold, LeaveOneOut, train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier, Perceptron
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
import numpy as np

# Load the pre-split training and test tables, and also build a combined
# frame (train rows first, then test rows) with a fresh 0..n-1 index.
train = pd.read_csv('/content/train_data77up.csv')
test = pd.read_csv('/content/test_data23up.csv')
df = pd.concat([train, test], axis=0).reset_index(drop=True)

# df=df.sample(10)

# Separate the 'target' column from the remaining columns for the training
# split, the test split, and the combined frame.
X_train = train.drop(columns=['target'])
y_train = train['target']

X_test = test.drop(columns=['target'])
y_test = test['target']

X = df.drop(columns=['target'])
y = df['target']

from sklearn.preprocessing import StandardScaler,MinMaxScaler

# Build the NumPy feature matrices and label vectors that the models consume.
# NOTE(review): the label column used here is 'class', while the earlier
# split in this file uses 'target' -- confirm which column is the real label.
# If both columns exist, 'target' remains inside the feature matrix.
#dataset = pd.read_csv('df.csv', sep=',')
dataset = train
X_train = dataset.drop(['class'], axis=1).to_numpy()
y_train = dataset['class'].to_numpy()

dataset1 = test
X_test = dataset1.drop(['class'], axis=1).to_numpy()
y_test = dataset1['class'].to_numpy()

# Fit the min-max scaler on the training features ONLY and reuse the same
# fitted scaler for the test features. Fitting a second scaler on the test
# set would map train and test onto different scales and would use test-set
# statistics during preprocessing.
std_scale = MinMaxScaler().fit(X_train)
X_train = std_scale.transform(X_train)
X_test = std_scale.transform(X_test)

# Cast to float32 and replace NaN / infinite entries with finite numbers.
X_train = np.nan_to_num(X_train.astype('float32'))
X_test = np.nan_to_num(X_test.astype('float32'))

from xgboost import XGBClassifier

# Leftover notebook cell that displayed the array; a bare expression has no
# effect when this file runs as a script.
X_train

# Instantiate the five base classifiers. AdaBoost and random forest use
# library defaults; the other three use explicitly chosen hyper-parameters.
ada = AdaBoostClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier(n_estimators=400, max_depth=9, learning_rate=0.1)
et = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=2)
lgbm = LGBMClassifier(n_estimators=200, max_depth=5, learning_rate=0.1)

# Fit each model on the training data, in the same order as listed above.
for _estimator in (ada, rf, xgb, et, lgbm):
    _estimator.fit(X_train, y_train)

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, matthews_corrcoef

# Score every fitted base model on the test split.
y_true = y_test

# Predictions are collected in the order ada, lgbm, rf, et, xgb; the printed
# "Classifier" number is the 1-based position in this list.
y_pred_1, y_pred_2, y_pred_3, y_pred_5, y_pred_6 = preds = [
    fitted.predict(X_test) for fitted in (ada, lgbm, rf, et, xgb)
]

for i, y_pred in enumerate(preds, start=1):
    print("Classifier ", i)

    # Fraction of test samples predicted correctly.
    acc = accuracy_score(y_true, y_pred)
    print("Accuracy: ", acc)

    # Macro-averaged recall (unweighted mean of the per-class recalls).
    recall = recall_score(y_true, y_pred, average='macro')
    print("Recall: ", recall)

    # Specificity = TN / (TN + FP); unpacking four cells requires a 2x2
    # confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn + fp)
    print("Specificity: ", specificity)

    # Matthews correlation coefficient.
    mcc = matthews_corrcoef(y_true, y_pred)
    print("MCC: ", mcc)

    print("\n")

"""XGBClassifier:
Best Parameters: {'n_estimators': 400, 'max_depth': 9, 'learning_rate': 0.1}
Best Score: 0.7741176470588236

ExtraTreesClassifier:
Best Parameters: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': None}
Best Score: 0.7941176470588236

LGBMClassifier:
Best Parameters: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1}
Best Score: 0.7647058823529412

**Stacking**
"""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score

# Candidate base learners that are currently disabled:
# adast=AdaBoostClassifier()
# rfst=RandomForestClassifier()
# svcst=SVC(probability=True)
# lgbmst = LGBMClassifier()

# Base learners (library defaults) and the logistic-regression meta-learner.
xgbst = XGBClassifier()
etst = ExtraTreesClassifier()
lr = LogisticRegression(random_state=42)

# Assemble the stacking ensemble and train it on the training data.
base_learners = [('xgb', xgbst), ('et', etst)]
st = StackingClassifier(estimators=base_learners, final_estimator=lr)
st.fit(X_train, y_train)

# Predict the test split and report the same four metrics used for the
# individual base models.
y_pred = st.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Macro-averaged recall.
recall = recall_score(y_test, y_pred, average='macro')
print("Recall: ", recall)

# Specificity = TN / (TN + FP), taken from the 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
specificity = tn / (tn + fp)
print("Specificity: ", specificity)

# Matthews correlation coefficient.
mcc = matthews_corrcoef(y_test, y_pred)
print("MCC: ", mcc)

"""**Blending**"""

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Blending: base learners are fitted on one part of the training data, and
# the meta-learner is fitted on the base learners' predictions for a held-out
# part of the training data. The test split is used for evaluation only.
# (Fitting the meta-learner on test-set predictions together with y_test,
# and then scoring on that same data, leaks the test labels into the model.)
X_blend_fit, X_blend_hold, y_blend_fit, y_blend_hold = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

lgbmbl = LGBMClassifier(random_state=42)
rfbl = RandomForestClassifier()

# etbl = ExtraTreesClassifier(random_state=42)

# Fit the base learners on the fitting part of the training data.
lgbmbl = lgbmbl.fit(X_blend_fit, y_blend_fit)
rfbl = rfbl.fit(X_blend_fit, y_blend_fit)

# Averaged positive-class probability on the hold-out part: this is the
# single input feature the meta-learner is trained on.
hold_blend_preds = (
    lgbmbl.predict_proba(X_blend_hold)[:, 1]
    + rfbl.predict_proba(X_blend_hold)[:, 1]
) / 2.0

# Same averaged probability for the test split.
# NOTE: `et_preds` holds the random-forest probabilities; the name is kept
# unchanged from the original script.
lgbm_preds = lgbmbl.predict_proba(X_test)[:, 1]
et_preds = rfbl.predict_proba(X_test)[:, 1]
# et_preds = lgbmbl.predict_proba(X_test)[:, 1]
blend_preds = (lgbm_preds + et_preds) / 2.0

# Train the meta-learner on the hold-out predictions and hold-out labels.
lr = LogisticRegression(random_state=42)
blend = lr.fit(hold_blend_preds.reshape(-1, 1), y_blend_hold)

# Final test-set predictions from the meta-learner.
final_preds = blend.predict(blend_preds.reshape(-1, 1))

# Evaluate the performance of the model on the untouched test split.
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, final_preds)
print("Accuracy:", accuracy)

# Macro-averaged recall.
recall = recall_score(y_test, final_preds, average='macro')
print("Recall: ", recall)

# Specificity = TN / (TN + FP), taken from the 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y_test, final_preds).ravel()
specificity = tn / (tn + fp)
print("Specificity: ", specificity)

# Matthews correlation coefficient.
mcc = matthews_corrcoef(y_test, final_preds)
print("MCC: ", mcc)

# ROC curves and AUC on the test split for the individual models and the
# stacking / blending ensembles.
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt


# Stacking ensemble
st_probs = st.predict_proba(X_test)
st_probs = st_probs[:, 1]
# st=st[:, 1]
st_auc = roc_auc_score(y_test, st_probs)
st_fpr, st_tpr, threshold = roc_curve(y_test, st_probs)


# Random forest
rf_probs = rf.predict_proba(X_test)
rf_probs = rf_probs[:, 1]
#clf3=clf3[:,1]
rf_auc = roc_auc_score(y_test, rf_probs)
rf_fpr, rf_tpr, threshold = roc_curve(y_test, rf_probs)

# lr_probs = lr.predict_proba(X_test)
# lr_probs = lr_probs[:, 1]
# #clf1=clf1[:,1]
# lr_auc = roc_auc_score(y_test, lr_probs)
# lr_fpr, lr_tpr, threshold = roc_curve(y_test, lr_probs)

#ADA
# ada_probs = ada.predict_proba(X_test)
# ada_probs = ada_probs[:, 1]
# #clf4=clf4[:,1]
# ada_auc = roc_auc_score(y_test, ada_probs)
# ada_fpr, ada_tpr, threshold = roc_curve(y_test, ada_probs)

# LightGBM
lgbm_probs = lgbm.predict_proba(X_test)
lgbm_probs = lgbm_probs[:, 1]
# clf6=clf6[:,1]
lgbm_auc = roc_auc_score(y_test, lgbm_probs)
lgbm_fpr, lgbm_tpr, thresholdb = roc_curve(y_test, lgbm_probs)

# Extra trees
et_probs = et.predict_proba(X_test)
et_probs = et_probs[:, 1]
#clf4=clf4[:,1]
et_auc = roc_auc_score(y_test, et_probs)
et_fpr, et_tpr, threshold = roc_curve(y_test, et_probs)

# XGBoost (the `svc_*` variable names are historical; the model is `xgb`)
svc_probs = xgb.predict_proba(X_test)
svc_probs = svc_probs[:, 1]
#clf5=clf5[:,1]
svc_auc = roc_auc_score(y_test, svc_probs)
svc_fpr, svc_tpr, thresholde = roc_curve(y_test, svc_probs)


# Blending ensemble: the meta-learner was fitted on the averaged base-learner
# probability, so it must be scored on that same feature (`blend_preds`).
# Feeding it the hard class labels in `final_preds` would give only two
# distinct scores and reduce the ROC curve to a single threshold.
blend_probs = blend.predict_proba(blend_preds.reshape(-1, 1))
blend_probs = blend_probs[:, 1]
#clf5=clf5[:,1]
blend_auc = roc_auc_score(y_test, blend_probs)
blend_fpr, blend_tpr, thresholde = roc_curve(y_test, blend_probs)


#['purple', 'orange', 'brown', 'gray', 'pink']

plt.figure(figsize=(20, 10), dpi=600)
# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], linestyle="--", lw=2,  label="Chance", alpha=0.8)
plt.plot(rf_fpr, rf_tpr, marker='.', label='RF (auc = %0.3f)' % rf_auc)
plt.plot(et_fpr, et_tpr, marker='.', label='ET (auc = %0.3f)' % et_auc)
plt.plot(svc_fpr, svc_tpr, linestyle='-', label='XGB (auc = %0.3f)' % svc_auc)
plt.plot(st_fpr, st_tpr, linestyle='-', label='Stacking (XGB,ET) (auc = %0.3f)' % st_auc)
plt.plot(blend_fpr, blend_tpr, linestyle='-', label='Blending (RF,LGBM) (auc = %0.3f)' % blend_auc)
# plt.plot(ada_fpr, ada_tpr, linestyle='-', label='ADA (auc = %0.3f)' % ada_auc)
plt.plot(lgbm_fpr, lgbm_tpr, linestyle='-',color='black', label='LGBM (auc = %0.3f)' % lgbm_auc)


# plt.xlabel('False Positive Rate -->')
# plt.ylabel('True Positive Rate -->')

plt.legend(loc="lower right", fontsize=20, ncol=1)

plt.show()