# bbb_10_cvtesting.py

# -*- coding: utf-8 -*-
"""bbb_10_CVTesting.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1HMv-OG-Crkk_8vw_wz_CftKfaVDsLArF
"""

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold, LeaveOneOut, train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier, Perceptron
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier
import numpy as np

# Read both CSV splits and stack them row-wise under a fresh 0..n-1 index.
train = pd.read_csv('/content/train_data77up.csv')
test = pd.read_csv('/content/test_data23up.csv')
df = pd.concat([train, test], axis=0).reset_index(drop=True)

# Label column and feature frame; both names are rebound to NumPy arrays below.
y = df['target']
X = df.drop(columns=['target'])

from sklearn.preprocessing import StandardScaler

dataset = df
Y1 = dataset['target'].to_numpy()
X1 = dataset.drop(columns=['target']).to_numpy()

# The scaler is fitted on the complete combined data (train + test rows).
std_scale = StandardScaler()
X1 = std_scale.fit_transform(X1)
# Cast to float32 and replace NaN / infinite entries with finite numbers.
X1 = np.nan_to_num(X1.astype('float32'))

# From here on X and y are the scaled feature matrix and the label vector.
X = X1
y = Y1


from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, matthews_corrcoef

# Four tree-ensemble classifiers, each evaluated with shuffled 10-fold
# cross-validation on the scaled feature matrix X and label vector y.
classifiers = [
    XGBClassifier(n_estimators=400, max_depth=9, learning_rate=0.1),
    ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=2),
    LGBMClassifier(n_estimators=200, max_depth=5, learning_rate=0.1),
    RandomForestClassifier(),
]

clf_names = ['XGB', 'ET', 'LGBM', 'RF']
n_folds = 10

# Loop over the classifiers
for clf, clf_name in zip(classifiers, clf_names):
    print('Classifier:', clf_name)
    # Per-fold scores for this classifier
    fold_mcc = []
    fold_sp = []
    fold_sen = []
    fold_acc = []
    # A new shuffled splitter is created per classifier (no fixed seed), so
    # the folds differ between classifiers and between runs.
    kf = KFold(n_splits=n_folds, shuffle=True)
    for train_idx, val_idx in kf.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]
        # Fit on the training part, predict the held-out part
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)

        # Specificity = TN / (TN + FP), taken from the confusion matrix of the
        # true labels against the PREDICTIONS. (Comparing y_val with itself,
        # as before, always yields FP == 0 and a specificity of 1.0.)
        # labels=[0, 1] keeps the matrix 2x2 even when a fold or its
        # predictions contain a single class; class 1 is the positive class,
        # matching pos_label=1 below.
        tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=[0, 1]).ravel()
        sp = tn / (tn + fp)

        # Remaining scores for the fold
        mcc = matthews_corrcoef(y_val, y_pred)
        acc = accuracy_score(y_val, y_pred)
        sen = recall_score(y_val, y_pred, pos_label=1)
        fold_mcc.append(mcc)
        fold_sp.append(sp)
        fold_sen.append(sen)
        fold_acc.append(acc)

    # Average the fold scores for the classifier
    avg_mcc = np.mean(fold_mcc)
    avg_sp = np.mean(fold_sp)
    avg_sen = np.mean(fold_sen)
    avg_acc = np.mean(fold_acc)
    print('MCC:', avg_mcc)
    print('Specificity:', avg_sp)
    print('Sensitivity:', avg_sen)
    print('Accuracy:', avg_acc)
    print('\n')

"""**Stacking**"""

import numpy as np
from sklearn.model_selection import cross_val_predict, KFold,StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef, precision_score, recall_score
import xgboost as xgb

# Manual stacking evaluated with 10-fold cross-validation on the feature
# matrix X and label vector y prepared earlier in the script.

# Base classifiers. NOTE: the name `svc` is historical; it holds an
# ExtraTreesClassifier, not a support-vector classifier.
svc = ExtraTreesClassifier()
xgb_clf = xgb.XGBClassifier()

# Meta-classifier, trained on the base classifiers' class-1 probabilities
lr = LogisticRegression()

# Held-out stacked features and their true labels, accumulated over all folds
base_predictions = []
true_labels = []

# Evaluation metrics, one entry per outer fold
f1_scores = []
accuracies = []
mcc_scores = []
precisions = []
recalls = []

# Outer 10-fold cross-validation (shuffled, no fixed seed)
kf = KFold(n_splits=10, shuffle=True)

for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Meta-learner training features: out-of-fold probabilities of the base
    # classifiers on the TRAINING part only (inner 5-fold CV on clones of the
    # estimators). This keeps the held-out fold completely unseen by the
    # meta-learner; previously it was fitted on, and then scored against, the
    # very same held-out samples, which reports training performance.
    svc_oof = cross_val_predict(svc, X_train, y_train, cv=5, method='predict_proba')
    xgb_oof = cross_val_predict(xgb_clf, X_train, y_train, cv=5, method='predict_proba')
    meta_train_features = np.column_stack((svc_oof[:, 1], xgb_oof[:, 1]))
    lr.fit(meta_train_features, y_train)

    # Fit base classifiers on the full training part of the fold
    svc.fit(X_train, y_train)
    xgb_clf.fit(X_train, y_train)

    # Predict probabilities for the held-out part
    svc_proba = svc.predict_proba(X_test)
    xgb_proba = xgb_clf.predict_proba(X_test)

    # Combine the class-1 probabilities as the meta-learner's input features
    stacked_features = np.column_stack((svc_proba[:, 1], xgb_proba[:, 1]))

    # Keep a running record of held-out features and labels
    true_labels.extend(y_test)
    base_predictions.extend(stacked_features)

    # Meta-learner predictions for this fold's held-out samples only
    meta_predictions = lr.predict(stacked_features)

    # Compute evaluation metrics for this fold
    f1 = f1_score(y_test, meta_predictions)
    accuracy = accuracy_score(y_test, meta_predictions)
    mcc = matthews_corrcoef(y_test, meta_predictions)
    precision = precision_score(y_test, meta_predictions)
    recall = recall_score(y_test, meta_predictions)

    # Store evaluation metrics for the fold
    f1_scores.append(f1)
    accuracies.append(accuracy)
    mcc_scores.append(mcc)
    precisions.append(precision)
    recalls.append(recall)

    # Print evaluation metrics for the fold
    print(f"Fold {fold+1}:")
    print("F1 Score:", f1)
    print("Accuracy:", accuracy)
    print("MCC:", mcc)
    print("Precision:", precision)
    print("Recall:", recall)
    print("")

# Print average evaluation metrics across all folds
print("Average Evaluation Metrics:")
print("Average F1 Score:", np.mean(f1_scores))
print("Average Accuracy:", np.mean(accuracies))
print("Average MCC:", np.mean(mcc_scores))
print("Average Precision:", np.mean(precisions))
print("Average Recall:", np.mean(recalls))

# Notebook residue: ten bare numeric expression statements whose values are
# discarded when this file runs as a script. The same ten numbers are used as
# the 'Accuracy' list of the `st10` table further down.
0.788235294117647,
0.788235294117647,
0.796078431372549,
0.8088235294117647,
0.8023529411764706,
0.8,
0.8033613445378152,
0.8132352941176471,
0.807843137254902,
0.8117647058823529

import numpy as np
from sklearn.ensemble import ExtraTreesClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc,roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from xgboost import XGBClassifier

# Stand-alone instances of the individual models. The former trailing commas
# after the first two assignments bound `xgb` and `et` to 1-element tuples
# instead of estimators. NOTE: `xgb` rebinds the `import xgboost as xgb`
# module alias from earlier in the file.
xgb = XGBClassifier(n_estimators=400, max_depth=9, learning_rate=0.1)
et = ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=2)
lgbm = LGBMClassifier(n_estimators=200, max_depth=5, learning_rate=0.1)
rf = RandomForestClassifier()


# Base learners of the stacked ensemble, as (name, estimator) pairs
base_learners = [
    ('ET', ExtraTreesClassifier(n_estimators=100, max_depth=None, min_samples_split=2)),
    ('XGB', XGBClassifier(n_estimators=400, max_depth=9, learning_rate=0.1))
]

# Meta-learner that combines the base learners' outputs
meta_learner = LogisticRegression()

# Stacking classifier built from the base learners and the meta-learner
stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_learner)

"""**Blending**"""

from sklearn.model_selection import train_test_split
# Hold out 20% of the data for testing the blend; the seed fixes the split.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(X, y, test_size=0.2, random_state=42)

# Number of stratified folds used inside run()
n_folds = 10

# Level-0 (base) classifiers of the blend
clfs = [RandomForestClassifier(), LGBMClassifier()]

def run(features_train, labels_train, features_test, clfs, n_folds = 10, labels_test = None, submission = False):
    """Blend the level-0 classifiers in ``clfs`` with a logistic-regression level-1 model.

    Every classifier is fitted once per stratified fold of the training data.
    Its predictions on the held-out part of each fold fill one column of the
    level-1 training matrix; its per-fold predictions on ``features_test`` are
    averaged and thresholded at 0.5 (a majority vote) to fill the matching
    column of the level-1 test matrix. A ``LogisticRegression`` is then fitted
    on the level-1 training matrix and applied to the level-1 test matrix.

    Args:
        features_train: Training features; indexed with integer index arrays,
            so a NumPy array is expected.
        labels_train: Training labels (converted with ``np.array``).
        features_test: Test features, same column layout as ``features_train``.
        clfs: List of estimators offering ``fit`` and ``predict``; they are
            refitted in place.
        n_folds: Number of stratified folds.
        labels_test: True test labels. Required unless ``submission`` is truthy.
        submission: When truthy, all evaluation output (metrics and the
            ``ft.csv`` ROC file) is skipped.

    Returns:
        None. Results are printed; in evaluation mode the ROC points of the
        blended model are written to ``ft.csv`` in the working directory.

    Raises:
        ValueError: If ``labels_test`` is missing in evaluation mode.

    Note:
        The printed confusion-matrix metrics assume a binary problem and treat
        the label that sorts second (row/column 1) as the positive class.
    """
    import numpy as np

    # Fail early with a clear message instead of deep inside a metric call.
    if not submission and labels_test is None:
        raise ValueError("labels_test is required when submission is False")

    # Convert the labels once rather than on every fold.
    labels_train_arr = np.array(labels_train)

    skfold = StratifiedKFold(n_splits=n_folds)
    # One (train indices, validation indices) pair per fold
    skf = list(skfold.split(features_train, labels_train))

    # Pre-allocate the stacked datasets: samples x classifiers
    blend_train = np.zeros((features_train.shape[0], len(clfs)))
    blend_test = np.zeros((features_test.shape[0], len(clfs)))

    if submission:
        print("\nThis run is on entire training dataset from train.csv and we will create a submission file in this run. :)")

    print ('\nfeatures_train.shape = %s' % (str(features_train.shape)))
    print ('features_test.shape = %s' % (str(features_test.shape)))
    print ('blend_train.shape = %s' % (str(blend_train.shape)))
    print ('blend_test.shape = %s' % (str(blend_test.shape)))

    # Each classifier is trained once per fold
    for j, clf in enumerate(clfs):
        print ("\n#####################################################")
        print ('\nTraining classifier [%s]' % (str(j)))
        # Test-set predictions of this classifier, one column per fold
        blend_test_j = np.zeros((features_test.shape[0], len(skf)))
        for i, (train_index, cv_index) in enumerate(skf):
            print ('Fold [%s]' % (str(i)))

            X_train = features_train[train_index]
            Y_train = labels_train_arr[train_index]
            X_cv = features_train[cv_index]

            clf.fit(X_train, Y_train)

            # Out-of-fold predictions become the level-1 training feature
            blend_train[cv_index, j] = clf.predict(X_cv)
            blend_test_j[:, i] = clf.predict(features_test)

        # Majority vote over the folds: mean prediction thresholded at 0.5.
        # The level-1 test feature is therefore a hard 0/1 label, just like
        # the corresponding blend_train column.
        blend_test[:, j] = np.where(blend_test_j.mean(axis=1) >= 0.5, 1.0, 0.0)
        pred = blend_test[:, j]
        if not submission:
            # Accuracy of this level-0 classifier on its own
            print ("accuracy_score : ",accuracy_score(labels_test,pred))

    print ('\nlen(labels_train) = %s' % (str(len(labels_train))))

    # Start blending: fit the level-1 model and predict the test set
    bclf = LogisticRegression()
    bclf.fit(blend_train, labels_train)
    Y_test_predict = bclf.predict(blend_test)

    if not submission:
        # Accuracy of the level-1 (blended) classifier
        print ("\naccuracy_score : ",accuracy_score(labels_test,Y_test_predict))
        conf_mat = confusion_matrix(labels_test,Y_test_predict)
        print('Confusion Matrix BL: \n', conf_mat)

        # Rows are true labels and columns are predictions, in sorted label
        # order, so the 2x2 matrix unpacks as [[TN, FP], [FN, TP]].
        tn, fp = conf_mat[0, 0], conf_mat[0, 1]
        fn, tp = conf_mat[1, 0], conf_mat[1, 1]

        accuracy1 = (tn + tp) / conf_mat.sum()
        print ('Accuracy : ', accuracy1)

        # Sensitivity is the recall of the positive class: TP / (TP + FN).
        # (The two formulas below used to be swapped.)
        sensitivity1 = tp / (tp + fn)
        print('Sensitivity : ', sensitivity1 )

        # Specificity is the recall of the negative class: TN / (TN + FP).
        specificity1 = tn / (tn + fp)
        print('Specificity : ', specificity1)
        print("MCC BL: ",matthews_corrcoef(labels_test,Y_test_predict)*100)

        # Class-1 probabilities of the blended model on the test set
        prob_test = bclf.predict_proba(blend_test)[:, 1]

        # AUC is computed from the probability scores so that it agrees with
        # the ROC curve saved below; hard labels give only one operating point.
        print("AUC BL: ",roc_auc_score(labels_test,prob_test)*100)

        # False/true positive rates of the ROC curve, saved for later plotting
        fpr, tpr, thresholds = roc_curve(labels_test, prob_test)
        fpr = pd.DataFrame(fpr, columns=["fpr"])
        tpr = pd.DataFrame(tpr, columns=["tpr"])
        ft=pd.concat([fpr,tpr],axis=1)
        ft.to_csv('ft.csv')

    print ("===========================================================================================================================")

# Evaluate the blend on the hold-out split. With labels supplied and
# submission left at its default, run() also writes the blended model's
# ROC points to ft.csv.
run(features_train, labels_train, features_test, clfs, n_folds, labels_test)

# now, we train our model on complete data from train.csv file and test on data from test.csv file before we make our submission
#run(X_new, y, X_test, clfs, n_folds, submission = True)

# Reload the saved ROC points; 'Unnamed: 0' is the index column that the CSV
# round trip adds.
a = pd.read_csv('ft.csv')
blrate = a.drop(columns=['Unnamed: 0'])

blfpr = np.array(blrate['fpr'])
bltpr = np.array(blrate['tpr'])


cv = 10
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve, auc

# Out-of-fold class-probability estimates of each single model
# (default hyper-parameters, `cv` folds).
lgbm = cross_val_predict(LGBMClassifier(), X, y, method='predict_proba', cv=cv)
et = cross_val_predict(ExtraTreesClassifier(), X, y, method='predict_proba', cv=cv)
rf = cross_val_predict(RandomForestClassifier(), X, y, method='predict_proba', cv=cv)
xgb = cross_val_predict(XGBClassifier(), X, y, method='predict_proba', cv=cv)

# The same for the stacked ensemble
st = cross_val_predict(stacking_clf, X, y, method='predict_proba', cv=cv)

# ROC points from the class-1 probability column of each model.
# NOTE: the ada_* names hold the XGBoost curve.
ada_fpr, ada_tpr, thresholds = roc_curve(y, xgb[:, 1])
lgbm_fpr, lgbm_tpr, thresholds = roc_curve(y, lgbm[:, 1])
rf_fpr, rf_tpr, thresholds = roc_curve(y, rf[:, 1])
et_fpr, et_tpr, thresholds = roc_curve(y, et[:, 1])
st_fpr, st_tpr, thresholds = roc_curve(y, st[:, 1])

# Areas under the curves
ada_auc = auc(ada_fpr, ada_tpr)
lgbm_auc = auc(lgbm_fpr, lgbm_tpr)
rf_auc = auc(rf_fpr, rf_tpr)
et_auc = auc(et_fpr, et_tpr)
st_auc = auc(st_fpr, st_tpr)

# Blending AUC from the ROC points loaded from ft.csv
bl_auc = auc(blfpr, bltpr)

# replace X1 with X_test and Y1 with y_test
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

plt.figure(figsize=(20, 10), dpi=600)

# Diagonal reference line
plt.plot([0, 1], [0, 1], linestyle="--", lw=2,  label="Chance", alpha=0.8)

# (fpr, tpr, legend label, line styling) for every model, in drawing order
roc_curves = [
    (rf_fpr, rf_tpr, 'RF (auc = %0.3f)' % rf_auc, {'marker': '.'}),
    (et_fpr, et_tpr, 'ET (auc = %0.3f)' % et_auc, {'marker': '.'}),
    (st_fpr, st_tpr, 'Stacking (ET,XGB) (auc = %0.3f)' % st_auc, {'linestyle': '-'}),
    (blfpr, bltpr, 'Blending (RF,LGBM) (auc = %0.3f)' % bl_auc, {'linestyle': '-'}),
    (ada_fpr, ada_tpr, 'XGB (auc = %0.3f)' % ada_auc, {'linestyle': '-'}),
    (lgbm_fpr, lgbm_tpr, 'LGBM (auc = %0.3f)' % lgbm_auc, {'linestyle': '-', 'color': 'black'}),
]
for curve_fpr, curve_tpr, curve_label, curve_style in roc_curves:
    plt.plot(curve_fpr, curve_tpr, label=curve_label, **curve_style)

# plt.xlabel('False Positive Rate -->')
# plt.ylabel('True Positive Rate -->')

plt.legend(loc="lower right", fontsize=20, ncol=1)

plt.show()

# Notebook residue: the mean below is computed but its value is discarded
# when this file runs as a script.
np.mean([0.5297297297297298,
0.4818918918918919,
0.558918918918919,
0.5421621621621622,
0.6158695652173914])

# Bare literal with no effect (pasted cell output).
0.5459459459459459
# Number of folds used by the cross_val_score calls that follow.
cv=10

# Bare name expression: evaluates `xgb` and discards the result.
xgb

# Per-fold cross-validation scores of each model with default
# hyper-parameters, evaluated one after another in the listed order.
lgbm, et, rf, xgb, svc = (
    cross_val_score(model, X, y, cv=cv)
    for model in (
        LGBMClassifier(),
        ExtraTreesClassifier(),
        RandomForestClassifier(),
        XGBClassifier(),
        SVC(probability=True),
    )
)
# st=cross_val_predict(MLPClassifier(), X, y,cv=cv,method='predict_proba')

import numpy as np
#lst_accu_stratifiedlgbm5.append(np.mean(lst_accu_stratifiedlgbm5))
# Hand-entered per-fold accuracies (folds 1..10), one table per method.
# Each table has the columns 'Accuracy', 'folds' and a constant 'algo' tag.
st10 = pd.DataFrame({
    'Accuracy': [
        0.788235294117647,
        0.788235294117647,
        0.796078431372549,
        0.8088235294117647,
        0.8023529411764706,
        0.8,
        0.8033613445378152,
        0.8132352941176471,
        0.807843137254902,
        0.8117647058823529,
    ],
    'folds': list(range(1, 11)),
    'algo': 'stack',
})

rf10 = pd.DataFrame({
    'Accuracy': [
        0.76470588, 0.72941176, 0.76470588, 0.74117647, 0.78823529,
        0.81176471, 0.75294118, 0.76470588, 0.77647059, 0.81176471,
    ],
    'folds': list(range(1, 11)),
    'algo': 'rf',
})

lgbm10 = pd.DataFrame({
    'Accuracy': [
        0.8, 0.74117647, 0.76470588, 0.78823529, 0.82352941,
        0.8, 0.78823529, 0.75294118, 0.83529412, 0.83529412,
    ],
    'folds': list(range(1, 11)),
    'algo': 'lgbm',
})

# NOTE(review): the variable is named ada10 but the rows are tagged 'ridge';
# confirm which model these numbers belong to.
ada10 = pd.DataFrame({
    'Accuracy': [
        0.81176471, 0.74117647, 0.76470588, 0.76470588, 0.78823529,
        0.81176471, 0.8, 0.78823529, 0.84705882, 0.82352941,
    ],
    'folds': list(range(1, 11)),
    'algo': 'ridge',
})

# svm10 = {'Accuracy':[0.50810811, 0.57297297, 0.56216216, 0.47567568, 0.61413043],
#         'folds': [1,2,3,4,5],
#       'algo': 'svm'}
# svm10=pd.DataFrame(svm10)

et10 = pd.DataFrame({
    'Accuracy': [
        0.8, 0.76470588, 0.8, 0.8, 0.83529412,
        0.87058824, 0.81176471, 0.83529412, 0.84705882, 0.82352941,
    ],
    'folds': list(range(1, 11)),
    'algo': 'et',
})

# NOTE(review): every value here equals the matching st10 value minus 0.01;
# confirm these are measured blending results.
bl10 = pd.DataFrame({
    'Accuracy': [
        0.778235294117647,
        0.778235294117647,
        0.786078431372549,
        0.7988235294117647,
        0.7923529411764706,
        0.79,
        0.7933613445378152,
        0.8032352941176471,
        0.797843137254902,
        0.8017647058823529,
    ],
    'folds': list(range(1, 11)),
    'algo': 'bl',
})

# Long-format table of all methods; each part keeps its own 0..9 row index.
v10 = pd.concat([rf10, et10, ada10, lgbm10, bl10, st10], axis=0)

from matplotlib import pyplot
import seaborn
# Figure size in inches (width, height)
a4_dims = (10, 6)
fig, ax = pyplot.subplots(figsize=a4_dims, dpi=600)
# Violin plot of accuracy grouped by fold number, drawn on the axes above
seaborn.violinplot(x=v10["folds"], y=v10["Accuracy"], ax=ax)