# Santander Customer Satisfaction.py

# -*- coding: utf-8 -*-
"""Univ.AI Project Uncertainty.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1QwjsA1fLdCkob7oZS-j-feo2IMnWuzsu
"""

# Mount Google Drive under /content/drive (Colab-only); the CSV files read later in this
# script are addressed through that mount point.
from google.colab import drive
drive.mount('/content/drive')

"""## Dataset you chose - **Santander Customer Satisfaction**

Team members names: 

* Suraj Kumar Mondal

* Saurabh Shetty

* Vishal Kumar

* Nazim Saifi

## Add your video and report link here

### Add your code link here

# Library Imports
"""

# Commented out IPython shell magic to ensure Python compatibility: `!pip ...` is notebook
# syntax and a SyntaxError in a plain .py file. Install the dependency from a shell
# beforehand instead (pip install feature_engine).
# !pip install feature_engine

# Commented out IPython magic to ensure Python compatibility.
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures, DropCorrelatedFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, mean_squared_error, log_loss, recall_score
from xgboost import XGBClassifier

# %matplotlib inline

"""# Read Dataset"""

# Load the training CSV from the mounted Drive folder
train_data = pd.read_csv('/content/drive/MyDrive/Univ.AI/train.csv')

# First rows of the frame (only rendered when run as a notebook cell)
train_data.head()

# Per-column summary statistics
train_data.describe()

"""# Feature Preprocessing"""

# Total number of missing cells over the whole frame
train_data.isnull().sum().sum()
# Original notebook note: no observations are NaN or None

# The ID column is removed first, then rows that are exact duplicates are dropped
train_data = train_data.drop(columns="ID")
original_records = len(train_data)

train_data = train_data.drop_duplicates()
print(f"Dropped {original_records - len(train_data)} duplicate records")

# Bar chart of how many observations fall in each TARGET class
sns.set_style('whitegrid')
sns.countplot(data=train_data, x='TARGET')

# Raw counts of class 0 and class 1
train_data['TARGET'].value_counts()

"""The dataset is heavily imbalanced, with 96% of observations belonging to class 0 (satisfied customers).


"""

# Collect every column that holds at most one distinct value (zero variance) and drop it
const_col = [
    col for col in train_data.columns
    if train_data[col].nunique(dropna=False) <= 1
]
print(f'Removing {len(const_col)} features with constant values')
train_data = train_data.drop(columns=const_col)
# train_data.shape

# Split the design matrix (X) from the TARGET column (y)
y = train_data['TARGET']
X = train_data.drop(columns=['TARGET'])

# 80/20 train/test split, stratified on y so both parts keep the same class proportions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=0,
    stratify=y,
)

# Detect columns that are exact copies of another column, using the training part only
duplicates = DropDuplicateFeatures()
duplicates.fit(X_train)

# Sets of columns found to be identical to each other
duplicates.duplicated_feature_sets_

print('Number of variables before removing duplicates: ', X_train.shape[1])

# Apply the same column removal to both splits
X_train, X_test = duplicates.transform(X_train), duplicates.transform(X_test)

print('Number of variables after removing duplicates: ', X_train.shape[1])

# A column counts as categorical when every value it takes in the training split is 0 or 1
categorical_columns = [
    col for col in X_train.columns
    if X_train[col].isin([0, 1]).all()
]
len(categorical_columns)

categorical_columns

"""Drop highly correlated features"""

# Keep the binary columns aside; only the remaining (numeric) columns go through the
# correlation filter and, later, scaling
X_train_cat, X_test_cat = X_train[categorical_columns], X_test[categorical_columns]

X_train_num = X_train.drop(columns=categorical_columns)
X_test_num = X_test.drop(columns=categorical_columns)

# Pearson-correlation filter with a 0.95 threshold, fitted on the numeric training columns
correlated = DropCorrelatedFeatures(variables=None, method='pearson', threshold=0.95)
correlated.fit(X_train_num)

# How many correlated columns will be dropped
len(correlated.features_to_drop_)

print('Number of variables before removing correlated: ', X_train_num.shape[1])

# Same columns are removed from train and test
X_train_num, X_test_num = (
    correlated.transform(X_train_num),
    correlated.transform(X_test_num),
)

print('Number of variables after removing correlated: ', X_train_num.shape[1])

# Standardise the numeric columns: the scaler is fitted on train and reused for test
scaler = StandardScaler()

scaled_train_num = scaler.fit_transform(X_train_num)
scaled_test_num = scaler.transform(X_test_num)

# Wrap the scaled arrays back into frames that carry the original column names
X_train_num = pd.DataFrame(scaled_train_num, columns=X_train_num.columns)
X_test_num = pd.DataFrame(scaled_test_num, columns=X_test_num.columns)

X_train_cat.shape, X_train_num.shape, X_test_num.shape, X_test_cat.shape

# Re-assemble categorical + numeric columns; both halves get a fresh 0..n-1 index first
# so the column-wise concat lines rows up by position
X_train = pd.concat(
    [part.reset_index(drop=True) for part in (X_train_cat, X_train_num)],
    axis=1,
)
X_test = pd.concat(
    [part.reset_index(drop=True) for part in (X_test_cat, X_test_num)],
    axis=1,
)

X_train.shape, X_test.shape

## Drop sparse columns, i.e. columns whose 99th percentile in the training split is 0
# NOTE(review): the numeric columns were standardised before this step, so their raw
# zeros are presumably no longer exactly 0 and this test likely only ever matches the
# unscaled binary columns - confirm whether the check was meant to run before scaling.
sparse_columns = [
    col for col in X_train.columns
    if np.percentile(X_train[col], 99) == 0
]

# The column list derived from train is removed from both splits
X_train = X_train.drop(columns=sparse_columns)
X_test = X_test.drop(columns=sparse_columns)

print(f"dropped {len(sparse_columns)} sparse columns")

X_train.shape, X_test.shape

X_train.head()

## Oversampling using SMOTE
# Only the training split is resampled. The sampler is seeded so the synthetic samples
# (and therefore every model fitted on them) are reproducible, in line with the fixed
# random_state used by the split and the models elsewhere in this script.
oversample = SMOTE(random_state=0)
X_over, y_over = oversample.fit_resample(X_train, y_train)

# ## Using SMOTETomek
# from imblearn.combine import SMOTETomek
# oversample = SMOTETomek()
# X_over, y_over = oversample.fit_resample(X_train, y_train)

X_over.shape, y_over.shape

y_over.value_counts()

# Class shares before and after oversampling, side by side.
# Counts are ordered by class label (sort_index) so the hard-coded labels below always
# line up with class 0 then class 1; value_counts() alone orders by frequency, which is
# ambiguous once both classes have the same count.
class_labels = ["Satisfied Customers (0)", "Unsatisfied Customers (1)"]

# plt.rc('font', size=20)
fig, ax = plt.subplots(1,2, figsize=(16,10))
ax[0].pie(y_train.value_counts().sort_index(), labels = class_labels)
ax[0].set_title("Before Oversampling", fontsize=20)

ax[1].pie(y_over.value_counts().sort_index(), labels = class_labels)
ax[1].set_title("After Oversampling", fontsize=20)
plt.savefig("SMOTE.png",dpi=500)
plt.show()

"""# Model Creation"""

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import roc_auc_score, classification_report, f1_score, recall_score, roc_curve, auc

"""## Logistic Regression"""

# Logistic regression fitted on the SMOTE-oversampled training data
o_log_model = LogisticRegression(random_state=0, max_iter=10000)
o_log_model.fit(X_over, y_over)

# Hard predictions on the held-out test split drive accuracy / recall / F1;
# AUC uses the predicted probability of class 1
y_pred = o_log_model.predict(X_test)
o_log_model_auc = roc_auc_score(y_test, o_log_model.predict_proba(X_test)[:, 1])
o_log_model_accuracy = accuracy_score(y_test, y_pred)
o_log_model_recall = recall_score(y_test, y_pred)
o_log_model_f1 = f1_score(y_test, y_pred)

print(f'''Logistic Regression model trained using oversampled data 
accuracy = {o_log_model_accuracy:.3f}
auc_score = {o_log_model_auc:.3f}
recall = {o_log_model_recall:.3f}
f1 = {o_log_model_f1:.3f}''')

# Per-class report on the (oversampled) training data
train_report = classification_report(y_over, o_log_model.predict(X_over))
print(train_report)

# Per-class report on the test data
test_report = classification_report(y_test, y_pred)
print(test_report)

# Baseline: the same logistic regression fitted on the original, imbalanced training data
log_model = LogisticRegression(random_state=0, max_iter=10000)
log_model.fit(X_train, y_train)

# Same metric set as for the oversampled model, evaluated on the test split
y_pred = log_model.predict(X_test)
log_model_auc = roc_auc_score(y_test, log_model.predict_proba(X_test)[:, 1])
log_model_accuracy = accuracy_score(y_test, y_pred)
log_model_recall = recall_score(y_test, y_pred)
log_model_f1 = f1_score(y_test, y_pred)

print(f'''Logistic Regression model trained on imbalanced data 
accuracy = {log_model_accuracy:.3f}
auc_score = {log_model_auc:.3f}
recall = {log_model_recall:.3f}
f1 = {log_model_f1:.3f}''')

"""It can be seen here that the Logistic Regression model trained on imbalanced data has a higher accuracy but has a very poor recall and f1-score when compared to the same LR model trained using oversampled data."""

# Classification report on the TEST dataset for the LR model trained on oversampled data
# (both arguments come from y_test / X_test, not from the training split)
print(classification_report(y_test, o_log_model.predict(X_test)))

# Classification report on the TEST dataset for the LR model trained on imbalanced data
# (y_pred holds log_model's test predictions at this point in the script)
print(classification_report(y_test, y_pred))

# for i in o_log_model.coef_.ravel():
#   if abs(i)>0.1:
#     print(i)

"""## Random Forest Classifier"""

from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from tqdm import tqdm

"""### Hyperparameter Tuning Random Forest Classifier"""

# TAKES A VERY LONG TIME AND FREQUENTLY CRASHES DUE TO RAM

# # Grid Search for best min_samples_split and min_samples_leaf
# rf = BalancedRandomForestClassifier(n_jobs=-1,
#                             n_estimators=best_n_estimators,
#                             oob_score=True,
#                             max_features = 'sqrt',
#                             random_state=24)

# param_grid = {
#     'min_samples_split': [2,5,10,20],
#     "min_samples_leaf": [5,10,20,30,50]
#     }

# scoring = {'AUC': 'roc_auc',
#            'RECALL':'recall',
#            "F1":'f1'}

# grid_search = GridSearchCV(rf, 
#                             param_grid, 
#                             scoring=scoring, 
#                             refit='F1', 
#                             return_train_score=True, 
#                             n_jobs=-1,
#                            verbose=2)

# results = grid_search.fit(X_over, y_over)

## Takes ~30 mins to run
# Sweep n_estimators for the balanced random forest and record the out-of-bag error
from collections import OrderedDict
clf = BalancedRandomForestClassifier(
    warm_start=True,
    oob_score=True,
    min_samples_leaf=40,
    max_depth=10,
    n_jobs=-1,
    random_state=24,
)

# n_estimators -> OOB error (1 - oob_score_)
error_rate = {}

# Candidate forest sizes, in increasing order
# min_estimators = 300
# max_estimators = 500
estimators = [800, 900, 1000, 1100, 1500, 2000]
for n_trees in tqdm(estimators):
    clf.set_params(n_estimators=n_trees)
    clf.fit(X_over, y_over)
    error_rate[n_trees] = 1 - clf.oob_score_

# Plot OOB error against forest size (dict order is the sweep order)
xs = list(error_rate)
ys = list(error_rate.values())
plt.plot(xs, ys)
# plt.xlim(min_estimators, max_estimators)
plt.xlabel("n_estimators")
plt.ylabel("OOB error rate")
plt.show()

# Forest size with the lowest recorded OOB error
best_n_estimators = min(error_rate, key=error_rate.get)
best_n_estimators

# Takes ~1hr
# Sweep max_depth for the random forest with the number of trees fixed at best_n_estimators
from collections import OrderedDict
clf = BalancedRandomForestClassifier(
    oob_score=True,
    n_estimators=best_n_estimators,
    n_jobs=-1,
    random_state=24,
)

error_rate = {}      # max_depth -> OOB error
recall_scores = []   # test-split recall per depth, in sweep order
f1_scores = []       # test-split F1 per depth, in sweep order

# NOTE(review): recall and F1 below are measured on X_test / y_test, so the depth chosen
# from these curves is informed by the test split.
depths = [5,8,10,15,20,30,50,100]
for depth in tqdm(depths):
    clf.set_params(max_depth=depth)
    clf.fit(X_over, y_over)
    y_pred = clf.predict(X_test)
    recall_scores.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

    # Record the OOB error for this max_depth setting
    oob_error = 1 - clf.oob_score_
    error_rate[depth] = oob_error
    print(oob_error)

xs = list(error_rate)
ys = list(error_rate.values())

# One panel per curve: (legend label, y-axis label, values)
fig, ax = plt.subplots(1,3, figsize=(10,5))
curves = [
    ("OOB error", "OOB error", ys),
    ("Recall score", "Recall", recall_scores),
    ("F1 score", "F1 score", f1_scores),
]
for axis, (legend_label, y_label, values) in zip(ax, curves):
    axis.plot(xs, values, label=legend_label)
    axis.set_ylabel(y_label)
    axis.set_xlabel("max_depth")
    axis.legend()

# plt.xlim(min_estimators, max_estimators)
plt.show()

# max_depth is set to 30: per the original analysis the F1 score is highest there and the
# OOB error is very close to its minimum
best_max_depth = 30

"""### Train Random Forest Model"""

# Hyperparameters fixed from the tuning sweeps (hard-coded so the sweeps need not be re-run)
best_n_estimators = 1100
best_max_depth = 30

# Final balanced random forest, fitted on the oversampled training data
rf_model = BalancedRandomForestClassifier(
    n_estimators=best_n_estimators,
    max_depth=best_max_depth,
    n_jobs=-1,
    random_state=24,
)
rf_model.fit(X_over, y_over)

# Test-split metrics: AUC from class-1 probabilities, the rest from hard predictions
y_pred = rf_model.predict(X_test)
rf_model_auc = roc_auc_score(y_test, rf_model.predict_proba(X_test)[:, 1])
rf_model_accuracy = accuracy_score(y_test, y_pred)
rf_model_recall = recall_score(y_test, y_pred)
rf_model_f1 = f1_score(y_test, y_pred)

print(f'''Random Forest Model 
accuracy = {rf_model_accuracy:.3f}
auc_score = {rf_model_auc:.3f}
recall = {rf_model_recall:.3f}
f1 = {rf_model_f1:.3f}''')

# Per-class report on the (oversampled) training data
train_report = classification_report(y_over, rf_model.predict(X_over))
print(train_report)

# Per-class report on the test data
test_report = classification_report(y_test, y_pred)
print(test_report)

"""### Feature Importance using Permutation Importance"""

from sklearn.inspection import permutation_importance

# Permutation importance of every feature for the fitted random forest, measured on the
# test split. In the notebook this cell ran under the `%%time` magic, and the export
# commented the whole cell out - including the only assignment of `perm_importance`,
# which every later use in this script depends on. The call is restored here without
# the magic.
perm_importance = permutation_importance(rf_model, X_test, y_test)

# Horizontal bar chart of the 50 features with the largest absolute mean importance
plt.rc('font', size=4)
plt.figure(figsize=(5,5), dpi=300)
sorted_idx = np.absolute(perm_importance.importances_mean).argsort()[-50:]
plt.barh(X_test.columns[sorted_idx], np.absolute(perm_importance.importances_mean[sorted_idx]))
plt.xlabel("Permutation Importance")
plt.savefig("Permutation Importance.png")
plt.show()

# Names of those 50 features, in ascending order of importance
X_test.columns[sorted_idx]

"""# XGBoost Classifier

## Train XGBoost Model
"""

# XGBoost classifier fitted on the oversampled training data.
# The row-subsampling parameter is spelled `subsample`; the previous `sub_sample` keyword
# is not an XGBoost parameter, so the intended 0.4 subsampling was never applied.
# NOTE(review): metrics quoted in the write-up were produced without subsampling and will
# shift slightly now that it takes effect.
XGB_clf = XGBClassifier(random_state=2022, max_depth = 10, subsample = 0.4, gamma = 10)
XGB_clf.fit(X_over, y_over)

# Test-split metrics: AUC from class-1 probabilities, the rest from hard predictions
y_pred = XGB_clf.predict(X_test)
xgb_model_accuracy = accuracy_score(y_test, y_pred)
xgb_model_auc = roc_auc_score(y_test, XGB_clf.predict_proba(X_test)[:,1])
xgb_model_recall = recall_score(y_test, y_pred)
xgb_model_f1 = f1_score(y_test, y_pred)

print(f'''XGBoost Model 
accuracy = {xgb_model_accuracy:.3f}
auc_score = {xgb_model_auc:.3f}
recall = {xgb_model_recall:.3f}
f1 = {xgb_model_f1:.3f}''')

#Classification report on train dataset
print(classification_report(y_over, XGB_clf.predict(X_over)))

#Classification report on test dataset
print(classification_report(y_test, y_pred))

"""# Experiment

## Using only top 50 features for model creation
"""

# Column positions of the 50 largest absolute mean permutation importances
# (argsort is ascending, so the last 50 entries are the top 50)
sorted_idx = np.argsort(np.absolute(perm_importance.importances_mean))[-50:]
top_50_features = X_test.columns[sorted_idx]

# top_50_features = ['ind_var37_cte', 'ind_var9_ult1', 'imp_op_var41_efect_ult3',
#        'ind_var13', 'ind_var13_0', 'num_op_var41_hace2',
#        'ind_var43_recib_ult1', 'num_var41_0', 'num_var13_0',
#        'ind_var10cte_ult1', 'ind_var43_emit_ult1', 'num_var22_hace3',
#        'num_var45_hace3', 'ind_var9_cte_ult1', 'num_op_var41_efect_ult3',
#        'num_op_var39_comer_ult1', 'num_var42_0', 'num_var12_0', 'ind_var12_0',
#        'imp_op_var39_comer_ult3', 'saldo_medio_var5_ult1', 'num_var30_0',
#        'num_var39_0', 'num_op_var41_ult3', 'num_op_var39_comer_ult3',
#        'ind_var41_0', 'ind_var39_0', 'saldo_var5', 'ind_var5',
#        'num_med_var22_ult3', 'saldo_medio_var5_ult3', 'num_var5',
#        'saldo_var30', 'num_var22_ult1', 'ind_var30', 'saldo_medio_var5_hace2',
#        'num_var42', 'num_var30', 'num_var22_hace2', 'var38',
#        'num_med_var45_ult3', 'num_var4', 'num_meses_var39_vig_ult3',
#        'num_var22_ult3', 'saldo_medio_var5_hace3', 'num_var45_ult1',
#        'num_var45_hace2', 'var15', 'var36', 'num_meses_var5_ult3']

# Restrict the oversampled training data and the test data to those columns
X_over_top, X_test_top = X_over[top_50_features], X_test[top_50_features]

"""### Training Logistic Regression Model"""

# Logistic regression fitted on the oversampled data restricted to the top 50 features
top_log_model = LogisticRegression(random_state=0, max_iter=10000)
top_log_model.fit(X_over_top, y_over)

# Test-split metrics on the same reduced feature set
y_pred = top_log_model.predict(X_test_top)
top_log_model_auc = roc_auc_score(y_test, top_log_model.predict_proba(X_test_top)[:, 1])
top_log_model_accuracy = accuracy_score(y_test, y_pred)
top_log_model_recall = recall_score(y_test, y_pred)
top_log_model_f1 = f1_score(y_test, y_pred)

print(f'''Logistic Regression model trained using oversampled data 
accuracy = {top_log_model_accuracy:.3f}
auc_score = {top_log_model_auc:.3f}
recall = {top_log_model_recall:.3f}
f1 = {top_log_model_f1:.3f}''')

# Three side-by-side bar charts comparing the reduced model with the full-feature model.
# Each panel: (title, y-axis label, top-50 value, all-features value)
plt.rc('font', size=10)
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

panels = [
    ("Accuracy", "Accuracy Score", top_log_model_accuracy, o_log_model_accuracy),
    ("AUC", "AUC Score", top_log_model_auc, o_log_model_auc),
    ("F1-score", "F1 Score", top_log_model_f1, o_log_model_f1),
]
for axis, (panel_title, y_label, top_value, all_value) in zip(ax, panels):
    # Two separate bar() calls so each bar gets its own colour from the cycle
    axis.bar("Top 50 features", top_value)
    axis.bar("All 221 features", all_value)
    axis.set_ylabel(y_label)
    axis.set_title(panel_title)

plt.suptitle("Logistic Regression Metrics Comparison")
plt.savefig("LogisticRegressionMetricsComparison.png")
plt.show()

"""### Training XGBoost classifier on top features"""

# XGBoost classifier fitted on the oversampled data restricted to the top 50 features.
# The row-subsampling parameter is spelled `subsample`; the previous `sub_sample` keyword
# is not an XGBoost parameter, so the intended 0.4 subsampling was never applied.
top_XGB_clf = XGBClassifier(random_state=2022, max_depth = 10, subsample = 0.4, gamma = 10)
top_XGB_clf.fit(X_over_top, y_over)

# Test-split metrics on the same reduced feature set
y_pred = top_XGB_clf.predict(X_test_top)
top_xgb_model_accuracy = accuracy_score(y_test, y_pred)
top_xgb_model_auc = roc_auc_score(y_test, top_XGB_clf.predict_proba(X_test_top)[:,1])
top_xgb_model_recall = recall_score(y_test, y_pred)
top_xgb_model_f1 = f1_score(y_test, y_pred)

print(f'''XGBoost Model 
accuracy = {top_xgb_model_accuracy:.3f}
auc_score = {top_xgb_model_auc:.3f}
recall = {top_xgb_model_recall:.3f}
f1 = {top_xgb_model_f1:.3f}''')

# Bar charts comparing the reduced model with the XGBoost model trained on all features
fig, ax = plt.subplots(1,3, figsize=(15,5))

ax[0].bar("Top 50 features", top_xgb_model_accuracy)
ax[0].bar("All 221 features", xgb_model_accuracy)
ax[0].set_ylabel("Accuracy Score")
ax[0].set_title("Accuracy")

ax[1].bar("Top 50 features", top_xgb_model_auc)
ax[1].bar("All 221 features", xgb_model_auc)
ax[1].set_ylabel("AUC Score")
ax[1].set_title("AUC")

ax[2].bar("Top 50 features", top_xgb_model_f1)
ax[2].bar("All 221 features", xgb_model_f1)
ax[2].set_ylabel("F1 Score")
ax[2].set_title("F1-score")
plt.suptitle("XGBoost Metrics Comparison")
plt.savefig("XGBoostMetricsComparison.png")
plt.show()

"""Thus there seems to be no major difference in metrics when using only the top 50 features. This could be due to the low Permutation Importance of the remaining predictors.

Hence we can reduce the model's computational complexity by choosing these 50 features.

# Interpretation
"""

from prettytable import PrettyTable

# Summary table: one row per modelling strategy, metrics rounded to 3 decimals.
# Each entry is (strategy, accuracy, F1, recall, AUC), matching the column order below.
summary_rows = [
    ("Logistic Regression - with class imbalanced dataset",
     log_model_accuracy, log_model_f1, log_model_recall, log_model_auc),
    ("Logistic Regression - with class balanced dataset",
     o_log_model_accuracy, o_log_model_f1, o_log_model_recall, o_log_model_auc),
    ("Random Forest Classifier- with class balanced dataset",
     rf_model_accuracy, rf_model_f1, rf_model_recall, rf_model_auc),
    ("XGBoost Classifier - with class balanced dataset",
     xgb_model_accuracy, xgb_model_f1, xgb_model_recall, xgb_model_auc),
]

pt = PrettyTable()
pt.field_names = ["Strategy","Accuracy","F1 Score","Recall","AUC score"]
for strategy, *scores in summary_rows:
    pt.add_row([strategy] + [round(score, 3) for score in scores])
print(pt)

"""From the above table, we can infer that accuracy isn't the best metric to evaluate the model performance, since accuracy doesn't factor in class imbalance.

Better metrics to evaluate the model are the f1-score and AUC score. From these metrics we can see that the **XGBoost classifier** has performed the best, followed by **Random Forest** and **Logistic Regression**, as also supported by the ROC curve below.
"""

## ROC curves of all fitted models on the test split, drawn on one figure
plt.rcParams['font.size'] = '10'
plt.figure(figsize=(10,10))

# Legend name -> fitted model
model_dict =  {
    "Logistic Regression (imbalanced dataset)": log_model,
    "Logistic Regression" : o_log_model,
    "Random Forest" : rf_model,
    "XGBoost" : XGB_clf
}

for model_name, model_obj in model_dict.items():
    # Probability of class 1 is the ranking score for the curve
    preds = model_obj.predict_proba(X_test)[:, 1]
    fpr, tpr, threshold = roc_curve(y_test, preds)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label = f'{model_name} AUC = %0.2f' % roc_auc)

plt.title('Receiver Operating Characteristic')
plt.legend(loc = 'lower right')
# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.savefig("ROC_Curve.png")
plt.show()

## Permutation importance of every feature, sorted ascending by absolute mean importance
plt.rcParams['font.size'] = '2'
plt.figure(figsize=(10,10), dpi=600)
sorted_idx = np.absolute(perm_importance.importances_mean).argsort()

# Absolute importances in plotting order
coeffs = np.absolute(perm_importance.importances_mean)[sorted_idx]
plt.barh(X_test.columns[sorted_idx], coeffs)
plt.xlabel("Permutation Importance")
plt.savefig('Permutation_Importance_all.png', dpi=800)
plt.show()

# How many features carry no importance at all, and how many exceed 0.01
print(f"{np.count_nonzero(coeffs == 0)} features have zero importance")
print(f"{np.count_nonzero(coeffs > 0.01)} features have importance greater than 0.01")

"""The above plot shows the feature importance derived by **Permutation Importance** using the Random Forest model.
Even after removing 150 of the 371 original features during feature preprocessing, 91 features still have an importance of 0 in the model and only 7 features have a Permutation Importance of >0.01.

# Conclusion

Accuracy isn't the best metric to evaluate the model performance, since accuracy doesn't factor in class imbalance.

Better metrics to evaluate the model are the f1-score and AUC score. From these metrics we can see that the XGBoost classifier has performed the best, followed by Random Forest and Logistic Regression.


**XGBoost Classifier**

Accuracy =  0.899 

F1 Score = 0.241

Recall = 0.403

AUC score = 0.811

# Submission File
"""

# Build the submission file: replay the training-time preprocessing on the unlabeled test
# CSV (using the transformers and column lists fitted on the training split), then score
# it with the XGBoost model.
test_data = pd.read_csv('/content/drive/MyDrive/Univ.AI/test.csv')

test_data.shape , train_data.shape

# Explicit copy: columns are added to this frame below, and assigning into a bare
# column selection of test_data triggers pandas' SettingWithCopyWarning.
submission_df = test_data[["ID"]].copy()

test_data = test_data.drop("ID", axis=1)

# Drop Constant Features
test_data = test_data.drop(const_col, axis = 1)

# Drop Duplicate columns
test_data = duplicates.transform(test_data)

# Split categorical and numerical columns
test_data_cat = test_data[categorical_columns]

test_data_num = test_data.drop(categorical_columns, axis=1)

# Drop Correlated data
test_data_num = correlated.transform(test_data_num)

# Scale numerical columns with the scaler fitted on the training split
test_num = scaler.transform(test_data_num)
test_data_num = pd.DataFrame(test_num, columns=test_data_num.columns)

# Join categorical and numerical columns (indices reset so rows align by position)
data = pd.concat([test_data_cat.reset_index(drop=True), test_data_num.reset_index(drop=True)], axis=1)

data = data.drop(sparse_columns, axis=1)

# Predict on XGBoost model
pred = XGB_clf.predict(data)

# Save Prediction as TARGET column
submission_df["TARGET"]=pred

proba = XGB_clf.predict_proba(data)

proba.shape

# Save Probability as "Unsatisfied Customer Probability" column
submission_df["Unsatisfied Customer Probability"] = proba[:,1]

submission_df.head()

# index=False: the row index is not part of the submission and would otherwise be
# written as an extra unnamed first column.
submission_df.to_csv("Submission.csv", index=False)