main_BDT.py
153 lines (111 loc) · 5.23 KB
# DiphotonBDT training and evaluation code
# necessary imports
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import matplotlib.pyplot as plt
import glob
import os
import yaml
from yaml import Loader
import xgboost as xgb
from scipy.stats import chisquare
# plotting libraries
import mplhep, hist
import mplhep as hep
plt.style.use([mplhep.style.CMS])
from matplotlib import pyplot
import traininig_utils as utils
import ploting_utils as plot_utils
from sklearn import metrics
# defining some plot functions - should move this to ploting_utils file
def plot_BDT_output(predictions, test, path_to_plot):
    # plot signal and background separately
    plt.figure()
    plt.hist(predictions[test.get_label().astype(bool)], bins=np.linspace(0, 1, 50),
             histtype='step', color='midnightblue', label='signal')
    plt.hist(predictions[~(test.get_label().astype(bool))], bins=np.linspace(0, 1, 50),
             histtype='step', color='firebrick', label='background')
    # make the plot readable
    plt.xlabel('Prediction from BDT', fontsize=18)
    plt.ylabel('Events', fontsize=18)
    plt.legend(frameon=False)
    plt.savefig(path_to_plot + 'BDT_outputs.png')
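# Optional variant (not part of the original script): the histograms above are
# unweighted; a minimal sketch of a weighted, shape-normalised version, assuming
# the per-event weights set on the DMatrix in main(). The function name
# plot_BDT_output_weighted is new here.
def plot_BDT_output_weighted(predictions, test, path_to_plot):
    sig = test.get_label().astype(bool)   # boolean mask for signal events
    w = test.get_weight()                 # per-event weights stored in the DMatrix
    plt.figure()
    plt.hist(predictions[sig], bins=np.linspace(0, 1, 50), weights=w[sig],
             density=True, histtype='step', color='midnightblue', label='signal')
    plt.hist(predictions[~sig], bins=np.linspace(0, 1, 50), weights=w[~sig],
             density=True, histtype='step', color='firebrick', label='background')
    plt.xlabel('Prediction from BDT', fontsize=18)
    plt.ylabel('Normalised events', fontsize=18)
    plt.legend(frameon=False)
    plt.savefig(path_to_plot + 'BDT_outputs_weighted.png')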
def plot_ROC_curve(predictions, test, test_data, test_labels, path_to_plot, evals_result):
    # scan over score cuts
    cuts = np.linspace(0, 0.95, 200)
    nsignal = np.zeros(len(cuts))
    nbackground = np.zeros(len(cuts))
    for i, cut in enumerate(cuts):
        nsignal[i] = len(np.where(predictions[test.get_label().astype(bool)] > cut)[0])
        nbackground[i] = len(np.where(predictions[~(test.get_label().astype(bool))] > cut)[0])
    # plot efficiency vs. purity
    plt.figure()
    plt.plot(nsignal / len(test_data[test_labels == 1]), nsignal / (nsignal + nbackground),
             'o-', color='blueviolet',
             label='train AUC = ' + str(np.max(evals_result['train']['auc'])))
    # make the plot readable
    plt.xlabel('Efficiency (nsignal/ntotal)', fontsize=18)
    plt.ylabel('Purity (nsignal/(nsignal + nbkg))', fontsize=18)
    plt.legend(frameon=False)
    plt.savefig(path_to_plot + 'ROC_curve.png')
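# Optional cross-check (not part of the original script): a minimal sketch of the
# standard ROC curve built with the already-imported sklearn.metrics, assuming
# binary 0/1 labels and non-negative per-event weights in the test DMatrix.
# The function name plot_ROC_curve_sklearn is new here.
def plot_ROC_curve_sklearn(predictions, test, path_to_plot):
    labels = test.get_label()
    weights = test.get_weight()
    fpr, tpr, _ = metrics.roc_curve(labels, predictions, sample_weight=weights)
    auc = metrics.roc_auc_score(labels, predictions, sample_weight=weights)
    plt.figure()
    plt.plot(fpr, tpr, color='blueviolet', label='test AUC = %.3f' % auc)
    plt.xlabel('Background efficiency', fontsize=18)
    plt.ylabel('Signal efficiency', fontsize=18)
    plt.legend(frameon=False)
    plt.savefig(path_to_plot + 'ROC_curve_sklearn.png')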
def plot_model_loss(evals_result, plot_path):
    # note: 'error' is the binary classification error rate tracked during training
    train_losses = evals_result['train']['error']
    val_losses = evals_result['validation']['error']
    fig, ax = pyplot.subplots()
    ax.plot(train_losses, label='Train')
    ax.plot(val_losses, label='Test')
    ax.legend()
    plt.savefig(plot_path + 'loss_curve.png')
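# Note (added): with the eval_metric list built in main() ('error', 'logloss', 'auc'),
# evals_result['train'] and evals_result['validation'] also carry 'logloss' and 'auc'
# histories, which can be plotted in exactly the same way, e.g.
#   ax.plot(evals_result['validation']['logloss'], label='Validation logloss')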
def plot_model_ams(evals_result, plot_path):
    train_ams = evals_result['train']['ams@0']
    val_ams = evals_result['validation']['ams@0']
    fig, ax = pyplot.subplots()
    ax.plot(train_ams, label='Train')
    ax.plot(val_ams, label='Test')
    ax.legend()
    plt.savefig(plot_path + 'ams.png')
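# Note (added): plot_model_ams() only works if the approximate median significance
# metric is also tracked during training; a minimal sketch, assuming the param list
# built in main() below:
#   param = list(param.items()) + [('eval_metric', 'logloss'),
#                                  ('eval_metric', 'auc'),
#                                  ('eval_metric', 'ams@0')]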
# start of the main code
def main():
    path_to_plots = './results/'
    x_train, x_test, y_train, y_test, w_train, w_test, inputs_list = utils.load_data()
    train = xgb.DMatrix(data=x_train, weight=w_train, label=y_train,
                        missing=-999.0, feature_names=inputs_list)
    test = xgb.DMatrix(data=x_test, weight=w_test, label=y_test,
                       missing=-999.0, feature_names=inputs_list)
    # Defining the parameters for the classifier and training
    param = {}
    # Booster parameters
    param['eta'] = 0.1               # learning rate
    param['max_depth'] = 6           # maximum depth of a tree
    param['subsample'] = 0.4         # fraction of events used to train each tree
    param['colsample_bytree'] = 1.0  # fraction of features used to train each tree
    # Learning task parameters
    param['objective'] = 'binary:logistic'  # objective function
    param['eval_metric'] = 'error'          # evaluation metric
    # passing param as a list of tuples allows repeated 'eval_metric' entries
    param = list(param.items()) + [('eval_metric', 'logloss')] + [('eval_metric', 'auc')]
    # multithreading: by default xgboost already uses all available cores
    #param['nthread'] = 16
    num_trees = 999    # maximum number of trees to grow
    evals_result = {}  # object to keep track of the metric histories
    evals = [(train, "train"), (test, "validation")]
    # perform the training; early_stopping_rounds does early stopping (not post-pruning):
    # it stops adding trees once the last metric in the list ('auc') has not improved on
    # the last evals entry ('validation') for 20 rounds, and records booster.best_iteration
    booster = xgb.train(param, train, num_boost_round=num_trees, evals=evals,
                        evals_result=evals_result, verbose_eval=10, early_stopping_rounds=20)
    booster.set_param({"device": "cuda:0"})  # set after training, so this only affects prediction
    # now, evaluating the BDT
    # note: with early stopping, xgb.train returns the model from the last iteration;
    # booster.best_iteration can be passed via iteration_range to score with the best one
    predictions = booster.predict(test)
    # Plotting the importance of every input variable
    xgb.plot_importance(booster, grid=False)
    plt.savefig(path_to_plots + 'BDT_importance.png')  # saving the plot
    # Plotting the background and signal BDT scores
    plot_BDT_output(predictions, test, path_to_plots)
    # Now, the ROC curve
    plot_ROC_curve(predictions, test, x_test, y_test, path_to_plots, evals_result)
    # let's also plot the error curve for the model
    plot_model_loss(evals_result, path_to_plots)
    #plot_model_ams(evals_result, path_to_plots)
    # last plot: BDT score per process
    plot_utils.per_process_BDTscore(booster)
    # saving the model in .json format
    booster.save_model("diphoton_BDT.json")
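    # Sketch (not part of the original workflow): the saved JSON can be reloaded
    # later for inference with the standard Booster API, e.g.
    #   reloaded = xgb.Booster()
    #   reloaded.load_model("diphoton_BDT.json")
    #   reloaded.predict(test)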
if __name__ == "__main__":
    main()