diff --git a/data_reading/read_data.py b/data_reading/read_data.py index f194eb1..26d10f3 100644 --- a/data_reading/read_data.py +++ b/data_reading/read_data.py @@ -67,7 +67,7 @@ def perform_zee_kinematics_reweighting(data_array, data_weights, mc_array, mc_we phi_min, phi_max = -3.15,3.15 # now the number of bins in each distribution - pt_bins,rho_bins,eta_bins,phi_bins = 10, 30, 10, 2 #was 20, 30, 10, 4 + pt_bins,rho_bins,eta_bins,phi_bins = 10, 32, 12, 2 #was 20, 30, 10, 4 # Now we create a 4d histogram of this kinematic variables mc_histo, edges = np.histogramdd( sample = (mc_array[:,0] , mc_array[:,3], mc_array[:,1], mc_array[:,2]) , bins = (pt_bins,rho_bins,eta_bins,phi_bins), range = [ [pt_min,pt_max], [ rho_min, rho_max ],[eta_min,eta_max], [phi_min,phi_max] ], weights = mc_weights ) @@ -225,18 +225,23 @@ def read_zee_data(): path_to_data = "/net/scratch_cms3a/daumann/HiggsDNA/v13_samples_2/" # Lets now read the data and simultion as pandas dataframes - files_DY_mc = glob.glob( "/net/scratch_cms3a/daumann/HiggsDNA/v13_samples_2/DY_postEE_v13/nominal/*.parquet") + files_DY_mc = glob.glob( "/net/scratch_cms3a/daumann/HiggsDNA/v13_samples_2/DY_postEE_v13/nominal/*.parquet") files_DY_mc = files_DY_mc[:300] simulation = [pd.read_parquet(f) for f in files_DY_mc] drell_yan_df = pd.concat(simulation,ignore_index=True) # now the data files for the epochs F and G - files_DY_data_F = glob.glob( "/net/scratch_cms3a/daumann/HiggsDNA/v13_samples_2/dataG_v13/nominal/*.parquet") + files_DY_data_E = glob.glob( "/net/scratch_cms3a/daumann/HiggsDNA/v13_samples_2/dataE_v13/nominal/*.parquet") + files_DY_data_E = files_DY_data_E[:300] + + files_DY_data_F = glob.glob( "/net/scratch_cms3a/daumann/HiggsDNA/v13_samples_2/dataF_v13/nominal/*.parquet") files_DY_data_F = files_DY_data_F[:300] files_DY_data_G = glob.glob( "/net/scratch_cms3a/daumann/HiggsDNA/v13_samples_2/dataG_v13/nominal/*.parquet") files_DY_data_G = files_DY_data_G[:300] + # I think there is a problem with the samples above, lets test with the new samples! + else: path_to_data = "/net/scratch_cms3a/daumann/nanoAODv12_Production/" @@ -255,7 +260,7 @@ def read_zee_data(): files_DY_data_G = files_DY_data_G[:30] # merhging both dataframes - files_DY_data = [files_DY_data_G,files_DY_data_F] + files_DY_data = [files_DY_data_E,files_DY_data_G,files_DY_data_F] data = [pd.read_parquet(f) for f in files_DY_data] data_df = pd.concat(data,ignore_index=True) @@ -269,11 +274,18 @@ def read_zee_data(): # The selection will be done in the perform_zee_selection() function mask_data, mask_mc = perform_zee_selection( data_df, drell_yan_df ) + # Switch to decide if reweihting is used or no! + perform_reweithing = True + # now, due to diferences in kinematics, a rewighting in the four kinematic variables [pt,eta,phi and rho] will be perform mc_weights = drell_yan_df["weight"] mc_weights_before = mc_weights data_weights = np.ones( len( data_df["fixedGridRhoAll"] ) ) - data_weights, mc_weights = perform_zee_kinematics_reweighting(data_df[conditions_list][mask_data], data_weights[mask_data], drell_yan_df[conditions_list][mask_mc], mc_weights[mask_mc]) + + if( perform_reweithing ): + data_weights, mc_weights = perform_zee_kinematics_reweighting(data_df[conditions_list][mask_data], data_weights[mask_data], drell_yan_df[conditions_list][mask_mc], mc_weights[mask_mc]) + else: + data_weights, mc_weights = np.array(data_weights[mask_data])/np.sum( np.array(data_weights[mask_data]) ), np.array(mc_weights[mask_mc])/np.sum(np.array(mc_weights[mask_mc])) #perform_zee_kinematics_reweighting(data_df[conditions_list][mask_data], data_weights[mask_data], drell_yan_df[conditions_list][mask_mc], mc_weights[mask_mc]) # now lets call a plotting function to perform the plots of the read distributions for validation porpuses path_to_plots = "/net/scratch_cms3a/daumann/PhD/EarlyHgg/simulation_to_data_corrections/plot/validation_plots/" diff --git a/main.py b/main.py index b4f03b2..54875ac 100644 --- a/main.py +++ b/main.py @@ -51,6 +51,8 @@ def main(): corrections.setup_flow() corrections.train_the_flow() + #exit() + # Now, we call the class that handles the transformations, training and validaiton of the corrections #corrections = training_utils.Simulation_correction() #corrections.setup_flow() diff --git a/normalizing_flows/training_utils.py b/normalizing_flows/training_utils.py index 1b3d451..04249e4 100644 --- a/normalizing_flows/training_utils.py +++ b/normalizing_flows/training_utils.py @@ -36,7 +36,7 @@ def __init__(self, configuration, n_transforms, n_splines_bins, aux_nodes, aux_l # Checking if cuda is avaliable print('Checking cuda avaliability: ', torch.cuda.is_available()) - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + device = torch.device('cpu' if torch.cuda.is_available() else 'cpu') self.device = device #reading important tensors @@ -158,15 +158,15 @@ def train_the_flow(self): # load the trained model and perform the final plots! def evaluate_results(self): - # Results are evaluated in the test dataset, never seen by the neural network! - # A mask is applied so only MC events are chossen from the data space - MaskOnlyvalidationMC = self.validation_conditions[:,self.validation_conditions.size()[1] -1 ] == 0 - self.validation_conditions = self.validation_conditions[MaskOnlyvalidationMC] - self.validation_inputs = self.validation_inputs[MaskOnlyvalidationMC] - self.validation_weights = self.validation_weights[MaskOnlyvalidationMC] - with torch.no_grad(): + # Results are evaluated in the test dataset, never seen by the neural network! + # A mask is applied so only MC events are chossen from the data space + MaskOnlyvalidationMC = self.validation_conditions[:,self.validation_conditions.size()[1] -1 ] == 0 + self.validation_conditions = self.validation_conditions[MaskOnlyvalidationMC] + self.validation_inputs = self.validation_inputs[MaskOnlyvalidationMC] + self.validation_weights = self.validation_weights[MaskOnlyvalidationMC] + trans = self.flow(self.mc_test_conditions).transform sim_latent = trans( self.mc_test_inputs ) @@ -175,6 +175,13 @@ def evaluate_results(self): trans2 = self.flow(self.mc_test_conditions).transform self.samples = trans2.inv( sim_latent) + # lets save the means and std used for the transformations + np.save( os.getcwd() + '/results/' +self.configuration + '/' + 'input_means.npy', self.input_mean_for_std) + np.save( os.getcwd() + '/results/' +self.configuration + '/' + 'input_std.npy' , self.input_std_for_std ) + + np.save( os.getcwd() + '/results/' +self.configuration + '/' + 'conditions_means.npy', self.condition_mean_for_std) + np.save( os.getcwd() + '/results/' +self.configuration + '/' + 'conditions_std.npy' , self.condition_std_for_std ) + # now, lets invert the transformations self.invert_transformation() @@ -218,11 +225,11 @@ def perform_transformation(self): # since hoe has very low values, the shift value (value until traingular events are sampled) must be diferent here if( index == 6 ): - self.vector_for_iso_constructors_data.append( Make_iso_continuous(self.data_training_inputs[:,index], b = 0.001) ) - self.vector_for_iso_constructors_mc.append( Make_iso_continuous(self.mc_training_inputs[:,index] , b= 0.001) ) + self.vector_for_iso_constructors_data.append( Make_iso_continuous(self.data_training_inputs[:,index], self.device , b = 0.001) ) + self.vector_for_iso_constructors_mc.append( Make_iso_continuous(self.mc_training_inputs[:,index], self.device , b= 0.001) ) else: - self.vector_for_iso_constructors_data.append( Make_iso_continuous(self.data_training_inputs[:,index]) ) - self.vector_for_iso_constructors_mc.append( Make_iso_continuous(self.mc_training_inputs[:,index]) ) + self.vector_for_iso_constructors_data.append( Make_iso_continuous(self.data_training_inputs[:,index], self.device ) ) + self.vector_for_iso_constructors_mc.append( Make_iso_continuous(self.mc_training_inputs[:,index], self.device ) ) # now really applying the transformations counter = 0 @@ -267,7 +274,7 @@ def perform_transformation(self): print( self.condition_std_for_std ) exit() """ - + # transorming the training tensors self.training_inputs = ( self.training_inputs - self.input_mean_for_std )/self.input_std_for_std self.training_conditions[:,:-1] = ( self.training_conditions[:,:-1] - self.condition_mean_for_std )/self.condition_std_for_std @@ -366,8 +373,10 @@ def read_saved_tensor(self): # this is the class responsable for the isolation variables transformation class Make_iso_continuous: - def __init__(self, tensor, b = False): + def __init__(self, tensor, device, b = False): + self.device = device + #tensor = tensor.cpu() self.iso_bigger_zero = tensor > 0 self.iso_equal_zero = tensor == 0 @@ -393,7 +402,7 @@ def shift_and_sample(self, tensor): # now a log trasform is applied on top of the smoothing to stretch the events in the 0 traingular and "kill" the iso tails tensor = torch.log( 1e-3 + tensor ) - return tensor.to('cuda') + return tensor.to(self.device) #inverse operation of the above shift_and_sample transform def inverse_shift_and_sample( self,tensor , processed = False): @@ -414,4 +423,4 @@ def inverse_shift_and_sample( self,tensor , processed = False): assert (abs(torch.sum( self.before_transform - tensor )) < tensor.size()[0]*1e-6 ) #added the tensor.size()[0]*1e-6 term due to possible numerical fluctioations! - return tensor.to('cuda') \ No newline at end of file + return tensor.to(self.device) \ No newline at end of file diff --git a/plot/plot_utils.py b/plot/plot_utils.py index b4c454e..f44b99c 100644 --- a/plot/plot_utils.py +++ b/plot/plot_utils.py @@ -225,13 +225,13 @@ def plot_distributions_for_tensors( data_tensor, mc_tensor, flow_samples, mc_wei std = np.std( np.array(data_tensor[:,i]) ) if( 'Iso' in str(var_list[i]) or 'DR' in str(var_list[i]) or 'esE' in str(var_list[i]) or 'hoe' in str(var_list[i]) or 'energy' in str(var_list[i]) ): - data_hist = hist.Hist(hist.axis.Regular(45, 0.0 , mean + 2.0*std)) - mc_hist = hist.Hist(hist.axis.Regular(45, 0.0 , mean + 2.0*std)) - mc_rw_hist = hist.Hist(hist.axis.Regular(45, 0.0 , mean + 2.0*std)) + data_hist = hist.Hist(hist.axis.Regular(50, 0.0 , mean + 2.0*std)) + mc_hist = hist.Hist(hist.axis.Regular(50, 0.0 , mean + 2.0*std)) + mc_rw_hist = hist.Hist(hist.axis.Regular(50, 0.0 , mean + 2.0*std)) else: - data_hist = hist.Hist(hist.axis.Regular(45, mean - 2.5*std, mean + 2.5*std)) - mc_hist = hist.Hist(hist.axis.Regular(45, mean - 2.5*std, mean + 2.5*std)) - mc_rw_hist = hist.Hist(hist.axis.Regular(45, mean - 2.5*std, mean + 2.5*std)) + data_hist = hist.Hist(hist.axis.Regular(50, mean - 2.5*std, mean + 2.5*std)) + mc_hist = hist.Hist(hist.axis.Regular(50, mean - 2.5*std, mean + 2.5*std)) + mc_rw_hist = hist.Hist(hist.axis.Regular(50, mean - 2.5*std, mean + 2.5*std)) if( 'DR04' in str(var_list[i]) ): data_hist = hist.Hist(hist.axis.Regular(50, 0.0 , 5.0))