From e4cad668a84309ecef5e0bc590d2aab1e85ab8bb Mon Sep 17 00:00:00 2001
From: Joern Bach <jobach18@googlemail.com>
Date: Thu, 13 Jul 2023 19:50:46 +0200
Subject: [PATCH 1/5] added the script for the Fastjet DJR calculation in
 python

---
 bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py | 134 +++++++++++++++++++
 1 file changed, 134 insertions(+)
 create mode 100644 bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py

diff --git a/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py b/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
new file mode 100644
index 000000000000..e3512e382cbe
--- /dev/null
+++ b/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
@@ -0,0 +1,134 @@
+'''
+script to calculate the DRJ differentially in the number of
+ additional LHE Particles (based on DY + x Particles)
+'''
+
+import awkward as ak
+import argparse
+import os
+import warnings
+import coffea
+import numpy as np
+import matplotlib.pyplot as plt
+import mplhep as hep
+from tqdm import tqdm
+#from coffea.nanoevents.methods import vector  
+from pathlib import Path
+from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
+warnings.filterwarnings("ignore", message="Missing cross-reference index")
+import fastjet
+
+
+def retrieve_4arr(dict_dij,add_jet):
+    dij1 = [item[0] for item in dict_dij[add_jet]]
+    dij2 = [item[1] for item in dict_dij[add_jet]]
+    dij3 = [item[2] for item in dict_dij[add_jet]]
+    dij4 = [item[3] for item in dict_dij[add_jet]]
+    return dij1, dij2, dij3, dij4
+
+def main():
+    parser = argparse.ArgumentParser()
+    #
+    parser.add_argument("-i", "--Input", help= "Input File")
+    parser.add_argument('-vtau', '--VetoTau', help='Veto events with taus in LHE', default=False)
+    parser.add_argument('-s', '--SaveResults', help='enables saving the files as npy files', default=True)
+    args = parser.parse_args()
+    fname = args.Input
+    events = NanoEventsFactory.from_root(
+        fname,schemaclass=NanoAODSchema.v7,
+        metadata={"dataset": "DYJets"},
+    ).events()
+    #veto all tau events
+    if args.VetoTau:
+        events = events[np.all(np.abs(events["LHEPart"].pdgId) != 15, axis=1)]
+    w = events["genWeight"]
+    add_part = list()
+    #count how many additional jets each event has 
+    for ev in events["LHEPart"]:
+        add_part.append(len(ev[ev.status==1])-2)
+    add_part = np.array(add_part)
+    dijs = dict()
+    for i in range(4):
+        mindijs = list()
+        for ev  in tqdm(events["GenPart"][np.where(add_part==i)]):
+                #select (pseudo)particles with status stable, pt>0.01 GeV and rule out the leptons from the DY directly
+                ev = ev[((ev.status==1)& (ev.pt>0.01)) & (np.abs(ev.pdgId) != 11)& (np.abs(ev.pdgId) != 13)& (np.abs(ev.pdgId) != 15)]
+                #print(f'length of event:{(len(ev_prompt))}')
+                fjet_in = [fastjet.PseudoJet(part.px, part.py, part.pz, part.E) for part in ev]
+                #kt, antikt and Radius definition
+                jetdef = fastjet.JetDefinition(fastjet.kt_algorithm, 0.4)
+                cl_alg = fastjet.ClusterSequence(fjet_in, jetdef)
+                mindijs.append([cl_alg.exclusive_dmerge( 0),cl_alg.exclusive_dmerge( 1),cl_alg.exclusive_dmerge( 2),cl_alg.exclusive_dmerge( 3)])
+                dijs[i] = mindijs
+
+    if args.SaveResults:
+        np.save('dijs.npy', dijs) 
+        we_np = ak.to_numpy(w)
+        np.save('weights.npy', we_np)
+        np.save('addpart.npy', add_part)
+    #for dijx_y: x: number of additional jets, y:dij at merge for yth jet 
+    dij0_1, dij0_2, dij0_3, dij0_4 = retrieve_4arr(dijs, 0)
+    dij1_1, dij1_2, dij1_3, dij1_4 = retrieve_4arr(dijs, 1)
+    dij2_1, dij2_2, dij2_3, dij2_4 = retrieve_4arr(dijs, 2)
+    dij3_1, dij3_2, dij3_3, dij3_4 = retrieve_4arr(dijs, 3)
+
+
+    hep.style.use(hep.style.CMS) 
+    hep.cms.text('Simulation')
+    binsi = np.logspace(1e-2,5,70)
+    efficiencies = [len(dij0_1), len(dij0_2), len(dij0_3), len(dij0_4),
+                    len(dij1_1), len(dij1_2),len(dij1_3),len(dij1_4),
+                    len(dij2_1), len(dij2_2), len(dij2_3), len(dij2_4),
+                    len(dij3_1),len( dij3_2), len(dij3_3), len(dij3_4)]
+    efficiencies = np.array(efficiencies) / ev_per_job
+    #print(efficiencies)
+
+
+    plt.hist(dij0_1, weights=w[add_part==0], bins=binsi, histtype="step", label=r"$0 \rightarrow 1$, 0 additional jets, eff: "+str(round(efficiencies[0],3)*100))
+    plt.hist(dij1_1, weights=w[add_part==1], bins=binsi, histtype="step", label=r"$0 \rightarrow 1$, 1 additional jet, eff: "+str(round(efficiencies[4],3)*100))
+    plt.hist(dij2_1, weights=w[add_part==2], bins=binsi, histtype="step", label=r"$0 \rightarrow 1$, 2 additional jets, eff: "+str(round(efficiencies[8],3)*100))
+    plt.hist(dij3_1, weights=w[add_part==3], bins=binsi, histtype="step", label=r"$0 \rightarrow 1$, 3 additional jets, eff: "+str(round(efficiencies[12],3)*100))
+    plt.legend()
+    plt.xlabel(r'dij [GeV]')
+    plt.yscale("log")
+    plt.xscale("log")
+    plt.savefig('0to1.pdf',dpi=200)
+    plt.clf()
+    hep.style.use(hep.style.CMS) # For now ROOT defaults to CMS
+    hep.cms.text('Simulation')
+    plt.hist(dij0_2, weights=w[add_part==0], bins=binsi, histtype="step", label=r"$1 \rightarrow 2$, 0 additional jets")
+    plt.hist(dij1_2, weights=w[add_part==1], bins=binsi, histtype="step", label=r"$1 \rightarrow 2$, 1 additional jet")
+    plt.hist(dij2_2, weights=w[add_part==2], bins=binsi, histtype="step", label=r"$1 \rightarrow 2$, 2 additional jet")
+    plt.hist(dij3_2, weights=w[add_part==3], bins=binsi, histtype="step", label=r"$1 \rightarrow 2$, 3 additional jet")
+    plt.legend()
+    plt.xlabel(r'dij [GeV]')
+    plt.yscale("log")
+    plt.xscale("log")
+    plt.savefig('1to2.pdf',dpi=200)
+    plt.clf()
+    hep.style.use(hep.style.CMS) # For now ROOT defaults to CMS
+    hep.cms.text('Simulation')
+    plt.hist(dij0_3, weights=w[add_part==0], bins=binsi, histtype="step", label=r"$2 \rightarrow 3$, 0 additional jets")
+    plt.hist(dij1_3, weights=w[add_part==1], bins=binsi, histtype="step", label=r"$2 \rightarrow 3$, 1 additional jet")
+    plt.hist(dij2_3, weights=w[add_part==2], bins=binsi, histtype="step", label=r"$2 \rightarrow 3$, 2 additional jet")
+    plt.hist(dij3_3, weights=w[add_part==3], bins=binsi, histtype="step", label=r"$2 \rightarrow 3$, 3 additional jet")
+    plt.legend()
+    plt.xlabel(r'dij [GeV]')
+    plt.yscale("log")
+    plt.xscale("log")
+    plt.savefig('2to3.pdf',dpi=200)
+    plt.clf()
+    hep.style.use(hep.style.CMS) # For now ROOT defaults to CMS
+    hep.cms.text('Simulation')
+    plt.hist(dij0_4, weights=w[add_part==0], bins=binsi, histtype="step", label=r"$3 \rightarrow 4$, 0 additional jets")
+    plt.hist(dij1_4, weights=w[add_part==1], bins=binsi, histtype="step", label=r"$3 \rightarrow 4$, 1 additional jet")
+    plt.hist(dij2_4, weights=w[add_part==2], bins=binsi, histtype="step", label=r"$3 \rightarrow 4$, 2 additional jet")
+    plt.hist(dij3_4, weights=w[add_part==3], bins=binsi, histtype="step", label=r"$3 \rightarrow 4$, 3 additional jet")
+    plt.legend()
+    plt.xlabel(r'dij [GeV]')
+    plt.yscale("log")
+    plt.xscale("log")
+    plt.savefig('3to4.pdf',dpi=200)
+
+if __name__== '__main__':
+        main()

From d1ebab552c7bb70a08736a50151f4cd8da9b823e Mon Sep 17 00:00:00 2001
From: Joern Bach <jobach18@googlemail.com>
Date: Mon, 17 Jul 2023 17:24:55 +0200
Subject: [PATCH 2/5] columnar computation to speed things up

---
 bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py | 125 ++++++++++---------
 1 file changed, 64 insertions(+), 61 deletions(-)

diff --git a/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py b/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
index e3512e382cbe..8293fd21af60 100644
--- a/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
+++ b/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
@@ -1,9 +1,7 @@
+''' 
+The script calculates the DJR differentially in the number of additional jets (taken as additional particles in the LHEpart)
+It uses the scikit-hep package of fastjet to calculate the clustering 
 '''
-script to calculate the DRJ differentially in the number of
- additional LHE Particles (based on DY + x Particles)
-'''
-
-import awkward as ak
 import argparse
 import os
 import warnings
@@ -17,73 +15,36 @@
 from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
 warnings.filterwarnings("ignore", message="Missing cross-reference index")
 import fastjet
+import uproot
+import vector 
+import awkward as ak
 
 
 def retrieve_4arr(dict_dij,add_jet):
-    dij1 = [item[0] for item in dict_dij[add_jet]]
-    dij2 = [item[1] for item in dict_dij[add_jet]]
-    dij3 = [item[2] for item in dict_dij[add_jet]]
-    dij4 = [item[3] for item in dict_dij[add_jet]]
-    return dij1, dij2, dij3, dij4
+    '''
+    function to extract the dij from the list into arrays
+    '''
+    dij = dict_dij[add_jet]
+    return dij[0], dij[1], dij[2], dij[3]
 
-def main():
-    parser = argparse.ArgumentParser()
-    #
-    parser.add_argument("-i", "--Input", help= "Input File")
-    parser.add_argument('-vtau', '--VetoTau', help='Veto events with taus in LHE', default=False)
-    parser.add_argument('-s', '--SaveResults', help='enables saving the files as npy files', default=True)
-    args = parser.parse_args()
-    fname = args.Input
-    events = NanoEventsFactory.from_root(
-        fname,schemaclass=NanoAODSchema.v7,
-        metadata={"dataset": "DYJets"},
-    ).events()
-    #veto all tau events
-    if args.VetoTau:
-        events = events[np.all(np.abs(events["LHEPart"].pdgId) != 15, axis=1)]
-    w = events["genWeight"]
-    add_part = list()
-    #count how many additional jets each event has 
-    for ev in events["LHEPart"]:
-        add_part.append(len(ev[ev.status==1])-2)
-    add_part = np.array(add_part)
-    dijs = dict()
-    for i in range(4):
-        mindijs = list()
-        for ev  in tqdm(events["GenPart"][np.where(add_part==i)]):
-                #select (pseudo)particles with status stable, pt>0.01 GeV and rule out the leptons from the DY directly
-                ev = ev[((ev.status==1)& (ev.pt>0.01)) & (np.abs(ev.pdgId) != 11)& (np.abs(ev.pdgId) != 13)& (np.abs(ev.pdgId) != 15)]
-                #print(f'length of event:{(len(ev_prompt))}')
-                fjet_in = [fastjet.PseudoJet(part.px, part.py, part.pz, part.E) for part in ev]
-                #kt, antikt and Radius definition
-                jetdef = fastjet.JetDefinition(fastjet.kt_algorithm, 0.4)
-                cl_alg = fastjet.ClusterSequence(fjet_in, jetdef)
-                mindijs.append([cl_alg.exclusive_dmerge( 0),cl_alg.exclusive_dmerge( 1),cl_alg.exclusive_dmerge( 2),cl_alg.exclusive_dmerge( 3)])
-                dijs[i] = mindijs
 
-    if args.SaveResults:
-        np.save('dijs.npy', dijs) 
-        we_np = ak.to_numpy(w)
-        np.save('weights.npy', we_np)
-        np.save('addpart.npy', add_part)
+def plot(dijs, w, add_part):
+    '''
+    plotting the dijs in 4 plots, separated by the DJR (0to1, 1to2, ...)
+    '''
     #for dijx_y: x: number of additional jets, y:dij at merge for yth jet 
     dij0_1, dij0_2, dij0_3, dij0_4 = retrieve_4arr(dijs, 0)
     dij1_1, dij1_2, dij1_3, dij1_4 = retrieve_4arr(dijs, 1)
     dij2_1, dij2_2, dij2_3, dij2_4 = retrieve_4arr(dijs, 2)
     dij3_1, dij3_2, dij3_3, dij3_4 = retrieve_4arr(dijs, 3)
-
-
     hep.style.use(hep.style.CMS) 
     hep.cms.text('Simulation')
-    binsi = np.logspace(1e-2,5,70)
-    efficiencies = [len(dij0_1), len(dij0_2), len(dij0_3), len(dij0_4),
-                    len(dij1_1), len(dij1_2),len(dij1_3),len(dij1_4),
-                    len(dij2_1), len(dij2_2), len(dij2_3), len(dij2_4),
-                    len(dij3_1),len( dij3_2), len(dij3_3), len(dij3_4)]
-    efficiencies = np.array(efficiencies) / ev_per_job
-    #print(efficiencies)
-
-
+    binsi = np.logspace(1,7,70)
+    efficiencies = [len(dij0_1)/len(w), len(dij0_2/len(w)), len(dij0_3/len(w)), len(dij0_4/len(w)),
+                    len(dij1_1/len(w)), len(dij1_2/len(w)),len(dij1_3/len(w)),len(dij1_4/len(w)),
+                    len(dij2_1/len(w)), len(dij2_2/len(w)), len(dij2_3/len(w)), len(dij2_4/len(w)),
+                    len(dij3_1/len(w)),len( dij3_2/len(w)), len(dij3_3/len(w)), len(dij3_4/len(w))]
+    efficiencies = np.array(efficiencies) 
     plt.hist(dij0_1, weights=w[add_part==0], bins=binsi, histtype="step", label=r"$0 \rightarrow 1$, 0 additional jets, eff: "+str(round(efficiencies[0],3)*100))
     plt.hist(dij1_1, weights=w[add_part==1], bins=binsi, histtype="step", label=r"$0 \rightarrow 1$, 1 additional jet, eff: "+str(round(efficiencies[4],3)*100))
     plt.hist(dij2_1, weights=w[add_part==2], bins=binsi, histtype="step", label=r"$0 \rightarrow 1$, 2 additional jets, eff: "+str(round(efficiencies[8],3)*100))
@@ -128,7 +89,49 @@ def main():
     plt.xlabel(r'dij [GeV]')
     plt.yscale("log")
     plt.xscale("log")
-    plt.savefig('3to4.pdf',dpi=200)
+    plt.savefig('3to4.pdf', dpi=200)
+
+def main():
+    vector.register_awkward()
+    parser = argparse.ArgumentParser()
+    #input file
+    parser.add_argument("-i", "--Input", help= "Input File")
+    parser.add_argument('-vtau', '--Veto_Tau', help='Veto tau events in the LHEpart?', default=False)
+    parser.add_argument('-s', '--SaveResults', help='enables saving the files as npy files', default=True)
+    args = parser.parse_args()
+    fname = args.Input
+    with uproot.open(fname) as file:
+        events = file['Events;1'].arrays()
+    #veto all tau events
+    veto_tau = False
+    if veto_tau:
+        events = events[np.all(np.abs(events.LHEPart_pdgId) != 15, axis=1)]
+    w = events.Generator_weight
+    #count how many additional jets each event has 
+    #add_part = [len(ev[ev.status==1])-2 for ev in events['LHEPart']]
+    add_part = events.LHE_Njets  
+    add_part = np.array(add_part)
+    dijs = dict()
+    #loop over the cases of extra jets 
+    for i in range(4):
+        ev_per_add_jet = events[add_part==i]
+        #ev_per_add_jet = ev_per_add_jet[((ev_per_add_jet.GenPart_status==1)& (ev_per_add_jet.GenPart_pt>0.01)) & (np.abs(ev_per_add_jet.GenPart_pdgId) != 11)&
+        #                                 (np.abs(ev_per_add_jet.GenPart_pdgId) != 13)& (np.abs(ev_per_add_jet.GenPart_pdgId) != 15)]
+        jetdef = fastjet.JetDefinition(fastjet.kt_algorithm, 1)
+        data = ak.zip({'pt':ev_per_add_jet.GenPart_pt, 'eta': ev_per_add_jet.GenPart_eta, 'phi': ev_per_add_jet.GenPart_phi, 'M': ev_per_add_jet.GenPart_mass},with_name="Momentum4D",)
+        cl_alg = fastjet.ClusterSequence(data, jetdef)
+        dijs[i] =  [cl_alg.exclusive_dmerge_max( 0), cl_alg.exclusive_dmerge_max( 1),cl_alg.exclusive_dmerge_max( 2),
+                    cl_alg.exclusive_dmerge_max( 4)]
+    plot(dijs, w, add_part)
+    if args.SaveResults:
+        np.save('dijs.npy', dijs) 
+        we_np = ak.to_numpy(w)
+        np.save('weights.npy', we_np)
+        np.save('addpart.npy', add_part)
+
+
+
+
 
 if __name__== '__main__':
         main()

From d7bb8f8181cb8c8beac0db861e1932a792a7ff4b Mon Sep 17 00:00:00 2001
From: Joern Bach <jobach18@googlemail.com>
Date: Mon, 17 Jul 2023 17:33:37 +0200
Subject: [PATCH 3/5] fixed a typo

---
 bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py b/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
index 8293fd21af60..e07992d292ca 100644
--- a/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
+++ b/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
@@ -103,8 +103,7 @@ def main():
     with uproot.open(fname) as file:
         events = file['Events;1'].arrays()
     #veto all tau events
-    veto_tau = False
-    if veto_tau:
+    if args.Veto_Tau:
         events = events[np.all(np.abs(events.LHEPart_pdgId) != 15, axis=1)]
     w = events.Generator_weight
     #count how many additional jets each event has 

From 1ea5eeebd177b9d689828c75742fc5e00d0f6ed1 Mon Sep 17 00:00:00 2001
From: Joern Bach <jobach18@googlemail.com>
Date: Mon, 17 Jul 2023 17:34:53 +0200
Subject: [PATCH 4/5] cleanup

---
 bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py b/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
index e07992d292ca..f1a8dcae80c2 100644
--- a/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
+++ b/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
@@ -107,15 +107,12 @@ def main():
         events = events[np.all(np.abs(events.LHEPart_pdgId) != 15, axis=1)]
     w = events.Generator_weight
     #count how many additional jets each event has 
-    #add_part = [len(ev[ev.status==1])-2 for ev in events['LHEPart']]
     add_part = events.LHE_Njets  
     add_part = np.array(add_part)
     dijs = dict()
     #loop over the cases of extra jets 
     for i in range(4):
         ev_per_add_jet = events[add_part==i]
-        #ev_per_add_jet = ev_per_add_jet[((ev_per_add_jet.GenPart_status==1)& (ev_per_add_jet.GenPart_pt>0.01)) & (np.abs(ev_per_add_jet.GenPart_pdgId) != 11)&
-        #                                 (np.abs(ev_per_add_jet.GenPart_pdgId) != 13)& (np.abs(ev_per_add_jet.GenPart_pdgId) != 15)]
         jetdef = fastjet.JetDefinition(fastjet.kt_algorithm, 1)
         data = ak.zip({'pt':ev_per_add_jet.GenPart_pt, 'eta': ev_per_add_jet.GenPart_eta, 'phi': ev_per_add_jet.GenPart_phi, 'M': ev_per_add_jet.GenPart_mass},with_name="Momentum4D",)
         cl_alg = fastjet.ClusterSequence(data, jetdef)

From 056903c61d66fdc9da74aa808825451ffc3d3d77 Mon Sep 17 00:00:00 2001
From: Joern Bach <jobach18@googlemail.com>
Date: Wed, 19 Jul 2023 12:06:39 +0200
Subject: [PATCH 5/5] deleted unnecessary coffea dependece

---
 bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py b/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
index f1a8dcae80c2..7896fe3b63fd 100644
--- a/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
+++ b/bin/MadGraph5_aMCatNLO/macros/fastjet_dij.py
@@ -5,14 +5,11 @@
 import argparse
 import os
 import warnings
-import coffea
 import numpy as np
 import matplotlib.pyplot as plt
 import mplhep as hep
 from tqdm import tqdm
-#from coffea.nanoevents.methods import vector  
 from pathlib import Path
-from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
 warnings.filterwarnings("ignore", message="Missing cross-reference index")
 import fastjet
 import uproot