def findMaxLengthList(lst):
    """
    Return the length of the longest element of ``lst``.

    Args:
        lst (Sequence[Sized]): sequence whose elements support ``len`` (nested lists,
          rows of a 2D array, ...).

    Returns:
        int: the largest ``len(x)`` over the elements of ``lst``; 0 if ``lst`` is empty.
    """
    # ``default`` keeps ``max`` from raising ValueError when ``lst`` has no elements.
    return max((len(x) for x in lst), default=0)


def zero_padding(lst):
    """
    Convert a ragged sequence of sequences into a rectangular 2D NumPy array.

    Every element of ``lst`` is copied into a list and right-padded with zeros up to
    the length of the longest element.

    Args:
        lst (Iterable[Iterable]): ragged input, e.g. a list of per-jet value arrays.

    Returns:
        np.ndarray: array of shape ``(len(lst), longest element length)``;
        an empty ``(0, 0)`` array if ``lst`` is empty.
    """
    rows = [list(sub_list) for sub_list in lst]
    if not rows:
        # Nothing to pad; return an empty 2D array instead of failing inside ``max``.
        return np.zeros((0, 0))

    width = findMaxLengthList(rows)
    return np.array(
        [np.pad(row, (0, width - len(row)), "constant", constant_values=0) for row in rows]
    )
class JetClass(JetDataset):
    """
    PyTorch ``torch.utils.data.Dataset`` class for the JetClass dataset.
    If root files are not found in the ``data_dir`` directory then dataset will be downloaded
    from Zenodo (https://zenodo.org/record/6619768).

    Args:
        jet_type (Union[str, Set[str]], optional): individual type or set of types out of
          ("HtoBB", "HtoCC", "HtoGG", "HtoWW", "HtoWW2Q1L", "HtoWW4Q", "TTBar", "TTBarLep",
          "WtoQQ", "ZJetstoNuNu", "ZtoQQ"). "all" will get all types. Defaults to "all".
        data_dir (str, optional): directory in which data is (to be) stored. Defaults to "./".
        particle_features (List[str], optional): list of particle features to retrieve. If empty
          or None, gets no particle features. Defaults to
          ``["part_px", "part_py", "part_pz", "part_energy", "part_deta", "part_dphi",
          "part_d0val", "part_d0err", "part_dzval", "part_dzerr", "part_charge",
          "part_isChargedHadron", "part_isNeutralHadron", "part_isPhoton", "part_isElectron",
          "part_isMuon"]``.
        jet_features (List[str], optional): list of jet features to retrieve. If empty or None,
          gets no jet features. Defaults to
          ``["jet_pt", "jet_eta", "jet_phi", "jet_energy", "jet_nparticles", "jet_sdmass",
          "jet_tau1", "jet_tau2", "jet_tau3", "jet_tau4"]``.
        split (str, optional): stored on the instance as ``self.split``; this class does not
          itself slice the data by split. Defaults to "train".
        split_fraction (List[float], optional): stored on the instance as
          ``self.split_fraction``. Defaults to [0.7, 0.15, 0.15].
        seed (int, optional): accepted for interface compatibility; not used by this class.
          Defaults to 42.
    """

    zenodo_record_id = 6619768

    jet_type = [
        "HtoBB",
        "HtoCC",
        "HtoGG",
        "HtoWW",
        "HtoWW2Q1L",
        "HtoWW4Q",
        "TTBar",
        "TTBarLep",
        "WtoQQ",
        "ZJetstoNuNu",
        "ZtoQQ",
    ]
    all_particle_features = [
        "part_px",
        "part_py",
        "part_pz",
        "part_energy",
        "part_deta",
        "part_dphi",
        "part_d0val",
        "part_d0err",
        "part_dzval",
        "part_dzerr",
        "part_charge",
        "part_isChargedHadron",
        "part_isNeutralHadron",
        "part_isPhoton",
        "part_isElectron",
        "part_isMuon",
    ]
    all_jet_features = [
        "jet_pt",
        "jet_eta",
        "jet_phi",
        "jet_energy",
        "jet_nparticles",
        "jet_sdmass",
        "jet_tau1",
        "jet_tau2",
        "jet_tau3",
        "jet_tau4",
    ]
    splits = ["train", "valid", "test", "all"]

    # The particle matrix used to be seeded with a 136-column placeholder row, which made
    # 136 the minimum width of the returned array; kept so the returned shape is unchanged.
    _min_particle_columns = 136

    def __init__(
        self,
        jet_type: Union[str, Set[str]] = "all",
        data_dir: str = "./",
        particle_features: List[str] = all_particle_features,
        jet_features: List[str] = all_jet_features,
        split: str = "train",
        split_fraction: List[float] = [0.7, 0.15, 0.15],
        seed: int = 42,
    ):
        # ``getData`` returns (jet data, particle data) - unpack in that order.
        self.jet_data, self.particle_data = self.getData(
            jet_type, data_dir, particle_features, jet_features
        )

        super().__init__(
            data_dir=data_dir,
            particle_features=particle_features,
            jet_features=jet_features,
        )
        self.split = split
        self.split_fraction = split_fraction

    @classmethod
    def _normalise_jet_types(cls, jet_type):
        """Expand "all" / wrap a single type name so jet types are always a list of names."""
        if isinstance(jet_type, str):
            # Iterating a bare string would yield single characters, not type names.
            return list(cls.jet_type) if jet_type == "all" else [jet_type]
        return list(jet_type)

    @staticmethod
    def _read_root_file(path, jet_features, particle_features):
        """Read the requested branches of the ``tree`` object stored in one ROOT file.

        Returns:
            Tuple[Optional[np.ndarray], List[np.ndarray]]: a ``(num_jets, num_found_features)``
            array of jet features (``None`` if no requested jet branch exists in the file) and
            one zero-padded 2D array per requested particle branch found, in file branch order.
        """
        wanted_jet = set(jet_features)
        wanted_particle = set(particle_features)
        jet_columns = {}
        particle_blocks = []

        # Context manager so the file handle is released once the branches are read.
        with uproot.open(path) as open_file:
            tree = open_file["tree"]
            for branch_name in tree.keys():
                if branch_name in wanted_jet:
                    jet_columns[branch_name] = np.array(tree[branch_name].array())
                if branch_name in wanted_particle:
                    # Per-jet arrays have varying length; zero-pad into a rectangle.
                    events = list(tree[branch_name].array())
                    if events:
                        particle_blocks.append(zero_padding(events))

        # One column per feature, in the order the caller asked for them.
        ordered = [jet_columns[name] for name in jet_features if name in jet_columns]
        jets = np.column_stack(ordered) if ordered else None
        return jets, particle_blocks

    @classmethod
    def getData(cls, jet_type, data_dir, particle_features, jet_features):
        """
        Downloads JetClass dataset from Zenodo if dataset is not already downloaded in
        user specified data directory. Loads and returns the JetClass data in the form of
        NumPy arrays.

        Args:
            jet_type (Union[str, Set[str]]): individual type or set of types out of
              ("HtoBB", "HtoCC", "HtoGG", "HtoWW", "HtoWW2Q1L", "HtoWW4Q", "TTBar", "TTBarLep",
              "WtoQQ", "ZJetstoNuNu", "ZtoQQ"), or "all" for every type.
            data_dir (str): directory in which data is (to be) stored.
            particle_features (List[str]): list of particle features to retrieve. If empty
              or None, gets no particle features.
            jet_features (List[str]): list of jet features to retrieve. If empty or None,
              gets no jet features.

        Returns:
            Tuple[np.ndarray, np.ndarray]: jet data, particle data.
            Jet data has one row per jet and one column per retrieved jet feature.
            Particle data is the vertical stack of one zero-padded 2D block per
            (file, particle feature), at least 136 columns wide.
        """
        dataset_name = "JetClass Validation Set"
        file_download_name = "Val_5M"
        key = "JetClass_Pythia_val_5M.tar"

        jet_types = cls._normalise_jet_types(jet_type)
        jet_features = list(jet_features or [])
        particle_features = list(particle_features or [])

        file_path = checkDownloadZenodoDataset(
            data_dir, dataset_name, cls.zenodo_record_id, key, file_download_name
        )
        print("Processing Data: ...")

        jet_blocks = []
        particle_blocks = []
        # Sorted so the row order of the result does not depend on directory listing order.
        for jet_file in sorted(os.listdir(file_path)):
            # Match on the file name only, and read each file once even if several requested
            # types match it.
            # NOTE(review): this is a case-sensitive substring match (so "HtoWW" also matches
            # "HtoWW2Q1L" files) - confirm against the file names inside the archive.
            if not any(jet in jet_file for jet in jet_types):
                continue
            jets, particles = cls._read_root_file(
                os.path.join(file_path, jet_file), jet_features, particle_features
            )
            if jets is not None:
                jet_blocks.append(jets)
            particle_blocks.extend(particles)

        if jet_blocks:
            jet_data = np.asarray(np.concatenate(jet_blocks, axis=0), dtype=np.float64)
        else:
            jet_data = np.zeros((0, len(jet_features)))

        # Bring every block to a common width in one pass, then stack once; no assumption
        # about the number of jets per file is needed.
        width = max([cls._min_particle_columns] + [block.shape[1] for block in particle_blocks])
        if particle_blocks:
            particle_data = np.asarray(
                np.vstack(
                    [
                        np.pad(
                            block,
                            ((0, 0), (0, width - block.shape[1])),
                            "constant",
                            constant_values=0,
                        )
                        for block in particle_blocks
                    ]
                ),
                dtype=np.float64,
            )
        else:
            particle_data = np.zeros((0, width))

        return jet_data, particle_data
a/jetnet/datasets/utils.py +++ b/jetnet/datasets/utils.py @@ -2,22 +2,23 @@ Utility methods for datasets. """ from __future__ import annotations +from typing import Set, List, Tuple, Union, Any +from numpy.typing import ArrayLike -import os +import requests import sys +import os from os.path import exists -from typing import Any, List, Set, Tuple, Union import numpy as np -import requests -from numpy.typing import ArrayLike + +import logging def download_progress_bar(file_url: str, file_dest: str): """ Download while outputting a progress bar. Modified from https://sumit-ghosh.com/articles/python-download-progress-bar/ - Args: file_url (str): url to download from file_dest (str): path at which to save downloaded file @@ -48,9 +49,11 @@ def download_progress_bar(file_url: str, file_dest: str): sys.stdout.write("\n") -def checkDownloadZenodoDataset(data_dir: str, dataset_name: str, record_id: int, key: str): +def checkDownloadZenodoDataset( + data_dir: str, dataset_name: str, record_id: int, key: str, file_download_name: str +): """Checks if dataset exists, if not downloads it from Zenodo, and returns the file path""" - file_path = f"{data_dir}/{key}" + file_path = f"{data_dir}/{file_download_name}" if not exists(file_path): os.system(f"mkdir -p {data_dir}") file_url = getZenodoFileURL(record_id, key) @@ -76,12 +79,10 @@ def getOrderedFeatures( data: ArrayLike, features: List[str], features_order: List[str] ) -> np.ndarray: """Returns data with features in the order specified by ``features``. - Args: data (ArrayLike): input data features (List[str]): desired features in order features_order (List[str]): name and ordering of features in input data - Returns: (np.ndarray): data with features in specified order """ @@ -151,13 +152,10 @@ def getSplitting( """ Returns starting and ending index for splitting a dataset of length ``length`` according to the input ``split`` out of the total possible ``splits`` and a given ``split_fraction``. 
def findMaxLengthList(lst):
    """
    Finds the longest sublist in ``lst`` and returns its length.

    Args:
        lst (List): A nested list (or 2D array) containing sublists as its elements.

    Returns:
        int: length of the longest sublist; 0 if ``lst`` has no elements.
    """
    # ``default`` avoids the ValueError ``max`` raises on an empty sequence.
    return max((len(x) for x in lst), default=0)


def zero_padding(lst):
    """
    Takes in a list containing awkward-array (or other iterable) elements, converts each
    element into a list, and pads every sublist with trailing zeros up to the length of the
    longest sublist. Returns the result as a 2D NumPy array.

    Args:
        lst (List): A ragged list that needs to be converted to a NumPy 2D array and needs
          zero padding.

    Returns:
        np.ndarray: array of shape ``(len(lst), length of longest sublist)``;
        an empty ``(0, 0)`` array if ``lst`` is empty.
    """
    sub_lists = [list(sub_list) for sub_list in lst]
    if not sub_lists:
        # No rows at all: return an empty 2D array rather than crashing in ``max``.
        return np.zeros((0, 0))

    max_value = findMaxLengthList(sub_lists)
    padded_list = [
        np.pad(sub_list, (0, max_value - len(sub_list)), "constant", constant_values=0)
        for sub_list in sub_lists
    ]
    return np.array(padded_list)