Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add dataset name checking script #3802

Merged
merged 1 commit into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 119 additions & 0 deletions bin/utils/check_dataset_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import re

def validate_block(name, pattern, block_name):
    """Validate one block of a dataset name against a regular expression.

    The whole block must match ``pattern`` (``re.fullmatch``); a block that
    merely starts with a valid prefix, e.g. ``TuneCP5junk``, is rejected.

    Args:
        name: Text of the block being checked.
        pattern: Regular expression describing the block.  When
            ``block_name`` is ``"ME-PS"`` the pattern must define at least
            three capture groups, with the matrix-element generator in
            group 2 and the parton shower in group 3 (both are ``None`` for
            a standalone generator).
        block_name: Label of the block.  ``"ME-PS"`` enables the extra
            ME/PS consistency check; any other value is only used to build
            the feedback message.

    Returns:
        A ``(is_valid, message)`` tuple.
    """
    match = re.fullmatch(pattern, name)

    if block_name == "ME-PS":
        if not match:
            return False, "Invalid ME-PS format"

        # Group 1 is the outer alternation (the entire block), so the ME part
        # has to be read from group 2 for the comparison below to be
        # meaningful.
        me = match.group(2)
        ps = match.group(3)  # Only set for ME-PS combinations

        if ps and me == ps:
            return False, "ME and PS cannot be the same"

        return True, "Valid ME-PS block"

    if match:
        return True, f"{block_name} block is valid."
    return False, f"Invalid {block_name} block."

def validate_dataset_name(dataset_name):
    """Check a dataset name block by block.

    The name is split on ``_`` and the resulting blocks are expected in this
    order::

        PROCESS[_Bin-...][_Fil-...][_Par-...]_TUNE_BEAME_ME-PS

    PROCESS, TUNE, BEAME and ME-PS are mandatory; the ``Bin-``, ``Fil-`` and
    ``Par-`` blocks are optional and recognised by their prefix.  Each block
    is checked with ``validate_block``.  An empty PROCESS block and any block
    left over after ME-PS make the name invalid.

    Args:
        dataset_name: The full dataset name string.

    Returns:
        A ``(is_valid, summary, feedback)`` tuple where ``summary`` is
        ``"Valid dataset name"`` or ``"Invalid dataset name"`` and
        ``feedback`` is the list of per-block messages in block order.
    """
    process_pattern = r".*"

    # (label, required prefix, pattern) for the optional blocks, in the order
    # in which they may appear.
    optional_blocks = (
        ("BINNING", "Bin-", r"Bin-[\w-]+"),
        ("FILTER", "Fil-", r"Fil-[\w-]+"),
        ("PARAMETERS", "Par-", r"Par-[\w-]+"),
    )

    # Group 1: whole block, group 2: matrix-element generator,
    # group 3: parton shower (groups 2 and 3 are unset for a standalone
    # generator such as "pythia8").
    me_ps_pattern = (
        r"(pythia6|pythia8|pythia8-evtgen|herwig6|herwigpp|herwig7|sherpa|"
        r"(madgraph|madgraphMLM|amcatnloFXFX|madgraph-madspin|madgraphMLM-madspin|"
        r"amcatnloFXFX-madspin|amcatnlo|amcatnlo-madspin|alpgen|mcatnlo|powheg|"
        r"powheg-madspin|powheg-JHUGenV\d*|powheg-minlo|powheg-minnlo|powheg-minlo-JHUGenV\d*|"
        r"powheg-minnlo-JHUGen\d*|JHUGen|hardcol|bcvegpy2)"
        r"-(pythia6|pythia8|herwig6|herwigpp|herwig7))"
    )

    # (label, pattern) for the mandatory trailing blocks, in order.
    mandatory_blocks = (
        ("TUNE", r"TuneCP[1-5]"),  # TuneCP1 to TuneCP5
        ("BEAME", r"13p6TeV|\d+TeV|\d+GeV"),
        ("ME-PS", me_ps_pattern),
    )

    blocks = dataset_name.split('_')
    feedback = []
    valid = True

    # PROCESS: str.split always yields at least one element, so "missing"
    # means the first block is the empty string.
    if blocks[0]:
        ok, msg = validate_block(blocks[0], process_pattern, "PROCESS")
        feedback.append(msg)
        valid = valid and ok
    else:
        feedback.append("Missing PROCESS block.")
        valid = False

    current_index = 1

    # Optional blocks are consumed only when the next block carries the
    # expected prefix; otherwise the same block is tried against the next
    # candidate.
    for label, prefix, pattern in optional_blocks:
        if len(blocks) > current_index and blocks[current_index].startswith(prefix):
            ok, msg = validate_block(blocks[current_index], pattern, label)
            feedback.append(msg)
            valid = valid and ok
            current_index += 1
        else:
            feedback.append(f"{label} block is missing or optional.")

    # Mandatory blocks are consumed positionally.
    for label, pattern in mandatory_blocks:
        if len(blocks) > current_index:
            ok, msg = validate_block(blocks[current_index], pattern, label)
            feedback.append(msg)
            valid = valid and ok
            current_index += 1
        else:
            feedback.append(f"Missing {label} block.")
            valid = False

    # Nothing may follow the ME-PS block.
    extra_blocks = blocks[current_index:]
    if extra_blocks:
        feedback.append(
            f"Unexpected extra block(s) after ME-PS: {', '.join(extra_blocks)}"
        )
        valid = False

    summary = "Valid dataset name" if valid else "Invalid dataset name"
    return valid, summary, feedback

29 changes: 29 additions & 0 deletions bin/utils/test_dataset_names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Import only the name this script uses instead of a wildcard import, so the
# script's namespace stays explicit.
from check_dataset_names import validate_dataset_name

# Sample dataset names to run through the checker.  All but one follow the
# PROCESS_Bin-..._TUNE_BEAME_ME-PS layout.
test_cases = [
    "DYto2L-4Jets_Bin-MLL-4to10_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-4Jets_Bin-MLL-10to50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-2Jets_Bin-MLL-4to10_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
    "DYto2L-2Jets_Bin-MLL-10to50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
    "DYto2L-4Jets_Bin-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-2Jets_Bin-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
    "DYto2L-4Jets_Bin-0J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-4Jets_Bin-1J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-4Jets_Bin-2J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-4Jets_Bin-3J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-4Jets_Bin-4J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
    "DYto2L-2Jets_Bin-0J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
    "DYto2L-2Jets_Bin-1J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
    # THIS IS NOT A VALID NAME: the second block lacks the "Bin-" prefix.
    "DYto2L-2Jets_2J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
    "DYto2L-2Jets_Bin-2J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
]

# Validate the test cases and print the verdict for each; the per-block
# feedback is listed only for names that fail.
for name in test_cases:
    valid, message, feedback = validate_dataset_name(name)
    print(f"Dataset: {name} -> {message}")
    if not valid:
        for item in feedback:
            print(f" - {item}")

    # Blank separator after each dataset's report.
    print("\n")