From 1334e971b94efab97199cbda17416d4121aeb8bf Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Wed, 16 Oct 2024 18:27:49 -0400 Subject: [PATCH] update *_task.yaml files with correct test path Signed-off-by: Michael Clifford --- eval/final/components.py | 39 ++++++++++++++++++--------------------- pipeline.py | 2 +- pipeline.yaml | 31 +++++++++++++++---------------- standalone/standalone.py | 32 +++++++++++--------------------- 4 files changed, 45 insertions(+), 59 deletions(-) diff --git a/eval/final/components.py b/eval/final/components.py index 9608e57f..ca64a5b3 100644 --- a/eval/final/components.py +++ b/eval/final/components.py @@ -199,32 +199,29 @@ def branch_eval_summary_to_json( # model evaluation are taking place in separate environments. def update_test_lines_in_files(base_dir): import os - import re - # Define the regex to match lines starting with any indentation, 'test:', and containing 'node_datasets_*' - regex = re.compile(r"(\s*test:\s*).*/(node_datasets_[^/]*)(.*)") + import yaml for root, dirs, files in os.walk(base_dir): for file_name in files: - file_path = os.path.join(root, file_name) - - with open(file_path, "r") as file: - lines = file.readlines() - - updated_lines = [] - changed = False - - for line in lines: - # Replace the matched line with the desired format, keeping 'test:' and leading whitespace intact - new_line = re.sub(regex, rf"\1{base_dir}/\2\3", line) - if new_line != line: - changed = True # Only rewrite the file if there's a change - updated_lines.append(new_line) - - if changed: + if file_name.startswith("knowledge_") and file_name.endswith( + "_task.yaml" + ): + file_path = os.path.join(root, file_name) + + with open(file_path, "r") as file: + task_yaml = yaml.load(file, Loader=yaml.Loader) + + current_test_file_path = task_yaml["dataset_kwargs"]["data_files"][ + "test" + ] + current_test_file_path_parts = current_test_file_path.split("/") + new_test_file_path = f"{root}/{current_test_file_path_parts[-1]}" + task_yaml["dataset_kwargs"]["data_files"]["test"] = ( + new_test_file_path + ) with open(file_path, "w", encoding="utf-8") as file: - file.writelines(updated_lines) - print(f"Updated: {file_path}") + yaml.dump(task_yaml, file) # find_node_dataset_directories to find sdg output node_datasets_* def find_node_dataset_directories(base_dir: str): diff --git a/pipeline.py b/pipeline.py index a97cfeed..6f4a85f6 100644 --- a/pipeline.py +++ b/pipeline.py @@ -447,7 +447,7 @@ def gen_standalone(): "exec-git-clone-op": {}, "exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")', "exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="/data/mt-bench-best.txt",mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/phase_2/hf_format", models_path_prefix="/data/model/output/phase_2/hf_format", max_workers="auto", merge_system_user_message=False)', - "exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)", + "exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/data', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)", } details = {} diff --git a/pipeline.yaml b/pipeline.yaml index fe6e46d7..df7cddd3 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -1104,22 +1104,21 @@ deploymentSpec: \ MMLU_BRANCH\n\n # This is very specific to 'ilab generate', necessary\ \ because the data generation and\n # model evaluation are taking place\ \ in separate environments.\n def update_test_lines_in_files(base_dir):\n\ - \ import os\n import re\n\n # Define the regex to match\ - \ lines starting with any indentation, 'test:', and containing 'node_datasets_*'\n\ - \ regex = re.compile(r\"(\\s*test:\\s*).*/(node_datasets_[^/]*)(.*)\"\ - )\n\n for root, dirs, files in os.walk(base_dir):\n for\ - \ file_name in files:\n file_path = os.path.join(root, file_name)\n\ - \n with open(file_path, \"r\") as file:\n \ - \ lines = file.readlines()\n\n updated_lines = []\n \ - \ changed = False\n\n for line in lines:\n\ - \ # Replace the matched line with the desired format,\ - \ keeping 'test:' and leading whitespace intact\n new_line\ - \ = re.sub(regex, rf\"\\1{base_dir}/\\2\\3\", line)\n \ - \ if new_line != line:\n changed = True # Only\ - \ rewrite the file if there's a change\n updated_lines.append(new_line)\n\ - \n if changed:\n with open(file_path,\ - \ \"w\", encoding=\"utf-8\") as file:\n file.writelines(updated_lines)\n\ - \ print(f\"Updated: {file_path}\")\n\n # find_node_dataset_directories\ + \ import os\n\n import yaml\n\n for root, dirs, files\ + \ in os.walk(base_dir):\n for file_name in files:\n \ + \ if file_name.startswith(\"knowledge_\") and file_name.endswith(\n\ + \ \"_task.yaml\"\n ):\n \ + \ file_path = os.path.join(root, file_name)\n\n \ + \ with open(file_path, \"r\") as file:\n task_yaml\ + \ = yaml.load(file, Loader=yaml.Loader)\n\n current_test_file_path\ + \ = task_yaml[\"dataset_kwargs\"][\"data_files\"][\n \ + \ \"test\"\n ]\n current_test_file_path_parts\ + \ = current_test_file_path.split(\"/\")\n new_test_file_path\ + \ = f\"{root}/{current_test_file_path_parts[-1]}\"\n \ + \ task_yaml[\"dataset_kwargs\"][\"data_files\"][\"test\"] = (\n \ + \ new_test_file_path\n )\n \ + \ with open(file_path, \"w\", encoding=\"utf-8\") as file:\n \ + \ yaml.dump(task_yaml, file)\n\n # find_node_dataset_directories\ \ to find sdg output node_datasets_*\n def find_node_dataset_directories(base_dir:\ \ str):\n import os\n import re\n\n # This is specific\ \ to ilab/eval output\n pattern = r\"node_datasets_\"\n matching_dirs\ diff --git a/standalone/standalone.py b/standalone/standalone.py index dd3e5936..5b6e80be 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1910,32 +1910,22 @@ def branch_eval_summary_to_json( # model evaluation are taking place in separate environments. def update_test_lines_in_files(base_dir): import os - import re - - # Define the regex to match lines starting with any indentation, 'test:', and containing 'node_datasets_*' - regex = re.compile(r"(\s*test:\s*).*/(node_datasets_[^/]*)(.*)") + import yaml for root, dirs, files in os.walk(base_dir): for file_name in files: - file_path = os.path.join(root, file_name) - - with open(file_path, "r") as file: - lines = file.readlines() - - updated_lines = [] - changed = False + if file_name.startswith("knowledge_") and file_name.endswith("_task.yaml"): + file_path = os.path.join(root, file_name) - for line in lines: - # Replace the matched line with the desired format, keeping 'test:' and leading whitespace intact - new_line = re.sub(regex, rf"\1{base_dir}/\2\3", line) - if new_line != line: - changed = True # Only rewrite the file if there's a change - updated_lines.append(new_line) + with open(file_path, "r") as file: + task_yaml = yaml.load(file, Loader=yaml.Loader) - if changed: + current_test_file_path = task_yaml["dataset_kwargs"]["data_files"]["test"] + current_test_file_path_parts = current_test_file_path.split("/") + new_test_file_path = f"{root}/{current_test_file_path_parts[-1]}" + task_yaml["dataset_kwargs"]["data_files"]["test"] = new_test_file_path with open(file_path, "w", encoding="utf-8") as file: - file.writelines(updated_lines) - print(f"Updated: {file_path}") + yaml.dump(task_yaml, file) # find_node_dataset_directories to find sdg output node_datasets_* def find_node_dataset_directories(base_dir: str): @@ -2180,7 +2170,7 @@ def find_node_dataset_directories(base_dir: str): json.dump(mt_bench_branch_data, f, indent=4) """ exec_run_final_eval_op_args = """ -run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8) +run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/data', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8) """ if eval_type == "mt-bench":