From 1334e971b94efab97199cbda17416d4121aeb8bf Mon Sep 17 00:00:00 2001
From: Michael Clifford <mcliffor@redhat.com>
Date: Wed, 16 Oct 2024 18:27:49 -0400
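Subject: [PATCH] Update knowledge_*_task.yaml files with the correct test path

Parse each knowledge_*_task.yaml file with PyYAML and point
dataset_kwargs.data_files.test at the directory that contains the task
file, instead of rewriting 'test:' lines with a regex. Also change the
`tasks` argument of the standalone run_final_eval_op invocation from
/data/generated to /data/data.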

Signed-off-by: Michael Clifford <mcliffor@redhat.com>
---
 eval/final/components.py | 39 ++++++++++++++++++---------------------
 pipeline.py              |  2 +-
 pipeline.yaml            | 31 +++++++++++++++----------------
 standalone/standalone.py | 32 +++++++++++---------------------
 4 files changed, 45 insertions(+), 59 deletions(-)

diff --git a/eval/final/components.py b/eval/final/components.py
index 9608e57f..ca64a5b3 100644
--- a/eval/final/components.py
+++ b/eval/final/components.py
@@ -199,32 +199,29 @@ def branch_eval_summary_to_json(
     # model evaluation are taking place in separate environments.
     def update_test_lines_in_files(base_dir):
         import os
-        import re
 
-        # Define the regex to match lines starting with any indentation, 'test:', and containing 'node_datasets_*'
-        regex = re.compile(r"(\s*test:\s*).*/(node_datasets_[^/]*)(.*)")
+        import yaml
 
         for root, dirs, files in os.walk(base_dir):
             for file_name in files:
-                file_path = os.path.join(root, file_name)
-
-                with open(file_path, "r") as file:
-                    lines = file.readlines()
-
-                updated_lines = []
-                changed = False
-
-                for line in lines:
-                    # Replace the matched line with the desired format, keeping 'test:' and leading whitespace intact
-                    new_line = re.sub(regex, rf"\1{base_dir}/\2\3", line)
-                    if new_line != line:
-                        changed = True  # Only rewrite the file if there's a change
-                    updated_lines.append(new_line)
-
-                if changed:
+                if file_name.startswith("knowledge_") and file_name.endswith(
+                    "_task.yaml"
+                ):
+                    file_path = os.path.join(root, file_name)
+
+                    with open(file_path, "r") as file:
+                        task_yaml = yaml.load(file, Loader=yaml.Loader)
+
+                    current_test_file_path = task_yaml["dataset_kwargs"]["data_files"][
+                        "test"
+                    ]
+                    current_test_file_path_parts = current_test_file_path.split("/")
+                    new_test_file_path = f"{root}/{current_test_file_path_parts[-1]}"
+                    task_yaml["dataset_kwargs"]["data_files"]["test"] = (
+                        new_test_file_path
+                    )
                     with open(file_path, "w", encoding="utf-8") as file:
-                        file.writelines(updated_lines)
-                    print(f"Updated: {file_path}")
+                        yaml.dump(task_yaml, file)
 
     # find_node_dataset_directories to find sdg output node_datasets_*
     def find_node_dataset_directories(base_dir: str):
diff --git a/pipeline.py b/pipeline.py
index a97cfeed..6f4a85f6 100644
--- a/pipeline.py
+++ b/pipeline.py
@@ -447,7 +447,7 @@ def gen_standalone():
         "exec-git-clone-op": {},
         "exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")',
         "exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="/data/mt-bench-best.txt",mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/phase_2/hf_format", models_path_prefix="/data/model/output/phase_2/hf_format", max_workers="auto", merge_system_user_message=False)',
-        "exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",
+        "exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/data', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",
     }
 
     details = {}
diff --git a/pipeline.yaml b/pipeline.yaml
index fe6e46d7..df7cddd3 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -1104,22 +1104,21 @@ deploymentSpec:
           \ MMLU_BRANCH\n\n    # This is very specific to 'ilab generate', necessary\
           \ because the data generation and\n    # model evaluation are taking place\
           \ in separate environments.\n    def update_test_lines_in_files(base_dir):\n\
-          \        import os\n        import re\n\n        # Define the regex to match\
-          \ lines starting with any indentation, 'test:', and containing 'node_datasets_*'\n\
-          \        regex = re.compile(r\"(\\s*test:\\s*).*/(node_datasets_[^/]*)(.*)\"\
-          )\n\n        for root, dirs, files in os.walk(base_dir):\n            for\
-          \ file_name in files:\n                file_path = os.path.join(root, file_name)\n\
-          \n                with open(file_path, \"r\") as file:\n               \
-          \     lines = file.readlines()\n\n                updated_lines = []\n \
-          \               changed = False\n\n                for line in lines:\n\
-          \                    # Replace the matched line with the desired format,\
-          \ keeping 'test:' and leading whitespace intact\n                    new_line\
-          \ = re.sub(regex, rf\"\\1{base_dir}/\\2\\3\", line)\n                  \
-          \  if new_line != line:\n                        changed = True  # Only\
-          \ rewrite the file if there's a change\n                    updated_lines.append(new_line)\n\
-          \n                if changed:\n                    with open(file_path,\
-          \ \"w\", encoding=\"utf-8\") as file:\n                        file.writelines(updated_lines)\n\
-          \                    print(f\"Updated: {file_path}\")\n\n    # find_node_dataset_directories\
+          \        import os\n\n        import yaml\n\n        for root, dirs, files\
+          \ in os.walk(base_dir):\n            for file_name in files:\n         \
+          \       if file_name.startswith(\"knowledge_\") and file_name.endswith(\n\
+          \                    \"_task.yaml\"\n                ):\n              \
+          \      file_path = os.path.join(root, file_name)\n\n                   \
+          \ with open(file_path, \"r\") as file:\n                        task_yaml\
+          \ = yaml.load(file, Loader=yaml.Loader)\n\n                    current_test_file_path\
+          \ = task_yaml[\"dataset_kwargs\"][\"data_files\"][\n                   \
+          \     \"test\"\n                    ]\n                    current_test_file_path_parts\
+          \ = current_test_file_path.split(\"/\")\n                    new_test_file_path\
+          \ = f\"{root}/{current_test_file_path_parts[-1]}\"\n                   \
+          \ task_yaml[\"dataset_kwargs\"][\"data_files\"][\"test\"] = (\n        \
+          \                new_test_file_path\n                    )\n           \
+          \         with open(file_path, \"w\", encoding=\"utf-8\") as file:\n   \
+          \                     yaml.dump(task_yaml, file)\n\n    # find_node_dataset_directories\
           \ to find sdg output node_datasets_*\n    def find_node_dataset_directories(base_dir:\
           \ str):\n        import os\n        import re\n\n        # This is specific\
           \ to ilab/eval output\n        pattern = r\"node_datasets_\"\n        matching_dirs\
diff --git a/standalone/standalone.py b/standalone/standalone.py
index dd3e5936..5b6e80be 100755
--- a/standalone/standalone.py
+++ b/standalone/standalone.py
@@ -1910,32 +1910,22 @@ def branch_eval_summary_to_json(
     # model evaluation are taking place in separate environments.
     def update_test_lines_in_files(base_dir):
         import os
-        import re
-
-        # Define the regex to match lines starting with any indentation, 'test:', and containing 'node_datasets_*'
-        regex = re.compile(r"(\s*test:\s*).*/(node_datasets_[^/]*)(.*)")
+        import yaml
 
         for root, dirs, files in os.walk(base_dir):
             for file_name in files:
-                file_path = os.path.join(root, file_name)
-
-                with open(file_path, "r") as file:
-                    lines = file.readlines()
-
-                updated_lines = []
-                changed = False
+                if file_name.startswith("knowledge_") and file_name.endswith("_task.yaml"):
+                    file_path = os.path.join(root, file_name)
 
-                for line in lines:
-                    # Replace the matched line with the desired format, keeping 'test:' and leading whitespace intact
-                    new_line = re.sub(regex, rf"\1{base_dir}/\2\3", line)
-                    if new_line != line:
-                        changed = True  # Only rewrite the file if there's a change
-                    updated_lines.append(new_line)
+                    with open(file_path, "r") as file:
+                        task_yaml = yaml.load(file, Loader=yaml.Loader)
 
-                if changed:
+                    current_test_file_path = task_yaml["dataset_kwargs"]["data_files"]["test"]
+                    current_test_file_path_parts = current_test_file_path.split("/")
+                    new_test_file_path = f"{root}/{current_test_file_path_parts[-1]}"
+                    task_yaml["dataset_kwargs"]["data_files"]["test"] = new_test_file_path
                     with open(file_path, "w", encoding="utf-8") as file:
-                        file.writelines(updated_lines)
-                    print(f"Updated: {file_path}")
+                        yaml.dump(task_yaml, file)
 
     # find_node_dataset_directories to find sdg output node_datasets_*
     def find_node_dataset_directories(base_dir: str):
@@ -2180,7 +2170,7 @@ def find_node_dataset_directories(base_dir: str):
         json.dump(mt_bench_branch_data, f, indent=4)
 """
     exec_run_final_eval_op_args = """
-run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
+run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/data', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
 """
 
     if eval_type == "mt-bench":