Skip to content

Commit

Permalink
update *_task.yaml files with correct test path
Browse files Browse the repository at this point in the history
Signed-off-by: Michael Clifford <[email protected]>
  • Loading branch information
MichaelClifford committed Oct 16, 2024
1 parent cfad476 commit 1334e97
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 59 deletions.
39 changes: 18 additions & 21 deletions eval/final/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,32 +199,29 @@ def branch_eval_summary_to_json(
# model evaluation are taking place in separate environments.
def update_test_lines_in_files(base_dir):
    """Re-anchor the 'test' data-file path in every knowledge_*_task.yaml under base_dir.

    Data generation ('ilab generate') and model evaluation run in separate
    environments, so the test-file paths recorded in the generated lm-eval
    task files are stale by evaluation time.  For each matching task file,
    keep only the basename of the recorded path, point it at the directory
    the task file actually lives in, and rewrite the file in place.
    """
    import os

    import yaml

    for root, dirs, files in os.walk(base_dir):
        for file_name in files:
            # Only the sdg-produced knowledge task files need fixing up.
            if file_name.startswith("knowledge_") and file_name.endswith(
                "_task.yaml"
            ):
                file_path = os.path.join(root, file_name)

                # NOTE(review): yaml.Loader can construct arbitrary Python
                # objects; these files come from our own tooling, but
                # yaml.safe_load would be the safer default — confirm.
                with open(file_path, "r") as file:
                    task_yaml = yaml.load(file, Loader=yaml.Loader)

                current_test_file_path = task_yaml["dataset_kwargs"]["data_files"][
                    "test"
                ]
                # Keep the basename only and re-anchor it to this directory.
                current_test_file_path_parts = current_test_file_path.split("/")
                new_test_file_path = f"{root}/{current_test_file_path_parts[-1]}"
                task_yaml["dataset_kwargs"]["data_files"]["test"] = (
                    new_test_file_path
                )
                with open(file_path, "w", encoding="utf-8") as file:
                    yaml.dump(task_yaml, file)

# find_node_dataset_directories to find sdg output node_datasets_*
def find_node_dataset_directories(base_dir: str):
Expand Down
2 changes: 1 addition & 1 deletion pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,7 +447,7 @@ def gen_standalone():
"exec-git-clone-op": {},
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")',
"exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="/data/mt-bench-best.txt",mt_bench_output="/data/mt-bench-results.txt", models_folder="/data/model/output/phase_2/hf_format", models_path_prefix="/data/model/output/phase_2/hf_format", max_workers="auto", merge_system_user_message=False)',
"exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",
"exec-run-final-eval-op": "run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/data', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)",
}

details = {}
Expand Down
31 changes: 15 additions & 16 deletions pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1104,22 +1104,21 @@ deploymentSpec:
\ MMLU_BRANCH\n\n # This is very specific to 'ilab generate', necessary\
\ because the data generation and\n # model evaluation are taking place\
\ in separate environments.\n def update_test_lines_in_files(base_dir):\n\
\ import os\n import re\n\n # Define the regex to match\
\ lines starting with any indentation, 'test:', and containing 'node_datasets_*'\n\
\ regex = re.compile(r\"(\\s*test:\\s*).*/(node_datasets_[^/]*)(.*)\"\
)\n\n for root, dirs, files in os.walk(base_dir):\n for\
\ file_name in files:\n file_path = os.path.join(root, file_name)\n\
\n with open(file_path, \"r\") as file:\n \
\ lines = file.readlines()\n\n updated_lines = []\n \
\ changed = False\n\n for line in lines:\n\
\ # Replace the matched line with the desired format,\
\ keeping 'test:' and leading whitespace intact\n new_line\
\ = re.sub(regex, rf\"\\1{base_dir}/\\2\\3\", line)\n \
\ if new_line != line:\n changed = True # Only\
\ rewrite the file if there's a change\n updated_lines.append(new_line)\n\
\n if changed:\n with open(file_path,\
\ \"w\", encoding=\"utf-8\") as file:\n file.writelines(updated_lines)\n\
\ print(f\"Updated: {file_path}\")\n\n # find_node_dataset_directories\
\ import os\n\n import yaml\n\n for root, dirs, files\
\ in os.walk(base_dir):\n for file_name in files:\n \
\ if file_name.startswith(\"knowledge_\") and file_name.endswith(\n\
\ \"_task.yaml\"\n ):\n \
\ file_path = os.path.join(root, file_name)\n\n \
\ with open(file_path, \"r\") as file:\n task_yaml\
\ = yaml.load(file, Loader=yaml.Loader)\n\n current_test_file_path\
\ = task_yaml[\"dataset_kwargs\"][\"data_files\"][\n \
\ \"test\"\n ]\n current_test_file_path_parts\
\ = current_test_file_path.split(\"/\")\n new_test_file_path\
\ = f\"{root}/{current_test_file_path_parts[-1]}\"\n \
\ task_yaml[\"dataset_kwargs\"][\"data_files\"][\"test\"] = (\n \
\ new_test_file_path\n )\n \
\ with open(file_path, \"w\", encoding=\"utf-8\") as file:\n \
\ yaml.dump(task_yaml, file)\n\n # find_node_dataset_directories\
\ to find sdg output node_datasets_*\n def find_node_dataset_directories(base_dir:\
\ str):\n import os\n import re\n\n # This is specific\
\ to ilab/eval output\n pattern = r\"node_datasets_\"\n matching_dirs\
Expand Down
32 changes: 11 additions & 21 deletions standalone/standalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -1910,32 +1910,22 @@ def branch_eval_summary_to_json(
# model evaluation are taking place in separate environments.
def update_test_lines_in_files(base_dir):
    """Re-anchor the 'test' data-file path in every knowledge_*_task.yaml under base_dir.

    Data generation ('ilab generate') and model evaluation run in separate
    environments, so the test-file paths recorded in the generated lm-eval
    task files are stale by evaluation time.  For each matching task file,
    keep only the basename of the recorded path, point it at the directory
    the task file actually lives in, and rewrite the file in place.
    """
    import os
    import yaml
    for root, dirs, files in os.walk(base_dir):
        for file_name in files:
            # Only the sdg-produced knowledge task files need fixing up.
            if file_name.startswith("knowledge_") and file_name.endswith("_task.yaml"):
                file_path = os.path.join(root, file_name)
                # NOTE(review): yaml.Loader can construct arbitrary Python
                # objects; these files come from our own tooling, but
                # yaml.safe_load would be the safer default — confirm.
                with open(file_path, "r") as file:
                    task_yaml = yaml.load(file, Loader=yaml.Loader)
                current_test_file_path = task_yaml["dataset_kwargs"]["data_files"]["test"]
                # Keep the basename only and re-anchor it to this directory.
                current_test_file_path_parts = current_test_file_path.split("/")
                new_test_file_path = f"{root}/{current_test_file_path_parts[-1]}"
                task_yaml["dataset_kwargs"]["data_files"]["test"] = new_test_file_path
                with open(file_path, "w", encoding="utf-8") as file:
                    yaml.dump(task_yaml, file)
# find_node_dataset_directories to find sdg output node_datasets_*
def find_node_dataset_directories(base_dir: str):
Expand Down Expand Up @@ -2180,7 +2170,7 @@ def find_node_dataset_directories(base_dir: str):
json.dump(mt_bench_branch_data, f, indent=4)
"""
exec_run_final_eval_op_args = """
run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/generated', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
run_final_eval_op(mmlu_branch_output='/data/mmlu-branch-best.txt',mt_bench_branch_output='/data/mt-bench-branch-best.txt',candidate_model='/data/model/output/phase_2/hf_format/candidate_model', taxonomy='/data/taxonomy', tasks='/data/data', base_branch='', candidate_branch='', device=None, base_model_dir='/data/model', max_workers='auto', merge_system_user_message=False, model_dtype='bfloat16', few_shots=5, batch_size=8)
"""

if eval_type == "mt-bench":
Expand Down

0 comments on commit 1334e97

Please sign in to comment.