Commit

give mt_bench best_score_file a default name and use it for logging metrics

Signed-off-by: Michael Clifford <[email protected]>
MichaelClifford committed Jan 7, 2025
1 parent 9ad6599 commit 1f4bd14
Showing 3 changed files with 32 additions and 35 deletions.
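
For orientation, a minimal sketch of the handoff this commit standardizes is shown below; the path and JSON keys are taken from the diff, while the helper names are illustrative and not part of the repository.

import json

# Default location introduced by this commit (see eval/mt_bench/components.py).
BEST_SCORE_FILE = "/output/mt_bench_best_data.json"

def write_best_score(best_model: str, best_score: float, path: str = BEST_SCORE_FILE) -> None:
    # run_mt_bench_op side: always persist the winning model and its score.
    with open(path, "w", encoding="utf-8") as f:
        json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4)

def read_best_score(path: str = BEST_SCORE_FILE) -> dict:
    # generate_metrics_report_op side: read the same file back and log its
    # "best_model" and "best_score" fields as pipeline metrics.
    with open(path, "r", encoding="utf-8") as f:
        return json.loads(f.read())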
10 changes: 4 additions & 6 deletions eval/final/components.py
@@ -489,15 +489,13 @@ def find_node_dataset_directories(base_dir: str):
 def generate_metrics_report_op(
     metrics: Output[Metrics],
 ):
-    import ast
     import json
 
-    with open("/output/mt_bench_data.jsonl", "r") as f:
-        mt_bench_data = json.loads(f.readline())
+    with open("/output/mt_bench_best_data.json", "r") as f:
+        mt_bench_data = json.loads(f.read())
 
-    metrics.log_metric("mt_bench_best_model", mt_bench_data["model"])
-    metrics.log_metric("mt_bench_best_score", mt_bench_data["overall_score"])
-    metrics.log_metric("mt_bench_best_model_error_rate", mt_bench_data["error_rate"])
+    metrics.log_metric("mt_bench_best_model", mt_bench_data["best_model"])
+    metrics.log_metric("mt_bench_best_score", mt_bench_data["best_score"])
 
     with open("/output/mt_bench_branch/mt_bench_branch_data.json", "r") as f:
         mt_bench_branch_data = json.loads(f.read())
8 changes: 4 additions & 4 deletions eval/mt_bench/components.py
@@ -16,7 +16,7 @@ def run_mt_bench_op(
     max_workers: str,
     models_folder: str,
     output_path: str = "/output/mt_bench_data.jsonl",
-    best_score_file: Optional[str] = None,
+    best_score_file: str = "/output/mt_bench_best_data.json",
 ) -> NamedTuple("outputs", best_model=str, best_score=float):
     import json
     import os
@@ -195,9 +195,9 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
     outputs = NamedTuple("outputs", best_model=str, best_score=float)
     best_model = max(scores, key=scores.get)
     best_score = scores[best_model]
-    if best_score_file:
-        with open(best_score_file, "w", encoding="utf-8") as f:
-            json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4)
+
+    with open(best_score_file, "w", encoding="utf-8") as f:
+        json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4)
 
     # Rename the best model directory to "candidate_model" for the next step
     # So we know which model to use for the final evaluation
49 changes: 24 additions & 25 deletions pipeline.yaml
@@ -495,6 +495,7 @@ components:
     inputDefinitions:
       parameters:
         best_score_file:
+          defaultValue: /output/mt_bench_best_data.json
           isOptional: true
           parameterType: STRING
         max_workers:
@@ -686,20 +687,18 @@ deploymentSpec:
 '
 - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
 \ *\n\ndef generate_metrics_report_op(\n metrics: Output[Metrics],\n\
-):\n import ast\n import json\n\n with open(\"/output/mt_bench_data.jsonl\"\
-, \"r\") as f:\n mt_bench_data = json.loads(f.readline())\n\n \
-\ metrics.log_metric(\"mt_bench_best_model\", mt_bench_data[\"model\"])\n\
-\ metrics.log_metric(\"mt_bench_best_score\", mt_bench_data[\"overall_score\"\
-])\n metrics.log_metric(\"mt_bench_best_model_error_rate\", mt_bench_data[\"\
-error_rate\"])\n\n with open(\"/output/mt_bench_branch/mt_bench_branch_data.json\"\
-, \"r\") as f:\n mt_bench_branch_data = json.loads(f.read())\n\n\
-\ metrics.log_metric(\"mt_bench_branch_score\", mt_bench_branch_data[\"\
-overall_score\"])\n metrics.log_metric(\n \"mt_bench_branch_base_score\"\
-, mt_bench_branch_data[\"base_overall_score\"]\n )\n\n with open(\"\
-/output/mmlu_branch/mmlu_branch_data.json\", \"r\") as f:\n mmlu_branch_data\
-\ = json.loads(f.read())\n\n metrics.log_metric(\"mmlu_branch_score\"\
-, mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"mmlu_branch_base_score\"\
-, mmlu_branch_data[\"base_model_score\"])\n\n"
+):\n import json\n\n with open(\"/output/mt_bench_best_data.json\"\
+, \"r\") as f:\n mt_bench_data = json.loads(f.read())\n\n metrics.log_metric(\"\
+mt_bench_best_model\", mt_bench_data[\"best_model\"])\n metrics.log_metric(\"\
+mt_bench_best_score\", mt_bench_data[\"best_score\"])\n\n with open(\"\
+/output/mt_bench_branch/mt_bench_branch_data.json\", \"r\") as f:\n \
+\ mt_bench_branch_data = json.loads(f.read())\n\n metrics.log_metric(\"\
+mt_bench_branch_score\", mt_bench_branch_data[\"overall_score\"])\n metrics.log_metric(\n\
+\ \"mt_bench_branch_base_score\", mt_bench_branch_data[\"base_overall_score\"\
+]\n )\n\n with open(\"/output/mmlu_branch/mmlu_branch_data.json\"\
+, \"r\") as f:\n mmlu_branch_data = json.loads(f.read())\n\n metrics.log_metric(\"\
+mmlu_branch_score\", mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"\
+mmlu_branch_base_score\", mmlu_branch_data[\"base_model_score\"])\n\n"
 image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111
 exec-git-clone-op:
 container:
@@ -1449,9 +1448,9 @@
 \ - 'auto'\n # with 'auto', number of gpus allocated for serving is\
 \ calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
 \ max_workers: str,\n models_folder: str,\n output_path: str =\
-\ \"/output/mt_bench_data.jsonl\",\n best_score_file: Optional[str] =\
-\ None,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n\
-\ import json\n import os\n import subprocess\n\n import httpx\n\
+\ \"/output/mt_bench_data.jsonl\",\n best_score_file: str = \"/output/mt_bench_best_data.json\"\
+,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n \
+\ import json\n import os\n import subprocess\n\n import httpx\n\
 \ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\
 \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\
 \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
@@ -1533,14 +1532,14 @@
 \ \"w\", encoding=\"utf-8\") as f:\n for data in all_mt_bench_data:\n\
 \ json.dump(data, f)\n f.write(\"\\n\")\n\n outputs\
 \ = NamedTuple(\"outputs\", best_model=str, best_score=float)\n best_model\
-\ = max(scores, key=scores.get)\n best_score = scores[best_model]\n \
-\ if best_score_file:\n with open(best_score_file, \"w\", encoding=\"\
-utf-8\") as f:\n json.dump({\"best_model\": best_model, \"best_score\"\
-: best_score}, f, indent=4)\n\n # Rename the best model directory to\
-\ \"candidate_model\" for the next step\n # So we know which model to\
-\ use for the final evaluation\n if os.path.exists(os.path.join(models_folder,\
-\ \"candidate_model\")):\n print(\"candidate_model already exists.\
-\ Skipping renaming\")\n else:\n os.rename(\n os.path.join(models_folder,\
+\ = max(scores, key=scores.get)\n best_score = scores[best_model]\n\n\
+\ with open(best_score_file, \"w\", encoding=\"utf-8\") as f:\n \
+\ json.dump({\"best_model\": best_model, \"best_score\": best_score},\
+\ f, indent=4)\n\n # Rename the best model directory to \"candidate_model\"\
+\ for the next step\n # So we know which model to use for the final evaluation\n\
+\ if os.path.exists(os.path.join(models_folder, \"candidate_model\")):\n\
+\ print(\"candidate_model already exists. Skipping renaming\")\n\
+\ else:\n os.rename(\n os.path.join(models_folder,\
 \ best_model),\n os.path.join(models_folder, \"candidate_model\"\
 ),\n )\n\n return outputs(best_model=best_model, best_score=best_score)\n\
 \n"
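
Because the compiled component now carries a defaultValue for best_score_file, pipeline authors can omit the parameter and still get the JSON at /output/mt_bench_best_data.json. A hypothetical KFP v2 call site is sketched below; the argument values are illustrative, the import path is assumed, and any required arguments not visible in this diff are omitted.

from kfp import dsl

# run_mt_bench_op is assumed to be the component defined in
# eval/mt_bench/components.py and importable from that package.
from eval.mt_bench.components import run_mt_bench_op

@dsl.pipeline(name="mt-bench-eval")
def mt_bench_pipeline():
    # best_score_file is omitted, so the component falls back to the new
    # default path /output/mt_bench_best_data.json.
    run_mt_bench_op(
        max_workers="auto",             # illustrative value
        models_folder="/output/model",  # illustrative value
    )
    # To write the best-score JSON elsewhere, pass the parameter explicitly:
    # run_mt_bench_op(..., best_score_file="/output/custom_best_data.json")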
