update generate_metrics_report_op and consolidate mt_bench reports
Signed-off-by: Michael Clifford <[email protected]>
MichaelClifford committed Jan 7, 2025
1 parent 70fe60c commit 06ea8aa
Showing 3 changed files with 93 additions and 95 deletions.
56 changes: 30 additions & 26 deletions eval/final/components.py
@@ -24,6 +24,7 @@ def run_final_eval_op(
import json
import os
import subprocess
from pathlib import Path

import httpx
import torch
@@ -320,17 +321,18 @@ def find_node_dataset_directories(base_dir: str):
"report_title": "KNOWLEDGE EVALUATION REPORT",
"max_score": "1.0",
"model": candidate_model,
"model_score": round(overall_score, 2),
"trained_model_score": round(overall_score, 2),
"base_model": base_model_dir,
"base_model_score": round(base_overall_score, 2),
"summary": summary,
}

if not os.path.exists(mmlu_branch_output_path):
os.makedirs(mmlu_branch_output_path)
with open(
f"{mmlu_branch_output_path}/mmlu_branch_data.json", "w", encoding="utf-8"
) as f:
mmlu_branch_output_file = (
Path(mmlu_branch_output_path) / "mmlu_branch_data.json"
)
with open(mmlu_branch_output_file, "w", encoding="utf-8") as f:
json.dump(mmlu_branch_data, f, indent=4)

else:
@@ -469,16 +471,19 @@ def find_node_dataset_directories(base_dir: str):
"model": candidate_model,
"judge_model": judge_model_name,
"max_score": "10.0",
"overall_score": overall_score,
"base_overall_score": base_overall_score,
"trained_model_score": overall_score,
"base_model_score": base_overall_score,
"error_rate": error_rate,
"summary": summary,
}

if not os.path.exists(mt_bench_branch_output_path):
os.makedirs(mt_bench_branch_output_path)
mt_bench_branch_data_file = (
Path(mt_bench_branch_output_path) / "mt_bench_branch_data.json"
)
with open(
f"{mt_bench_branch_output_path}/mt_bench_branch_data.json",
mt_bench_branch_data_file,
"w",
encoding="utf-8",
) as f:
@@ -492,24 +497,23 @@ def generate_metrics_report_op(
import ast
import json

with open("/output/mt_bench_data.json", "r") as f:
mt_bench_data = f.read()
mt_bench_data = ast.literal_eval(mt_bench_data)[0]

metrics.log_metric("mt_bench_best_model", mt_bench_data["model"])
metrics.log_metric("mt_bench_best_score", mt_bench_data["overall_score"])
metrics.log_metric("mt_bench_best_model_error_rate", mt_bench_data["error_rate"])

with open("/output/mt_bench_branch/mt_bench_branch_data.json", "r") as f:
mt_bench_branch_data = json.loads(f.read())

metrics.log_metric("mt_bench_branch_score", mt_bench_branch_data["overall_score"])
metrics.log_metric(
"mt_bench_branch_base_score", mt_bench_branch_data["base_overall_score"]
)
reports = {
"mt_bench": "/output/mt_bench_data.json",
"mt_bench_branch": "/output/mt_bench_branch/mt_bench_branch_data.json",
"mmlu_branch": "/output/mmlu_branch/mmlu_branch_data.json",
}

with open("/output/mmlu_branch/mmlu_branch_data.json", "r") as f:
mmlu_branch_data = json.loads(f.read())
for report, file_name in reports.items():
with open(file_name, "r") as f:
report_data = json.load(f)

metrics.log_metric("mmlu_branch_score", mmlu_branch_data["model_score"])
metrics.log_metric("mmlu_branch_base_score", mmlu_branch_data["base_model_score"])
if report == "mt_bench":
metrics.log_metric(f"{report}_best_model", report_data[0]["best_model"])
metrics.log_metric(f"{report}_best_score", report_data[0]["best_score"])
else:
metrics.log_metric(
f"{report}_trained_model_score", report_data["trained_model_score"]
)
metrics.log_metric(
f"{report}_base_model_score", report_data["base_model_score"]
)
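
For reference, a minimal standalone sketch of the consolidated loop in generate_metrics_report_op, run against illustrative sample reports. The directory layout and key names mirror the diff above, but a temporary directory stands in for /output, the scores are made up, and the print-based log_metric replaces KFP's metrics.log_metric; none of those stand-ins are part of this commit.

import json
import tempfile
from pathlib import Path

def log_metric(name, value):
    # stand-in for KFP's metrics.log_metric on the Metrics output artifact
    print(f"{name} = {value}")

tmp = Path(tempfile.mkdtemp())
(tmp / "mt_bench_branch").mkdir()
(tmp / "mmlu_branch").mkdir()

# Illustrative report contents; key names follow the updated components, values are invented.
(tmp / "mt_bench_data.json").write_text(json.dumps([
    {"best_model": "candidate_model", "best_score": 7.2},
    {"model": "candidate_model", "overall_score": 7.2, "error_rate": 0.0},
]))
(tmp / "mt_bench_branch" / "mt_bench_branch_data.json").write_text(
    json.dumps({"trained_model_score": 7.1, "base_model_score": 6.4})
)
(tmp / "mmlu_branch" / "mmlu_branch_data.json").write_text(
    json.dumps({"trained_model_score": 0.61, "base_model_score": 0.55})
)

reports = {
    "mt_bench": tmp / "mt_bench_data.json",
    "mt_bench_branch": tmp / "mt_bench_branch" / "mt_bench_branch_data.json",
    "mmlu_branch": tmp / "mmlu_branch" / "mmlu_branch_data.json",
}

for report, file_name in reports.items():
    with open(file_name, "r") as f:
        report_data = json.load(f)
    if report == "mt_bench":
        # index 0 is the best-model summary prepended by run_mt_bench_op
        log_metric(f"{report}_best_model", report_data[0]["best_model"])
        log_metric(f"{report}_best_score", report_data[0]["best_score"])
    else:
        log_metric(f"{report}_trained_model_score", report_data["trained_model_score"])
        log_metric(f"{report}_base_model_score", report_data["base_model_score"])
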
11 changes: 4 additions & 7 deletions eval/mt_bench/components.py
@@ -16,7 +16,6 @@ def run_mt_bench_op(
max_workers: str,
models_folder: str,
output_path: str = "/output/mt_bench_data.json",
best_score_file: Optional[str] = None,
) -> NamedTuple("outputs", best_model=str, best_score=float):
import json
import os
@@ -187,15 +186,13 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20):
all_mt_bench_data.append(mt_bench_data)
scores[model_path] = overall_score

with open(output_path, "w", encoding="utf-8") as f:
json.dump(all_mt_bench_data, f, indent=4)

outputs = NamedTuple("outputs", best_model=str, best_score=float)
best_model = max(scores, key=scores.get)
best_score = scores[best_model]
if best_score_file:
with open(best_score_file, "w", encoding="utf-8") as f:
json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4)
all_mt_bench_data.insert(0, {"best_model": best_model, "best_score": best_score})

with open(output_path, "w", encoding="utf-8") as f:
json.dump(all_mt_bench_data, f, indent=4)

# Rename the best model directory to "candidate_model" for the next step
# So we know which model to use for the final evaluation
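
With best_score_file removed, downstream consumers read the single consolidated file instead. A short sketch, assuming /output/mt_bench_data.json has already been written by run_mt_bench_op as shown above: index 0 holds the prepended best-model summary, and the remaining entries are the per-model results.

import json

with open("/output/mt_bench_data.json", "r", encoding="utf-8") as f:
    all_mt_bench_data = json.load(f)

best = all_mt_bench_data[0]        # {"best_model": ..., "best_score": ...}
per_model = all_mt_bench_data[1:]  # one dict per evaluated model, as before

print(best["best_model"], best["best_score"])
print(f"{len(per_model)} model result(s) in the consolidated report")
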
121 changes: 59 additions & 62 deletions pipeline.yaml
@@ -494,9 +494,6 @@ components:
executorLabel: exec-run-mt-bench-op
inputDefinitions:
parameters:
best_score_file:
isOptional: true
parameterType: STRING
max_workers:
parameterType: STRING
merge_system_user_message:
@@ -686,20 +683,18 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef generate_metrics_report_op(\n metrics: Output[Metrics],\n\
):\n import ast\n import json\n\n with open(\"/output/mt_bench_data.json\"\
, \"r\") as f:\n mt_bench_data = f.read()\n mt_bench_data = ast.literal_eval(mt_bench_data)[0]\n\
\n metrics.log_metric(\"mt_bench_best_model\", mt_bench_data[\"model\"\
])\n metrics.log_metric(\"mt_bench_best_score\", mt_bench_data[\"overall_score\"\
])\n metrics.log_metric(\"mt_bench_best_model_error_rate\", mt_bench_data[\"\
error_rate\"])\n\n with open(\"/output/mt_bench_branch/mt_bench_branch_data.json\"\
, \"r\") as f:\n mt_bench_branch_data = json.loads(f.read())\n\n\
\ metrics.log_metric(\"mt_bench_branch_score\", mt_bench_branch_data[\"\
overall_score\"])\n metrics.log_metric(\n \"mt_bench_branch_base_score\"\
, mt_bench_branch_data[\"base_overall_score\"]\n )\n\n with open(\"\
/output/mmlu_branch/mmlu_branch_data.json\", \"r\") as f:\n mmlu_branch_data\
\ = json.loads(f.read())\n\n metrics.log_metric(\"mmlu_branch_score\"\
, mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"mmlu_branch_base_score\"\
, mmlu_branch_data[\"base_model_score\"])\n\n"
):\n import ast\n import json\n\n reports = {\n \"mt_bench\"\
: \"/output/mt_bench_data.json\",\n \"mt_bench_branch\": \"/output/mt_bench_branch/mt_bench_branch_data.json\"\
,\n \"mmlu_branch\": \"/output/mmlu_branch/mmlu_branch_data.json\"\
,\n }\n\n for report, file_name in reports.items():\n with\
\ open(file_name, \"r\") as f:\n report_data = json.load(f)\n\
\n if report == \"mt_bench\":\n metrics.log_metric(f\"\
{report}_best_model\", report_data[0][\"best_model\"])\n metrics.log_metric(f\"\
{report}_best_score\", report_data[0][\"best_score\"])\n else:\n\
\ metrics.log_metric(\n f\"{report}_trained_model_score\"\
, report_data[\"trained_model_score\"]\n )\n metrics.log_metric(\n\
\ f\"{report}_base_model_score\", report_data[\"base_model_score\"\
]\n )\n\n"
image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111
exec-git-clone-op:
container:
@@ -1188,13 +1183,14 @@ deploymentSpec:
,\n sdg_path: str = \"/input/sdg\",\n mmlu_branch_output_path: str\
\ = \"/output/mmlu_branch\",\n mt_bench_branch_output_path: str = \"\
/output/mt_bench_branch\",\n):\n import json\n import os\n import\
\ subprocess\n\n import httpx\n import torch\n from instructlab.eval.mmlu\
\ import MMLUBranchEvaluator\n from instructlab.eval.mt_bench import\
\ MTBenchBranchEvaluator\n from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores,\
\ sort_score\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n\
\ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint =\
\ os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\"\
)\n use_tls = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\
\ subprocess\n from pathlib import Path\n\n import httpx\n import\
\ torch\n from instructlab.eval.mmlu import MMLUBranchEvaluator\n \
\ from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n from\
\ instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score\n\
\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\
\ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
)\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls\
\ = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\
\ > 0\n )\n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\
\ if use_tls else None\n\n print(\"Starting Final Eval...\")\n\n def\
\ launch_vllm(\n model_path: str, gpu_count: int, retries: int =\
@@ -1335,14 +1331,15 @@ deploymentSpec:
\ regressions,\n no_changes,\n )\n\n \
\ mmlu_branch_data = {\n \"report_title\": \"KNOWLEDGE EVALUATION\
\ REPORT\",\n \"max_score\": \"1.0\",\n \"model\"\
: candidate_model,\n \"model_score\": round(overall_score, 2),\n\
\ \"base_model\": base_model_dir,\n \"base_model_score\"\
: candidate_model,\n \"trained_model_score\": round(overall_score,\
\ 2),\n \"base_model\": base_model_dir,\n \"base_model_score\"\
: round(base_overall_score, 2),\n \"summary\": summary,\n \
\ }\n\n if not os.path.exists(mmlu_branch_output_path):\n \
\ os.makedirs(mmlu_branch_output_path)\n with open(\n \
\ f\"{mmlu_branch_output_path}/mmlu_branch_data.json\", \"w\", encoding=\"\
utf-8\"\n ) as f:\n json.dump(mmlu_branch_data, f, indent=4)\n\
\n else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\
\ os.makedirs(mmlu_branch_output_path)\n mmlu_branch_output_file\
\ = (\n Path(mmlu_branch_output_path) / \"mmlu_branch_data.json\"\
\n )\n with open(mmlu_branch_output_file, \"w\", encoding=\"\
utf-8\") as f:\n json.dump(mmlu_branch_data, f, indent=4)\n\n\
\ else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\
\ evaluation.\")\n\n # MT_BENCH_BRANCH\n\n print(\"Starting MT_BENCH_BRANCH\
\ ...\")\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n \
\ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"\
@@ -1406,12 +1403,13 @@ deploymentSpec:
\ new_qnas,\n )\n\n mt_bench_branch_data = {\n \"report_title\"\
: \"SKILLS EVALUATION REPORT\",\n \"model\": candidate_model,\n \
\ \"judge_model\": judge_model_name,\n \"max_score\": \"10.0\"\
,\n \"overall_score\": overall_score,\n \"base_overall_score\"\
,\n \"trained_model_score\": overall_score,\n \"base_model_score\"\
: base_overall_score,\n \"error_rate\": error_rate,\n \"summary\"\
: summary,\n }\n\n if not os.path.exists(mt_bench_branch_output_path):\n\
\ os.makedirs(mt_bench_branch_output_path)\n with open(\n \
\ f\"{mt_bench_branch_output_path}/mt_bench_branch_data.json\",\n \
\ \"w\",\n encoding=\"utf-8\",\n ) as f:\n json.dump(mt_bench_branch_data,\
\ os.makedirs(mt_bench_branch_output_path)\n mt_bench_branch_data_file\
\ = (\n Path(mt_bench_branch_output_path) / \"mt_bench_branch_data.json\"\
\n )\n with open(\n mt_bench_branch_data_file,\n \"\
w\",\n encoding=\"utf-8\",\n ) as f:\n json.dump(mt_bench_branch_data,\
\ f, indent=4)\n\n"
env:
- name: HOME
@@ -1449,23 +1447,23 @@ deploymentSpec:
\ - 'auto'\n # with 'auto', number of gpus allocated for serving is\
\ calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
\ max_workers: str,\n models_folder: str,\n output_path: str =\
\ \"/output/mt_bench_data.json\",\n best_score_file: Optional[str] =\
\ None,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n\
\ import json\n import os\n import subprocess\n\n import httpx\n\
\ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\
\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\
\ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
)\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls\
\ = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\
\ > 0\n )\n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\
\ if use_tls else None\n\n def launch_vllm(\n model_path: str,\
\ gpu_count: int, retries: int = 120, delay: int = 10\n ) -> tuple:\n\
\ import subprocess\n import sys\n import time\n\n\
\ import requests\n from instructlab.model.backends.common\
\ import free_tcp_ipv4_port\n\n free_port = free_tcp_ipv4_port(\"\
127.0.0.1\")\n port = str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\
\n\n command = [\n sys.executable,\n \"-m\"\
,\n \"vllm.entrypoints.openai.api_server\",\n \"--port\"\
\ \"/output/mt_bench_data.json\",\n) -> NamedTuple(\"outputs\", best_model=str,\
\ best_score=float):\n import json\n import os\n import subprocess\n\
\n import httpx\n import torch\n from instructlab.eval.mt_bench\
\ import MTBenchEvaluator\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\"\
, \"\")\n judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint\
\ = os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path = os.getenv(\"\
JUDGE_CA_CERT_PATH\")\n use_tls = os.path.exists(judge_ca_cert_path)\
\ and (\n os.path.getsize(judge_ca_cert_path) > 0\n )\n judge_http_client\
\ = httpx.Client(verify=judge_ca_cert_path) if use_tls else None\n\n \
\ def launch_vllm(\n model_path: str, gpu_count: int, retries: int\
\ = 120, delay: int = 10\n ) -> tuple:\n import subprocess\n \
\ import sys\n import time\n\n import requests\n \
\ from instructlab.model.backends.common import free_tcp_ipv4_port\n\
\n free_port = free_tcp_ipv4_port(\"127.0.0.1\")\n port =\
\ str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\n\
\n command = [\n sys.executable,\n \"-m\",\n\
\ \"vllm.entrypoints.openai.api_server\",\n \"--port\"\
,\n port,\n \"--model\",\n model_path,\n\
\ ]\n if gpu_count > 0:\n command += [\n \
\ \"--tensor-parallel-size\",\n str(gpu_count),\n\
@@ -1529,17 +1527,16 @@ deploymentSpec:
\ \"overall_score\": overall_score,\n \"turn_scores\"\
: turn_scores,\n \"qa_scores\": qa_pairs,\n \"error_rate\"\
: error_rate,\n }\n\n all_mt_bench_data.append(mt_bench_data)\n\
\ scores[model_path] = overall_score\n\n with open(output_path,\
\ \"w\", encoding=\"utf-8\") as f:\n json.dump(all_mt_bench_data,\
\ f, indent=4)\n\n outputs = NamedTuple(\"outputs\", best_model=str,\
\ best_score=float)\n best_model = max(scores, key=scores.get)\n best_score\
\ = scores[best_model]\n if best_score_file:\n with open(best_score_file,\
\ \"w\", encoding=\"utf-8\") as f:\n json.dump({\"best_model\"\
: best_model, \"best_score\": best_score}, f, indent=4)\n\n # Rename\
\ the best model directory to \"candidate_model\" for the next step\n \
\ # So we know which model to use for the final evaluation\n if os.path.exists(os.path.join(models_folder,\
\ \"candidate_model\")):\n print(\"candidate_model already exists.\
\ Skipping renaming\")\n else:\n os.rename(\n os.path.join(models_folder,\
\ scores[model_path] = overall_score\n\n outputs = NamedTuple(\"\
outputs\", best_model=str, best_score=float)\n best_model = max(scores,\
\ key=scores.get)\n best_score = scores[best_model]\n all_mt_bench_data.insert(0,\
\ {\"best_model\": best_model, \"best_score\": best_score})\n\n with\
\ open(output_path, \"w\", encoding=\"utf-8\") as f:\n json.dump(all_mt_bench_data,\
\ f, indent=4)\n\n # Rename the best model directory to \"candidate_model\"\
\ for the next step\n # So we know which model to use for the final evaluation\n\
\ if os.path.exists(os.path.join(models_folder, \"candidate_model\")):\n\
\ print(\"candidate_model already exists. Skipping renaming\")\n\
\ else:\n os.rename(\n os.path.join(models_folder,\
\ best_model),\n os.path.join(models_folder, \"candidate_model\"\
),\n )\n\n return outputs(best_model=best_model, best_score=best_score)\n\
\n"
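The pipeline.yaml hunks above are the KFP-compiled copies of the same component functions, so the file has to be regenerated whenever eval/final/components.py or eval/mt_bench/components.py changes. A minimal sketch of that regeneration, assuming the pipeline is exposed as a function named pipeline in a top-level pipeline.py (both names are illustrative, not confirmed by this commit):

from kfp import compiler

from pipeline import pipeline  # hypothetical module and function name

compiler.Compiler().compile(
    pipeline_func=pipeline,
    package_path="pipeline.yaml",
)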
