From 70fe60cf50ded609b43ea2e9c7b18e9f8c946163 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Tue, 17 Dec 2024 15:26:08 -0500 Subject: [PATCH 1/3] add additional step to pipeline to generate a metrics report Signed-off-by: Michael Clifford --- eval/final/__init__.py | 4 +- eval/final/components.py | 53 +++++++++-- pipeline.py | 22 +++-- pipeline.yaml | 186 +++++++++++++++++++++++++++------------ 4 files changed, 196 insertions(+), 69 deletions(-) diff --git a/eval/final/__init__.py b/eval/final/__init__.py index 62053fe8..8edaf62f 100644 --- a/eval/final/__init__.py +++ b/eval/final/__init__.py @@ -1,5 +1,5 @@ -from .components import run_final_eval_op +from .components import generate_metrics_report_op, run_final_eval_op # from . import faked -__all__ = ["run_final_eval_op"] +__all__ = ["run_final_eval_op", "generate_metrics_report_op"] diff --git a/eval/final/components.py b/eval/final/components.py index 469d222b..99f5a9af 100644 --- a/eval/final/components.py +++ b/eval/final/components.py @@ -1,15 +1,13 @@ # type: ignore # pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error -from kfp.dsl import Artifact, Output, component +from kfp.dsl import Artifact, Input, Metrics, Output, component -from utils.consts import RHELAI_IMAGE +from utils.consts import PYTHON_IMAGE, RHELAI_IMAGE @component(base_image=RHELAI_IMAGE, install_kfp_package=False) def run_final_eval_op( - mmlu_branch_output: Output[Artifact], - mt_bench_branch_output: Output[Artifact], base_model_dir: str, base_branch: str, candidate_branch: str, @@ -20,6 +18,8 @@ def run_final_eval_op( candidate_model: str = None, taxonomy_path: str = "/input/taxonomy", sdg_path: str = "/input/sdg", + mmlu_branch_output_path: str = "/output/mmlu_branch", + mt_bench_branch_output_path: str = "/output/mt_bench_branch", ): import json import os @@ -326,8 +326,13 @@ def find_node_dataset_directories(base_dir: str): "summary": summary, } - with open(mmlu_branch_output.path, "w", encoding="utf-8") as f: + if not os.path.exists(mmlu_branch_output_path): + os.makedirs(mmlu_branch_output_path) + with open( + f"{mmlu_branch_output_path}/mmlu_branch_data.json", "w", encoding="utf-8" + ) as f: json.dump(mmlu_branch_data, f, indent=4) + else: print("No MMLU tasks directories found, skipping MMLU_branch evaluation.") @@ -470,5 +475,41 @@ def find_node_dataset_directories(base_dir: str): "summary": summary, } - with open(mt_bench_branch_output.path, "w", encoding="utf-8") as f: + if not os.path.exists(mt_bench_branch_output_path): + os.makedirs(mt_bench_branch_output_path) + with open( + f"{mt_bench_branch_output_path}/mt_bench_branch_data.json", + "w", + encoding="utf-8", + ) as f: json.dump(mt_bench_branch_data, f, indent=4) + + +@component(base_image=PYTHON_IMAGE, install_kfp_package=False) +def generate_metrics_report_op( + metrics: Output[Metrics], +): + import ast + import json + + with open("/output/mt_bench_data.json", "r") as f: + mt_bench_data = f.read() + mt_bench_data = ast.literal_eval(mt_bench_data)[0] + + metrics.log_metric("mt_bench_best_model", mt_bench_data["model"]) + metrics.log_metric("mt_bench_best_score", mt_bench_data["overall_score"]) + metrics.log_metric("mt_bench_best_model_error_rate", mt_bench_data["error_rate"]) + + with open("/output/mt_bench_branch/mt_bench_branch_data.json", "r") as f: + mt_bench_branch_data = json.loads(f.read()) + + metrics.log_metric("mt_bench_branch_score", mt_bench_branch_data["overall_score"]) + metrics.log_metric( + "mt_bench_branch_base_score", mt_bench_branch_data["base_overall_score"] + ) + + with open("/output/mmlu_branch/mmlu_branch_data.json", "r") as f: + mmlu_branch_data = json.loads(f.read()) + + metrics.log_metric("mmlu_branch_score", mmlu_branch_data["model_score"]) + metrics.log_metric("mmlu_branch_base_score", mmlu_branch_data["base_model_score"]) diff --git a/pipeline.py b/pipeline.py index 00d3df24..2a6643e7 100644 --- a/pipeline.py +++ b/pipeline.py @@ -83,7 +83,7 @@ def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): ) # Imports for evaluation - from eval.final import run_final_eval_op + from eval.final import generate_metrics_report_op, run_final_eval_op from eval.mt_bench import run_mt_bench_op @dsl.pipeline( @@ -452,17 +452,27 @@ def pipeline( mount_path="/output", ) - output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output) - output_pvc_delete_task.after( - output_model_task, output_mt_bench_task, final_eval_task - ) - sdg_pvc_delete_task = DeletePVC(pvc_name=sdg_input_pvc_task.output) sdg_pvc_delete_task.after(final_eval_task) model_pvc_delete_task = DeletePVC(pvc_name=model_pvc_task.output) model_pvc_delete_task.after(final_eval_task) + generate_metrics_report_task = generate_metrics_report_op() + generate_metrics_report_task.after(output_mt_bench_task, final_eval_task) + generate_metrics_report_task.set_caching_options(False) + mount_pvc( + task=generate_metrics_report_task, + pvc_name=output_pvc_task.output, + mount_path="/output", + ) + + output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output) + output_pvc_delete_task.after( + output_model_task, + generate_metrics_report_task, + ) + return return pipeline diff --git a/pipeline.yaml b/pipeline.yaml index ef5d586e..29e70b64 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -30,6 +30,8 @@ # train_num_warmup_steps_phase_2: int [Default: 1000.0] # train_save_samples: int [Default: 250000.0] # train_seed: int [Default: 42.0] +# Outputs: +# generate-metrics-report-op-metrics: system.Metrics components: comp-createpvc: executorLabel: exec-createpvc @@ -266,6 +268,14 @@ components: description: Name of the PVC to delete. Supports passing a runtime-generated name, such as a name provided by ``kubernetes.CreatePvcOp().outputs['name']``. parameterType: STRING + comp-generate-metrics-report-op: + executorLabel: exec-generate-metrics-report-op + outputDefinitions: + artifacts: + metrics: + artifactType: + schemaTitle: system.Metrics + schemaVersion: 0.0.1 comp-git-clone-op: executorLabel: exec-git-clone-op inputDefinitions: @@ -464,6 +474,14 @@ components: parameterType: STRING merge_system_user_message: parameterType: BOOLEAN + mmlu_branch_output_path: + defaultValue: /output/mmlu_branch + isOptional: true + parameterType: STRING + mt_bench_branch_output_path: + defaultValue: /output/mt_bench_branch + isOptional: true + parameterType: STRING sdg_path: defaultValue: /input/sdg isOptional: true @@ -472,16 +490,6 @@ components: defaultValue: /input/taxonomy isOptional: true parameterType: STRING - outputDefinitions: - artifacts: - mmlu_branch_output: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - mt_bench_branch_output: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 comp-run-mt-bench-op: executorLabel: exec-run-mt-bench-op inputDefinitions: @@ -658,6 +666,41 @@ deploymentSpec: exec-deletepvc-3: container: image: argostub/deletepvc + exec-generate-metrics-report-op: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - generate_metrics_report_op + command: + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef generate_metrics_report_op(\n metrics: Output[Metrics],\n\ + ):\n import ast\n import json\n\n with open(\"/output/mt_bench_data.json\"\ + , \"r\") as f:\n mt_bench_data = f.read()\n mt_bench_data = ast.literal_eval(mt_bench_data)[0]\n\ + \n metrics.log_metric(\"mt_bench_best_model\", mt_bench_data[\"model\"\ + ])\n metrics.log_metric(\"mt_bench_best_score\", mt_bench_data[\"overall_score\"\ + ])\n metrics.log_metric(\"mt_bench_best_model_error_rate\", mt_bench_data[\"\ + error_rate\"])\n\n with open(\"/output/mt_bench_branch/mt_bench_branch_data.json\"\ + , \"r\") as f:\n mt_bench_branch_data = json.loads(f.read())\n\n\ + \ metrics.log_metric(\"mt_bench_branch_score\", mt_bench_branch_data[\"\ + overall_score\"])\n metrics.log_metric(\n \"mt_bench_branch_base_score\"\ + , mt_bench_branch_data[\"base_overall_score\"]\n )\n\n with open(\"\ + /output/mmlu_branch/mmlu_branch_data.json\", \"r\") as f:\n mmlu_branch_data\ + \ = json.loads(f.read())\n\n metrics.log_metric(\"mmlu_branch_score\"\ + , mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"mmlu_branch_base_score\"\ + , mmlu_branch_data[\"base_model_score\"])\n\n" + image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111 exec-git-clone-op: container: args: @@ -1138,37 +1181,38 @@ deploymentSpec: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef run_final_eval_op(\n mmlu_branch_output: Output[Artifact],\n\ - \ mt_bench_branch_output: Output[Artifact],\n base_model_dir: str,\n\ - \ base_branch: str,\n candidate_branch: str,\n max_workers: str,\n\ - \ few_shots: int,\n batch_size: str,\n merge_system_user_message:\ - \ bool,\n candidate_model: str = None,\n taxonomy_path: str = \"/input/taxonomy\"\ - ,\n sdg_path: str = \"/input/sdg\",\n):\n import json\n import\ - \ os\n import subprocess\n\n import httpx\n import torch\n from\ - \ instructlab.eval.mmlu import MMLUBranchEvaluator\n from instructlab.eval.mt_bench\ - \ import MTBenchBranchEvaluator\n from instructlab.model.evaluate import\ - \ qa_pairs_to_qna_to_avg_scores, sort_score\n\n judge_api_key = os.getenv(\"\ - JUDGE_API_KEY\", \"\")\n judge_model_name = os.getenv(\"JUDGE_NAME\"\ - )\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path\ - \ = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls = os.path.exists(judge_ca_cert_path)\ - \ and (\n os.path.getsize(judge_ca_cert_path) > 0\n )\n judge_http_client\ - \ = httpx.Client(verify=judge_ca_cert_path) if use_tls else None\n\n \ - \ print(\"Starting Final Eval...\")\n\n def launch_vllm(\n model_path:\ - \ str, gpu_count: int, retries: int = 120, delay: int = 10\n ) -> tuple:\n\ - \ import subprocess\n import sys\n import time\n\n\ - \ import requests\n from instructlab.model.backends.common\ - \ import free_tcp_ipv4_port\n\n free_port = free_tcp_ipv4_port(\"\ - 127.0.0.1\")\n port = str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\ - \n\n command = [\n sys.executable,\n \"-m\"\ - ,\n \"vllm.entrypoints.openai.api_server\",\n \"--port\"\ - ,\n port,\n \"--model\",\n model_path,\n\ - \ ]\n if gpu_count > 0:\n command += [\n \ - \ \"--tensor-parallel-size\",\n str(gpu_count),\n\ - \ ]\n\n process = subprocess.Popen(args=command)\n\n \ - \ print(f\"Waiting for vLLM server to start at {vllm_server}...\"\ - )\n\n for attempt in range(retries):\n try:\n \ - \ response = requests.get(f\"{vllm_server}/models\", timeout=10)\n\ - \ if response.status_code == 200:\n print(f\"\ + \ *\n\ndef run_final_eval_op(\n base_model_dir: str,\n base_branch:\ + \ str,\n candidate_branch: str,\n max_workers: str,\n few_shots:\ + \ int,\n batch_size: str,\n merge_system_user_message: bool,\n \ + \ candidate_model: str = None,\n taxonomy_path: str = \"/input/taxonomy\"\ + ,\n sdg_path: str = \"/input/sdg\",\n mmlu_branch_output_path: str\ + \ = \"/output/mmlu_branch\",\n mt_bench_branch_output_path: str = \"\ + /output/mt_bench_branch\",\n):\n import json\n import os\n import\ + \ subprocess\n\n import httpx\n import torch\n from instructlab.eval.mmlu\ + \ import MMLUBranchEvaluator\n from instructlab.eval.mt_bench import\ + \ MTBenchBranchEvaluator\n from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores,\ + \ sort_score\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n\ + \ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint =\ + \ os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\"\ + )\n use_tls = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\ + \ > 0\n )\n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\ + \ if use_tls else None\n\n print(\"Starting Final Eval...\")\n\n def\ + \ launch_vllm(\n model_path: str, gpu_count: int, retries: int =\ + \ 120, delay: int = 10\n ) -> tuple:\n import subprocess\n \ + \ import sys\n import time\n\n import requests\n \ + \ from instructlab.model.backends.common import free_tcp_ipv4_port\n\n\ + \ free_port = free_tcp_ipv4_port(\"127.0.0.1\")\n port = str(free_port)\n\ + \ vllm_server = f\"http://127.0.0.1:{port}/v1\"\n\n command\ + \ = [\n sys.executable,\n \"-m\",\n \"\ + vllm.entrypoints.openai.api_server\",\n \"--port\",\n \ + \ port,\n \"--model\",\n model_path,\n \ + \ ]\n if gpu_count > 0:\n command += [\n \ + \ \"--tensor-parallel-size\",\n str(gpu_count),\n \ + \ ]\n\n process = subprocess.Popen(args=command)\n\n \ + \ print(f\"Waiting for vLLM server to start at {vllm_server}...\")\n\n\ + \ for attempt in range(retries):\n try:\n \ + \ response = requests.get(f\"{vllm_server}/models\", timeout=10)\n \ + \ if response.status_code == 200:\n print(f\"\ vLLM server is up and running at {vllm_server}.\")\n \ \ return process, vllm_server\n except requests.ConnectionError:\n\ \ pass\n\n print(\n f\"Server not\ @@ -1294,9 +1338,11 @@ deploymentSpec: : candidate_model,\n \"model_score\": round(overall_score, 2),\n\ \ \"base_model\": base_model_dir,\n \"base_model_score\"\ : round(base_overall_score, 2),\n \"summary\": summary,\n \ - \ }\n\n with open(mmlu_branch_output.path, \"w\", encoding=\"\ - utf-8\") as f:\n json.dump(mmlu_branch_data, f, indent=4)\n \ - \ else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\ + \ }\n\n if not os.path.exists(mmlu_branch_output_path):\n \ + \ os.makedirs(mmlu_branch_output_path)\n with open(\n \ + \ f\"{mmlu_branch_output_path}/mmlu_branch_data.json\", \"w\", encoding=\"\ + utf-8\"\n ) as f:\n json.dump(mmlu_branch_data, f, indent=4)\n\ + \n else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\ \ evaluation.\")\n\n # MT_BENCH_BRANCH\n\n print(\"Starting MT_BENCH_BRANCH\ \ ...\")\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n \ \ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"\ @@ -1362,8 +1408,11 @@ deploymentSpec: \ \"judge_model\": judge_model_name,\n \"max_score\": \"10.0\"\ ,\n \"overall_score\": overall_score,\n \"base_overall_score\"\ : base_overall_score,\n \"error_rate\": error_rate,\n \"summary\"\ - : summary,\n }\n\n with open(mt_bench_branch_output.path, \"w\", encoding=\"\ - utf-8\") as f:\n json.dump(mt_bench_branch_data, f, indent=4)\n\n" + : summary,\n }\n\n if not os.path.exists(mt_bench_branch_output_path):\n\ + \ os.makedirs(mt_bench_branch_output_path)\n with open(\n \ + \ f\"{mt_bench_branch_output_path}/mt_bench_branch_data.json\",\n \ + \ \"w\",\n encoding=\"utf-8\",\n ) as f:\n json.dump(mt_bench_branch_data,\ + \ f, indent=4)\n\n" env: - name: HOME value: /tmp @@ -1646,6 +1695,12 @@ pipelineInfo: name: instructlab root: dag: + outputs: + artifacts: + generate-metrics-report-op-metrics: + artifactSelectors: + - outputArtifactKey: metrics + producerSubtask: generate-metrics-report-op tasks: createpvc: cachingOptions: @@ -1731,16 +1786,14 @@ root: componentRef: name: comp-deletepvc dependentTasks: - - createpvc-3 - - pvc-to-model-op - - pvc-to-mt-bench-op + - createpvc - run-final-eval-op inputs: parameters: pvc_name: taskOutputParameter: outputParameterKey: name - producerTask: createpvc-3 + producerTask: createpvc taskInfo: name: deletepvc deletepvc-2: @@ -1749,14 +1802,14 @@ root: componentRef: name: comp-deletepvc-2 dependentTasks: - - createpvc + - createpvc-2 - run-final-eval-op inputs: parameters: pvc_name: taskOutputParameter: outputParameterKey: name - producerTask: createpvc + producerTask: createpvc-2 taskInfo: name: deletepvc-2 deletepvc-3: @@ -1765,16 +1818,27 @@ root: componentRef: name: comp-deletepvc-3 dependentTasks: - - createpvc-2 - - run-final-eval-op + - createpvc-3 + - generate-metrics-report-op + - pvc-to-model-op inputs: parameters: pvc_name: taskOutputParameter: outputParameterKey: name - producerTask: createpvc-2 + producerTask: createpvc-3 taskInfo: name: deletepvc-3 + generate-metrics-report-op: + cachingOptions: {} + componentRef: + name: comp-generate-metrics-report-op + dependentTasks: + - createpvc-3 + - pvc-to-mt-bench-op + - run-final-eval-op + taskInfo: + name: generate-metrics-report-op git-clone-op: cachingOptions: {} componentRef: @@ -2227,6 +2291,12 @@ root: description: Training parameter. Random seed for initializing training. isOptional: true parameterType: NUMBER_INTEGER + outputDefinitions: + artifacts: + generate-metrics-report-op-metrics: + artifactType: + schemaTitle: system.Metrics + schemaVersion: 0.0.1 schemaVersion: 2.1.0 sdkVersion: kfp-2.9.0 --- @@ -2244,6 +2314,12 @@ platforms: taskOutputParameter: outputParameterKey: name producerTask: createpvc + exec-generate-metrics-report-op: + pvcMount: + - mountPath: /output + taskOutputParameter: + outputParameterKey: name + producerTask: createpvc-3 exec-git-clone-op: pvcMount: - mountPath: /data From f829641a4da2b0caeb9878b4c027c086a88679b8 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Tue, 7 Jan 2025 13:40:09 -0500 Subject: [PATCH 2/3] update generate_metrics_report_op and consolidate mt_bench reports Signed-off-by: Michael Clifford --- eval/final/components.py | 58 ++++++++--------- eval/mt_bench/components.py | 15 ++--- pipeline.yaml | 122 ++++++++++++++++++------------------ 3 files changed, 98 insertions(+), 97 deletions(-) diff --git a/eval/final/components.py b/eval/final/components.py index 99f5a9af..6f02a3df 100644 --- a/eval/final/components.py +++ b/eval/final/components.py @@ -24,6 +24,7 @@ def run_final_eval_op( import json import os import subprocess + from pathlib import Path import httpx import torch @@ -320,7 +321,7 @@ def find_node_dataset_directories(base_dir: str): "report_title": "KNOWLEDGE EVALUATION REPORT", "max_score": "1.0", "model": candidate_model, - "model_score": round(overall_score, 2), + "trained_model_score": round(overall_score, 2), "base_model": base_model_dir, "base_model_score": round(base_overall_score, 2), "summary": summary, @@ -328,11 +329,11 @@ def find_node_dataset_directories(base_dir: str): if not os.path.exists(mmlu_branch_output_path): os.makedirs(mmlu_branch_output_path) - with open( - f"{mmlu_branch_output_path}/mmlu_branch_data.json", "w", encoding="utf-8" - ) as f: + mmlu_branch_output_file = ( + Path(mmlu_branch_output_path) / "mmlu_branch_data.json" + ) + with open(mmlu_branch_output_file, "w", encoding="utf-8") as f: json.dump(mmlu_branch_data, f, indent=4) - else: print("No MMLU tasks directories found, skipping MMLU_branch evaluation.") @@ -469,16 +470,19 @@ def find_node_dataset_directories(base_dir: str): "model": candidate_model, "judge_model": judge_model_name, "max_score": "10.0", - "overall_score": overall_score, - "base_overall_score": base_overall_score, + "trained_model_score": overall_score, + "base_model_score": base_overall_score, "error_rate": error_rate, "summary": summary, } if not os.path.exists(mt_bench_branch_output_path): os.makedirs(mt_bench_branch_output_path) + mt_bench_branch_data_file = ( + Path(mt_bench_branch_output_path) / "mt_bench_branch_data.json" + ) with open( - f"{mt_bench_branch_output_path}/mt_bench_branch_data.json", + mt_bench_branch_data_file, "w", encoding="utf-8", ) as f: @@ -489,27 +493,25 @@ def find_node_dataset_directories(base_dir: str): def generate_metrics_report_op( metrics: Output[Metrics], ): - import ast import json - with open("/output/mt_bench_data.json", "r") as f: - mt_bench_data = f.read() - mt_bench_data = ast.literal_eval(mt_bench_data)[0] - - metrics.log_metric("mt_bench_best_model", mt_bench_data["model"]) - metrics.log_metric("mt_bench_best_score", mt_bench_data["overall_score"]) - metrics.log_metric("mt_bench_best_model_error_rate", mt_bench_data["error_rate"]) - - with open("/output/mt_bench_branch/mt_bench_branch_data.json", "r") as f: - mt_bench_branch_data = json.loads(f.read()) - - metrics.log_metric("mt_bench_branch_score", mt_bench_branch_data["overall_score"]) - metrics.log_metric( - "mt_bench_branch_base_score", mt_bench_branch_data["base_overall_score"] - ) + reports = { + "mt_bench": "/output/mt_bench_data.json", + "mt_bench_branch": "/output/mt_bench_branch/mt_bench_branch_data.json", + "mmlu_branch": "/output/mmlu_branch/mmlu_branch_data.json", + } - with open("/output/mmlu_branch/mmlu_branch_data.json", "r") as f: - mmlu_branch_data = json.loads(f.read()) + for report, file_name in reports.items(): + with open(file_name, "r", encoding="utf-8") as f: + report_data = json.load(f) - metrics.log_metric("mmlu_branch_score", mmlu_branch_data["model_score"]) - metrics.log_metric("mmlu_branch_base_score", mmlu_branch_data["base_model_score"]) + if report == "mt_bench": + metrics.log_metric(f"{report}_best_model", report_data["best_model"]) + metrics.log_metric(f"{report}_best_score", report_data["best_score"]) + else: + metrics.log_metric( + f"{report}_trained_model_score", report_data["trained_model_score"] + ) + metrics.log_metric( + f"{report}_base_model_score", report_data["base_model_score"] + ) diff --git a/eval/mt_bench/components.py b/eval/mt_bench/components.py index 5b1800a2..34723a91 100644 --- a/eval/mt_bench/components.py +++ b/eval/mt_bench/components.py @@ -16,7 +16,6 @@ def run_mt_bench_op( max_workers: str, models_folder: str, output_path: str = "/output/mt_bench_data.json", - best_score_file: Optional[str] = None, ) -> NamedTuple("outputs", best_model=str, best_score=float): import json import os @@ -187,15 +186,17 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20): all_mt_bench_data.append(mt_bench_data) scores[model_path] = overall_score - with open(output_path, "w", encoding="utf-8") as f: - json.dump(all_mt_bench_data, f, indent=4) - outputs = NamedTuple("outputs", best_model=str, best_score=float) best_model = max(scores, key=scores.get) best_score = scores[best_model] - if best_score_file: - with open(best_score_file, "w", encoding="utf-8") as f: - json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4) + mt_bench_report = { + "best_model": best_model, + "best_score": best_score, + "reports": all_mt_bench_data, + } + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(mt_bench_report, f, indent=4) # Rename the best model directory to "candidate_model" for the next step # So we know which model to use for the final evaluation diff --git a/pipeline.yaml b/pipeline.yaml index 29e70b64..ac5bcaa3 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -494,9 +494,6 @@ components: executorLabel: exec-run-mt-bench-op inputDefinitions: parameters: - best_score_file: - isOptional: true - parameterType: STRING max_workers: parameterType: STRING merge_system_user_message: @@ -686,20 +683,18 @@ deploymentSpec: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ \ *\n\ndef generate_metrics_report_op(\n metrics: Output[Metrics],\n\ - ):\n import ast\n import json\n\n with open(\"/output/mt_bench_data.json\"\ - , \"r\") as f:\n mt_bench_data = f.read()\n mt_bench_data = ast.literal_eval(mt_bench_data)[0]\n\ - \n metrics.log_metric(\"mt_bench_best_model\", mt_bench_data[\"model\"\ - ])\n metrics.log_metric(\"mt_bench_best_score\", mt_bench_data[\"overall_score\"\ - ])\n metrics.log_metric(\"mt_bench_best_model_error_rate\", mt_bench_data[\"\ - error_rate\"])\n\n with open(\"/output/mt_bench_branch/mt_bench_branch_data.json\"\ - , \"r\") as f:\n mt_bench_branch_data = json.loads(f.read())\n\n\ - \ metrics.log_metric(\"mt_bench_branch_score\", mt_bench_branch_data[\"\ - overall_score\"])\n metrics.log_metric(\n \"mt_bench_branch_base_score\"\ - , mt_bench_branch_data[\"base_overall_score\"]\n )\n\n with open(\"\ - /output/mmlu_branch/mmlu_branch_data.json\", \"r\") as f:\n mmlu_branch_data\ - \ = json.loads(f.read())\n\n metrics.log_metric(\"mmlu_branch_score\"\ - , mmlu_branch_data[\"model_score\"])\n metrics.log_metric(\"mmlu_branch_base_score\"\ - , mmlu_branch_data[\"base_model_score\"])\n\n" + ):\n import json\n\n reports = {\n \"mt_bench\": \"/output/mt_bench_data.json\"\ + ,\n \"mt_bench_branch\": \"/output/mt_bench_branch/mt_bench_branch_data.json\"\ + ,\n \"mmlu_branch\": \"/output/mmlu_branch/mmlu_branch_data.json\"\ + ,\n }\n\n for report, file_name in reports.items():\n with\ + \ open(file_name, \"r\", encoding=\"utf-8\") as f:\n report_data\ + \ = json.load(f)\n\n if report == \"mt_bench\":\n metrics.log_metric(f\"\ + {report}_best_model\", report_data[\"best_model\"])\n metrics.log_metric(f\"\ + {report}_best_score\", report_data[\"best_score\"])\n else:\n \ + \ metrics.log_metric(\n f\"{report}_trained_model_score\"\ + , report_data[\"trained_model_score\"]\n )\n metrics.log_metric(\n\ + \ f\"{report}_base_model_score\", report_data[\"base_model_score\"\ + ]\n )\n\n" image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111 exec-git-clone-op: container: @@ -1188,13 +1183,14 @@ deploymentSpec: ,\n sdg_path: str = \"/input/sdg\",\n mmlu_branch_output_path: str\ \ = \"/output/mmlu_branch\",\n mt_bench_branch_output_path: str = \"\ /output/mt_bench_branch\",\n):\n import json\n import os\n import\ - \ subprocess\n\n import httpx\n import torch\n from instructlab.eval.mmlu\ - \ import MMLUBranchEvaluator\n from instructlab.eval.mt_bench import\ - \ MTBenchBranchEvaluator\n from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores,\ - \ sort_score\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n\ - \ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint =\ - \ os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\"\ - )\n use_tls = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\ + \ subprocess\n from pathlib import Path\n\n import httpx\n import\ + \ torch\n from instructlab.eval.mmlu import MMLUBranchEvaluator\n \ + \ from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n from\ + \ instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score\n\ + \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\ + \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\ + )\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls\ + \ = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\ \ > 0\n )\n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\ \ if use_tls else None\n\n print(\"Starting Final Eval...\")\n\n def\ \ launch_vllm(\n model_path: str, gpu_count: int, retries: int =\ @@ -1335,14 +1331,15 @@ deploymentSpec: \ regressions,\n no_changes,\n )\n\n \ \ mmlu_branch_data = {\n \"report_title\": \"KNOWLEDGE EVALUATION\ \ REPORT\",\n \"max_score\": \"1.0\",\n \"model\"\ - : candidate_model,\n \"model_score\": round(overall_score, 2),\n\ - \ \"base_model\": base_model_dir,\n \"base_model_score\"\ + : candidate_model,\n \"trained_model_score\": round(overall_score,\ + \ 2),\n \"base_model\": base_model_dir,\n \"base_model_score\"\ : round(base_overall_score, 2),\n \"summary\": summary,\n \ \ }\n\n if not os.path.exists(mmlu_branch_output_path):\n \ - \ os.makedirs(mmlu_branch_output_path)\n with open(\n \ - \ f\"{mmlu_branch_output_path}/mmlu_branch_data.json\", \"w\", encoding=\"\ - utf-8\"\n ) as f:\n json.dump(mmlu_branch_data, f, indent=4)\n\ - \n else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\ + \ os.makedirs(mmlu_branch_output_path)\n mmlu_branch_output_file\ + \ = (\n Path(mmlu_branch_output_path) / \"mmlu_branch_data.json\"\ + \n )\n with open(mmlu_branch_output_file, \"w\", encoding=\"\ + utf-8\") as f:\n json.dump(mmlu_branch_data, f, indent=4)\n \ + \ else:\n print(\"No MMLU tasks directories found, skipping MMLU_branch\ \ evaluation.\")\n\n # MT_BENCH_BRANCH\n\n print(\"Starting MT_BENCH_BRANCH\ \ ...\")\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n \ \ judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"\ @@ -1406,12 +1403,13 @@ deploymentSpec: \ new_qnas,\n )\n\n mt_bench_branch_data = {\n \"report_title\"\ : \"SKILLS EVALUATION REPORT\",\n \"model\": candidate_model,\n \ \ \"judge_model\": judge_model_name,\n \"max_score\": \"10.0\"\ - ,\n \"overall_score\": overall_score,\n \"base_overall_score\"\ + ,\n \"trained_model_score\": overall_score,\n \"base_model_score\"\ : base_overall_score,\n \"error_rate\": error_rate,\n \"summary\"\ : summary,\n }\n\n if not os.path.exists(mt_bench_branch_output_path):\n\ - \ os.makedirs(mt_bench_branch_output_path)\n with open(\n \ - \ f\"{mt_bench_branch_output_path}/mt_bench_branch_data.json\",\n \ - \ \"w\",\n encoding=\"utf-8\",\n ) as f:\n json.dump(mt_bench_branch_data,\ + \ os.makedirs(mt_bench_branch_output_path)\n mt_bench_branch_data_file\ + \ = (\n Path(mt_bench_branch_output_path) / \"mt_bench_branch_data.json\"\ + \n )\n with open(\n mt_bench_branch_data_file,\n \"\ + w\",\n encoding=\"utf-8\",\n ) as f:\n json.dump(mt_bench_branch_data,\ \ f, indent=4)\n\n" env: - name: HOME @@ -1449,23 +1447,23 @@ deploymentSpec: \ - 'auto'\n # with 'auto', number of gpus allocated for serving is\ \ calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\ \ max_workers: str,\n models_folder: str,\n output_path: str =\ - \ \"/output/mt_bench_data.json\",\n best_score_file: Optional[str] =\ - \ None,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n\ - \ import json\n import os\n import subprocess\n\n import httpx\n\ - \ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\ - \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\ - \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\ - )\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls\ - \ = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\ - \ > 0\n )\n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\ - \ if use_tls else None\n\n def launch_vllm(\n model_path: str,\ - \ gpu_count: int, retries: int = 120, delay: int = 10\n ) -> tuple:\n\ - \ import subprocess\n import sys\n import time\n\n\ - \ import requests\n from instructlab.model.backends.common\ - \ import free_tcp_ipv4_port\n\n free_port = free_tcp_ipv4_port(\"\ - 127.0.0.1\")\n port = str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\ - \n\n command = [\n sys.executable,\n \"-m\"\ - ,\n \"vllm.entrypoints.openai.api_server\",\n \"--port\"\ + \ \"/output/mt_bench_data.json\",\n) -> NamedTuple(\"outputs\", best_model=str,\ + \ best_score=float):\n import json\n import os\n import subprocess\n\ + \n import httpx\n import torch\n from instructlab.eval.mt_bench\ + \ import MTBenchEvaluator\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\"\ + , \"\")\n judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint\ + \ = os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path = os.getenv(\"\ + JUDGE_CA_CERT_PATH\")\n use_tls = os.path.exists(judge_ca_cert_path)\ + \ and (\n os.path.getsize(judge_ca_cert_path) > 0\n )\n judge_http_client\ + \ = httpx.Client(verify=judge_ca_cert_path) if use_tls else None\n\n \ + \ def launch_vllm(\n model_path: str, gpu_count: int, retries: int\ + \ = 120, delay: int = 10\n ) -> tuple:\n import subprocess\n \ + \ import sys\n import time\n\n import requests\n \ + \ from instructlab.model.backends.common import free_tcp_ipv4_port\n\ + \n free_port = free_tcp_ipv4_port(\"127.0.0.1\")\n port =\ + \ str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\n\ + \n command = [\n sys.executable,\n \"-m\",\n\ + \ \"vllm.entrypoints.openai.api_server\",\n \"--port\"\ ,\n port,\n \"--model\",\n model_path,\n\ \ ]\n if gpu_count > 0:\n command += [\n \ \ \"--tensor-parallel-size\",\n str(gpu_count),\n\ @@ -1529,17 +1527,17 @@ deploymentSpec: \ \"overall_score\": overall_score,\n \"turn_scores\"\ : turn_scores,\n \"qa_scores\": qa_pairs,\n \"error_rate\"\ : error_rate,\n }\n\n all_mt_bench_data.append(mt_bench_data)\n\ - \ scores[model_path] = overall_score\n\n with open(output_path,\ - \ \"w\", encoding=\"utf-8\") as f:\n json.dump(all_mt_bench_data,\ - \ f, indent=4)\n\n outputs = NamedTuple(\"outputs\", best_model=str,\ - \ best_score=float)\n best_model = max(scores, key=scores.get)\n best_score\ - \ = scores[best_model]\n if best_score_file:\n with open(best_score_file,\ - \ \"w\", encoding=\"utf-8\") as f:\n json.dump({\"best_model\"\ - : best_model, \"best_score\": best_score}, f, indent=4)\n\n # Rename\ - \ the best model directory to \"candidate_model\" for the next step\n \ - \ # So we know which model to use for the final evaluation\n if os.path.exists(os.path.join(models_folder,\ - \ \"candidate_model\")):\n print(\"candidate_model already exists.\ - \ Skipping renaming\")\n else:\n os.rename(\n os.path.join(models_folder,\ + \ scores[model_path] = overall_score\n\n outputs = NamedTuple(\"\ + outputs\", best_model=str, best_score=float)\n best_model = max(scores,\ + \ key=scores.get)\n best_score = scores[best_model]\n mt_bench_report\ + \ = {\n \"best_model\": best_model,\n \"best_score\": best_score,\n\ + \ \"reports\": all_mt_bench_data,\n }\n\n with open(output_path,\ + \ \"w\", encoding=\"utf-8\") as f:\n json.dump(mt_bench_report, f,\ + \ indent=4)\n\n # Rename the best model directory to \"candidate_model\"\ + \ for the next step\n # So we know which model to use for the final evaluation\n\ + \ if os.path.exists(os.path.join(models_folder, \"candidate_model\")):\n\ + \ print(\"candidate_model already exists. Skipping renaming\")\n\ + \ else:\n os.rename(\n os.path.join(models_folder,\ \ best_model),\n os.path.join(models_folder, \"candidate_model\"\ ),\n )\n\n return outputs(best_model=best_model, best_score=best_score)\n\ \n" From 898b6f74aa22309e50cd78e7e39339473adea043 Mon Sep 17 00:00:00 2001 From: Michael Clifford Date: Wed, 8 Jan 2025 21:13:24 -0500 Subject: [PATCH 3/3] add components to output artifacts from final eval Signed-off-by: Michael Clifford --- eval/final/components.py | 2 +- pipeline.py | 26 +++++++++++++ pipeline.yaml | 80 ++++++++++++++++++++++++++++++++++++++++ utils/__init__.py | 4 ++ utils/components.py | 20 ++++++++++ 5 files changed, 131 insertions(+), 1 deletion(-) diff --git a/eval/final/components.py b/eval/final/components.py index 6f02a3df..70e5394d 100644 --- a/eval/final/components.py +++ b/eval/final/components.py @@ -1,7 +1,7 @@ # type: ignore # pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error -from kfp.dsl import Artifact, Input, Metrics, Output, component +from kfp.dsl import Metrics, Output, component from utils.consts import PYTHON_IMAGE, RHELAI_IMAGE diff --git a/pipeline.py b/pipeline.py index 2a6643e7..012d2a15 100644 --- a/pipeline.py +++ b/pipeline.py @@ -78,7 +78,9 @@ def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): ) from utils import ( model_to_pvc_op, + pvc_to_mmlu_branch_op, pvc_to_model_op, + pvc_to_mt_bench_branch_op, pvc_to_mt_bench_op, ) @@ -452,6 +454,30 @@ def pipeline( mount_path="/output", ) + output_mt_bench_branch_task = pvc_to_mt_bench_branch_op( + pvc_path="/output/mt_bench_branch/mt_bench_branch_data.json", + ) + output_mt_bench_branch_task.after(final_eval_task) + output_mt_bench_branch_task.set_caching_options(False) + + mount_pvc( + task=output_mt_bench_branch_task, + pvc_name=output_pvc_task.output, + mount_path="/output", + ) + + output_mmlu_branch_task = pvc_to_mmlu_branch_op( + pvc_path="/output/mmlu_branch/mmlu_branch_data.json", + ) + output_mmlu_branch_task.after(final_eval_task) + output_mmlu_branch_task.set_caching_options(False) + + mount_pvc( + task=output_mmlu_branch_task, + pvc_name=output_pvc_task.output, + mount_path="/output", + ) + sdg_pvc_delete_task = DeletePVC(pvc_name=sdg_input_pvc_task.output) sdg_pvc_delete_task.after(final_eval_task) diff --git a/pipeline.yaml b/pipeline.yaml index ac5bcaa3..009d442d 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -329,6 +329,18 @@ components: defaultValue: /model isOptional: true parameterType: STRING + comp-pvc-to-mmlu-branch-op: + executorLabel: exec-pvc-to-mmlu-branch-op + inputDefinitions: + parameters: + pvc_path: + parameterType: STRING + outputDefinitions: + artifacts: + mmlu_branch_output: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 comp-pvc-to-model-op: executorLabel: exec-pvc-to-model-op inputDefinitions: @@ -341,6 +353,18 @@ components: artifactType: schemaTitle: system.Model schemaVersion: 0.0.1 + comp-pvc-to-mt-bench-branch-op: + executorLabel: exec-pvc-to-mt-bench-branch-op + inputDefinitions: + parameters: + pvc_path: + parameterType: STRING + outputDefinitions: + artifacts: + mt_bench_branch_output: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 comp-pvc-to-mt-bench-op: executorLabel: exec-pvc-to-mt-bench-op inputDefinitions: @@ -733,6 +757,14 @@ deploymentSpec: - /bin/sh - -c image: registry.access.redhat.com/ubi9/toolbox + exec-pvc-to-mmlu-branch-op: + container: + args: + - cp -r {{$.inputs.parameters['pvc_path']}} {{$.outputs.artifacts['mmlu_branch_output'].path}} + command: + - /bin/sh + - -c + image: registry.access.redhat.com/ubi9/toolbox exec-pvc-to-model-op: container: args: @@ -741,6 +773,14 @@ deploymentSpec: - /bin/sh - -c image: registry.access.redhat.com/ubi9/toolbox + exec-pvc-to-mt-bench-branch-op: + container: + args: + - cp -r {{$.inputs.parameters['pvc_path']}} {{$.outputs.artifacts['mt_bench_branch_output'].path}} + command: + - /bin/sh + - -c + image: registry.access.redhat.com/ubi9/toolbox exec-pvc-to-mt-bench-op: container: args: @@ -1888,6 +1928,20 @@ root: producerTask: importer taskInfo: name: model-to-pvc-op + pvc-to-mmlu-branch-op: + cachingOptions: {} + componentRef: + name: comp-pvc-to-mmlu-branch-op + dependentTasks: + - createpvc-3 + - run-final-eval-op + inputs: + parameters: + pvc_path: + runtimeValue: + constant: /output/mmlu_branch/mmlu_branch_data.json + taskInfo: + name: pvc-to-mmlu-branch-op pvc-to-model-op: cachingOptions: {} componentRef: @@ -1902,6 +1956,20 @@ root: constant: /output/phase_2/model/hf_format/candidate_model taskInfo: name: pvc-to-model-op + pvc-to-mt-bench-branch-op: + cachingOptions: {} + componentRef: + name: comp-pvc-to-mt-bench-branch-op + dependentTasks: + - createpvc-3 + - run-final-eval-op + inputs: + parameters: + pvc_path: + runtimeValue: + constant: /output/mt_bench_branch/mt_bench_branch_data.json + taskInfo: + name: pvc-to-mt-bench-branch-op pvc-to-mt-bench-op: cachingOptions: {} componentRef: @@ -2336,12 +2404,24 @@ platforms: taskOutputParameter: outputParameterKey: name producerTask: createpvc-2 + exec-pvc-to-mmlu-branch-op: + pvcMount: + - mountPath: /output + taskOutputParameter: + outputParameterKey: name + producerTask: createpvc-3 exec-pvc-to-model-op: pvcMount: - mountPath: /output taskOutputParameter: outputParameterKey: name producerTask: createpvc-3 + exec-pvc-to-mt-bench-branch-op: + pvcMount: + - mountPath: /output + taskOutputParameter: + outputParameterKey: name + producerTask: createpvc-3 exec-pvc-to-mt-bench-op: pvcMount: - mountPath: /output diff --git a/utils/__init__.py b/utils/__init__.py index 7dcc94ed..6e355be3 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -2,13 +2,17 @@ from .components import ( ilab_importer_op, model_to_pvc_op, + pvc_to_mmlu_branch_op, pvc_to_model_op, + pvc_to_mt_bench_branch_op, pvc_to_mt_bench_op, ) __all__ = [ "model_to_pvc_op", "pvc_to_mt_bench_op", + "pvc_to_mt_bench_branch_op", + "pvc_to_mmlu_branch_op", "pvc_to_model_op", "ilab_importer_op", "faked", diff --git a/utils/components.py b/utils/components.py index 35f38481..6d8f5aef 100644 --- a/utils/components.py +++ b/utils/components.py @@ -15,6 +15,26 @@ def pvc_to_mt_bench_op(mt_bench_output: dsl.Output[dsl.Artifact], pvc_path: str) ) +@dsl.container_component +def pvc_to_mt_bench_branch_op( + mt_bench_branch_output: dsl.Output[dsl.Artifact], pvc_path: str +): + return dsl.ContainerSpec( + TOOLBOX_IMAGE, + ["/bin/sh", "-c"], + [f"cp -r {pvc_path} {mt_bench_branch_output.path}"], + ) + + +@dsl.container_component +def pvc_to_mmlu_branch_op(mmlu_branch_output: dsl.Output[dsl.Artifact], pvc_path: str): + return dsl.ContainerSpec( + TOOLBOX_IMAGE, + ["/bin/sh", "-c"], + [f"cp -r {pvc_path} {mmlu_branch_output.path}"], + ) + + @dsl.container_component def pvc_to_model_op(model: dsl.Output[dsl.Model], pvc_path: str): return dsl.ContainerSpec(