diff --git a/eval/final/__init__.py b/eval/final/__init__.py index 62053fe..8edaf62 100644 --- a/eval/final/__init__.py +++ b/eval/final/__init__.py @@ -1,5 +1,5 @@ -from .components import run_final_eval_op +from .components import generate_metrics_report_op, run_final_eval_op # from . import faked -__all__ = ["run_final_eval_op"] +__all__ = ["run_final_eval_op", "generate_metrics_report_op"] diff --git a/eval/final/components.py b/eval/final/components.py index 469d222..70e5394 100644 --- a/eval/final/components.py +++ b/eval/final/components.py @@ -1,15 +1,13 @@ # type: ignore # pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error -from kfp.dsl import Artifact, Output, component +from kfp.dsl import Metrics, Output, component -from utils.consts import RHELAI_IMAGE +from utils.consts import PYTHON_IMAGE, RHELAI_IMAGE @component(base_image=RHELAI_IMAGE, install_kfp_package=False) def run_final_eval_op( - mmlu_branch_output: Output[Artifact], - mt_bench_branch_output: Output[Artifact], base_model_dir: str, base_branch: str, candidate_branch: str, @@ -20,10 +18,13 @@ def run_final_eval_op( candidate_model: str = None, taxonomy_path: str = "/input/taxonomy", sdg_path: str = "/input/sdg", + mmlu_branch_output_path: str = "/output/mmlu_branch", + mt_bench_branch_output_path: str = "/output/mt_bench_branch", ): import json import os import subprocess + from pathlib import Path import httpx import torch @@ -320,13 +321,18 @@ def find_node_dataset_directories(base_dir: str): "report_title": "KNOWLEDGE EVALUATION REPORT", "max_score": "1.0", "model": candidate_model, - "model_score": round(overall_score, 2), + "trained_model_score": round(overall_score, 2), "base_model": base_model_dir, "base_model_score": round(base_overall_score, 2), "summary": summary, } - with open(mmlu_branch_output.path, "w", encoding="utf-8") as f: + if not os.path.exists(mmlu_branch_output_path): + os.makedirs(mmlu_branch_output_path) + mmlu_branch_output_file = ( + Path(mmlu_branch_output_path) / "mmlu_branch_data.json" + ) + with open(mmlu_branch_output_file, "w", encoding="utf-8") as f: json.dump(mmlu_branch_data, f, indent=4) else: print("No MMLU tasks directories found, skipping MMLU_branch evaluation.") @@ -464,11 +470,48 @@ def find_node_dataset_directories(base_dir: str): "model": candidate_model, "judge_model": judge_model_name, "max_score": "10.0", - "overall_score": overall_score, - "base_overall_score": base_overall_score, + "trained_model_score": overall_score, + "base_model_score": base_overall_score, "error_rate": error_rate, "summary": summary, } - with open(mt_bench_branch_output.path, "w", encoding="utf-8") as f: + if not os.path.exists(mt_bench_branch_output_path): + os.makedirs(mt_bench_branch_output_path) + mt_bench_branch_data_file = ( + Path(mt_bench_branch_output_path) / "mt_bench_branch_data.json" + ) + with open( + mt_bench_branch_data_file, + "w", + encoding="utf-8", + ) as f: json.dump(mt_bench_branch_data, f, indent=4) + + +@component(base_image=PYTHON_IMAGE, install_kfp_package=False) +def generate_metrics_report_op( + metrics: Output[Metrics], +): + import json + + reports = { + "mt_bench": "/output/mt_bench_data.json", + "mt_bench_branch": "/output/mt_bench_branch/mt_bench_branch_data.json", + "mmlu_branch": "/output/mmlu_branch/mmlu_branch_data.json", + } + + for report, file_name in reports.items(): + with open(file_name, "r", encoding="utf-8") as f: + report_data = json.load(f) + + if report == "mt_bench": + metrics.log_metric(f"{report}_best_model", 
report_data["best_model"]) + metrics.log_metric(f"{report}_best_score", report_data["best_score"]) + else: + metrics.log_metric( + f"{report}_trained_model_score", report_data["trained_model_score"] + ) + metrics.log_metric( + f"{report}_base_model_score", report_data["base_model_score"] + ) diff --git a/eval/mt_bench/components.py b/eval/mt_bench/components.py index 5b1800a..34723a9 100644 --- a/eval/mt_bench/components.py +++ b/eval/mt_bench/components.py @@ -16,7 +16,6 @@ def run_mt_bench_op( max_workers: str, models_folder: str, output_path: str = "/output/mt_bench_data.json", - best_score_file: Optional[str] = None, ) -> NamedTuple("outputs", best_model=str, best_score=float): import json import os @@ -187,15 +186,17 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20): all_mt_bench_data.append(mt_bench_data) scores[model_path] = overall_score - with open(output_path, "w", encoding="utf-8") as f: - json.dump(all_mt_bench_data, f, indent=4) - outputs = NamedTuple("outputs", best_model=str, best_score=float) best_model = max(scores, key=scores.get) best_score = scores[best_model] - if best_score_file: - with open(best_score_file, "w", encoding="utf-8") as f: - json.dump({"best_model": best_model, "best_score": best_score}, f, indent=4) + mt_bench_report = { + "best_model": best_model, + "best_score": best_score, + "reports": all_mt_bench_data, + } + + with open(output_path, "w", encoding="utf-8") as f: + json.dump(mt_bench_report, f, indent=4) # Rename the best model directory to "candidate_model" for the next step # So we know which model to use for the final evaluation diff --git a/pipeline.py b/pipeline.py index 00d3df2..012d2a1 100644 --- a/pipeline.py +++ b/pipeline.py @@ -78,12 +78,14 @@ def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): ) from utils import ( model_to_pvc_op, + pvc_to_mmlu_branch_op, pvc_to_model_op, + pvc_to_mt_bench_branch_op, pvc_to_mt_bench_op, ) # Imports for evaluation - from eval.final import run_final_eval_op + from eval.final import generate_metrics_report_op, run_final_eval_op from eval.mt_bench import run_mt_bench_op @dsl.pipeline( @@ -452,9 +454,28 @@ def pipeline( mount_path="/output", ) - output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output) - output_pvc_delete_task.after( - output_model_task, output_mt_bench_task, final_eval_task + output_mt_bench_branch_task = pvc_to_mt_bench_branch_op( + pvc_path="/output/mt_bench_branch/mt_bench_branch_data.json", + ) + output_mt_bench_branch_task.after(final_eval_task) + output_mt_bench_branch_task.set_caching_options(False) + + mount_pvc( + task=output_mt_bench_branch_task, + pvc_name=output_pvc_task.output, + mount_path="/output", + ) + + output_mmlu_branch_task = pvc_to_mmlu_branch_op( + pvc_path="/output/mmlu_branch/mmlu_branch_data.json", + ) + output_mmlu_branch_task.after(final_eval_task) + output_mmlu_branch_task.set_caching_options(False) + + mount_pvc( + task=output_mmlu_branch_task, + pvc_name=output_pvc_task.output, + mount_path="/output", ) sdg_pvc_delete_task = DeletePVC(pvc_name=sdg_input_pvc_task.output) @@ -463,6 +484,21 @@ def pipeline( model_pvc_delete_task = DeletePVC(pvc_name=model_pvc_task.output) model_pvc_delete_task.after(final_eval_task) + generate_metrics_report_task = generate_metrics_report_op() + generate_metrics_report_task.after(output_mt_bench_task, final_eval_task) + generate_metrics_report_task.set_caching_options(False) + mount_pvc( + task=generate_metrics_report_task, + pvc_name=output_pvc_task.output, + mount_path="/output", + ) + + 
output_pvc_delete_task = DeletePVC(pvc_name=output_pvc_task.output) + output_pvc_delete_task.after( + output_model_task, + generate_metrics_report_task, + ) + return return pipeline diff --git a/pipeline.yaml b/pipeline.yaml index ef5d586..009d442 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -30,6 +30,8 @@ # train_num_warmup_steps_phase_2: int [Default: 1000.0] # train_save_samples: int [Default: 250000.0] # train_seed: int [Default: 42.0] +# Outputs: +# generate-metrics-report-op-metrics: system.Metrics components: comp-createpvc: executorLabel: exec-createpvc @@ -266,6 +268,14 @@ components: description: Name of the PVC to delete. Supports passing a runtime-generated name, such as a name provided by ``kubernetes.CreatePvcOp().outputs['name']``. parameterType: STRING + comp-generate-metrics-report-op: + executorLabel: exec-generate-metrics-report-op + outputDefinitions: + artifacts: + metrics: + artifactType: + schemaTitle: system.Metrics + schemaVersion: 0.0.1 comp-git-clone-op: executorLabel: exec-git-clone-op inputDefinitions: @@ -319,6 +329,18 @@ components: defaultValue: /model isOptional: true parameterType: STRING + comp-pvc-to-mmlu-branch-op: + executorLabel: exec-pvc-to-mmlu-branch-op + inputDefinitions: + parameters: + pvc_path: + parameterType: STRING + outputDefinitions: + artifacts: + mmlu_branch_output: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 comp-pvc-to-model-op: executorLabel: exec-pvc-to-model-op inputDefinitions: @@ -331,6 +353,18 @@ components: artifactType: schemaTitle: system.Model schemaVersion: 0.0.1 + comp-pvc-to-mt-bench-branch-op: + executorLabel: exec-pvc-to-mt-bench-branch-op + inputDefinitions: + parameters: + pvc_path: + parameterType: STRING + outputDefinitions: + artifacts: + mt_bench_branch_output: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 comp-pvc-to-mt-bench-op: executorLabel: exec-pvc-to-mt-bench-op inputDefinitions: @@ -464,6 +498,14 @@ components: parameterType: STRING merge_system_user_message: parameterType: BOOLEAN + mmlu_branch_output_path: + defaultValue: /output/mmlu_branch + isOptional: true + parameterType: STRING + mt_bench_branch_output_path: + defaultValue: /output/mt_bench_branch + isOptional: true + parameterType: STRING sdg_path: defaultValue: /input/sdg isOptional: true @@ -472,23 +514,10 @@ components: defaultValue: /input/taxonomy isOptional: true parameterType: STRING - outputDefinitions: - artifacts: - mmlu_branch_output: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 - mt_bench_branch_output: - artifactType: - schemaTitle: system.Artifact - schemaVersion: 0.0.1 comp-run-mt-bench-op: executorLabel: exec-run-mt-bench-op inputDefinitions: parameters: - best_score_file: - isOptional: true - parameterType: STRING max_workers: parameterType: STRING merge_system_user_message: @@ -658,6 +687,39 @@ deploymentSpec: exec-deletepvc-3: container: image: argostub/deletepvc + exec-generate-metrics-report-op: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - generate_metrics_report_op + command: + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef generate_metrics_report_op(\n metrics: Output[Metrics],\n\ + ):\n import json\n\n reports = {\n 
\"mt_bench\": \"/output/mt_bench_data.json\"\ + ,\n \"mt_bench_branch\": \"/output/mt_bench_branch/mt_bench_branch_data.json\"\ + ,\n \"mmlu_branch\": \"/output/mmlu_branch/mmlu_branch_data.json\"\ + ,\n }\n\n for report, file_name in reports.items():\n with\ + \ open(file_name, \"r\", encoding=\"utf-8\") as f:\n report_data\ + \ = json.load(f)\n\n if report == \"mt_bench\":\n metrics.log_metric(f\"\ + {report}_best_model\", report_data[\"best_model\"])\n metrics.log_metric(f\"\ + {report}_best_score\", report_data[\"best_score\"])\n else:\n \ + \ metrics.log_metric(\n f\"{report}_trained_model_score\"\ + , report_data[\"trained_model_score\"]\n )\n metrics.log_metric(\n\ + \ f\"{report}_base_model_score\", report_data[\"base_model_score\"\ + ]\n )\n\n" + image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111 exec-git-clone-op: container: args: @@ -695,6 +757,14 @@ deploymentSpec: - /bin/sh - -c image: registry.access.redhat.com/ubi9/toolbox + exec-pvc-to-mmlu-branch-op: + container: + args: + - cp -r {{$.inputs.parameters['pvc_path']}} {{$.outputs.artifacts['mmlu_branch_output'].path}} + command: + - /bin/sh + - -c + image: registry.access.redhat.com/ubi9/toolbox exec-pvc-to-model-op: container: args: @@ -703,6 +773,14 @@ deploymentSpec: - /bin/sh - -c image: registry.access.redhat.com/ubi9/toolbox + exec-pvc-to-mt-bench-branch-op: + container: + args: + - cp -r {{$.inputs.parameters['pvc_path']}} {{$.outputs.artifacts['mt_bench_branch_output'].path}} + command: + - /bin/sh + - -c + image: registry.access.redhat.com/ubi9/toolbox exec-pvc-to-mt-bench-op: container: args: @@ -1138,37 +1216,39 @@ deploymentSpec: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef run_final_eval_op(\n mmlu_branch_output: Output[Artifact],\n\ - \ mt_bench_branch_output: Output[Artifact],\n base_model_dir: str,\n\ - \ base_branch: str,\n candidate_branch: str,\n max_workers: str,\n\ - \ few_shots: int,\n batch_size: str,\n merge_system_user_message:\ - \ bool,\n candidate_model: str = None,\n taxonomy_path: str = \"/input/taxonomy\"\ - ,\n sdg_path: str = \"/input/sdg\",\n):\n import json\n import\ - \ os\n import subprocess\n\n import httpx\n import torch\n from\ - \ instructlab.eval.mmlu import MMLUBranchEvaluator\n from instructlab.eval.mt_bench\ - \ import MTBenchBranchEvaluator\n from instructlab.model.evaluate import\ - \ qa_pairs_to_qna_to_avg_scores, sort_score\n\n judge_api_key = os.getenv(\"\ - JUDGE_API_KEY\", \"\")\n judge_model_name = os.getenv(\"JUDGE_NAME\"\ - )\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path\ - \ = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls = os.path.exists(judge_ca_cert_path)\ - \ and (\n os.path.getsize(judge_ca_cert_path) > 0\n )\n judge_http_client\ - \ = httpx.Client(verify=judge_ca_cert_path) if use_tls else None\n\n \ - \ print(\"Starting Final Eval...\")\n\n def launch_vllm(\n model_path:\ - \ str, gpu_count: int, retries: int = 120, delay: int = 10\n ) -> tuple:\n\ - \ import subprocess\n import sys\n import time\n\n\ - \ import requests\n from instructlab.model.backends.common\ - \ import free_tcp_ipv4_port\n\n free_port = free_tcp_ipv4_port(\"\ - 127.0.0.1\")\n port = str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\ - \n\n command = [\n sys.executable,\n \"-m\"\ - ,\n \"vllm.entrypoints.openai.api_server\",\n \"--port\"\ - ,\n port,\n \"--model\",\n model_path,\n\ - \ ]\n if gpu_count > 0:\n command += [\n \ - \ \"--tensor-parallel-size\",\n str(gpu_count),\n\ - \ ]\n\n 
process = subprocess.Popen(args=command)\n\n \ - \ print(f\"Waiting for vLLM server to start at {vllm_server}...\"\ - )\n\n for attempt in range(retries):\n try:\n \ - \ response = requests.get(f\"{vllm_server}/models\", timeout=10)\n\ - \ if response.status_code == 200:\n print(f\"\ + \ *\n\ndef run_final_eval_op(\n base_model_dir: str,\n base_branch:\ + \ str,\n candidate_branch: str,\n max_workers: str,\n few_shots:\ + \ int,\n batch_size: str,\n merge_system_user_message: bool,\n \ + \ candidate_model: str = None,\n taxonomy_path: str = \"/input/taxonomy\"\ + ,\n sdg_path: str = \"/input/sdg\",\n mmlu_branch_output_path: str\ + \ = \"/output/mmlu_branch\",\n mt_bench_branch_output_path: str = \"\ + /output/mt_bench_branch\",\n):\n import json\n import os\n import\ + \ subprocess\n from pathlib import Path\n\n import httpx\n import\ + \ torch\n from instructlab.eval.mmlu import MMLUBranchEvaluator\n \ + \ from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n from\ + \ instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score\n\ + \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\ + \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\ + )\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls\ + \ = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\ + \ > 0\n )\n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\ + \ if use_tls else None\n\n print(\"Starting Final Eval...\")\n\n def\ + \ launch_vllm(\n model_path: str, gpu_count: int, retries: int =\ + \ 120, delay: int = 10\n ) -> tuple:\n import subprocess\n \ + \ import sys\n import time\n\n import requests\n \ + \ from instructlab.model.backends.common import free_tcp_ipv4_port\n\n\ + \ free_port = free_tcp_ipv4_port(\"127.0.0.1\")\n port = str(free_port)\n\ + \ vllm_server = f\"http://127.0.0.1:{port}/v1\"\n\n command\ + \ = [\n sys.executable,\n \"-m\",\n \"\ + vllm.entrypoints.openai.api_server\",\n \"--port\",\n \ + \ port,\n \"--model\",\n model_path,\n \ + \ ]\n if gpu_count > 0:\n command += [\n \ + \ \"--tensor-parallel-size\",\n str(gpu_count),\n \ + \ ]\n\n process = subprocess.Popen(args=command)\n\n \ + \ print(f\"Waiting for vLLM server to start at {vllm_server}...\")\n\n\ + \ for attempt in range(retries):\n try:\n \ + \ response = requests.get(f\"{vllm_server}/models\", timeout=10)\n \ + \ if response.status_code == 200:\n print(f\"\ vLLM server is up and running at {vllm_server}.\")\n \ \ return process, vllm_server\n except requests.ConnectionError:\n\ \ pass\n\n print(\n f\"Server not\ @@ -1291,10 +1371,13 @@ deploymentSpec: \ regressions,\n no_changes,\n )\n\n \ \ mmlu_branch_data = {\n \"report_title\": \"KNOWLEDGE EVALUATION\ \ REPORT\",\n \"max_score\": \"1.0\",\n \"model\"\ - : candidate_model,\n \"model_score\": round(overall_score, 2),\n\ - \ \"base_model\": base_model_dir,\n \"base_model_score\"\ + : candidate_model,\n \"trained_model_score\": round(overall_score,\ + \ 2),\n \"base_model\": base_model_dir,\n \"base_model_score\"\ : round(base_overall_score, 2),\n \"summary\": summary,\n \ - \ }\n\n with open(mmlu_branch_output.path, \"w\", encoding=\"\ + \ }\n\n if not os.path.exists(mmlu_branch_output_path):\n \ + \ os.makedirs(mmlu_branch_output_path)\n mmlu_branch_output_file\ + \ = (\n Path(mmlu_branch_output_path) / \"mmlu_branch_data.json\"\ + \n )\n with open(mmlu_branch_output_file, \"w\", encoding=\"\ utf-8\") as f:\n json.dump(mmlu_branch_data, f, indent=4)\n \ \ else:\n 
print(\"No MMLU tasks directories found, skipping MMLU_branch\ \ evaluation.\")\n\n # MT_BENCH_BRANCH\n\n print(\"Starting MT_BENCH_BRANCH\ @@ -1360,10 +1443,14 @@ deploymentSpec: \ new_qnas,\n )\n\n mt_bench_branch_data = {\n \"report_title\"\ : \"SKILLS EVALUATION REPORT\",\n \"model\": candidate_model,\n \ \ \"judge_model\": judge_model_name,\n \"max_score\": \"10.0\"\ - ,\n \"overall_score\": overall_score,\n \"base_overall_score\"\ + ,\n \"trained_model_score\": overall_score,\n \"base_model_score\"\ : base_overall_score,\n \"error_rate\": error_rate,\n \"summary\"\ - : summary,\n }\n\n with open(mt_bench_branch_output.path, \"w\", encoding=\"\ - utf-8\") as f:\n json.dump(mt_bench_branch_data, f, indent=4)\n\n" + : summary,\n }\n\n if not os.path.exists(mt_bench_branch_output_path):\n\ + \ os.makedirs(mt_bench_branch_output_path)\n mt_bench_branch_data_file\ + \ = (\n Path(mt_bench_branch_output_path) / \"mt_bench_branch_data.json\"\ + \n )\n with open(\n mt_bench_branch_data_file,\n \"\ + w\",\n encoding=\"utf-8\",\n ) as f:\n json.dump(mt_bench_branch_data,\ + \ f, indent=4)\n\n" env: - name: HOME value: /tmp @@ -1400,23 +1487,23 @@ deploymentSpec: \ - 'auto'\n # with 'auto', number of gpus allocated for serving is\ \ calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\ \ max_workers: str,\n models_folder: str,\n output_path: str =\ - \ \"/output/mt_bench_data.json\",\n best_score_file: Optional[str] =\ - \ None,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n\ - \ import json\n import os\n import subprocess\n\n import httpx\n\ - \ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\ - \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\ - \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\ - )\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls\ - \ = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\ - \ > 0\n )\n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\ - \ if use_tls else None\n\n def launch_vllm(\n model_path: str,\ - \ gpu_count: int, retries: int = 120, delay: int = 10\n ) -> tuple:\n\ - \ import subprocess\n import sys\n import time\n\n\ - \ import requests\n from instructlab.model.backends.common\ - \ import free_tcp_ipv4_port\n\n free_port = free_tcp_ipv4_port(\"\ - 127.0.0.1\")\n port = str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\ - \n\n command = [\n sys.executable,\n \"-m\"\ - ,\n \"vllm.entrypoints.openai.api_server\",\n \"--port\"\ + \ \"/output/mt_bench_data.json\",\n) -> NamedTuple(\"outputs\", best_model=str,\ + \ best_score=float):\n import json\n import os\n import subprocess\n\ + \n import httpx\n import torch\n from instructlab.eval.mt_bench\ + \ import MTBenchEvaluator\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\"\ + , \"\")\n judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint\ + \ = os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path = os.getenv(\"\ + JUDGE_CA_CERT_PATH\")\n use_tls = os.path.exists(judge_ca_cert_path)\ + \ and (\n os.path.getsize(judge_ca_cert_path) > 0\n )\n judge_http_client\ + \ = httpx.Client(verify=judge_ca_cert_path) if use_tls else None\n\n \ + \ def launch_vllm(\n model_path: str, gpu_count: int, retries: int\ + \ = 120, delay: int = 10\n ) -> tuple:\n import subprocess\n \ + \ import sys\n import time\n\n import requests\n \ + \ from instructlab.model.backends.common import 
free_tcp_ipv4_port\n\ + \n free_port = free_tcp_ipv4_port(\"127.0.0.1\")\n port =\ + \ str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\n\ + \n command = [\n sys.executable,\n \"-m\",\n\ + \ \"vllm.entrypoints.openai.api_server\",\n \"--port\"\ ,\n port,\n \"--model\",\n model_path,\n\ \ ]\n if gpu_count > 0:\n command += [\n \ \ \"--tensor-parallel-size\",\n str(gpu_count),\n\ @@ -1480,17 +1567,17 @@ deploymentSpec: \ \"overall_score\": overall_score,\n \"turn_scores\"\ : turn_scores,\n \"qa_scores\": qa_pairs,\n \"error_rate\"\ : error_rate,\n }\n\n all_mt_bench_data.append(mt_bench_data)\n\ - \ scores[model_path] = overall_score\n\n with open(output_path,\ - \ \"w\", encoding=\"utf-8\") as f:\n json.dump(all_mt_bench_data,\ - \ f, indent=4)\n\n outputs = NamedTuple(\"outputs\", best_model=str,\ - \ best_score=float)\n best_model = max(scores, key=scores.get)\n best_score\ - \ = scores[best_model]\n if best_score_file:\n with open(best_score_file,\ - \ \"w\", encoding=\"utf-8\") as f:\n json.dump({\"best_model\"\ - : best_model, \"best_score\": best_score}, f, indent=4)\n\n # Rename\ - \ the best model directory to \"candidate_model\" for the next step\n \ - \ # So we know which model to use for the final evaluation\n if os.path.exists(os.path.join(models_folder,\ - \ \"candidate_model\")):\n print(\"candidate_model already exists.\ - \ Skipping renaming\")\n else:\n os.rename(\n os.path.join(models_folder,\ + \ scores[model_path] = overall_score\n\n outputs = NamedTuple(\"\ + outputs\", best_model=str, best_score=float)\n best_model = max(scores,\ + \ key=scores.get)\n best_score = scores[best_model]\n mt_bench_report\ + \ = {\n \"best_model\": best_model,\n \"best_score\": best_score,\n\ + \ \"reports\": all_mt_bench_data,\n }\n\n with open(output_path,\ + \ \"w\", encoding=\"utf-8\") as f:\n json.dump(mt_bench_report, f,\ + \ indent=4)\n\n # Rename the best model directory to \"candidate_model\"\ + \ for the next step\n # So we know which model to use for the final evaluation\n\ + \ if os.path.exists(os.path.join(models_folder, \"candidate_model\")):\n\ + \ print(\"candidate_model already exists. 
Skipping renaming\")\n\ + \ else:\n os.rename(\n os.path.join(models_folder,\ \ best_model),\n os.path.join(models_folder, \"candidate_model\"\ ),\n )\n\n return outputs(best_model=best_model, best_score=best_score)\n\ \n" @@ -1646,6 +1733,12 @@ pipelineInfo: name: instructlab root: dag: + outputs: + artifacts: + generate-metrics-report-op-metrics: + artifactSelectors: + - outputArtifactKey: metrics + producerSubtask: generate-metrics-report-op tasks: createpvc: cachingOptions: @@ -1731,16 +1824,14 @@ root: componentRef: name: comp-deletepvc dependentTasks: - - createpvc-3 - - pvc-to-model-op - - pvc-to-mt-bench-op + - createpvc - run-final-eval-op inputs: parameters: pvc_name: taskOutputParameter: outputParameterKey: name - producerTask: createpvc-3 + producerTask: createpvc taskInfo: name: deletepvc deletepvc-2: @@ -1749,14 +1840,14 @@ root: componentRef: name: comp-deletepvc-2 dependentTasks: - - createpvc + - createpvc-2 - run-final-eval-op inputs: parameters: pvc_name: taskOutputParameter: outputParameterKey: name - producerTask: createpvc + producerTask: createpvc-2 taskInfo: name: deletepvc-2 deletepvc-3: @@ -1765,16 +1856,27 @@ root: componentRef: name: comp-deletepvc-3 dependentTasks: - - createpvc-2 - - run-final-eval-op + - createpvc-3 + - generate-metrics-report-op + - pvc-to-model-op inputs: parameters: pvc_name: taskOutputParameter: outputParameterKey: name - producerTask: createpvc-2 + producerTask: createpvc-3 taskInfo: name: deletepvc-3 + generate-metrics-report-op: + cachingOptions: {} + componentRef: + name: comp-generate-metrics-report-op + dependentTasks: + - createpvc-3 + - pvc-to-mt-bench-op + - run-final-eval-op + taskInfo: + name: generate-metrics-report-op git-clone-op: cachingOptions: {} componentRef: @@ -1826,6 +1928,20 @@ root: producerTask: importer taskInfo: name: model-to-pvc-op + pvc-to-mmlu-branch-op: + cachingOptions: {} + componentRef: + name: comp-pvc-to-mmlu-branch-op + dependentTasks: + - createpvc-3 + - run-final-eval-op + inputs: + parameters: + pvc_path: + runtimeValue: + constant: /output/mmlu_branch/mmlu_branch_data.json + taskInfo: + name: pvc-to-mmlu-branch-op pvc-to-model-op: cachingOptions: {} componentRef: @@ -1840,6 +1956,20 @@ root: constant: /output/phase_2/model/hf_format/candidate_model taskInfo: name: pvc-to-model-op + pvc-to-mt-bench-branch-op: + cachingOptions: {} + componentRef: + name: comp-pvc-to-mt-bench-branch-op + dependentTasks: + - createpvc-3 + - run-final-eval-op + inputs: + parameters: + pvc_path: + runtimeValue: + constant: /output/mt_bench_branch/mt_bench_branch_data.json + taskInfo: + name: pvc-to-mt-bench-branch-op pvc-to-mt-bench-op: cachingOptions: {} componentRef: @@ -2227,6 +2357,12 @@ root: description: Training parameter. Random seed for initializing training. 
isOptional: true parameterType: NUMBER_INTEGER + outputDefinitions: + artifacts: + generate-metrics-report-op-metrics: + artifactType: + schemaTitle: system.Metrics + schemaVersion: 0.0.1 schemaVersion: 2.1.0 sdkVersion: kfp-2.9.0 --- @@ -2244,6 +2380,12 @@ platforms: taskOutputParameter: outputParameterKey: name producerTask: createpvc + exec-generate-metrics-report-op: + pvcMount: + - mountPath: /output + taskOutputParameter: + outputParameterKey: name + producerTask: createpvc-3 exec-git-clone-op: pvcMount: - mountPath: /data @@ -2262,12 +2404,24 @@ platforms: taskOutputParameter: outputParameterKey: name producerTask: createpvc-2 + exec-pvc-to-mmlu-branch-op: + pvcMount: + - mountPath: /output + taskOutputParameter: + outputParameterKey: name + producerTask: createpvc-3 exec-pvc-to-model-op: pvcMount: - mountPath: /output taskOutputParameter: outputParameterKey: name producerTask: createpvc-3 + exec-pvc-to-mt-bench-branch-op: + pvcMount: + - mountPath: /output + taskOutputParameter: + outputParameterKey: name + producerTask: createpvc-3 exec-pvc-to-mt-bench-op: pvcMount: - mountPath: /output diff --git a/utils/__init__.py b/utils/__init__.py index 7dcc94e..6e355be 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -2,13 +2,17 @@ from .components import ( ilab_importer_op, model_to_pvc_op, + pvc_to_mmlu_branch_op, pvc_to_model_op, + pvc_to_mt_bench_branch_op, pvc_to_mt_bench_op, ) __all__ = [ "model_to_pvc_op", "pvc_to_mt_bench_op", + "pvc_to_mt_bench_branch_op", + "pvc_to_mmlu_branch_op", "pvc_to_model_op", "ilab_importer_op", "faked", diff --git a/utils/components.py b/utils/components.py index 35f3848..6d8f5ae 100644 --- a/utils/components.py +++ b/utils/components.py @@ -15,6 +15,26 @@ def pvc_to_mt_bench_op(mt_bench_output: dsl.Output[dsl.Artifact], pvc_path: str) ) +@dsl.container_component +def pvc_to_mt_bench_branch_op( + mt_bench_branch_output: dsl.Output[dsl.Artifact], pvc_path: str +): + return dsl.ContainerSpec( + TOOLBOX_IMAGE, + ["/bin/sh", "-c"], + [f"cp -r {pvc_path} {mt_bench_branch_output.path}"], + ) + + +@dsl.container_component +def pvc_to_mmlu_branch_op(mmlu_branch_output: dsl.Output[dsl.Artifact], pvc_path: str): + return dsl.ContainerSpec( + TOOLBOX_IMAGE, + ["/bin/sh", "-c"], + [f"cp -r {pvc_path} {mmlu_branch_output.path}"], + ) + + @dsl.container_component def pvc_to_model_op(model: dsl.Output[dsl.Model], pvc_path: str): return dsl.ContainerSpec(
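Note (not part of the diff): the wiring above assumes a data contract between the eval components and the new generate_metrics_report_op — run_mt_bench_op now writes /output/mt_bench_data.json with best_model/best_score/reports, and run_final_eval_op writes mt_bench_branch_data.json and mmlu_branch_data.json with trained_model_score/base_model_score. Below is a minimal local sketch of that contract only; FakeMetrics and the temporary directory are hypothetical stand-ins for kfp.dsl.Metrics and the /output PVC mount, not part of this change.

# Sketch: exercises the key names generate_metrics_report_op reads against
# the shapes the eval components now write. Everything here is a stand-in.
import json
import tempfile
from pathlib import Path


class FakeMetrics:
    # Hypothetical stand-in for kfp.dsl.Metrics; just prints what would be logged.
    def log_metric(self, name, value):
        print(f"{name} = {value}")


out = Path(tempfile.mkdtemp())  # stands in for the /output PVC mount
(out / "mt_bench_branch").mkdir()
(out / "mmlu_branch").mkdir()

# Shapes mirror what run_mt_bench_op and run_final_eval_op write in this diff.
(out / "mt_bench_data.json").write_text(
    json.dumps({"best_model": "candidate_model", "best_score": 7.9, "reports": []})
)
(out / "mt_bench_branch" / "mt_bench_branch_data.json").write_text(
    json.dumps({"trained_model_score": 7.9, "base_model_score": 7.1})
)
(out / "mmlu_branch" / "mmlu_branch_data.json").write_text(
    json.dumps({"trained_model_score": 0.61, "base_model_score": 0.55})
)

reports = {
    "mt_bench": out / "mt_bench_data.json",
    "mt_bench_branch": out / "mt_bench_branch" / "mt_bench_branch_data.json",
    "mmlu_branch": out / "mmlu_branch" / "mmlu_branch_data.json",
}

# Same loop body as generate_metrics_report_op, run against the fake reports.
metrics = FakeMetrics()
for report, file_name in reports.items():
    with open(file_name, "r", encoding="utf-8") as f:
        report_data = json.load(f)
    if report == "mt_bench":
        metrics.log_metric(f"{report}_best_model", report_data["best_model"])
        metrics.log_metric(f"{report}_best_score", report_data["best_score"])
    else:
        metrics.log_metric(
            f"{report}_trained_model_score", report_data["trained_model_score"]
        )
        metrics.log_metric(
            f"{report}_base_model_score", report_data["base_model_score"]
        )

If the key names drift (for example the earlier model_score/overall_score names removed in this diff), the report generation step fails with a KeyError at runtime, which is the behavior this sketch makes easy to check before compiling the pipeline.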