diff --git a/eval/final/__init__.py b/eval/final/__init__.py index 98521577..62053fe8 100644 --- a/eval/final/__init__.py +++ b/eval/final/__init__.py @@ -1,5 +1,5 @@ -from .components import run_mt_bench_branch_op +from .components import run_final_eval_op # from . import faked -__all__ = ["run_mt_bench_branch_op"] +__all__ = ["run_final_eval_op"] diff --git a/eval/final/components.py b/eval/final/components.py index b63e8b49..e4d9036d 100644 --- a/eval/final/components.py +++ b/eval/final/components.py @@ -14,15 +14,20 @@ "vllm", ], ) -def run_mt_bench_branch_op( +def run_final_eval_op( + mmlu_branch_output: Output[Artifact], mt_bench_branch_output: Output[Artifact], candidate_model: str, base_model_dir: str, + tasks: Input[Dataset], taxonomy: Input[Dataset], base_branch: str, candidate_branch: str, max_workers: str, device: str, + model_dtype: str, + few_shots: int, + batch_size: int, merge_system_user_message: bool, ): import json @@ -34,6 +39,7 @@ def run_mt_bench_branch_op( launch_vllm, stop_vllm, ) + from instructlab.eval.mmlu import MMLU_TASKS, MMLUBranchEvaluator from instructlab.eval.mt_bench import MTBenchBranchEvaluator from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score @@ -106,6 +112,101 @@ def branch_eval_summary_to_json( print(f"GPU Available: {gpu_available}, Using: {gpu_name}") + # MMLU_BRANCH + + # find_node_dataset_directories to find sdg output node_datasets_* + def find_node_dataset_directories(base_directory: str): + import os + import re + + # This is specific to ilab/eval output + pattern = r"node_datasets_" + matching_dirs = [] + regex = re.compile(pattern) + + for root, dirs, files in os.walk(base_directory): + for directory in dirs: + if regex.search(directory): + matching_dirs.append(os.path.join(root, directory)) + + return matching_dirs + + mmlu_tasks = ["mmlu_pr"] + + node_dataset_dirs = find_node_dataset_directories(tasks.path) + # This assumes generated filesystem from ilab sdg, which + # generates a node_datasets_ directory for MMLU custom tasks data + if node_dataset_dirs: + tasks_dir = node_dataset_dirs[0] + + mmlu_branch_evaluators = [ + MMLUBranchEvaluator( + model_path=candidate_model, + tasks_dir=tasks_dir, + tasks=mmlu_tasks, + few_shots=few_shots, + batch_size=batch_size, + ), + MMLUBranchEvaluator( + model_path=base_model_dir, + tasks_dir=tasks_dir, + tasks=mmlu_tasks, + few_shots=few_shots, + batch_size=batch_size, + ), + ] + m_paths = [candidate_model, base_model_dir] + overall_scores = [] + individual_scores_list = [] + for i, evaluator in enumerate(mmlu_branch_evaluators): + m_path = m_paths[i] + launch_vllm(m_path, gpu_count) + overall_score, individual_scores = evaluator.run(VLLM_SERVER) + overall_scores.append(overall_score) + individual_scores_list.append(individual_scores) + stop_vllm() + + # TODO: update instructlab/instructlab model/evaluate.py + # so this logic can be imported outside of the CLI + overall_score = overall_scores[0] + base_overall_score = overall_scores[1] + individual_scores = individual_scores_list[0] + base_individual_scores = individual_scores_list[1] + + improvements, regressions, no_changes = [], [], [] + for task, score in individual_scores.items(): + base_score = base_individual_scores[task] + s = score["score"] + b_s = base_score["score"] + d = round(s - b_s, 2) + if s > b_s: + improvements.append((task, d, b_s, s)) + elif b_s > s: + regressions.append((task, d, b_s, s)) + else: + no_changes.append((task, s)) + + summary = branch_eval_summary_to_json( + improvements, + regressions, + no_changes, + ) + + mmlu_branch_data = { + "report_title": "KNOWLEDGE EVALUATION REPORT", + "max_score": "1.0", + "model": candidate_model, + "model_score": round(overall_score, 2), + "base_model": base_model_dir, + "base_model_score": round(base_overall_score, 2), + "summary": summary, + } + + with open(mmlu_branch_output.path, "w") as f: + json.dump(mmlu_branch_data, f, indent=4) + else: + print("No MMLU tasks directories found, skipping MMLU_branch evaluation.") + # MT_BENCH_BRANCH judge_api_key = os.getenv("JUDGE_API_KEY", "") diff --git a/pipeline.py b/pipeline.py index bf940991..1f8d12e0 100644 --- a/pipeline.py +++ b/pipeline.py @@ -65,7 +65,7 @@ def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): ) # Imports for evaluation - from eval.final import run_mt_bench_branch_op + from eval.final import run_final_eval_op from eval.mmlu import load_mmlu_results_op, run_mmlu_op ## from eval.mmlu import run_mmlu_op, load_mmlu_results_op @@ -313,9 +313,10 @@ def pipeline( use_secret_as_env(run_mt_bench_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"}) - final_eval_task = run_mt_bench_branch_op( + final_eval_task = run_final_eval_op( candidate_model=run_mt_bench_task.outputs["best_model"], taxonomy=git_clone_task.outputs["taxonomy"], + tasks=sdg_task.outputs["sdg"], # TODO: DO we need both candidate_branch and base_branch base_branch=repo_branch, candidate_branch=repo_branch, @@ -323,6 +324,9 @@ def pipeline( base_model_dir=BASE_MODEL_DIR, max_workers=max_workers, merge_system_user_message=merge_system_user_message, + model_dtype=model_dtype, + few_shots=few_shots, + batch_size=batch_size, ) mount_pvc( diff --git a/pipeline.yaml b/pipeline.yaml index 3e9b3a90..e45acc8b 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -423,10 +423,14 @@ components: parameterType: STRING name: parameterType: STRING - comp-run-mt-bench-branch-op: - executorLabel: exec-run-mt-bench-branch-op + comp-run-final-eval-op: + executorLabel: exec-run-final-eval-op inputDefinitions: artifacts: + tasks: + artifactType: + schemaTitle: system.Dataset + schemaVersion: 0.0.1 taxonomy: artifactType: schemaTitle: system.Dataset @@ -436,18 +440,28 @@ components: parameterType: STRING base_model_dir: parameterType: STRING + batch_size: + parameterType: NUMBER_INTEGER candidate_branch: parameterType: STRING candidate_model: parameterType: STRING device: parameterType: STRING + few_shots: + parameterType: NUMBER_INTEGER max_workers: parameterType: STRING merge_system_user_message: parameterType: BOOLEAN + model_dtype: + parameterType: STRING outputDefinitions: artifacts: + mmlu_branch_output: + artifactType: + schemaTitle: system.Artifact + schemaVersion: 0.0.1 mt_bench_branch_output: artifactType: schemaTitle: system.Artifact @@ -966,13 +980,13 @@ deploymentSpec: \ claimName: {output_pvc_name}\n \"\"\"\n\ \ )\n\n return Outputs(manifest, name)\n\n" image: registry.access.redhat.com/ubi9/python-311:latest - exec-run-mt-bench-branch-op: + exec-run-final-eval-op: container: args: - --executor_input - '{{$}}' - --function_to_execute - - run_mt_bench_branch_op + - run_final_eval_op command: - sh - -c @@ -993,15 +1007,17 @@ deploymentSpec: ' - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef run_mt_bench_branch_op(\n mt_bench_branch_output: Output[Artifact],\n\ - \ candidate_model: str,\n base_model_dir: str,\n taxonomy: Input[Dataset],\n\ + \ *\n\ndef run_final_eval_op(\n mmlu_branch_output: Output[Artifact],\n\ + \ mt_bench_branch_output: Output[Artifact],\n candidate_model: str,\n\ + \ base_model_dir: str,\n tasks: Input[Dataset],\n taxonomy: Input[Dataset],\n\ \ base_branch: str,\n candidate_branch: str,\n max_workers: str,\n\ - \ device: str,\n merge_system_user_message: bool,\n):\n import\ - \ json\n import os\n\n import torch\n from helpers import (\n \ - \ VLLM_SERVER,\n launch_vllm,\n stop_vllm,\n )\n\ - \ from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n from\ - \ instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score\n\ - \n ######################################################################\n\ + \ device: str,\n model_dtype: str,\n few_shots: int,\n batch_size:\ + \ int,\n merge_system_user_message: bool,\n):\n import json\n import\ + \ os\n\n import torch\n from helpers import (\n VLLM_SERVER,\n\ + \ launch_vllm,\n stop_vllm,\n )\n from instructlab.eval.mmlu\ + \ import MMLU_TASKS, MMLUBranchEvaluator\n from instructlab.eval.mt_bench\ + \ import MTBenchBranchEvaluator\n from instructlab.model.evaluate import\ + \ qa_pairs_to_qna_to_avg_scores, sort_score\n\n ######################################################################\n\ \ # branch_eval_summary_to_json creates a json object from output of\ \ instructlab/eval\n # TODO: Add this to the instructlab/eval or instructlab/instructlab\ \ repository\n def branch_eval_summary_to_json(\n improvements:\ @@ -1036,13 +1052,59 @@ deploymentSpec: \ torch.cuda.get_device_name(torch.cuda.current_device())\n if\ \ gpu_available\n else \"No GPU available\"\n )\n gpu_count\ \ = torch.cuda.device_count() if gpu_available else 0\n\n print(f\"GPU\ - \ Available: {gpu_available}, Using: {gpu_name}\")\n\n # MT_BENCH_BRANCH\n\ - \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\ - \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\ - )\n\n output_dir = \"/tmp/eval_output\"\n\n # TODO: candidate_branch\ - \ must be in same repo, not a fork, or, can compare main branch against\ - \ candidate, base models\n base_branch = base_branch or \"main\"\n \ - \ candidate_branch = candidate_branch or \"main\"\n\n ######################################################################\n\ + \ Available: {gpu_available}, Using: {gpu_name}\")\n\n # MMLU_BRANCH\n\ + \n # find_node_dataset_directories to find sdg output node_datasets_*\n\ + \ def find_node_dataset_directories(base_directory: str):\n import\ + \ os\n import re\n\n # This is specific to ilab/eval output\n\ + \ pattern = r\"node_datasets_\"\n matching_dirs = []\n \ + \ regex = re.compile(pattern)\n\n for root, dirs, files in os.walk(base_directory):\n\ + \ for directory in dirs:\n if regex.search(directory):\n\ + \ matching_dirs.append(os.path.join(root, directory))\n\ + \n return matching_dirs\n\n mmlu_tasks = [\"mmlu_pr\"]\n\n \ + \ node_dataset_dirs = find_node_dataset_directories(tasks.path)\n # This\ + \ assumes generated filesystem from ilab sdg, which\n # generates a node_datasets_\ + \ directory for MMLU custom tasks data\n if node_dataset_dirs:\n \ + \ tasks_dir = node_dataset_dirs[0]\n\n mmlu_branch_evaluators\ + \ = [\n MMLUBranchEvaluator(\n model_path=candidate_model,\n\ + \ tasks_dir=tasks_dir,\n tasks=mmlu_tasks,\n\ + \ few_shots=few_shots,\n batch_size=batch_size,\n\ + \ ),\n MMLUBranchEvaluator(\n model_path=base_model_dir,\n\ + \ tasks_dir=tasks_dir,\n tasks=mmlu_tasks,\n\ + \ few_shots=few_shots,\n batch_size=batch_size,\n\ + \ ),\n ]\n m_paths = [candidate_model, base_model_dir]\n\ + \ overall_scores = []\n individual_scores_list = []\n \ + \ for i, evaluator in enumerate(mmlu_branch_evaluators):\n \ + \ m_path = m_paths[i]\n launch_vllm(m_path, gpu_count)\n \ + \ overall_score, individual_scores = evaluator.run(VLLM_SERVER)\n\ + \ overall_scores.append(overall_score)\n individual_scores_list.append(individual_scores)\n\ + \ stop_vllm()\n\n # TODO: update instructlab/instructlab\ + \ model/evaluate.py\n # so this logic can be imported outside of\ + \ the CLI\n overall_score = overall_scores[0]\n base_overall_score\ + \ = overall_scores[1]\n individual_scores = individual_scores_list[0]\n\ + \ base_individual_scores = individual_scores_list[1]\n\n improvements,\ + \ regressions, no_changes = [], [], []\n for task, score in individual_scores.items():\n\ + \ base_score = base_individual_scores[task]\n s =\ + \ score[\"score\"]\n b_s = base_score[\"score\"]\n \ + \ d = round(s - b_s, 2)\n if s > b_s:\n improvements.append((task,\ + \ d, b_s, s))\n elif b_s > s:\n regressions.append((task,\ + \ d, b_s, s))\n else:\n no_changes.append((task,\ + \ s))\n\n summary = branch_eval_summary_to_json(\n improvements,\n\ + \ regressions,\n no_changes,\n )\n\n \ + \ mmlu_branch_data = {\n \"report_title\": \"KNOWLEDGE EVALUATION\ + \ REPORT\",\n \"max_score\": \"1.0\",\n \"model\"\ + : candidate_model,\n \"model_score\": round(overall_score, 2),\n\ + \ \"base_model\": base_model_dir,\n \"base_model_score\"\ + : round(base_overall_score, 2),\n \"summary\": summary,\n \ + \ }\n\n with open(mmlu_branch_output.path, \"w\") as f:\n \ + \ json.dump(mmlu_branch_data, f, indent=4)\n else:\n print(\"\ + No MMLU tasks directories found, skipping MMLU_branch evaluation.\")\n\n\ + \ # MT_BENCH_BRANCH\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\"\ + , \"\")\n judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint\ + \ = os.getenv(\"JUDGE_ENDPOINT\")\n\n output_dir = \"/tmp/eval_output\"\ + \n\n # TODO: candidate_branch must be in same repo, not a fork, or, can\ + \ compare main branch against candidate, base models\n base_branch =\ + \ base_branch or \"main\"\n candidate_branch = candidate_branch or \"\ + main\"\n\n ######################################################################\n\ \ # TODO: Update ilab/model/evaluate evaluate def logic to allow for\ \ external judge model\n # and when that happens, much of this logic\ \ can be imported from the `evaluate` definition:\n # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\ @@ -1625,18 +1687,23 @@ root: constant: second taskInfo: name: pytorchjob-manifest-op-2 - run-mt-bench-branch-op: + run-final-eval-op: cachingOptions: enableCache: true componentRef: - name: comp-run-mt-bench-branch-op + name: comp-run-final-eval-op dependentTasks: - createpvc - createpvc-3 - git-clone-op - run-mt-bench-op + - sdg-op inputs: artifacts: + tasks: + taskOutputArtifact: + outputArtifactKey: sdg + producerTask: sdg-op taxonomy: taskOutputArtifact: outputArtifactKey: taxonomy @@ -1647,6 +1714,8 @@ root: base_model_dir: runtimeValue: constant: /model/model + batch_size: + componentInputParameter: batch_size candidate_branch: componentInputParameter: repo_branch candidate_model: @@ -1655,12 +1724,16 @@ root: producerTask: run-mt-bench-op device: componentInputParameter: device + few_shots: + componentInputParameter: few_shots max_workers: componentInputParameter: max_workers merge_system_user_message: componentInputParameter: merge_system_user_message + model_dtype: + componentInputParameter: model_dtype taskInfo: - name: run-mt-bench-branch-op + name: run-final-eval-op run-mt-bench-op: cachingOptions: {} componentRef: @@ -1805,7 +1878,7 @@ platforms: taskOutputParameter: outputParameterKey: name producerTask: createpvc-3 - exec-run-mt-bench-branch-op: + exec-run-final-eval-op: configMapAsEnv: - configMapName: kfp-model-server keyToEnv: