Commit

Add final eval mmlu_branch (#75)

sallyom authored Oct 9, 2024
1 parent 1dae29e commit 79ef52e
Showing 4 changed files with 206 additions and 28 deletions.
4 changes: 2 additions & 2 deletions eval/final/__init__.py
@@ -1,5 +1,5 @@
from .components import run_mt_bench_branch_op
from .components import run_final_eval_op

# from . import faked

__all__ = ["run_mt_bench_branch_op"]
__all__ = ["run_final_eval_op"]
103 changes: 102 additions & 1 deletion eval/final/components.py
@@ -14,15 +14,20 @@
"vllm",
],
)
def run_mt_bench_branch_op(
def run_final_eval_op(
mmlu_branch_output: Output[Artifact],
mt_bench_branch_output: Output[Artifact],
candidate_model: str,
base_model_dir: str,
tasks: Input[Dataset],
taxonomy: Input[Dataset],
base_branch: str,
candidate_branch: str,
max_workers: str,
device: str,
model_dtype: str,
few_shots: int,
batch_size: int,
merge_system_user_message: bool,
):
import json
@@ -34,6 +39,7 @@ def run_mt_bench_branch_op(
launch_vllm,
stop_vllm,
)
from instructlab.eval.mmlu import MMLU_TASKS, MMLUBranchEvaluator
from instructlab.eval.mt_bench import MTBenchBranchEvaluator
from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score

@@ -106,6 +112,101 @@ def branch_eval_summary_to_json(

print(f"GPU Available: {gpu_available}, Using: {gpu_name}")

# MMLU_BRANCH

# find_node_dataset_directories to find sdg output node_datasets_*
def find_node_dataset_directories(base_directory: str):
import os
import re

# This is specific to ilab/eval output
pattern = r"node_datasets_"
matching_dirs = []
regex = re.compile(pattern)

for root, dirs, files in os.walk(base_directory):
for directory in dirs:
if regex.search(directory):
matching_dirs.append(os.path.join(root, directory))

return matching_dirs

mmlu_tasks = ["mmlu_pr"]

node_dataset_dirs = find_node_dataset_directories(tasks.path)
# This assumes generated filesystem from ilab sdg, which
# generates a node_datasets_ directory for MMLU custom tasks data
if node_dataset_dirs:
tasks_dir = node_dataset_dirs[0]

mmlu_branch_evaluators = [
MMLUBranchEvaluator(
model_path=candidate_model,
tasks_dir=tasks_dir,
tasks=mmlu_tasks,
few_shots=few_shots,
batch_size=batch_size,
),
MMLUBranchEvaluator(
model_path=base_model_dir,
tasks_dir=tasks_dir,
tasks=mmlu_tasks,
few_shots=few_shots,
batch_size=batch_size,
),
]
m_paths = [candidate_model, base_model_dir]
overall_scores = []
individual_scores_list = []
for i, evaluator in enumerate(mmlu_branch_evaluators):
m_path = m_paths[i]
launch_vllm(m_path, gpu_count)
overall_score, individual_scores = evaluator.run(VLLM_SERVER)
overall_scores.append(overall_score)
individual_scores_list.append(individual_scores)
stop_vllm()

# TODO: update instructlab/instructlab model/evaluate.py
# so this logic can be imported outside of the CLI
overall_score = overall_scores[0]
base_overall_score = overall_scores[1]
individual_scores = individual_scores_list[0]
base_individual_scores = individual_scores_list[1]

improvements, regressions, no_changes = [], [], []
for task, score in individual_scores.items():
base_score = base_individual_scores[task]
s = score["score"]
b_s = base_score["score"]
d = round(s - b_s, 2)
if s > b_s:
improvements.append((task, d, b_s, s))
elif b_s > s:
regressions.append((task, d, b_s, s))
else:
no_changes.append((task, s))

summary = branch_eval_summary_to_json(
improvements,
regressions,
no_changes,
)

mmlu_branch_data = {
"report_title": "KNOWLEDGE EVALUATION REPORT",
"max_score": "1.0",
"model": candidate_model,
"model_score": round(overall_score, 2),
"base_model": base_model_dir,
"base_model_score": round(base_overall_score, 2),
"summary": summary,
}

with open(mmlu_branch_output.path, "w") as f:
json.dump(mmlu_branch_data, f, indent=4)
else:
print("No MMLU tasks directories found, skipping MMLU_branch evaluation.")

# MT_BENCH_BRANCH

judge_api_key = os.getenv("JUDGE_API_KEY", "")
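The heart of the new MMLU_BRANCH block is the per-task comparison between the candidate and base models. A minimal standalone sketch of that classification, with made-up scores shaped like the evaluator's individual_scores output (the branch_eval_summary_to_json helper defined earlier in components.py would then turn these lists into the JSON summary):

# Hypothetical per-task scores, shaped like evaluator.run(...) output
# (task -> {"score": float}); the values are made up for illustration.
individual_scores = {"mmlu_pr": {"score": 0.71}}
base_individual_scores = {"mmlu_pr": {"score": 0.64}}

improvements, regressions, no_changes = [], [], []
for task, score in individual_scores.items():
    base_score = base_individual_scores[task]
    s = score["score"]
    b_s = base_score["score"]
    d = round(s - b_s, 2)
    if s > b_s:
        improvements.append((task, d, b_s, s))
    elif b_s > s:
        regressions.append((task, d, b_s, s))
    else:
        no_changes.append((task, s))

print(improvements)  # [('mmlu_pr', 0.07, 0.64, 0.71)]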
8 changes: 6 additions & 2 deletions pipeline.py
@@ -65,7 +65,7 @@ def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]):
)

# Imports for evaluation
from eval.final import run_mt_bench_branch_op
from eval.final import run_final_eval_op
from eval.mmlu import load_mmlu_results_op, run_mmlu_op

## from eval.mmlu import run_mmlu_op, load_mmlu_results_op
@@ -313,16 +313,20 @@ def pipeline(

use_secret_as_env(run_mt_bench_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"})

final_eval_task = run_mt_bench_branch_op(
final_eval_task = run_final_eval_op(
candidate_model=run_mt_bench_task.outputs["best_model"],
taxonomy=git_clone_task.outputs["taxonomy"],
tasks=sdg_task.outputs["sdg"],
# TODO: DO we need both candidate_branch and base_branch
base_branch=repo_branch,
candidate_branch=repo_branch,
device=device,
base_model_dir=BASE_MODEL_DIR,
max_workers=max_workers,
merge_system_user_message=merge_system_user_message,
model_dtype=model_dtype,
few_shots=few_shots,
batch_size=batch_size,
)

mount_pvc(
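pipeline.yaml below is the compiled form of pipeline.py, so the new final-eval component and its parameters appear there as well. A minimal sketch of regenerating it with the KFP v2 SDK, assuming (purely for illustration) that pipeline_wrapper returns the decorated @dsl.pipeline function; the repo's real compile entry point may differ:

# Hypothetical regeneration step; names below are assumptions, not the repo's API.
from kfp import compiler

from pipeline import pipeline_wrapper  # assumed import path

# Assumed: calling the wrapper with no mocked stages returns the pipeline
# function into which run_final_eval_op is wired.
pipeline_func = pipeline_wrapper(mock=[])

compiler.Compiler().compile(
    pipeline_func=pipeline_func,
    package_path="pipeline.yaml",
)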
119 changes: 96 additions & 23 deletions pipeline.yaml
@@ -423,10 +423,14 @@ components:
parameterType: STRING
name:
parameterType: STRING
comp-run-mt-bench-branch-op:
executorLabel: exec-run-mt-bench-branch-op
comp-run-final-eval-op:
executorLabel: exec-run-final-eval-op
inputDefinitions:
artifacts:
tasks:
artifactType:
schemaTitle: system.Dataset
schemaVersion: 0.0.1
taxonomy:
artifactType:
schemaTitle: system.Dataset
@@ -436,18 +440,28 @@
parameterType: STRING
base_model_dir:
parameterType: STRING
batch_size:
parameterType: NUMBER_INTEGER
candidate_branch:
parameterType: STRING
candidate_model:
parameterType: STRING
device:
parameterType: STRING
few_shots:
parameterType: NUMBER_INTEGER
max_workers:
parameterType: STRING
merge_system_user_message:
parameterType: BOOLEAN
model_dtype:
parameterType: STRING
outputDefinitions:
artifacts:
mmlu_branch_output:
artifactType:
schemaTitle: system.Artifact
schemaVersion: 0.0.1
mt_bench_branch_output:
artifactType:
schemaTitle: system.Artifact
@@ -966,13 +980,13 @@ deploymentSpec:
\ claimName: {output_pvc_name}\n \"\"\"\n\
\ )\n\n return Outputs(manifest, name)\n\n"
image: registry.access.redhat.com/ubi9/python-311:latest
exec-run-mt-bench-branch-op:
exec-run-final-eval-op:
container:
args:
- --executor_input
- '{{$}}'
- --function_to_execute
- run_mt_bench_branch_op
- run_final_eval_op
command:
- sh
- -c
@@ -993,15 +1007,17 @@
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef run_mt_bench_branch_op(\n mt_bench_branch_output: Output[Artifact],\n\
\ candidate_model: str,\n base_model_dir: str,\n taxonomy: Input[Dataset],\n\
\ *\n\ndef run_final_eval_op(\n mmlu_branch_output: Output[Artifact],\n\
\ mt_bench_branch_output: Output[Artifact],\n candidate_model: str,\n\
\ base_model_dir: str,\n tasks: Input[Dataset],\n taxonomy: Input[Dataset],\n\
\ base_branch: str,\n candidate_branch: str,\n max_workers: str,\n\
\ device: str,\n merge_system_user_message: bool,\n):\n import\
\ json\n import os\n\n import torch\n from helpers import (\n \
\ VLLM_SERVER,\n launch_vllm,\n stop_vllm,\n )\n\
\ from instructlab.eval.mt_bench import MTBenchBranchEvaluator\n from\
\ instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score\n\
\n ######################################################################\n\
\ device: str,\n model_dtype: str,\n few_shots: int,\n batch_size:\
\ int,\n merge_system_user_message: bool,\n):\n import json\n import\
\ os\n\n import torch\n from helpers import (\n VLLM_SERVER,\n\
\ launch_vllm,\n stop_vllm,\n )\n from instructlab.eval.mmlu\
\ import MMLU_TASKS, MMLUBranchEvaluator\n from instructlab.eval.mt_bench\
\ import MTBenchBranchEvaluator\n from instructlab.model.evaluate import\
\ qa_pairs_to_qna_to_avg_scores, sort_score\n\n ######################################################################\n\
\ # branch_eval_summary_to_json creates a json object from output of\
\ instructlab/eval\n # TODO: Add this to the instructlab/eval or instructlab/instructlab\
\ repository\n def branch_eval_summary_to_json(\n improvements:\
@@ -1036,13 +1052,59 @@ deploymentSpec:
\ torch.cuda.get_device_name(torch.cuda.current_device())\n if\
\ gpu_available\n else \"No GPU available\"\n )\n gpu_count\
\ = torch.cuda.device_count() if gpu_available else 0\n\n print(f\"GPU\
\ Available: {gpu_available}, Using: {gpu_name}\")\n\n # MT_BENCH_BRANCH\n\
\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\
\ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
)\n\n output_dir = \"/tmp/eval_output\"\n\n # TODO: candidate_branch\
\ must be in same repo, not a fork, or, can compare main branch against\
\ candidate, base models\n base_branch = base_branch or \"main\"\n \
\ candidate_branch = candidate_branch or \"main\"\n\n ######################################################################\n\
\ Available: {gpu_available}, Using: {gpu_name}\")\n\n # MMLU_BRANCH\n\
\n # find_node_dataset_directories to find sdg output node_datasets_*\n\
\ def find_node_dataset_directories(base_directory: str):\n import\
\ os\n import re\n\n # This is specific to ilab/eval output\n\
\ pattern = r\"node_datasets_\"\n matching_dirs = []\n \
\ regex = re.compile(pattern)\n\n for root, dirs, files in os.walk(base_directory):\n\
\ for directory in dirs:\n if regex.search(directory):\n\
\ matching_dirs.append(os.path.join(root, directory))\n\
\n return matching_dirs\n\n mmlu_tasks = [\"mmlu_pr\"]\n\n \
\ node_dataset_dirs = find_node_dataset_directories(tasks.path)\n # This\
\ assumes generated filesystem from ilab sdg, which\n # generates a node_datasets_\
\ directory for MMLU custom tasks data\n if node_dataset_dirs:\n \
\ tasks_dir = node_dataset_dirs[0]\n\n mmlu_branch_evaluators\
\ = [\n MMLUBranchEvaluator(\n model_path=candidate_model,\n\
\ tasks_dir=tasks_dir,\n tasks=mmlu_tasks,\n\
\ few_shots=few_shots,\n batch_size=batch_size,\n\
\ ),\n MMLUBranchEvaluator(\n model_path=base_model_dir,\n\
\ tasks_dir=tasks_dir,\n tasks=mmlu_tasks,\n\
\ few_shots=few_shots,\n batch_size=batch_size,\n\
\ ),\n ]\n m_paths = [candidate_model, base_model_dir]\n\
\ overall_scores = []\n individual_scores_list = []\n \
\ for i, evaluator in enumerate(mmlu_branch_evaluators):\n \
\ m_path = m_paths[i]\n launch_vllm(m_path, gpu_count)\n \
\ overall_score, individual_scores = evaluator.run(VLLM_SERVER)\n\
\ overall_scores.append(overall_score)\n individual_scores_list.append(individual_scores)\n\
\ stop_vllm()\n\n # TODO: update instructlab/instructlab\
\ model/evaluate.py\n # so this logic can be imported outside of\
\ the CLI\n overall_score = overall_scores[0]\n base_overall_score\
\ = overall_scores[1]\n individual_scores = individual_scores_list[0]\n\
\ base_individual_scores = individual_scores_list[1]\n\n improvements,\
\ regressions, no_changes = [], [], []\n for task, score in individual_scores.items():\n\
\ base_score = base_individual_scores[task]\n s =\
\ score[\"score\"]\n b_s = base_score[\"score\"]\n \
\ d = round(s - b_s, 2)\n if s > b_s:\n improvements.append((task,\
\ d, b_s, s))\n elif b_s > s:\n regressions.append((task,\
\ d, b_s, s))\n else:\n no_changes.append((task,\
\ s))\n\n summary = branch_eval_summary_to_json(\n improvements,\n\
\ regressions,\n no_changes,\n )\n\n \
\ mmlu_branch_data = {\n \"report_title\": \"KNOWLEDGE EVALUATION\
\ REPORT\",\n \"max_score\": \"1.0\",\n \"model\"\
: candidate_model,\n \"model_score\": round(overall_score, 2),\n\
\ \"base_model\": base_model_dir,\n \"base_model_score\"\
: round(base_overall_score, 2),\n \"summary\": summary,\n \
\ }\n\n with open(mmlu_branch_output.path, \"w\") as f:\n \
\ json.dump(mmlu_branch_data, f, indent=4)\n else:\n print(\"\
No MMLU tasks directories found, skipping MMLU_branch evaluation.\")\n\n\
\ # MT_BENCH_BRANCH\n\n judge_api_key = os.getenv(\"JUDGE_API_KEY\"\
, \"\")\n judge_model_name = os.getenv(\"JUDGE_NAME\")\n judge_endpoint\
\ = os.getenv(\"JUDGE_ENDPOINT\")\n\n output_dir = \"/tmp/eval_output\"\
\n\n # TODO: candidate_branch must be in same repo, not a fork, or, can\
\ compare main branch against candidate, base models\n base_branch =\
\ base_branch or \"main\"\n candidate_branch = candidate_branch or \"\
main\"\n\n ######################################################################\n\
\ # TODO: Update ilab/model/evaluate evaluate def logic to allow for\
\ external judge model\n # and when that happens, much of this logic\
\ can be imported from the `evaluate` definition:\n # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\
@@ -1625,18 +1687,23 @@ root:
constant: second
taskInfo:
name: pytorchjob-manifest-op-2
run-mt-bench-branch-op:
run-final-eval-op:
cachingOptions:
enableCache: true
componentRef:
name: comp-run-mt-bench-branch-op
name: comp-run-final-eval-op
dependentTasks:
- createpvc
- createpvc-3
- git-clone-op
- run-mt-bench-op
- sdg-op
inputs:
artifacts:
tasks:
taskOutputArtifact:
outputArtifactKey: sdg
producerTask: sdg-op
taxonomy:
taskOutputArtifact:
outputArtifactKey: taxonomy
@@ -1647,6 +1714,8 @@ root:
base_model_dir:
runtimeValue:
constant: /model/model
batch_size:
componentInputParameter: batch_size
candidate_branch:
componentInputParameter: repo_branch
candidate_model:
@@ -1655,12 +1724,16 @@
producerTask: run-mt-bench-op
device:
componentInputParameter: device
few_shots:
componentInputParameter: few_shots
max_workers:
componentInputParameter: max_workers
merge_system_user_message:
componentInputParameter: merge_system_user_message
model_dtype:
componentInputParameter: model_dtype
taskInfo:
name: run-mt-bench-branch-op
name: run-final-eval-op
run-mt-bench-op:
cachingOptions: {}
componentRef:
@@ -1805,7 +1878,7 @@ platforms:
taskOutputParameter:
outputParameterKey: name
producerTask: createpvc-3
exec-run-mt-bench-branch-op:
exec-run-final-eval-op:
configMapAsEnv:
- configMapName: kfp-model-server
keyToEnv:
