Skip to content

Commit

Permalink
Merge pull request #76 from leseb/sdg-to-s3
Browse files Browse the repository at this point in the history
feat: remove dependency on KFP lib + eval serving endpoint support + ci
  • Loading branch information
MichaelClifford authored Oct 10, 2024
2 parents 79ef52e + 8a3cef9 commit 5c6483d
Show file tree
Hide file tree
Showing 12 changed files with 1,547 additions and 750 deletions.
6 changes: 6 additions & 0 deletions .github/workflows/pre_commit.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,3 +34,9 @@ jobs:
- name: Run pre-commit
run: |
pre-commit run --all-files
- name: Test if pipeline is up-to-date
run: |
pip install click kfp==2.9.0 kfp.kubernetes
make pipeline
git diff --exit-code || (echo "Pipeline is not up-to-date. Please run 'make pipeline' and commit the changes." && exit 1)
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
.PHONY: standalone
.PHONY: standalone pipeline

standalone:
python3 pipeline.py gen-standalone
ruff format standalone/standalone.py

pipeline:
python3 pipeline.py
13 changes: 9 additions & 4 deletions eval/final/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
def run_final_eval_op(
mmlu_branch_output: Output[Artifact],
mt_bench_branch_output: Output[Artifact],
candidate_model: str,
base_model_dir: str,
tasks: Input[Dataset],
taxonomy: Input[Dataset],
Expand All @@ -29,6 +28,7 @@ def run_final_eval_op(
few_shots: int,
batch_size: int,
merge_system_user_message: bool,
candidate_model: str = None,
):
import json
import os
Expand All @@ -43,6 +43,11 @@ def run_final_eval_op(
from instructlab.eval.mt_bench import MTBenchBranchEvaluator
from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score

# For standalone mode
if candidate_model is None:
# logic to get the best model from the models folder and results
pass

######################################################################
# branch_eval_summary_to_json creates a json object from output of instructlab/eval
# TODO: Add this to the instructlab/eval or instructlab/instructlab repository
Expand Down Expand Up @@ -221,7 +226,7 @@ def find_node_dataset_directories(base_directory: str):

######################################################################
# TODO: Update ilab/model/evaluate evaluate def logic to allow for external judge model
# and when that happens, much of this logic can be imported from the `evaluate` definition:
# and when that happens, much of this logic can be imported from the 'evaluate' definition:
# https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504
#
# With instructlab, model_name is synonymous with model_path
Expand All @@ -244,8 +249,8 @@ def find_node_dataset_directories(base_directory: str):
),
]

# ilab/evaluate uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# ilab/evaluate uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
if max_workers == "auto":
try:
Expand Down
98 changes: 89 additions & 9 deletions eval/mt_bench/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def run_mt_bench_op(
models_path_prefix: str,
mt_bench_output: Output[Artifact],
merge_system_user_message: bool,
# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
max_workers: str,
models_list: List[str] = None,
Expand All @@ -24,13 +24,93 @@ def run_mt_bench_op(
import os

import torch
from helpers import (
VLLM_SERVER,
launch_vllm,
stop_vllm,
)
from instructlab.eval.mt_bench import MTBenchEvaluator

VLLM_SERVER = "http://localhost:8000/v1"

def launch_vllm(
    model_path: str, gpu_count: int, retries: int = 120, delay: int = 5
):
    """Start a vLLM OpenAI-compatible API server for *model_path* and block
    until it answers health checks, or raise after *retries* attempts.

    Args:
        model_path: Filesystem path of the model to serve.
        gpu_count: Tensor-parallel size; values <= 0 omit the flag and let
            vLLM pick its default device placement.
        retries: Number of health-check attempts before giving up.
        delay: Seconds to sleep between attempts.

    Raises:
        RuntimeError: If the server never becomes reachable at VLLM_SERVER;
            the spawned process is terminated first so it is not leaked.
    """
    import subprocess
    import sys
    import time

    import requests

    command = [
        sys.executable,
        "-m",
        "vllm.entrypoints.openai.api_server",
        "--model",
        model_path,
    ]
    if gpu_count > 0:
        command += ["--tensor-parallel-size", str(gpu_count)]

    # Keep the handle so the server can be cleaned up if startup fails;
    # the original discarded it, leaking the process on the failure path.
    process = subprocess.Popen(args=command)

    print(f"Waiting for vLLM server to start at {VLLM_SERVER}...")

    for attempt in range(retries):
        try:
            # A timeout stops one unresponsive probe from hanging forever,
            # and RequestException also covers Timeout/HTTPError, which the
            # original's ConnectionError-only handler would let escape.
            response = requests.get(f"{VLLM_SERVER}/models", timeout=10)
            if response.status_code == 200:
                print(f"vLLM server is up and running at {VLLM_SERVER}.")
                return
        except requests.RequestException:
            pass

        print(
            f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})..."
        )
        time.sleep(delay)

    # Startup failed: do not leave an orphaned server process behind.
    process.terminate()
    raise RuntimeError(
        f"Failed to start vLLM server at {VLLM_SERVER} after {retries} retries."
    )

# This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work
# Also, the base image does not include 'pkill' cmd, so can't pkill -f vllm.entrypoints.openai.api_server either
def stop_vllm():
    """Terminate any running vLLM OpenAI API server processes.

    Scans the process table for a command line referencing the vLLM server
    module, asks each match to exit gracefully, and force-kills it if it is
    still alive after a short grace period. All failures are logged rather
    than raised so cleanup never aborts the caller.
    """
    import psutil

    for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
        cmdline = process.info.get("cmdline")
        # cmdline is an argv list; the module path is its own element
        # because the server is launched via `python -m`, so list
        # membership (not substring search) is the right test here.
        if cmdline and "vllm.entrypoints.openai.api_server" in cmdline:
            print(
                f"Found vLLM server process with PID: {process.info['pid']}, terminating..."
            )
            try:
                process.terminate()  # Try graceful termination
                process.wait(timeout=5)  # Wait a bit for it to terminate
                if process.is_running():
                    print(
                        f"Forcefully killing vLLM server process with PID: {process.info['pid']}"
                    )
                    process.kill()  # Force kill if it's still running
                print(
                    f"Successfully stopped vLLM server with PID: {process.info['pid']}"
                )
            except psutil.NoSuchProcess:
                # Process exited on its own between discovery and terminate().
                print(f"Process with PID {process.info['pid']} no longer exists.")
            except psutil.AccessDenied:
                print(
                    f"Access denied when trying to terminate process with PID {process.info['pid']}."
                )
            except Exception as e:
                # Last-resort catch-all: cleanup must not propagate errors.
                print(
                    f"Failed to terminate process with PID {process.info['pid']}. Error: {e}"
                )

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

gpu_available = torch.cuda.is_available()
Expand All @@ -53,8 +133,8 @@ def run_mt_bench_op(
scores = {}
all_mt_bench_data = []

# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
if max_workers == "auto":
try:
Expand Down
Loading

0 comments on commit 5c6483d

Please sign in to comment.