Skip to content

Commit

Permalink
eval: add proper selector and permissions
Browse files (browse the repository at this point in the history)
Signed-off-by: Sébastien Han <[email protected]>
  • Loading branch information
leseb committed Oct 10, 2024
1 parent 5c6483d commit b7ff865
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 4 deletions.
2 changes: 1 addition & 1 deletion eval/mt_bench/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def run_mt_bench_op(
VLLM_SERVER = "http://localhost:8000/v1"

def launch_vllm(
- model_path: str, gpu_count: int, retries: int = 120, delay: int = 5
+ model_path: str, gpu_count: int, retries: int = 120, delay: int = 10
):
import subprocess
import sys
Expand Down
2 changes: 1 addition & 1 deletion pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1209,7 +1209,7 @@ deploymentSpec:
\ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\
\n VLLM_SERVER = \"http://localhost:8000/v1\"\n\n def launch_vllm(\n\
\ model_path: str, gpu_count: int, retries: int = 120, delay: int\
- \ = 5\n ):\n import subprocess\n import sys\n import\
+ \ = 10\n ):\n import subprocess\n import sys\n import\
\ time\n\n import requests\n\n if gpu_count > 0:\n \
\ command = [\n sys.executable,\n \"-m\"\
,\n \"vllm.entrypoints.openai.api_server\",\n \
Expand Down
14 changes: 12 additions & 2 deletions standalone/standalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -363,6 +363,12 @@ def upload_s3_file():
PYTHON_EXECUTOR = """
set -e
export XDG_CACHE_HOME=/tmp
export OUTLINES_CACHE_DIR=/tmp
export NUMBA_CACHE_DIR=/tmp
export TRANSFORMERS_CACHE=/tmp
export HF_HOME=/tmp
export HOME=/tmp
export TRITON_CACHE_DIR=/tmp
tmp=$(mktemp -d)
cat <<EOF > "$tmp"/exec.py
Expand Down Expand Up @@ -1268,7 +1274,6 @@ def data_processing(train_args: TrainingArgs) -> None:

container = kubernetes.client.V1Container(
name="sdg-preprocess",
- # image="quay.io/tcoufal/ilab-sdg:latest",
image=RHELAI_IMAGE,
command=["/bin/sh", "-ce"],
args=[
Expand Down Expand Up @@ -1320,6 +1325,7 @@ def create_eval_job(
namespace: str,
job_name: str,
eval_type: str,
nproc_per_node: int = 1,
) -> kubernetes.client.V1Job:
"""
Create a Kubernetes Job object.
Expand Down Expand Up @@ -1384,7 +1390,7 @@ def run_mt_bench_op(
VLLM_SERVER = "http://localhost:8000/v1"
def launch_vllm(
- model_path: str, gpu_count: int, retries: int = 120, delay: int = 5
+ model_path: str, gpu_count: int, retries: int = 120, delay: int = 10
):
import subprocess
import sys
Expand Down Expand Up @@ -1573,6 +1579,10 @@ def stop_vllm():
)
),
],
resources=kubernetes.client.V1ResourceRequirements(
requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node},
limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node},
),
)
]
container = kubernetes.client.V1Container(
Expand Down
11 changes: 11 additions & 0 deletions standalone/standalone.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -348,6 +348,12 @@ spec:
PYTHON_EXECUTOR = """
set -e
export XDG_CACHE_HOME=/tmp
export OUTLINES_CACHE_DIR=/tmp
export NUMBA_CACHE_DIR=/tmp
export TRANSFORMERS_CACHE=/tmp
export HF_HOME=/tmp
export HOME=/tmp
export TRITON_CACHE_DIR=/tmp
tmp=$(mktemp -d)
cat <<EOF > "$tmp"/exec.py
Expand Down Expand Up @@ -1131,6 +1137,7 @@ def create_eval_job(
namespace: str,
job_name: str,
eval_type: str,
nproc_per_node: int = 1,
) -> kubernetes.client.V1Job:
"""
Create a Kubernetes Job object.
Expand Down Expand Up @@ -1199,6 +1206,10 @@ def create_eval_job(
)
),
],
resources=kubernetes.client.V1ResourceRequirements(
requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node},
limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node},
),
)
]
container = kubernetes.client.V1Container(
Expand Down

0 comments on commit b7ff865

Please sign in to comment.