fix: eval, do not use external deps

The pipeline expects to have all the functions at its disposal to run, so helper packages cannot be used. In this case, helpers only work because the current eval is using a custom image. Let's move all the logic inside the component. Signed-off-by: Sébastien Han <[email protected]>
opendatahub-io · Oct 10, 2024 · 2eb8e1c · 2eb8e1c
1 parent 2b9d01e
commit 2eb8e1c
Show file tree

Hide file tree

Showing 5 changed files with 229 additions and 24 deletions.
diff --git a/eval/mt_bench/components.py b/eval/mt_bench/components.py
@@ -24,13 +24,93 @@ def run_mt_bench_op(
     import os
 
     import torch
-    from helpers import (
-        VLLM_SERVER,
-        launch_vllm,
-        stop_vllm,
-    )
     from instructlab.eval.mt_bench import MTBenchEvaluator
 
+    VLLM_SERVER = "http://localhost:8000/v1"
+
+    def launch_vllm(
+        model_path: str, gpu_count: int, retries: int = 120, delay: int = 5
+    ):
+        import subprocess
+        import sys
+        import time
+
+        import requests
+
+        if gpu_count > 0:
+            command = [
+                sys.executable,
+                "-m",
+                "vllm.entrypoints.openai.api_server",
+                "--model",
+                model_path,
+                "--tensor-parallel-size",
+                str(gpu_count),
+            ]
+        else:
+            command = [
+                sys.executable,
+                "-m",
+                "vllm.entrypoints.openai.api_server",
+                "--model",
+                model_path,
+            ]
+
+        subprocess.Popen(args=command)
+
+        print(f"Waiting for vLLM server to start at {VLLM_SERVER}...")
+
+        for attempt in range(retries):
+            try:
+                response = requests.get(f"{VLLM_SERVER}/models")
+                if response.status_code == 200:
+                    print(f"vLLM server is up and running at {VLLM_SERVER}.")
+                    return
+            except requests.ConnectionError:
+                pass
+
+            print(
+                f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})..."
+            )
+            time.sleep(delay)
+
+        raise RuntimeError(
+            f"Failed to start vLLM server at {VLLM_SERVER} after {retries} retries."
+        )
+
+    # This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work
+    # Also, the base image does not include 'pkill' cmd, so can't pkill -f vllm.entrypoints.openai.api_server either
+    def stop_vllm():
+        import psutil
+
+        for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
+            cmdline = process.info.get("cmdline")
+            if cmdline and "vllm.entrypoints.openai.api_server" in cmdline:
+                print(
+                    f"Found vLLM server process with PID: {process.info['pid']}, terminating..."
+                )
+                try:
+                    process.terminate()  # Try graceful termination
+                    process.wait(timeout=5)  # Wait a bit for it to terminate
+                    if process.is_running():
+                        print(
+                            f"Forcefully killing vLLM server process with PID: {process.info['pid']}"
+                        )
+                        process.kill()  # Force kill if it's still running
+                    print(
+                        f"Successfully stopped vLLM server with PID: {process.info['pid']}"
+                    )
+                except psutil.NoSuchProcess:
+                    print(f"Process with PID {process.info['pid']} no longer exists.")
+                except psutil.AccessDenied:
+                    print(
+                        f"Access denied when trying to terminate process with PID {process.info['pid']}."
+                    )
+                except Exception as e:
+                    print(
+                        f"Failed to terminate process with PID {process.info['pid']}. Error: {e}"
+                    )
+
     os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
     gpu_available = torch.cuda.is_available()

diff --git a/pipeline.py b/pipeline.py
@@ -618,6 +618,7 @@ def change_dsl_function_to_normal_function(rendered_code: list):
         "dsl.Input[dsl.Artifact]": "str",
         "dsl.Output[dsl.Dataset]": "str",
         "dsl.Output[dsl.Model]": "str",
+        "Output[Artifact]": "str",
         "import kfp": "",
         "from kfp import dsl": "",
         "from kfp.dsl import *": "",

diff --git a/pipeline.yaml b/pipeline.yaml
@@ -1206,9 +1206,53 @@ deploymentSpec:
           \    max_workers: str,\n    models_list: List[str] = None,\n    models_folder:\
           \ Optional[str] = None,\n    device: str = None,\n) -> NamedTuple(\"outputs\"\
           , best_model=str, best_score=float):\n    import json\n    import os\n\n\
-          \    import torch\n    from helpers import (\n        VLLM_SERVER,\n   \
-          \     launch_vllm,\n        stop_vllm,\n    )\n    from instructlab.eval.mt_bench\
-          \ import MTBenchEvaluator\n\n    os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"\
+          \    import torch\n    from instructlab.eval.mt_bench import MTBenchEvaluator\n\
+          \n    VLLM_SERVER = \"http://localhost:8000/v1\"\n\n    def launch_vllm(\n\
+          \        model_path: str, gpu_count: int, retries: int = 120, delay: int\
+          \ = 5\n    ):\n        import subprocess\n        import sys\n        import\
+          \ time\n\n        import requests\n\n        if gpu_count > 0:\n       \
+          \     command = [\n                sys.executable,\n                \"-m\"\
+          ,\n                \"vllm.entrypoints.openai.api_server\",\n           \
+          \     \"--model\",\n                model_path,\n                \"--tensor-parallel-size\"\
+          ,\n                str(gpu_count),\n            ]\n        else:\n     \
+          \       command = [\n                sys.executable,\n                \"\
+          -m\",\n                \"vllm.entrypoints.openai.api_server\",\n       \
+          \         \"--model\",\n                model_path,\n            ]\n\n \
+          \       subprocess.Popen(args=command)\n\n        print(f\"Waiting for vLLM\
+          \ server to start at {VLLM_SERVER}...\")\n\n        for attempt in range(retries):\n\
+          \            try:\n                response = requests.get(f\"{VLLM_SERVER}/models\"\
+          )\n                if response.status_code == 200:\n                   \
+          \ print(f\"vLLM server is up and running at {VLLM_SERVER}.\")\n        \
+          \            return\n            except requests.ConnectionError:\n    \
+          \            pass\n\n            print(\n                f\"Server not available\
+          \ yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\"\
+          \n            )\n            time.sleep(delay)\n\n        raise RuntimeError(\n\
+          \            f\"Failed to start vLLM server at {VLLM_SERVER} after {retries}\
+          \ retries.\"\n        )\n\n    # This seems like excessive effort to stop\
+          \ the vllm process, but merely saving & killing the pid doesn't work\n \
+          \   # Also, the base image does not include 'pkill' cmd, so can't pkill\
+          \ -f vllm.entrypoints.openai.api_server either\n    def stop_vllm():\n \
+          \       import psutil\n\n        for process in psutil.process_iter(attrs=[\"\
+          pid\", \"name\", \"cmdline\"]):\n            cmdline = process.info.get(\"\
+          cmdline\")\n            if cmdline and \"vllm.entrypoints.openai.api_server\"\
+          \ in cmdline:\n                print(\n                    f\"Found vLLM\
+          \ server process with PID: {process.info['pid']}, terminating...\"\n   \
+          \             )\n                try:\n                    process.terminate()\
+          \  # Try graceful termination\n                    process.wait(timeout=5)\
+          \  # Wait a bit for it to terminate\n                    if process.is_running():\n\
+          \                        print(\n                            f\"Forcefully\
+          \ killing vLLM server process with PID: {process.info['pid']}\"\n      \
+          \                  )\n                        process.kill()  # Force kill\
+          \ if it's still running\n                    print(\n                  \
+          \      f\"Successfully stopped vLLM server with PID: {process.info['pid']}\"\
+          \n                    )\n                except psutil.NoSuchProcess:\n\
+          \                    print(f\"Process with PID {process.info['pid']} no\
+          \ longer exists.\")\n                except psutil.AccessDenied:\n     \
+          \               print(\n                        f\"Access denied when trying\
+          \ to terminate process with PID {process.info['pid']}.\"\n             \
+          \       )\n                except Exception as e:\n                    print(\n\
+          \                        f\"Failed to terminate process with PID {process.info['pid']}.\
+          \ Error: {e}\"\n                    )\n\n    os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"\
           ] = \"expandable_segments:True\"\n\n    gpu_available = torch.cuda.is_available()\n\
           \    gpu_name = (\n        torch.cuda.get_device_name(torch.cuda.current_device())\n\
           \        if gpu_available\n        else \"No GPU available\"\n    )\n  \

diff --git a/standalone/standalone.py b/standalone/standalone.py
@@ -1366,7 +1366,7 @@ def create_eval_job(
 
 def run_mt_bench_op(
     models_path_prefix: str,
-    mt_bench_output: Output[Artifact],
+    mt_bench_output: str,
     merge_system_user_message: bool,
     # generate_answers,judgment uses a magic word for its mt_bench evaluator  - 'auto'
     # with 'auto', number of gpus allocated for serving is calculated based on environment
@@ -1380,13 +1380,93 @@ def run_mt_bench_op(
     import os
 
     import torch
-    from helpers import (
-        VLLM_SERVER,
-        launch_vllm,
-        stop_vllm,
-    )
     from instructlab.eval.mt_bench import MTBenchEvaluator
 
+    VLLM_SERVER = "http://localhost:8000/v1"
+
+    def launch_vllm(
+        model_path: str, gpu_count: int, retries: int = 120, delay: int = 5
+    ):
+        import subprocess
+        import sys
+        import time
+
+        import requests
+
+        if gpu_count > 0:
+            command = [
+                sys.executable,
+                "-m",
+                "vllm.entrypoints.openai.api_server",
+                "--model",
+                model_path,
+                "--tensor-parallel-size",
+                str(gpu_count),
+            ]
+        else:
+            command = [
+                sys.executable,
+                "-m",
+                "vllm.entrypoints.openai.api_server",
+                "--model",
+                model_path,
+            ]
+
+        subprocess.Popen(args=command)
+
+        print(f"Waiting for vLLM server to start at {VLLM_SERVER}...")
+
+        for attempt in range(retries):
+            try:
+                response = requests.get(f"{VLLM_SERVER}/models")
+                if response.status_code == 200:
+                    print(f"vLLM server is up and running at {VLLM_SERVER}.")
+                    return
+            except requests.ConnectionError:
+                pass
+
+            print(
+                f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})..."
+            )
+            time.sleep(delay)
+
+        raise RuntimeError(
+            f"Failed to start vLLM server at {VLLM_SERVER} after {retries} retries."
+        )
+
+    # This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work
+    # Also, the base image does not include 'pkill' cmd, so can't pkill -f vllm.entrypoints.openai.api_server either
+    def stop_vllm():
+        import psutil
+
+        for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
+            cmdline = process.info.get("cmdline")
+            if cmdline and "vllm.entrypoints.openai.api_server" in cmdline:
+                print(
+                    f"Found vLLM server process with PID: {process.info['pid']}, terminating..."
+                )
+                try:
+                    process.terminate()  # Try graceful termination
+                    process.wait(timeout=5)  # Wait a bit for it to terminate
+                    if process.is_running():
+                        print(
+                            f"Forcefully killing vLLM server process with PID: {process.info['pid']}"
+                        )
+                        process.kill()  # Force kill if it's still running
+                    print(
+                        f"Successfully stopped vLLM server with PID: {process.info['pid']}"
+                    )
+                except psutil.NoSuchProcess:
+                    print(f"Process with PID {process.info['pid']} no longer exists.")
+                except psutil.AccessDenied:
+                    print(
+                        f"Access denied when trying to terminate process with PID {process.info['pid']}."
+                    )
+                except Exception as e:
+                    print(
+                        f"Failed to terminate process with PID {process.info['pid']}. Error: {e}"
+                    )
+
     os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
     gpu_available = torch.cuda.is_available()
@@ -1477,7 +1557,7 @@ def run_mt_bench_op(
         init_containers = [
             kubernetes.client.V1Container(
                 name=f"run-eval-{eval_type}",
-                image="quay.io/sallyom/instructlab-ocp:eval-10-8",
+                image="registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2",
                 command=["/bin/sh", "-ce"],
                 args=[
                     PYTHON_EXECUTOR.format(
@@ -1497,7 +1577,7 @@ def run_mt_bench_op(
         ]
         container = kubernetes.client.V1Container(
             name=f"output-eval-{eval_type}-scores",
-            image="quay.io/sallyom/instructlab-ocp:eval-10-8",
+            image="registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2",
             command=["/bin/sh", "-c"],
             args=[f"cat {MT_BENCH_SCORES_PATH}"],
             volume_mounts=get_vol_mount(),
@@ -1900,9 +1980,9 @@ def decode_base64(data):
             name=judge_serving_details_secret, namespace=namespace
         ),
         string_data={
-            "judge_name": judge_serving_model_name,
-            "judge_api_key": judge_serving_model_api_key,
-            "judge_endpoint": judge_serving_endpoint,
+            "JUDGE_NAME": judge_serving_model_name,
+            "JUDGE_API_KEY": judge_serving_model_api_key,
+            "JUDGE_ENDPOINT": judge_serving_endpoint,
         },
     )
 

diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl
@@ -1184,7 +1184,7 @@ def create_eval_job(
         init_containers = [
             kubernetes.client.V1Container(
                 name=f"run-eval-{eval_type}",
-                image="{{exec_run_mt_bench_op_image}}",
+                image="registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2",
                 command=["/bin/sh", "-ce"],
                 args=[
                     PYTHON_EXECUTOR.format(
@@ -1204,7 +1204,7 @@ def create_eval_job(
         ]
         container = kubernetes.client.V1Container(
             name=f"output-eval-{eval_type}-scores",
-            image="{{exec_run_mt_bench_op_image}}",
+            image="registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2",
             command=["/bin/sh", "-c"],
             args=[f"cat {MT_BENCH_SCORES_PATH}"],
             volume_mounts=get_vol_mount(),
@@ -1607,9 +1607,9 @@ def sdg_data_fetch(
             name=judge_serving_details_secret, namespace=namespace
         ),
         string_data={
-            "judge_name": judge_serving_model_name,
-            "judge_api_key": judge_serving_model_api_key,
-            "judge_endpoint": judge_serving_endpoint,
+            "JUDGE_NAME": judge_serving_model_name,
+            "JUDGE_API_KEY": judge_serving_model_api_key,
+            "JUDGE_ENDPOINT": judge_serving_endpoint,
         },
     )