From 2eb8e1c9074510d7accd8c508bd597982da50267 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?=
Date: Thu, 10 Oct 2024 12:02:14 +0200
Subject: [PATCH] fix: eval, do not use external deps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The pipeline expects to have all the functions at its disposal to run,
so helper packages cannot be used. In this case, the helpers only work
because the current eval is using a custom image. Let's move all the
logic inside the component.

Signed-off-by: Sébastien Han
---
 eval/mt_bench/components.py |  90 +++++++++++++++++++++++++++++--
 pipeline.py                 |   1 +
 pipeline.yaml               |  50 ++++++++++++++++--
 standalone/standalone.py    | 102 ++++++++++++++++++++++++++++++++----
 standalone/standalone.tpl   |  10 ++--
 5 files changed, 229 insertions(+), 24 deletions(-)

diff --git a/eval/mt_bench/components.py b/eval/mt_bench/components.py
index 17beffdf..8f853f9a 100644
--- a/eval/mt_bench/components.py
+++ b/eval/mt_bench/components.py
@@ -24,13 +24,93 @@ def run_mt_bench_op(
     import os
 
     import torch
-    from helpers import (
-        VLLM_SERVER,
-        launch_vllm,
-        stop_vllm,
-    )
     from instructlab.eval.mt_bench import MTBenchEvaluator
 
+    VLLM_SERVER = "http://localhost:8000/v1"
+
+    def launch_vllm(
+        model_path: str, gpu_count: int, retries: int = 120, delay: int = 5
+    ):
+        import subprocess
+        import sys
+        import time
+
+        import requests
+
+        if gpu_count > 0:
+            command = [
+                sys.executable,
+                "-m",
+                "vllm.entrypoints.openai.api_server",
+                "--model",
+                model_path,
+                "--tensor-parallel-size",
+                str(gpu_count),
+            ]
+        else:
+            command = [
+                sys.executable,
+                "-m",
+                "vllm.entrypoints.openai.api_server",
+                "--model",
+                model_path,
+            ]
+
+        subprocess.Popen(args=command)
+
+        print(f"Waiting for vLLM server to start at {VLLM_SERVER}...")
+
+        for attempt in range(retries):
+            try:
+                response = requests.get(f"{VLLM_SERVER}/models")
+                if response.status_code == 200:
+                    print(f"vLLM server is up and running at {VLLM_SERVER}.")
+                    return
+            except requests.ConnectionError:
+                pass
+
+            print(
+                f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})..."
+            )
+            time.sleep(delay)
+
+        raise RuntimeError(
+            f"Failed to start vLLM server at {VLLM_SERVER} after {retries} retries."
+        )
+
+    # This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work
+    # Also, the base image does not include 'pkill' cmd, so can't pkill -f vllm.entrypoints.openai.api_server either
+    def stop_vllm():
+        import psutil
+
+        for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]):
+            cmdline = process.info.get("cmdline")
+            if cmdline and "vllm.entrypoints.openai.api_server" in cmdline:
+                print(
+                    f"Found vLLM server process with PID: {process.info['pid']}, terminating..."
+                )
+                try:
+                    process.terminate()  # Try graceful termination
+                    process.wait(timeout=5)  # Wait a bit for it to terminate
+                    if process.is_running():
+                        print(
+                            f"Forcefully killing vLLM server process with PID: {process.info['pid']}"
+                        )
+                        process.kill()  # Force kill if it's still running
+                    print(
+                        f"Successfully stopped vLLM server with PID: {process.info['pid']}"
+                    )
+                except psutil.NoSuchProcess:
+                    print(f"Process with PID {process.info['pid']} no longer exists.")
+                except psutil.AccessDenied:
+                    print(
+                        f"Access denied when trying to terminate process with PID {process.info['pid']}."
+                    )
+                except Exception as e:
+                    print(
+                        f"Failed to terminate process with PID {process.info['pid']}. Error: {e}"
Error: {e}" + ) + os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" gpu_available = torch.cuda.is_available() diff --git a/pipeline.py b/pipeline.py index 9b2b35d7..cfbfe585 100644 --- a/pipeline.py +++ b/pipeline.py @@ -618,6 +618,7 @@ def change_dsl_function_to_normal_function(rendered_code: list): "dsl.Input[dsl.Artifact]": "str", "dsl.Output[dsl.Dataset]": "str", "dsl.Output[dsl.Model]": "str", + "Output[Artifact]": "str", "import kfp": "", "from kfp import dsl": "", "from kfp.dsl import *": "", diff --git a/pipeline.yaml b/pipeline.yaml index 50334dba..e074bfb0 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -1206,9 +1206,53 @@ deploymentSpec: \ max_workers: str,\n models_list: List[str] = None,\n models_folder:\ \ Optional[str] = None,\n device: str = None,\n) -> NamedTuple(\"outputs\"\ , best_model=str, best_score=float):\n import json\n import os\n\n\ - \ import torch\n from helpers import (\n VLLM_SERVER,\n \ - \ launch_vllm,\n stop_vllm,\n )\n from instructlab.eval.mt_bench\ - \ import MTBenchEvaluator\n\n os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"\ + \ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\ + \n VLLM_SERVER = \"http://localhost:8000/v1\"\n\n def launch_vllm(\n\ + \ model_path: str, gpu_count: int, retries: int = 120, delay: int\ + \ = 5\n ):\n import subprocess\n import sys\n import\ + \ time\n\n import requests\n\n if gpu_count > 0:\n \ + \ command = [\n sys.executable,\n \"-m\"\ + ,\n \"vllm.entrypoints.openai.api_server\",\n \ + \ \"--model\",\n model_path,\n \"--tensor-parallel-size\"\ + ,\n str(gpu_count),\n ]\n else:\n \ + \ command = [\n sys.executable,\n \"\ + -m\",\n \"vllm.entrypoints.openai.api_server\",\n \ + \ \"--model\",\n model_path,\n ]\n\n \ + \ subprocess.Popen(args=command)\n\n print(f\"Waiting for vLLM\ + \ server to start at {VLLM_SERVER}...\")\n\n for attempt in range(retries):\n\ + \ try:\n response = requests.get(f\"{VLLM_SERVER}/models\"\ + )\n if response.status_code == 200:\n \ + \ print(f\"vLLM server is up and running at {VLLM_SERVER}.\")\n \ + \ return\n except requests.ConnectionError:\n \ + \ pass\n\n print(\n f\"Server not available\ + \ yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\"\ + \n )\n time.sleep(delay)\n\n raise RuntimeError(\n\ + \ f\"Failed to start vLLM server at {VLLM_SERVER} after {retries}\ + \ retries.\"\n )\n\n # This seems like excessive effort to stop\ + \ the vllm process, but merely saving & killing the pid doesn't work\n \ + \ # Also, the base image does not include 'pkill' cmd, so can't pkill\ + \ -f vllm.entrypoints.openai.api_server either\n def stop_vllm():\n \ + \ import psutil\n\n for process in psutil.process_iter(attrs=[\"\ + pid\", \"name\", \"cmdline\"]):\n cmdline = process.info.get(\"\ + cmdline\")\n if cmdline and \"vllm.entrypoints.openai.api_server\"\ + \ in cmdline:\n print(\n f\"Found vLLM\ + \ server process with PID: {process.info['pid']}, terminating...\"\n \ + \ )\n try:\n process.terminate()\ + \ # Try graceful termination\n process.wait(timeout=5)\ + \ # Wait a bit for it to terminate\n if process.is_running():\n\ + \ print(\n f\"Forcefully\ + \ killing vLLM server process with PID: {process.info['pid']}\"\n \ + \ )\n process.kill() # Force kill\ + \ if it's still running\n print(\n \ + \ f\"Successfully stopped vLLM server with PID: {process.info['pid']}\"\ + \n )\n except psutil.NoSuchProcess:\n\ + \ print(f\"Process with PID {process.info['pid']} no\ + \ longer exists.\")\n except psutil.AccessDenied:\n \ + \ print(\n 
f\"Access denied when trying\ + \ to terminate process with PID {process.info['pid']}.\"\n \ + \ )\n except Exception as e:\n print(\n\ + \ f\"Failed to terminate process with PID {process.info['pid']}.\ + \ Error: {e}\"\n )\n\n os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"\ ] = \"expandable_segments:True\"\n\n gpu_available = torch.cuda.is_available()\n\ \ gpu_name = (\n torch.cuda.get_device_name(torch.cuda.current_device())\n\ \ if gpu_available\n else \"No GPU available\"\n )\n \ diff --git a/standalone/standalone.py b/standalone/standalone.py index def72adc..4175c847 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -1366,7 +1366,7 @@ def create_eval_job( def run_mt_bench_op( models_path_prefix: str, - mt_bench_output: Output[Artifact], + mt_bench_output: str, merge_system_user_message: bool, # generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto' # with 'auto', number of gpus allocated for serving is calculated based on environment @@ -1380,13 +1380,93 @@ def run_mt_bench_op( import os import torch - from helpers import ( - VLLM_SERVER, - launch_vllm, - stop_vllm, - ) from instructlab.eval.mt_bench import MTBenchEvaluator + VLLM_SERVER = "http://localhost:8000/v1" + + def launch_vllm( + model_path: str, gpu_count: int, retries: int = 120, delay: int = 5 + ): + import subprocess + import sys + import time + + import requests + + if gpu_count > 0: + command = [ + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + model_path, + "--tensor-parallel-size", + str(gpu_count), + ] + else: + command = [ + sys.executable, + "-m", + "vllm.entrypoints.openai.api_server", + "--model", + model_path, + ] + + subprocess.Popen(args=command) + + print(f"Waiting for vLLM server to start at {VLLM_SERVER}...") + + for attempt in range(retries): + try: + response = requests.get(f"{VLLM_SERVER}/models") + if response.status_code == 200: + print(f"vLLM server is up and running at {VLLM_SERVER}.") + return + except requests.ConnectionError: + pass + + print( + f"Server not available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})..." + ) + time.sleep(delay) + + raise RuntimeError( + f"Failed to start vLLM server at {VLLM_SERVER} after {retries} retries." + ) + + # This seems like excessive effort to stop the vllm process, but merely saving & killing the pid doesn't work + # Also, the base image does not include 'pkill' cmd, so can't pkill -f vllm.entrypoints.openai.api_server either + def stop_vllm(): + import psutil + + for process in psutil.process_iter(attrs=["pid", "name", "cmdline"]): + cmdline = process.info.get("cmdline") + if cmdline and "vllm.entrypoints.openai.api_server" in cmdline: + print( + f"Found vLLM server process with PID: {process.info['pid']}, terminating..." + ) + try: + process.terminate() # Try graceful termination + process.wait(timeout=5) # Wait a bit for it to terminate + if process.is_running(): + print( + f"Forcefully killing vLLM server process with PID: {process.info['pid']}" + ) + process.kill() # Force kill if it's still running + print( + f"Successfully stopped vLLM server with PID: {process.info['pid']}" + ) + except psutil.NoSuchProcess: + print(f"Process with PID {process.info['pid']} no longer exists.") + except psutil.AccessDenied: + print( + f"Access denied when trying to terminate process with PID {process.info['pid']}." + ) + except Exception as e: + print( + f"Failed to terminate process with PID {process.info['pid']}. 
Error: {e}" + ) + os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" gpu_available = torch.cuda.is_available() @@ -1477,7 +1557,7 @@ def run_mt_bench_op( init_containers = [ kubernetes.client.V1Container( name=f"run-eval-{eval_type}", - image="quay.io/sallyom/instructlab-ocp:eval-10-8", + image="registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2", command=["/bin/sh", "-ce"], args=[ PYTHON_EXECUTOR.format( @@ -1497,7 +1577,7 @@ def run_mt_bench_op( ] container = kubernetes.client.V1Container( name=f"output-eval-{eval_type}-scores", - image="quay.io/sallyom/instructlab-ocp:eval-10-8", + image="registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2", command=["/bin/sh", "-c"], args=[f"cat {MT_BENCH_SCORES_PATH}"], volume_mounts=get_vol_mount(), @@ -1900,9 +1980,9 @@ def decode_base64(data): name=judge_serving_details_secret, namespace=namespace ), string_data={ - "judge_name": judge_serving_model_name, - "judge_api_key": judge_serving_model_api_key, - "judge_endpoint": judge_serving_endpoint, + "JUDGE_NAME": judge_serving_model_name, + "JUDGE_API_KEY": judge_serving_model_api_key, + "JUDGE_ENDPOINT": judge_serving_endpoint, }, ) diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl index 1163852b..0dd6ac9f 100755 --- a/standalone/standalone.tpl +++ b/standalone/standalone.tpl @@ -1184,7 +1184,7 @@ def create_eval_job( init_containers = [ kubernetes.client.V1Container( name=f"run-eval-{eval_type}", - image="{{exec_run_mt_bench_op_image}}", + image="registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2", command=["/bin/sh", "-ce"], args=[ PYTHON_EXECUTOR.format( @@ -1204,7 +1204,7 @@ def create_eval_job( ] container = kubernetes.client.V1Container( name=f"output-eval-{eval_type}-scores", - image="{{exec_run_mt_bench_op_image}}", + image="registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2", command=["/bin/sh", "-c"], args=[f"cat {MT_BENCH_SCORES_PATH}"], volume_mounts=get_vol_mount(), @@ -1607,9 +1607,9 @@ def sdg_data_fetch( name=judge_serving_details_secret, namespace=namespace ), string_data={ - "judge_name": judge_serving_model_name, - "judge_api_key": judge_serving_model_api_key, - "judge_endpoint": judge_serving_endpoint, + "JUDGE_NAME": judge_serving_model_name, + "JUDGE_API_KEY": judge_serving_model_api_key, + "JUDGE_ENDPOINT": judge_serving_endpoint, }, )