diff --git a/eval/final/components.py b/eval/final/components.py index 9cb710d9..a6d99459 100644 --- a/eval/final/components.py +++ b/eval/final/components.py @@ -39,6 +39,26 @@ def run_final_eval_op( from instructlab.eval.mt_bench import MTBenchBranchEvaluator from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score + if judge_ca_cert := os.getenv("JUDGE_CA_CERT_PATH"): + import httpx + import openai + + # Create a custom HTTP client + class CustomHttpClient(httpx.Client): + def __init__(self, *args, **kwargs): + # Use the custom CA certificate + kwargs.setdefault("verify", judge_ca_cert) + super().__init__(*args, **kwargs) + + # Create a new OpenAI class that uses the custom HTTP client + class CustomOpenAI(openai.OpenAI): + def __init__(self, *args, **kwargs): + custom_client = CustomHttpClient() + super().__init__(http_client=custom_client, *args, **kwargs) + + # Monkey patch the OpenAI class in the openai module, so that the eval lib can use it + openai.OpenAI = CustomOpenAI + print("Starting Final Eval...") def launch_vllm( diff --git a/eval/mt_bench/components.py b/eval/mt_bench/components.py index d67ab9d5..96c94c31 100644 --- a/eval/mt_bench/components.py +++ b/eval/mt_bench/components.py @@ -28,6 +28,26 @@ def run_mt_bench_op( import torch from instructlab.eval.mt_bench import MTBenchEvaluator + if judge_ca_cert := os.getenv("JUDGE_CA_CERT_PATH"): + import httpx + import openai + + # Create a custom HTTP client + class CustomHttpClient(httpx.Client): + def __init__(self, *args, **kwargs): + # Use the custom CA certificate + kwargs.setdefault("verify", judge_ca_cert) + super().__init__(*args, **kwargs) + + # Create a new OpenAI class that uses the custom HTTP client + class CustomOpenAI(openai.OpenAI): + def __init__(self, *args, **kwargs): + custom_client = CustomHttpClient() + super().__init__(http_client=custom_client, *args, **kwargs) + + # Monkey patch the OpenAI class in the openai module, so that the eval lib can use it + openai.OpenAI = CustomOpenAI + def launch_vllm( model_path: str, gpu_count: int, retries: int = 120, delay: int = 10 ) -> tuple: diff --git a/pipeline.yaml b/pipeline.yaml index a92c8b33..e4690797 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -1191,15 +1191,26 @@ deploymentSpec: \ os\n import subprocess\n\n import torch\n from instructlab.eval.mmlu\ \ import MMLU_TASKS, MMLUBranchEvaluator\n from instructlab.eval.mt_bench\ \ import MTBenchBranchEvaluator\n from instructlab.model.evaluate import\ - \ qa_pairs_to_qna_to_avg_scores, sort_score\n\n print(\"Starting Final\ - \ Eval...\")\n\n def launch_vllm(\n model_path: str, gpu_count:\ - \ int, retries: int = 120, delay: int = 10\n ) -> tuple:\n import\ - \ subprocess\n import sys\n import time\n\n import\ - \ requests\n from instructlab.model.backends.common import free_tcp_ipv4_port\n\ - \n free_port = free_tcp_ipv4_port(\"127.0.0.1\")\n port =\ - \ str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\n\ - \n command = [\n sys.executable,\n \"-m\",\n\ - \ \"vllm.entrypoints.openai.api_server\",\n \"--port\"\ + \ qa_pairs_to_qna_to_avg_scores, sort_score\n\n if judge_ca_cert := os.getenv(\"\ + JUDGE_CA_CERT_PATH\"):\n import httpx\n import openai\n\n\ + \ # Create a custom HTTP client\n class CustomHttpClient(httpx.Client):\n\ + \ def __init__(self, *args, **kwargs):\n # Use\ + \ the custom CA certificate\n kwargs.setdefault(\"verify\"\ + , judge_ca_cert)\n super().__init__(*args, **kwargs)\n\n\ + \ # Create a new OpenAI class that uses the custom HTTP client\n\ + \ 
class CustomOpenAI(openai.OpenAI):\n def __init__(self,\ + \ *args, **kwargs):\n custom_client = CustomHttpClient()\n\ + \ super().__init__(http_client=custom_client, *args, **kwargs)\n\ + \n # Monkey patch the OpenAI class in the openai module, so that\ + \ the eval lib can use it\n openai.OpenAI = CustomOpenAI\n\n print(\"\ + Starting Final Eval...\")\n\n def launch_vllm(\n model_path: str,\ + \ gpu_count: int, retries: int = 120, delay: int = 10\n ) -> tuple:\n\ + \ import subprocess\n import sys\n import time\n\n\ + \ import requests\n from instructlab.model.backends.common\ + \ import free_tcp_ipv4_port\n\n free_port = free_tcp_ipv4_port(\"\ + 127.0.0.1\")\n port = str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\ + \n\n command = [\n sys.executable,\n \"-m\"\ + ,\n \"vllm.entrypoints.openai.api_server\",\n \"--port\"\ ,\n port,\n \"--model\",\n model_path,\n\ \ ]\n if gpu_count > 0:\n command += [\n \ \ \"--tensor-parallel-size\",\n str(gpu_count),\n\ @@ -1446,26 +1457,38 @@ deploymentSpec: \ Optional[str] = None,\n device: str = None,\n best_score_file: Optional[str]\ \ = None,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n\ \ import json\n import os\n import subprocess\n\n import torch\n\ - \ from instructlab.eval.mt_bench import MTBenchEvaluator\n\n def launch_vllm(\n\ - \ model_path: str, gpu_count: int, retries: int = 120, delay: int\ - \ = 10\n ) -> tuple:\n import subprocess\n import sys\n\ - \ import time\n\n import requests\n from instructlab.model.backends.common\ - \ import free_tcp_ipv4_port\n\n free_port = free_tcp_ipv4_port(\"\ - 127.0.0.1\")\n port = str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\ - \n\n command = [\n sys.executable,\n \"-m\"\ - ,\n \"vllm.entrypoints.openai.api_server\",\n \"--port\"\ - ,\n port,\n \"--model\",\n model_path,\n\ - \ ]\n if gpu_count > 0:\n command += [\n \ - \ \"--tensor-parallel-size\",\n str(gpu_count),\n\ - \ ]\n\n process = subprocess.Popen(args=command)\n\n \ - \ print(f\"Waiting for vLLM server to start at {vllm_server}...\"\ - )\n\n for attempt in range(retries):\n try:\n \ - \ response = requests.get(f\"{vllm_server}/models\")\n \ - \ if response.status_code == 200:\n print(f\"vLLM\ - \ server is up and running at {vllm_server}.\")\n return\ - \ process, vllm_server\n except requests.ConnectionError:\n \ - \ pass\n\n print(\n f\"Server not\ - \ available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\"\ + \ from instructlab.eval.mt_bench import MTBenchEvaluator\n\n if judge_ca_cert\ + \ := os.getenv(\"JUDGE_CA_CERT_PATH\"):\n import httpx\n import\ + \ openai\n\n # Create a custom HTTP client\n class CustomHttpClient(httpx.Client):\n\ + \ def __init__(self, *args, **kwargs):\n # Use\ + \ the custom CA certificate\n kwargs.setdefault(\"verify\"\ + , judge_ca_cert)\n super().__init__(*args, **kwargs)\n\n\ + \ # Create a new OpenAI class that uses the custom HTTP client\n\ + \ class CustomOpenAI(openai.OpenAI):\n def __init__(self,\ + \ *args, **kwargs):\n custom_client = CustomHttpClient()\n\ + \ super().__init__(http_client=custom_client, *args, **kwargs)\n\ + \n # Monkey patch the OpenAI class in the openai module, so that\ + \ the eval lib can use it\n openai.OpenAI = CustomOpenAI\n\n def\ + \ launch_vllm(\n model_path: str, gpu_count: int, retries: int =\ + \ 120, delay: int = 10\n ) -> tuple:\n import subprocess\n \ + \ import sys\n import time\n\n import requests\n \ + \ from instructlab.model.backends.common import free_tcp_ipv4_port\n\n\ + \ 
free_port = free_tcp_ipv4_port(\"127.0.0.1\")\n port = str(free_port)\n\ + \ vllm_server = f\"http://127.0.0.1:{port}/v1\"\n\n command\ + \ = [\n sys.executable,\n \"-m\",\n \"\ + vllm.entrypoints.openai.api_server\",\n \"--port\",\n \ + \ port,\n \"--model\",\n model_path,\n \ + \ ]\n if gpu_count > 0:\n command += [\n \ + \ \"--tensor-parallel-size\",\n str(gpu_count),\n \ + \ ]\n\n process = subprocess.Popen(args=command)\n\n \ + \ print(f\"Waiting for vLLM server to start at {vllm_server}...\")\n\n\ + \ for attempt in range(retries):\n try:\n \ + \ response = requests.get(f\"{vllm_server}/models\")\n \ + \ if response.status_code == 200:\n print(f\"vLLM server\ + \ is up and running at {vllm_server}.\")\n return process,\ + \ vllm_server\n except requests.ConnectionError:\n \ + \ pass\n\n print(\n f\"Server not available\ + \ yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\"\ \n )\n time.sleep(delay)\n\n raise RuntimeError(\n\ \ f\"Failed to start vLLM server at {vllm_server} after {retries}\ \ retries.\"\n )\n\n def shutdown_vllm(process: subprocess.Popen,\ diff --git a/standalone/README.md b/standalone/README.md index b8c5f713..a4cb70ce 100644 --- a/standalone/README.md +++ b/standalone/README.md @@ -399,6 +399,7 @@ evaluation * `--judge-serving-model-name`: The name of the model to use for evaluation. **Optional** * `--judge-serving-model-api-key`: The API key for the model to evaluate. `JUDGE_SERVING_MODEL_API_KEY` environment variable can be used as well. **Optional** +* `--judge-serving-model-ca-cert`: Name of the Kubernetes ConfigMap containing the serving model CA cert. **Optional** * `--judge-serving-model-secret`: The name of the Kubernetes secret containing the judge serving model API key. **Optional** - If not provided, the script will expect the provided CLI options to evaluate the model. * `--force-pull`: Force pull the data (sdg data, model and taxonomy) from the object store even if it already @@ -513,9 +514,17 @@ The list of all mandatory keys: * `JUDGE_ENDPOINT`: Serving endpoint for evaluation - **Required** * `JUDGE_NAME`: The name of the model to use for evaluation - **Required** +Optional keys: + +* `JUDGE_CA_CERT`: The name of ConfigMap containing the custom CA Cert - **Optional** +* `JUDGE_CA_CERT_CM_KEY`: The key of the CA Cert in the ConfigMap - **Optional** + > [!WARNING] > Mind the upper case of the keys, as the script expects them to be in upper case. +> [!WARNING] +> Make sure the endpoint URL ends with /v1 + #### Running the Script Without Kubernetes Secret Alternatively, you can provide the necessary information directly via CLI options or environment, diff --git a/standalone/standalone.py b/standalone/standalone.py index 35df0167..23c20809 100755 --- a/standalone/standalone.py +++ b/standalone/standalone.py @@ -25,6 +25,7 @@ import base64 import json import logging +import os import time import typing from ast import literal_eval @@ -97,6 +98,9 @@ MERGE_SYSTEM_USER_MESSAGE = False FEW_SHOTS = 5 BATCH_SIZE = 8 +JUDGE_CA_CERT_ENV_VAR_NAME = "JUDGE_CA_CERT_PATH" +JUDGE_CA_CERT_PATH = "/tmp" +JUDGE_CA_CERT_CM_KEY = "ca-bundle.crt" # TEMPLATES PYTORCH_TRAINING_JOB = """ @@ -655,6 +659,20 @@ def show( ), envvar="JUDGE_SERVING_MODEL_API_KEY", ) +@click.option( + "--judge-serving-model-ca-cert", + type=str, + help=( + "Name of the Kubernetes ConfigMap containing the serving model CA cert." + "The expected key name is 'ca-bundle.crt'." 
+ ), +) +@click.option( + "--judge-serving-model-ca-cert-cm-key", + type=str, + help="Name of the Key in the Kubernetes ConfigMap containing the serving model CA cert.", + default=JUDGE_CA_CERT_CM_KEY, +) @click.option( "--judge-serving-model-secret", type=str, @@ -663,9 +681,10 @@ def show( "Name of the Kubernetes Secret containing the judge serving model endpoint. " "For evaluation only. " "The namespace is inferred from the namespace option. " - "The following keys are expected: JUDGE_API_KEY, JUDGE_ENDPOINT, JUDGE_NAME " + "The following keys are expected: JUDGE_API_KEY, JUDGE_ENDPOINT, JUDGE_NAME" + "Optional keys are: JUDGE_CA_CERT, JUDGE_CA_CERT_CM_KEY" " (JUDGE_SERVING_MODEL_SECRET env var)" - "If used, the --judge-serving-model-{api-key,endpoint,name} options will be ignored." + "If used, --judge-serving-model-{api-key,endpoint,name,ca-cert} will be ignored." ), ) @click.option( @@ -811,6 +830,8 @@ def run( judge_serving_model_endpoint: typing.Optional[str] = None, judge_serving_model_name: typing.Optional[str] = None, judge_serving_model_api_key: typing.Optional[str] = None, + judge_serving_model_ca_cert: typing.Optional[str] = None, + judge_serving_model_ca_cert_cm_key: typing.Optional[str] = None, judge_serving_model_secret: typing.Optional[str] = None, nproc_per_node: typing.Optional[int] = 1, eval_type: typing.Optional[str] = None, @@ -846,6 +867,8 @@ def run( judge_serving_model_name (str): The serving model name for evaluation. For Evaluation only. judge_serving_model_api_key (str): The serving model API key for evaluation. For Evaluation only. + judge_serving_model_ca_cert (str): The serving model CA cert for evaluation. + judge_serving_model_ca_cert_cm_key (str): The name of the Key in the Kubernetes ConfigMap judge_serving_model_secret (str): The name of the Kubernetes Secret containing the serving model credentials. For Evaluation only. nproc_per_node (int): The number of processes per node. For training only. @@ -882,7 +905,9 @@ def run( ctx.obj["judge_serving_model_endpoint"] = judge_serving_model_endpoint ctx.obj["judge_serving_model_name"] = judge_serving_model_name ctx.obj["judge_serving_model_api_key"] = judge_serving_model_api_key + ctx.obj["judge_serving_model_ca_cert"] = judge_serving_model_ca_cert ctx.obj["judge_serving_model_secret"] = judge_serving_model_secret + ctx.obj["judge_serving_model_ca_cert_cm_key"] = judge_serving_model_ca_cert_cm_key ctx.obj["nproc_per_node"] = nproc_per_node ctx.obj["eval_type"] = eval_type ctx.obj["training_phase"] = training_phase @@ -1552,6 +1577,8 @@ def create_eval_job( eval_type: str, judge_serving_model_secret: str, nproc_per_node: int = 1, + judge_serving_model_ca_cert: str = None, + judge_serving_model_ca_cert_cm_key: str = None, ) -> kubernetes.client.V1Job: """ Create a Kubernetes Job object. 
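The `run_mt_bench_op` and `run_final_eval_op` hunks below make the eval library trust a custom CA by monkey-patching `openai.OpenAI` before any client is constructed. A condensed, standalone sketch of that pattern (functionally equivalent to the added code, with the kwargs handling slightly simplified) looks like this; `JUDGE_CA_CERT_PATH` is whatever the eval Job exports, e.g. `/tmp/ca-bundle.crt`:

```python
import os

import httpx
import openai

# JUDGE_CA_CERT_PATH points at the mounted ConfigMap key, e.g. /tmp/ca-bundle.crt
if judge_ca_cert := os.getenv("JUDGE_CA_CERT_PATH"):

    class CustomHttpClient(httpx.Client):
        def __init__(self, *args, **kwargs):
            # Verify TLS against the custom CA bundle instead of the system store
            kwargs.setdefault("verify", judge_ca_cert)
            super().__init__(*args, **kwargs)

    class CustomOpenAI(openai.OpenAI):
        def __init__(self, *args, **kwargs):
            # Hand every new client a pre-configured httpx.Client
            kwargs.setdefault("http_client", CustomHttpClient())
            super().__init__(*args, **kwargs)

    # From here on, any openai.OpenAI(...) call, including the ones made inside
    # instructlab.eval, verifies the judge endpoint with the custom CA.
    openai.OpenAI = CustomOpenAI
```

Because `instructlab.eval` instantiates `openai.OpenAI` internally, patching the class on the module is the least invasive way to hand it a custom `httpx` client without changing the library itself.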
@@ -1619,6 +1646,26 @@ def run_mt_bench_op( import torch from instructlab.eval.mt_bench import MTBenchEvaluator + if judge_ca_cert := os.getenv("JUDGE_CA_CERT_PATH"): + import httpx + import openai + + # Create a custom HTTP client + class CustomHttpClient(httpx.Client): + def __init__(self, *args, **kwargs): + # Use the custom CA certificate + kwargs.setdefault("verify", judge_ca_cert) + super().__init__(*args, **kwargs) + + # Create a new OpenAI class that uses the custom HTTP client + class CustomOpenAI(openai.OpenAI): + def __init__(self, *args, **kwargs): + custom_client = CustomHttpClient() + super().__init__(http_client=custom_client, *args, **kwargs) + + # Monkey patch the OpenAI class in the openai module, so that the eval lib can use it + openai.OpenAI = CustomOpenAI + def launch_vllm( model_path: str, gpu_count: int, retries: int = 120, delay: int = 10 ) -> tuple: @@ -1827,6 +1874,26 @@ def run_final_eval_op( from instructlab.eval.mt_bench import MTBenchBranchEvaluator from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score + if judge_ca_cert := os.getenv("JUDGE_CA_CERT_PATH"): + import httpx + import openai + + # Create a custom HTTP client + class CustomHttpClient(httpx.Client): + def __init__(self, *args, **kwargs): + # Use the custom CA certificate + kwargs.setdefault("verify", judge_ca_cert) + super().__init__(*args, **kwargs) + + # Create a new OpenAI class that uses the custom HTTP client + class CustomOpenAI(openai.OpenAI): + def __init__(self, *args, **kwargs): + custom_client = CustomHttpClient() + super().__init__(http_client=custom_client, *args, **kwargs) + + # Monkey patch the OpenAI class in the openai module, so that the eval lib can use it + openai.OpenAI = CustomOpenAI + print("Starting Final Eval...") def launch_vllm( @@ -2260,78 +2327,60 @@ def find_node_dataset_directories(base_dir: str): run_final_eval_op(mmlu_branch_output="{MMLU_BRANCH_SCORES_PATH}", mt_bench_branch_output="{MT_BENCH_BRANCH_SCORES_PATH}", candidate_model="{CANDIDATE_MODEL_PATH}", taxonomy="{TAXONOMY_PATH}", tasks="{DATA_PVC_SDG_PATH}", base_branch="", candidate_branch="", device=None, base_model_dir="{DATA_PVC_MODEL_PATH}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE}, model_dtype="{MODEL_DTYPE}", few_shots={FEW_SHOTS}, batch_size={BATCH_SIZE}) """ - if eval_type == "mt-bench": - init_containers = [ - kubernetes.client.V1Container( - name=f"run-eval-{eval_type}", - image=RHELAI_IMAGE, - command=["/bin/sh", "-ce"], - args=[ - PYTHON_EXECUTOR.format( - python_code=exec_run_mt_bench_op_command, - python_main=exec_run_mt_bench_op_args.strip(), - ), - ], - volume_mounts=get_vol_mount(), - security_context=get_security_context(), - env_from=[ - kubernetes.client.V1EnvFromSource( - secret_ref=kubernetes.client.V1SecretEnvSource( - name=judge_serving_model_secret - ) - ), - ], - resources=kubernetes.client.V1ResourceRequirements( - requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, - limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, - ), - ) - ] - container = kubernetes.client.V1Container( - name=f"output-eval-{eval_type}-scores", - image=RHELAI_IMAGE, - command=["/bin/sh", "-c"], - args=[f"cat {MT_BENCH_SCORES_PATH}"], - security_context=get_security_context(), - volume_mounts=get_vol_mount(), - ) - elif eval_type == EVAL_TYPE_FINAL: - init_containers = [ - kubernetes.client.V1Container( - name=f"run-eval-{eval_type}", - image=RHELAI_IMAGE, - command=["/bin/sh", "-ce"], - args=[ - PYTHON_EXECUTOR.format( - 
python_code=exec_run_final_eval_op_command, - python_main=exec_run_final_eval_op_args.strip(), - ), - ], - volume_mounts=get_vol_mount(), - security_context=get_security_context(), - env_from=[ - kubernetes.client.V1EnvFromSource( - secret_ref=kubernetes.client.V1SecretEnvSource( - name=judge_serving_model_secret - ) - ), - ], - resources=kubernetes.client.V1ResourceRequirements( - requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, - limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, - ), - ) - ] - container = kubernetes.client.V1Container( - name=f"output-eval-{eval_type}-scores", - image=RHELAI_IMAGE, - command=["/bin/sh", "-c"], - args=[f"cat {MT_BENCH_BRANCH_SCORES_PATH}"], - security_context=get_security_context(), - volume_mounts=get_vol_mount(), - ) - else: - raise ValueError(f"Unknown evaluation type: {eval_type}") + eval_container = kubernetes.client.V1Container( + name=f"run-eval-{eval_type}", + image=RHELAI_IMAGE, + command=["/bin/sh", "-ce"], + volume_mounts=get_vol_mount(), + security_context=get_security_context(), + env_from=[ + kubernetes.client.V1EnvFromSource( + secret_ref=kubernetes.client.V1SecretEnvSource( + name=judge_serving_model_secret + ) + ), + ], + resources=kubernetes.client.V1ResourceRequirements( + requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, + limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, + ), + ) + eval_args = { + EVAL_TYPE_MT_BENCH: [ + PYTHON_EXECUTOR.format( + python_code=exec_run_mt_bench_op_command, + python_main=exec_run_mt_bench_op_args.strip(), + ), + ], + EVAL_TYPE_FINAL: [ + PYTHON_EXECUTOR.format( + python_code=exec_run_final_eval_op_command, + python_main=exec_run_final_eval_op_args.strip(), + ), + ], + } + try: + eval_container.args = eval_args[eval_type] + except KeyError as exc: + raise ValueError(f"Unknown evaluation type: {eval_type}") from exc + + init_containers = [eval_container] + + output_container = kubernetes.client.V1Container( + name=f"output-eval-{eval_type}-scores", + image=RHELAI_IMAGE, + command=["/bin/sh", "-c"], + security_context=get_security_context(), + volume_mounts=get_vol_mount(), + ) + eval_paths = { + EVAL_TYPE_MT_BENCH: MT_BENCH_SCORES_PATH, + EVAL_TYPE_FINAL: MT_BENCH_BRANCH_SCORES_PATH, + } + try: + output_container.args = [f"cat {eval_paths[eval_type]}"] + except KeyError as exc: + raise ValueError(f"Unknown evaluation type: {eval_type}") from exc # Create and configure a spec section template = kubernetes.client.V1PodTemplateSpec( @@ -2339,11 +2388,38 @@ def find_node_dataset_directories(base_dir: str): spec=kubernetes.client.V1PodSpec( restart_policy="Never", init_containers=init_containers, - containers=[container], + containers=[output_container], volumes=get_vol(), ), ) + if judge_serving_model_ca_cert: + # Define the volume that references the ConfigMap + cm_volume = kubernetes.client.V1Volume( + name="judge-ca-cert-volume", + config_map=kubernetes.client.V1ConfigMapVolumeSource( + name=judge_serving_model_ca_cert + ), + ) + # Define the volume mount to specify where the Secret should be mounted in the container + cm_volume_mount = kubernetes.client.V1VolumeMount( + name="judge-ca-cert-volume", + mount_path=JUDGE_CA_CERT_PATH, # Path where the Secret will be mounted + ) + # Add an env var to the container to specify the path to the CA cert + eval_container.env.append( + kubernetes.client.V1EnvVar( + name=JUDGE_CA_CERT_ENV_VAR_NAME, + value=os.path.join( + JUDGE_CA_CERT_PATH, judge_serving_model_ca_cert_cm_key + ), + ) + ) + # Add the volume to the Pod spec + 
eval_container.volume_mounts.append(cm_volume_mount) + # Add the volume mount to the container + template.spec.volumes.append(cm_volume) + # Create the specification of deployment spec = kubernetes.client.V1JobSpec( template=template, @@ -2650,6 +2726,8 @@ def sdg_data_fetch( judge_serving_model_endpoint = ctx.obj["judge_serving_model_endpoint"] judge_serving_model_name = ctx.obj["judge_serving_model_name"] judge_serving_model_api_key = ctx.obj["judge_serving_model_api_key"] + judge_serving_model_ca_cert = ctx.obj["judge_serving_model_ca_cert"] + judge_serving_model_ca_cert_cm_key = ctx.obj["judge_serving_model_ca_cert_cm_key"] judge_serving_model_secret = ctx.obj["judge_serving_model_secret"] sdg_object_store_endpoint = ctx.obj["sdg_object_store_endpoint"] sdg_object_store_bucket = ctx.obj["sdg_object_store_bucket"] @@ -2850,15 +2928,49 @@ def decode_base64(data): secret.data.get("JUDGE_ENDPOINT") ) validate_url(judge_serving_model_endpoint) + + # Validation of the secret's existence is done in the next conditional block + if secret.data.get("JUDGE_CA_CERT"): + judge_serving_model_ca_cert = secret.data.get("JUDGE_CA_CERT") + if secret.data.get("JUDGE_CA_CERT_CM_KEY"): + judge_serving_model_ca_cert_cm_key = secret.data.get( + "JUDGE_CA_CERT_CM_KEY" + ) except kubernetes.client.rest.ApiException as exc: if exc.status == 404: raise ValueError( f"Secret {judge_serving_model_secret} not found in namespace {namespace}." ) from exc + # If the CA cert is provided, verify the existence of the secret + # We don't add the CA Cert Secret name into the Secret that contains the judge details + # If provided, the Secret will be mounted as a volume in the evaluation job + if judge_serving_model_ca_cert and not dry_run: + try: + cm = v1.read_namespaced_config_map( + name=judge_serving_model_ca_cert, namespace=namespace + ) + # Validate the presence of the key + if not cm.data.get(judge_serving_model_ca_cert_cm_key): + raise ValueError( + f"Provided ConfigMap {judge_serving_model_ca_cert} does not contain the key:" + f"'{judge_serving_model_ca_cert_cm_key}'." + "Use '--judge-serving-model-ca-cert-cm-key' to specify the key." + ) + except kubernetes.client.rest.ApiException as exc: + if exc.status == 404: + raise ValueError( + f"ConfigMap {judge_serving_model_ca_cert} not found in namespace {namespace}." 
+ ) from exc + # Set the judge secret in the context for the evaluation job ctx.obj["judge_serving_model_secret"] = judge_serving_model_secret + # Set the judge CA cert in the context for the evaluation job, this handles the case where the + # secret is not provided via the cli flag but inside the secret + ctx.obj["judge_serving_model_ca_cert"] = judge_serving_model_ca_cert + ctx.obj["judge_serving_model_ca_cert_cm_key"] = judge_serving_model_ca_cert_cm_key + # list of PVCs to create and their details pvcs = [ { @@ -3118,6 +3230,8 @@ def evaluation(ctx: click.Context) -> str: eval_type = ctx.obj["eval_type"] dry_run = ctx.obj["dry_run"] judge_serving_model_secret = ctx.obj["judge_serving_model_secret"] + judge_serving_model_ca_cert = ctx.obj["judge_serving_model_ca_cert"] + judge_serving_model_ca_cert_cm_key = ctx.obj["judge_serving_model_ca_cert_cm_key"] # This should only happen if the script is called with the "evaluation" subcommand if not judge_serving_model_secret: @@ -3137,6 +3251,8 @@ def evaluation(ctx: click.Context) -> str: namespace=namespace, eval_type=eval_type, judge_serving_model_secret=judge_serving_model_secret, + judge_serving_model_ca_cert=judge_serving_model_ca_cert, + judge_serving_model_ca_cert_cm_key=judge_serving_model_ca_cert_cm_key, ) if dry_run: diff --git a/standalone/standalone.tpl b/standalone/standalone.tpl index 287ba29a..c8c404e0 100755 --- a/standalone/standalone.tpl +++ b/standalone/standalone.tpl @@ -25,6 +25,7 @@ TODO: import base64 import json import logging +import os import time import typing from ast import literal_eval @@ -97,6 +98,9 @@ MAX_WORKERS = "auto" MERGE_SYSTEM_USER_MESSAGE = False FEW_SHOTS = 5 BATCH_SIZE = 8 +JUDGE_CA_CERT_ENV_VAR_NAME = "JUDGE_CA_CERT_PATH" +JUDGE_CA_CERT_PATH = "/tmp" +JUDGE_CA_CERT_CM_KEY = "ca-bundle.crt" # TEMPLATES PYTORCH_TRAINING_JOB = """ @@ -655,6 +659,20 @@ def show( ), envvar="JUDGE_SERVING_MODEL_API_KEY", ) +@click.option( + "--judge-serving-model-ca-cert", + type=str, + help=( + "Name of the Kubernetes ConfigMap containing the serving model CA cert." + "The expected key name is 'ca-bundle.crt'." + ), +) +@click.option( + "--judge-serving-model-ca-cert-cm-key", + type=str, + help="Name of the Key in the Kubernetes ConfigMap containing the serving model CA cert.", + default=JUDGE_CA_CERT_CM_KEY, +) @click.option( "--judge-serving-model-secret", type=str, @@ -663,9 +681,10 @@ def show( "Name of the Kubernetes Secret containing the judge serving model endpoint. " "For evaluation only. " "The namespace is inferred from the namespace option. " - "The following keys are expected: JUDGE_API_KEY, JUDGE_ENDPOINT, JUDGE_NAME " + "The following keys are expected: JUDGE_API_KEY, JUDGE_ENDPOINT, JUDGE_NAME" + "Optional keys are: JUDGE_CA_CERT, JUDGE_CA_CERT_CM_KEY" " (JUDGE_SERVING_MODEL_SECRET env var)" - "If used, the --judge-serving-model-{api-key,endpoint,name} options will be ignored." + "If used, --judge-serving-model-{api-key,endpoint,name,ca-cert} will be ignored." 
), ) @click.option( @@ -811,6 +830,8 @@ def run( judge_serving_model_endpoint: typing.Optional[str] = None, judge_serving_model_name: typing.Optional[str] = None, judge_serving_model_api_key: typing.Optional[str] = None, + judge_serving_model_ca_cert: typing.Optional[str] = None, + judge_serving_model_ca_cert_cm_key: typing.Optional[str] = None, judge_serving_model_secret: typing.Optional[str] = None, nproc_per_node: typing.Optional[int] = 1, eval_type: typing.Optional[str] = None, @@ -846,6 +867,8 @@ def run( judge_serving_model_name (str): The serving model name for evaluation. For Evaluation only. judge_serving_model_api_key (str): The serving model API key for evaluation. For Evaluation only. + judge_serving_model_ca_cert (str): The serving model CA cert for evaluation. + judge_serving_model_ca_cert_cm_key (str): The name of the Key in the Kubernetes ConfigMap judge_serving_model_secret (str): The name of the Kubernetes Secret containing the serving model credentials. For Evaluation only. nproc_per_node (int): The number of processes per node. For training only. @@ -882,7 +905,9 @@ def run( ctx.obj["judge_serving_model_endpoint"] = judge_serving_model_endpoint ctx.obj["judge_serving_model_name"] = judge_serving_model_name ctx.obj["judge_serving_model_api_key"] = judge_serving_model_api_key + ctx.obj["judge_serving_model_ca_cert"] = judge_serving_model_ca_cert ctx.obj["judge_serving_model_secret"] = judge_serving_model_secret + ctx.obj["judge_serving_model_ca_cert_cm_key"] = judge_serving_model_ca_cert_cm_key ctx.obj["nproc_per_node"] = nproc_per_node ctx.obj["eval_type"] = eval_type ctx.obj["training_phase"] = training_phase @@ -1337,6 +1362,8 @@ def create_eval_job( eval_type: str, judge_serving_model_secret: str, nproc_per_node: int = 1, + judge_serving_model_ca_cert: str = None, + judge_serving_model_ca_cert_cm_key: str = None, ) -> kubernetes.client.V1Job: """ Create a Kubernetes Job object. 
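The refactored `create_eval_job` hunk below mounts the named ConfigMap at `JUDGE_CA_CERT_PATH` (`/tmp`) and exports the resulting file path through the `JUDGE_CA_CERT_PATH` env var so the monkey patch above can find it. Creating that ConfigMap is left to the operator; a minimal sketch with the Python Kubernetes client (the ConfigMap name `judge-ca-cert`, the namespace, and the local PEM file are placeholders, not part of this change) could look like:

```python
import kubernetes

# Local PEM bundle for the judge endpoint's CA; the file name is a placeholder.
with open("judge-ca.pem", encoding="utf-8") as f:
    ca_bundle = f.read()

kubernetes.config.load_kube_config()
v1 = kubernetes.client.CoreV1Api()

cm = kubernetes.client.V1ConfigMap(
    metadata=kubernetes.client.V1ObjectMeta(name="judge-ca-cert"),  # placeholder name
    # "ca-bundle.crt" matches the JUDGE_CA_CERT_CM_KEY default; pass
    # --judge-serving-model-ca-cert-cm-key if a different key is used.
    data={"ca-bundle.crt": ca_bundle},
)
v1.create_namespaced_config_map(namespace="my-namespace", body=cm)
```

An equivalent `kubectl create configmap judge-ca-cert --from-file=ca-bundle.crt=judge-ca.pem` works as well; the script itself only validates that the ConfigMap exists and contains the expected key.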
@@ -1394,78 +1421,60 @@ def create_eval_job( {{exec_run_final_eval_op_args}} """ - if eval_type == "mt-bench": - init_containers = [ - kubernetes.client.V1Container( - name=f"run-eval-{eval_type}", - image=RHELAI_IMAGE, - command=["/bin/sh", "-ce"], - args=[ - PYTHON_EXECUTOR.format( - python_code=exec_run_mt_bench_op_command, - python_main=exec_run_mt_bench_op_args.strip(), - ), - ], - volume_mounts=get_vol_mount(), - security_context=get_security_context(), - env_from=[ - kubernetes.client.V1EnvFromSource( - secret_ref=kubernetes.client.V1SecretEnvSource( - name=judge_serving_model_secret - ) - ), - ], - resources=kubernetes.client.V1ResourceRequirements( - requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, - limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, - ), - ) - ] - container = kubernetes.client.V1Container( - name=f"output-eval-{eval_type}-scores", - image=RHELAI_IMAGE, - command=["/bin/sh", "-c"], - args=[f"cat {MT_BENCH_SCORES_PATH}"], - security_context=get_security_context(), - volume_mounts=get_vol_mount(), - ) - elif eval_type == EVAL_TYPE_FINAL: - init_containers = [ - kubernetes.client.V1Container( - name=f"run-eval-{eval_type}", - image=RHELAI_IMAGE, - command=["/bin/sh", "-ce"], - args=[ - PYTHON_EXECUTOR.format( - python_code=exec_run_final_eval_op_command, - python_main=exec_run_final_eval_op_args.strip(), - ), - ], - volume_mounts=get_vol_mount(), - security_context=get_security_context(), - env_from=[ - kubernetes.client.V1EnvFromSource( - secret_ref=kubernetes.client.V1SecretEnvSource( - name=judge_serving_model_secret - ) - ), - ], - resources=kubernetes.client.V1ResourceRequirements( - requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, - limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, - ), - ) - ] - container = kubernetes.client.V1Container( - name=f"output-eval-{eval_type}-scores", - image=RHELAI_IMAGE, - command=["/bin/sh", "-c"], - args=[f"cat {MT_BENCH_BRANCH_SCORES_PATH}"], - security_context=get_security_context(), - volume_mounts=get_vol_mount(), - ) - else: - raise ValueError(f"Unknown evaluation type: {eval_type}") + eval_container = kubernetes.client.V1Container( + name=f"run-eval-{eval_type}", + image=RHELAI_IMAGE, + command=["/bin/sh", "-ce"], + volume_mounts=get_vol_mount(), + security_context=get_security_context(), + env_from=[ + kubernetes.client.V1EnvFromSource( + secret_ref=kubernetes.client.V1SecretEnvSource( + name=judge_serving_model_secret + ) + ), + ], + resources=kubernetes.client.V1ResourceRequirements( + requests={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, + limits={"cpu": "1", "nvidia.com/gpu": nproc_per_node}, + ), + ) + eval_args = { + EVAL_TYPE_MT_BENCH: [ + PYTHON_EXECUTOR.format( + python_code=exec_run_mt_bench_op_command, + python_main=exec_run_mt_bench_op_args.strip(), + ), + ], + EVAL_TYPE_FINAL: [ + PYTHON_EXECUTOR.format( + python_code=exec_run_final_eval_op_command, + python_main=exec_run_final_eval_op_args.strip(), + ), + ], + } + try: + eval_container.args = eval_args[eval_type] + except KeyError as exc: + raise ValueError(f"Unknown evaluation type: {eval_type}") from exc + + init_containers = [eval_container] + + output_container = kubernetes.client.V1Container( + name=f"output-eval-{eval_type}-scores", + image=RHELAI_IMAGE, + command=["/bin/sh", "-c"], + security_context=get_security_context(), + volume_mounts=get_vol_mount(), + ) + eval_paths = { + EVAL_TYPE_MT_BENCH: MT_BENCH_SCORES_PATH, + EVAL_TYPE_FINAL: MT_BENCH_BRANCH_SCORES_PATH, + } + try: + output_container.args = [f"cat 
{eval_paths[eval_type]}"] + except KeyError as exc: + raise ValueError(f"Unknown evaluation type: {eval_type}") from exc # Create and configure a spec section template = kubernetes.client.V1PodTemplateSpec( @@ -1473,11 +1482,38 @@ def create_eval_job( spec=kubernetes.client.V1PodSpec( restart_policy="Never", init_containers=init_containers, - containers=[container], + containers=[output_container], volumes=get_vol(), ), ) + if judge_serving_model_ca_cert: + # Define the volume that references the ConfigMap + cm_volume = kubernetes.client.V1Volume( + name="judge-ca-cert-volume", + config_map=kubernetes.client.V1ConfigMapVolumeSource( + name=judge_serving_model_ca_cert + ), + ) + # Define the volume mount to specify where the Secret should be mounted in the container + cm_volume_mount = kubernetes.client.V1VolumeMount( + name="judge-ca-cert-volume", + mount_path=JUDGE_CA_CERT_PATH, # Path where the Secret will be mounted + ) + # Add an env var to the container to specify the path to the CA cert + eval_container.env.append( + kubernetes.client.V1EnvVar( + name=JUDGE_CA_CERT_ENV_VAR_NAME, + value=os.path.join( + JUDGE_CA_CERT_PATH, judge_serving_model_ca_cert_cm_key + ), + ) + ) + # Add the volume to the Pod spec + eval_container.volume_mounts.append(cm_volume_mount) + # Add the volume mount to the container + template.spec.volumes.append(cm_volume) + # Create the specification of deployment spec = kubernetes.client.V1JobSpec( template=template, @@ -1784,6 +1820,8 @@ def sdg_data_fetch( judge_serving_model_endpoint = ctx.obj["judge_serving_model_endpoint"] judge_serving_model_name = ctx.obj["judge_serving_model_name"] judge_serving_model_api_key = ctx.obj["judge_serving_model_api_key"] + judge_serving_model_ca_cert = ctx.obj["judge_serving_model_ca_cert"] + judge_serving_model_ca_cert_cm_key = ctx.obj["judge_serving_model_ca_cert_cm_key"] judge_serving_model_secret = ctx.obj["judge_serving_model_secret"] sdg_object_store_endpoint = ctx.obj["sdg_object_store_endpoint"] sdg_object_store_bucket = ctx.obj["sdg_object_store_bucket"] @@ -1984,15 +2022,49 @@ def sdg_data_fetch( secret.data.get("JUDGE_ENDPOINT") ) validate_url(judge_serving_model_endpoint) + + # Validation of the secret's existence is done in the next conditional block + if secret.data.get("JUDGE_CA_CERT"): + judge_serving_model_ca_cert = secret.data.get("JUDGE_CA_CERT") + if secret.data.get("JUDGE_CA_CERT_CM_KEY"): + judge_serving_model_ca_cert_cm_key = secret.data.get( + "JUDGE_CA_CERT_CM_KEY" + ) except kubernetes.client.rest.ApiException as exc: if exc.status == 404: raise ValueError( f"Secret {judge_serving_model_secret} not found in namespace {namespace}." ) from exc + # If the CA cert is provided, verify the existence of the secret + # We don't add the CA Cert Secret name into the Secret that contains the judge details + # If provided, the Secret will be mounted as a volume in the evaluation job + if judge_serving_model_ca_cert and not dry_run: + try: + cm = v1.read_namespaced_config_map( + name=judge_serving_model_ca_cert, namespace=namespace + ) + # Validate the presence of the key + if not cm.data.get(judge_serving_model_ca_cert_cm_key): + raise ValueError( + f"Provided ConfigMap {judge_serving_model_ca_cert} does not contain the key:" + f"'{judge_serving_model_ca_cert_cm_key}'." + "Use '--judge-serving-model-ca-cert-cm-key' to specify the key." + ) + except kubernetes.client.rest.ApiException as exc: + if exc.status == 404: + raise ValueError( + f"ConfigMap {judge_serving_model_ca_cert} not found in namespace {namespace}." 
+ ) from exc + # Set the judge secret in the context for the evaluation job ctx.obj["judge_serving_model_secret"] = judge_serving_model_secret + # Set the judge CA cert in the context for the evaluation job, this handles the case where the + # secret is not provided via the cli flag but inside the secret + ctx.obj["judge_serving_model_ca_cert"] = judge_serving_model_ca_cert + ctx.obj["judge_serving_model_ca_cert_cm_key"] = judge_serving_model_ca_cert_cm_key + # list of PVCs to create and their details pvcs = [ { @@ -2252,6 +2324,8 @@ def evaluation(ctx: click.Context) -> str: eval_type = ctx.obj["eval_type"] dry_run = ctx.obj["dry_run"] judge_serving_model_secret = ctx.obj["judge_serving_model_secret"] + judge_serving_model_ca_cert = ctx.obj["judge_serving_model_ca_cert"] + judge_serving_model_ca_cert_cm_key = ctx.obj["judge_serving_model_ca_cert_cm_key"] # This should only happen if the script is called with the "evaluation" subcommand if not judge_serving_model_secret: @@ -2271,6 +2345,8 @@ def evaluation(ctx: click.Context) -> str: namespace=namespace, eval_type=eval_type, judge_serving_model_secret=judge_serving_model_secret, + judge_serving_model_ca_cert=judge_serving_model_ca_cert, + judge_serving_model_ca_cert_cm_key=judge_serving_model_ca_cert_cm_key, ) if dry_run:
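Finally, per the README changes above, the ConfigMap name and key can also travel inside the judge-serving Secret instead of being passed as CLI flags. A sketch of such a Secret built with the Python Kubernetes client (the Secret name, endpoint, model name, API key, and namespace are placeholders) might be:

```python
import kubernetes

kubernetes.config.load_kube_config()
v1 = kubernetes.client.CoreV1Api()

judge_secret = kubernetes.client.V1Secret(
    metadata=kubernetes.client.V1ObjectMeta(name="judge-serving-details"),  # placeholder
    string_data={
        # Mandatory keys read during sdg_data_fetch
        "JUDGE_API_KEY": "sk-placeholder",
        "JUDGE_ENDPOINT": "https://judge.example.com/v1",  # must end with /v1
        "JUDGE_NAME": "judge-model",
        # Optional keys introduced by this change: the CA-cert ConfigMap name
        # and, if it differs from 'ca-bundle.crt', the key inside it.
        "JUDGE_CA_CERT": "judge-ca-cert",
        "JUDGE_CA_CERT_CM_KEY": "ca-bundle.crt",
    },
)
v1.create_namespaced_secret(namespace="my-namespace", body=judge_secret)
```

The run would then use `--judge-serving-model-secret judge-serving-details`, and `sdg_data_fetch` picks up the optional `JUDGE_CA_CERT` and `JUDGE_CA_CERT_CM_KEY` keys alongside the mandatory ones.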