diff --git a/README.md b/README.md index 77211e53..87f1bda6 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,10 @@ metadata: data: endpoint: '' model: mixtral + ca.crt: | # If using TLS + -----BEGIN CERTIFICATE----- + + -----END CERTIFICATE----- ``` ```yaml @@ -110,6 +114,10 @@ metadata: data: endpoint: '' model: prometheus + ca.crt: | # If using TLS + -----BEGIN CERTIFICATE----- + + -----END CERTIFICATE----- ``` ```yaml @@ -122,6 +130,8 @@ data: type: Opaque ``` +**NOTE**: You can find and copy the certs needed for the teacher- and judge-server ConfigMaps in another ConfigMap, `kube-root-ca.crt`, found in the same namespace as the hosted model + ### Run the Pipeline diff --git a/eval/final/components.py b/eval/final/components.py index 94124d03..469d222b 100644 --- a/eval/final/components.py +++ b/eval/final/components.py @@ -25,30 +25,20 @@ def run_final_eval_op( import os import subprocess + import httpx import torch from instructlab.eval.mmlu import MMLUBranchEvaluator from instructlab.eval.mt_bench import MTBenchBranchEvaluator from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores, sort_score - if judge_ca_cert := os.getenv("JUDGE_CA_CERT_PATH"): - import httpx - import openai - - # Create a custom HTTP client - class CustomHttpClient(httpx.Client): - def __init__(self, *args, **kwargs): - # Use the custom CA certificate - kwargs.setdefault("verify", judge_ca_cert) - super().__init__(*args, **kwargs) - - # Create a new OpenAI class that uses the custom HTTP client - class CustomOpenAI(openai.OpenAI): - def __init__(self, *args, **kwargs): - custom_client = CustomHttpClient() - super().__init__(http_client=custom_client, *args, **kwargs) - - # Monkey patch the OpenAI class in the openai module, so that the eval lib can use it - openai.OpenAI = CustomOpenAI + judge_api_key = os.getenv("JUDGE_API_KEY", "") + judge_model_name = os.getenv("JUDGE_NAME") + judge_endpoint = os.getenv("JUDGE_ENDPOINT") + judge_ca_cert_path = os.getenv("JUDGE_CA_CERT_PATH") + use_tls = os.path.exists(judge_ca_cert_path) and ( + os.path.getsize(judge_ca_cert_path) > 0 + ) + judge_http_client = httpx.Client(verify=judge_ca_cert_path) if use_tls else None print("Starting Final Eval...") @@ -408,6 +398,7 @@ def find_node_dataset_directories(base_dir: str): server_url=vllm_server, serving_gpus=gpu_count, max_workers=max_workers, + http_client=judge_http_client, ) shutdown_vllm(vllm_process) @@ -418,6 +409,7 @@ def find_node_dataset_directories(base_dir: str): api_key=judge_api_key, serving_gpus=gpu_count, max_workers=max_workers, + http_client=judge_http_client, ) qa_pairs_and_errors.append((overall_score, qa_pairs, error_rate)) diff --git a/eval/mt_bench/components.py b/eval/mt_bench/components.py index 481952d9..5b1800a2 100644 --- a/eval/mt_bench/components.py +++ b/eval/mt_bench/components.py @@ -1,6 +1,6 @@ # type: ignore # pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error -from typing import List, NamedTuple, Optional +from typing import NamedTuple, Optional from kfp.dsl import component @@ -22,28 +22,18 @@ def run_mt_bench_op( import os import subprocess + import httpx import torch from instructlab.eval.mt_bench import MTBenchEvaluator - if judge_ca_cert := os.getenv("JUDGE_CA_CERT_PATH"): - import httpx - import openai - - # Create a custom HTTP client - class CustomHttpClient(httpx.Client): - def __init__(self, *args, **kwargs): - # Use the custom CA certificate - kwargs.setdefault("verify", judge_ca_cert) - super().__init__(*args, **kwargs) - - # Create a new 
OpenAI class that uses the custom HTTP client - class CustomOpenAI(openai.OpenAI): - def __init__(self, *args, **kwargs): - custom_client = CustomHttpClient() - super().__init__(http_client=custom_client, *args, **kwargs) - - # Monkey patch the OpenAI class in the openai module, so that the eval lib can use it - openai.OpenAI = CustomOpenAI + judge_api_key = os.getenv("JUDGE_API_KEY", "") + judge_model_name = os.getenv("JUDGE_NAME") + judge_endpoint = os.getenv("JUDGE_ENDPOINT") + judge_ca_cert_path = os.getenv("JUDGE_CA_CERT_PATH") + use_tls = os.path.exists(judge_ca_cert_path) and ( + os.path.getsize(judge_ca_cert_path) > 0 + ) + judge_http_client = httpx.Client(verify=judge_ca_cert_path) if use_tls else None def launch_vllm( model_path: str, gpu_count: int, retries: int = 120, delay: int = 10 @@ -136,10 +126,6 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20): models_list = os.listdir(models_folder) - judge_api_key = os.getenv("JUDGE_API_KEY", "") - judge_model_name = os.getenv("JUDGE_NAME") - judge_endpoint = os.getenv("JUDGE_ENDPOINT") - scores = {} all_mt_bench_data = [] @@ -175,6 +161,7 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20): server_url=vllm_server, serving_gpus=gpu_count, max_workers=max_workers, + http_client=judge_http_client, ) shutdown_vllm(vllm_process) @@ -184,6 +171,7 @@ def shutdown_vllm(process: subprocess.Popen, timeout: int = 20): api_key=judge_api_key, serving_gpus=gpu_count, max_workers=max_workers, + http_client=judge_http_client, ) mt_bench_data = { diff --git a/pipeline.py b/pipeline.py index 7f5b0e51..d52fddc0 100644 --- a/pipeline.py +++ b/pipeline.py @@ -1,5 +1,6 @@ # type: ignore # pylint: disable=no-value-for-parameter,import-outside-toplevel,import-error,no-member +import os import typing from typing import List, Literal, Optional @@ -9,8 +10,8 @@ CreatePVC, DeletePVC, mount_pvc, - set_image_pull_policy, use_config_map_as_env, + use_config_map_as_volume, use_secret_as_env, use_secret_as_volume, ) @@ -26,6 +27,15 @@ GENERATED_STANDALONE_FILE_NAME = "standalone.py" DEFAULT_REPO_URL = "https://github.com/instructlab/taxonomy.git" +# Model Serving SSL connection +SDG_CA_CERT_CM_KEY = "ca.crt" +SDG_CA_CERT_ENV_VAR_NAME = "SDG_CA_CERT_PATH" +SDG_CA_CERT_PATH = "/tmp/cert" + +JUDGE_CA_CERT_CM_KEY = "ca.crt" +JUDGE_CA_CERT_ENV_VAR_NAME = "JUDGE_CA_CERT_PATH" +JUDGE_CA_CERT_PATH = "/tmp/cert" + def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): """Wrapper for KFP pipeline, which allows for mocking individual stages.""" @@ -187,6 +197,13 @@ def pipeline( sdg_task, TEACHER_CONFIG_MAP, dict(endpoint="endpoint", model="model") ) use_secret_as_env(sdg_task, TEACHER_SECRET, {"api_key": "api_key"}) + use_config_map_as_volume( + sdg_task, TEACHER_CONFIG_MAP, mount_path=SDG_CA_CERT_PATH + ) + sdg_task.set_env_variable( + SDG_CA_CERT_ENV_VAR_NAME, os.path.join(SDG_CA_CERT_PATH, SDG_CA_CERT_CM_KEY) + ) + sdg_task.after(git_clone_task) mount_pvc( task=sdg_task, @@ -349,6 +366,14 @@ def pipeline( ) use_secret_as_env(run_mt_bench_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"}) + use_config_map_as_volume( + run_mt_bench_task, JUDGE_CONFIG_MAP, mount_path=JUDGE_CA_CERT_PATH + ) + run_mt_bench_task.set_env_variable( + JUDGE_CA_CERT_ENV_VAR_NAME, + os.path.join(JUDGE_CA_CERT_PATH, JUDGE_CA_CERT_CM_KEY), + ) + # uncomment if updating image with same tag # set_image_pull_policy(run_mt_bench_task, "Always") @@ -391,6 +416,14 @@ def pipeline( use_secret_as_env(final_eval_task, JUDGE_SECRET, {"api_key": "JUDGE_API_KEY"}) + 
use_config_map_as_volume( + final_eval_task, JUDGE_CONFIG_MAP, mount_path=JUDGE_CA_CERT_PATH + ) + final_eval_task.set_env_variable( + JUDGE_CA_CERT_ENV_VAR_NAME, + os.path.join(JUDGE_CA_CERT_PATH, JUDGE_CA_CERT_CM_KEY), + ) + final_eval_task.after(run_mt_bench_task) final_eval_task.set_accelerator_type("nvidia.com/gpu") final_eval_task.set_accelerator_limit(1) diff --git a/pipeline.yaml b/pipeline.yaml index bdcc742d..efce07cb 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -1144,24 +1144,19 @@ deploymentSpec: \ few_shots: int,\n batch_size: str,\n merge_system_user_message:\ \ bool,\n candidate_model: str = None,\n taxonomy_path: str = \"/input/taxonomy\"\ ,\n sdg_path: str = \"/input/sdg\",\n):\n import json\n import\ - \ os\n import subprocess\n\n import torch\n from instructlab.eval.mmlu\ - \ import MMLUBranchEvaluator\n from instructlab.eval.mt_bench import\ - \ MTBenchBranchEvaluator\n from instructlab.model.evaluate import qa_pairs_to_qna_to_avg_scores,\ - \ sort_score\n\n if judge_ca_cert := os.getenv(\"JUDGE_CA_CERT_PATH\"\ - ):\n import httpx\n import openai\n\n # Create a custom\ - \ HTTP client\n class CustomHttpClient(httpx.Client):\n \ - \ def __init__(self, *args, **kwargs):\n # Use the custom\ - \ CA certificate\n kwargs.setdefault(\"verify\", judge_ca_cert)\n\ - \ super().__init__(*args, **kwargs)\n\n # Create a\ - \ new OpenAI class that uses the custom HTTP client\n class CustomOpenAI(openai.OpenAI):\n\ - \ def __init__(self, *args, **kwargs):\n custom_client\ - \ = CustomHttpClient()\n super().__init__(http_client=custom_client,\ - \ *args, **kwargs)\n\n # Monkey patch the OpenAI class in the openai\ - \ module, so that the eval lib can use it\n openai.OpenAI = CustomOpenAI\n\ - \n print(\"Starting Final Eval...\")\n\n def launch_vllm(\n \ - \ model_path: str, gpu_count: int, retries: int = 120, delay: int = 10\n\ - \ ) -> tuple:\n import subprocess\n import sys\n \ - \ import time\n\n import requests\n from instructlab.model.backends.common\ + \ os\n import subprocess\n\n import httpx\n import torch\n from\ + \ instructlab.eval.mmlu import MMLUBranchEvaluator\n from instructlab.eval.mt_bench\ + \ import MTBenchBranchEvaluator\n from instructlab.model.evaluate import\ + \ qa_pairs_to_qna_to_avg_scores, sort_score\n\n judge_api_key = os.getenv(\"\ + JUDGE_API_KEY\", \"\")\n judge_model_name = os.getenv(\"JUDGE_NAME\"\ + )\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\")\n judge_ca_cert_path\ + \ = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls = os.path.exists(judge_ca_cert_path)\ + \ and (\n os.path.getsize(judge_ca_cert_path) > 0\n )\n judge_http_client\ + \ = httpx.Client(verify=judge_ca_cert_path) if use_tls else None\n\n \ + \ print(\"Starting Final Eval...\")\n\n def launch_vllm(\n model_path:\ + \ str, gpu_count: int, retries: int = 120, delay: int = 10\n ) -> tuple:\n\ + \ import subprocess\n import sys\n import time\n\n\ + \ import requests\n from instructlab.model.backends.common\ \ import free_tcp_ipv4_port\n\n free_port = free_tcp_ipv4_port(\"\ 127.0.0.1\")\n port = str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\ \n\n command = [\n sys.executable,\n \"-m\"\ @@ -1335,11 +1330,12 @@ deploymentSpec: \ for branch {branch}...\"\n )\n vllm_process, vllm_server\ \ = launch_vllm(m_path, gpu_count)\n\n evaluator.gen_answers(\n \ \ server_url=vllm_server,\n serving_gpus=gpu_count,\n\ - \ max_workers=max_workers,\n )\n\n shutdown_vllm(vllm_process)\n\ - \n print(f\"Evaluating answers for branch {branch}...\")\n \ - \ overall_score, qa_pairs, error_rate = 
evaluator.judge_answers(\n \ - \ server_url=judge_endpoint,\n api_key=judge_api_key,\n\ - \ serving_gpus=gpu_count,\n max_workers=max_workers,\n\ + \ max_workers=max_workers,\n http_client=judge_http_client,\n\ + \ )\n\n shutdown_vllm(vllm_process)\n\n print(f\"Evaluating\ + \ answers for branch {branch}...\")\n overall_score, qa_pairs, error_rate\ + \ = evaluator.judge_answers(\n server_url=judge_endpoint,\n \ + \ api_key=judge_api_key,\n serving_gpus=gpu_count,\n\ + \ max_workers=max_workers,\n http_client=judge_http_client,\n\ \ )\n\n qa_pairs_and_errors.append((overall_score, qa_pairs,\ \ error_rate))\n\n overall_score, qa_pairs, error_rate = qa_pairs_and_errors[0]\n\ \ base_overall_score, base_qa_pairs, base_error_rate = qa_pairs_and_errors[1]\n\ @@ -1373,6 +1369,8 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp + - name: JUDGE_CA_CERT_PATH + value: /tmp/cert/ca.crt image: registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1 resources: accelerator: @@ -1404,39 +1402,33 @@ deploymentSpec: \ max_workers: str,\n models_folder: str,\n output_path: str =\ \ \"/output/mt_bench_data.json\",\n best_score_file: Optional[str] =\ \ None,\n) -> NamedTuple(\"outputs\", best_model=str, best_score=float):\n\ - \ import json\n import os\n import subprocess\n\n import torch\n\ - \ from instructlab.eval.mt_bench import MTBenchEvaluator\n\n if judge_ca_cert\ - \ := os.getenv(\"JUDGE_CA_CERT_PATH\"):\n import httpx\n import\ - \ openai\n\n # Create a custom HTTP client\n class CustomHttpClient(httpx.Client):\n\ - \ def __init__(self, *args, **kwargs):\n # Use\ - \ the custom CA certificate\n kwargs.setdefault(\"verify\"\ - , judge_ca_cert)\n super().__init__(*args, **kwargs)\n\n\ - \ # Create a new OpenAI class that uses the custom HTTP client\n\ - \ class CustomOpenAI(openai.OpenAI):\n def __init__(self,\ - \ *args, **kwargs):\n custom_client = CustomHttpClient()\n\ - \ super().__init__(http_client=custom_client, *args, **kwargs)\n\ - \n # Monkey patch the OpenAI class in the openai module, so that\ - \ the eval lib can use it\n openai.OpenAI = CustomOpenAI\n\n def\ - \ launch_vllm(\n model_path: str, gpu_count: int, retries: int =\ - \ 120, delay: int = 10\n ) -> tuple:\n import subprocess\n \ - \ import sys\n import time\n\n import requests\n \ - \ from instructlab.model.backends.common import free_tcp_ipv4_port\n\n\ - \ free_port = free_tcp_ipv4_port(\"127.0.0.1\")\n port = str(free_port)\n\ - \ vllm_server = f\"http://127.0.0.1:{port}/v1\"\n\n command\ - \ = [\n sys.executable,\n \"-m\",\n \"\ - vllm.entrypoints.openai.api_server\",\n \"--port\",\n \ - \ port,\n \"--model\",\n model_path,\n \ - \ ]\n if gpu_count > 0:\n command += [\n \ - \ \"--tensor-parallel-size\",\n str(gpu_count),\n \ - \ ]\n\n process = subprocess.Popen(args=command)\n\n \ - \ print(f\"Waiting for vLLM server to start at {vllm_server}...\")\n\n\ - \ for attempt in range(retries):\n try:\n \ - \ response = requests.get(f\"{vllm_server}/models\")\n \ - \ if response.status_code == 200:\n print(f\"vLLM server\ - \ is up and running at {vllm_server}.\")\n return process,\ - \ vllm_server\n except requests.ConnectionError:\n \ - \ pass\n\n print(\n f\"Server not available\ - \ yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\"\ + \ import json\n import os\n import subprocess\n\n import httpx\n\ + \ import torch\n from instructlab.eval.mt_bench import MTBenchEvaluator\n\ + \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\ + \ = os.getenv(\"JUDGE_NAME\")\n 
judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\ + )\n judge_ca_cert_path = os.getenv(\"JUDGE_CA_CERT_PATH\")\n use_tls\ + \ = os.path.exists(judge_ca_cert_path) and (\n os.path.getsize(judge_ca_cert_path)\ + \ > 0\n )\n judge_http_client = httpx.Client(verify=judge_ca_cert_path)\ + \ if use_tls else None\n\n def launch_vllm(\n model_path: str,\ + \ gpu_count: int, retries: int = 120, delay: int = 10\n ) -> tuple:\n\ + \ import subprocess\n import sys\n import time\n\n\ + \ import requests\n from instructlab.model.backends.common\ + \ import free_tcp_ipv4_port\n\n free_port = free_tcp_ipv4_port(\"\ + 127.0.0.1\")\n port = str(free_port)\n vllm_server = f\"http://127.0.0.1:{port}/v1\"\ + \n\n command = [\n sys.executable,\n \"-m\"\ + ,\n \"vllm.entrypoints.openai.api_server\",\n \"--port\"\ + ,\n port,\n \"--model\",\n model_path,\n\ + \ ]\n if gpu_count > 0:\n command += [\n \ + \ \"--tensor-parallel-size\",\n str(gpu_count),\n\ + \ ]\n\n process = subprocess.Popen(args=command)\n\n \ + \ print(f\"Waiting for vLLM server to start at {vllm_server}...\"\ + )\n\n for attempt in range(retries):\n try:\n \ + \ response = requests.get(f\"{vllm_server}/models\")\n \ + \ if response.status_code == 200:\n print(f\"vLLM\ + \ server is up and running at {vllm_server}.\")\n return\ + \ process, vllm_server\n except requests.ConnectionError:\n \ + \ pass\n\n print(\n f\"Server not\ + \ available yet, retrying in {delay} seconds (Attempt {attempt + 1}/{retries})...\"\ \n )\n time.sleep(delay)\n\n raise RuntimeError(\n\ \ f\"Failed to start vLLM server at {vllm_server} after {retries}\ \ retries.\"\n )\n\n def shutdown_vllm(process: subprocess.Popen,\ @@ -1460,9 +1452,7 @@ deploymentSpec: \ gpu_available\n else \"No GPU available\"\n )\n gpu_count\ \ = torch.cuda.device_count() if gpu_available else 0\n\n print(f\"GPU\ \ Available: {gpu_available}, {gpu_name}\")\n\n models_list = os.listdir(models_folder)\n\ - \n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\ - \ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\ - )\n\n scores = {}\n all_mt_bench_data = []\n\n # generate_answers,judgment\ + \n scores = {}\n all_mt_bench_data = []\n\n # generate_answers,judgment\ \ uses a magic word for its mt_bench evaluator - 'auto'\n # with 'auto',\ \ number of gpus allocated for serving is calculated based on environment\n\ \ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\ @@ -1480,25 +1470,25 @@ deploymentSpec: \ output_dir=\"/tmp/eval_output\",\n merge_system_user_message=merge_system_user_message,\n\ \ )\n\n evaluator.gen_answers(\n server_url=vllm_server,\n\ \ serving_gpus=gpu_count,\n max_workers=max_workers,\n\ - \ )\n\n shutdown_vllm(vllm_process)\n\n overall_score,\ - \ qa_pairs, turn_scores, error_rate = evaluator.judge_answers(\n \ - \ server_url=judge_endpoint,\n api_key=judge_api_key,\n \ - \ serving_gpus=gpu_count,\n max_workers=max_workers,\n\ - \ )\n\n mt_bench_data = {\n \"report_title\": \"\ - SKILLS EVALUATION REPORT\",\n \"model\": model_path,\n \ - \ \"judge_model\": judge_model_name,\n \"overall_score\"\ - : overall_score,\n \"turn_scores\": turn_scores,\n \ - \ \"qa_scores\": qa_pairs,\n \"error_rate\": error_rate,\n \ - \ }\n\n all_mt_bench_data.append(mt_bench_data)\n scores[model_path]\ - \ = overall_score\n\n with open(output_path, \"w\", encoding=\"utf-8\"\ - ) as f:\n json.dump(all_mt_bench_data, f, indent=4)\n\n outputs\ - \ = NamedTuple(\"outputs\", best_model=str, best_score=float)\n best_model\ - \ 
= max(scores, key=scores.get)\n best_score = scores[best_model]\n \ - \ if best_score_file:\n with open(best_score_file, \"w\", encoding=\"\ - utf-8\") as f:\n json.dump({\"best_model\": best_model, \"best_score\"\ - : best_score}, f, indent=4)\n\n # Rename the best model directory to\ - \ \"candidate_model\" for the next step\n # So we know which model to\ - \ use for the final evaluation\n if os.path.exists(os.path.join(models_folder,\ + \ http_client=judge_http_client,\n )\n\n shutdown_vllm(vllm_process)\n\ + \n overall_score, qa_pairs, turn_scores, error_rate = evaluator.judge_answers(\n\ + \ server_url=judge_endpoint,\n api_key=judge_api_key,\n\ + \ serving_gpus=gpu_count,\n max_workers=max_workers,\n\ + \ http_client=judge_http_client,\n )\n\n mt_bench_data\ + \ = {\n \"report_title\": \"SKILLS EVALUATION REPORT\",\n \ + \ \"model\": model_path,\n \"judge_model\": judge_model_name,\n\ + \ \"overall_score\": overall_score,\n \"turn_scores\"\ + : turn_scores,\n \"qa_scores\": qa_pairs,\n \"error_rate\"\ + : error_rate,\n }\n\n all_mt_bench_data.append(mt_bench_data)\n\ + \ scores[model_path] = overall_score\n\n with open(output_path,\ + \ \"w\", encoding=\"utf-8\") as f:\n json.dump(all_mt_bench_data,\ + \ f, indent=4)\n\n outputs = NamedTuple(\"outputs\", best_model=str,\ + \ best_score=float)\n best_model = max(scores, key=scores.get)\n best_score\ + \ = scores[best_model]\n if best_score_file:\n with open(best_score_file,\ + \ \"w\", encoding=\"utf-8\") as f:\n json.dump({\"best_model\"\ + : best_model, \"best_score\": best_score}, f, indent=4)\n\n # Rename\ + \ the best model directory to \"candidate_model\" for the next step\n \ + \ # So we know which model to use for the final evaluation\n if os.path.exists(os.path.join(models_folder,\ \ \"candidate_model\")):\n print(\"candidate_model already exists.\ \ Skipping renaming\")\n else:\n os.rename(\n os.path.join(models_folder,\ \ best_model),\n os.path.join(models_folder, \"candidate_model\"\ @@ -1509,6 +1499,8 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp + - name: JUDGE_CA_CERT_PATH + value: /tmp/cert/ca.crt image: registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1 resources: accelerator: @@ -1536,11 +1528,13 @@ deploymentSpec: \ *\n\ndef sdg_op(\n num_instructions_to_generate: int,\n pipeline:\ \ str,\n repo_branch: Optional[str],\n repo_pr: Optional[int],\n \ \ taxonomy_path: str = \"/data/taxonomy\",\n sdg_path: str = \"/data/sdg\"\ - ,\n sdg_sampling_size: float = 1.0,\n):\n from os import getenv, path\n\ - \n import instructlab.sdg\n import openai\n import yaml\n\n \ - \ api_key = getenv(\"api_key\")\n model = getenv(\"model\")\n endpoint\ - \ = getenv(\"endpoint\")\n\n if sdg_ca_cert := getenv(\"SDG_CA_CERT_PATH\"\ - ):\n import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert)\n\ + ,\n sdg_sampling_size: float = 1.0,\n):\n import os\n from os import\ + \ getenv, path\n\n import instructlab.sdg\n import openai\n import\ + \ yaml\n\n api_key = getenv(\"api_key\")\n model = getenv(\"model\"\ + )\n endpoint = getenv(\"endpoint\")\n\n sdg_ca_cert_path = getenv(\"\ + SDG_CA_CERT_PATH\")\n use_tls = os.path.exists(sdg_ca_cert_path) and\ + \ (\n os.path.getsize(sdg_ca_cert_path) > 0\n )\n if use_tls:\n\ + \ import httpx\n\n custom_http_client = httpx.Client(verify=sdg_ca_cert_path)\n\ \ client = openai.OpenAI(\n base_url=endpoint, api_key=api_key,\ \ http_client=custom_http_client\n )\n else:\n client =\ \ openai.OpenAI(base_url=endpoint, api_key=api_key)\n\n taxonomy_base\ @@ -1619,6 
+1613,8 @@ deploymentSpec: value: /tmp - name: HF_HOME value: /tmp + - name: SDG_CA_CERT_PATH + value: /tmp/cert/ca.crt image: registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1 exec-sdg-to-artifact-op: container: @@ -2293,6 +2289,10 @@ platforms: envVar: JUDGE_ENDPOINT - configMapKey: model envVar: JUDGE_NAME + configMapAsVolume: + - configMapName: judge-server + mountPath: /tmp/cert + optional: false pvcMount: - mountPath: /output taskOutputParameter: @@ -2319,6 +2319,10 @@ platforms: envVar: JUDGE_ENDPOINT - configMapKey: model envVar: JUDGE_NAME + configMapAsVolume: + - configMapName: judge-server + mountPath: /tmp/cert + optional: false pvcMount: - mountPath: /output taskOutputParameter: @@ -2337,6 +2341,10 @@ platforms: envVar: endpoint - configMapKey: model envVar: model + configMapAsVolume: + - configMapName: teacher-server + mountPath: /tmp/cert + optional: false pvcMount: - mountPath: /data taskOutputParameter: diff --git a/sdg/components.py b/sdg/components.py index e3370e67..3d891a1a 100644 --- a/sdg/components.py +++ b/sdg/components.py @@ -37,6 +37,7 @@ def sdg_op( sdg_path: str = "/data/sdg", sdg_sampling_size: float = 1.0, ): + import os from os import getenv, path import instructlab.sdg @@ -47,10 +48,14 @@ def sdg_op( model = getenv("model") endpoint = getenv("endpoint") - if sdg_ca_cert := getenv("SDG_CA_CERT_PATH"): + sdg_ca_cert_path = getenv("SDG_CA_CERT_PATH") + use_tls = os.path.exists(sdg_ca_cert_path) and ( + os.path.getsize(sdg_ca_cert_path) > 0 + ) + if use_tls: import httpx - custom_http_client = httpx.Client(verify=sdg_ca_cert) + custom_http_client = httpx.Client(verify=sdg_ca_cert_path) client = openai.OpenAI( base_url=endpoint, api_key=api_key, http_client=custom_http_client ) diff --git a/training/components.py b/training/components.py index 84143d01..20bf353f 100644 --- a/training/components.py +++ b/training/components.py @@ -1,7 +1,7 @@ # type: ignore # pylint: disable=import-outside-toplevel,missing-function-docstring -from typing import NamedTuple, Optional +from typing import Optional from kfp import dsl diff --git a/training/run_main_ds.py b/training/run_main_ds.py index 1b782eeb..9c4fb535 100644 --- a/training/run_main_ds.py +++ b/training/run_main_ds.py @@ -20,8 +20,8 @@ def run_main_ds(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: f"--nproc_per_node={torch_args.nproc_per_node}", f"--rdzv_id={torch_args.rdzv_id}", f"--rdzv_endpoint={torch_args.rdzv_endpoint}", - f"-m", - f"instructlab.training.main_ds", + "-m", + "instructlab.training.main_ds", f"--model_name_or_path={train_args.model_path}", f"--data_path={train_args.data_output_dir}/data.jsonl", f"--output_dir={train_args.ckpt_output_dir}", @@ -30,7 +30,7 @@ def run_main_ds(torch_args: TorchrunArgs, train_args: TrainingArgs) -> None: f"--learning_rate={train_args.learning_rate}", f"--num_warmup_steps={train_args.warmup_steps}", f"--save_samples={train_args.save_samples}", - f"--log_level=INFO", + "--log_level=INFO", f"--max_batch_len={train_args.max_batch_len}", f"--seed={train_args.random_seed}", f"--chat-tmpl-path={train_args.chat_tmpl_path}", diff --git a/utils/components.py b/utils/components.py index 8ddabcd7..35f38481 100644 --- a/utils/components.py +++ b/utils/components.py @@ -3,7 +3,7 @@ from kfp import dsl -from .consts import PYTHON_IMAGE, RHELAI_IMAGE, TOOLBOX_IMAGE +from .consts import RHELAI_IMAGE, TOOLBOX_IMAGE @dsl.container_component
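
The change replaces the earlier monkey-patched `openai.OpenAI` class with an explicit, optional `httpx.Client`. Below is a minimal, self-contained sketch of the pattern the updated components share; the `make_tls_client` helper name is illustrative only (the components inline this logic), and the empty-string default for the environment variable is an assumption added here so the check is safe even when the variable is unset.

```python
# Sketch of the optional-TLS client pattern used by sdg_op and the eval components.
import os
from typing import Optional

import httpx
import openai


def make_tls_client(env_var: str) -> Optional[httpx.Client]:
    """Return an httpx.Client that trusts the mounted CA bundle, or None."""
    # The ca.crt key may be absent (no file at the mount) or empty; only a
    # non-empty file enables TLS verification.
    ca_cert_path = os.getenv(env_var, "")
    use_tls = os.path.exists(ca_cert_path) and os.path.getsize(ca_cert_path) > 0
    return httpx.Client(verify=ca_cert_path) if use_tls else None


if __name__ == "__main__":
    # Mirrors how sdg_op builds its OpenAI client; "endpoint" and "api_key"
    # are injected from the teacher-server ConfigMap and Secret.
    custom_http_client = make_tls_client("SDG_CA_CERT_PATH")
    client = openai.OpenAI(
        base_url=os.getenv("endpoint"),
        api_key=os.getenv("api_key", "not-set"),
        http_client=custom_http_client,  # None falls back to the SDK default
    )
```

In the evaluation components the same client is not attached to an OpenAI client at all; it is handed to the evaluator through the new `http_client` argument of `gen_answers` and `judge_answers`.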
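
On the pipeline side, each consuming task gets the model server's ConfigMap mounted as a volume and an environment variable pointing at the `ca.crt` key inside it. The sketch below shows only that wiring around a hypothetical stub component; the ConfigMap name, key, env var, and mount path match the constants introduced in `pipeline.py`.

```python
# Trimmed-down sketch of the CA-cert wiring added for the judge tasks.
import os

from kfp import dsl, kubernetes

JUDGE_CONFIG_MAP = "judge-server"
JUDGE_CA_CERT_CM_KEY = "ca.crt"
JUDGE_CA_CERT_ENV_VAR_NAME = "JUDGE_CA_CERT_PATH"
JUDGE_CA_CERT_PATH = "/tmp/cert"


@dsl.component(base_image="python:3.11")
def run_mt_bench_stub():
    import os

    # The real component checks this file and builds an httpx.Client from it.
    print("CA bundle expected at:", os.getenv("JUDGE_CA_CERT_PATH"))


@dsl.pipeline(name="ca-cert-wiring-sketch")
def pipeline():
    task = run_mt_bench_stub()
    # Mount the judge-server ConfigMap; its ca.crt key appears as /tmp/cert/ca.crt.
    kubernetes.use_config_map_as_volume(
        task, JUDGE_CONFIG_MAP, mount_path=JUDGE_CA_CERT_PATH
    )
    # Point the component at the mounted CA bundle.
    task.set_env_variable(
        JUDGE_CA_CERT_ENV_VAR_NAME,
        os.path.join(JUDGE_CA_CERT_PATH, JUDGE_CA_CERT_CM_KEY),
    )
```

The SDG task is wired the same way with the `teacher-server` ConfigMap, the `SDG_CA_CERT_PATH` variable, and the same `/tmp/cert` mount point.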