From 2d9d60df9aca7990922fc556cc7ec509e4064b9a Mon Sep 17 00:00:00 2001 From: Tomas Coufal Date: Thu, 21 Nov 2024 15:35:03 +0100 Subject: [PATCH] feat: source base model from S3 and provide a helper pipeline to populate the model to s3 default bucket Signed-off-by: Tomas Coufal --- importer-pipeline.yaml | 88 ++++++++++++++++++++++++++++ pipeline.py | 73 ++++++++++++++++++++---- pipeline.yaml | 117 +++++++++++++++++++++----------------- utils/__init__.py | 6 +- utils/components.py | 27 ++++++--- utils/faked/__init__.py | 2 + utils/faked/components.py | 2 +- 7 files changed, 242 insertions(+), 73 deletions(-) create mode 100644 importer-pipeline.yaml diff --git a/importer-pipeline.yaml b/importer-pipeline.yaml new file mode 100644 index 00000000..c4486c7e --- /dev/null +++ b/importer-pipeline.yaml @@ -0,0 +1,88 @@ +# PIPELINE DEFINITION +# Name: instructlab-base-importer +# Description: Helper pipeline to the InstructLab pipeline which allows users to seed/import a new base model +# Inputs: +# release: str [Default: 'latest'] +# repository: str [Default: 'docker://registry.redhat.io/rhelai1/granite-7b-starter'] +components: + comp-ilab-importer-op: + executorLabel: exec-ilab-importer-op + inputDefinitions: + parameters: + release: + parameterType: STRING + repository: + parameterType: STRING + outputDefinitions: + artifacts: + base_model: + artifactType: + schemaTitle: system.Model + schemaVersion: 0.0.1 +deploymentSpec: + executors: + exec-ilab-importer-op: + container: + args: + - ilab --config=DEFAULT model download --repository {{$.inputs.parameters['repository']}} + --release {{$.inputs.parameters['release']}} --model-dir {{$.outputs.artifacts['base_model'].path}} + command: + - /bin/sh + - -c + env: + - name: REGISTRY_AUTH_FILE + value: /mnt/containers/.dockerconfigjson + image: quay.io/redhat-et/ilab:1.2 +pipelineInfo: + description: Helper pipeline to the InstructLab pipeline which allows users to seed/import + a new base model + displayName: InstructLab - base model importer + name: instructlab-base-importer +root: + dag: + tasks: + ilab-importer-op: + cachingOptions: + enableCache: true + componentRef: + name: comp-ilab-importer-op + inputs: + parameters: + release: + componentInputParameter: release + repository: + componentInputParameter: repository + taskInfo: + name: ilab-importer-op + inputDefinitions: + parameters: + release: + defaultValue: latest + description: The revision of the model to download - e.g. a branch, tag, or + commit hash for Hugging Face repositories and tag or commit hash for OCI + repositories. + isOptional: true + parameterType: STRING + repository: + defaultValue: docker://registry.redhat.io/rhelai1/granite-7b-starter + description: Hugging Face or OCI repository of the model to download. 
OCI + repository must have a docker:// prefix + isOptional: true + parameterType: STRING +schemaVersion: 2.1.0 +sdkVersion: kfp-2.9.0 +--- +platforms: + kubernetes: + deploymentSpec: + executors: + exec-ilab-importer-op: + secretAsEnv: + - keyToEnv: + - envVar: HF_TOKEN + secretKey: HF_TOKEN + secretName: hugging-face-token + secretAsVolume: + - mountPath: /mnt/containers + optional: false + secretName: 7033380-ilab-pull-secret diff --git a/pipeline.py b/pipeline.py index 7e129a7f..0f29adc8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -13,6 +13,7 @@ set_image_pull_secrets, use_config_map_as_env, use_secret_as_env, + use_secret_as_volume, ) TEACHER_CONFIG_MAP = "teacher-server" @@ -21,13 +22,16 @@ JUDGE_SECRET = "judge-server" MOCKED_STAGES = ["sdg", "train", "eval"] PIPELINE_FILE_NAME = "pipeline.yaml" +IMPORTER_PIPELINE_FILE_NAME = "importer-pipeline.yaml" SDG_PIPELINE = "simple" IMAGE_PULL_SECRET = "redhat-et-ilab-botty-pull-secret" STANDALONE_TEMPLATE_FILE_NAME = "standalone.tpl" GENERATED_STANDALONE_FILE_NAME = "standalone.py" DEFAULT_REPO_URL = "https://github.com/instructlab/taxonomy.git" KFP_MODEL_SERVER_CM = "sdg/kfp-model-server.yaml" -BASE_MODEL = "ibm-granite/granite-7b-base" + +# FIXME: This value is specific to ocp-beta-test.nerc.mghpcc.org cluster, `ilab` namespace. It is quite cumbersome to copypaste and remember the path every time in dev. This default value should go away once we reach feature freeze. +BASE_MODEL = "s3://ilab-pipeline-b1d4c2b1-ab00-4e7f-b985-697bda3df385/instructlab-base-importer/648f36d0-e3f0-43b8-8adb-530576beb675/ilab-importer-op/model/granite-7b-starter" # eval args MMLU_TASKS_LIST = "mmlu_anatomy,mmlu_astronomy" @@ -51,7 +55,7 @@ SEED = 42 -def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): +def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): """Wrapper for KFP pipeline, which allows for mocking individual stages.""" # Imports for SDG stage @@ -79,7 +83,7 @@ def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): skills_processed_data_to_artifact_op, ) from utils.faked import ( - huggingface_importer_op, + model_to_pvc_op, pvc_to_model_op, pvc_to_mt_bench_op, ) @@ -91,7 +95,7 @@ def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): skills_processed_data_to_artifact_op, ) from utils import ( - huggingface_importer_op, + model_to_pvc_op, pvc_to_model_op, pvc_to_mt_bench_op, ) @@ -239,6 +243,9 @@ def pipeline( # set_image_pull_policy(sdg_task, "Always") # Training stage + model_source_s3_task = dsl.importer( + artifact_uri=sdg_base_model, artifact_class=dsl.Model + ) # We need to pass storage_class_name as "" to use the default StorageClass, if left empty, KFP uses "standard" StorageClass. 
# 'standard' != default StorageClass @@ -250,7 +257,8 @@ def pipeline( size="100Gi", storage_class_name=k8s_storage_class_name, ) - model_to_pvc_task = huggingface_importer_op(repo_name=sdg_base_model) + + model_to_pvc_task = model_to_pvc_op(model=model_source_s3_task.output) model_to_pvc_task.set_caching_options(False) mount_pvc( task=model_to_pvc_task, pvc_name=model_pvc_task.output, mount_path="/model" @@ -459,6 +467,46 @@ def pipeline( return pipeline +def import_base_model_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): + from utils import ilab_importer_op + + @dsl.pipeline( + display_name="InstructLab - base model importer", + name="instructlab-base-importer", + description="Helper pipeline to the InstructLab pipeline which allows users to seed/import a new base model", + ) + def pipeline( + # hf_token_secret: str = "", # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 + # oci_pull_secret: str = "", # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 + repository: str = "docker://registry.redhat.io/rhelai1/granite-7b-starter", + release: str = "latest", + ): + """InstructLab - base model importer. + + Args: + repository: Hugging Face or OCI repository of the model to download. OCI repository must have a docker:// prefix + release: The revision of the model to download - e.g. a branch, tag, or commit hash for Hugging Face repositories and tag or commit hash for OCI repositories. + hf_token_secret: Name of existing Kubernetes secret which contains HF_TOKEN value for Hugging Face repositories. Mandatory for all repositories besides those which belong to the "instructlab" organization. + oci_pull_secret: Name of existing Kubernetes secret of .dockerconfigjson type for OCI repository authentication. + """ + importer_task = ilab_importer_op(repository=repository, release=release) + + # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 + # FIXME: Make env variables optional once implemented upstream: https://github.com/kubeflow/pipelines/issues/11401 + # This pipeline is currently unusable outside of ocp-beta-test.nerc.mghpcc.org cluster, `ilab` namespace due to the hardcoded names... 
+ use_secret_as_env( + importer_task, "hugging-face-token", dict(HF_TOKEN="HF_TOKEN") + ) + importer_task.set_env_variable( + "REGISTRY_AUTH_FILE", "/mnt/containers/.dockerconfigjson" + ) + use_secret_as_volume( + importer_task, "7033380-ilab-pull-secret", mount_path="/mnt/containers" + ) + + return pipeline + + @click.option( "--mock", type=click.Choice(MOCKED_STAGES, case_sensitive=False), @@ -474,11 +522,17 @@ def cli(ctx: click.Context, mock): def generate_pipeline(mock): - p = pipeline_wrapper(mock) + ilab_pipeline = ilab_pipeline_wrapper(mock) + import_base_model_pipeline = import_base_model_pipeline_wrapper(mock) + + pipelines = [ + (ilab_pipeline, PIPELINE_FILE_NAME), + (import_base_model_pipeline, IMPORTER_PIPELINE_FILE_NAME), + ] - with click.progressbar(length=1, label="Generating pipeline") as bar: - compiler.Compiler().compile(p, PIPELINE_FILE_NAME) - bar.update(1) + with click.progressbar(pipelines, label="Generating pipeline") as bar: + for pipeline_func, pipeline_file in bar: + compiler.Compiler().compile(pipeline_func, pipeline_file) @cli.command(name="gen-standalone") @@ -517,7 +571,6 @@ def gen_standalone(): "exec-data-processing-op": 'data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg_path="{DATA_PVC_SDG_PATH}", model_path="{DATA_PVC_MODEL_PATH}", skills_path="{PREPROCESSED_DATA_SKILLS_PATH}", knowledge_path="{PREPROCESSED_DATA_KNOWLEDGE_PATH}")', "exec-sdg-op": 'sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size})', "exec-git-clone-op": {}, - "exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="{REPO_GRANITE_7B_IMAGE}", model_path="{DATA_PVC_MODEL_PATH}")', "exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="{MT_BENCH_SCORES_PATH}",output_path="{MT_BENCH_OUTPUT_PATH}",models_folder="{CANDIDATE_MODEL_PATH_PREFIX}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE})', "exec-run-final-eval-op": 'run_final_eval_op(mmlu_branch_output="{MMLU_BRANCH_SCORES_PATH}", mt_bench_branch_output="{MT_BENCH_BRANCH_SCORES_PATH}", candidate_model="{CANDIDATE_MODEL_PATH}", taxonomy_path="{TAXONOMY_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", base_branch="", candidate_branch="", base_model_dir="{DATA_PVC_MODEL_PATH}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE}, few_shots={FEW_SHOTS}, batch_size="{BATCH_SIZE}")', } diff --git a/pipeline.yaml b/pipeline.yaml index c27bcff9..0dbec02a 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -9,7 +9,7 @@ # k8s_storage_class_name: str [Default: 'nfs-csi'] # mt_bench_max_workers: str [Default: 'auto'] # mt_bench_merge_system_user_message: bool [Default: False] -# sdg_base_model: str [Default: 'ibm-granite/granite-7b-base'] +# sdg_base_model: str [Default: 's3://ilab-pipeline-b1d4c2b1-ab00-4e7f-b985-697bda3df385/instructlab-base-importer/648f36d0-e3f0-43b8-8adb-530576beb675/ilab-importer-op/model/granite-7b-starter'] # sdg_max_batch_len: int [Default: 20000.0] # sdg_pipeline: str [Default: 'simple'] # sdg_repo_branch: str @@ -280,16 +280,18 @@ components: defaultValue: /data/taxonomy isOptional: true parameterType: STRING - comp-huggingface-importer-op: - executorLabel: exec-huggingface-importer-op + comp-importer: + executorLabel: exec-importer inputDefinitions: parameters: - model_path: - 
defaultValue: /model - isOptional: true - parameterType: STRING - repo_name: + uri: parameterType: STRING + outputDefinitions: + artifacts: + artifact: + artifactType: + schemaTitle: system.Model + schemaVersion: 0.0.1 comp-knowledge-processed-data-to-artifact-op: executorLabel: exec-knowledge-processed-data-to-artifact-op inputDefinitions: @@ -304,6 +306,19 @@ components: artifactType: schemaTitle: system.Dataset schemaVersion: 0.0.1 + comp-model-to-pvc-op: + executorLabel: exec-model-to-pvc-op + inputDefinitions: + artifacts: + model: + artifactType: + schemaTitle: system.Model + schemaVersion: 0.0.1 + parameters: + pvc_path: + defaultValue: /model + isOptional: true + parameterType: STRING comp-pvc-to-model-op: executorLabel: exec-pvc-to-model-op inputDefinitions: @@ -654,39 +669,25 @@ deploymentSpec: - /bin/sh - -c image: registry.access.redhat.com/ubi9/toolbox - exec-huggingface-importer-op: + exec-importer: + importer: + artifactUri: + runtimeParameter: uri + typeSchema: + schemaTitle: system.Model + schemaVersion: 0.0.1 + exec-knowledge-processed-data-to-artifact-op: container: args: - - --executor_input - - '{{$}}' - - --function_to_execute - - huggingface_importer_op + - cp -r {{$.inputs.parameters['pvc_path']}} {{$.outputs.artifacts['knowledge_processed_data'].path}} command: - - sh + - /bin/sh - -c - - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ - \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'huggingface_hub'\ - \ && \"$0\" \"$@\"\n" - - sh - - -ec - - 'program_path=$(mktemp -d) - - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef huggingface_importer_op(repo_name: str, model_path: str = \"\ - /model\"):\n from huggingface_hub import snapshot_download\n\n snapshot_download(repo_id=repo_name,\ - \ cache_dir=\"/tmp\", local_dir=model_path)\n\n" - image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111 - exec-knowledge-processed-data-to-artifact-op: + image: registry.access.redhat.com/ubi9/toolbox + exec-model-to-pvc-op: container: args: - - cp -r {{$.inputs.parameters['pvc_path']}} {{$.outputs.artifacts['knowledge_processed_data'].path}} + - cp -r {{$.inputs.artifacts['model'].path}}/* {{$.inputs.parameters['pvc_path']}} command: - /bin/sh - -c @@ -1670,7 +1671,7 @@ root: dependentTasks: - createpvc - createpvc-2 - - huggingface-importer-op + - model-to-pvc-op - sdg-op inputs: parameters: @@ -1744,18 +1745,17 @@ root: componentInputParameter: sdg_repo_url taskInfo: name: git-clone-op - huggingface-importer-op: - cachingOptions: {} + importer: + cachingOptions: + enableCache: true componentRef: - name: comp-huggingface-importer-op - dependentTasks: - - createpvc-2 + name: comp-importer inputs: parameters: - repo_name: + uri: componentInputParameter: sdg_base_model taskInfo: - name: huggingface-importer-op + name: importer knowledge-processed-data-to-artifact-op: cachingOptions: {} componentRef: @@ -1765,6 +1765,21 @@ root: - data-processing-op taskInfo: name: knowledge-processed-data-to-artifact-op + model-to-pvc-op: + cachingOptions: {} + componentRef: + name: comp-model-to-pvc-op + dependentTasks: + - createpvc-2 + - importer + inputs: + artifacts: + model: + taskOutputArtifact: + 
outputArtifactKey: artifact + producerTask: importer + taskInfo: + name: model-to-pvc-op pvc-to-model-op: cachingOptions: {} componentRef: @@ -1803,7 +1818,7 @@ root: - createpvc-2 - createpvc-3 - data-processing-op - - huggingface-importer-op + - model-to-pvc-op inputs: parameters: effective_batch_size: @@ -2045,7 +2060,7 @@ root: isOptional: true parameterType: BOOLEAN sdg_base_model: - defaultValue: ibm-granite/granite-7b-base + defaultValue: s3://ilab-pipeline-b1d4c2b1-ab00-4e7f-b985-697bda3df385/instructlab-base-importer/648f36d0-e3f0-43b8-8adb-530576beb675/ilab-importer-op/model/granite-7b-starter description: SDG parameter. LLM model used to generate the synthetic dataset isOptional: true parameterType: STRING @@ -2193,18 +2208,18 @@ platforms: taskOutputParameter: outputParameterKey: name producerTask: createpvc - exec-huggingface-importer-op: - pvcMount: - - mountPath: /model - taskOutputParameter: - outputParameterKey: name - producerTask: createpvc-2 exec-knowledge-processed-data-to-artifact-op: pvcMount: - mountPath: /data taskOutputParameter: outputParameterKey: name producerTask: createpvc + exec-model-to-pvc-op: + pvcMount: + - mountPath: /model + taskOutputParameter: + outputParameterKey: name + producerTask: createpvc-2 exec-pvc-to-model-op: pvcMount: - mountPath: /output diff --git a/utils/__init__.py b/utils/__init__.py index 1e8897cf..7dcc94ed 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -1,13 +1,15 @@ from . import faked from .components import ( - huggingface_importer_op, + ilab_importer_op, + model_to_pvc_op, pvc_to_model_op, pvc_to_mt_bench_op, ) __all__ = [ - "huggingface_importer_op", + "model_to_pvc_op", "pvc_to_mt_bench_op", "pvc_to_model_op", + "ilab_importer_op", "faked", ] diff --git a/utils/components.py b/utils/components.py index 30581818..8ddabcd7 100644 --- a/utils/components.py +++ b/utils/components.py @@ -3,7 +3,7 @@ from kfp import dsl -from .consts import PYTHON_IMAGE, TOOLBOX_IMAGE +from .consts import PYTHON_IMAGE, RHELAI_IMAGE, TOOLBOX_IMAGE @dsl.container_component @@ -24,12 +24,21 @@ def pvc_to_model_op(model: dsl.Output[dsl.Model], pvc_path: str): ) -@dsl.component( - base_image=PYTHON_IMAGE, - install_kfp_package=False, - packages_to_install=["huggingface_hub"], -) -def huggingface_importer_op(repo_name: str, model_path: str = "/model"): - from huggingface_hub import snapshot_download +@dsl.container_component +def model_to_pvc_op(model: dsl.Input[dsl.Model], pvc_path: str = "/model"): + return dsl.ContainerSpec( + TOOLBOX_IMAGE, + ["/bin/sh", "-c"], + [f"cp -r {model.path}/* {pvc_path}"], + ) - snapshot_download(repo_id=repo_name, cache_dir="/tmp", local_dir=model_path) + +@dsl.container_component +def ilab_importer_op(repository: str, release: str, base_model: dsl.Output[dsl.Model]): + return dsl.ContainerSpec( + RHELAI_IMAGE, + ["/bin/sh", "-c"], + [ + f"ilab --config=DEFAULT model download --repository {repository} --release {release} --model-dir {base_model.path}" + ], + ) diff --git a/utils/faked/__init__.py b/utils/faked/__init__.py index b49686c8..bb7ef2b2 100644 --- a/utils/faked/__init__.py +++ b/utils/faked/__init__.py @@ -1,4 +1,5 @@ from .components import ( + model_to_pvc_op, pvc_to_model_op, pvc_to_mt_bench_op, ) @@ -6,4 +7,5 @@ __all__ = [ "pvc_to_mt_bench_op", "pvc_to_model_op", + "model_to_pvc_op", ] diff --git a/utils/faked/components.py b/utils/faked/components.py index fa964877..65e4c74d 100644 --- a/utils/faked/components.py +++ b/utils/faked/components.py @@ -6,7 +6,7 @@ 
@dsl.component(base_image=PYTHON_IMAGE, install_kfp_package=False) -def huggingface_importer_op(repo_name: str, model_path: str = "/model"): +def model_to_pvc_op(model: dsl.Input[dsl.Model], pvc_path: str = "/model"): return
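
Note on the pattern introduced above: pipeline.py now sources the base model with KFP's built-in dsl.importer, which materialises an existing object-store URI as a system.Model artifact, and a plain container component then copies that artifact onto the training PVC. A condensed, self-contained sketch of the same pattern follows; it reuses the toolbox image seen elsewhere in the repo, omits the PVC mount for brevity, and is an illustration rather than part of the patch:

from kfp import compiler, dsl


@dsl.container_component
def model_to_pvc_op(model: dsl.Input[dsl.Model], pvc_path: str = "/model"):
    # Copy the imported Model artifact into the path where the PVC is mounted
    # (in the real pipeline, kubernetes.mount_pvc attaches the PVC at /model).
    return dsl.ContainerSpec(
        "registry.access.redhat.com/ubi9/toolbox",
        ["/bin/sh", "-c"],
        [f"cp -r {model.path}/* {pvc_path}"],
    )


@dsl.pipeline(name="importer-pattern-demo")
def demo(base_model_uri: str = "s3://<bucket>/<path>/granite-7b-starter"):
    # dsl.importer turns an existing S3 location into a Model artifact
    # without re-downloading the model from Hugging Face or an OCI registry.
    source = dsl.importer(artifact_uri=base_model_uri, artifact_class=dsl.Model)
    model_to_pvc_op(model=source.output)


if __name__ == "__main__":
    compiler.Compiler().compile(demo, "importer-pattern-demo.yaml")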
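
A hedged sketch of how the two compiled definitions are meant to be chained from the KFP SDK; the endpoint, bucket, and run path below are placeholders for environment-specific values and are not supplied by this patch:

from kfp import Client

# Assumption: host and authentication are cluster-specific.
client = Client(host="https://<your-kfp-endpoint>")

# 1. Seed the default object-store bucket with a base model.
client.create_run_from_pipeline_package(
    "importer-pipeline.yaml",
    arguments={
        "repository": "docker://registry.redhat.io/rhelai1/granite-7b-starter",
        "release": "latest",
    },
)

# 2. After the importer run completes, pass the S3 URI of its `base_model`
#    output artifact to the main pipeline as `sdg_base_model`.
client.create_run_from_pipeline_package(
    "pipeline.yaml",
    arguments={
        "sdg_base_model": "s3://<bucket>/instructlab-base-importer/<run-id>/ilab-importer-op/model/granite-7b-starter",
    },
)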