From 2d9d60df9aca7990922fc556cc7ec509e4064b9a Mon Sep 17 00:00:00 2001 From: Tomas Coufal Date: Thu, 21 Nov 2024 15:35:03 +0100 Subject: [PATCH] feat: source base model from S3 and provide a helper pipeline to populate the model to s3 default bucket Signed-off-by: Tomas Coufal --- importer-pipeline.yaml | 88 ++++++++++++++++++++++++++++ pipeline.py | 73 ++++++++++++++++++++---- pipeline.yaml | 117 +++++++++++++++++++++----------------- utils/__init__.py | 6 +- utils/components.py | 27 ++++++--- utils/faked/__init__.py | 2 + utils/faked/components.py | 2 +- 7 files changed, 242 insertions(+), 73 deletions(-) create mode 100644 importer-pipeline.yaml diff --git a/importer-pipeline.yaml b/importer-pipeline.yaml new file mode 100644 index 00000000..c4486c7e --- /dev/null +++ b/importer-pipeline.yaml @@ -0,0 +1,88 @@ +# PIPELINE DEFINITION +# Name: instructlab-base-importer +# Description: Helper pipeline to the InstructLab pipeline which allows users to seed/import a new base model +# Inputs: +# release: str [Default: 'latest'] +# repository: str [Default: 'docker://registry.redhat.io/rhelai1/granite-7b-starter'] +components: + comp-ilab-importer-op: + executorLabel: exec-ilab-importer-op + inputDefinitions: + parameters: + release: + parameterType: STRING + repository: + parameterType: STRING + outputDefinitions: + artifacts: + base_model: + artifactType: + schemaTitle: system.Model + schemaVersion: 0.0.1 +deploymentSpec: + executors: + exec-ilab-importer-op: + container: + args: + - ilab --config=DEFAULT model download --repository {{$.inputs.parameters['repository']}} + --release {{$.inputs.parameters['release']}} --model-dir {{$.outputs.artifacts['base_model'].path}} + command: + - /bin/sh + - -c + env: + - name: REGISTRY_AUTH_FILE + value: /mnt/containers/.dockerconfigjson + image: quay.io/redhat-et/ilab:1.2 +pipelineInfo: + description: Helper pipeline to the InstructLab pipeline which allows users to seed/import + a new base model + displayName: InstructLab - base model importer + name: instructlab-base-importer +root: + dag: + tasks: + ilab-importer-op: + cachingOptions: + enableCache: true + componentRef: + name: comp-ilab-importer-op + inputs: + parameters: + release: + componentInputParameter: release + repository: + componentInputParameter: repository + taskInfo: + name: ilab-importer-op + inputDefinitions: + parameters: + release: + defaultValue: latest + description: The revision of the model to download - e.g. a branch, tag, or + commit hash for Hugging Face repositories and tag or commit hash for OCI + repositories. + isOptional: true + parameterType: STRING + repository: + defaultValue: docker://registry.redhat.io/rhelai1/granite-7b-starter + description: Hugging Face or OCI repository of the model to download. 
OCI + repository must have a docker:// prefix + isOptional: true + parameterType: STRING +schemaVersion: 2.1.0 +sdkVersion: kfp-2.9.0 +--- +platforms: + kubernetes: + deploymentSpec: + executors: + exec-ilab-importer-op: + secretAsEnv: + - keyToEnv: + - envVar: HF_TOKEN + secretKey: HF_TOKEN + secretName: hugging-face-token + secretAsVolume: + - mountPath: /mnt/containers + optional: false + secretName: 7033380-ilab-pull-secret diff --git a/pipeline.py b/pipeline.py index 7e129a7f..0f29adc8 100644 --- a/pipeline.py +++ b/pipeline.py @@ -13,6 +13,7 @@ set_image_pull_secrets, use_config_map_as_env, use_secret_as_env, + use_secret_as_volume, ) TEACHER_CONFIG_MAP = "teacher-server" @@ -21,13 +22,16 @@ JUDGE_SECRET = "judge-server" MOCKED_STAGES = ["sdg", "train", "eval"] PIPELINE_FILE_NAME = "pipeline.yaml" +IMPORTER_PIPELINE_FILE_NAME = "importer-pipeline.yaml" SDG_PIPELINE = "simple" IMAGE_PULL_SECRET = "redhat-et-ilab-botty-pull-secret" STANDALONE_TEMPLATE_FILE_NAME = "standalone.tpl" GENERATED_STANDALONE_FILE_NAME = "standalone.py" DEFAULT_REPO_URL = "https://github.com/instructlab/taxonomy.git" KFP_MODEL_SERVER_CM = "sdg/kfp-model-server.yaml" -BASE_MODEL = "ibm-granite/granite-7b-base" + +# FIXME: This value is specific to ocp-beta-test.nerc.mghpcc.org cluster, `ilab` namespace. It is quite cumbersome to copypaste and remember the path every time in dev. This default value should go away once we reach feature freeze. +BASE_MODEL = "s3://ilab-pipeline-b1d4c2b1-ab00-4e7f-b985-697bda3df385/instructlab-base-importer/648f36d0-e3f0-43b8-8adb-530576beb675/ilab-importer-op/model/granite-7b-starter" # eval args MMLU_TASKS_LIST = "mmlu_anatomy,mmlu_astronomy" @@ -51,7 +55,7 @@ SEED = 42 -def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): +def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): """Wrapper for KFP pipeline, which allows for mocking individual stages.""" # Imports for SDG stage @@ -79,7 +83,7 @@ def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): skills_processed_data_to_artifact_op, ) from utils.faked import ( - huggingface_importer_op, + model_to_pvc_op, pvc_to_model_op, pvc_to_mt_bench_op, ) @@ -91,7 +95,7 @@ def pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): skills_processed_data_to_artifact_op, ) from utils import ( - huggingface_importer_op, + model_to_pvc_op, pvc_to_model_op, pvc_to_mt_bench_op, ) @@ -239,6 +243,9 @@ def pipeline( # set_image_pull_policy(sdg_task, "Always") # Training stage + model_source_s3_task = dsl.importer( + artifact_uri=sdg_base_model, artifact_class=dsl.Model + ) # We need to pass storage_class_name as "" to use the default StorageClass, if left empty, KFP uses "standard" StorageClass. 
# 'standard' != default StorageClass @@ -250,7 +257,8 @@ def pipeline( size="100Gi", storage_class_name=k8s_storage_class_name, ) - model_to_pvc_task = huggingface_importer_op(repo_name=sdg_base_model) + + model_to_pvc_task = model_to_pvc_op(model=model_source_s3_task.output) model_to_pvc_task.set_caching_options(False) mount_pvc( task=model_to_pvc_task, pvc_name=model_pvc_task.output, mount_path="/model" @@ -459,6 +467,46 @@ def pipeline( return pipeline +def import_base_model_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): + from utils import ilab_importer_op + + @dsl.pipeline( + display_name="InstructLab - base model importer", + name="instructlab-base-importer", + description="Helper pipeline to the InstructLab pipeline which allows users to seed/import a new base model", + ) + def pipeline( + # hf_token_secret: str = "", # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 + # oci_pull_secret: str = "", # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 + repository: str = "docker://registry.redhat.io/rhelai1/granite-7b-starter", + release: str = "latest", + ): + """InstructLab - base model importer. + + Args: + repository: Hugging Face or OCI repository of the model to download. OCI repository must have a docker:// prefix + release: The revision of the model to download - e.g. a branch, tag, or commit hash for Hugging Face repositories and tag or commit hash for OCI repositories. + hf_token_secret: Name of existing Kubernetes secret which contains HF_TOKEN value for Hugging Face repositories. Mandatory for all repositories besides those which belong to the "instructlab" organization. + oci_pull_secret: Name of existing Kubernetes secret of .dockerconfigjson type for OCI repository authentication. + """ + importer_task = ilab_importer_op(repository=repository, release=release) + + # FIXME: Don't use hardcoded secret/configmap names once fixed upstream: https://github.com/kubeflow/pipelines/issues/11395 + # FIXME: Make env variables optional once implemented upstream: https://github.com/kubeflow/pipelines/issues/11401 + # This pipeline is currently unusable outside of ocp-beta-test.nerc.mghpcc.org cluster, `ilab` namespace due to the hardcoded names... 
+ use_secret_as_env( + importer_task, "hugging-face-token", dict(HF_TOKEN="HF_TOKEN") + ) + importer_task.set_env_variable( + "REGISTRY_AUTH_FILE", "/mnt/containers/.dockerconfigjson" + ) + use_secret_as_volume( + importer_task, "7033380-ilab-pull-secret", mount_path="/mnt/containers" + ) + + return pipeline + + @click.option( "--mock", type=click.Choice(MOCKED_STAGES, case_sensitive=False), @@ -474,11 +522,17 @@ def cli(ctx: click.Context, mock): def generate_pipeline(mock): - p = pipeline_wrapper(mock) + ilab_pipeline = ilab_pipeline_wrapper(mock) + import_base_model_pipeline = import_base_model_pipeline_wrapper(mock) + + pipelines = [ + (ilab_pipeline, PIPELINE_FILE_NAME), + (import_base_model_pipeline, IMPORTER_PIPELINE_FILE_NAME), + ] - with click.progressbar(length=1, label="Generating pipeline") as bar: - compiler.Compiler().compile(p, PIPELINE_FILE_NAME) - bar.update(1) + with click.progressbar(pipelines, label="Generating pipeline") as bar: + for pipeline_func, pipeline_file in bar: + compiler.Compiler().compile(pipeline_func, pipeline_file) @cli.command(name="gen-standalone") @@ -517,7 +571,6 @@ def gen_standalone(): "exec-data-processing-op": 'data_processing_op(max_seq_len={MAX_SEQ_LEN}, max_batch_len={MAX_BATCH_LEN}, sdg_path="{DATA_PVC_SDG_PATH}", model_path="{DATA_PVC_MODEL_PATH}", skills_path="{PREPROCESSED_DATA_SKILLS_PATH}", knowledge_path="{PREPROCESSED_DATA_KNOWLEDGE_PATH}")', "exec-sdg-op": 'sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size})', "exec-git-clone-op": {}, - "exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="{REPO_GRANITE_7B_IMAGE}", model_path="{DATA_PVC_MODEL_PATH}")', "exec-run-mt-bench-op": 'run_mt_bench_op(best_score_file="{MT_BENCH_SCORES_PATH}",output_path="{MT_BENCH_OUTPUT_PATH}",models_folder="{CANDIDATE_MODEL_PATH_PREFIX}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE})', "exec-run-final-eval-op": 'run_final_eval_op(mmlu_branch_output="{MMLU_BRANCH_SCORES_PATH}", mt_bench_branch_output="{MT_BENCH_BRANCH_SCORES_PATH}", candidate_model="{CANDIDATE_MODEL_PATH}", taxonomy_path="{TAXONOMY_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", base_branch="", candidate_branch="", base_model_dir="{DATA_PVC_MODEL_PATH}", max_workers="{MAX_WORKERS}", merge_system_user_message={MERGE_SYSTEM_USER_MESSAGE}, few_shots={FEW_SHOTS}, batch_size="{BATCH_SIZE}")', } diff --git a/pipeline.yaml b/pipeline.yaml index c27bcff9..0dbec02a 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -9,7 +9,7 @@ # k8s_storage_class_name: str [Default: 'nfs-csi'] # mt_bench_max_workers: str [Default: 'auto'] # mt_bench_merge_system_user_message: bool [Default: False] -# sdg_base_model: str [Default: 'ibm-granite/granite-7b-base'] +# sdg_base_model: str [Default: 's3://ilab-pipeline-b1d4c2b1-ab00-4e7f-b985-697bda3df385/instructlab-base-importer/648f36d0-e3f0-43b8-8adb-530576beb675/ilab-importer-op/model/granite-7b-starter'] # sdg_max_batch_len: int [Default: 20000.0] # sdg_pipeline: str [Default: 'simple'] # sdg_repo_branch: str @@ -280,16 +280,18 @@ components: defaultValue: /data/taxonomy isOptional: true parameterType: STRING - comp-huggingface-importer-op: - executorLabel: exec-huggingface-importer-op + comp-importer: + executorLabel: exec-importer inputDefinitions: parameters: - model_path: - 
defaultValue: /model - isOptional: true - parameterType: STRING - repo_name: + uri: parameterType: STRING + outputDefinitions: + artifacts: + artifact: + artifactType: + schemaTitle: system.Model + schemaVersion: 0.0.1 comp-knowledge-processed-data-to-artifact-op: executorLabel: exec-knowledge-processed-data-to-artifact-op inputDefinitions: @@ -304,6 +306,19 @@ components: artifactType: schemaTitle: system.Dataset schemaVersion: 0.0.1 + comp-model-to-pvc-op: + executorLabel: exec-model-to-pvc-op + inputDefinitions: + artifacts: + model: + artifactType: + schemaTitle: system.Model + schemaVersion: 0.0.1 + parameters: + pvc_path: + defaultValue: /model + isOptional: true + parameterType: STRING comp-pvc-to-model-op: executorLabel: exec-pvc-to-model-op inputDefinitions: @@ -654,39 +669,25 @@ deploymentSpec: - /bin/sh - -c image: registry.access.redhat.com/ubi9/toolbox - exec-huggingface-importer-op: + exec-importer: + importer: + artifactUri: + runtimeParameter: uri + typeSchema: + schemaTitle: system.Model + schemaVersion: 0.0.1 + exec-knowledge-processed-data-to-artifact-op: container: args: - - --executor_input - - '{{$}}' - - --function_to_execute - - huggingface_importer_op + - cp -r {{$.inputs.parameters['pvc_path']}} {{$.outputs.artifacts['knowledge_processed_data'].path}} command: - - sh + - /bin/sh - -c - - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ - \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location 'huggingface_hub'\ - \ && \"$0\" \"$@\"\n" - - sh - - -ec - - 'program_path=$(mktemp -d) - - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef huggingface_importer_op(repo_name: str, model_path: str = \"\ - /model\"):\n from huggingface_hub import snapshot_download\n\n snapshot_download(repo_id=repo_name,\ - \ cache_dir=\"/tmp\", local_dir=model_path)\n\n" - image: quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111 - exec-knowledge-processed-data-to-artifact-op: + image: registry.access.redhat.com/ubi9/toolbox + exec-model-to-pvc-op: container: args: - - cp -r {{$.inputs.parameters['pvc_path']}} {{$.outputs.artifacts['knowledge_processed_data'].path}} + - cp -r {{$.inputs.artifacts['model'].path}}/* {{$.inputs.parameters['pvc_path']}} command: - /bin/sh - -c @@ -1670,7 +1671,7 @@ root: dependentTasks: - createpvc - createpvc-2 - - huggingface-importer-op + - model-to-pvc-op - sdg-op inputs: parameters: @@ -1744,18 +1745,17 @@ root: componentInputParameter: sdg_repo_url taskInfo: name: git-clone-op - huggingface-importer-op: - cachingOptions: {} + importer: + cachingOptions: + enableCache: true componentRef: - name: comp-huggingface-importer-op - dependentTasks: - - createpvc-2 + name: comp-importer inputs: parameters: - repo_name: + uri: componentInputParameter: sdg_base_model taskInfo: - name: huggingface-importer-op + name: importer knowledge-processed-data-to-artifact-op: cachingOptions: {} componentRef: @@ -1765,6 +1765,21 @@ root: - data-processing-op taskInfo: name: knowledge-processed-data-to-artifact-op + model-to-pvc-op: + cachingOptions: {} + componentRef: + name: comp-model-to-pvc-op + dependentTasks: + - createpvc-2 + - importer + inputs: + artifacts: + model: + taskOutputArtifact: + 
outputArtifactKey: artifact + producerTask: importer + taskInfo: + name: model-to-pvc-op pvc-to-model-op: cachingOptions: {} componentRef: @@ -1803,7 +1818,7 @@ root: - createpvc-2 - createpvc-3 - data-processing-op - - huggingface-importer-op + - model-to-pvc-op inputs: parameters: effective_batch_size: @@ -2045,7 +2060,7 @@ root: isOptional: true parameterType: BOOLEAN sdg_base_model: - defaultValue: ibm-granite/granite-7b-base + defaultValue: s3://ilab-pipeline-b1d4c2b1-ab00-4e7f-b985-697bda3df385/instructlab-base-importer/648f36d0-e3f0-43b8-8adb-530576beb675/ilab-importer-op/model/granite-7b-starter description: SDG parameter. LLM model used to generate the synthetic dataset isOptional: true parameterType: STRING @@ -2193,18 +2208,18 @@ platforms: taskOutputParameter: outputParameterKey: name producerTask: createpvc - exec-huggingface-importer-op: - pvcMount: - - mountPath: /model - taskOutputParameter: - outputParameterKey: name - producerTask: createpvc-2 exec-knowledge-processed-data-to-artifact-op: pvcMount: - mountPath: /data taskOutputParameter: outputParameterKey: name producerTask: createpvc + exec-model-to-pvc-op: + pvcMount: + - mountPath: /model + taskOutputParameter: + outputParameterKey: name + producerTask: createpvc-2 exec-pvc-to-model-op: pvcMount: - mountPath: /output diff --git a/utils/__init__.py b/utils/__init__.py index 1e8897cf..7dcc94ed 100644 --- a/utils/__init__.py +++ b/utils/__init__.py @@ -1,13 +1,15 @@ from . import faked from .components import ( - huggingface_importer_op, + ilab_importer_op, + model_to_pvc_op, pvc_to_model_op, pvc_to_mt_bench_op, ) __all__ = [ - "huggingface_importer_op", + "model_to_pvc_op", "pvc_to_mt_bench_op", "pvc_to_model_op", + "ilab_importer_op", "faked", ] diff --git a/utils/components.py b/utils/components.py index 30581818..8ddabcd7 100644 --- a/utils/components.py +++ b/utils/components.py @@ -3,7 +3,7 @@ from kfp import dsl -from .consts import PYTHON_IMAGE, TOOLBOX_IMAGE +from .consts import PYTHON_IMAGE, RHELAI_IMAGE, TOOLBOX_IMAGE @dsl.container_component @@ -24,12 +24,21 @@ def pvc_to_model_op(model: dsl.Output[dsl.Model], pvc_path: str): ) -@dsl.component( - base_image=PYTHON_IMAGE, - install_kfp_package=False, - packages_to_install=["huggingface_hub"], -) -def huggingface_importer_op(repo_name: str, model_path: str = "/model"): - from huggingface_hub import snapshot_download +@dsl.container_component +def model_to_pvc_op(model: dsl.Input[dsl.Model], pvc_path: str = "/model"): + return dsl.ContainerSpec( + TOOLBOX_IMAGE, + ["/bin/sh", "-c"], + [f"cp -r {model.path}/* {pvc_path}"], + ) - snapshot_download(repo_id=repo_name, cache_dir="/tmp", local_dir=model_path) + +@dsl.container_component +def ilab_importer_op(repository: str, release: str, base_model: dsl.Output[dsl.Model]): + return dsl.ContainerSpec( + RHELAI_IMAGE, + ["/bin/sh", "-c"], + [ + f"ilab --config=DEFAULT model download --repository {repository} --release {release} --model-dir {base_model.path}" + ], + ) diff --git a/utils/faked/__init__.py b/utils/faked/__init__.py index b49686c8..bb7ef2b2 100644 --- a/utils/faked/__init__.py +++ b/utils/faked/__init__.py @@ -1,4 +1,5 @@ from .components import ( + model_to_pvc_op, pvc_to_model_op, pvc_to_mt_bench_op, ) @@ -6,4 +7,5 @@ __all__ = [ "pvc_to_mt_bench_op", "pvc_to_model_op", + "model_to_pvc_op", ] diff --git a/utils/faked/components.py b/utils/faked/components.py index fa964877..65e4c74d 100644 --- a/utils/faked/components.py +++ b/utils/faked/components.py @@ -6,7 +6,7 @@ 
@dsl.component(base_image=PYTHON_IMAGE, install_kfp_package=False) -def huggingface_importer_op(repo_name: str, model_path: str = "/model"): +def model_to_pvc_op(model: dsl.Input[dsl.Model], pvc_path: str = "/model"): return
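
Note on the pattern introduced above: pipeline.py now sources the base model with KFP's built-in dsl.importer, which materialises an existing object-store URI as a system.Model artifact, and a plain container component then copies that artifact onto the training PVC. A condensed, self-contained sketch of the same pattern follows; it reuses the toolbox image seen elsewhere in the repo, omits the PVC mount for brevity, and is an illustration rather than part of the patch:

from kfp import compiler, dsl


@dsl.container_component
def model_to_pvc_op(model: dsl.Input[dsl.Model], pvc_path: str = "/model"):
    # Copy the imported Model artifact into the path where the PVC is mounted
    # (in the real pipeline, kubernetes.mount_pvc attaches the PVC at /model).
    return dsl.ContainerSpec(
        "registry.access.redhat.com/ubi9/toolbox",
        ["/bin/sh", "-c"],
        [f"cp -r {model.path}/* {pvc_path}"],
    )


@dsl.pipeline(name="importer-pattern-demo")
def demo(base_model_uri: str = "s3://<bucket>/<path>/granite-7b-starter"):
    # dsl.importer turns an existing S3 location into a Model artifact
    # without re-downloading the model from Hugging Face or an OCI registry.
    source = dsl.importer(artifact_uri=base_model_uri, artifact_class=dsl.Model)
    model_to_pvc_op(model=source.output)


if __name__ == "__main__":
    compiler.Compiler().compile(demo, "importer-pattern-demo.yaml")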
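
A hedged sketch of how the two compiled definitions are meant to be chained from the KFP SDK; the endpoint, bucket, and run path below are placeholders for environment-specific values and are not supplied by this patch:

from kfp import Client

# Assumption: host and authentication are cluster-specific.
client = Client(host="https://<your-kfp-endpoint>")

# 1. Seed the default object-store bucket with a base model.
client.create_run_from_pipeline_package(
    "importer-pipeline.yaml",
    arguments={
        "repository": "docker://registry.redhat.io/rhelai1/granite-7b-starter",
        "release": "latest",
    },
)

# 2. After the importer run completes, pass the S3 URI of its `base_model`
#    output artifact to the main pipeline as `sdg_base_model`.
client.create_run_from_pipeline_package(
    "pipeline.yaml",
    arguments={
        "sdg_base_model": "s3://<bucket>/instructlab-base-importer/<run-id>/ilab-importer-op/model/granite-7b-starter",
    },
)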