From 1a3b9709582bb2c306ae7dc9ee1115007c1c7a0c Mon Sep 17 00:00:00 2001 From: Tomas Coufal Date: Mon, 9 Dec 2024 15:31:24 +0100 Subject: [PATCH] chore: ensure pipeline params match ilab defaults Signed-off-by: Tomas Coufal --- pipeline.py | 74 ++++++++++++++++++++------------------------------- pipeline.yaml | 52 ++++++++++++++++++------------------ 2 files changed, 55 insertions(+), 71 deletions(-) diff --git a/pipeline.py b/pipeline.py index 177179e..770e059 100644 --- a/pipeline.py +++ b/pipeline.py @@ -23,32 +23,11 @@ MOCKED_STAGES = ["sdg", "train", "eval"] PIPELINE_FILE_NAME = "pipeline.yaml" IMPORTER_PIPELINE_FILE_NAME = "importer-pipeline.yaml" -SDG_PIPELINE = "simple" IMAGE_PULL_SECRET = "redhat-et-ilab-botty-pull-secret" STANDALONE_TEMPLATE_FILE_NAME = "standalone.tpl" GENERATED_STANDALONE_FILE_NAME = "standalone.py" DEFAULT_REPO_URL = "https://github.com/instructlab/taxonomy.git" -# eval args -FEW_SHOTS = 5 -# BATCH_SIZE can also be an int, for example "8" is converted to an int in eval/final -BATCH_SIZE = "auto" -MAX_WORKERS = "auto" -MERGE_SYSTEM_USER_MESSAGE = False - -# training args -NUM_EPOCHS_PHASE_1 = 2 -NUM_EPOCHS_PHASE_2 = 2 -EFFECTIVE_BATCH_SIZE_PHASE_1 = 3840 -EFFECTIVE_BATCH_SIZE_PHASE_2 = 3840 -LEARNING_RATE_PHASE_1 = 1e-4 -LEARNING_RATE_PHASE_2 = 1e-4 -NUM_WARMUP_STEPS_PHASE_1 = 100 -NUM_WARMUP_STEPS_PHASE_2 = 100 -SAVE_SAMPLES = 0 -MAX_BATCH_LEN = 20000 -SEED = 42 - def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): """Wrapper for KFP pipeline, which allows for mocking individual stages.""" @@ -112,32 +91,32 @@ def pipeline( int ] = None, # FIXME: https://issues.redhat.com/browse/RHOAIRFE-467 sdg_base_model: str = "s3:///", - sdg_scale_factor: int = 2, # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290 - sdg_pipeline: str = SDG_PIPELINE, - sdg_max_batch_len: int = MAX_BATCH_LEN, - sdg_sample_size: float = 1.0, + sdg_scale_factor: int = 30, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L125 + sdg_pipeline: str = "full", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L122 + sdg_max_batch_len: int = 5000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L334 + sdg_sample_size: float = 1.0, # FIXME: Not present in default config. Need justification? # Training phase - train_nproc_per_node: int = 3, - train_nnodes: int = 2, - train_num_epochs_phase_1: int = NUM_EPOCHS_PHASE_1, - train_num_epochs_phase_2: int = NUM_EPOCHS_PHASE_2, - train_effective_batch_size_phase_1: int = EFFECTIVE_BATCH_SIZE_PHASE_1, - train_effective_batch_size_phase_2: int = EFFECTIVE_BATCH_SIZE_PHASE_2, - train_learning_rate_phase_1: float = LEARNING_RATE_PHASE_1, - train_learning_rate_phase_2: float = LEARNING_RATE_PHASE_2, - train_num_warmup_steps_phase_1: int = NUM_WARMUP_STEPS_PHASE_1, - train_num_warmup_steps_phase_2: int = NUM_WARMUP_STEPS_PHASE_2, - train_save_samples: int = SAVE_SAMPLES, - train_max_batch_len: int = MAX_BATCH_LEN, - train_seed: int = SEED, + train_nproc_per_node: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification. + train_nnodes: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification. 
+ train_num_epochs_phase_1: int = 7, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L364 + train_num_epochs_phase_2: int = 10, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L377 + train_effective_batch_size_phase_1: int = 128, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L357 + train_effective_batch_size_phase_2: int = 3840, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L371 + train_learning_rate_phase_1: float = 2e-05, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L360 + train_learning_rate_phase_2: float = 6e-06, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L374 + train_num_warmup_steps_phase_1: int = 1000, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L874 + train_num_warmup_steps_phase_2: int = 1000, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L874 + train_save_samples: int = 250000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L393 + train_max_batch_len: int = 5000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L334 + train_seed: int = 42, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L901 # MT Bench - mt_bench_max_workers: str = MAX_WORKERS, - mt_bench_merge_system_user_message: bool = MERGE_SYSTEM_USER_MESSAGE, + mt_bench_max_workers: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L74 + mt_bench_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474 # Final evaluation - final_eval_max_workers: str = MAX_WORKERS, - final_eval_few_shots: int = FEW_SHOTS, - final_eval_batch_size: str = BATCH_SIZE, - final_eval_merge_system_user_message: bool = MERGE_SYSTEM_USER_MESSAGE, + final_eval_max_workers: str = "auto", # FIXME: Not present in default config. Need justification? 
+ final_eval_few_shots: int = 5, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L56 + final_eval_batch_size: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L52 + final_eval_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474 # Other options k8s_storage_class_name: str = "standard", # FIXME: https://github.com/kubeflow/pipelines/issues/11396, https://issues.redhat.com/browse/RHOAIRFE-470 ): @@ -558,7 +537,12 @@ def run(mock, experiment, run_name, param): dev_arguments = { "k8s_storage_class_name": "nfs-csi", "sdg_base_model": "s3://ilab-pipeline-b1d4c2b1-ab00-4e7f-b985-697bda3df385/instructlab-base-importer/648f36d0-e3f0-43b8-8adb-530576beb675/ilab-importer-op/model/granite-7b-starter", - "train_nproc_per_node": 2, + "train_num_epochs_phase_1": 2, + "train_num_epochs_phase_2": 2, + "train_num_warmup_steps_phase_1": 100, + "train_num_warmup_steps_phase_2": 100, + "train_learning_rate_phase_1": 1e-4, + "train_learning_rate_phase_2": 1e-4, } try: diff --git a/pipeline.yaml b/pipeline.yaml index 823d756..4d3bf05 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -10,25 +10,25 @@ # mt_bench_max_workers: str [Default: 'auto'] # mt_bench_merge_system_user_message: bool [Default: False] # sdg_base_model: str [Default: 's3:///'] -# sdg_max_batch_len: int [Default: 20000.0] -# sdg_pipeline: str [Default: 'simple'] +# sdg_max_batch_len: int [Default: 5000.0] +# sdg_pipeline: str [Default: 'full'] # sdg_repo_branch: str # sdg_repo_pr: int # sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git'] # sdg_sample_size: float [Default: 1.0] -# sdg_scale_factor: int [Default: 2.0] -# train_effective_batch_size_phase_1: int [Default: 3840.0] +# sdg_scale_factor: int [Default: 30.0] +# train_effective_batch_size_phase_1: int [Default: 128.0] # train_effective_batch_size_phase_2: int [Default: 3840.0] -# train_learning_rate_phase_1: float [Default: 0.0001] -# train_learning_rate_phase_2: float [Default: 0.0001] -# train_max_batch_len: int [Default: 20000.0] +# train_learning_rate_phase_1: float [Default: 2e-05] +# train_learning_rate_phase_2: float [Default: 6e-06] +# train_max_batch_len: int [Default: 5000.0] # train_nnodes: int [Default: 2.0] -# train_nproc_per_node: int [Default: 3.0] -# train_num_epochs_phase_1: int [Default: 2.0] -# train_num_epochs_phase_2: int [Default: 2.0] -# train_num_warmup_steps_phase_1: int [Default: 100.0] -# train_num_warmup_steps_phase_2: int [Default: 100.0] -# train_save_samples: int [Default: 0.0] +# train_nproc_per_node: int [Default: 2.0] +# train_num_epochs_phase_1: int [Default: 7.0] +# train_num_epochs_phase_2: int [Default: 10.0] +# train_num_warmup_steps_phase_1: int [Default: 1000.0] +# train_num_warmup_steps_phase_2: int [Default: 1000.0] +# train_save_samples: int [Default: 250000.0] # train_seed: int [Default: 42.0] components: comp-createpvc: @@ -2115,13 +2115,13 @@ root: isOptional: true parameterType: STRING sdg_max_batch_len: - defaultValue: 20000.0 + defaultValue: 5000.0 description: SDG parameter. Maximum tokens per gpu for each batch that will be handled in a single step. isOptional: true parameterType: NUMBER_INTEGER sdg_pipeline: - defaultValue: simple + defaultValue: full description: 'SDG parameter. Data generation pipeline to use. Available: ''simple'', ''full'', or a valid path to a directory of pipeline workflow YAML files. 
Note that ''full'' requires a larger teacher model, Mixtral-8x7b.'
@@ -2149,12 +2149,12 @@ root:
     isOptional: true
     parameterType: NUMBER_DOUBLE
   sdg_scale_factor:
-    defaultValue: 2.0
+    defaultValue: 30.0
     description: SDG parameter. The total number of instructions to be generated.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_effective_batch_size_phase_1:
-    defaultValue: 3840.0
+    defaultValue: 128.0
     description: Training parameter for Phase 1. The number of samples in a batch
       that the model should see before its parameters are updated.
     isOptional: true
@@ -2166,7 +2166,7 @@ root:
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_learning_rate_phase_1:
-    defaultValue: 0.0001
+    defaultValue: 2.0e-05
     description: Training parameter for Phase 1. How fast we optimize the weights
       during gradient descent. Higher values may lead to unstable learning performance.
       It's generally recommended to have a low learning rate with a high effective
@@ -2174,7 +2174,7 @@ root:
     isOptional: true
     parameterType: NUMBER_DOUBLE
   train_learning_rate_phase_2:
-    defaultValue: 0.0001
+    defaultValue: 6.0e-06
     description: Training parameter for Phase 2. How fast we optimize the weights
       during gradient descent. Higher values may lead to unstable learning performance.
       It's generally recommended to have a low learning rate with a high effective
@@ -2182,7 +2182,7 @@ root:
     isOptional: true
     parameterType: NUMBER_DOUBLE
   train_max_batch_len:
-    defaultValue: 20000.0
+    defaultValue: 5000.0
     description: Training parameter. Maximum tokens per gpu for each batch that will
       be handled in a single step.
     isOptional: true
@@ -2193,37 +2193,37 @@ root:
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_nproc_per_node:
-    defaultValue: 3.0
+    defaultValue: 2.0
     description: Training parameter. Number of GPUs per node/worker to use for
       training.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_num_epochs_phase_1:
-    defaultValue: 2.0
+    defaultValue: 7.0
     description: Training parameter for Phase 1. Number of epochs to run training.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_num_epochs_phase_2:
-    defaultValue: 2.0
+    defaultValue: 10.0
     description: Training parameter for Phase 2. Number of epochs to run training.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_num_warmup_steps_phase_1:
-    defaultValue: 100.0
+    defaultValue: 1000.0
     description: Training parameter for Phase 1. The number of steps a model should
       go through before reaching the full learning rate. We start at 0 and linearly
       climb up to train_learning_rate.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_num_warmup_steps_phase_2:
-    defaultValue: 100.0
+    defaultValue: 1000.0
     description: Training parameter for Phase 2. The number of steps a model should
       go through before reaching the full learning rate. We start at 0 and linearly
       climb up to train_learning_rate.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_save_samples:
-    defaultValue: 0.0
+    defaultValue: 250000.0
     description: Training parameter. Number of samples the model should see before
       saving a checkpoint.
     isOptional: true
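
Since pipeline.yaml is compiled from pipeline.py, the two halves of this patch have to agree with each other as well as with the referenced ilab defaults. Below is a minimal consistency-check sketch (not part of the patch), assuming the KFP v2 IR layout visible in the hunks above (defaults under root.inputDefinitions.parameters), that pyyaml is installed, and that ilab_pipeline_wrapper() returns the decorated pipeline object, as its use elsewhere in this repo suggests:

    # Sketch: compare defaults declared in pipeline.py with those compiled
    # into pipeline.yaml, so the two files cannot silently drift apart.
    import inspect

    import yaml  # pyyaml, assumed to be available alongside kfp

    from pipeline import ilab_pipeline_wrapper  # wrapper shown in the diff

    with open("pipeline.yaml") as f:
        spec = yaml.safe_load(f)

    compiled = {
        name: param.get("defaultValue")
        for name, param in spec["root"]["inputDefinitions"]["parameters"].items()
    }

    # KFP v2's @dsl.pipeline returns a GraphComponent whose .pipeline_func
    # attribute is the original Python function, so inspect sees its defaults.
    func = ilab_pipeline_wrapper(mock=[]).pipeline_func
    declared = {
        name: p.default
        for name, p in inspect.signature(func).parameters.items()
        if p.default is not inspect.Parameter.empty
    }

    for name in sorted(declared):
        # The IR stores ints as floats (30 -> 30.0); == treats those as equal.
        if name in compiled and compiled[name] != declared[name]:
            print(f"drift: {name}: py={declared[name]!r} yaml={compiled[name]!r}")

If the defaults in this patch are consistent, the script prints nothing; rerun it whenever pipeline.yaml is regenerated from pipeline.py.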