From 1a3b9709582bb2c306ae7dc9ee1115007c1c7a0c Mon Sep 17 00:00:00 2001 From: Tomas Coufal Date: Mon, 9 Dec 2024 15:31:24 +0100 Subject: [PATCH] chore: ensure pipeline params match ilab defaults Signed-off-by: Tomas Coufal --- pipeline.py | 74 ++++++++++++++++++++------------------------------- pipeline.yaml | 52 ++++++++++++++++++------------------ 2 files changed, 55 insertions(+), 71 deletions(-) diff --git a/pipeline.py b/pipeline.py index 177179e..770e059 100644 --- a/pipeline.py +++ b/pipeline.py @@ -23,32 +23,11 @@ MOCKED_STAGES = ["sdg", "train", "eval"] PIPELINE_FILE_NAME = "pipeline.yaml" IMPORTER_PIPELINE_FILE_NAME = "importer-pipeline.yaml" -SDG_PIPELINE = "simple" IMAGE_PULL_SECRET = "redhat-et-ilab-botty-pull-secret" STANDALONE_TEMPLATE_FILE_NAME = "standalone.tpl" GENERATED_STANDALONE_FILE_NAME = "standalone.py" DEFAULT_REPO_URL = "https://github.com/instructlab/taxonomy.git" -# eval args -FEW_SHOTS = 5 -# BATCH_SIZE can also be an int, for example "8" is converted to an int in eval/final -BATCH_SIZE = "auto" -MAX_WORKERS = "auto" -MERGE_SYSTEM_USER_MESSAGE = False - -# training args -NUM_EPOCHS_PHASE_1 = 2 -NUM_EPOCHS_PHASE_2 = 2 -EFFECTIVE_BATCH_SIZE_PHASE_1 = 3840 -EFFECTIVE_BATCH_SIZE_PHASE_2 = 3840 -LEARNING_RATE_PHASE_1 = 1e-4 -LEARNING_RATE_PHASE_2 = 1e-4 -NUM_WARMUP_STEPS_PHASE_1 = 100 -NUM_WARMUP_STEPS_PHASE_2 = 100 -SAVE_SAMPLES = 0 -MAX_BATCH_LEN = 20000 -SEED = 42 - def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]): """Wrapper for KFP pipeline, which allows for mocking individual stages.""" @@ -112,32 +91,32 @@ def pipeline( int ] = None, # FIXME: https://issues.redhat.com/browse/RHOAIRFE-467 sdg_base_model: str = "s3:///", - sdg_scale_factor: int = 2, # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290 - sdg_pipeline: str = SDG_PIPELINE, - sdg_max_batch_len: int = MAX_BATCH_LEN, - sdg_sample_size: float = 1.0, + sdg_scale_factor: int = 30, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L125 + sdg_pipeline: str = "full", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L122 + sdg_max_batch_len: int = 5000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L334 + sdg_sample_size: float = 1.0, # FIXME: Not present in default config. Need justification? # Training phase - train_nproc_per_node: int = 3, - train_nnodes: int = 2, - train_num_epochs_phase_1: int = NUM_EPOCHS_PHASE_1, - train_num_epochs_phase_2: int = NUM_EPOCHS_PHASE_2, - train_effective_batch_size_phase_1: int = EFFECTIVE_BATCH_SIZE_PHASE_1, - train_effective_batch_size_phase_2: int = EFFECTIVE_BATCH_SIZE_PHASE_2, - train_learning_rate_phase_1: float = LEARNING_RATE_PHASE_1, - train_learning_rate_phase_2: float = LEARNING_RATE_PHASE_2, - train_num_warmup_steps_phase_1: int = NUM_WARMUP_STEPS_PHASE_1, - train_num_warmup_steps_phase_2: int = NUM_WARMUP_STEPS_PHASE_2, - train_save_samples: int = SAVE_SAMPLES, - train_max_batch_len: int = MAX_BATCH_LEN, - train_seed: int = SEED, + train_nproc_per_node: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification. + train_nnodes: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification. 
+ train_num_epochs_phase_1: int = 7, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L364 + train_num_epochs_phase_2: int = 10, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L377 + train_effective_batch_size_phase_1: int = 128, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L357 + train_effective_batch_size_phase_2: int = 3840, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L371 + train_learning_rate_phase_1: float = 2e-05, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L360 + train_learning_rate_phase_2: float = 6e-06, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L374 + train_num_warmup_steps_phase_1: int = 1000, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L874 + train_num_warmup_steps_phase_2: int = 1000, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L874 + train_save_samples: int = 250000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L393 + train_max_batch_len: int = 5000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L334 + train_seed: int = 42, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L901 # MT Bench - mt_bench_max_workers: str = MAX_WORKERS, - mt_bench_merge_system_user_message: bool = MERGE_SYSTEM_USER_MESSAGE, + mt_bench_max_workers: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L74 + mt_bench_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474 # Final evaluation - final_eval_max_workers: str = MAX_WORKERS, - final_eval_few_shots: int = FEW_SHOTS, - final_eval_batch_size: str = BATCH_SIZE, - final_eval_merge_system_user_message: bool = MERGE_SYSTEM_USER_MESSAGE, + final_eval_max_workers: str = "auto", # FIXME: Not present in default config. Need justification? 
+ final_eval_few_shots: int = 5, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L56 + final_eval_batch_size: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L52 + final_eval_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474 # Other options k8s_storage_class_name: str = "standard", # FIXME: https://github.com/kubeflow/pipelines/issues/11396, https://issues.redhat.com/browse/RHOAIRFE-470 ): @@ -558,7 +537,12 @@ def run(mock, experiment, run_name, param): dev_arguments = { "k8s_storage_class_name": "nfs-csi", "sdg_base_model": "s3://ilab-pipeline-b1d4c2b1-ab00-4e7f-b985-697bda3df385/instructlab-base-importer/648f36d0-e3f0-43b8-8adb-530576beb675/ilab-importer-op/model/granite-7b-starter", - "train_nproc_per_node": 2, + "train_num_epochs_phase_1": 2, + "train_num_epochs_phase_2": 2, + "train_num_warmup_steps_phase_1": 100, + "train_num_warmup_steps_phase_2": 100, + "train_learning_rate_phase_1": 1e-4, + "train_learning_rate_phase_2": 1e-4, } try: diff --git a/pipeline.yaml b/pipeline.yaml index 823d756..4d3bf05 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -10,25 +10,25 @@ # mt_bench_max_workers: str [Default: 'auto'] # mt_bench_merge_system_user_message: bool [Default: False] # sdg_base_model: str [Default: 's3:///'] -# sdg_max_batch_len: int [Default: 20000.0] -# sdg_pipeline: str [Default: 'simple'] +# sdg_max_batch_len: int [Default: 5000.0] +# sdg_pipeline: str [Default: 'full'] # sdg_repo_branch: str # sdg_repo_pr: int # sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git'] # sdg_sample_size: float [Default: 1.0] -# sdg_scale_factor: int [Default: 2.0] -# train_effective_batch_size_phase_1: int [Default: 3840.0] +# sdg_scale_factor: int [Default: 30.0] +# train_effective_batch_size_phase_1: int [Default: 128.0] # train_effective_batch_size_phase_2: int [Default: 3840.0] -# train_learning_rate_phase_1: float [Default: 0.0001] -# train_learning_rate_phase_2: float [Default: 0.0001] -# train_max_batch_len: int [Default: 20000.0] +# train_learning_rate_phase_1: float [Default: 2e-05] +# train_learning_rate_phase_2: float [Default: 6e-06] +# train_max_batch_len: int [Default: 5000.0] # train_nnodes: int [Default: 2.0] -# train_nproc_per_node: int [Default: 3.0] -# train_num_epochs_phase_1: int [Default: 2.0] -# train_num_epochs_phase_2: int [Default: 2.0] -# train_num_warmup_steps_phase_1: int [Default: 100.0] -# train_num_warmup_steps_phase_2: int [Default: 100.0] -# train_save_samples: int [Default: 0.0] +# train_nproc_per_node: int [Default: 2.0] +# train_num_epochs_phase_1: int [Default: 7.0] +# train_num_epochs_phase_2: int [Default: 10.0] +# train_num_warmup_steps_phase_1: int [Default: 1000.0] +# train_num_warmup_steps_phase_2: int [Default: 1000.0] +# train_save_samples: int [Default: 250000.0] # train_seed: int [Default: 42.0] components: comp-createpvc: @@ -2115,13 +2115,13 @@ root: isOptional: true parameterType: STRING sdg_max_batch_len: - defaultValue: 20000.0 + defaultValue: 5000.0 description: SDG parameter. Maximum tokens per gpu for each batch that will be handled in a single step. isOptional: true parameterType: NUMBER_INTEGER sdg_pipeline: - defaultValue: simple + defaultValue: full description: 'SDG parameter. Data generation pipeline to use. Available: ''simple'', ''full'', or a valid path to a directory of pipeline workflow YAML files. 
Note that ''full'' requires a larger teacher model, Mixtral-8x7b.'
@@ -2149,12 +2149,12 @@ root:
     isOptional: true
     parameterType: NUMBER_DOUBLE
   sdg_scale_factor:
-    defaultValue: 2.0
+    defaultValue: 30.0
     description: SDG parameter. The total number of instructions to be generated.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_effective_batch_size_phase_1:
-    defaultValue: 3840.0
+    defaultValue: 128.0
     description: Training parameter for Phase 1. The number of samples in a batch
       that the model should see before its parameters are updated.
     isOptional: true
@@ -2166,7 +2166,7 @@ root:
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_learning_rate_phase_1:
-    defaultValue: 0.0001
+    defaultValue: 2.0e-05
     description: Training parameter for Phase 1. How fast we optimize the weights
       during gradient descent. Higher values may lead to unstable learning performance.
       It's generally recommended to have a low learning rate with a high effective
@@ -2174,7 +2174,7 @@ root:
     isOptional: true
     parameterType: NUMBER_DOUBLE
   train_learning_rate_phase_2:
-    defaultValue: 0.0001
+    defaultValue: 6.0e-06
     description: Training parameter for Phase 2. How fast we optimize the weights
       during gradient descent. Higher values may lead to unstable learning performance.
       It's generally recommended to have a low learning rate with a high effective
@@ -2182,7 +2182,7 @@ root:
     isOptional: true
     parameterType: NUMBER_DOUBLE
   train_max_batch_len:
-    defaultValue: 20000.0
+    defaultValue: 5000.0
     description: Training parameter. Maximum tokens per gpu for each batch that will
       be handled in a single step.
     isOptional: true
@@ -2193,37 +2193,37 @@ root:
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_nproc_per_node:
-    defaultValue: 3.0
+    defaultValue: 2.0
     description: Training parameter. Number of GPUs per node/worker to use for
       training.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_num_epochs_phase_1:
-    defaultValue: 2.0
+    defaultValue: 7.0
     description: Training parameter for Phase 1. Number of epochs to run training.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_num_epochs_phase_2:
-    defaultValue: 2.0
+    defaultValue: 10.0
     description: Training parameter for Phase 2. Number of epochs to run training.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_num_warmup_steps_phase_1:
-    defaultValue: 100.0
+    defaultValue: 1000.0
     description: Training parameter for Phase 1. The number of steps a model should
       go through before reaching the full learning rate. We start at 0 and linearly
       climb up to train_learning_rate.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_num_warmup_steps_phase_2:
-    defaultValue: 100.0
+    defaultValue: 1000.0
     description: Training parameter for Phase 2. The number of steps a model should
       go through before reaching the full learning rate. We start at 0 and linearly
       climb up to train_learning_rate.
     isOptional: true
     parameterType: NUMBER_INTEGER
   train_save_samples:
-    defaultValue: 0.0
+    defaultValue: 250000.0
     description: Training parameter. Number of samples the model should see before
       saving a checkpoint.
     isOptional: true
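
Since pipeline.yaml is compiled from pipeline.py, the two halves of this patch have to agree with each other as well as with the referenced ilab defaults. Below is a minimal consistency-check sketch (not part of the patch), assuming the KFP v2 IR layout visible in the hunks above (defaults under root.inputDefinitions.parameters), that pyyaml is installed, and that ilab_pipeline_wrapper() returns the decorated pipeline object, as its use elsewhere in this repo suggests:

    # Sketch: compare defaults declared in pipeline.py with those compiled
    # into pipeline.yaml, so the two files cannot silently drift apart.
    import inspect

    import yaml  # pyyaml, assumed to be available alongside kfp

    from pipeline import ilab_pipeline_wrapper  # wrapper shown in the diff

    with open("pipeline.yaml") as f:
        spec = yaml.safe_load(f)

    compiled = {
        name: param.get("defaultValue")
        for name, param in spec["root"]["inputDefinitions"]["parameters"].items()
    }

    # KFP v2's @dsl.pipeline returns a GraphComponent whose .pipeline_func
    # attribute is the original Python function, so inspect sees its defaults.
    func = ilab_pipeline_wrapper(mock=[]).pipeline_func
    declared = {
        name: p.default
        for name, p in inspect.signature(func).parameters.items()
        if p.default is not inspect.Parameter.empty
    }

    for name in sorted(declared):
        # The IR stores ints as floats (30 -> 30.0); == treats those as equal.
        if name in compiled and compiled[name] != declared[name]:
            print(f"drift: {name}: py={declared[name]!r} yaml={compiled[name]!r}")

If the defaults in this patch are consistent, the script prints nothing; rerun it whenever pipeline.yaml is regenerated from pipeline.py.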