chore: ensure pipeline params match ilab defaults
Signed-off-by: Tomas Coufal <[email protected]>
tumido committed Dec 9, 2024
1 parent 3bb3be0 commit 1a3b970
Showing 2 changed files with 55 additions and 71 deletions.
pipeline.py: 29 additions & 45 deletions
@@ -23,32 +23,11 @@
MOCKED_STAGES = ["sdg", "train", "eval"]
PIPELINE_FILE_NAME = "pipeline.yaml"
IMPORTER_PIPELINE_FILE_NAME = "importer-pipeline.yaml"
SDG_PIPELINE = "simple"
IMAGE_PULL_SECRET = "redhat-et-ilab-botty-pull-secret"
STANDALONE_TEMPLATE_FILE_NAME = "standalone.tpl"
GENERATED_STANDALONE_FILE_NAME = "standalone.py"
DEFAULT_REPO_URL = "https://github.com/instructlab/taxonomy.git"

# eval args
FEW_SHOTS = 5
# BATCH_SIZE can also be an int, for example "8" is converted to an int in eval/final
BATCH_SIZE = "auto"
MAX_WORKERS = "auto"
MERGE_SYSTEM_USER_MESSAGE = False

# training args
NUM_EPOCHS_PHASE_1 = 2
NUM_EPOCHS_PHASE_2 = 2
EFFECTIVE_BATCH_SIZE_PHASE_1 = 3840
EFFECTIVE_BATCH_SIZE_PHASE_2 = 3840
LEARNING_RATE_PHASE_1 = 1e-4
LEARNING_RATE_PHASE_2 = 1e-4
NUM_WARMUP_STEPS_PHASE_1 = 100
NUM_WARMUP_STEPS_PHASE_2 = 100
SAVE_SAMPLES = 0
MAX_BATCH_LEN = 20000
SEED = 42


def ilab_pipeline_wrapper(mock: List[Literal[MOCKED_STAGES]]):
"""Wrapper for KFP pipeline, which allows for mocking individual stages."""
@@ -112,32 +91,32 @@ def pipeline(
int
] = None, # FIXME: https://issues.redhat.com/browse/RHOAIRFE-467
sdg_base_model: str = "s3://<BUCKET>/<PATH_TO_MODEL>",
sdg_scale_factor: int = 2, # Renamed upstream https://github.com/instructlab/instructlab/blob/f7d40f6ed5112d59132dd832bd332fa6fbbe7010/src/instructlab/configuration.py#L279-L290
sdg_pipeline: str = SDG_PIPELINE,
sdg_max_batch_len: int = MAX_BATCH_LEN,
sdg_sample_size: float = 1.0,
sdg_scale_factor: int = 30, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L125
sdg_pipeline: str = "full", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L122
sdg_max_batch_len: int = 5000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L334
sdg_sample_size: float = 1.0, # FIXME: Not present in default config. Need justification?
# Training phase
train_nproc_per_node: int = 3,
train_nnodes: int = 2,
train_num_epochs_phase_1: int = NUM_EPOCHS_PHASE_1,
train_num_epochs_phase_2: int = NUM_EPOCHS_PHASE_2,
train_effective_batch_size_phase_1: int = EFFECTIVE_BATCH_SIZE_PHASE_1,
train_effective_batch_size_phase_2: int = EFFECTIVE_BATCH_SIZE_PHASE_2,
train_learning_rate_phase_1: float = LEARNING_RATE_PHASE_1,
train_learning_rate_phase_2: float = LEARNING_RATE_PHASE_2,
train_num_warmup_steps_phase_1: int = NUM_WARMUP_STEPS_PHASE_1,
train_num_warmup_steps_phase_2: int = NUM_WARMUP_STEPS_PHASE_2,
train_save_samples: int = SAVE_SAMPLES,
train_max_batch_len: int = MAX_BATCH_LEN,
train_seed: int = SEED,
train_nproc_per_node: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification.
train_nnodes: int = 2, # FIXME: Not present in default config. Arbitrary value chosen to demonstrate multi-node multi-gpu capabilities. Needs proper reference architecture justification.
train_num_epochs_phase_1: int = 7, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L364
train_num_epochs_phase_2: int = 10, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L377
train_effective_batch_size_phase_1: int = 128, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L357
train_effective_batch_size_phase_2: int = 3840, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L371
train_learning_rate_phase_1: float = 2e-05, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L360
train_learning_rate_phase_2: float = 6e-06, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L374
train_num_warmup_steps_phase_1: int = 1000, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L874
train_num_warmup_steps_phase_2: int = 1000, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L874
train_save_samples: int = 250000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L393
train_max_batch_len: int = 5000, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L334
train_seed: int = 42, # https://github.com/instructlab/training/blob/v0.6.1/src/instructlab/training/main_ds.py#L901
# MT Bench
mt_bench_max_workers: str = MAX_WORKERS,
mt_bench_merge_system_user_message: bool = MERGE_SYSTEM_USER_MESSAGE,
mt_bench_max_workers: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L74
mt_bench_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474
# Final evaluation
final_eval_max_workers: str = MAX_WORKERS,
final_eval_few_shots: int = FEW_SHOTS,
final_eval_batch_size: str = BATCH_SIZE,
final_eval_merge_system_user_message: bool = MERGE_SYSTEM_USER_MESSAGE,
final_eval_max_workers: str = "auto", # FIXME: Not present in default config. Need justification?
final_eval_few_shots: int = 5, # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L56
final_eval_batch_size: str = "auto", # https://github.com/instructlab/instructlab/blob/v0.21.2/tests/testdata/default_config.yaml#L52
final_eval_merge_system_user_message: bool = False, # https://github.com/instructlab/instructlab/blob/v0.21.2/src/instructlab/model/evaluate.py#L474
# Other options
k8s_storage_class_name: str = "standard", # FIXME: https://github.com/kubeflow/pipelines/issues/11396, https://issues.redhat.com/browse/RHOAIRFE-470
):
@@ -558,7 +537,12 @@ def run(mock, experiment, run_name, param):
dev_arguments = {
"k8s_storage_class_name": "nfs-csi",
"sdg_base_model": "s3://ilab-pipeline-b1d4c2b1-ab00-4e7f-b985-697bda3df385/instructlab-base-importer/648f36d0-e3f0-43b8-8adb-530576beb675/ilab-importer-op/model/granite-7b-starter",
"train_nproc_per_node": 2,
"train_num_epochs_phase_1": 2,
"train_num_epochs_phase_2": 2,
"train_num_warmup_steps_phase_1": 100,
"train_num_warmup_steps_phase_2": 100,
"train_learning_rate_phase_1": 1e-4,
"train_learning_rate_phase_2": 1e-4,
}

try:
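
For context on how the overrides above are meant to be used: the sketch below shows one way the dev_arguments dictionary could be passed when submitting a run of the compiled pipeline with the Kubeflow Pipelines client, so the dev values override the ilab defaults now baked into the pipeline signature. This is a minimal illustration, not part of the commit; the KFP host, run name, and experiment name are placeholders, and the sdg_base_model entry is omitted for brevity.

import kfp

# Dev overrides mirroring dev_arguments in pipeline.py: fewer epochs, fewer
# warmup steps, and a higher learning rate for quick iteration. The
# sdg_base_model entry from the real dict is left out here for brevity.
dev_arguments = {
    "k8s_storage_class_name": "nfs-csi",
    "train_nproc_per_node": 2,
    "train_num_epochs_phase_1": 2,
    "train_num_epochs_phase_2": 2,
    "train_num_warmup_steps_phase_1": 100,
    "train_num_warmup_steps_phase_2": 100,
    "train_learning_rate_phase_1": 1e-4,
    "train_learning_rate_phase_2": 1e-4,
}

# Placeholder endpoint. Any parameter not listed in `arguments` falls back to
# the ilab default compiled into pipeline.yaml.
client = kfp.Client(host="https://<KFP_HOST>")
client.create_run_from_pipeline_package(
    "pipeline.yaml",
    arguments=dev_arguments,
    run_name="ilab-dev-run",
    experiment_name="ilab-dev",
)
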
pipeline.yaml: 26 additions & 26 deletions
@@ -10,25 +10,25 @@
# mt_bench_max_workers: str [Default: 'auto']
# mt_bench_merge_system_user_message: bool [Default: False]
# sdg_base_model: str [Default: 's3://<BUCKET>/<PATH_TO_MODEL>']
# sdg_max_batch_len: int [Default: 20000.0]
# sdg_pipeline: str [Default: 'simple']
# sdg_max_batch_len: int [Default: 5000.0]
# sdg_pipeline: str [Default: 'full']
# sdg_repo_branch: str
# sdg_repo_pr: int
# sdg_repo_url: str [Default: 'https://github.com/instructlab/taxonomy.git']
# sdg_sample_size: float [Default: 1.0]
# sdg_scale_factor: int [Default: 2.0]
# train_effective_batch_size_phase_1: int [Default: 3840.0]
# sdg_scale_factor: int [Default: 30.0]
# train_effective_batch_size_phase_1: int [Default: 128.0]
# train_effective_batch_size_phase_2: int [Default: 3840.0]
# train_learning_rate_phase_1: float [Default: 0.0001]
# train_learning_rate_phase_2: float [Default: 0.0001]
# train_max_batch_len: int [Default: 20000.0]
# train_learning_rate_phase_1: float [Default: 2e-05]
# train_learning_rate_phase_2: float [Default: 6e-06]
# train_max_batch_len: int [Default: 5000.0]
# train_nnodes: int [Default: 2.0]
# train_nproc_per_node: int [Default: 3.0]
# train_num_epochs_phase_1: int [Default: 2.0]
# train_num_epochs_phase_2: int [Default: 2.0]
# train_num_warmup_steps_phase_1: int [Default: 100.0]
# train_num_warmup_steps_phase_2: int [Default: 100.0]
# train_save_samples: int [Default: 0.0]
# train_nproc_per_node: int [Default: 2.0]
# train_num_epochs_phase_1: int [Default: 7.0]
# train_num_epochs_phase_2: int [Default: 10.0]
# train_num_warmup_steps_phase_1: int [Default: 1000.0]
# train_num_warmup_steps_phase_2: int [Default: 1000.0]
# train_save_samples: int [Default: 250000.0]
# train_seed: int [Default: 42.0]
components:
comp-createpvc:
@@ -2115,13 +2115,13 @@ root:
isOptional: true
parameterType: STRING
sdg_max_batch_len:
defaultValue: 20000.0
defaultValue: 5000.0
description: SDG parameter. Maximum tokens per gpu for each batch that will
be handled in a single step.
isOptional: true
parameterType: NUMBER_INTEGER
sdg_pipeline:
defaultValue: simple
defaultValue: full
description: 'SDG parameter. Data generation pipeline to use. Available: ''simple'',
''full'', or a valid path to a directory of pipeline workflow YAML files.
Note that ''full'' requires a larger teacher model, Mixtral-8x7b.'
@@ -2149,12 +2149,12 @@
isOptional: true
parameterType: NUMBER_DOUBLE
sdg_scale_factor:
defaultValue: 2.0
defaultValue: 30.0
description: SDG parameter. The total number of instructions to be generated.
isOptional: true
parameterType: NUMBER_INTEGER
train_effective_batch_size_phase_1:
defaultValue: 3840.0
defaultValue: 128.0
description: Training parameter for in Phase 1. The number of samples in a
batch that the model should see before its parameters are updated.
isOptional: true
@@ -2166,23 +2166,23 @@
isOptional: true
parameterType: NUMBER_INTEGER
train_learning_rate_phase_1:
defaultValue: 0.0001
defaultValue: 2.0e-05
description: Training parameter for in Phase 1. How fast we optimize the weights
during gradient descent. Higher values may lead to unstable learning performance.
It's generally recommended to have a low learning rate with a high effective
batch size.
isOptional: true
parameterType: NUMBER_DOUBLE
train_learning_rate_phase_2:
defaultValue: 0.0001
defaultValue: 6.0e-06
description: Training parameter for in Phase 2. How fast we optimize the weights
during gradient descent. Higher values may lead to unstable learning performance.
It's generally recommended to have a low learning rate with a high effective
batch size.
isOptional: true
parameterType: NUMBER_DOUBLE
train_max_batch_len:
defaultValue: 20000.0
defaultValue: 5000.0
description: Training parameter. Maximum tokens per gpu for each batch that
will be handled in a single step.
isOptional: true
@@ -2193,37 +2193,37 @@
isOptional: true
parameterType: NUMBER_INTEGER
train_nproc_per_node:
defaultValue: 3.0
defaultValue: 2.0
description: Training parameter. Number of GPUs per each node/worker to use
for training.
isOptional: true
parameterType: NUMBER_INTEGER
train_num_epochs_phase_1:
defaultValue: 2.0
defaultValue: 7.0
description: Training parameter for in Phase 1. Number of epochs to run training.
isOptional: true
parameterType: NUMBER_INTEGER
train_num_epochs_phase_2:
defaultValue: 2.0
defaultValue: 10.0
description: Training parameter for in Phase 2. Number of epochs to run training.
isOptional: true
parameterType: NUMBER_INTEGER
train_num_warmup_steps_phase_1:
defaultValue: 100.0
defaultValue: 1000.0
description: Training parameter for in Phase 1. The number of steps a model
should go through before reaching the full learning rate. We start at 0
and linearly climb up to train_learning_rate.
isOptional: true
parameterType: NUMBER_INTEGER
train_num_warmup_steps_phase_2:
defaultValue: 100.0
defaultValue: 1000.0
description: Training parameter for in Phase 2. The number of steps a model
should go through before reaching the full learning rate. We start at 0
and linearly climb up to train_learning_rate.
isOptional: true
parameterType: NUMBER_INTEGER
train_save_samples:
defaultValue: 0.0
defaultValue: 250000.0
description: Training parameter. Number of samples the model should see before
saving a checkpoint.
isOptional: true

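Because pipeline.yaml is generated from pipeline.py, the compiled defaults above need to be regenerated whenever the Python signature changes. A minimal sketch of that recompile step follows, assuming ilab_pipeline_wrapper returns the decorated pipeline function when called with no mocked stages; that calling convention is inferred from the wrapper shown earlier, not verified in this commit.

from kfp import compiler

# Assumes both names are importable from pipeline.py as shown in the diff above.
from pipeline import PIPELINE_FILE_NAME, ilab_pipeline_wrapper

# Build the pipeline with no stages mocked (assumption: the wrapper returns the
# @dsl.pipeline-decorated function) and compile it so the defaults recorded in
# pipeline.yaml stay in sync with the new signature defaults.
pipeline_func = ilab_pipeline_wrapper(mock=[])
compiler.Compiler().compile(pipeline_func, package_path=PIPELINE_FILE_NAME)

After recompiling, the Default values listed in the YAML header comment (sdg_scale_factor: 30, sdg_pipeline: 'full', train_learning_rate_phase_1: 2e-05, and so on) should match the Python defaults, which is what this commit does by updating both files together.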