diff --git a/pipeline.yaml b/pipeline.yaml index e45acc8b..a143f3e1 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -779,7 +779,7 @@ deploymentSpec: \ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n if phase_name\ \ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\ \ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\ - \ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\ + \ image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2\"\ \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ \ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\ @@ -798,31 +798,31 @@ deploymentSpec: \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\ \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\ \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ - \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath: /input_data\n\ - \ name: input-data\n readOnly:\ - \ true\n - mountPath: /input_model\n \ - \ name: model\n readOnly: true\n \ - \ - mountPath: /output\n \ - \ name: output\n env:\n - name:\ - \ NNODES\n value: \\\\\"{nnodes}\\\\\"\n \ - \ - name: NPROC_PER_NODE\n value:\ - \ \\\\\"{nproc_per_node}\\\\\"\n resources:\n \ - \ requests:\n cpu: 2\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 2\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ - \ - name: input-data\n persistentVolumeClaim:\n\ - \ claimName: {input_pvc_name}\n \ - \ - name: model\n persistentVolumeClaim:\n \ - \ claimName: {model_pvc_name}\n - name:\ - \ output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n Worker:\n \ - \ replicas: {nnodes-1}\n restartPolicy: OnFailure\n \ - \ template:\n metadata:\n annotations:\n\ + \ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath:\ + \ /input_data\n name: input-data\n \ + \ readOnly: true\n - mountPath: /input_model\n\ + \ name: model\n readOnly:\ + \ true\n - mountPath: /output\n \ + \ name: output\n env:\n \ + \ - name: NNODES\n value: \\\\\"{nnodes}\\\\\ + \"\n - name: NPROC_PER_NODE\n \ + \ value: \\\\\"{nproc_per_node}\\\\\"\n resources:\n\ + \ requests:\n cpu: 2\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ limits:\n cpu: 2\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ volumes:\n - name: input-data\n \ + \ persistentVolumeClaim:\n claimName: {input_pvc_name}\n\ + \ - name: model\n persistentVolumeClaim:\n\ + \ claimName: {model_pvc_name}\n \ + \ - name: output\n persistentVolumeClaim:\n \ + \ claimName: {output_pvc_name}\n Worker:\n\ + \ replicas: {nnodes-1}\n restartPolicy: OnFailure\n\ + \ template:\n metadata:\n annotations:\n\ \ sidecar.istio.io/inject: 'false'\n spec:\n\ \ containers:\n - args:\n \ \ - |\n mkdir -p /tmp/model;\n \ @@ -833,21 +833,21 @@ deploymentSpec: \ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\ \ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \ \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\ - \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\ + \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\ \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ - \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath: /input_data\n\ - \ name: input-data\n readOnly:\ - \ true\n - mountPath: /input_model\n \ - \ name: model\n readOnly: true\n \ - \ - mountPath: /output\n \ - \ name: output\n readOnly: true\n \ - \ env:\n - name: NNODES\n \ - \ value: \\\\\"{nnodes}\\\\\"\n - name:\ - \ NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\ + \ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath:\ + \ /input_data\n name: input-data\n \ + \ readOnly: true\n - mountPath: /input_model\n\ + \ name: model\n readOnly:\ + \ true\n - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\\\"{nnodes}\\\\\"\n \ + \ - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\ \\\"\n resources:\n requests:\n\ \ cpu: 2\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n limits:\n \ @@ -899,7 +899,7 @@ deploymentSpec: \ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n if phase_name\ \ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\ \ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\ - \ image = \"registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989\"\ + \ image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2\"\ \n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\ \ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \ \ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\ @@ -918,31 +918,31 @@ deploymentSpec: \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model\ \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\ \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ - \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath: /input_data\n\ - \ name: input-data\n readOnly:\ - \ true\n - mountPath: /input_model\n \ - \ name: model\n readOnly: true\n \ - \ - mountPath: /output\n \ - \ name: output\n env:\n - name:\ - \ NNODES\n value: \\\\\"{nnodes}\\\\\"\n \ - \ - name: NPROC_PER_NODE\n value:\ - \ \\\\\"{nproc_per_node}\\\\\"\n resources:\n \ - \ requests:\n cpu: 2\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n \ - \ limits:\n cpu: 2\n \ - \ \"nvidia.com/gpu\": {nproc_per_node}\n volumes:\n\ - \ - name: input-data\n persistentVolumeClaim:\n\ - \ claimName: {input_pvc_name}\n \ - \ - name: model\n persistentVolumeClaim:\n \ - \ claimName: {model_pvc_name}\n - name:\ - \ output\n persistentVolumeClaim:\n \ - \ claimName: {output_pvc_name}\n Worker:\n \ - \ replicas: {nnodes-1}\n restartPolicy: OnFailure\n \ - \ template:\n metadata:\n annotations:\n\ + \ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath:\ + \ /input_data\n name: input-data\n \ + \ readOnly: true\n - mountPath: /input_model\n\ + \ name: model\n readOnly:\ + \ true\n - mountPath: /output\n \ + \ name: output\n env:\n \ + \ - name: NNODES\n value: \\\\\"{nnodes}\\\\\ + \"\n - name: NPROC_PER_NODE\n \ + \ value: \\\\\"{nproc_per_node}\\\\\"\n resources:\n\ + \ requests:\n cpu: 2\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ limits:\n cpu: 2\n \ + \ \"nvidia.com/gpu\": {nproc_per_node}\n \ + \ volumes:\n - name: input-data\n \ + \ persistentVolumeClaim:\n claimName: {input_pvc_name}\n\ + \ - name: model\n persistentVolumeClaim:\n\ + \ claimName: {model_pvc_name}\n \ + \ - name: output\n persistentVolumeClaim:\n \ + \ claimName: {output_pvc_name}\n Worker:\n\ + \ replicas: {nnodes-1}\n restartPolicy: OnFailure\n\ + \ template:\n metadata:\n annotations:\n\ \ sidecar.istio.io/inject: 'false'\n spec:\n\ \ containers:\n - args:\n \ \ - |\n mkdir -p /tmp/model;\n \ @@ -953,21 +953,21 @@ deploymentSpec: \ --node_rank \\$(RANK) --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT)\ \ -m instructlab.training.main_ds --model_name_or_path={path_to_model} \ \ --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model\ - \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800\ + \ --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800\ \ --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer\ - \ --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch\n \ - \ command:\n - /bin/bash\n \ - \ - '-c'\n - '--'\n \ - \ image: {image}\n name: pytorch\n \ - \ volumeMounts:\n - mountPath: /input_data\n\ - \ name: input-data\n readOnly:\ - \ true\n - mountPath: /input_model\n \ - \ name: model\n readOnly: true\n \ - \ - mountPath: /output\n \ - \ name: output\n readOnly: true\n \ - \ env:\n - name: NNODES\n \ - \ value: \\\\\"{nnodes}\\\\\"\n - name:\ - \ NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\ + \ --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch\n\ + \ command:\n - /bin/bash\n \ + \ - '-c'\n - '--'\n \ + \ image: {image}\n name: pytorch\n \ + \ volumeMounts:\n - mountPath:\ + \ /input_data\n name: input-data\n \ + \ readOnly: true\n - mountPath: /input_model\n\ + \ name: model\n readOnly:\ + \ true\n - mountPath: /output\n \ + \ name: output\n readOnly: true\n \ + \ env:\n - name: NNODES\n \ + \ value: \\\\\"{nnodes}\\\\\"\n \ + \ - name: NPROC_PER_NODE\n value: \\\\\"{nproc_per_node}\\\ \\\"\n resources:\n requests:\n\ \ cpu: 2\n \"nvidia.com/gpu\"\ : {nproc_per_node}\n limits:\n \ diff --git a/training/components.py b/training/components.py index f1239d46..b6792998 100644 --- a/training/components.py +++ b/training/components.py @@ -109,7 +109,7 @@ def list_phase1_final_model(): path_to_model = "/input_model/model" elif phase_name == "second": path_to_model = list_phase1_final_model() - image = "registry.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.1-1724960989" + image = "registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.2" manifest = inspect.cleandoc( f""" @@ -137,7 +137,7 @@ def list_phase1_final_model(): export TRITON_CACHE_DIR=/tmp export HF_HOME=/tmp export TRANSFORMERS_CACHE=/tmp - torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank \$(RANK) --rdzv_endpoint \$(MASTER_ADDR):\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model} --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch + torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank \$(RANK) --rdzv_endpoint \$(MASTER_ADDR):\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model} --data_path=/input_data/processed_data/data.jsonl --output_dir=/output/model --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch command: - /bin/bash - '-c' @@ -191,7 +191,7 @@ def list_phase1_final_model(): export XDG_CACHE_HOME=/tmp export HF_HOME=/tmp export TRANSFORMERS_CACHE=/tmp - torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank \$(RANK) --rdzv_endpoint \$(MASTER_ADDR):\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model} --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model --num_epochs=2 --effective_batch_size=3840 --learning_rate=2e-6 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --sharding_strategy=FULL_SHARD --is_granite --checkpoint_at_epoch + torchrun --nnodes {nnodes} --nproc_per_node {nproc_per_node} --node_rank \$(RANK) --rdzv_endpoint \$(MASTER_ADDR):\$(MASTER_PORT) -m instructlab.training.main_ds --model_name_or_path={path_to_model} --data_path=/input_data/processed_data/data.jsonl --output_dir=/tmp/model --num_epochs=2 --effective_batch_size=3840 --learning_rate=1e-4 --num_warmup_steps=800 --save_samples=0 --log_level=INFO --max_batch_len=20000 --seed=42 --cpu_offload_optimizer --distributed_training_framework fsdp --is_granite --checkpoint_at_epoch command: - /bin/bash - '-c'