Skip to content

Commit

Permalink
Remove hardcoded training image reference
Browse files Browse the repository at this point in the history
- Already defined as a const
  • Loading branch information
gmfrasca committed Jan 7, 2025
1 parent 723fb6c commit 41ae2b7
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 60 deletions.
110 changes: 54 additions & 56 deletions pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -747,16 +747,15 @@ deploymentSpec:
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
\ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\
Unsupported value of {phase_num=}\")\n\n image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\"\
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\
\"\n pytorchReplicaSpecs:\n Master:\n replicas:\
\ 1\n restartPolicy: OnFailure\n template:\n \
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n echo \"Running phase {phase_num}\"\
Unsupported value of {phase_num=}\")\n\n manifest = inspect.cleandoc(\n\
\ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\
\ metadata:\n name: {name}\n spec:\n nprocPerNode:\
\ \\\"{nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
\ Master:\n replicas: 1\n restartPolicy: OnFailure\n\
\ template:\n metadata:\n annotations:\n\
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n echo \"Running phase {phase_num}\"\
\n echo \"Using {path_to_model} model for training\"\
\n echo \"Using {path_to_data} data for training\"\
\n mkdir -p /output/phase_{phase_num}/model;\n\
Expand All @@ -781,23 +780,23 @@ deploymentSpec:
\ --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n image:\
\ {image}\n name: pytorch\n volumeMounts:\n\
\ - mountPath: /input_data\n \
\ name: input-data\n readOnly: true\n \
\ - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n name: output\n\
\ env:\n - name: NNODES\n \
\ value: \\\"{nnodes}\\\"\n \
\ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\
\"\n - name: XDG_CACHE_HOME\n \
\ value: /tmp\n - name: TRITON_CACHE_DIR\n\
\ {RHELAI_IMAGE}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n env:\n - name:\
\ NNODES\n value: \\\"{nnodes}\\\"\n \
\ - name: NPROC_PER_NODE\n value:\
\ \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\
\ value: /tmp\n - name:\
\ HF_HOME\n value: /tmp\n \
\ - name: TRANSFORMERS_CACHE\n value: /tmp\n\
\ resources:\n requests:\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n \"nvidia.com/gpu\"\
\ TRITON_CACHE_DIR\n value: /tmp\n \
\ - name: HF_HOME\n value: /tmp\n \
\ - name: TRANSFORMERS_CACHE\n \
\ value: /tmp\n resources:\n \
\ requests:\n \"nvidia.com/gpu\": {nproc_per_node}\n\
\ limits:\n \"nvidia.com/gpu\"\
: {nproc_per_node}\n volumes:\n - name:\
\ input-data\n persistentVolumeClaim:\n \
\ claimName: {input_pvc_name}\n - name: model\n\
Expand Down Expand Up @@ -831,8 +830,8 @@ deploymentSpec:
\ \\\n --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ image: {RHELAI_IMAGE}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
Expand Down Expand Up @@ -951,16 +950,15 @@ deploymentSpec:
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
\ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\
Unsupported value of {phase_num=}\")\n\n image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\"\
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\
\"\n pytorchReplicaSpecs:\n Master:\n replicas:\
\ 1\n restartPolicy: OnFailure\n template:\n \
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n echo \"Running phase {phase_num}\"\
Unsupported value of {phase_num=}\")\n\n manifest = inspect.cleandoc(\n\
\ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\
\ metadata:\n name: {name}\n spec:\n nprocPerNode:\
\ \\\"{nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
\ Master:\n replicas: 1\n restartPolicy: OnFailure\n\
\ template:\n metadata:\n annotations:\n\
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n echo \"Running phase {phase_num}\"\
\n echo \"Using {path_to_model} model for training\"\
\n echo \"Using {path_to_data} data for training\"\
\n mkdir -p /output/phase_{phase_num}/model;\n\
Expand All @@ -985,23 +983,23 @@ deploymentSpec:
\ --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n image:\
\ {image}\n name: pytorch\n volumeMounts:\n\
\ - mountPath: /input_data\n \
\ name: input-data\n readOnly: true\n \
\ - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n name: output\n\
\ env:\n - name: NNODES\n \
\ value: \\\"{nnodes}\\\"\n \
\ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\
\"\n - name: XDG_CACHE_HOME\n \
\ value: /tmp\n - name: TRITON_CACHE_DIR\n\
\ {RHELAI_IMAGE}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n env:\n - name:\
\ NNODES\n value: \\\"{nnodes}\\\"\n \
\ - name: NPROC_PER_NODE\n value:\
\ \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\
\ value: /tmp\n - name:\
\ HF_HOME\n value: /tmp\n \
\ - name: TRANSFORMERS_CACHE\n value: /tmp\n\
\ resources:\n requests:\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n \"nvidia.com/gpu\"\
\ TRITON_CACHE_DIR\n value: /tmp\n \
\ - name: HF_HOME\n value: /tmp\n \
\ - name: TRANSFORMERS_CACHE\n \
\ value: /tmp\n resources:\n \
\ requests:\n \"nvidia.com/gpu\": {nproc_per_node}\n\
\ limits:\n \"nvidia.com/gpu\"\
: {nproc_per_node}\n volumes:\n - name:\
\ input-data\n persistentVolumeClaim:\n \
\ claimName: {input_pvc_name}\n - name: model\n\
Expand Down Expand Up @@ -1035,8 +1033,8 @@ deploymentSpec:
\ \\\n --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ image: {RHELAI_IMAGE}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
Expand Down
6 changes: 2 additions & 4 deletions training/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,8 +167,6 @@ def list_phase1_final_model():
else:
raise RuntimeError(f"Unsupported value of {phase_num=}")

image = "registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1"

manifest = inspect.cleandoc(
f"""
apiVersion: kubeflow.org/v1
Expand Down Expand Up @@ -218,7 +216,7 @@ def list_phase1_final_model():
- /bin/bash
- '-c'
- '--'
image: {image}
image: {RHELAI_IMAGE}
name: pytorch
volumeMounts:
- mountPath: /input_data
Expand Down Expand Up @@ -296,7 +294,7 @@ def list_phase1_final_model():
- /bin/bash
- '-c'
- '--'
image: {image}
image: {RHELAI_IMAGE}
name: pytorch
volumeMounts:
- mountPath: /input_data
Expand Down

0 comments on commit 41ae2b7

Please sign in to comment.