Remove hardcoded training image reference
- Already defined as a const
gmfrasca committed Jan 7, 2025
1 parent 723fb6c commit 3894909
Showing 2 changed files with 99 additions and 101 deletions.
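
The underlying edit is small: `pytorchjob_manifest_op` used to pin the training image inline (`image = "registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1"`) and interpolate that local variable into the PyTorchJob manifest; the compiled pipeline now interpolates the shared `RHELAI_IMAGE` constant directly. Below is a minimal sketch of the component after the change — the parameter list and manifest are heavily trimmed, and the constant's value and definition site are assumptions (the value is copied from the removed hardcoded string; the diff only shows that the constant already exists elsewhere in the repo).

```python
import inspect

# Assumption: RHELAI_IMAGE is the constant the commit message refers to; its
# value is copied from the removed hardcoded string, and its real definition
# lives elsewhere in the repo (not shown in this diff).
RHELAI_IMAGE = "registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1"


def pytorchjob_manifest_op(name_suffix: str, phase_num: int, nproc_per_node: int = 3) -> str:
    """Trimmed sketch of the component after the change: the local
    `image = "..."` assignment is gone and the manifest template
    interpolates the shared constant instead."""
    name = f"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}"
    return inspect.cleandoc(
        f"""
        apiVersion: kubeflow.org/v1
        kind: PyTorchJob
        metadata:
          name: {name}
        spec:
          nprocPerNode: "{nproc_per_node}"
          pytorchReplicaSpecs:
            Master:
              replicas: 1
              template:
                spec:
                  containers:
                    - name: pytorch
                      image: {RHELAI_IMAGE}  # previously `image: {{image}}` from a local variable
        """
    )


# Example: render the phase-1 manifest and confirm the image comes from the constant.
print(pytorchjob_manifest_op("demo-sdg", phase_num=1))
```
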
pipeline.yaml: 156 changes (78 additions, 78 deletions)
@@ -730,33 +730,33 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef pytorchjob_manifest_op(\n model_pvc_name: str,\n input_pvc_name:\
\ str,\n output_pvc_name: str,\n name_suffix: str,\n # path_to_model:\
\ str,\n phase_num: int,\n nproc_per_node: int = 3,\n nnodes: int\
\ = 2,\n num_epochs: int = 2,\n effective_batch_size: int = 3840,\n\
\ learning_rate: float = 1e-4,\n num_warmup_steps: int = 800,\n \
\ save_samples: int = 0,\n max_batch_len: int = 20000,\n seed: int\
\ = 42,\n):\n import inspect\n import os\n import time\n\n import\
\ *\n\ndef pytorchjob_manifest_op(\n model_pvc_name: str,\n \
\ input_pvc_name: str,\n output_pvc_name: str,\n name_suffix:\
\ str,\n # path_to_model: str,\n phase_num: int,\n \
\ nproc_per_node: int = 3,\n nnodes: int = 2,\n num_epochs:\
\ int = 2,\n effective_batch_size: int = 3840,\n learning_rate:\
\ float = 1e-4,\n num_warmup_steps: int = 800,\n save_samples:\
\ int = 0,\n max_batch_len: int = 20000,\n seed: int = 42,\n\
\ ):\n import inspect\n import os\n import time\n\n import\
\ kubernetes\n import urllib3\n import yaml\n\n def list_phase1_final_model():\n\
\ model_dir = \"/output/phase_1/model/hf_format\"\n models\
\ = os.listdir(model_dir)\n newest_idx = max(\n (os.path.getmtime(f\"\
{model_dir}/{model}\"), i)\n for i, model in enumerate(models)\n\
\ )[-1]\n newest_model = models[newest_idx]\n return\
\ f\"{model_dir}/{newest_model}\"\n\n name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\
\ = os.listdir(model_dir)\n newest_idx = max(\n (os.path.getmtime(f\"\
{model_dir}/{model}\"), i)\n for i, model in enumerate(models)\n\
\ )[-1]\n newest_model = models[newest_idx]\n \
\ return f\"{model_dir}/{newest_model}\"\n\n name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\
\n\n if phase_num == 1:\n path_to_model = \"/input_model\"\n \
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
\ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\
Unsupported value of {phase_num=}\")\n\n image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\"\
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\
\"\n pytorchReplicaSpecs:\n Master:\n replicas:\
\ 1\n restartPolicy: OnFailure\n template:\n \
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n echo \"Running phase {phase_num}\"\
Unsupported value of {phase_num=}\")\n\n manifest = inspect.cleandoc(\n\
\ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\
\ metadata:\n name: {name}\n spec:\n nprocPerNode:\
\ \\\"{nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
\ Master:\n replicas: 1\n restartPolicy: OnFailure\n\
\ template:\n metadata:\n annotations:\n\
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n echo \"Running phase {phase_num}\"\
\n echo \"Using {path_to_model} model for training\"\
\n echo \"Using {path_to_data} data for training\"\
\n mkdir -p /output/phase_{phase_num}/model;\n\
@@ -781,23 +781,23 @@ deploymentSpec:
\ --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n image:\
\ {image}\n name: pytorch\n volumeMounts:\n\
\ - mountPath: /input_data\n \
\ name: input-data\n readOnly: true\n \
\ - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n name: output\n\
\ env:\n - name: NNODES\n \
\ value: \\\"{nnodes}\\\"\n \
\ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\
\"\n - name: XDG_CACHE_HOME\n \
\ value: /tmp\n - name: TRITON_CACHE_DIR\n\
\ {RHELAI_IMAGE}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n env:\n - name:\
\ NNODES\n value: \\\"{nnodes}\\\"\n \
\ - name: NPROC_PER_NODE\n value:\
\ \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\
\ value: /tmp\n - name:\
\ HF_HOME\n value: /tmp\n \
\ - name: TRANSFORMERS_CACHE\n value: /tmp\n\
\ resources:\n requests:\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n \"nvidia.com/gpu\"\
\ TRITON_CACHE_DIR\n value: /tmp\n \
\ - name: HF_HOME\n value: /tmp\n \
\ - name: TRANSFORMERS_CACHE\n \
\ value: /tmp\n resources:\n \
\ requests:\n \"nvidia.com/gpu\": {nproc_per_node}\n\
\ limits:\n \"nvidia.com/gpu\"\
: {nproc_per_node}\n volumes:\n - name:\
\ input-data\n persistentVolumeClaim:\n \
\ claimName: {input_pvc_name}\n - name: model\n\
@@ -831,8 +831,8 @@ deploymentSpec:
\ \\\n --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ image: {RHELAI_IMAGE}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
@@ -934,33 +934,33 @@ deploymentSpec:
'
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef pytorchjob_manifest_op(\n model_pvc_name: str,\n input_pvc_name:\
\ str,\n output_pvc_name: str,\n name_suffix: str,\n # path_to_model:\
\ str,\n phase_num: int,\n nproc_per_node: int = 3,\n nnodes: int\
\ = 2,\n num_epochs: int = 2,\n effective_batch_size: int = 3840,\n\
\ learning_rate: float = 1e-4,\n num_warmup_steps: int = 800,\n \
\ save_samples: int = 0,\n max_batch_len: int = 20000,\n seed: int\
\ = 42,\n):\n import inspect\n import os\n import time\n\n import\
\ *\n\ndef pytorchjob_manifest_op(\n model_pvc_name: str,\n \
\ input_pvc_name: str,\n output_pvc_name: str,\n name_suffix:\
\ str,\n # path_to_model: str,\n phase_num: int,\n \
\ nproc_per_node: int = 3,\n nnodes: int = 2,\n num_epochs:\
\ int = 2,\n effective_batch_size: int = 3840,\n learning_rate:\
\ float = 1e-4,\n num_warmup_steps: int = 800,\n save_samples:\
\ int = 0,\n max_batch_len: int = 20000,\n seed: int = 42,\n\
\ ):\n import inspect\n import os\n import time\n\n import\
\ kubernetes\n import urllib3\n import yaml\n\n def list_phase1_final_model():\n\
\ model_dir = \"/output/phase_1/model/hf_format\"\n models\
\ = os.listdir(model_dir)\n newest_idx = max(\n (os.path.getmtime(f\"\
{model_dir}/{model}\"), i)\n for i, model in enumerate(models)\n\
\ )[-1]\n newest_model = models[newest_idx]\n return\
\ f\"{model_dir}/{newest_model}\"\n\n name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\
\ = os.listdir(model_dir)\n newest_idx = max(\n (os.path.getmtime(f\"\
{model_dir}/{model}\"), i)\n for i, model in enumerate(models)\n\
\ )[-1]\n newest_model = models[newest_idx]\n \
\ return f\"{model_dir}/{newest_model}\"\n\n name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\
\n\n if phase_num == 1:\n path_to_model = \"/input_model\"\n \
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
\ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\
Unsupported value of {phase_num=}\")\n\n image = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\"\
\n\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\"{nproc_per_node}\\\
\"\n pytorchReplicaSpecs:\n Master:\n replicas:\
\ 1\n restartPolicy: OnFailure\n template:\n \
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n echo \"Running phase {phase_num}\"\
Unsupported value of {phase_num=}\")\n\n manifest = inspect.cleandoc(\n\
\ f\"\"\"\n apiVersion: kubeflow.org/v1\n kind: PyTorchJob\n\
\ metadata:\n name: {name}\n spec:\n nprocPerNode:\
\ \\\"{nproc_per_node}\\\"\n pytorchReplicaSpecs:\n \
\ Master:\n replicas: 1\n restartPolicy: OnFailure\n\
\ template:\n metadata:\n annotations:\n\
\ sidecar.istio.io/inject: 'false'\n spec:\n\
\ containers:\n - args:\n \
\ - |\n echo \"Running phase {phase_num}\"\
\n echo \"Using {path_to_model} model for training\"\
\n echo \"Using {path_to_data} data for training\"\
\n mkdir -p /output/phase_{phase_num}/model;\n\
@@ -985,23 +985,23 @@ deploymentSpec:
\ --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n image:\
\ {image}\n name: pytorch\n volumeMounts:\n\
\ - mountPath: /input_data\n \
\ name: input-data\n readOnly: true\n \
\ - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n name: output\n\
\ env:\n - name: NNODES\n \
\ value: \\\"{nnodes}\\\"\n \
\ - name: NPROC_PER_NODE\n value: \\\"{nproc_per_node}\\\
\"\n - name: XDG_CACHE_HOME\n \
\ value: /tmp\n - name: TRITON_CACHE_DIR\n\
\ {RHELAI_IMAGE}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
\ - mountPath: /output\n \
\ name: output\n env:\n - name:\
\ NNODES\n value: \\\"{nnodes}\\\"\n \
\ - name: NPROC_PER_NODE\n value:\
\ \\\"{nproc_per_node}\\\"\n - name: XDG_CACHE_HOME\n\
\ value: /tmp\n - name:\
\ HF_HOME\n value: /tmp\n \
\ - name: TRANSFORMERS_CACHE\n value: /tmp\n\
\ resources:\n requests:\n \
\ \"nvidia.com/gpu\": {nproc_per_node}\n \
\ limits:\n \"nvidia.com/gpu\"\
\ TRITON_CACHE_DIR\n value: /tmp\n \
\ - name: HF_HOME\n value: /tmp\n \
\ - name: TRANSFORMERS_CACHE\n \
\ value: /tmp\n resources:\n \
\ requests:\n \"nvidia.com/gpu\": {nproc_per_node}\n\
\ limits:\n \"nvidia.com/gpu\"\
: {nproc_per_node}\n volumes:\n - name:\
\ input-data\n persistentVolumeClaim:\n \
\ claimName: {input_pvc_name}\n - name: model\n\
@@ -1035,8 +1035,8 @@ deploymentSpec:
\ \\\n --checkpoint_at_epoch\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
\ image: {image}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ image: {RHELAI_IMAGE}\n name: pytorch\n \
\ volumeMounts:\n - mountPath: /input_data\n\
\ name: input-data\n readOnly:\
\ true\n - mountPath: /input_model\n \
\ name: model\n readOnly: true\n \
