Add RHEL 1.3.1 image
Shreyanand committed Dec 11, 2024
1 parent 67492e3 commit f21d8a4
Showing 2 changed files with 44 additions and 368 deletions.
84 changes: 42 additions & 42 deletions pipeline.yaml
@@ -765,27 +765,27 @@ deploymentSpec:
\ int = 3840,\n learning_rate: float = 1e-4,\n num_warmup_steps: int\
\ = 800,\n save_samples: int = 0,\n max_batch_len: int = 20000,\n\
\ seed: int = 42,\n job_timeout: int = 86400,\n delete_after_done:\
\ bool = False,\n):\n import time\n import logging\n from kubeflow.training\
\ import TrainingClient\n from kubeflow.training.utils import utils as\
\ kfto_utils\n from kubeflow.training import models\n import os\n\n\
\ def list_phase1_final_model():\n model_dir = \"/output/phase_1/model/hf_format\"\
\n model_list = os.listdir(model_dir)\n newest_idx = max(\n\
\ (os.path.getmtime(f\"{model_dir}/{model}\"), i)\n \
\ for i, model in enumerate(model_list)\n )[-1]\n newest_model\
\ = model_list[newest_idx]\n return f\"{model_dir}/{newest_model}\"\
\n\n if phase_num == 1:\n path_to_model = \"/input_model\"\n \
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
\ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\
Unsupported value of {phase_num=}\")\n\n resources_per_worker = {\"nvidia.com/gpu\"\
: nproc_per_node}\n\n base_image = \"quay.io/redhat-et/ilab:1.3\"\n \
\ name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\n \
\ command = [\"/bin/bash\", \"-c\", \"--\"]\n\n master_args = [\n \
\ f\"\"\"echo \"Running phase {phase_num}\"\n \
\ echo \"Using {path_to_model} model for training\"\n \
\ echo \"Using {path_to_data} data for training\"\n \
\ mkdir -p /output/phase_{phase_num}/model;\n \
\ mkdir -p /output/data;\n torchrun --nnodes\
\ bool = False,\n):\n import logging\n import os\n import time\n\
\n from kubeflow.training import TrainingClient, models\n from kubeflow.training.utils\
\ import utils as kfto_utils\n\n def list_phase1_final_model():\n \
\ model_dir = \"/output/phase_1/model/hf_format\"\n model_list\
\ = os.listdir(model_dir)\n newest_idx = max(\n (os.path.getmtime(f\"\
{model_dir}/{model}\"), i)\n for i, model in enumerate(model_list)\n\
\ )[-1]\n newest_model = model_list[newest_idx]\n return\
\ f\"{model_dir}/{newest_model}\"\n\n if phase_num == 1:\n path_to_model\
\ = \"/input_model\"\n path_to_data = \"/input_data/knowledge/data.jsonl\"\
\n elif phase_num == 2:\n path_to_model = list_phase1_final_model()\n\
\ path_to_data = \"/input_data/skills/data.jsonl\"\n else:\n \
\ raise RuntimeError(f\"Unsupported value of {phase_num=}\")\n\n \
\ resources_per_worker = {\"nvidia.com/gpu\": nproc_per_node}\n\n base_image\
\ = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\"\n\
\ name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\n\
\ command = [\"/bin/bash\", \"-c\", \"--\"]\n\n master_args = [\n\
\ f\"\"\"echo \"Running phase {phase_num}\"\n \
\ echo \"Using {path_to_model} model for training\"\n \
\ echo \"Using {path_to_data} data for training\"\n \
\ mkdir -p /output/phase_{phase_num}/model;\n \
\ mkdir -p /output/data;\n torchrun --nnodes\
\ {nnodes} \\\n --nproc_per_node {nproc_per_node}\
\ \\\n --node_rank \\$(RANK) \\\n \
\ --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT) \\\n\
@@ -930,27 +930,27 @@ deploymentSpec:
\ int = 3840,\n learning_rate: float = 1e-4,\n num_warmup_steps: int\
\ = 800,\n save_samples: int = 0,\n max_batch_len: int = 20000,\n\
\ seed: int = 42,\n job_timeout: int = 86400,\n delete_after_done:\
\ bool = False,\n):\n import time\n import logging\n from kubeflow.training\
\ import TrainingClient\n from kubeflow.training.utils import utils as\
\ kfto_utils\n from kubeflow.training import models\n import os\n\n\
\ def list_phase1_final_model():\n model_dir = \"/output/phase_1/model/hf_format\"\
\n model_list = os.listdir(model_dir)\n newest_idx = max(\n\
\ (os.path.getmtime(f\"{model_dir}/{model}\"), i)\n \
\ for i, model in enumerate(model_list)\n )[-1]\n newest_model\
\ = model_list[newest_idx]\n return f\"{model_dir}/{newest_model}\"\
\n\n if phase_num == 1:\n path_to_model = \"/input_model\"\n \
\ path_to_data = \"/input_data/knowledge/data.jsonl\"\n elif phase_num\
\ == 2:\n path_to_model = list_phase1_final_model()\n path_to_data\
\ = \"/input_data/skills/data.jsonl\"\n else:\n raise RuntimeError(f\"\
Unsupported value of {phase_num=}\")\n\n resources_per_worker = {\"nvidia.com/gpu\"\
: nproc_per_node}\n\n base_image = \"quay.io/redhat-et/ilab:1.3\"\n \
\ name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\n \
\ command = [\"/bin/bash\", \"-c\", \"--\"]\n\n master_args = [\n \
\ f\"\"\"echo \"Running phase {phase_num}\"\n \
\ echo \"Using {path_to_model} model for training\"\n \
\ echo \"Using {path_to_data} data for training\"\n \
\ mkdir -p /output/phase_{phase_num}/model;\n \
\ mkdir -p /output/data;\n torchrun --nnodes\
\ bool = False,\n):\n import logging\n import os\n import time\n\
\n from kubeflow.training import TrainingClient, models\n from kubeflow.training.utils\
\ import utils as kfto_utils\n\n def list_phase1_final_model():\n \
\ model_dir = \"/output/phase_1/model/hf_format\"\n model_list\
\ = os.listdir(model_dir)\n newest_idx = max(\n (os.path.getmtime(f\"\
{model_dir}/{model}\"), i)\n for i, model in enumerate(model_list)\n\
\ )[-1]\n newest_model = model_list[newest_idx]\n return\
\ f\"{model_dir}/{newest_model}\"\n\n if phase_num == 1:\n path_to_model\
\ = \"/input_model\"\n path_to_data = \"/input_data/knowledge/data.jsonl\"\
\n elif phase_num == 2:\n path_to_model = list_phase1_final_model()\n\
\ path_to_data = \"/input_data/skills/data.jsonl\"\n else:\n \
\ raise RuntimeError(f\"Unsupported value of {phase_num=}\")\n\n \
\ resources_per_worker = {\"nvidia.com/gpu\": nproc_per_node}\n\n base_image\
\ = \"registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1\"\n\
\ name = f\"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}\"\n\
\ command = [\"/bin/bash\", \"-c\", \"--\"]\n\n master_args = [\n\
\ f\"\"\"echo \"Running phase {phase_num}\"\n \
\ echo \"Using {path_to_model} model for training\"\n \
\ echo \"Using {path_to_data} data for training\"\n \
\ mkdir -p /output/phase_{phase_num}/model;\n \
\ mkdir -p /output/data;\n torchrun --nnodes\
\ {nnodes} \\\n --nproc_per_node {nproc_per_node}\
\ \\\n --node_rank \\$(RANK) \\\n \
\ --rdzv_endpoint \\$(MASTER_ADDR):\\$(MASTER_PORT) \\\n\
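For readability, the added side of the hunk above is decoded below from the folded YAML scalar into plain Python. This is a sketch, not the full component: the enclosing function name and signature are assumptions (only the parameters referenced in the visible lines are listed), and the body stops where the hunk does. The substantive change in this commit is the base_image value, which moves from quay.io/redhat-et/ilab:1.3 to registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1; the import block is also regrouped. The hunk at line 930 applies the identical edit to a second occurrence of this code in pipeline.yaml.

# Decoded from the folded YAML above. The component's real name and full
# parameter list are not visible in this hunk, so the name and signature
# below are placeholders; the body is copied from the added lines.
def train_component(phase_num: int, nnodes: int, nproc_per_node: int, name_suffix: str):
    # Regrouped imports (stdlib first, then kubeflow). TrainingClient, models,
    # kfto_utils, logging, and time are used further down in the component,
    # past the end of the visible hunk.
    import logging
    import os
    import time

    from kubeflow.training import TrainingClient, models
    from kubeflow.training.utils import utils as kfto_utils

    def list_phase1_final_model():
        # Return the most recently modified checkpoint produced by phase 1.
        model_dir = "/output/phase_1/model/hf_format"
        model_list = os.listdir(model_dir)
        newest_idx = max(
            (os.path.getmtime(f"{model_dir}/{model}"), i)
            for i, model in enumerate(model_list)
        )[-1]
        newest_model = model_list[newest_idx]
        return f"{model_dir}/{newest_model}"

    if phase_num == 1:
        path_to_model = "/input_model"
        path_to_data = "/input_data/knowledge/data.jsonl"
    elif phase_num == 2:
        path_to_model = list_phase1_final_model()
        path_to_data = "/input_data/skills/data.jsonl"
    else:
        raise RuntimeError(f"Unsupported value of {phase_num=}")

    resources_per_worker = {"nvidia.com/gpu": nproc_per_node}

    # The image updated by this commit (previously quay.io/redhat-et/ilab:1.3).
    base_image = "registry.stage.redhat.io/rhelai1/instructlab-nvidia-rhel9:1.3.1"
    name = f"train-phase-{phase_num}-{name_suffix.rstrip('-sdg')}"
    command = ["/bin/bash", "-c", "--"]
    # ... the rest of the component (master_args with the torchrun invocation,
    # PyTorchJob construction and submission) continues past the visible hunk.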