Skip to content

Commit

Permalink
fix listing models in training (#77)
Browse files (browse the repository at this point in the history)
Signed-off-by: sallyom <[email protected]>
  • Loading branch information
sallyom authored Oct 9, 2024
1 parent 2f7d51e commit b1d174a
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 37 deletions.
74 changes: 38 additions & 36 deletions pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -758,24 +758,25 @@ deploymentSpec:
\ int = 2,\n) -> NamedTuple(\"outputs\", manifest=str, name=str):\n import\
\ inspect\n import os\n\n def list_phase1_final_model():\n \
\ model_dir = \"/output/model/hf_format\"\n models = os.listdir(model_dir)\n\
\ newest_idx = max(\n (os.path.getmtime(model), i) for\
\ i, model in enumerate(models)\n )[-1]\n newest_model = models[newest_idx]\n\
\ return f\"{model_dir}/{newest_model}\"\n\n Outputs = NamedTuple(\"\
outputs\", manifest=str, name=str)\n name = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\
\n\n image = \"quay.io/shanand/test-train:0.0.4\"\n if phase_name\
\ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\
\ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\
\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\
\\\"\n pytorchReplicaSpecs:\n Master:\n \
\ replicas: 1\n restartPolicy: OnFailure\n template:\n\
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n mkdir -p /output/model;\n \
\ mkdir -p /output/data;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\
\ newest_idx = max(\n (os.path.getmtime(f\"{model_dir}/{model}\"\
), i)\n for i, model in enumerate(models)\n )[-1]\n \
\ newest_model = models[newest_idx]\n return f\"{model_dir}/{newest_model}\"\
\n\n Outputs = NamedTuple(\"outputs\", manifest=str, name=str)\n name\
\ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n\n image =\
\ \"quay.io/shanand/test-train:0.0.4\"\n if phase_name == \"first\":\n\
\ path_to_model = \"/input_model/model\"\n elif phase_name ==\
\ \"second\":\n path_to_model = list_phase1_final_model()\n\n \
\ manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion: kubeflow.org/v1\n\
\ kind: PyTorchJob\n metadata:\n name: {name}\n \
\ spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\\\"\n \
\ pytorchReplicaSpecs:\n Master:\n replicas:\
\ 1\n restartPolicy: OnFailure\n template:\n \
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n mkdir -p /output/model;\n \
\ mkdir -p /output/data;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\
\ /output/model --data_output_dir /input_data/processed_data\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
Expand Down Expand Up @@ -862,24 +863,25 @@ deploymentSpec:
\ int = 2,\n) -> NamedTuple(\"outputs\", manifest=str, name=str):\n import\
\ inspect\n import os\n\n def list_phase1_final_model():\n \
\ model_dir = \"/output/model/hf_format\"\n models = os.listdir(model_dir)\n\
\ newest_idx = max(\n (os.path.getmtime(model), i) for\
\ i, model in enumerate(models)\n )[-1]\n newest_model = models[newest_idx]\n\
\ return f\"{model_dir}/{newest_model}\"\n\n Outputs = NamedTuple(\"\
outputs\", manifest=str, name=str)\n name = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\
\n\n image = \"quay.io/shanand/test-train:0.0.4\"\n if phase_name\
\ == \"first\":\n path_to_model = \"/input_model/model\"\n elif\
\ phase_name == \"second\":\n path_to_model = list_phase1_final_model()\n\
\n manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion:\
\ kubeflow.org/v1\n kind: PyTorchJob\n metadata:\n \
\ name: {name}\n spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\
\\\"\n pytorchReplicaSpecs:\n Master:\n \
\ replicas: 1\n restartPolicy: OnFailure\n template:\n\
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n mkdir -p /output/model;\n \
\ mkdir -p /output/data;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\
\ newest_idx = max(\n (os.path.getmtime(f\"{model_dir}/{model}\"\
), i)\n for i, model in enumerate(models)\n )[-1]\n \
\ newest_model = models[newest_idx]\n return f\"{model_dir}/{newest_model}\"\
\n\n Outputs = NamedTuple(\"outputs\", manifest=str, name=str)\n name\
\ = f\"train-{phase_name}-{name_suffix.rstrip('-sdg')}\"\n\n image =\
\ \"quay.io/shanand/test-train:0.0.4\"\n if phase_name == \"first\":\n\
\ path_to_model = \"/input_model/model\"\n elif phase_name ==\
\ \"second\":\n path_to_model = list_phase1_final_model()\n\n \
\ manifest = inspect.cleandoc(\n f\"\"\"\n apiVersion: kubeflow.org/v1\n\
\ kind: PyTorchJob\n metadata:\n name: {name}\n \
\ spec:\n nprocPerNode: \\\\\"{nproc_per_node}\\\\\"\n \
\ pytorchReplicaSpecs:\n Master:\n replicas:\
\ 1\n restartPolicy: OnFailure\n template:\n \
\ metadata:\n annotations:\n \
\ sidecar.istio.io/inject: 'false'\n spec:\n \
\ containers:\n - args:\n \
\ - |\n mkdir -p /output/model;\n \
\ mkdir -p /output/data;\n \
\ python3.11 -u run_main_ds.py --model_path {path_to_model} --ckpt_output_dir\
\ /output/model --data_output_dir /input_data/processed_data\n \
\ command:\n - /bin/bash\n \
\ - '-c'\n - '--'\n \
Expand Down
3 changes: 2 additions & 1 deletion training/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ def list_phase1_final_model():
model_dir = "/output/model/hf_format"
models = os.listdir(model_dir)
newest_idx = max(
(os.path.getmtime(model), i) for i, model in enumerate(models)
(os.path.getmtime(f"{model_dir}/{model}"), i)
for i, model in enumerate(models)
)[-1]
newest_model = models[newest_idx]
return f"{model_dir}/{newest_model}"
Expand Down

0 comments on commit b1d174a

Please sign in to comment.