Skip to content

Commit

Permalink
bulk commit
Browse files Browse the repository at this point in the history
Sorry, I went really far with this one but I can confirm that:

* sdg-data-fetch is working
* data processing works
* training phase 1 is stuck when launched, needs investigation

Also:

* remove backticks from the code since they break the shell that runs the
  Python executor
* only use a single PVC for everything: sdg data, model, trained model
* --force-pull: force pulling from the object store again even if the data
  is already present

Signed-off-by: Sébastien Han <[email protected]>
  • Loading branch information
leseb committed Oct 9, 2024
1 parent ca03343 commit a645a1e
Show file tree
Hide file tree
Showing 10 changed files with 640 additions and 531 deletions.
6 changes: 3 additions & 3 deletions eval/final/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ def find_node_dataset_directories(base_directory: str):

######################################################################
# TODO: Update ilab/model/evaluate evaluate def logic to allow for external judge model
# and when that happens, much of this logic can be imported from the `evaluate` definition:
# and when that happens, much of this logic can be imported from the 'evaluate' definition:
# https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504
#
# With instructlab, model_name is synonymous with model_path
Expand All @@ -244,8 +244,8 @@ def find_node_dataset_directories(base_directory: str):
),
]

# ilab/evaluate uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# ilab/evaluate uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
if max_workers == "auto":
try:
Expand Down
8 changes: 4 additions & 4 deletions eval/mt_bench/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@ def run_mt_bench_op(
models_path_prefix: str,
mt_bench_output: Output[Artifact],
merge_system_user_message: bool,
# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
max_workers: str,
models_list: List[str] = None,
Expand Down Expand Up @@ -53,8 +53,8 @@ def run_mt_bench_op(
scores = {}
all_mt_bench_data = []

# generate_answers,judgment uses a magic word for its mt_bench evaluator - `auto`
# with `auto`, number of gpus allocated for serving is calculated based on environment
# generate_answers,judgment uses a magic word for its mt_bench evaluator - 'auto'
# with 'auto', number of gpus allocated for serving is calculated based on environment
# https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36
if max_workers == "auto":
try:
Expand Down
23 changes: 16 additions & 7 deletions pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ def pipeline(
final_eval_task.set_accelerator_type("nvidia.com/gpu")
final_eval_task.set_accelerator_limit(1)

# Technically `output_model_task` and `output_data_task` can happen before evaluation,
# Technically 'output_model_task' and 'output_data_task' can happen before evaluation,
# however the PVC can only be mounted once, so, setting these to _after_ so the eval proceeds.
output_model_task = pvc_to_artifact_op(
pvc_path="/output/data",
Expand Down Expand Up @@ -417,7 +417,7 @@ def gen_standalone():
This function should be used when Kubeflow Pipelines are not available. It will generate a
script that replicates the pipeline's functionality.
Example usage: ``` $ python pipeline.py gen-standalone ```
Example usage: ''' $ python pipeline.py gen-standalone '''
"""
from os import path

Expand All @@ -442,11 +442,11 @@ def gen_standalone():

# The list of executor names to extract details from to generate the standalone script
executors = {
"exec-data-processing-op": 'data_processing_op(max_seq_len=4096, max_batch_len=20000, sdg="/input_data/generated", model="/input_model", processed_data="/input_data/processed_data")',
"exec-sdg-op": 'sdg_op(num_instructions_to_generate=2, repo_branch="", repo_pr="", taxonomy="/input_data/taxonomy", sdg="/input_data/generated")',
"exec-data-processing-op": 'data_processing_op(max_seq_len=4096, max_batch_len=20000, sdg="/data/data", model="/data/model", processed_data="/data/processed_data")',
"exec-sdg-op": 'sdg_op(num_instructions_to_generate=2, repo_branch="", repo_pr="", taxonomy="/data/taxonomy", sdg="/data/generated")',
"exec-git-clone-op": {},
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/input_model")',
"exec-run-mt-bench-op": 'run_mt_bench_op(mt_bench_output="/output/mt-bench-results.txt", models_list="/output/model/model/hf_format", models_path_prefix="/output/model/hf_format", max_workers="auto", merge_system_user_message=False)',
"exec-huggingface-importer-op": 'huggingface_importer_op(repo_name="ibm-granite/granite-7b-base", model="/data/model")',
"exec-run-mt-bench-op": 'run_mt_bench_op(mt_bench_output="/data/mt-bench-results.txt", models_list="/data/model/model/hf_format", models_path_prefix="/data/model/hf_format", max_workers="auto", merge_system_user_message=False)',
}

details = {}
Expand Down Expand Up @@ -621,9 +621,18 @@ def change_dsl_function_to_normal_function(rendered_code: list):
"import kfp": "",
"from kfp import dsl": "",
"from kfp.dsl import *": "",
".path": "", # super hacky, but works for now, the idea is that "taxonomy.path" is a string so we just remove the ".path" part
}

import re

# Regular expression to match ".path" unless it is immediately preceded by "os",
# so "os.path" is kept (note: any name ending in "os" is skipped as well)
path_pattern = re.compile(r"(?<!os)\.path")

def remove_path_not_os_path(line):
return path_pattern.sub("", line)

rendered_code = [remove_path_not_os_path(line) for line in rendered_code]

for old, new in replacements.items():
rendered_code = [line.replace(old, new) for line in rendered_code]
return rendered_code[-1].strip()
Expand Down
12 changes: 6 additions & 6 deletions pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -589,7 +589,7 @@ deploymentSpec:
\ )\n\n def data_processing(train_args: TrainingArgs) -> None:\n \
\ # early validation logic here\n if train_args.max_batch_len\
\ < train_args.max_seq_len:\n raise ValueError(\n \
\ f\"the `max_batch_len` cannot be less than `max_seq_len`: {train_args.max_batch_len=}\
\ f\"the 'max_batch_len' cannot be less than 'max_seq_len': {train_args.max_batch_len=}\
\ < {train_args.max_seq_len=}\"\n )\n\n # process\
\ the training data\n if not os.path.exists(train_args.data_output_dir):\n\
\ os.makedirs(train_args.data_output_dir, exist_ok=True)\n \
Expand Down Expand Up @@ -1107,7 +1107,7 @@ deploymentSpec:
main\"\n\n ######################################################################\n\
\ # TODO: Update ilab/model/evaluate evaluate def logic to allow for\
\ external judge model\n # and when that happens, much of this logic\
\ can be imported from the `evaluate` definition:\n # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\
\ can be imported from the 'evaluate' definition:\n # https://github.com/instructlab/instructlab/blob/83ca501ecdd858677380046e2a56da5b2f3f14e7/src/instructlab/model/evaluate.py#L504\n\
\ #\n # With instructlab, model_name is synonomous with model_path\n\
\ mt_bench_evaluators = [\n MTBenchBranchEvaluator(\n \
\ model_name=candidate_model,\n judge_model_name=judge_model_name,\n\
Expand All @@ -1118,7 +1118,7 @@ deploymentSpec:
\ branch=base_branch,\n output_dir=output_dir,\n \
\ merge_system_user_message=merge_system_user_message,\n \
\ ),\n ]\n\n # ilab/evaluate uses a magic word for its mt_bench\
\ evaluator - `auto`\n # with `auto`, number of gpus allocated for serving\
\ evaluator - 'auto'\n # with 'auto', number of gpus allocated for serving\
\ is calculated based on environment\n # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
\ if max_workers == \"auto\":\n try:\n usable_cpu_count\
\ = len(os.sched_getaffinity(0)) // 2\n except AttributeError:\n\
Expand Down Expand Up @@ -1197,7 +1197,7 @@ deploymentSpec:
- "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
\ *\n\ndef run_mt_bench_op(\n models_path_prefix: str,\n mt_bench_output:\
\ Output[Artifact],\n merge_system_user_message: bool,\n # generate_answers,judgment\
\ uses a magic word for its mt_bench evaluator - `auto`\n # with `auto`,\
\ uses a magic word for its mt_bench evaluator - 'auto'\n # with 'auto',\
\ number of gpus allocated for serving is calculated based on environment\n\
\ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
\ max_workers: str,\n models_list: List[str] = None,\n models_folder:\
Expand All @@ -1215,7 +1215,7 @@ deploymentSpec:
\n judge_api_key = os.getenv(\"JUDGE_API_KEY\", \"\")\n judge_model_name\
\ = os.getenv(\"JUDGE_NAME\")\n judge_endpoint = os.getenv(\"JUDGE_ENDPOINT\"\
)\n\n scores = {}\n all_mt_bench_data = []\n\n # generate_answers,judgment\
\ uses a magic word for its mt_bench evaluator - `auto`\n # with `auto`,\
\ uses a magic word for its mt_bench evaluator - 'auto'\n # with 'auto',\
\ number of gpus allocated for serving is calculated based on environment\n\
\ # https://github.com/instructlab/eval/blob/main/src/instructlab/eval/mt_bench.py#L36\n\
\ if max_workers == \"auto\":\n try:\n usable_cpu_count\
Expand Down Expand Up @@ -1286,7 +1286,7 @@ deploymentSpec:
\ > 0) else \"empty\"\n\n print(\"Generating syntetic dataset for:\"\
)\n print()\n print(read_taxonomy(taxonomy.path, taxonomy_base))\n\
\n # generate_data has a magic word for its taxonomy_base argument -\
\ `empty`\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
\ 'empty'\n # it allows generating from the whole repo, see:\n # https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230\n\
\ generate_data(\n client=client,\n num_instructions_to_generate=num_instructions_to_generate,\n\
\ output_dir=sdg.path,\n taxonomy=taxonomy.path,\n \
\ taxonomy_base=taxonomy_base,\n model_name=model,\n chunk_word_count=1000,\n\
Expand Down
2 changes: 1 addition & 1 deletion sdg/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def sdg_op(
print()
print(read_taxonomy(taxonomy.path, taxonomy_base))

# generate_data has a magic word for its taxonomy_base argument - `empty`
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
generate_data(
Expand Down
4 changes: 4 additions & 0 deletions standalone/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,10 @@ The script requires information regarding the location and method for accessing
* `--eval-serving-model-name`: The name of the model to use for evaluation. **Required**
* `--eval-serving-model-api-key`: The API key for the model to evaluate. `EVAL_SERVING_MODEL_API_KEY`
environment variable can be used as well. **Required**
* `--force-pull`: Force pull the data (sdg data and model) from the object store even if it already
exists in the PVC. **Optional** - Default: false.
* `--training-1-epoch-num`: The number of epochs to train the model for phase 1. **Optional** - Default: 7.
* `--training-2-epoch-num`: The number of epochs to train the model for phase 2. **Optional** - Default: 10.


## Example End-To-End Workflow
Expand Down
Loading

0 comments on commit a645a1e

Please sign in to comment.