Merge pull request #230 from MichaelClifford/rhel1.3
Pipeline Updates for RHEL AI 1.3
tumido authored Dec 9, 2024
2 parents 8c2981c + 46b9149 commit 3bb3be0
Showing 7 changed files with 417 additions and 206 deletions.
2 changes: 1 addition & 1 deletion importer-pipeline.yaml
@@ -32,7 +32,7 @@ deploymentSpec:
         env:
         - name: REGISTRY_AUTH_FILE
           value: /mnt/containers/.dockerconfigjson
-        image: quay.io/redhat-et/ilab:1.2
+        image: quay.io/redhat-et/ilab:1.3
 pipelineInfo:
   description: Helper pipeline to the InstructLab pipeline which allows users to seed/import
     a new base model
1 change: 1 addition & 0 deletions pipeline.py
@@ -271,6 +271,7 @@ def pipeline(
     )
     data_processing_task.after(model_to_pvc_task, sdg_task)
     data_processing_task.set_caching_options(False)
+    data_processing_task.set_env_variable("XDG_CACHE_HOME", "/tmp")
 
     set_image_pull_secrets(data_processing_task, [IMAGE_PULL_SECRET])

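Reviewer note: the new `set_env_variable` call redirects the data-processing step's cache to a writable path, presumably because the task container has no writable home directory. For readers unfamiliar with the KFP v2 task API, a minimal sketch of the same pattern (the component and pipeline here are illustrative, not from this repo):

```python
from kfp import dsl


@dsl.component
def data_processing() -> str:
    # Libraries that cache under $XDG_CACHE_HOME fall back cleanly when it
    # points at a writable location such as /tmp.
    import os

    return os.environ.get("XDG_CACHE_HOME", "unset")


@dsl.pipeline(name="env-var-sketch")
def env_var_pipeline():
    task = data_processing()
    task.set_caching_options(False)
    # Same task-level call as in the diff above.
    task.set_env_variable("XDG_CACHE_HOME", "/tmp")
```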
322 changes: 186 additions & 136 deletions pipeline.yaml

Large diffs are not rendered by default.

143 changes: 113 additions & 30 deletions sdg/components.py
@@ -39,21 +39,9 @@ def sdg_op(
 ):
     from os import getenv, path
 
+    import instructlab.sdg
     import openai
     import yaml
-    from instructlab.sdg import generate_data
-    from instructlab.sdg.utils.taxonomy import read_taxonomy
-
-    def set_precomputed_skills_data_ratio(sampling_size: float):
-        skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"
-        if path.exists(skills_recipe):
-            with open(skills_recipe, "r") as file:
-                skills_yaml = yaml.load(file, Loader=yaml.Loader)
-
-            skills_yaml["datasets"][0]["sampling_size"] = sampling_size
-
-            with open(skills_recipe, "w", encoding="utf-8") as file:
-                yaml.dump(skills_yaml, file)
 
     api_key = getenv("api_key")
     model = getenv("model")
@@ -73,25 +61,120 @@ def set_precomputed_skills_data_ratio(sampling_size: float):

print("Generating synthetic dataset for:")
print()
print(read_taxonomy(taxonomy_path, taxonomy_base))

set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)

# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
print(
instructlab.sdg.utils.taxonomy.read_taxonomy(
taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents"
)
)

# Generate synthetic dataset
# 1.0 is the default size
if sdg_sampling_size == 1.0:
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
instructlab.sdg.generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
)
# Tweak precomputed skills data ratio if needed
else:
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"

def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
if path.exists(skills_recipe):
with open(skills_recipe, "r", encoding="utf-8") as file:
skills_yaml = yaml.load(file, Loader=yaml.Loader)

skills_yaml["datasets"][0]["sampling_size"] = sampling_size

with open(skills_recipe, "w", encoding="utf-8") as file:
yaml.dump(skills_yaml, file)

try:
set_precomputed_skills_data_ratio(
sampling_size=sdg_sampling_size, skills_recipe=skills_recipe
)
except PermissionError:
print("Failed to set precomputed skills data ratio: Permission denied")
print("Attempting to move default data recipes to temporary directory")
import os
import shutil
import tempfile

import xdg_base_dirs

# Create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:
# Create a default_data_recipes directory
temp_dir = path.join(temp_dir, "default_data_recipes")
os.mkdir(temp_dir)

# Copy default_data_recipes/skills.yaml to the temporary directory
shutil.copy(skills_recipe, temp_dir)

# Also copy the current pipeline directory to the temporary directory - it's a small
# directory like 28KB
# This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing
data_dirs = [
os.path.join(str(dir), "instructlab", "sdg")
for dir in xdg_base_dirs.xdg_data_dirs()
]
temp_pipeline_dir = path.join(temp_dir, "pipeline")
os.mkdir(temp_pipeline_dir)
for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
if os.path.exists(pipeline_path):
shutil.copytree(
pipeline_path,
temp_pipeline_dir,
dirs_exist_ok=True,
)
break

# Build new skills.yaml path
new_skills_recipe = path.join(temp_dir, "skills.yaml")
print(f"New skills recipe path: {new_skills_recipe}")

# Override XDG_DATA_DIRS with the temporary directory
# This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS
# and looks for a default_data_recipes directory with a skills.yaml file
os.environ["XDG_DATA_DIRS"] = f"{temp_dir}"

# Try to set the precomputed skills data ratio again
try:
set_precomputed_skills_data_ratio(
sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe
)
print(
f"Successfully set precomputed skills data ratio to {sdg_sampling_size}"
)

# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
instructlab.sdg.generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
)
except Exception as e:
print(f"Failed to set precomputed skills data ratio: {e}")
raise


@dsl.container_component
def taxonomy_to_artifact_op(
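Reviewer note: the heart of this file's change is the `PermissionError` fallback. On the RHEL AI image the recipe under `/usr/share/instructlab/sdg/default_data_recipes/` is read-only, so the component stages an editable copy and, per the comments above, redirects SDG's recipe lookup through `XDG_DATA_DIRS`. A condensed sketch of just that mechanism (helper name and layout are illustrative, not the component's actual code):

```python
import os
import shutil
import tempfile

import yaml


def stage_writable_recipe(packaged_recipe: str, sampling_size: float) -> str:
    """Copy a read-only skills recipe somewhere writable and patch its sampling size."""
    staging = os.path.join(tempfile.mkdtemp(), "default_data_recipes")
    os.mkdir(staging)
    shutil.copy(packaged_recipe, staging)

    editable = os.path.join(staging, "skills.yaml")
    with open(editable, encoding="utf-8") as f:
        recipe = yaml.safe_load(f)
    recipe["datasets"][0]["sampling_size"] = sampling_size
    with open(editable, "w", encoding="utf-8") as f:
        yaml.dump(recipe, f)

    # Point data-dir resolution at the staged copy, mirroring the
    # os.environ["XDG_DATA_DIRS"] override in the diff above.
    os.environ["XDG_DATA_DIRS"] = staging
    return editable
```

Note the fallback only runs when `sdg_sampling_size` differs from the 1.0 default; the common path never touches the recipe file.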
141 changes: 112 additions & 29 deletions standalone/standalone.py
@@ -1134,21 +1134,9 @@ def sdg_op(
 ):
     from os import getenv, path
+    import instructlab.sdg
     import openai
     import yaml
-    from instructlab.sdg import generate_data
-    from instructlab.sdg.utils.taxonomy import read_taxonomy
-    def set_precomputed_skills_data_ratio(sampling_size: float):
-        skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"
-        if path.exists(skills_recipe):
-            with open(skills_recipe, "r") as file:
-                skills_yaml = yaml.load(file, Loader=yaml.Loader)
-            skills_yaml["datasets"][0]["sampling_size"] = sampling_size
-            with open(skills_recipe, "w", encoding="utf-8") as file:
-                yaml.dump(skills_yaml, file)
     api_key = getenv("api_key")
     model = getenv("model")
@@ -1168,24 +1156,119 @@ def set_precomputed_skills_data_ratio(sampling_size: float):
print("Generating synthetic dataset for:")
print()
print(read_taxonomy(taxonomy_path, taxonomy_base))
print(
instructlab.sdg.utils.taxonomy.read_taxonomy(
taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents"
)
)
set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)
# Generate synthetic dataset
# 1.0 is the default size
if sdg_sampling_size == 1.0:
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
instructlab.sdg.generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
)
# Tweak precomputed skills data ratio if needed
else:
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
)
def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
if path.exists(skills_recipe):
with open(skills_recipe, "r", encoding="utf-8") as file:
skills_yaml = yaml.load(file, Loader=yaml.Loader)
skills_yaml["datasets"][0]["sampling_size"] = sampling_size
with open(skills_recipe, "w", encoding="utf-8") as file:
yaml.dump(skills_yaml, file)
try:
set_precomputed_skills_data_ratio(
sampling_size=sdg_sampling_size, skills_recipe=skills_recipe
)
except PermissionError:
print("Failed to set precomputed skills data ratio: Permission denied")
print("Attempting to move default data recipes to temporary directory")
import os
import shutil
import tempfile
import xdg_base_dirs
# Create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:
# Create a default_data_recipes directory
temp_dir = path.join(temp_dir, "default_data_recipes")
os.mkdir(temp_dir)
# Copy default_data_recipes/skills.yaml to the temporary directory
shutil.copy(skills_recipe, temp_dir)
# Also copy the current pipeline directory to the temporary directory - it's a small
# directory like 28KB
# This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing
data_dirs = [
os.path.join(str(dir), "instructlab", "sdg")
for dir in xdg_base_dirs.xdg_data_dirs()
]
temp_pipeline_dir = path.join(temp_dir, "pipeline")
os.mkdir(temp_pipeline_dir)
for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
if os.path.exists(pipeline_path):
shutil.copytree(
pipeline_path,
temp_pipeline_dir,
dirs_exist_ok=True,
)
break
# Build new skills.yaml path
new_skills_recipe = path.join(temp_dir, "skills.yaml")
print(f"New skills recipe path: {new_skills_recipe}")
# Override XDG_DATA_DIRS with the temporary directory
# This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS
# and looks for a default_data_recipes directory with a skills.yaml file
os.environ["XDG_DATA_DIRS"] = f"{temp_dir}"
# Try to set the precomputed skills data ratio again
try:
set_precomputed_skills_data_ratio(
sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe
)
print(
f"Successfully set precomputed skills data ratio to {sdg_sampling_size}"
)
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
instructlab.sdg.generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
)
except Exception as e:
print(f"Failed to set precomputed skills data ratio: {e}")
raise
"""
exec_sdg_op_args = f"""
sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size})
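Reviewer note: `standalone.py` carries the component body as an embedded string (closed by the `"""` above) and renders `exec_sdg_op_args` to invoke it, which is why the same fix appears twice in this PR. A toy illustration of that exec-template pattern (the names below are hypothetical, not this script's API):

```python
# The component's source is kept as a string, a call line is rendered with
# concrete argument values, and both are executed in the target container.
component_src = '''
def sdg_op(num_instructions_to_generate: int, pipeline: str):
    print(f"would generate {num_instructions_to_generate} items with {pipeline!r}")
'''
exec_args = 'sdg_op(num_instructions_to_generate=2, pipeline="simple")'
exec(component_src + "\n" + exec_args)
```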
12 changes: 3 additions & 9 deletions training/components.py
@@ -167,7 +167,7 @@ def list_phase1_final_model():
else:
raise RuntimeError(f"Unsupported value of {phase_num=}")

image = "quay.io/redhat-et/ilab:1.2"
image = "quay.io/redhat-et/ilab:1.3"

manifest = inspect.cleandoc(
f"""
@@ -211,9 +211,8 @@ def list_phase1_final_model():
           --max_batch_len={max_batch_len} \
           --seed={seed} \
           --cpu_offload_optimizer \
-          --cpu_offload_params \
+          --cpu_offload_params_fsdp \
           --distributed_training_framework fsdp \
-          --is_granite \
           --checkpoint_at_epoch
command:
- /bin/bash
@@ -245,10 +244,8 @@ def list_phase1_final_model():
value: /tmp
       resources:
         requests:
-          cpu: 8
           "nvidia.com/gpu": {nproc_per_node}
         limits:
-          cpu: 8
           "nvidia.com/gpu": {nproc_per_node}
volumes:
- name: input-data
@@ -292,9 +289,8 @@ def list_phase1_final_model():
           --max_batch_len={max_batch_len} \
           --seed={seed} \
           --cpu_offload_optimizer \
-          --cpu_offload_params \
+          --cpu_offload_params_fsdp \
           --distributed_training_framework fsdp \
-          --is_granite \
           --checkpoint_at_epoch
command:
- /bin/bash
@@ -327,10 +323,8 @@ def list_phase1_final_model():
value: /tmp
       resources:
         requests:
-          cpu: 8
           "nvidia.com/gpu": {nproc_per_node}
         limits:
-          cpu: 8
           "nvidia.com/gpu": {nproc_per_node}
volumes:
- name: input-data
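Reviewer note: besides the `ilab:1.3` image bump, this file renames `--cpu_offload_params` to `--cpu_offload_params_fsdp`, drops the granite-specific `--is_granite` flag, and removes the fixed `cpu: 8` requests/limits so only the GPU count stays pinned. A small sketch of the resulting resources stanza as Python data (the GPU count is illustrative; without an explicit CPU request, Kubernetes falls back to namespace defaults instead of reserving eight cores per pod):

```python
nproc_per_node = 2  # illustrative GPU count

# GPU-only resources block after this change: CPU scheduling is left to the
# cluster rather than a hard cpu=8 reservation on both requests and limits.
resources = {
    "requests": {"nvidia.com/gpu": nproc_per_node},
    "limits": {"nvidia.com/gpu": nproc_per_node},
}
```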
2 changes: 1 addition & 1 deletion utils/consts.py
@@ -1,4 +1,4 @@
PYTHON_IMAGE = "quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111"
TOOLBOX_IMAGE = "registry.access.redhat.com/ubi9/toolbox"
OC_IMAGE = "registry.redhat.io/openshift4/ose-cli"
RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.2"
RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.3"
