Pipeline Updates for RHEL AI 1.3 #230

Merged 5 commits on Dec 9, 2024

importer-pipeline.yaml: 2 changes (1 addition, 1 deletion)

@@ -32,7 +32,7 @@ deploymentSpec:
env:
- name: REGISTRY_AUTH_FILE
value: /mnt/containers/.dockerconfigjson
image: quay.io/redhat-et/ilab:1.2
image: quay.io/redhat-et/ilab:1.3
pipelineInfo:
description: Helper pipeline to the InstructLab pipeline which allows users to seed/import
a new base model

pipeline.py: 1 change (1 addition, 0 deletions)

@@ -271,6 +271,7 @@ def pipeline(
)
data_processing_task.after(model_to_pvc_task, sdg_task)
data_processing_task.set_caching_options(False)
data_processing_task.set_env_variable("XDG_CACHE_HOME", "/tmp")

set_image_pull_secrets(data_processing_task, [IMAGE_PULL_SECRET])

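The one functional change here routes XDG-based caches to /tmp, which stays writable under OpenShift's restricted security context. A minimal KFP v2 sketch of the same pattern, using an illustrative stand-in component rather than the repo's actual data-processing step:

from kfp import dsl


@dsl.component
def data_processing_op():
    import os

    # XDG-aware libraries (e.g. dataset/model caches) will now write under
    # /tmp, which remains writable even on a read-only root filesystem.
    print(os.environ.get("XDG_CACHE_HOME"))


@dsl.pipeline(name="xdg-cache-sketch")
def sketch_pipeline():
    task = data_processing_op()
    task.set_caching_options(False)
    # Same call the PR adds: environment variables set on the task are
    # injected into the step's container at runtime.
    task.set_env_variable("XDG_CACHE_HOME", "/tmp")
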
pipeline.yaml: 322 changes (186 additions, 136 deletions)

Large diffs are not rendered by default.

sdg/components.py: 143 changes (113 additions, 30 deletions)

@@ -39,21 +39,9 @@ def sdg_op(
):
from os import getenv, path

import instructlab.sdg
import openai
import yaml
from instructlab.sdg import generate_data
from instructlab.sdg.utils.taxonomy import read_taxonomy

def set_precomputed_skills_data_ratio(sampling_size: float):
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"
if path.exists(skills_recipe):
with open(skills_recipe, "r") as file:
skills_yaml = yaml.load(file, Loader=yaml.Loader)

skills_yaml["datasets"][0]["sampling_size"] = sampling_size

with open(skills_recipe, "w", encoding="utf-8") as file:
yaml.dump(skills_yaml, file)

api_key = getenv("api_key")
model = getenv("model")
@@ -73,25 +61,120 @@ def set_precomputed_skills_data_ratio(sampling_size: float):

print("Generating synthetic dataset for:")
print()
print(read_taxonomy(taxonomy_path, taxonomy_base))

set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)

# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
print(
instructlab.sdg.utils.taxonomy.read_taxonomy(
taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents"
)
)

# Generate synthetic dataset
# 1.0 is the default size
if sdg_sampling_size == 1.0:
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
instructlab.sdg.generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
)
# Tweak precomputed skills data ratio if needed
Review comment (Collaborator): We should add more explanation for this since it differs from what's exposed in the ilab CLI? Or, link to an issue to replace this once the equivalent is in ilab/sdg? This can be a follow-up.

else:
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"

def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
if path.exists(skills_recipe):
with open(skills_recipe, "r", encoding="utf-8") as file:
skills_yaml = yaml.load(file, Loader=yaml.Loader)

skills_yaml["datasets"][0]["sampling_size"] = sampling_size

with open(skills_recipe, "w", encoding="utf-8") as file:
yaml.dump(skills_yaml, file)

try:
set_precomputed_skills_data_ratio(
sampling_size=sdg_sampling_size, skills_recipe=skills_recipe
)
except PermissionError:
print("Failed to set precomputed skills data ratio: Permission denied")
print("Attempting to move default data recipes to temporary directory")
import os
import shutil
import tempfile

import xdg_base_dirs

# Create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:
# Create a default_data_recipes directory
temp_dir = path.join(temp_dir, "default_data_recipes")
os.mkdir(temp_dir)

# Copy default_data_recipes/skills.yaml to the temporary directory
shutil.copy(skills_recipe, temp_dir)

# Also copy the current pipeline directory to the temporary directory - it's a small
# directory like 28KB
# This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing
data_dirs = [
os.path.join(str(dir), "instructlab", "sdg")
for dir in xdg_base_dirs.xdg_data_dirs()
]
temp_pipeline_dir = path.join(temp_dir, "pipeline")
os.mkdir(temp_pipeline_dir)
for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
if os.path.exists(pipeline_path):
shutil.copytree(
pipeline_path,
temp_pipeline_dir,
dirs_exist_ok=True,
)
break

# Build new skills.yaml path
new_skills_recipe = path.join(temp_dir, "skills.yaml")
print(f"New skills recipe path: {new_skills_recipe}")

# Override XDG_DATA_DIRS with the temporary directory
# This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS
# and looks for a default_data_recipes directory with a skills.yaml file
os.environ["XDG_DATA_DIRS"] = f"{temp_dir}"
Review comment (Collaborator): Let's keep the override here instead of doing data_processing_task.set_env_variable so that removing this code will be easier in the future. Nothing to change!


# Try to set the precomputed skills data ratio again
try:
set_precomputed_skills_data_ratio(
sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe
)
print(
f"Successfully set precomputed skills data ratio to {sdg_sampling_size}"
)

# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
instructlab.sdg.generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
)
except Exception as e:
print(f"Failed to set precomputed skills data ratio: {e}")
raise


@dsl.container_component
def taxonomy_to_artifact_op(
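
The new sdg_op only rewrites the skills recipe when sdg_sampling_size deviates from the 1.0 default, falling back to a copy under a temporary XDG_DATA_DIRS when /usr/share is read-only. The rewrite helper itself is easy to exercise in isolation; a minimal sketch against a throwaway skills.yaml, whose contents are illustrative beyond the datasets[0].sampling_size field the pipeline actually touches:

import tempfile
from os import path

import yaml


def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
    # Same logic as the component: load the recipe, patch the first
    # dataset's sampling_size, and write it back in place.
    if path.exists(skills_recipe):
        with open(skills_recipe, "r", encoding="utf-8") as file:
            skills_yaml = yaml.load(file, Loader=yaml.Loader)

        skills_yaml["datasets"][0]["sampling_size"] = sampling_size

        with open(skills_recipe, "w", encoding="utf-8") as file:
            yaml.dump(skills_yaml, file)


with tempfile.TemporaryDirectory() as tmp:
    recipe = path.join(tmp, "skills.yaml")
    with open(recipe, "w", encoding="utf-8") as f:
        # Illustrative recipe; only the datasets list matters here.
        yaml.dump({"datasets": [{"path": "skills.jsonl", "sampling_size": 1.0}]}, f)

    set_precomputed_skills_data_ratio(sampling_size=0.5, skills_recipe=recipe)

    with open(recipe, encoding="utf-8") as f:
        assert yaml.safe_load(f)["datasets"][0]["sampling_size"] == 0.5
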
standalone/standalone.py: 141 changes (112 additions, 29 deletions)

@@ -1134,21 +1134,9 @@ def sdg_op(
):
from os import getenv, path

import instructlab.sdg
import openai
import yaml
from instructlab.sdg import generate_data
from instructlab.sdg.utils.taxonomy import read_taxonomy

def set_precomputed_skills_data_ratio(sampling_size: float):
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"
if path.exists(skills_recipe):
with open(skills_recipe, "r") as file:
skills_yaml = yaml.load(file, Loader=yaml.Loader)

skills_yaml["datasets"][0]["sampling_size"] = sampling_size

with open(skills_recipe, "w", encoding="utf-8") as file:
yaml.dump(skills_yaml, file)

api_key = getenv("api_key")
model = getenv("model")
@@ -1168,24 +1156,119 @@ def set_precomputed_skills_data_ratio(sampling_size: float):

print("Generating synthetic dataset for:")
print()
print(read_taxonomy(taxonomy_path, taxonomy_base))
print(
instructlab.sdg.utils.taxonomy.read_taxonomy(
taxonomy_path, taxonomy_base, document_output_dir=f"{sdg_path}/documents"
)
)

set_precomputed_skills_data_ratio(sampling_size=sdg_sampling_size)
# Generate synthetic dataset
# 1.0 is the default size
if sdg_sampling_size == 1.0:
# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
instructlab.sdg.generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
)
# Tweak precomputed skills data ratio if needed
else:
skills_recipe = "/usr/share/instructlab/sdg/default_data_recipes/skills.yaml"

# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
)
def set_precomputed_skills_data_ratio(sampling_size: float, skills_recipe: str):
if path.exists(skills_recipe):
with open(skills_recipe, "r", encoding="utf-8") as file:
skills_yaml = yaml.load(file, Loader=yaml.Loader)

skills_yaml["datasets"][0]["sampling_size"] = sampling_size

with open(skills_recipe, "w", encoding="utf-8") as file:
yaml.dump(skills_yaml, file)

try:
set_precomputed_skills_data_ratio(
sampling_size=sdg_sampling_size, skills_recipe=skills_recipe
)
except PermissionError:
print("Failed to set precomputed skills data ratio: Permission denied")
print("Attempting to move default data recipes to temporary directory")
import os
import shutil
import tempfile

import xdg_base_dirs

# Create a temporary directory
with tempfile.TemporaryDirectory() as temp_dir:
# Create a default_data_recipes directory
temp_dir = path.join(temp_dir, "default_data_recipes")
os.mkdir(temp_dir)

# Copy default_data_recipes/skills.yaml to the temporary directory
shutil.copy(skills_recipe, temp_dir)

# Also copy the current pipeline directory to the temporary directory - it's a small
# directory like 28KB
# This isn't needed if the pipeline is either "full" or "simple" but it's future-proofing
data_dirs = [
os.path.join(str(dir), "instructlab", "sdg")
for dir in xdg_base_dirs.xdg_data_dirs()
]
temp_pipeline_dir = path.join(temp_dir, "pipeline")
os.mkdir(temp_pipeline_dir)
for d in data_dirs:
pipeline_path = os.path.join(d, "pipelines", pipeline)
if os.path.exists(pipeline_path):
shutil.copytree(
pipeline_path,
temp_pipeline_dir,
dirs_exist_ok=True,
)
break

# Build new skills.yaml path
new_skills_recipe = path.join(temp_dir, "skills.yaml")
print(f"New skills recipe path: {new_skills_recipe}")

# Override XDG_DATA_DIRS with the temporary directory
# This allows SDG to read the new skills.yaml since it's looking into XDG_DATA_DIRS
# and looks for a default_data_recipes directory with a skills.yaml file
os.environ["XDG_DATA_DIRS"] = f"{temp_dir}"

# Try to set the precomputed skills data ratio again
try:
set_precomputed_skills_data_ratio(
sampling_size=sdg_sampling_size, skills_recipe=new_skills_recipe
)
print(
f"Successfully set precomputed skills data ratio to {sdg_sampling_size}"
)

# generate_data has a magic word for its taxonomy_base argument - 'empty'
# it allows generating from the whole repo, see:
# https://github.com/instructlab/sdg/blob/c6a9e74a1618b1077cd38e713b8aaed8b7c0c8ce/src/instructlab/sdg/utils/taxonomy.py#L230
instructlab.sdg.generate_data(
client=client,
num_instructions_to_generate=num_instructions_to_generate,
output_dir=sdg_path,
taxonomy=taxonomy_path,
taxonomy_base=taxonomy_base,
model_name=model,
pipeline=pipeline,
chunk_word_count=1000,
server_ctx_size=4096,
)
except Exception as e:
print(f"Failed to set precomputed skills data ratio: {e}")
raise
"""
exec_sdg_op_args = f"""
sdg_op(num_instructions_to_generate={num_instructions_to_generate}, pipeline="{sdg_pipeline}", repo_branch="{exec_git_clone_op_repo_branch or ""}", repo_pr={exec_git_clone_op_repo_pr or 0}, taxonomy_path="{TAXONOMY_DATA_PATH}", sdg_path="{DATA_PVC_SDG_PATH}", sdg_sampling_size={sdg_sampling_size})
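
standalone/standalone.py embeds the component body as a string (note the closing triple quote above) and renders its invocation through exec_sdg_op_args, which is why the sdg_op changes are duplicated here verbatim. A minimal sketch of that source-templating pattern, with illustrative names and values rather than the script's exact plumbing:

import textwrap

sdg_op_src = textwrap.dedent('''
    def sdg_op(num_instructions_to_generate, pipeline, sdg_sampling_size):
        print(num_instructions_to_generate, pipeline, sdg_sampling_size)
''')

sdg_sampling_size = 1.0
exec_sdg_op_args = f"""
sdg_op(num_instructions_to_generate=30, pipeline="full", sdg_sampling_size={sdg_sampling_size})
"""

# The combined string is a complete script that can be handed to a pod
# and run as a single file; exec() stands in for that remote run here.
exec(sdg_op_src + exec_sdg_op_args)
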
training/components.py: 12 changes (3 additions, 9 deletions)

@@ -167,7 +167,7 @@ def list_phase1_final_model():
else:
raise RuntimeError(f"Unsupported value of {phase_num=}")

image = "quay.io/redhat-et/ilab:1.2"
image = "quay.io/redhat-et/ilab:1.3"

manifest = inspect.cleandoc(
f"""
@@ -211,9 +211,8 @@ def list_phase1_final_model():
--max_batch_len={max_batch_len} \
--seed={seed} \
--cpu_offload_optimizer \
--cpu_offload_params \
--cpu_offload_params_fsdp \
--distributed_training_framework fsdp \
--is_granite \
--checkpoint_at_epoch
command:
- /bin/bash
@@ -245,10 +244,8 @@ def list_phase1_final_model():
value: /tmp
resources:
requests:
cpu: 8
"nvidia.com/gpu": {nproc_per_node}
limits:
cpu: 8
"nvidia.com/gpu": {nproc_per_node}
volumes:
- name: input-data
@@ -292,9 +289,8 @@ def list_phase1_final_model():
--max_batch_len={max_batch_len} \
--seed={seed} \
--cpu_offload_optimizer \
--cpu_offload_params \
--cpu_offload_params_fsdp \
--distributed_training_framework fsdp \
--is_granite \
--checkpoint_at_epoch
command:
- /bin/bash
@@ -327,10 +323,8 @@ def list_phase1_final_model():
value: /tmp
resources:
requests:
cpu: 8
"nvidia.com/gpu": {nproc_per_node}
limits:
cpu: 8
"nvidia.com/gpu": {nproc_per_node}
volumes:
- name: input-data
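
Net effect on the training manifests: the parameter-offload flag is renamed to --cpu_offload_params_fsdp, the --is_granite switch is dropped, and the pinned cpu: 8 requests/limits disappear so the pods request only GPUs. A sketch of the resulting argument list for readability; max_batch_len and seed stand in for the values the f-string manifest templates in:

def build_training_args(max_batch_len: int, seed: int) -> list[str]:
    return [
        f"--max_batch_len={max_batch_len}",
        f"--seed={seed}",
        "--cpu_offload_optimizer",
        "--cpu_offload_params_fsdp",  # renamed from --cpu_offload_params
        "--distributed_training_framework",
        "fsdp",
        # --is_granite is removed by this PR
        "--checkpoint_at_epoch",
    ]


print(" \\\n".join(build_training_args(max_batch_len=5000, seed=42)))
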
utils/consts.py: 2 changes (1 addition, 1 deletion)

@@ -1,4 +1,4 @@
PYTHON_IMAGE = "quay.io/modh/odh-generic-data-science-notebook:v3-2024b-20241111"
TOOLBOX_IMAGE = "registry.access.redhat.com/ubi9/toolbox"
OC_IMAGE = "registry.redhat.io/openshift4/ose-cli"
RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.2"
RHELAI_IMAGE = "quay.io/redhat-et/ilab:1.3"