Skip to content

Commit

Permalink
Load custom pipelines from platform data dirs
Browse files Browse the repository at this point in the history
Allow convenient aliases to be defined for custom pipelines,
to be loaded from user or site data directories.

Related #154

Signed-off-by: Derek Higgins <[email protected]>
  • Loading branch information
derekhiggins committed Jul 18, 2024
1 parent e4ed530 commit a78525e
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 13 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ click>=8.1.7,<9.0.0
httpx>=0.25.0,<1.0.0
langchain-text-splitters
openai>=1.13.3,<2.0.0
platformdirs>=4.2
# Note: this dependency goes along with langchain-text-splitters and may be
# removed once that one is removed.
# do not use 8.4.0 due to a bug in the library
Expand Down
45 changes: 32 additions & 13 deletions src/instructlab/sdg/generate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from datasets import Dataset
import httpx
import openai
import platformdirs

# First Party
# pylint: disable=ungrouped-imports
Expand Down Expand Up @@ -164,23 +165,40 @@ def _gen_test_data(
outfile.write("\n")


def _check_pipeline_dir(pipeline):
for file in ["knowledge.yaml", "freeform_skills.yaml", "grounded_skills.yaml"]:
if not os.path.exists(os.path.join(pipeline, file)):
raise GenerateException(
f"Error: pipeline directory ({pipeline}) does not contain {file}."
)


def _sdg_init(pipeline, client, model_family, model_id, num_instructions_to_generate):
pipeline_pkg = None
if pipeline == "full":
pipeline_pkg = FULL_PIPELINES_PACKAGE
elif pipeline == "simple":
pipeline_pkg = SIMPLE_PIPELINES_PACKAGE

# Search for the pipeline in User and Site data directories
# then for a package defined pipeline
# and finally pipelines referenced by absolute path
pd = platformdirs.PlatformDirs(
appname=os.path.join("instructlab", "sdg"), multipath=True
)
for d in pd.iter_data_dirs():
if os.path.exists(os.path.join(d, pipeline)):
pipeline = os.path.join(d, pipeline)
_check_pipeline_dir(pipeline)
break
else:
# Validate that pipeline is a valid directory and that it contains the required files
if not os.path.exists(pipeline):
raise GenerateException(
f"Error: pipeline directory ({pipeline}) does not exist."
)
for file in ["knowledge.yaml", "freeform_skills.yaml", "grounded_skills.yaml"]:
if not os.path.exists(os.path.join(pipeline, file)):
if pipeline == "full":
pipeline_pkg = FULL_PIPELINES_PACKAGE
elif pipeline == "simple":
pipeline_pkg = SIMPLE_PIPELINES_PACKAGE
else:
# Validate that pipeline is a valid directory and that it contains the required files
if not os.path.exists(pipeline):
raise GenerateException(
f"Error: pipeline directory ({pipeline}) does not contain {file}."
f"Error: pipeline directory ({pipeline}) does not exist."
)
_check_pipeline_dir(pipeline)

ctx = PipelineContext(client, model_family, model_id, num_instructions_to_generate)

Expand Down Expand Up @@ -238,7 +256,8 @@ def generate_data(
use the SDG library constructs directly, and this function will likely be removed.
Args:
pipeline: This argument may be either an alias defined by the sdg library ("simple", "full"),
pipeline: This argument may be either an alias defined in a user or site "data directory"
or an alias defined by the sdg library ("simple", "full")(if the data directory has no matches),
or an absolute path to a directory containing the pipeline YAML files.
We expect three files to be present in this directory: "knowledge.yaml",
"freeform_skills.yaml", and "grounded_skills.yaml".
Expand Down

0 comments on commit a78525e

Please sign in to comment.