Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automate validation of pipeline configs #132

Merged
merged 14 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,8 @@ jobs:
if: ${{ !cancelled() && (steps.deps.outcome == 'success') }}
run: |
tox -e mypy

- name: Validate pipeline schemas
if: ${{ !cancelled() && (steps.deps.outcome == 'success') }}
run: |
tox -e validate-pipelines
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,7 @@ spellcheck-sort: .spellcheck-en-custom.txt ## Sort spellcheck directory
.PHONY: verify
verify: check-tox ## Run linting, typing, and formatting checks via tox
tox p -e fastlint,mypy,ruff

.PHONY: validate-pipelines
validate-pipelines: ## Validate all pipeline files against schema
tox -e validate-pipelines
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ pytest-asyncio
pytest-cov
pytest-html
tox>=4.4.2,<5
jsonschema
45 changes: 45 additions & 0 deletions scripts/validate_pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python

# Standard
import glob
import json
import sys

# Third Party
from jsonschema import validate
import jsonschema
import yaml


def validate_yaml_file(yaml_file, schema):
    """Validate one pipeline YAML file against a JSON schema.

    Args:
        yaml_file: Path of the YAML pipeline config to check.
        schema: The JSON schema, already parsed into Python objects.

    Returns:
        True if the file could be read, parsed and conforms to the schema;
        False otherwise. The outcome is also printed to stdout.
    """
    try:
        # Explicit encoding so the result does not depend on the locale.
        with open(yaml_file, "r", encoding="utf-8") as file:
            pipeline = yaml.safe_load(file)
    except (OSError, yaml.YAMLError) as err:
        # An unreadable or malformed file is a failure of that one file,
        # not a reason to abort the whole run with a traceback.
        print(f"Validation failed for {yaml_file}:", err)
        return False

    try:
        validate(instance=pipeline, schema=schema)
    except jsonschema.exceptions.ValidationError as err:
        print(f"Validation failed for {yaml_file}:", err)
        return False

    print(f"Validation successful for {yaml_file}.")
    return True


def main():
    """Validate pipeline YAML files against the v1 pipeline schema.

    The files to validate are taken from the command line. When no
    arguments are given, every ``*.yaml`` file found recursively under
    ``src/instructlab/sdg/pipelines`` is validated. All paths, including
    the schema path, are relative to the current working directory.

    Returns:
        0 if every file validated, 1 otherwise (suitable for ``sys.exit``).
    """
    # NOTE(review): reviewers suggested loading the schema from the
    # installed instructlab.sdg package instead of a repo-relative path.
    # That requires the package to be importable where this script runs,
    # so the repo-relative path is kept for now.
    schema_path = "src/instructlab/sdg/pipelines/schema/v1.json"
    with open(schema_path, "r", encoding="utf-8") as file:
        schema = json.load(file)

    # Per review: explicit paths on the command line allow validating
    # custom pipeline configs; the in-tree pipelines remain the default.
    # Sorting keeps the report order stable between runs.
    yaml_files = sys.argv[1:] or sorted(
        glob.glob("src/instructlab/sdg/pipelines/**/*.yaml", recursive=True)
    )

    all_valid = True
    for yaml_file in yaml_files:
        print("=======================================================")
        print("=== Validating", yaml_file)
        print("=======================================================")
        # Keep going after a failure so every invalid file is reported.
        if not validate_yaml_file(yaml_file, schema):
            all_valid = False

    return 0 if all_valid else 1


if __name__ == "__main__":
    # Hand main()'s status to the shell as the process exit code.
    exit_status = main()
    sys.exit(exit_status)
Empty file.
260 changes: 260 additions & 0 deletions src/instructlab/sdg/pipelines/schema/v1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
{
"type": "object",
"additionalProperties": false,
"required": ["version", "blocks"],
"properties": {
"version": {
"type": "string"
},
"blocks": {
"type": "array",
"additionalProperties": false,
"items": {
"type": "object",
"additionalProperties": false,
"required": ["name", "type", "config"],
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string"
},
"drop_duplicates": {
"type": "array",
"items": {
"type": "string"
}
},
"drop_columns": {
"type": "array",
"items": {
"type": "string"
}
},
"gen_kwargs": {
"type": "object",
"properties": {
markmc marked this conversation as resolved.
Show resolved Hide resolved
"model": {
"type": "string"
},
"max_tokens": {
"type": "number"
},
"temperature": {
"type": "number"
},
"n": {
"type": "number"
},
"extra_body": {
"type": "object"
}
}
},
"config": {
"anyOf": [
{
"type": "object",
"description": "ImportBlock",
"required": ["path"],
"additionalProperties": false,
"properties": {
"path": {
"type": "string"
}
}
},
{
"type": "object",
"description": "FilterByValueBlock",
"required": ["filter_column", "filter_value", "operation"],
"additionalProperties": false,
"properties": {
"convert_dtype": {
"type": "string",
"enum": ["float", "int", "bool"]
},
"filter_column": {
"type": "string"
},
"filter_value": {
"oneOf": [
{
"type": "string"
},
{
"type": "number"
},
{
"type": "array",
"items": {
"oneOf": [
{
"type": "string"
},
{
"type": "number"
}
]
}
}
]
},
"operation": {
"type": "string",
"enum": ["eq", "ne", "gt", "ge", "lt", "le", "contains"]
}
}
},
{
"type": "object",
"description": "LLMBlock",
markmc marked this conversation as resolved.
Show resolved Hide resolved
"required": ["config_path", "output_cols"],
"additionalProperties": false,
"properties": {
"config_path": {
"type": "string"
},
"output_cols": {
"type": "array",
"items": {
"type": "string"
}
},
"parser_kwargs": {
"type": "object",
"properties": {
"parser_name": {
"type": "string"
},
"parsing_pattern": {
"type": "string"
},
"parser_cleanup_tags": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"batch_kwargs": {
"type": "object",
"properties": {
"num_samples": {
"type": "number"
}
}
}
}
},
{
"type": "object",
"description": "ConditionalLLMBlock",
"required": ["config_paths", "output_cols", "selector_column_name"],
"additionalProperties": false,
"properties": {
"config_paths": {
"type": "array",
"items": {
"type": "string"
}
},
"output_cols": {
"type": "array",
"items": {
"type": "string"
}
},
"selector_column_name": {
"type": "string"
},
"parser_kwargs": {
"type": "object",
"properties": {
"parser_name": {
"type": "string"
},
"parsing_pattern": {
"type": "string"
},
"parser_cleanup_tags": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"batch_kwargs": {
"type": "object",
"properties": {
"num_samples": {
"type": "number"
}
}
}
}
},
{
"type": "object",
"description": "SamplePopulatorBlock",
"additionalProperties": false,
"required": ["config_paths", "column_name"],
"properties": {
"config_paths": {
"type": "array",
"items": {
"type": "string"
}
},
"column_name": {
"type": "string"
},
"post_fix": {
"type": "string"
}
}
},
{
"type": "object",
"description": "SelectorBlock",
"additionalProperties": false,
"required": ["choice_map", "choice_col", "output_col"],
"properties": {
"choice_map": {
"type": "object"
},
"choice_col": {
"type": "string"
},
"output_col": {
"type": "string"
}
}
},
{
"type": "object",
"description": "CombineColumnsBlock",
"additionalProperties": false,
"required": ["columns", "output_col"],
"properties": {
"output_col": {
"type": "string"
},
"columns": {
"type": "array",
"items": {
"type": "string"
}
}
}
}
]
}
}
}
}
}
}
8 changes: 8 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,14 @@ deps =
commands =
mypy src

[testenv:validate-pipelines]
description = Validate pipeline yaml configs
skip_install = true
skipsdist = true
deps = -r requirements-dev.txt
commands =
{envpython} ./scripts/validate_pipelines.py

[gh]
python =
3.11 = py311-unitcov
Expand Down