Skip to content

Commit

Permalink
Merge pull request #132 from russellb/pipeline-validation
Browse files Browse the repository at this point in the history
Automate validation of pipeline configs
  • Loading branch information
russellb authored Jul 15, 2024
2 parents e636680 + 1f39d94 commit b49f961
Show file tree
Hide file tree
Showing 7 changed files with 323 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,8 @@ jobs:
if: ${{ !cancelled() && (steps.deps.outcome == 'success') }}
run: |
tox -e mypy
- name: Validate pipeline schemas
if: ${{ !cancelled() && (steps.deps.outcome == 'success') }}
run: |
tox -e validate-pipelines
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,7 @@ spellcheck-sort: .spellcheck-en-custom.txt ## Sort spellcheck directory
.PHONY: verify
verify: check-tox ## Run linting, typing, and formatting checks via tox
tox p -e fastlint,mypy,ruff

.PHONY: validate-pipelines
validate-pipelines: ## Validate all pipeline files against schema
tox -e validate-pipelines
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ pytest-asyncio
pytest-cov
pytest-html
tox>=4.4.2,<5
jsonschema
45 changes: 45 additions & 0 deletions scripts/validate_pipelines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/usr/bin/env python

# Standard
import glob
import json
import sys

# Third Party
from jsonschema import validate
import jsonschema
import yaml


def validate_yaml_file(yaml_file, schema):
    """Validate a single pipeline YAML file against a JSON schema.

    Args:
        yaml_file: Path of the YAML pipeline config to check.
        schema: Already-parsed JSON schema to validate the document against.

    Returns:
        True if the file parses as YAML and conforms to the schema,
        False otherwise. A one-line result is printed in either case.
    """
    try:
        # Pin the encoding so parsing does not depend on the platform locale.
        with open(yaml_file, "r", encoding="utf-8") as file:
            pipeline = yaml.safe_load(file)
    except (yaml.YAMLError, UnicodeDecodeError) as err:
        # An unparsable file is a failure of that file only; report it and
        # let the caller continue with the remaining files.
        print(f"Validation failed for {yaml_file}:", err)
        return False

    try:
        validate(instance=pipeline, schema=schema)
    except jsonschema.exceptions.ValidationError as err:
        print(f"Validation failed for {yaml_file}:", err)
        return False

    print(f"Validation successful for {yaml_file}.")
    return True


def main():
    """Validate every pipeline YAML file against the v1 pipeline schema.

    Both the schema path and the glob pattern are relative to the current
    working directory.

    Returns:
        0 if every discovered file validates, 1 if at least one fails.
    """
    schema_path = "src/instructlab/sdg/pipelines/schema/v1.json"
    # Pin the encoding so the schema is read the same way on every platform.
    with open(schema_path, "r", encoding="utf-8") as file:
        schema = json.load(file)

    # glob returns entries in arbitrary filesystem order; sort so the report
    # is reproducible between runs and machines.
    yaml_files = sorted(
        glob.glob("src/instructlab/sdg/pipelines/**/*.yaml", recursive=True)
    )

    all_valid = True
    for yaml_file in yaml_files:
        print("=======================================================")
        print("=== Validating", yaml_file)
        print("=======================================================")
        # Keep going after a failure so every broken file is reported.
        if not validate_yaml_file(yaml_file, schema):
            all_valid = False

    return 0 if all_valid else 1


if __name__ == "__main__":
    # Propagate the validation result as the process exit status.
    sys.exit(main())
Empty file.
260 changes: 260 additions & 0 deletions src/instructlab/sdg/pipelines/schema/v1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
{
"type": "object",
"additionalProperties": false,
"required": ["version", "blocks"],
"properties": {
"version": {
"type": "string"
},
"blocks": {
"type": "array",
"additionalProperties": false,
"items": {
"type": "object",
"additionalProperties": false,
"required": ["name", "type", "config"],
"properties": {
"name": {
"type": "string"
},
"type": {
"type": "string"
},
"drop_duplicates": {
"type": "array",
"items": {
"type": "string"
}
},
"drop_columns": {
"type": "array",
"items": {
"type": "string"
}
},
"gen_kwargs": {
"type": "object",
"properties": {
"model": {
"type": "string"
},
"max_tokens": {
"type": "number"
},
"temperature": {
"type": "number"
},
"n": {
"type": "number"
},
"extra_body": {
"type": "object"
}
}
},
"config": {
"anyOf": [
{
"type": "object",
"description": "ImportBlock",
"required": ["path"],
"additionalProperties": false,
"properties": {
"path": {
"type": "string"
}
}
},
{
"type": "object",
"description": "FilterByValueBlock",
"required": ["filter_column", "filter_value", "operation"],
"additionalProperties": false,
"properties": {
"convert_dtype": {
"type": "string",
"enum": ["float", "int", "bool"]
},
"filter_column": {
"type": "string"
},
"filter_value": {
"oneOf": [
{
"type": "string"
},
{
"type": "number"
},
{
"type": "array",
"items": {
"oneOf": [
{
"type": "string"
},
{
"type": "number"
}
]
}
}
]
},
"operation": {
"type": "string",
"enum": ["eq", "ne", "gt", "ge", "lt", "le", "contains"]
}
}
},
{
"type": "object",
"description": "LLMBlock",
"required": ["config_path", "output_cols"],
"additionalProperties": false,
"properties": {
"config_path": {
"type": "string"
},
"output_cols": {
"type": "array",
"items": {
"type": "string"
}
},
"parser_kwargs": {
"type": "object",
"properties": {
"parser_name": {
"type": "string"
},
"parsing_pattern": {
"type": "string"
},
"parser_cleanup_tags": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"batch_kwargs": {
"type": "object",
"properties": {
"num_samples": {
"type": "number"
}
}
}
}
},
{
"type": "object",
"description": "ConditionalLLMBlock",
"required": ["config_paths", "output_cols", "selector_column_name"],
"additionalProperties": false,
"properties": {
"config_paths": {
"type": "array",
"items": {
"type": "string"
}
},
"output_cols": {
"type": "array",
"items": {
"type": "string"
}
},
"selector_column_name": {
"type": "string"
},
"parser_kwargs": {
"type": "object",
"properties": {
"parser_name": {
"type": "string"
},
"parsing_pattern": {
"type": "string"
},
"parser_cleanup_tags": {
"type": "array",
"items": {
"type": "string"
}
}
}
},
"batch_kwargs": {
"type": "object",
"properties": {
"num_samples": {
"type": "number"
}
}
}
}
},
{
"type": "object",
"description": "SamplePopulatorBlock",
"additionalProperties": false,
"required": ["config_paths", "column_name"],
"properties": {
"config_paths": {
"type": "array",
"items": {
"type": "string"
}
},
"column_name": {
"type": "string"
},
"post_fix": {
"type": "string"
}
}
},
{
"type": "object",
"description": "SelectorBlock",
"additionalProperties": false,
"required": ["choice_map", "choice_col", "output_col"],
"properties": {
"choice_map": {
"type": "object"
},
"choice_col": {
"type": "string"
},
"output_col": {
"type": "string"
}
}
},
{
"type": "object",
"description": "CombineColumnsBlock",
"additionalProperties": false,
"required": ["columns", "output_col"],
"properties": {
"output_col": {
"type": "string"
},
"columns": {
"type": "array",
"items": {
"type": "string"
}
}
}
}
]
}
}
}
}
}
}
8 changes: 8 additions & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,14 @@ deps =
commands =
mypy src

[testenv:validate-pipelines]
description = Validate pipeline yaml configs
skip_install = true
skipsdist = true
deps = -r requirements-dev.txt
commands =
{envpython} ./scripts/validate_pipelines.py

[gh]
python =
3.11 = py311-unitcov
Expand Down

0 comments on commit b49f961

Please sign in to comment.