From 4eafaacfe97876653126265df2dca0ccac82259a Mon Sep 17 00:00:00 2001 From: BJ Hargrave Date: Tue, 25 Jun 2024 16:23:54 -0400 Subject: [PATCH] Add taxonomy qna.yaml parsing API The parse method will yamllint and jsonschema validate a qna.yaml file. It will return an object holding the parsed yaml. Signed-off-by: BJ Hargrave --- .github/dependabot.yml | 6 + .github/workflows/lint.yml | 2 +- pyproject.toml | 28 +- scripts/ruff.sh | 50 --- src/instructlab/schema/__init__.py | 15 +- src/instructlab/schema/taxonomy.py | 413 ++++++++++++++++++ tests/test_parse.py | 343 +++++++++++++++ tests/test_versions.py | 19 +- .../compositional_skills/array_yaml/qna.yaml | 2 + .../compositional_skills/empty_yaml/qna.yaml | 2 + .../invalid_yaml/qna.yaml | 18 + .../skill_incomplete/qna.yaml | 8 + .../compositional_skills/skill_valid/qna.yaml | 16 + .../compositional_skills/version_1/qna.yaml | 17 + .../testdata/knowledge/invalid_name/file.yaml | 23 + tests/testdata/knowledge/invalid_name/qna.yml | 23 + .../knowledge/knowledge_valid/qna.yaml | 23 + tox.ini | 30 +- 18 files changed, 964 insertions(+), 74 deletions(-) delete mode 100755 scripts/ruff.sh create mode 100644 src/instructlab/schema/taxonomy.py create mode 100644 tests/test_parse.py create mode 100644 tests/testdata/compositional_skills/array_yaml/qna.yaml create mode 100644 tests/testdata/compositional_skills/empty_yaml/qna.yaml create mode 100644 tests/testdata/compositional_skills/invalid_yaml/qna.yaml create mode 100644 tests/testdata/compositional_skills/skill_incomplete/qna.yaml create mode 100644 tests/testdata/compositional_skills/skill_valid/qna.yaml create mode 100644 tests/testdata/compositional_skills/version_1/qna.yaml create mode 100644 tests/testdata/knowledge/invalid_name/file.yaml create mode 100644 tests/testdata/knowledge/invalid_name/qna.yml create mode 100644 tests/testdata/knowledge/knowledge_valid/qna.yaml diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 85c1d1b..ccb6572 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -13,3 +13,9 @@ updates: directory: "/.github/workflows" schedule: interval: "daily" + + # Maintain dependencies for Python code + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c156a3a..3a46994 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -48,7 +48,7 @@ jobs: tox -e jsonschema - name: "ruff" commands: | - tox -e ruff -- check + tox -e ruffcheck - name: "pylint" commands: | echo "::add-matcher::.github/workflows/matchers/pylint.json" diff --git a/pyproject.toml b/pyproject.toml index 4bc1753..a282070 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,15 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] -dynamic = ["dependencies", "optional-dependencies", "version"] +dependencies = [ + "typing_extensions", + "jsonschema>=4.22.0", + "PyYAML>=6.0.0", + # The below library should NOT be imported into any python files + # We only use the command via subprocess + "yamllint>=1.35.1", +] +dynamic = ["version"] [project.urls] homepage = "https://instructlab.ai" @@ -40,6 +48,7 @@ exclude = ["^src/instructlab/schema/_version\\.py$"] target-version = "py310" src = ["src", "tests"] extend-exclude = ["src/instructlab/schema/_version.py"] +line-length = 180 [tool.ruff.lint] select = [ @@ -52,12 +61,29 @@ select = [ "SIM", # flake8-simplify "TID", # flake8-tidy-imports ] +ignore = [ + "B019", # cached-instance-method +] + +[tool.ruff.lint.flake8-tidy-imports.banned-api] +"yamllint".msg = "yamllint is for use as a command via subprocess." [tool.pylint.main] py-version = "3.10" source-roots = ["src", "tests"] ignore = ["_version.py"] +[tool.pylint.design] +max-attributes = 10 +max-branches = 30 +max-line-length = 180 +max-locals = 30 +max-statements = 80 +min-public-methods = 1 + +[tool.pylint.format] +max-args = 10 + [tool.pylint."messages control"] disable = [ "missing-class-docstring", diff --git a/scripts/ruff.sh b/scripts/ruff.sh deleted file mode 100755 index 6bf131f..0000000 --- a/scripts/ruff.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: Apache-2.0 -set -e - -# wrapper to combine ruff check and ruff format -# -# "ruff.sh fix" runs fixes and reformats the code -# "ruff.sh check" checks style, format, and imports -# "ruff.sh " passes abitrary args to ruff - -if [ -z "$1" ]; then - echo "USAGE: $0 [check|fix|]" >&2 - exit 2 -fi - -run() { - declare -i err - - echo "RUN: '$*'" - "$@" - err=$? - echo - return $err -} - -case $1 in - "check") - declare -i exitcode=0 - - set +e - run ruff check --diff - exitcode=$(( exitcode + $? )) - - run ruff format --check - exitcode=$(( exitcode + $? )) - set -e - - if [ $exitcode -ne 0 ]; then - echo "ERROR: one or more checks have failed." >&2 - echo "Run 'tox -e ruff' to auto-correct all fixable errors." >&2 - exit 3 - fi - ;; - "fix") - run ruff check --fix - run ruff format - ;; - *) - ruff "$@" -esac diff --git a/src/instructlab/schema/__init__.py b/src/instructlab/schema/__init__.py index a1bc1f2..da4e07c 100644 --- a/src/instructlab/schema/__init__.py +++ b/src/instructlab/schema/__init__.py @@ -10,7 +10,17 @@ except ImportError: # python<3.11 from importlib.abc import Traversable -__all__ = ["schema_versions"] +__all__ = ["schema_base", "schema_versions"] + + +def schema_base() -> Traversable: + """Return the schema base. + + Returns: + Traversable: The base for the schema versions. + """ + base = resources.files(__package__) + return base def schema_versions() -> list[Traversable]: @@ -19,9 +29,8 @@ def schema_versions() -> list[Traversable]: Returns: list[Traversable]: A sorted list of schema versions. """ - schema_base = resources.files(__package__) versions = sorted( - (v for v in schema_base.iterdir() if v.name[0] == "v" and v.name[1:].isdigit()), + (v for v in schema_base().iterdir() if v.name[0] == "v" and v.name[1:].isdigit()), key=lambda k: int(k.name[1:]), ) return versions diff --git a/src/instructlab/schema/taxonomy.py b/src/instructlab/schema/taxonomy.py new file mode 100644 index 0000000..fc8814f --- /dev/null +++ b/src/instructlab/schema/taxonomy.py @@ -0,0 +1,413 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""Taxonomy qna.yaml parsing""" + +# Standard +import json +import logging +import os +import re +import subprocess +from collections.abc import Mapping +from enum import Enum, auto +from functools import lru_cache, partial +from pathlib import Path +from typing import Any + +# Third Party +import yaml +from jsonschema.protocols import Validator +from jsonschema.validators import validator_for +from referencing import Registry, Resource +from referencing.exceptions import NoSuchResource +from referencing.jsonschema import DRAFT202012 +from typing_extensions import Self + +from . import schema_base, schema_versions + +logger = logging.getLogger(__name__) + +DEFAULT_TAXONOMY_FOLDERS: list[str] = ["compositional_skills", "knowledge"] +"""Taxonomy folders which are also the schema names""" + +DEFAULT_YAMLLINT_CONFIG: str = "{extends: relaxed, rules: {line-length: {max: 120}}}" +"""Default yamllint configuration""" + + +class TaxonomyReadingException(Exception): + """An exception raised during reading of the taxonomy.""" + + +class TaxonomyMessageFormat(Enum): + """An enum for the format choices for taxonomy parsing messages""" + + AUTO = auto() + """ + If specified, then GITHUB will be used if both the GITHUB_ACTIONS and GITHUB_WORKFLOW + environment variables are set. Otherwise STANDARD will be used. + """ + STANDARD = auto() + """ + A plain message starting with ERROR or WARN. + """ + GITHUB = auto() + """ + Uses GitHub Actions workflow commands to set error or warning commands. + """ + LOGGING = auto() + """ + Uses this module's logger to log warnings or errors. + """ + + +class Taxonomy: + """A container for a parsed taxonomy qna.yaml file.""" + + def __init__(self, *, path: Path, abs_path: Path, message_format: TaxonomyMessageFormat) -> None: + """Create a container for a parsed taxonomy qna.yaml file. + + Args: + path (Path): The taxonomy path for the qna.yaml file. + This should be a relative path starting at the taxonomy folder holding the qna.yaml file. + abs_path (Path): The absolute path for the qna.yaml file. + message_format (TaxonomyMessageFormat): The format of any messages. + If TaxonomyMessageFormat.AUTO is specified, then TaxonomyMessageFormat.GITHUB will be used if + both the GITHUB_ACTIONS and GITHUB_WORKFLOW environment variables are set. Otherwise + TaxonomyMessageFormat.STANDARD will be used. + """ + self.path: Path = path + abs_path = abs_path.resolve() + cwd = Path.cwd() + self.rel_path: Path = abs_path.relative_to(cwd) if abs_path.is_relative_to(cwd) else abs_path + if message_format is TaxonomyMessageFormat.AUTO: + message_format = TaxonomyMessageFormat.GITHUB if "GITHUB_ACTIONS" in os.environ and "GITHUB_WORKFLOW" in os.environ else TaxonomyMessageFormat.STANDARD + self.message_format: TaxonomyMessageFormat = message_format + self.errors: int = 0 + self.warnings: int = 0 + self.parsed: Mapping[str, Any] = {} + self.version: int = 0 + + def error( + self, + message: str, + *message_args: Any, + line: str | int = 1, + col: str | int = 1, + yaml_path: str = "", + ) -> Self: + """Report an error on the taxonomy qna.yaml file. + + Args: + message (str): The error message. The message supports string formatting using the specified message_args, if any. + message_args (Any, optional): Values used by the message string formatting. + line (str | int, optional): The line in the qna.yaml file where the error occurred. Defaults to 1. + col (str | int, optional): The column in the qna.yaml file where the error occurred. Defaults to 1. + yaml_path (str, optional): The yaml path to the item in the qna.yaml file where the error occurred. + + Returns: + Self: This Taxonomy object. + """ + self.errors += 1 + match self.message_format: + case TaxonomyMessageFormat.GITHUB: + if message_args: + message = message % message_args + print( + f"::error file={self.rel_path},line={line},col={col}::{line}:{col} [{yaml_path}] {message}" + if yaml_path + else f"::error file={self.rel_path},line={line},col={col}::{line}:{col} {message}" + ) + case TaxonomyMessageFormat.LOGGING: + if yaml_path: + logger.error( + "%s:%s:%s [%s] " + message, + self.rel_path, + line, + col, + yaml_path, + *message_args, + ) + else: + logger.error("%s:%s:%s " + message, self.rel_path, line, col, *message_args) + case TaxonomyMessageFormat.STANDARD | _: + if message_args: + message = message % message_args + print(f"ERROR: {self.rel_path}:{line}:{col} [{yaml_path}] {message}" if yaml_path else f"ERROR: {self.rel_path}:{line}:{col} {message}") + return self + + def warning( + self, + message: str, + *message_args: Any, + line: str | int = 1, + col: str | int = 1, + yaml_path: str = "", + ) -> Self: + """Report a warning on the taxonomy qna.yaml file. + + Args: + message (str): The warning message. The message supports string formatting using the specified message_args, if any. + message_args (Any, optional): Values used by the message string formatting. + line (str | int, optional): The line in the qna.yaml file where the warning occurred. Defaults to 1. + col (str | int, optional): The column in the qna.yaml file where the warning occurred. Defaults to 1. + yaml_path (str, optional): The yaml path to the item in the qna.yaml file where the warning occurred. + + Returns: + Self: This Taxonomy object. + """ + self.warnings += 1 + match self.message_format: + case TaxonomyMessageFormat.GITHUB: + if message_args: + message = message % message_args + print( + f"::warning file={self.rel_path},line={line},col={col}::{line}:{col} [{yaml_path}] {message}" + if yaml_path + else f"::warning file={self.rel_path},line={line},col={col}::{line}:{col} {message}" + ) + case TaxonomyMessageFormat.LOGGING: + if yaml_path: + logger.warning( + "%s:%s:%s [%s] " + message, + self.rel_path, + line, + col, + yaml_path, + *message_args, + ) + else: + logger.warning("%s:%s:%s " + message, self.rel_path, line, col, *message_args) + case TaxonomyMessageFormat.STANDARD | _: + if message_args: + message = message % message_args + print(f"WARN: {self.rel_path}:{line}:{col} [{yaml_path}] {message}" if yaml_path else f"WARN: {self.rel_path}:{line}:{col} {message}") + return self + + +class TaxonomyParser: + """A parser for taxonomy qna.yaml files. The parser will return a Taxonomy object.""" + + def __init__( + self, + *, + taxonomy_folders: list[str] | None = None, + schema_version: int | None = None, + yamllint_config: str | None = None, + message_format: TaxonomyMessageFormat | str | None = None, + ) -> None: + """Create a parser for a taxonomy qna.yaml file. + + Args: + taxonomy_folders (list[str] | None, optional): The folder/schema names. + DEFAULT_TAXONOMY_FOLDERS is used if None is specified. + schema_version (int | None, optional): The version of the Taxonomy schema. + Specifying a version less than 1 will use the schema version specified by each YAML document's "version" key. + The highest schema version is used if None is specified. + yamllint_config (str | None, optional): The yamllint configuration data. + DEFAULT_YAMLLINT_CONFIG is used if None is specified. + message_format (TaxonomyMessageFormat | str | None, optional): The format of any messages. + TaxonomyMessageFormat.AUTO is used if None is specified. + """ + if taxonomy_folders is None: + taxonomy_folders = DEFAULT_TAXONOMY_FOLDERS + self.taxonomy_folders: list[str] = taxonomy_folders + if schema_version is None: + versions = schema_versions() + if not versions: + raise TaxonomyReadingException(f'Schema base "{schema_base()}" does not contain any schema versions') + schema_version = int(versions[-1].name[1:]) + self.schema_version: int = schema_version + if yamllint_config is None: + yamllint_config = DEFAULT_YAMLLINT_CONFIG + self.yamllint_config: str = yamllint_config + if message_format is None: + message_format = TaxonomyMessageFormat.AUTO + elif isinstance(message_format, str): + message_format = TaxonomyMessageFormat[message_format.upper()] + self.message_format: TaxonomyMessageFormat = message_format + self.yq_available: bool = True + + def _yamllint(self, content: str, taxonomy: Taxonomy) -> None: + yamllint_cmd = [ + "yamllint", + "-f", + "parsable", + "-d", + self.yamllint_config, + "-", # read from stdin + ] + + try: + result = subprocess.run( + yamllint_cmd, + check=False, + input=content, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + except FileNotFoundError as e: + logger.warning( + "could not run yamllint command", + exc_info=e, + ) + return + + lines = result.stdout.splitlines() + if lines: + pattern = re.compile(r"[^:]+:(?P[^:]+):(?P[^:]+):\s*\[(?P[^]]+)\]\s*(?P.*)") + for line in lines: + match = pattern.match(line) + if match: + line = match.group("line") + col = match.group("col") + severity = match.group("severity") + message = match.group("message") + if severity == "error": + taxonomy.error( + message, + line=line, + col=col, + ) + else: + taxonomy.warning( + message, + line=line, + col=col, + ) + + @lru_cache + def _load_schema(self, path: str) -> Resource: + try: + schema_path = schema_base().joinpath(path) + contents = json.loads(schema_path.read_text(encoding="utf-8")) + resource = Resource.from_contents(contents=contents, default_specification=DRAFT202012) + return resource + except Exception as e: + raise NoSuchResource(path) from e + + def _retrieve(self, version_base: str, schema: str) -> Resource: + path = Path(version_base, schema).as_posix() + return self._load_schema(path) + + def _schema_validate(self, content: str, taxonomy: Taxonomy) -> None: + retrieve = partial(self._retrieve, f"v{taxonomy.version}") + schema_name = taxonomy.path.parts[0] + if schema_name not in self.taxonomy_folders: + schema_name = "knowledge" if "document" in taxonomy.parsed else "compositional_skills" + + try: + schema_resource = retrieve(f"{schema_name}.json") + schema = schema_resource.contents + validator_cls = validator_for(schema) + registry: Registry = Registry(retrieve=retrieve) # type: ignore[call-arg] + validator: Validator = validator_cls(schema, registry=registry) + + for validation_error in validator.iter_errors(taxonomy.parsed): + yaml_path = validation_error.json_path[1:] + if not yaml_path: + yaml_path = "." + line: str | int = 1 + if self.yq_available: + try: + yq_expression = f"{yaml_path} | line" + line = subprocess.check_output(["yq", yq_expression], input=content, text=True) + line = line.strip() if line else 1 + except (subprocess.SubprocessError, FileNotFoundError) as e: + if isinstance(e, FileNotFoundError): + self.yq_available = False + logger.warning( + "could not run yq command", + exc_info=e, + ) + if validation_error.validator == "minItems": + # Special handling for minItems which can have a long message for seed_examples + taxonomy.error( + "Value must have at least %s items", + validation_error.validator_value, + line=line, + yaml_path=yaml_path, + ) + else: + taxonomy.error( + validation_error.message[-200:], + line=line, + yaml_path=yaml_path, + ) + except NoSuchResource as e: + taxonomy.error( + "Cannot load schema file %s. %s", + e.ref, + e, + ) + + def parse(self, path: str | Path) -> Taxonomy: + """Parse a qna.yaml file into a Taxonomy object. + + Args: + path (str | Path): The path to the qna.yaml file to parse. + + Raises: + TaxonomyReadingException: If an exception is raised while parsing. + + Returns: + Taxonomy: A Taxonomy object holding the parsed qna.yaml file and any errors or warnings + identified during parsing the qna.yaml file. + These can include lint issues and also schema validation issues. + """ + abs_path = Path(path).resolve() + for i in range(len(abs_path.parts) - 1, -1, -1): + if abs_path.parts[i] in self.taxonomy_folders: + taxonomy = Taxonomy(path=Path(*abs_path.parts[i:]), abs_path=abs_path, message_format=self.message_format) + break + else: + taxonomy = Taxonomy(path=abs_path, abs_path=abs_path, message_format=self.message_format) + + if not os.path.isfile(abs_path): + return taxonomy.error( + 'The file "%s" does not exist or is not a file', + abs_path, + ) + + if abs_path.name != "qna.yaml": + return taxonomy.error( + 'Taxonomy file must be named "qna.yaml"; "%s" is not a valid name', + abs_path.name, + ) + + try: + with open(abs_path, encoding="utf-8") as stream: + content = stream.read() + + parsed: Mapping[str, Any] = yaml.safe_load(content) + if not parsed: + return taxonomy.error("The file is empty") + + if not isinstance(parsed, Mapping): + return taxonomy.error( + "The file is not valid. The top-level element is not an object with key-value pairs.", + ) + + version: int = self.schema_version + if version < 1: # Use version from YAML document + parsed_version = parsed.get("version", 1) + if isinstance(parsed_version, int): + version = parsed_version + else: + # schema validation will complain about the type + try: + version = int(parsed_version) + except ValueError: + version = 1 # fallback to version 1 + + taxonomy.version = version + taxonomy.parsed = parsed + + if version > 1: # no linting for version 1 yaml + self._yamllint(content=content, taxonomy=taxonomy) + + self._schema_validate(content=content, taxonomy=taxonomy) + except Exception as e: + raise TaxonomyReadingException from e + + return taxonomy diff --git a/tests/test_parse.py b/tests/test_parse.py new file mode 100644 index 0000000..f76666b --- /dev/null +++ b/tests/test_parse.py @@ -0,0 +1,343 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Standard +import logging +import os +import re +from collections.abc import Callable +from pathlib import Path + +# Third Party +from assertpy import assert_that +from pytest import CaptureFixture, LogCaptureFixture + +from instructlab.schema.taxonomy import TaxonomyMessageFormat, TaxonomyParser + +testdata = Path("tests/testdata") + + +class TestParsing: + def message_filter(self, regex: str) -> Callable[[logging.LogRecord], bool]: + pattern = re.compile(regex) + return lambda r: bool(re.search(pattern, r.message)) + + def test_invalid(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + test_yaml = "compositional_skills/invalid_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_greater_than_or_equal_to(1) + assert_that(taxonomy.errors).is_greater_than_or_equal_to(2) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(f"{re.escape(test_yaml)}:"), # type: ignore[call-arg] + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"line too long"), # type: ignore[call-arg] + ).contains_only(logging.WARNING) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"Unevaluated properties.*createdby"), # type: ignore[call-arg] + ).contains_only(logging.ERROR) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"created_by.*required property"), # type: ignore[call-arg] + ).contains_only(logging.ERROR) + + def test_invalid_custom_yaml_config(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + yamllint_config = "{extends: relaxed, rules: {line-length: {max: 180}}}" + test_yaml = "compositional_skills/invalid_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser( + schema_version=0, + message_format="LOGGING", + yamllint_config=yamllint_config, + ) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_greater_than_or_equal_to(2) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(f"{re.escape(test_yaml)}:"), # type: ignore[call-arg] + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"line too long"), # type: ignore[call-arg] + ).is_empty() + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"Unevaluated properties.*createdby"), # type: ignore[call-arg] + ).contains_only(logging.ERROR) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"created_by.*required property"), # type: ignore[call-arg] + ).contains_only(logging.ERROR) + + def test_incomplete_skill(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + test_yaml = "compositional_skills/skill_incomplete/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_greater_than_or_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(f"{re.escape(test_yaml)}:"), # type: ignore[call-arg] + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"[\.seed_examples].*Value must have at least"), # type: ignore[call-arg] + ).contains_only(logging.ERROR) + + def test_valid_skill(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + test_yaml = "compositional_skills/skill_valid/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format="logging") + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_zero() + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).is_empty() + + assert_that(taxonomy.parsed).contains_only("version", "created_by", "seed_examples", "task_description") + assert_that(taxonomy.parsed.get("seed_examples")).is_length(5) + + def test_valid_knowledge(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + test_yaml = "knowledge/knowledge_valid/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_zero() + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).is_empty() + + assert_that(taxonomy.parsed).contains_only( + "version", + "created_by", + "seed_examples", + "task_description", + "document", + "domain", + ) + assert_that(taxonomy.parsed.get("seed_examples")).is_length(5) + assert_that(taxonomy.parsed.get("document")).contains_only("repo", "commit", "patterns") + + def test_file_does_not_exist(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + test_yaml = "knowledge/invalid_name/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(re.escape(test_yaml)), # type: ignore[call-arg] + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"does not exist or is not a file"), # type: ignore[call-arg] + ).contains_only(logging.ERROR) + + def test_file_has_wrong_extension(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + test_yaml = "knowledge/invalid_name/qna.yml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(re.escape(test_yaml)), # type: ignore[call-arg] + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"""Taxonomy file must be named "qna\.yaml".*qna.yml"""), # type: ignore[call-arg] + ).contains_only(logging.ERROR) + + def test_file_has_wrong_name(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + test_yaml = "knowledge/invalid_name/file.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(re.escape(test_yaml)), # type: ignore[call-arg] + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"""Taxonomy file must be named "qna\.yaml".*file\.yaml"""), # type: ignore[call-arg] + ).contains_only(logging.ERROR) + + def test_empty_yaml(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + test_yaml = "compositional_skills/empty_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(re.escape(test_yaml)), # type: ignore[call-arg] + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"The file is empty"), # type: ignore[call-arg] + ).contains_only(logging.ERROR) + + def test_array_yaml(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + test_yaml = "compositional_skills/array_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(re.escape(test_yaml)), # type: ignore[call-arg] + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"The file is not valid"), # type: ignore[call-arg] + ).contains_only(logging.ERROR) + + def test_version_1(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + test_yaml = "compositional_skills/version_1/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_zero() + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(taxonomy.version).is_equal_to(1) + assert_that(caplog.records).is_empty() + + def test_version_1_as_version_2(self, caplog: LogCaptureFixture): + caplog.set_level(logging.INFO, logger="instructlab.schema") + test_yaml = "compositional_skills/version_1/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=2, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_greater_than_or_equal_to(1) + assert_that(taxonomy.errors).is_greater_than_or_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(taxonomy.version).is_equal_to(2) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(f"{re.escape(test_yaml)}:"), # type: ignore[call-arg] + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"line too long"), # type: ignore[call-arg] + ).contains_only(logging.WARNING) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"version.*required property"), # type: ignore[call-arg] + ).contains_only(logging.ERROR) + + def test_format_github(self, capsys: CaptureFixture[str]): + test_yaml = "compositional_skills/invalid_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.GITHUB) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_greater_than_or_equal_to(1) + assert_that(taxonomy.errors).is_greater_than_or_equal_to(2) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + captured = capsys.readouterr() + assert_that(captured.err).is_empty() + assert_that(captured.out).is_not_empty() + lines: list[str] = captured.out.splitlines() + assert_that(lines).is_not_empty() + pattern = f"^::(error|warning) file={re.escape(str(rel_path))}," + for line in lines: + assert_that(line).matches(pattern) # type: ignore[arg-type] + + def test_format_standard(self, capsys: CaptureFixture[str]): + test_yaml = "compositional_skills/invalid_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.STANDARD) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_greater_than_or_equal_to(1) + assert_that(taxonomy.errors).is_greater_than_or_equal_to(2) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + captured = capsys.readouterr() + assert_that(captured.err).is_empty() + assert_that(captured.out).is_not_empty() + lines: list[str] = captured.out.splitlines() + assert_that(lines).is_not_empty() + pattern = f"^(ERROR|WARN): {re.escape(str(rel_path))}:" + for line in lines: + assert_that(line).matches(pattern) # type: ignore[arg-type] + + def test_format_auto(self, capsys: CaptureFixture[str]): + test_yaml = "compositional_skills/invalid_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_greater_than_or_equal_to(1) + assert_that(taxonomy.errors).is_greater_than_or_equal_to(2) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + captured = capsys.readouterr() + assert_that(captured.err).is_empty() + assert_that(captured.out).is_not_empty() + lines: list[str] = captured.out.splitlines() + assert_that(lines).is_not_empty() + pattern = ( + f"^::(error|warning) file={re.escape(str(rel_path))}," + if "GITHUB_ACTIONS" in os.environ and "GITHUB_WORKFLOW" in os.environ + else f"^(ERROR|WARN): {re.escape(str(rel_path))}:" + ) + for line in lines: + assert_that(line).matches(pattern) # type: ignore[arg-type] diff --git a/tests/test_versions.py b/tests/test_versions.py index ed08865..5dc55e1 100644 --- a/tests/test_versions.py +++ b/tests/test_versions.py @@ -5,6 +5,7 @@ from importlib import resources # Third Party +from assertpy import assert_that from referencing import Resource from referencing.jsonschema import DRAFT202012 @@ -14,23 +15,17 @@ class TestVersions: def test_versions(self): versions = schema_versions() - assert versions is not None - assert len(versions) > 1 + assert_that(versions).is_not_none().is_not_empty() for i, v in enumerate(versions): - assert v.name == f"v{i+1}" + assert_that(v).has_name(f"v{i+1}") def _load_schema(self, path): text = path.read_text(encoding="utf-8") - assert text - assert len(text) > 1 + assert_that(text).is_not_none().is_not_empty() contents = json.loads(text) - assert contents - assert len(contents) > 1 - resource = Resource.from_contents( - contents=contents, default_specification=DRAFT202012 - ) - assert resource - assert resource.contents == contents + assert_that(contents).is_not_none().is_not_empty() + resource = Resource.from_contents(contents=contents, default_specification=DRAFT202012) + assert_that(resource).is_not_none().has_contents(contents) def test_import_schema_base(self): schema_base = resources.files("instructlab.schema") diff --git a/tests/testdata/compositional_skills/array_yaml/qna.yaml b/tests/testdata/compositional_skills/array_yaml/qna.yaml new file mode 100644 index 0000000..dc790be --- /dev/null +++ b/tests/testdata/compositional_skills/array_yaml/qna.yaml @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +["a", "b"] diff --git a/tests/testdata/compositional_skills/empty_yaml/qna.yaml b/tests/testdata/compositional_skills/empty_yaml/qna.yaml new file mode 100644 index 0000000..0f8869a --- /dev/null +++ b/tests/testdata/compositional_skills/empty_yaml/qna.yaml @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# empty diff --git a/tests/testdata/compositional_skills/invalid_yaml/qna.yaml b/tests/testdata/compositional_skills/invalid_yaml/qna.yaml new file mode 100644 index 0000000..f0083c8 --- /dev/null +++ b/tests/testdata/compositional_skills/invalid_yaml/qna.yaml @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 + +createdby: invalid_key +version: 2 +seed_examples: +- question: What is this skill about? + answer: It's a skill that makes the tests more skillful +- answer: "answer2" + question: "question2" +- answer: "answer6" + question: "This is for a unit test and has a line with 124 characters! It is too long for the default rules but not too long for the customer rules!" +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: For invalid yaml tests diff --git a/tests/testdata/compositional_skills/skill_incomplete/qna.yaml b/tests/testdata/compositional_skills/skill_incomplete/qna.yaml new file mode 100644 index 0000000..4f56a0a --- /dev/null +++ b/tests/testdata/compositional_skills/skill_incomplete/qna.yaml @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: test +version: 2 +seed_examples: +- question: "Does this yaml pass schema validation?" + answer: "No, it does not! It should have 5 examples." +task_description: 'This yaml does not conform to the schema' diff --git a/tests/testdata/compositional_skills/skill_valid/qna.yaml b/tests/testdata/compositional_skills/skill_valid/qna.yaml new file mode 100644 index 0000000..354af20 --- /dev/null +++ b/tests/testdata/compositional_skills/skill_valid/qna.yaml @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: test-bot +version: 2 +seed_examples: +- answer: Yes, it is. + question: Is this for a test? +- answer: Yes I am very sure. + question: Are you sure it's for a test? +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: for testing diff --git a/tests/testdata/compositional_skills/version_1/qna.yaml b/tests/testdata/compositional_skills/version_1/qna.yaml new file mode 100644 index 0000000..da1b888 --- /dev/null +++ b/tests/testdata/compositional_skills/version_1/qna.yaml @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: someone +seed_examples: +- question: What is this skill about? + answer: It's a skill that makes the tests more skillful +- answer: "answer2" + question: "question2" +- answer: "answer6" + question: "This is for a unit test and has a line with 124 characters! It is too long for the default rules but not too long for the customer rules!" +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: For yaml tests diff --git a/tests/testdata/knowledge/invalid_name/file.yaml b/tests/testdata/knowledge/invalid_name/file.yaml new file mode 100644 index 0000000..d09b633 --- /dev/null +++ b/tests/testdata/knowledge/invalid_name/file.yaml @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: test-bot +version: 2 +domain: test-domain +seed_examples: +- question: What is this knowledge about? + answer: It's a knowledge that makes the tests more knowledgeable +- answer: "answer2" + question: "question2" +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: For knowledge tests +document: + repo: https://github.com/example-org/example-repo + commit: a0c3c8e + patterns: + - "*.md" + - docs/*.md diff --git a/tests/testdata/knowledge/invalid_name/qna.yml b/tests/testdata/knowledge/invalid_name/qna.yml new file mode 100644 index 0000000..d09b633 --- /dev/null +++ b/tests/testdata/knowledge/invalid_name/qna.yml @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: test-bot +version: 2 +domain: test-domain +seed_examples: +- question: What is this knowledge about? + answer: It's a knowledge that makes the tests more knowledgeable +- answer: "answer2" + question: "question2" +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: For knowledge tests +document: + repo: https://github.com/example-org/example-repo + commit: a0c3c8e + patterns: + - "*.md" + - docs/*.md diff --git a/tests/testdata/knowledge/knowledge_valid/qna.yaml b/tests/testdata/knowledge/knowledge_valid/qna.yaml new file mode 100644 index 0000000..d09b633 --- /dev/null +++ b/tests/testdata/knowledge/knowledge_valid/qna.yaml @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: test-bot +version: 2 +domain: test-domain +seed_examples: +- question: What is this knowledge about? + answer: It's a knowledge that makes the tests more knowledgeable +- answer: "answer2" + question: "question2" +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: For knowledge tests +document: + repo: https://github.com/example-org/example-repo + commit: a0c3c8e + patterns: + - "*.md" + - docs/*.md diff --git a/tox.ini b/tox.ini index 1c4e721..b5443eb 100644 --- a/tox.ini +++ b/tox.ini @@ -7,12 +7,15 @@ envlist = ruff, pylint, mypy, jsonschema, py3-unit minversion = 4.4 [testenv] -description = Run tests (unit) +description = Run tests with pytest package = wheel wheel_build_env = pkg +passenv = + GITHUB_ACTIONS + GITHUB_WORKFLOW deps = pytest - jsonschema + assertpy commands = unit: {envpython} -m pytest {posargs:tests} @@ -20,7 +23,7 @@ commands = description = Lint with pylint deps = pylint - jsonschema + {[testenv]deps} commands = {envpython} -m pylint {posargs:src tests} @@ -30,10 +33,20 @@ skip_install = True skipsdist = true deps = ruff - jsonschema + {[testenv]deps} commands = - ./scripts/ruff.sh {posargs:fix} -allowlist_externals = ./scripts/ruff.sh + ruff check --fix + ruff format + +[testenv:ruffcheck] +description = Check code with Ruff +skip_install = True +skipsdist = true +deps = + {[testenv:ruff]deps} +commands = + ruff check --diff + ruff format --check [testenv:mypy] description = Python type checking with mypy @@ -41,7 +54,10 @@ namespace_packages = True explicit_package_bases = True deps = mypy - jsonschema + {[testenv]deps} + types-assertpy + types-PyYAML + types-jsonschema commands = {envpython} -m mypy {posargs:src tests}