diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 85c1d1b..ccb6572 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -13,3 +13,9 @@ updates: directory: "/.github/workflows" schedule: interval: "daily" + + # Maintain dependencies for Python code + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c156a3a..3a46994 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -48,7 +48,7 @@ jobs: tox -e jsonschema - name: "ruff" commands: | - tox -e ruff -- check + tox -e ruffcheck - name: "pylint" commands: | echo "::add-matcher::.github/workflows/matchers/pylint.json" diff --git a/pyproject.toml b/pyproject.toml index 4bc1753..882fb1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 [build-system] -requires = ["setuptools>=64", "setuptools_scm>=8"] +requires = ["setuptools>=70.1.0", "setuptools_scm>=8"] build-backend = "setuptools.build_meta" [project] @@ -21,7 +21,15 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", ] -dynamic = ["dependencies", "optional-dependencies", "version"] +dependencies = [ + "typing_extensions", + "jsonschema>=4.22.0", + "PyYAML>=6.0.0", + # The below library should NOT be imported into any python files + # We only use the command via subprocess + "yamllint>=1.35.1", +] +dynamic = ["version"] [project.urls] homepage = "https://instructlab.ai" @@ -40,6 +48,7 @@ exclude = ["^src/instructlab/schema/_version\\.py$"] target-version = "py310" src = ["src", "tests"] extend-exclude = ["src/instructlab/schema/_version.py"] +line-length = 180 [tool.ruff.lint] select = [ @@ -53,11 +62,22 @@ select = [ "TID", # flake8-tidy-imports ] +[tool.ruff.lint.flake8-tidy-imports.banned-api] +"yamllint".msg = "yamllint is for use as a command via subprocess." + [tool.pylint.main] py-version = "3.10" source-roots = ["src", "tests"] ignore = ["_version.py"] +[tool.pylint.design] +max-branches = 20 +max-line-length = 180 +min-public-methods = 1 + +[tool.pylint.format] +max-args = 8 + [tool.pylint."messages control"] disable = [ "missing-class-docstring", diff --git a/scripts/ruff.sh b/scripts/ruff.sh deleted file mode 100755 index 6bf131f..0000000 --- a/scripts/ruff.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -# SPDX-License-Identifier: Apache-2.0 -set -e - -# wrapper to combine ruff check and ruff format -# -# "ruff.sh fix" runs fixes and reformats the code -# "ruff.sh check" checks style, format, and imports -# "ruff.sh " passes abitrary args to ruff - -if [ -z "$1" ]; then - echo "USAGE: $0 [check|fix|]" >&2 - exit 2 -fi - -run() { - declare -i err - - echo "RUN: '$*'" - "$@" - err=$? - echo - return $err -} - -case $1 in - "check") - declare -i exitcode=0 - - set +e - run ruff check --diff - exitcode=$(( exitcode + $? )) - - run ruff format --check - exitcode=$(( exitcode + $? )) - set -e - - if [ $exitcode -ne 0 ]; then - echo "ERROR: one or more checks have failed." >&2 - echo "Run 'tox -e ruff' to auto-correct all fixable errors." >&2 - exit 3 - fi - ;; - "fix") - run ruff check --fix - run ruff format - ;; - *) - ruff "$@" -esac diff --git a/src/instructlab/schema/__init__.py b/src/instructlab/schema/__init__.py index a1bc1f2..c8691da 100644 --- a/src/instructlab/schema/__init__.py +++ b/src/instructlab/schema/__init__.py @@ -3,14 +3,20 @@ """InstructLab Taxonomy Schema""" # Standard -from importlib import resources +import importlib.resources +from importlib.abc import Traversable -try: - from importlib.resources.abc import Traversable # type: ignore[import-not-found] -except ImportError: # python<3.11 - from importlib.abc import Traversable +__all__ = ["schema_base", "schema_versions"] -__all__ = ["schema_versions"] + +def schema_base() -> Traversable: + """Return the schema base. + + Returns: + Traversable: The base for the schema versions. + """ + base = importlib.resources.files(__name__) + return base def schema_versions() -> list[Traversable]: @@ -19,9 +25,8 @@ def schema_versions() -> list[Traversable]: Returns: list[Traversable]: A sorted list of schema versions. """ - schema_base = resources.files(__package__) versions = sorted( - (v for v in schema_base.iterdir() if v.name[0] == "v" and v.name[1:].isdigit()), + (v for v in schema_base().iterdir() if v.name[0] == "v" and v.name[1:].isdigit()), key=lambda k: int(k.name[1:]), ) return versions diff --git a/src/instructlab/schema/taxonomy.py b/src/instructlab/schema/taxonomy.py new file mode 100644 index 0000000..ec94667 --- /dev/null +++ b/src/instructlab/schema/taxonomy.py @@ -0,0 +1,419 @@ +# SPDX-License-Identifier: Apache-2.0 + +"""Taxonomy qna.yaml parsing""" + +# Standard +import enum +import functools +import json +import logging +import os +import pathlib +import re +import subprocess +import typing +from collections.abc import Mapping + +# Third Party +import yaml +from jsonschema.protocols import Validator +from jsonschema.validators import validator_for +from referencing import Registry, Resource +from referencing.exceptions import NoSuchResource +from referencing.jsonschema import DRAFT202012, Schema, SchemaRegistry, SchemaResource +from typing_extensions import Self + +from . import schema_base, schema_versions + +logger = logging.getLogger(__name__) + +DEFAULT_TAXONOMY_FOLDERS: list[str] = ["compositional_skills", "knowledge"] +"""Taxonomy folders which are also the schema names""" + +DEFAULT_YAMLLINT_CONFIG: str = "{extends: relaxed, rules: {line-length: {max: 120}}}" +"""Default yamllint configuration""" + + +class TaxonomyReadingException(Exception): + """An exception raised during reading of the taxonomy.""" + + +class TaxonomyMessageFormat(enum.Enum): + """An enum for the format choices for taxonomy parsing messages""" + + AUTO = enum.auto() + """ + If specified, then GITHUB will be used if both the GITHUB_ACTIONS and GITHUB_WORKFLOW + environment variables are set. Otherwise STANDARD will be used. + """ + STANDARD = enum.auto() + """ + A plain message starting with ERROR or WARN. + """ + GITHUB = enum.auto() + """ + Uses GitHub Actions workflow commands to set error or warning commands. + """ + LOGGING = enum.auto() + """ + Uses this module's logger to log warnings or errors. + """ + + +class Taxonomy: + """A container for a parsed taxonomy qna.yaml file.""" + + def __init__(self, *, taxonomy_path: pathlib.Path, abs_path: pathlib.Path, message_format: TaxonomyMessageFormat) -> None: + """Create a container for a parsed taxonomy qna.yaml file. + + Args: + taxonomy_path (pathlib.Path): The taxonomy path for the qna.yaml file. + This should be a relative path starting at the taxonomy folder holding the qna.yaml file. + abs_path (pathlib.Path): The absolute path for the qna.yaml file. + message_format (TaxonomyMessageFormat): The format of any messages. + If TaxonomyMessageFormat.AUTO is specified, then TaxonomyMessageFormat.GITHUB will be used if + both the GITHUB_ACTIONS and GITHUB_WORKFLOW environment variables are set. Otherwise + TaxonomyMessageFormat.STANDARD will be used. + """ + self.path: pathlib.Path = taxonomy_path + abs_path = abs_path.resolve() + cwd = pathlib.Path.cwd() + self.rel_path: pathlib.Path = abs_path.relative_to(cwd) if abs_path.is_relative_to(cwd) else abs_path + if message_format is TaxonomyMessageFormat.AUTO: + message_format = TaxonomyMessageFormat.GITHUB if "GITHUB_ACTIONS" in os.environ and "GITHUB_WORKFLOW" in os.environ else TaxonomyMessageFormat.STANDARD + self.message_format: TaxonomyMessageFormat = message_format + self.errors: int = 0 + self.warnings: int = 0 + self.contents: Mapping[str, typing.Any] = {} + self.version: int = 0 + + def error( + self, + message: str, + *message_args: object, + line: str | int = 1, + col: str | int = 1, + yaml_path: str = "", + ) -> Self: + """Report an error on the taxonomy qna.yaml file. + + Args: + message (str): The error message. The message supports string formatting using the specified message_args, if any. + message_args (object, optional): Values used by the message string formatting. + line (str | int, optional): The line in the qna.yaml file where the error occurred. Defaults to 1. + col (str | int, optional): The column in the qna.yaml file where the error occurred. Defaults to 1. + yaml_path (str, optional): The yaml path to the item in the qna.yaml file where the error occurred. + + Returns: + Self: This Taxonomy object. + """ + self.errors += 1 + match self.message_format: + case TaxonomyMessageFormat.GITHUB: + if message_args: + message = message % message_args + print( + f"::error file={self.rel_path},line={line},col={col}::{line}:{col} [{yaml_path}] {message}" + if yaml_path + else f"::error file={self.rel_path},line={line},col={col}::{line}:{col} {message}" + ) + case TaxonomyMessageFormat.LOGGING: + if yaml_path: + logger.error( + "%s:%s:%s [%s] " + message, + self.rel_path, + line, + col, + yaml_path, + *message_args, + ) + else: + logger.error("%s:%s:%s " + message, self.rel_path, line, col, *message_args) + case TaxonomyMessageFormat.STANDARD | _: + if message_args: + message = message % message_args + print(f"ERROR: {self.rel_path}:{line}:{col} [{yaml_path}] {message}" if yaml_path else f"ERROR: {self.rel_path}:{line}:{col} {message}") + return self + + def warning( + self, + message: str, + *message_args: object, + line: str | int = 1, + col: str | int = 1, + yaml_path: str = "", + ) -> Self: + """Report a warning on the taxonomy qna.yaml file. + + Args: + message (str): The warning message. The message supports string formatting using the specified message_args, if any. + message_args (object, optional): Values used by the message string formatting. + line (str | int, optional): The line in the qna.yaml file where the warning occurred. Defaults to 1. + col (str | int, optional): The column in the qna.yaml file where the warning occurred. Defaults to 1. + yaml_path (str, optional): The yaml path to the item in the qna.yaml file where the warning occurred. + + Returns: + Self: This Taxonomy object. + """ + self.warnings += 1 + match self.message_format: + case TaxonomyMessageFormat.GITHUB: + if message_args: + message = message % message_args + print( + f"::warning file={self.rel_path},line={line},col={col}::{line}:{col} [{yaml_path}] {message}" + if yaml_path + else f"::warning file={self.rel_path},line={line},col={col}::{line}:{col} {message}" + ) + case TaxonomyMessageFormat.LOGGING: + if yaml_path: + logger.warning( + "%s:%s:%s [%s] " + message, + self.rel_path, + line, + col, + yaml_path, + *message_args, + ) + else: + logger.warning("%s:%s:%s " + message, self.rel_path, line, col, *message_args) + case TaxonomyMessageFormat.STANDARD | _: + if message_args: + message = message % message_args + print(f"WARN: {self.rel_path}:{line}:{col} [{yaml_path}] {message}" if yaml_path else f"WARN: {self.rel_path}:{line}:{col} {message}") + return self + + +@functools.lru_cache +def _load_schema(path: str) -> SchemaResource: + schema_path = schema_base().joinpath(path) + try: + text = schema_path.read_text(encoding="utf-8") + contents: Schema = json.loads(text) + return Resource.from_contents(contents, default_specification=DRAFT202012) + except Exception as e: + raise NoSuchResource(str(schema_path)) from e + + +def _retrieve(version: str, uri: str) -> SchemaResource: + path = pathlib.Path(version, uri).as_posix() + return _load_schema(path) + + +class TaxonomyParser: + """A parser for taxonomy qna.yaml files. The parser will return a Taxonomy object.""" + + def __init__( + self, + *, + taxonomy_folders: list[str] | None = None, + schema_version: int | None = None, + yamllint_config: str | None = None, + yamllint_strict: bool = False, + message_format: TaxonomyMessageFormat | str | None = None, + ) -> None: + """Create a parser for a taxonomy qna.yaml file. + + Args: + taxonomy_folders (list[str] | None, optional): The folder/schema names. + DEFAULT_TAXONOMY_FOLDERS is used if None is specified. + schema_version (int | None, optional): The version of the Taxonomy schema. + Specifying a version less than 1 will use the schema version specified by each YAML document's "version" key. + The highest schema version is used if None is specified. + yamllint_config (str | None, optional): The yamllint configuration data. + DEFAULT_YAMLLINT_CONFIG is used if None is specified. + message_format (TaxonomyMessageFormat | str | None, optional): The format of any messages. + TaxonomyMessageFormat.AUTO is used if None is specified. + """ + if taxonomy_folders is None: + taxonomy_folders = DEFAULT_TAXONOMY_FOLDERS + self.taxonomy_folders: list[str] = taxonomy_folders + if schema_version is None: + versions = schema_versions() + if not versions: + raise TaxonomyReadingException(f'Schema base "{schema_base()}" does not contain any schema versions') + schema_version = int(versions[-1].name[1:]) + self.schema_version: int = schema_version + if yamllint_config is None: + yamllint_config = DEFAULT_YAMLLINT_CONFIG + self.yamllint_config: str = yamllint_config + self.yamllint_strict: bool = yamllint_strict + if message_format is None: + message_format = TaxonomyMessageFormat.AUTO + elif isinstance(message_format, str): + message_format = TaxonomyMessageFormat[message_format.upper()] + self.message_format: TaxonomyMessageFormat = message_format + self.yq_available: bool = True + + def _yamllint(self, text: str, taxonomy: Taxonomy) -> None: + yamllint_cmd = [ + "yamllint", + "-f", + "parsable", + "-d", + self.yamllint_config, + "-", # read from stdin + ] + + try: + result = subprocess.run( + yamllint_cmd, + check=False, + input=text, + text=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + ) + except FileNotFoundError as e: + logger.warning( + "could not run yamllint command", + exc_info=e, + ) + return + + lines = result.stdout.splitlines() + if lines: + pattern = re.compile(r"[^:]+:(?P[^:]+):(?P[^:]+):\s*\[(?P[^]]+)\]\s*(?P.*)") + for line in lines: + match = pattern.match(line) + if match: + line = match.group("line") + col = match.group("col") + severity = match.group("severity") + message = match.group("message") + if self.yamllint_strict or severity == "error": + taxonomy.error( + message, + line=line, + col=col, + ) + else: + taxonomy.warning( + message, + line=line, + col=col, + ) + + def _schema_validate(self, text: str, taxonomy: Taxonomy) -> None: + retrieve = functools.partial(_retrieve, f"v{taxonomy.version}") + schema_name = taxonomy.path.parts[0] + if schema_name not in self.taxonomy_folders: + schema_name = "knowledge" if "document" in taxonomy.contents else "compositional_skills" + + try: + schema_resource = retrieve(f"{schema_name}.json") + schema = schema_resource.contents + validator_cls = validator_for(schema) + # mypy doesn't understand attrs classes fields, see: https://github.com/python/mypy/issues/5406 + registry: SchemaRegistry = Registry(retrieve=retrieve) # type: ignore[call-arg] + validator: Validator = validator_cls(schema, registry=registry) + + for validation_error in validator.iter_errors(taxonomy.contents): + yaml_path = validation_error.json_path[1:] + if not yaml_path: + yaml_path = "." + line: str | int = 1 + if self.yq_available: + try: + yq_expression = f"{yaml_path} | line" + line = subprocess.check_output(["yq", yq_expression], input=text, text=True) + line = line.strip() if line else 1 + except (subprocess.CalledProcessError, FileNotFoundError) as e: + if isinstance(e, FileNotFoundError): + self.yq_available = False + logger.warning( + "could not run yq command", + exc_info=e, + ) + if validation_error.validator == "minItems": + # Special handling for minItems which can have a long message for seed_examples + taxonomy.error( + "Value must have at least %s items", + validation_error.validator_value, + line=line, + yaml_path=yaml_path, + ) + else: + taxonomy.error( + validation_error.message[-200:], + line=line, + yaml_path=yaml_path, + ) + except NoSuchResource as e: + taxonomy.error( + "Cannot load schema file %s. %s", + e.ref, + e, + ) + + def parse(self, path: str | pathlib.Path) -> Taxonomy: + """Parse a qna.yaml file into a Taxonomy object. + + Args: + path (str | pathlib.Path): The path to the qna.yaml file to parse. + + Raises: + TaxonomyReadingException: If an exception is raised while parsing. + + Returns: + Taxonomy: A Taxonomy object holding the parsed qna.yaml file and any errors or warnings + identified during parsing the qna.yaml file. + These can include lint issues and also schema validation issues. + """ + abs_path = pathlib.Path(path).resolve() + for i in range(len(abs_path.parts) - 1, -1, -1): + if abs_path.parts[i] in self.taxonomy_folders: + taxonomy_path = pathlib.Path(*abs_path.parts[i:]) + break + else: + taxonomy_path = abs_path + + taxonomy = Taxonomy(taxonomy_path=taxonomy_path, abs_path=abs_path, message_format=self.message_format) + + if not abs_path.is_file(): + return taxonomy.error( + 'The file "%s" does not exist or is not a file', + abs_path, + ) + + if abs_path.name != "qna.yaml": + return taxonomy.error( + 'Taxonomy file must be named "qna.yaml"; "%s" is not a valid name', + abs_path.name, + ) + + try: + text = abs_path.read_text(encoding="utf-8") + + contents: Mapping[str, typing.Any] = yaml.safe_load(text) + if not contents: + return taxonomy.warning("The file is empty") + + if not isinstance(contents, Mapping): + return taxonomy.error( + "The file is not valid. The top-level element is not an object with key-value pairs.", + ) + + version: int = self.schema_version + if version < 1: # Use version from YAML document + contents_version = contents.get("version", 1) + if isinstance(contents_version, int): + version = contents_version + else: + # schema validation will complain about the type + try: + version = int(contents_version) + except (ValueError, TypeError): + version = 1 # fallback to version 1 + + taxonomy.version = version + taxonomy.contents = contents + + if version > 1: # no linting for version 1 yaml + self._yamllint(text=text, taxonomy=taxonomy) + + self._schema_validate(text=text, taxonomy=taxonomy) + except Exception as e: + raise TaxonomyReadingException from e + + return taxonomy diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..3138b55 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Standard +import logging +import pathlib + +# Third Party +import pytest + + +@pytest.fixture +def caplog(caplog: pytest.LogCaptureFixture) -> pytest.LogCaptureFixture: # pylint: disable=redefined-outer-name + caplog.set_level(logging.INFO, logger="instructlab.schema") + return caplog + + +@pytest.fixture(scope="session") +def testdata() -> pathlib.Path: + abs_path = pathlib.Path(__file__).parent.resolve().joinpath("testdata") + cwd = pathlib.Path.cwd() + return abs_path.relative_to(cwd) if abs_path.is_relative_to(cwd) else abs_path diff --git a/tests/test_parse.py b/tests/test_parse.py new file mode 100644 index 0000000..94d84c2 --- /dev/null +++ b/tests/test_parse.py @@ -0,0 +1,358 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Standard +import logging +import os +import pathlib +import re +from collections.abc import Callable + +# Third Party +import pytest +from assertpy import assert_that + +from instructlab.schema.taxonomy import TaxonomyMessageFormat, TaxonomyParser + + +class TestParsingLogging: + def message_filter(self, regex: str) -> Callable[[logging.LogRecord], bool]: + pattern = re.compile(regex) + return lambda r: bool(re.search(pattern, r.message)) + + def test_invalid(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "compositional_skills/invalid_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_greater_than_or_equal_to(1) + assert_that(taxonomy.errors).is_greater_than_or_equal_to(2) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(f"{re.escape(test_yaml)}:"), + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"line too long"), + ).contains_only(logging.WARNING) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"Unevaluated properties.*createdby"), + ).contains_only(logging.ERROR) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"created_by.*required property"), + ).contains_only(logging.ERROR) + + def test_invalid_yamlint_strict(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "compositional_skills/invalid_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, yamllint_strict=True, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_greater_than_or_equal_to(3) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(f"{re.escape(test_yaml)}:"), + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"line too long"), + ).contains_only(logging.ERROR) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"Unevaluated properties.*createdby"), + ).contains_only(logging.ERROR) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"created_by.*required property"), + ).contains_only(logging.ERROR) + + def test_invalid_custom_yaml_config(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + yamllint_config = "{extends: relaxed, rules: {line-length: {max: 180}}}" + test_yaml = "compositional_skills/invalid_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser( + schema_version=0, + message_format="LOGGING", + yamllint_config=yamllint_config, + ) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_greater_than_or_equal_to(2) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(f"{re.escape(test_yaml)}:"), + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"line too long"), + ).is_empty() + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"Unevaluated properties.*createdby"), + ).contains_only(logging.ERROR) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"created_by.*required property"), + ).contains_only(logging.ERROR) + + def test_incomplete_skill(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "compositional_skills/skill_incomplete/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_greater_than_or_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(f"{re.escape(test_yaml)}:"), + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"[\.seed_examples].*Value must have at least"), + ).contains_only(logging.ERROR) + + def test_valid_skill(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "compositional_skills/skill_valid/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format="logging") + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_zero() + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).is_empty() + + assert_that(taxonomy.contents).contains_only("version", "created_by", "seed_examples", "task_description") + assert_that(taxonomy.contents.get("seed_examples")).is_length(5) + + def test_valid_knowledge(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "knowledge/knowledge_valid/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_zero() + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).is_empty() + + assert_that(taxonomy.contents).contains_only( + "version", + "created_by", + "seed_examples", + "task_description", + "document", + "domain", + ) + assert_that(taxonomy.contents.get("seed_examples")).is_length(5) + assert_that(taxonomy.contents.get("document")).contains_only("repo", "commit", "patterns") + + def test_file_does_not_exist(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "knowledge/invalid_name/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(re.escape(test_yaml)), + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"does not exist or is not a file"), + ).contains_only(logging.ERROR) + + def test_file_has_wrong_extension(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "knowledge/invalid_name/qna.yml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(re.escape(test_yaml)), + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"""Taxonomy file must be named "qna\.yaml".*qna.yml"""), + ).contains_only(logging.ERROR) + + def test_file_has_wrong_name(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "knowledge/invalid_name/file.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(re.escape(test_yaml)), + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"""Taxonomy file must be named "qna\.yaml".*file\.yaml"""), + ).contains_only(logging.ERROR) + + def test_empty_yaml(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "compositional_skills/empty_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_equal_to(1) + assert_that(taxonomy.errors).is_zero() + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(re.escape(test_yaml)), + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"The file is empty"), + ).contains_only(logging.WARNING) + + def test_array_yaml(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "compositional_skills/array_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(re.escape(test_yaml)), + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"The file is not valid"), + ).contains_only(logging.ERROR) + + def test_version_1(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "compositional_skills/version_1/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_zero() + assert_that(taxonomy.errors).is_zero() + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(taxonomy.version).is_equal_to(1) + assert_that(caplog.records).is_empty() + + def test_version_1_as_version_2(self, caplog: pytest.LogCaptureFixture, testdata: pathlib.Path) -> None: + test_yaml = "compositional_skills/version_1/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=2, message_format=TaxonomyMessageFormat.LOGGING) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_greater_than_or_equal_to(1) + assert_that(taxonomy.errors).is_greater_than_or_equal_to(1) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + assert_that(taxonomy.version).is_equal_to(2) + assert_that(caplog.records).extracting( + "message", + filter=self.message_filter(f"{re.escape(test_yaml)}:"), + ).is_length(len(caplog.records)) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"line too long"), + ).contains_only(logging.WARNING) + assert_that(caplog.records).extracting( + "levelno", + filter=self.message_filter(r"version.*required property"), + ).contains_only(logging.ERROR) + + +class TestParsingStdout: + def test_format_github(self, capsys: pytest.CaptureFixture[str], testdata: pathlib.Path) -> None: + test_yaml = "compositional_skills/invalid_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.GITHUB) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_greater_than_or_equal_to(1) + assert_that(taxonomy.errors).is_greater_than_or_equal_to(2) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + captured = capsys.readouterr() + assert_that(captured.err).is_empty() + assert_that(captured.out).is_not_empty() + lines: list[str] = captured.out.splitlines() + assert_that(lines).is_not_empty() + pattern = f"^::(error|warning) file={re.escape(str(rel_path))}," + for line in lines: + assert_that(line).matches(pattern) # type: ignore[arg-type] + + def test_format_standard(self, capsys: pytest.CaptureFixture[str], testdata: pathlib.Path) -> None: + test_yaml = "compositional_skills/invalid_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0, message_format=TaxonomyMessageFormat.STANDARD) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_greater_than_or_equal_to(1) + assert_that(taxonomy.errors).is_greater_than_or_equal_to(2) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + captured = capsys.readouterr() + assert_that(captured.err).is_empty() + assert_that(captured.out).is_not_empty() + lines: list[str] = captured.out.splitlines() + assert_that(lines).is_not_empty() + pattern = f"^(ERROR|WARN): {re.escape(str(rel_path))}:" + for line in lines: + assert_that(line).matches(pattern) # type: ignore[arg-type] + + def test_format_auto(self, capsys: pytest.CaptureFixture[str], testdata: pathlib.Path) -> None: + test_yaml = "compositional_skills/invalid_yaml/qna.yaml" + rel_path = testdata.joinpath(test_yaml) + parser = TaxonomyParser(schema_version=0) + taxonomy = parser.parse(rel_path) + + assert_that(taxonomy.warnings).is_greater_than_or_equal_to(1) + assert_that(taxonomy.errors).is_greater_than_or_equal_to(2) + assert_that(taxonomy.path.as_posix()).is_equal_to(test_yaml) + assert_that(taxonomy.rel_path).is_equal_to(rel_path) + captured = capsys.readouterr() + assert_that(captured.err).is_empty() + assert_that(captured.out).is_not_empty() + lines: list[str] = captured.out.splitlines() + assert_that(lines).is_not_empty() + pattern = ( + f"^::(error|warning) file={re.escape(str(rel_path))}," + if "GITHUB_ACTIONS" in os.environ and "GITHUB_WORKFLOW" in os.environ + else f"^(ERROR|WARN): {re.escape(str(rel_path))}:" + ) + for line in lines: + assert_that(line).matches(pattern) # type: ignore[arg-type] diff --git a/tests/test_versions.py b/tests/test_versions.py index ed08865..6fd0524 100644 --- a/tests/test_versions.py +++ b/tests/test_versions.py @@ -1,48 +1,49 @@ # SPDX-License-Identifier: Apache-2.0 # Standard +import importlib.resources import json -from importlib import resources +from importlib.abc import Traversable # Third Party +from assertpy import assert_that from referencing import Resource from referencing.jsonschema import DRAFT202012 -from instructlab.schema import schema_versions +from instructlab.schema import schema_base, schema_versions class TestVersions: - def test_versions(self): + def _load_schemas(self, version: Traversable) -> None: + assert_that(version).is_not_none() + for schema_name in ("compositional_skills", "knowledge", "version"): + path = version.joinpath(f"{schema_name}.json") + text = path.read_text(encoding="utf-8") + assert_that(text).is_not_none().is_not_empty() + contents = json.loads(text) + assert_that(contents).is_not_none().is_not_empty() + resource = Resource.from_contents(contents=contents, default_specification=DRAFT202012) + assert_that(resource).is_not_none().has_contents(contents) + + def test_schema_base(self) -> None: + base = schema_base() + assert_that(base).is_not_none() versions = schema_versions() - assert versions is not None - assert len(versions) > 1 - for i, v in enumerate(versions): - assert v.name == f"v{i+1}" - - def _load_schema(self, path): - text = path.read_text(encoding="utf-8") - assert text - assert len(text) > 1 - contents = json.loads(text) - assert contents - assert len(contents) > 1 - resource = Resource.from_contents( - contents=contents, default_specification=DRAFT202012 - ) - assert resource - assert resource.contents == contents - - def test_import_schema_base(self): - schema_base = resources.files("instructlab.schema") - for i in range(len(schema_versions())): - schema_version = schema_base.joinpath(f"v{i+1}") - for schema_name in ("compositional_skills", "knowledge", "version"): - path = schema_version.joinpath(f"{schema_name}.json") - self._load_schema(path) - - def test_import_schema_versions(self): - for i in range(len(schema_versions())): - schema_version = resources.files(f"instructlab.schema.v{i+1}") - for schema_name in ("compositional_skills", "knowledge", "version"): - path = schema_version.joinpath(f"{schema_name}.json") - self._load_schema(path) + assert_that(versions).is_not_none().is_not_empty() + for i in range(len(versions)): + version = base.joinpath(f"v{i+1}") + self._load_schemas(version) + + def test_schema_versions(self) -> None: + versions = schema_versions() + assert_that(versions).is_not_none().is_not_empty() + for i, version in enumerate(versions): + assert_that(version).has_name(f"v{i+1}") + self._load_schemas(version) + + def test_importlib_schema(self) -> None: + versions = schema_versions() + assert_that(versions).is_not_none().is_not_empty() + for i in range(len(versions)): + version = importlib.resources.files(f"instructlab.schema.v{i+1}") + self._load_schemas(version) diff --git a/tests/testdata/compositional_skills/array_yaml/qna.yaml b/tests/testdata/compositional_skills/array_yaml/qna.yaml new file mode 100644 index 0000000..dc790be --- /dev/null +++ b/tests/testdata/compositional_skills/array_yaml/qna.yaml @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +["a", "b"] diff --git a/tests/testdata/compositional_skills/empty_yaml/qna.yaml b/tests/testdata/compositional_skills/empty_yaml/qna.yaml new file mode 100644 index 0000000..0f8869a --- /dev/null +++ b/tests/testdata/compositional_skills/empty_yaml/qna.yaml @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# empty diff --git a/tests/testdata/compositional_skills/invalid_yaml/qna.yaml b/tests/testdata/compositional_skills/invalid_yaml/qna.yaml new file mode 100644 index 0000000..f0083c8 --- /dev/null +++ b/tests/testdata/compositional_skills/invalid_yaml/qna.yaml @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: Apache-2.0 + +createdby: invalid_key +version: 2 +seed_examples: +- question: What is this skill about? + answer: It's a skill that makes the tests more skillful +- answer: "answer2" + question: "question2" +- answer: "answer6" + question: "This is for a unit test and has a line with 124 characters! It is too long for the default rules but not too long for the customer rules!" +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: For invalid yaml tests diff --git a/tests/testdata/compositional_skills/skill_incomplete/qna.yaml b/tests/testdata/compositional_skills/skill_incomplete/qna.yaml new file mode 100644 index 0000000..4f56a0a --- /dev/null +++ b/tests/testdata/compositional_skills/skill_incomplete/qna.yaml @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: test +version: 2 +seed_examples: +- question: "Does this yaml pass schema validation?" + answer: "No, it does not! It should have 5 examples." +task_description: 'This yaml does not conform to the schema' diff --git a/tests/testdata/compositional_skills/skill_valid/qna.yaml b/tests/testdata/compositional_skills/skill_valid/qna.yaml new file mode 100644 index 0000000..354af20 --- /dev/null +++ b/tests/testdata/compositional_skills/skill_valid/qna.yaml @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: test-bot +version: 2 +seed_examples: +- answer: Yes, it is. + question: Is this for a test? +- answer: Yes I am very sure. + question: Are you sure it's for a test? +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: for testing diff --git a/tests/testdata/compositional_skills/version_1/qna.yaml b/tests/testdata/compositional_skills/version_1/qna.yaml new file mode 100644 index 0000000..da1b888 --- /dev/null +++ b/tests/testdata/compositional_skills/version_1/qna.yaml @@ -0,0 +1,17 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: someone +seed_examples: +- question: What is this skill about? + answer: It's a skill that makes the tests more skillful +- answer: "answer2" + question: "question2" +- answer: "answer6" + question: "This is for a unit test and has a line with 124 characters! It is too long for the default rules but not too long for the customer rules!" +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: For yaml tests diff --git a/tests/testdata/knowledge/invalid_name/file.yaml b/tests/testdata/knowledge/invalid_name/file.yaml new file mode 100644 index 0000000..d09b633 --- /dev/null +++ b/tests/testdata/knowledge/invalid_name/file.yaml @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: test-bot +version: 2 +domain: test-domain +seed_examples: +- question: What is this knowledge about? + answer: It's a knowledge that makes the tests more knowledgeable +- answer: "answer2" + question: "question2" +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: For knowledge tests +document: + repo: https://github.com/example-org/example-repo + commit: a0c3c8e + patterns: + - "*.md" + - docs/*.md diff --git a/tests/testdata/knowledge/invalid_name/qna.yml b/tests/testdata/knowledge/invalid_name/qna.yml new file mode 100644 index 0000000..d09b633 --- /dev/null +++ b/tests/testdata/knowledge/invalid_name/qna.yml @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: test-bot +version: 2 +domain: test-domain +seed_examples: +- question: What is this knowledge about? + answer: It's a knowledge that makes the tests more knowledgeable +- answer: "answer2" + question: "question2" +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: For knowledge tests +document: + repo: https://github.com/example-org/example-repo + commit: a0c3c8e + patterns: + - "*.md" + - docs/*.md diff --git a/tests/testdata/knowledge/knowledge_valid/qna.yaml b/tests/testdata/knowledge/knowledge_valid/qna.yaml new file mode 100644 index 0000000..d09b633 --- /dev/null +++ b/tests/testdata/knowledge/knowledge_valid/qna.yaml @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: Apache-2.0 + +created_by: test-bot +version: 2 +domain: test-domain +seed_examples: +- question: What is this knowledge about? + answer: It's a knowledge that makes the tests more knowledgeable +- answer: "answer2" + question: "question2" +- answer: "answer3" + question: "question3" +- answer: "answer4" + question: "question4" +- answer: "answer5" + question: "question5" +task_description: For knowledge tests +document: + repo: https://github.com/example-org/example-repo + commit: a0c3c8e + patterns: + - "*.md" + - docs/*.md diff --git a/tox.ini b/tox.ini index 1c4e721..b5443eb 100644 --- a/tox.ini +++ b/tox.ini @@ -7,12 +7,15 @@ envlist = ruff, pylint, mypy, jsonschema, py3-unit minversion = 4.4 [testenv] -description = Run tests (unit) +description = Run tests with pytest package = wheel wheel_build_env = pkg +passenv = + GITHUB_ACTIONS + GITHUB_WORKFLOW deps = pytest - jsonschema + assertpy commands = unit: {envpython} -m pytest {posargs:tests} @@ -20,7 +23,7 @@ commands = description = Lint with pylint deps = pylint - jsonschema + {[testenv]deps} commands = {envpython} -m pylint {posargs:src tests} @@ -30,10 +33,20 @@ skip_install = True skipsdist = true deps = ruff - jsonschema + {[testenv]deps} commands = - ./scripts/ruff.sh {posargs:fix} -allowlist_externals = ./scripts/ruff.sh + ruff check --fix + ruff format + +[testenv:ruffcheck] +description = Check code with Ruff +skip_install = True +skipsdist = true +deps = + {[testenv:ruff]deps} +commands = + ruff check --diff + ruff format --check [testenv:mypy] description = Python type checking with mypy @@ -41,7 +54,10 @@ namespace_packages = True explicit_package_bases = True deps = mypy - jsonschema + {[testenv]deps} + types-assertpy + types-PyYAML + types-jsonschema commands = {envpython} -m mypy {posargs:src tests}