diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4dcc1c5..85c1d1b 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -13,9 +13,3 @@ updates: directory: "/.github/workflows" schedule: interval: "daily" - - # Maintain dependencies for Python scripts - - package-ecosystem: "pip" - directory: "/.github/scripts" - schedule: - interval: "daily" diff --git a/.github/scripts/requirements.txt b/.github/scripts/requirements.txt deleted file mode 100644 index 1e6548c..0000000 --- a/.github/scripts/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -check-jsonschema>=0.28.2 diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 24bdfc8..6fac671 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -38,7 +38,6 @@ jobs: uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: fetch-depth: 0 - submodules: true - name: "Download actionlint" run: | diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 208f567..200e081 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -40,7 +40,6 @@ jobs: uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: fetch-depth: 0 - submodules: true - name: "Check Markdown documents" uses: DavidAnson/markdownlint-cli2-action@b4c9feab76d8025d1e83c653fa3990936df0e6c8 # v16.0.0 with: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c4a405b..6cb96bc 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -1,24 +1,29 @@ # SPDX-License-Identifier: Apache-2.0 -name: Lint Schema +name: Lint on: - workflow_dispatch: push: branches: - - main + - "main" paths: - - 'v*/**/*.json' + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' - '.github/workflows/lint.yml' # This workflow - - '.github/scripts/**' # Scripts used by this workflow 
pull_request: branches: - - main + - "main" paths: - - 'v*/**/*.json' + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' - '.github/workflows/lint.yml' # This workflow - - '.github/scripts/**' # Scripts used by this workflow env: LC_ALL: en_US.UTF-8 @@ -33,6 +38,25 @@ permissions: jobs: lint: runs-on: ubuntu-latest + name: "${{ matrix.lint.name }}" + strategy: + fail-fast: false + matrix: + lint: + - name: "jsonschema" + commands: | + tox -e jsonschema + - name: "ruff" + commands: | + tox -e ruff -- check + - name: "pylint" + commands: | + echo "::add-matcher::.github/workflows/matchers/pylint.json" + tox -e pylint + - name: "mypy" + commands: | + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tox -e mypy steps: - name: "Harden Runner" uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 @@ -49,24 +73,13 @@ jobs: with: python-version: "3.11" - - name: "Install Python Packages" + - name: "Install tox" run: | - pip install -r .github/scripts/requirements.txt + python -m pip install --upgrade pip + python -m pip install tox tox-gh - - name: "Find changed schema files" - id: changed-files - uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78 # v44.5.2 - with: - files: | - v*/**/*.json - - - name: "Check changed schema file contents" - if: steps.changed-files.outputs.any_changed == 'true' - run: | - check-jsonschema --verbose --schemafile https://json-schema.org/draft/2020-12/schema ${{ steps.changed-files.outputs.all_changed_files }} - - - name: "Check all schema file contents" - if: steps.changed-files.outputs.any_changed != 'true' + - name: "${{ matrix.lint.name }}" run: | - # shellcheck disable=SC2046 - check-jsonschema --verbose --schemafile https://json-schema.org/draft/2020-12/schema $(find v* -name "*.json") + ${{ matrix.lint.commands }} + env: + RUFF_OUTPUT_FORMAT: github diff --git a/.github/workflows/matchers/mypy.json 
b/.github/workflows/matchers/mypy.json new file mode 100644 index 0000000..f048fce --- /dev/null +++ b/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/pylint.json b/.github/workflows/matchers/pylint.json new file mode 100644 index 0000000..5624ca6 --- /dev/null +++ b/.github/workflows/matchers/pylint.json @@ -0,0 +1,32 @@ +{ + "problemMatcher": [ + { + "owner": "pylint-error", + "severity": "error", + "pattern": [ + { + "regexp": "^(.+):(\\d+):(\\d+):\\s(([EF]\\d{4}):\\s.+)$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + }, + { + "owner": "pylint-warning", + "severity": "warning", + "pattern": [ + { + "regexp": "^(.+):(\\d+):(\\d+):\\s(([CRW]\\d{4}):\\s.+)$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] +} diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 0000000..eeb0ca0 --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,130 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Build, test, and upload PyPI package + +on: + push: + branches: + - "main" + tags: + - "v*" + pull_request: + branches: + - "main" + release: + types: + - published + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + # Create and verify release artifacts + # - build source dist (tar ball) and wheel + # - validate artifacts with various tools + # - upload artifacts to GHA + build-package: + name: Build and check packages + runs-on: ubuntu-latest + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + + - name: 
"Checkout" + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + with: + # for setuptools-scm + fetch-depth: 0 + + - name: "Build and Inspect" + uses: hynek/build-and-inspect-python-package@b4fc3f6ba2b3da04f09659be99e2a29fb6146a61 # v2.6.0 + + # push to Test PyPI on + # - a new GitHub release is published + # - a PR is merged into main branch + publish-test-pypi: + name: Publish packages to test.pypi.org + # environment: publish-test-pypi + if: ${{ false && (github.repository_owner == 'instructlab') && ((github.event.action == 'published') || ((github.event_name == 'push') && (github.ref == 'refs/heads/main'))) }} + permissions: + contents: read + # see https://docs.pypi.org/trusted-publishers/ + id-token: write + runs-on: ubuntu-latest + needs: build-package + + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Download build artifacts" + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: Packages + path: dist + + - name: "Upload to Test PyPI" + uses: pypa/gh-action-pypi-publish@81e9d935c883d0b210363ab89cf05f3894778450 # v1.8.14 + with: + repository-url: https://test.pypi.org/legacy/ + + # push to Production PyPI on + # - a new GitHub release is published + publish-pypi: + name: Publish release to pypi.org + # environment: publish-pypi + if: ${{ false && (github.repository_owner == 'instructlab') && (github.event.action == 'published') }} + permissions: + # see https://docs.pypi.org/trusted-publishers/ + id-token: write + # allow gh release upload + contents: write + + runs-on: ubuntu-latest + needs: build-package + + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@f086349bfa2bd1361f7909c78558e816508cdc10 # v2.8.0 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of 
runs + + - name: "Download build artifacts" + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: Packages + path: dist + + - name: "Sigstore sign package" + uses: sigstore/gh-action-sigstore-python@61f6a500bbfdd9a2a339cf033e5421951fbc1cd2 # v2.1.1 + with: + inputs: | + ./dist/*.tar.gz + ./dist/*.whl + + - name: "Upload artifacts and signatures to GitHub release" + run: | + gh release upload '${{ github.ref_name }}' dist/* --repo '${{ github.repository }}' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + # PyPI does not accept .sigstore artifacts and + # gh-action-pypi-publish has no option to ignore them. + - name: "Remove sigstore signatures before uploading to PyPI" + run: | + rm ./dist/*.sigstore + + - name: "Upload to PyPI" + uses: pypa/gh-action-pypi-publish@81e9d935c883d0b210363ab89cf05f3894778450 # v1.8.14 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..b6c6de3 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Test + +on: + push: + branches: + - "main" + paths: + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' + - '.github/workflows/test.yml' # This workflow + pull_request: + branches: + - "main" + paths: + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' + - '.github/workflows/test.yml' # This workflow + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + test: + name: "${{ matrix.python }} on ${{ matrix.platform }}" + runs-on: "${{ matrix.platform }}" + strategy: + matrix: + python: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + platform: + - "ubuntu-latest" + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 + with: + egress-policy: audit # TODO: change to 
'egress-policy: block' after couple of runs + + - name: "Checkout" + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + with: + fetch-depth: 0 + + - name: "Setup Python ${{ matrix.python }}" + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: ${{ matrix.python }} + + - name: "Install tox" + run: | + python -m pip install --upgrade pip + python -m pip install tox tox-gh + + - name: "Unit tests" + run: | + tox diff --git a/.gitignore b/.gitignore index 701ff28..7fd12a9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,32 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# generated by setuptools_scm +/src/instructlab/schema/_version.py + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + # Environments .env .venv @@ -11,5 +40,9 @@ venv.bak/ .vscode/ .idea/ +# Caches +.tox/ +.*_cache/ + # Mac personalization files .DS_Store diff --git a/README.md b/README.md index 7756104..d5d5953 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ # Taxonomy Schema -This repository defines the JSON schema for the [Taxonomy](https://github.com/instructlab/taxonomy) YAML. +This Python package defines the JSON schema for the InstructLab [Taxonomy](https://github.com/instructlab/taxonomy) YAML. + +Consumers of this schema can `pip install instructlab-schema`, and access the schema files using `importlib.resources` on the `instructlab.schema` package. 
diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..5a28a04 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,606 @@ +# SPDX-License-Identifier: Apache-2.0 + +[build-system] +requires = ["setuptools>=64", "setuptools_scm>=8"] +build-backend = "setuptools.build_meta" + +[project] +name = "instructlab-schema" +authors = [ + { name="InstructLab", email="dev@instructlab.ai" }, +] +description = "InstructLab Taxonomy Schema" +readme = "README.md" +license = {text = "Apache-2.0"} +requires-python = ">=3.9" +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dynamic = ["dependencies", "optional-dependencies", "version"] + +[project.urls] +homepage = "https://instructlab.ai" +source = "https://github.com/instructlab/schema" +issues = "https://github.com/instructlab/schema/issues" + +[tool.setuptools_scm] +version_file = "src/instructlab/schema/_version.py" +# do not include +gREV local version, required for Test PyPI upload +local_scheme = "no-local-version" + +[tool.mypy] +python_version = "3.9" +exclude = ["^src/instructlab/schema/_version\\.py$"] + +[tool.ruff] +src = ["src"] +target-version = "py39" + +[tool.ruff.lint] +select = [ + "B", # flake8-bugbear + "E", # pycodestyle + "F", # Pyflakes + "Q", # flake8-quotes + "I", # isort + "UP", # pyupgrade + "SIM", # flake8-simplify + "TID", # flake8-tidy-imports +] + +[tool.pylint.main] +# Analyse import fallback blocks. This can be used to support both Python 2 and 3 +# compatible code, which means that the block might have code that exists only in +# one or another interpreter, leading to false positives when analysed. 
+# analyse-fallback-blocks = + +# Clear in-memory caches upon conclusion of linting. Useful if running pylint in +# a server-like mode. +# clear-cache-post-run = + +# Always return a 0 (non-error) status code, even if lint errors are found. This +# is primarily useful in continuous integration scripts. +# exit-zero = + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +# extension-pkg-allow-list = + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +# extension-pkg-whitelist = + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +# fail-on = + +# Specify a score threshold under which the program will exit with error. +fail-under = 10.0 + +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +# from-stdin = + +# Files or directories to be skipped. They should be base names, not paths. +ignore = ["_version.py"] + +# Add files or directories matching the regular expressions patterns to the +# ignore-list. The regex matches against paths and can be in Posix or Windows +# format. Because '\\' represents the directory delimiter on Windows systems, it +# can't be used as an escape character. +# ignore-paths = + +# Files or directories matching the regular expression patterns are skipped. The +# regex matches against base names, not paths. 
The default value ignores Emacs +# file locks +ignore-patterns = ["^\\.#"] + +# List of module names for which member attributes should not be checked and will +# not be imported (useful for modules/projects where namespaces are manipulated +# during runtime and thus existing member attributes cannot be deduced by static +# analysis). It supports qualified module names, as well as Unix pattern +# matching. +# ignored-modules = + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +# init-hook = + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use, and will cap the count on Windows to +# avoid hangs. +jobs = 1 + +# Control the amount of potential inferred values when inferring a single object. +# This can help the performance when dealing with large functions or complex, +# nested conditions. +limit-inference-results = 100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +# load-plugins = + +# Pickle collected data for later comparisons. +persistent = true + +# Resolve imports to .pyi stubs if available. May reduce no-member messages and +# increase not-an-iterable messages. +# prefer-stubs = + +# Minimum Python version to use for version dependent checks. Will default to the +# version used to run pylint. +py-version = "3.9" + +# Discover python modules and packages in the file system subtree. +# recursive = + +# Add paths to the list of the source roots. Supports globbing patterns. The +# source root is an absolute path or a path relative to the current working +# directory used to determine a package namespace for modules located under the +# source root. +# source-roots = + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. 
+suggestion-mode = true + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +# unsafe-load-any-extension = + +[tool.pylint.basic] +# Naming style matching correct argument names. +argument-naming-style = "snake_case" + +# Regular expression matching correct argument names. Overrides argument-naming- +# style. If left empty, argument names will be checked with the set naming style. +# argument-rgx = + +# Naming style matching correct attribute names. +attr-naming-style = "snake_case" + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming +# style. +# attr-rgx = + +# Bad variable names which should always be refused, separated by a comma. +bad-names = ["foo", "bar", "baz", "toto", "tutu", "tata"] + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +# bad-names-rgxs = + +# Naming style matching correct class attribute names. +class-attribute-naming-style = "any" + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. +# class-attribute-rgx = + +# Naming style matching correct class constant names. +class-const-naming-style = "UPPER_CASE" + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +# class-const-rgx = + +# Naming style matching correct class names. +class-naming-style = "PascalCase" + +# Regular expression matching correct class names. Overrides class-naming-style. +# If left empty, class names will be checked with the set naming style. +# class-rgx = + +# Naming style matching correct constant names. 
+const-naming-style = "UPPER_CASE" + +# Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming style. +# const-rgx = + +# Minimum line length for functions/classes that require docstrings, shorter ones +# are exempt. +docstring-min-length = -1 + +# Naming style matching correct function names. +function-naming-style = "snake_case" + +# Regular expression matching correct function names. Overrides function-naming- +# style. If left empty, function names will be checked with the set naming style. +# function-rgx = + +# Good variable names which should always be accepted, separated by a comma. +good-names = ["i", "j", "k", "ex", "Run", "_"] + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +# good-names-rgxs = + +# Include a hint for the correct naming format with invalid-name. +# include-naming-hint = + +# Naming style matching correct inline iteration names. +inlinevar-naming-style = "any" + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. +# inlinevar-rgx = + +# Naming style matching correct method names. +method-naming-style = "snake_case" + +# Regular expression matching correct method names. Overrides method-naming- +# style. If left empty, method names will be checked with the set naming style. +# method-rgx = + +# Naming style matching correct module names. +module-naming-style = "snake_case" + +# Regular expression matching correct module names. Overrides module-naming- +# style. If left empty, module names will be checked with the set naming style. +# module-rgx = + +# Colon-delimited sets of names that determine each other's naming style when the +# name regexes allow several styles. 
+# name-group = + +# Regular expression which should only match function or class names that do not +# require a docstring. +no-docstring-rgx = "^_" + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. These +# decorators are taken in consideration only for invalid-name. +property-classes = ["abc.abstractproperty"] + +# Regular expression matching correct type alias names. If left empty, type alias +# names will be checked with the set naming style. +# typealias-rgx = + +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +# typevar-rgx = + +# Naming style matching correct variable names. +variable-naming-style = "snake_case" + +# Regular expression matching correct variable names. Overrides variable-naming- +# style. If left empty, variable names will be checked with the set naming style. +# variable-rgx = + +[tool.pylint.classes] +# Warn about protected attribute access inside special methods +# check-protected-access-in-special-methods = + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods = ["__init__", "__new__", "setUp", "__post_init__"] + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected = ["_asdict", "_fields", "_replace", "_source", "_make"] + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg = ["cls"] + +# List of valid names for the first argument in a metaclass class method. 
+valid-metaclass-classmethod-first-arg = ["mcs"] + +[tool.pylint.design] +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +# exclude-too-few-public-methods = + +# List of qualified class names to ignore when counting class parents (see R0901) +# ignored-parents = + +# Maximum number of arguments for function / method. +max-args = 5 + +# Maximum number of attributes for a class (see R0902). +max-attributes = 7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr = 5 + +# Maximum number of branch for function / method body. +max-branches = 12 + +# Maximum number of locals for function / method body. +max-locals = 15 + +# Maximum number of parents for a class (see R0901). +max-parents = 7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods = 20 + +# Maximum number of return / yield for function / method body. +max-returns = 6 + +# Maximum number of statements in function / method body. +max-statements = 50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods = 2 + +[tool.pylint.exceptions] +# Exceptions that will emit a warning when caught. +overgeneral-exceptions = ["builtins.BaseException", "builtins.Exception"] + +[tool.pylint.format] +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +# expected-line-ending-format = + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines = "^\\s*(# )??$" + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren = 4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string = " " + +# Maximum number of characters on a single line. +max-line-length = 100 + +# Maximum number of lines in a module. +max-module-lines = 1100 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. 
+# single-line-class-stmt = + +# Allow the body of an if to be on the same line as the test if there is no else. +# single-line-if-stmt = + +[tool.pylint.imports] +# List of modules that can be imported at any level, not just the top level one. +# allow-any-import-level = + +# Allow explicit reexports by alias from a package __init__. +# allow-reexport-from-package = + +# Allow wildcard imports from modules that define __all__. +# allow-wildcard-with-all = + +# Deprecated modules which should not be used, separated by a comma. +# deprecated-modules = + +# Output a graph (.gv or any supported image format) of external dependencies to +# the given file (report RP0402 must not be disabled). +# ext-import-graph = + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be disabled). +# import-graph = + +# Output a graph (.gv or any supported image format) of internal dependencies to +# the given file (report RP0402 must not be disabled). +# int-import-graph = + +# Force import order to recognize a module as part of the standard compatibility +# libraries. +# known-standard-library = + +# Force import order to recognize a module as part of a third party library. +known-third-party = ["enchant"] + +# Couples of modules and preferred modules, separated by a comma. +# preferred-modules = + +[tool.pylint.logging] +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style = "old" + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules = ["logging"] + +[tool.pylint."messages control"] +# Only show warnings with the listed confidence levels. Leave empty to show all. +# Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 
+confidence = ["HIGH", "CONTROL_FLOW", "INFERENCE", "INFERENCE_FAILURE", "UNDEFINED"] + +# Disable the message, report, category or checker with the given id(s). You can +# either give multiple identifiers separated by comma (,) or put this option +# multiple times (only on the command line, not in the configuration file where +# it should appear only once). You can also use "--disable=all" to disable +# everything first and then re-enable specific checks. For example, if you want +# to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable = ["raw-checker-failed", "bad-inline-option", "locally-disabled", "file-ignored", "suppressed-message", "useless-suppression", "deprecated-pragma", "use-symbolic-message-instead", "use-implicit-booleaness-not-comparison-to-string", "use-implicit-booleaness-not-comparison-to-zero", "invalid-name", "missing-class-docstring", "missing-module-docstring", "missing-function-docstring", "consider-using-f-string", "inconsistent-return-statements", "no-member", "too-many-arguments", "too-many-locals", "too-many-branches", "too-many-statements", "cyclic-import", "too-few-public-methods", "protected-access", "fixme", "logging-format-interpolation", "logging-too-many-args", "attribute-defined-outside-init", "abstract-method", "pointless-statement", "wrong-import-order", "line-too-long"] + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where it +# should appear only once). See also the "--disable" option for examples. +enable = ["c-extension-no-member"] + +[tool.pylint.method_args] +# List of qualified names (i.e., library.method) which require a timeout +# parameter e.g. 
'requests.api.get,requests.api.post' +timeout-methods = ["requests.api.delete", "requests.api.get", "requests.api.head", "requests.api.options", "requests.api.patch", "requests.api.post", "requests.api.put", "requests.api.request"] + +[tool.pylint.miscellaneous] +# List of note tags to take in consideration, separated by a comma. +notes = ["FIXME", "XXX", "TODO"] + +# Regular expression of note tags to take in consideration. +# notes-rgx = + +[tool.pylint.refactoring] +# Maximum number of nested blocks for function / method body +max-nested-blocks = 5 + +# Complete name of functions that never returns. When checking for inconsistent- +# return-statements if a never returning function is called then it will be +# considered as an explicit return statement and no message will be printed. +never-returning-functions = ["sys.exit", "argparse.parse_error"] + +# Let 'consider-using-join' be raised when the separator to join on would be non- +# empty (resulting in expected fixes of the type: ``"- " + " - ".join(items)``) +suggest-join-with-non-empty-separator = true + +[tool.pylint.reports] +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each category, +# as well as 'statement' which is the total number of statements analyzed. This +# score is used by the global evaluation report (RP0004). +evaluation = "max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))" + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +# msg-template = + +# Set the output format. Available formats are: text, parseable, colorized, json2 +# (improved json format), json (old json format) and msvs (visual studio). You +# can also give a reporter class, e.g. 
mypackage.mymodule.MyReporterClass. +# output-format = + +# Tells whether to display a full report or only the messages. +reports = true + +# Activate the evaluation score. +score = true + +[tool.pylint.similarities] +# Comments are removed from the similarity computation +ignore-comments = true + +# Docstrings are removed from the similarity computation +ignore-docstrings = true + +# Imports are removed from the similarity computation +ignore-imports = true + +# Signatures are removed from the similarity computation +ignore-signatures = true + +# Minimum lines number of a similarity. +min-similarity-lines = 4 + +[tool.pylint.spelling] +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions = 4 + +# Spelling dictionary name. No available dictionaries : You need to install both +# the python package and the system dependency for enchant to work. +# spelling-dict = + +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. +spelling-ignore-comment-directives = "fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:" + +# List of comma separated words that should not be checked. +# spelling-ignore-words = + +# A path to a file that contains the private dictionary; one word per line. +# spelling-private-dict-file = + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +# spelling-store-unknown-words = + +[tool.pylint.typecheck] +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators = ["contextlib.contextmanager"] + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. 
+# generated-members = + +# Tells whether missing members accessed in mixin class should be ignored. A +# class is considered mixin if its name matches the mixin-class-rgx option. +# Tells whether to warn about missing members when the owner of the attribute is +# inferred to be None. +ignore-none = true + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference can +# return multiple potential results while evaluating a Python object, but some +# branches might not be evaluated, which results in partial inference. In that +# case, it might be useful to still emit no-member and other checks for the rest +# of the inferred objects. +ignore-on-opaque-inference = true + +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins = ["no-member", "not-async-context-manager", "not-context-manager", "attribute-defined-outside-init"] + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes = ["optparse.Values", "thread._local", "_thread._local", "argparse.Namespace"] + +# Show a hint with possible names when a member name was not found. The aspect of +# finding the hint is based on edit distance. +missing-member-hint = true + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance = 1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices = 1 + +# Regex pattern to define which classes are considered mixins. +mixin-class-rgx = ".*[Mm]ixin" + +# List of decorators that change the signature of a decorated function. +# signature-mutators = + +[tool.pylint.variables] +# List of additional names supposed to be defined in builtins. 
Remember that you +# should avoid defining new builtins when possible. +# additional-builtins = + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables = true + +# List of names allowed to shadow builtins +# allowed-redefined-builtins = + +# List of strings which can identify a callback function by name. A callback name +# must start or end with one of those strings. +callbacks = ["cb_", "_cb"] + +# A regular expression matching the name of dummy variables (i.e. expected to not +# be used). +dummy-variables-rgx = "_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_" + +# Argument names that match this expression will be ignored. +ignored-argument-names = "_.*|^ignored_|^unused_" + +# Tells whether we should check for unused import in __init__ files. +# init-import = + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules = ["six.moves", "past.builtins", "future.builtins", "builtins", "io"] + + + diff --git a/scripts/ruff.sh b/scripts/ruff.sh new file mode 100755 index 0000000..6bf131f --- /dev/null +++ b/scripts/ruff.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: Apache-2.0 +set -e + +# wrapper to combine ruff check and ruff format +# +# "ruff.sh fix" runs fixes and reformats the code +# "ruff.sh check" checks style, format, and imports +# "ruff.sh " passes arbitrary args to ruff + +if [ -z "$1" ]; then + echo "USAGE: $0 [check|fix|]" >&2 + exit 2 +fi + +run() { + declare -i err + + echo "RUN: '$*'" + "$@" + err=$? + echo + return $err +} + +case $1 in + "check") + declare -i exitcode=0 + + set +e + run ruff check --diff + exitcode=$(( exitcode + $? )) + + run ruff format --check + exitcode=$(( exitcode + $? )) + set -e + + if [ $exitcode -ne 0 ]; then + echo "ERROR: one or more checks have failed." >&2 + echo "Run 'tox -e ruff' to auto-correct all fixable errors."
>&2 + exit 3 + fi + ;; + "fix") + run ruff check --fix + run ruff format + ;; + *) + ruff "$@" +esac diff --git a/src/instructlab/schema/__init__.py b/src/instructlab/schema/__init__.py new file mode 100644 index 0000000..5ac6fdc --- /dev/null +++ b/src/instructlab/schema/__init__.py @@ -0,0 +1,25 @@ +"""InstructLab Taxonomy Schema""" + +# Standard +from importlib import resources + +try: + from importlib.resources.abc import Traversable # type: ignore[import-not-found] +except ImportError: # python>=3.9,<3.11 + from importlib.abc import Traversable + +__all__ = ["schema_versions"] + + +def schema_versions() -> list[Traversable]: + """Return the sorted list of schema versions. + + Returns: + list[Traversable]: A sorted list of schema versions. + """ + schema_base = resources.files(__package__) + versions = sorted( + (v for v in schema_base.iterdir() if v.name[0] == "v" and v.name[1:].isdigit()), + key=lambda k: int(k.name[1:]), + ) + return versions diff --git a/src/instructlab/schema/py.typed b/src/instructlab/schema/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/src/instructlab/schema/v1/__init__.py b/src/instructlab/schema/v1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/v1/compositional_skills.json b/src/instructlab/schema/v1/compositional_skills.json similarity index 100% rename from v1/compositional_skills.json rename to src/instructlab/schema/v1/compositional_skills.json diff --git a/v1/knowledge.json b/src/instructlab/schema/v1/knowledge.json similarity index 100% rename from v1/knowledge.json rename to src/instructlab/schema/v1/knowledge.json diff --git a/v1/version.json b/src/instructlab/schema/v1/version.json similarity index 100% rename from v1/version.json rename to src/instructlab/schema/v1/version.json diff --git a/src/instructlab/schema/v2/__init__.py b/src/instructlab/schema/v2/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/v2/compositional_skills.json 
b/src/instructlab/schema/v2/compositional_skills.json similarity index 100% rename from v2/compositional_skills.json rename to src/instructlab/schema/v2/compositional_skills.json diff --git a/v2/knowledge.json b/src/instructlab/schema/v2/knowledge.json similarity index 100% rename from v2/knowledge.json rename to src/instructlab/schema/v2/knowledge.json diff --git a/v2/version.json b/src/instructlab/schema/v2/version.json similarity index 100% rename from v2/version.json rename to src/instructlab/schema/v2/version.json diff --git a/tests/test_versions.py b/tests/test_versions.py new file mode 100644 index 0000000..ed08865 --- /dev/null +++ b/tests/test_versions.py @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Standard +import json +from importlib import resources + +# Third Party +from referencing import Resource +from referencing.jsonschema import DRAFT202012 + +from instructlab.schema import schema_versions + + +class TestVersions: + def test_versions(self): + versions = schema_versions() + assert versions is not None + assert len(versions) > 1 + for i, v in enumerate(versions): + assert v.name == f"v{i+1}" + + def _load_schema(self, path): + text = path.read_text(encoding="utf-8") + assert text + assert len(text) > 1 + contents = json.loads(text) + assert contents + assert len(contents) > 1 + resource = Resource.from_contents( + contents=contents, default_specification=DRAFT202012 + ) + assert resource + assert resource.contents == contents + + def test_import_schema_base(self): + schema_base = resources.files("instructlab.schema") + for i in range(len(schema_versions())): + schema_version = schema_base.joinpath(f"v{i+1}") + for schema_name in ("compositional_skills", "knowledge", "version"): + path = schema_version.joinpath(f"{schema_name}.json") + self._load_schema(path) + + def test_import_schema_versions(self): + for i in range(len(schema_versions())): + schema_version = resources.files(f"instructlab.schema.v{i+1}") + for schema_name in 
("compositional_skills", "knowledge", "version"): + path = schema_version.joinpath(f"{schema_name}.json") + self._load_schema(path) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..91da27e --- /dev/null +++ b/tox.ini @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: Apache-2.0 + +[tox] +# py3-unit runs unit tests with 'python3' +# py311-unit runs the same tests with 'python3.11' +envlist = ruff, pylint, mypy, jsonschema, py3-unit +minversion = 4.4 + +[testenv] +description = Run tests (unit) +package = wheel +wheel_build_env = pkg +deps = + pytest + jsonschema +commands = + unit: {envpython} -m pytest {posargs:tests} + +[testenv:pylint] +description = Lint with pylint +deps = + pylint + pylint-pydantic + jsonschema +commands = + {envpython} -m pylint --load-plugins pylint_pydantic {posargs:src/instructlab/schema/ tests/} + +[testenv:ruff] +description = Reformat and fix code with Ruff +skip_install = True +skipsdist = true +deps = + ruff + jsonschema +commands = + ./scripts/ruff.sh {posargs:fix} +allowlist_externals = ./scripts/ruff.sh + +[testenv:mypy] +description = Python type checking with mypy +namespace_packages = True +explicit_package_bases = True +deps = + mypy + jsonschema +commands = + {envpython} -m mypy {posargs:src tests} + +[testenv:jsonschema] +description = JSON schema file validation with check-jsonschema +skip_install = True +skipsdist = true +deps = + check-jsonschema +commands = + bash -c "{envpython} -m check_jsonschema --verbose --schemafile https://json-schema.org/draft/2020-12/schema {posargs:$(find src/instructlab/schema/v* -name \"*.json\" -print)}" +allowlist_externals = bash + +[gh] +python = + 3.12 = py312-unit + 3.11 = py311-unit + 3.10 = py310-unit + 3.9 = py39-unit