From ef8815cea087b9a8b98ec8484a800b18f5adce1c Mon Sep 17 00:00:00 2001 From: BJ Hargrave Date: Wed, 5 Jun 2024 13:34:39 -0400 Subject: [PATCH] schema package: Build a python package for the schema This will allow schema users to pip install the schema instead of using something like git submodules to place the schema files at some local path. Signed-off-by: BJ Hargrave --- .github/dependabot.yml | 6 - .github/scripts/requirements.txt | 3 - .github/workflows/actionlint.yml | 1 - .github/workflows/docs.yml | 1 - .github/workflows/lint.yml | 65 +- .github/workflows/matchers/mypy.json | 16 + .github/workflows/matchers/pylint.json | 32 + .github/workflows/pypi.yml | 130 ++++ .github/workflows/test.yml | 73 ++ .gitignore | 33 + README.md | 4 +- pyproject.toml | 626 ++++++++++++++++++ scripts/ruff.sh | 54 ++ src/instructlab/schema/__init__.py | 29 + src/instructlab/schema/py.typed | 0 src/instructlab/schema/v1/__init__.py | 0 .../schema/v1}/compositional_skills.json | 0 .../instructlab/schema/v1}/knowledge.json | 0 .../instructlab/schema/v1}/version.json | 0 src/instructlab/schema/v2/__init__.py | 0 .../schema/v2}/compositional_skills.json | 0 .../instructlab/schema/v2}/knowledge.json | 0 .../instructlab/schema/v2}/version.json | 0 tests/test_versions.py | 49 ++ tox.ini | 65 ++ 25 files changed, 1149 insertions(+), 38 deletions(-) delete mode 100644 .github/scripts/requirements.txt create mode 100644 .github/workflows/matchers/mypy.json create mode 100644 .github/workflows/matchers/pylint.json create mode 100644 .github/workflows/pypi.yml create mode 100644 .github/workflows/test.yml create mode 100644 pyproject.toml create mode 100755 scripts/ruff.sh create mode 100644 src/instructlab/schema/__init__.py create mode 100644 src/instructlab/schema/py.typed create mode 100644 src/instructlab/schema/v1/__init__.py rename {v1 => src/instructlab/schema/v1}/compositional_skills.json (100%) rename {v1 => src/instructlab/schema/v1}/knowledge.json (100%) rename {v1 => src/instructlab/schema/v1}/version.json (100%) create mode 100644 src/instructlab/schema/v2/__init__.py rename {v2 => src/instructlab/schema/v2}/compositional_skills.json (100%) rename {v2 => src/instructlab/schema/v2}/knowledge.json (100%) rename {v2 => src/instructlab/schema/v2}/version.json (100%) create mode 100644 tests/test_versions.py create mode 100644 tox.ini diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4dcc1c5..85c1d1b 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -13,9 +13,3 @@ updates: directory: "/.github/workflows" schedule: interval: "daily" - - # Maintain dependencies for Python scripts - - package-ecosystem: "pip" - directory: "/.github/scripts" - schedule: - interval: "daily" diff --git a/.github/scripts/requirements.txt b/.github/scripts/requirements.txt deleted file mode 100644 index 1e6548c..0000000 --- a/.github/scripts/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -check-jsonschema>=0.28.2 diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 24bdfc8..6fac671 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -38,7 +38,6 @@ jobs: uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: fetch-depth: 0 - submodules: true - name: "Download actionlint" run: | diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 208f567..200e081 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -40,7 +40,6 @@ jobs: uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 with: fetch-depth: 0 - submodules: true - name: "Check Markdown documents" uses: DavidAnson/markdownlint-cli2-action@b4c9feab76d8025d1e83c653fa3990936df0e6c8 # v16.0.0 with: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c4a405b..6caec3e 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -1,24 +1,29 @@ # SPDX-License-Identifier: Apache-2.0 -name: Lint Schema +name: Lint on: - workflow_dispatch: push: branches: - - main + - "main" paths: - - 'v*/**/*.json' + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' - '.github/workflows/lint.yml' # This workflow - - '.github/scripts/**' # Scripts used by this workflow pull_request: branches: - - main + - "main" paths: - - 'v*/**/*.json' + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' - '.github/workflows/lint.yml' # This workflow - - '.github/scripts/**' # Scripts used by this workflow env: LC_ALL: en_US.UTF-8 @@ -33,6 +38,25 @@ permissions: jobs: lint: runs-on: ubuntu-latest + name: "${{ matrix.lint.name }}" + strategy: + fail-fast: false + matrix: + lint: + - name: "jsonschema" + commands: | + tox -e jsonschema + - name: "ruff" + commands: | + tox -e ruff -- check + - name: "pylint" + commands: | + echo "::add-matcher::.github/workflows/matchers/pylint.json" + tox -e lint + - name: "mypy" + commands: | + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tox -e mypy steps: - name: "Harden Runner" uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 @@ -49,24 +73,13 @@ jobs: with: python-version: "3.11" - - name: "Install Python Packages" + - name: "Install tox" run: | - pip install -r .github/scripts/requirements.txt + python -m pip install --upgrade pip + python -m pip install tox tox-gh - - name: "Find changed schema files" - id: changed-files - uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78 # v44.5.2 - with: - files: | - v*/**/*.json - - - name: "Check changed schema file contents" - if: steps.changed-files.outputs.any_changed == 'true' - run: | - check-jsonschema --verbose --schemafile https://json-schema.org/draft/2020-12/schema ${{ steps.changed-files.outputs.all_changed_files }} - - - name: "Check all schema file contents" - if: steps.changed-files.outputs.any_changed != 'true' + - name: "${{ matrix.lint.name }}" run: | - # shellcheck disable=SC2046 - check-jsonschema --verbose --schemafile https://json-schema.org/draft/2020-12/schema $(find v* -name "*.json") + ${{ matrix.lint.commands }} + env: + RUFF_OUTPUT_FORMAT: github diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json new file mode 100644 index 0000000..f048fce --- /dev/null +++ b/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/pylint.json b/.github/workflows/matchers/pylint.json new file mode 100644 index 0000000..5624ca6 --- /dev/null +++ b/.github/workflows/matchers/pylint.json @@ -0,0 +1,32 @@ +{ + "problemMatcher": [ + { + "owner": "pylint-error", + "severity": "error", + "pattern": [ + { + "regexp": "^(.+):(\\d+):(\\d+):\\s(([EF]\\d{4}):\\s.+)$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + }, + { + "owner": "pylint-warning", + "severity": "warning", + "pattern": [ + { + "regexp": "^(.+):(\\d+):(\\d+):\\s(([CRW]\\d{4}):\\s.+)$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] +} diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 0000000..eeb0ca0 --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,130 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Build, test, and upload PyPI package + +on: + push: + branches: + - "main" + tags: + - "v*" + pull_request: + branches: + - "main" + release: + types: + - published + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + # Create and verify release artifacts + # - build source dist (tar ball) and wheel + # - validate artifacts with various tools + # - upload artifacts to GHA + build-package: + name: Build and check packages + runs-on: ubuntu-latest + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + + - name: "Checkout" + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + with: + # for setuptools-scm + fetch-depth: 0 + + - name: "Build and Inspect" + uses: hynek/build-and-inspect-python-package@b4fc3f6ba2b3da04f09659be99e2a29fb6146a61 # v2.6.0 + + # push to Test PyPI on + # - a new GitHub release is published + # - a PR is merged into main branch + publish-test-pypi: + name: Publish packages to test.pypi.org + # environment: publish-test-pypi + if: ${{ false && (github.repository_owner == 'instructlab') && ((github.event.action == 'published') || ((github.event_name == 'push') && (github.ref == 'refs/heads/main'))) }} + permissions: + contents: read + # see https://docs.pypi.org/trusted-publishers/ + id-token: write + runs-on: ubuntu-latest + needs: build-package + + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Download build artifacts" + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: Packages + path: dist + + - name: "Upload to Test PyPI" + uses: pypa/gh-action-pypi-publish@81e9d935c883d0b210363ab89cf05f3894778450 # v1.8.14 + with: + repository-url: https://test.pypi.org/legacy/ + + # push to Production PyPI on + # - a new GitHub release is published + publish-pypi: + name: Publish release to pypi.org + # environment: publish-pypi + if: ${{ false && (github.repository_owner == 'instructlab') && (github.event.action == 'published') }} + permissions: + # see https://docs.pypi.org/trusted-publishers/ + id-token: write + # allow gh release upload + contents: write + + runs-on: ubuntu-latest + needs: build-package + + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@f086349bfa2bd1361f7909c78558e816508cdc10 # v2.8.0 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Download build artifacts" + uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 + with: + name: Packages + path: dist + + - name: "Sigstore sign package" + uses: sigstore/gh-action-sigstore-python@61f6a500bbfdd9a2a339cf033e5421951fbc1cd2 # v2.1.1 + with: + inputs: | + ./dist/*.tar.gz + ./dist/*.whl + + - name: "Upload artifacts and signatures to GitHub release" + run: | + gh release upload '${{ github.ref_name }}' dist/* --repo '${{ github.repository }}' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + # PyPI does not accept .sigstore artifacts and + # gh-action-pypi-publish has no option to ignore them. + - name: "Remove sigstore signatures before uploading to PyPI" + run: | + rm ./dist/*.sigstore + + - name: "Upload to PyPI" + uses: pypa/gh-action-pypi-publish@81e9d935c883d0b210363ab89cf05f3894778450 # v1.8.14 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..b6c6de3 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Test + +on: + push: + branches: + - "main" + paths: + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' + - '.github/workflows/test.yml' # This workflow + pull_request: + branches: + - "main" + paths: + - '**.py' + - 'src/instructlab/schema/v*/**/*.json' + - 'pyproject.toml' + - 'tox.ini' + - 'scripts/**' + - '.github/workflows/test.yml' # This workflow + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + test: + name: "${{ matrix.python }} on ${{ matrix.platform }}" + runs-on: "${{ matrix.platform }}" + strategy: + matrix: + python: + - "3.9" + - "3.10" + - "3.11" + - "3.12" + platform: + - "ubuntu-latest" + steps: + - name: "Harden Runner" + uses: step-security/harden-runner@17d0e2bd7d51742c71671bd19fa12bdc9d40a3d6 # v2.8.1 + with: + egress-policy: audit # TODO: change to 'egress-policy: block' after couple of runs + + - name: "Checkout" + uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + with: + fetch-depth: 0 + + - name: "Setup Python ${{ matrix.python }}" + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5.1.0 + with: + python-version: ${{ matrix.python }} + + - name: "Install tox" + run: | + python -m pip install --upgrade pip + python -m pip install tox tox-gh + + - name: "Unit tests" + run: | + tox diff --git a/.gitignore b/.gitignore index 701ff28..7fd12a9 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,32 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# generated by setuptools_scm +/src/instructlab/schema/_version.py + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + # Environments .env .venv @@ -11,5 +40,9 @@ venv.bak/ .vscode/ .idea/ +# Caches +.tox/ +.*_cache/ + # Mac personalization files .DS_Store diff --git a/README.md b/README.md index 7756104..d5d5953 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ # Taxonomy Schema -This repository defines the JSON schema for the [Taxonomy](https://github.com/instructlab/taxonomy) YAML. +This Python package defines the JSON schema for the InstructLab [Taxonomy](https://github.com/instructlab/taxonomy) YAML. + +Consumers of this schema can `pip install instructlab-schema`, and access the schema files using `importlib.resources` on the `instructlab.schema` package. diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..7678d08 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,626 @@ +# SPDX-License-Identifier: Apache-2.0 + +[build-system] +requires = ["setuptools>=64", "setuptools_scm>=8"] +build-backend = "setuptools.build_meta" + +[project] +name = "instructlab-schema" +authors = [ + { name="InstructLab", email="dev@instructlab.ai" }, +] +description = "InstructLab Taxonomy Schema" +readme = "README.md" +license = {text = "Apache-2.0"} +requires-python = ">=3.9" +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dynamic = ["dependencies", "optional-dependencies", "version"] + +[project.urls] +homepage = "https://instructlab.ai" +source = "https://github.com/instructlab/schema" +issues = "https://github.com/instructlab/schema/issues" + +[tool.setuptools_scm] +version_file = "src/instructlab/schema/_version.py" +# do not include +gREV local version, required for Test PyPI upload +local_scheme = "no-local-version" + +[tool.mypy] +python_version = "3.9" +exclude = ["^src/instructlab/schema/_version\\.py$"] + +[tool.ruff] +target-version = "py39" +extend-exclude = ["src/instructlab/schema/_version.py"] +# same as black's default line length +line-length = 88 + +[tool.ruff.lint] +# Allow fix for all enabled rules (when `--fix`) is provided. +fixable = ["ALL"] +unfixable = [] + +# Fixers will be enabled gradually. +select = [ + "B", # flake8-bugbear + "E", # pycodestyle + "F", # Pyflakes + "Q", # flake8-quotes + # Ruff does not support isort's import_headings feature, yet. + "I", # isort + # "UP", # pyupgrade + "SIM", # flake8-simplify + "TID", # flake8-tidy-imports +] + +[tool.isort] +extend_skip = ["src/instructlab/schema/_version.py"] +profile="black" +from_first=true +import_heading_future="Future" +import_heading_stdlib="Standard" +import_heading_thirdparty="Third Party" +import_heading_firstparty="First Party" +import_heading_localfolder="Local" +known_firstparty="" +known_localfolder="tuning" + +[tool.pylint.main] +# Analyse import fallback blocks. This can be used to support both Python 2 and 3 +# compatible code, which means that the block might have code that exists only in +# one or another interpreter, leading to false positives when analysed. +# analyse-fallback-blocks = + +# Clear in-memory caches upon conclusion of linting. Useful if running pylint in +# a server-like mode. +# clear-cache-post-run = + +# Always return a 0 (non-error) status code, even if lint errors are found. This +# is primarily useful in continuous integration scripts. +# exit-zero = + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. +# extension-pkg-allow-list = + +# A comma-separated list of package or module names from where C extensions may +# be loaded. Extensions are loading into the active Python interpreter and may +# run arbitrary code. (This is an alternative name to extension-pkg-allow-list +# for backward compatibility.) +# extension-pkg-whitelist = + +# Return non-zero exit code if any of these messages/categories are detected, +# even if score is above --fail-under value. Syntax same as enable. Messages +# specified are enabled, while categories only check already-enabled messages. +# fail-on = + +# Specify a score threshold under which the program will exit with error. +fail-under = 10.0 + +# Interpret the stdin as a python script, whose filename needs to be passed as +# the module_or_package argument. +# from-stdin = + +# Files or directories to be skipped. They should be base names, not paths. +ignore = ["_version.py"] + +# Add files or directories matching the regular expressions patterns to the +# ignore-list. The regex matches against paths and can be in Posix or Windows +# format. Because '\\' represents the directory delimiter on Windows systems, it +# can't be used as an escape character. +# ignore-paths = + +# Files or directories matching the regular expression patterns are skipped. The +# regex matches against base names, not paths. The default value ignores Emacs +# file locks +ignore-patterns = ["^\\.#"] + +# List of module names for which member attributes should not be checked and will +# not be imported (useful for modules/projects where namespaces are manipulated +# during runtime and thus existing member attributes cannot be deduced by static +# analysis). It supports qualified module names, as well as Unix pattern +# matching. +# ignored-modules = + +# Python code to execute, usually for sys.path manipulation such as +# pygtk.require(). +# init-hook = + +# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the +# number of processors available to use, and will cap the count on Windows to +# avoid hangs. +jobs = 1 + +# Control the amount of potential inferred values when inferring a single object. +# This can help the performance when dealing with large functions or complex, +# nested conditions. +limit-inference-results = 100 + +# List of plugins (as comma separated values of python module names) to load, +# usually to register additional checkers. +# load-plugins = + +# Pickle collected data for later comparisons. +persistent = true + +# Resolve imports to .pyi stubs if available. May reduce no-member messages and +# increase not-an-iterable messages. +# prefer-stubs = + +# Minimum Python version to use for version dependent checks. Will default to the +# version used to run pylint. +py-version = "3.9" + +# Discover python modules and packages in the file system subtree. +# recursive = + +# Add paths to the list of the source roots. Supports globbing patterns. The +# source root is an absolute path or a path relative to the current working +# directory used to determine a package namespace for modules located under the +# source root. +# source-roots = + +# When enabled, pylint would attempt to guess common misconfiguration and emit +# user-friendly hints instead of false-positive error messages. +suggestion-mode = true + +# Allow loading of arbitrary C extensions. Extensions are imported into the +# active Python interpreter and may run arbitrary code. +# unsafe-load-any-extension = + +[tool.pylint.basic] +# Naming style matching correct argument names. +argument-naming-style = "snake_case" + +# Regular expression matching correct argument names. Overrides argument-naming- +# style. If left empty, argument names will be checked with the set naming style. +# argument-rgx = + +# Naming style matching correct attribute names. +attr-naming-style = "snake_case" + +# Regular expression matching correct attribute names. Overrides attr-naming- +# style. If left empty, attribute names will be checked with the set naming +# style. +# attr-rgx = + +# Bad variable names which should always be refused, separated by a comma. +bad-names = ["foo", "bar", "baz", "toto", "tutu", "tata"] + +# Bad variable names regexes, separated by a comma. If names match any regex, +# they will always be refused +# bad-names-rgxs = + +# Naming style matching correct class attribute names. +class-attribute-naming-style = "any" + +# Regular expression matching correct class attribute names. Overrides class- +# attribute-naming-style. If left empty, class attribute names will be checked +# with the set naming style. +# class-attribute-rgx = + +# Naming style matching correct class constant names. +class-const-naming-style = "UPPER_CASE" + +# Regular expression matching correct class constant names. Overrides class- +# const-naming-style. If left empty, class constant names will be checked with +# the set naming style. +# class-const-rgx = + +# Naming style matching correct class names. +class-naming-style = "PascalCase" + +# Regular expression matching correct class names. Overrides class-naming-style. +# If left empty, class names will be checked with the set naming style. +# class-rgx = + +# Naming style matching correct constant names. +const-naming-style = "UPPER_CASE" + +# Regular expression matching correct constant names. Overrides const-naming- +# style. If left empty, constant names will be checked with the set naming style. +# const-rgx = + +# Minimum line length for functions/classes that require docstrings, shorter ones +# are exempt. +docstring-min-length = -1 + +# Naming style matching correct function names. +function-naming-style = "snake_case" + +# Regular expression matching correct function names. Overrides function-naming- +# style. If left empty, function names will be checked with the set naming style. +# function-rgx = + +# Good variable names which should always be accepted, separated by a comma. +good-names = ["i", "j", "k", "ex", "Run", "_"] + +# Good variable names regexes, separated by a comma. If names match any regex, +# they will always be accepted +# good-names-rgxs = + +# Include a hint for the correct naming format with invalid-name. +# include-naming-hint = + +# Naming style matching correct inline iteration names. +inlinevar-naming-style = "any" + +# Regular expression matching correct inline iteration names. Overrides +# inlinevar-naming-style. If left empty, inline iteration names will be checked +# with the set naming style. +# inlinevar-rgx = + +# Naming style matching correct method names. +method-naming-style = "snake_case" + +# Regular expression matching correct method names. Overrides method-naming- +# style. If left empty, method names will be checked with the set naming style. +# method-rgx = + +# Naming style matching correct module names. +module-naming-style = "snake_case" + +# Regular expression matching correct module names. Overrides module-naming- +# style. If left empty, module names will be checked with the set naming style. +# module-rgx = + +# Colon-delimited sets of names that determine each other's naming style when the +# name regexes allow several styles. +# name-group = + +# Regular expression which should only match function or class names that do not +# require a docstring. +no-docstring-rgx = "^_" + +# List of decorators that produce properties, such as abc.abstractproperty. Add +# to this list to register other decorators that produce valid properties. These +# decorators are taken in consideration only for invalid-name. +property-classes = ["abc.abstractproperty"] + +# Regular expression matching correct type alias names. If left empty, type alias +# names will be checked with the set naming style. +# typealias-rgx = + +# Regular expression matching correct type variable names. If left empty, type +# variable names will be checked with the set naming style. +# typevar-rgx = + +# Naming style matching correct variable names. +variable-naming-style = "snake_case" + +# Regular expression matching correct variable names. Overrides variable-naming- +# style. If left empty, variable names will be checked with the set naming style. +# variable-rgx = + +[tool.pylint.classes] +# Warn about protected attribute access inside special methods +# check-protected-access-in-special-methods = + +# List of method names used to declare (i.e. assign) instance attributes. +defining-attr-methods = ["__init__", "__new__", "setUp", "__post_init__"] + +# List of member names, which should be excluded from the protected access +# warning. +exclude-protected = ["_asdict", "_fields", "_replace", "_source", "_make"] + +# List of valid names for the first argument in a class method. +valid-classmethod-first-arg = ["cls"] + +# List of valid names for the first argument in a metaclass class method. +valid-metaclass-classmethod-first-arg = ["mcs"] + +[tool.pylint.design] +# List of regular expressions of class ancestor names to ignore when counting +# public methods (see R0903) +# exclude-too-few-public-methods = + +# List of qualified class names to ignore when counting class parents (see R0901) +# ignored-parents = + +# Maximum number of arguments for function / method. +max-args = 5 + +# Maximum number of attributes for a class (see R0902). +max-attributes = 7 + +# Maximum number of boolean expressions in an if statement (see R0916). +max-bool-expr = 5 + +# Maximum number of branch for function / method body. +max-branches = 12 + +# Maximum number of locals for function / method body. +max-locals = 15 + +# Maximum number of parents for a class (see R0901). +max-parents = 7 + +# Maximum number of public methods for a class (see R0904). +max-public-methods = 20 + +# Maximum number of return / yield for function / method body. +max-returns = 6 + +# Maximum number of statements in function / method body. +max-statements = 50 + +# Minimum number of public methods for a class (see R0903). +min-public-methods = 2 + +[tool.pylint.exceptions] +# Exceptions that will emit a warning when caught. +overgeneral-exceptions = ["builtins.BaseException", "builtins.Exception"] + +[tool.pylint.format] +# Expected format of line ending, e.g. empty (any line ending), LF or CRLF. +# expected-line-ending-format = + +# Regexp for a line that is allowed to be longer than the limit. +ignore-long-lines = "^\\s*(# )??$" + +# Number of spaces of indent required inside a hanging or continued line. +indent-after-paren = 4 + +# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 +# tab). +indent-string = " " + +# Maximum number of characters on a single line. +max-line-length = 100 + +# Maximum number of lines in a module. +max-module-lines = 1100 + +# Allow the body of a class to be on the same line as the declaration if body +# contains single statement. +# single-line-class-stmt = + +# Allow the body of an if to be on the same line as the test if there is no else. +# single-line-if-stmt = + +[tool.pylint.imports] +# List of modules that can be imported at any level, not just the top level one. +# allow-any-import-level = + +# Allow explicit reexports by alias from a package __init__. +# allow-reexport-from-package = + +# Allow wildcard imports from modules that define __all__. +# allow-wildcard-with-all = + +# Deprecated modules which should not be used, separated by a comma. +# deprecated-modules = + +# Output a graph (.gv or any supported image format) of external dependencies to +# the given file (report RP0402 must not be disabled). +# ext-import-graph = + +# Output a graph (.gv or any supported image format) of all (i.e. internal and +# external) dependencies to the given file (report RP0402 must not be disabled). +# import-graph = + +# Output a graph (.gv or any supported image format) of internal dependencies to +# the given file (report RP0402 must not be disabled). +# int-import-graph = + +# Force import order to recognize a module as part of the standard compatibility +# libraries. +# known-standard-library = + +# Force import order to recognize a module as part of a third party library. +known-third-party = ["enchant"] + +# Couples of modules and preferred modules, separated by a comma. +# preferred-modules = + +[tool.pylint.logging] +# The type of string formatting that logging methods do. `old` means using % +# formatting, `new` is for `{}` formatting. +logging-format-style = "old" + +# Logging modules to check that the string format arguments are in logging +# function parameter format. +logging-modules = ["logging"] + +[tool.pylint."messages control"] +# Only show warnings with the listed confidence levels. Leave empty to show all. +# Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE, UNDEFINED. +confidence = ["HIGH", "CONTROL_FLOW", "INFERENCE", "INFERENCE_FAILURE", "UNDEFINED"] + +# Disable the message, report, category or checker with the given id(s). You can +# either give multiple identifiers separated by comma (,) or put this option +# multiple times (only on the command line, not in the configuration file where +# it should appear only once). You can also use "--disable=all" to disable +# everything first and then re-enable specific checks. For example, if you want +# to run only the similarities checker, you can use "--disable=all +# --enable=similarities". If you want to run only the classes checker, but have +# no Warning level messages displayed, use "--disable=all --enable=classes +# --disable=W". +disable = ["raw-checker-failed", "bad-inline-option", "locally-disabled", "file-ignored", "suppressed-message", "useless-suppression", "deprecated-pragma", "use-symbolic-message-instead", "use-implicit-booleaness-not-comparison-to-string", "use-implicit-booleaness-not-comparison-to-zero", "invalid-name", "missing-class-docstring", "missing-module-docstring", "missing-function-docstring", "consider-using-f-string", "inconsistent-return-statements", "no-member", "too-many-arguments", "too-many-locals", "too-many-branches", "too-many-statements", "cyclic-import", "too-few-public-methods", "protected-access", "fixme", "logging-format-interpolation", "logging-too-many-args", "attribute-defined-outside-init", "abstract-method", "pointless-statement", "wrong-import-order", "line-too-long"] + +# Enable the message, report, category or checker with the given id(s). You can +# either give multiple identifier separated by comma (,) or put this option +# multiple time (only on the command line, not in the configuration file where it +# should appear only once). See also the "--disable" option for examples. +enable = ["c-extension-no-member"] + +[tool.pylint.method_args] +# List of qualified names (i.e., library.method) which require a timeout +# parameter e.g. 'requests.api.get,requests.api.post' +timeout-methods = ["requests.api.delete", "requests.api.get", "requests.api.head", "requests.api.options", "requests.api.patch", "requests.api.post", "requests.api.put", "requests.api.request"] + +[tool.pylint.miscellaneous] +# List of note tags to take in consideration, separated by a comma. +notes = ["FIXME", "XXX", "TODO"] + +# Regular expression of note tags to take in consideration. +# notes-rgx = + +[tool.pylint.refactoring] +# Maximum number of nested blocks for function / method body +max-nested-blocks = 5 + +# Complete name of functions that never returns. When checking for inconsistent- +# return-statements if a never returning function is called then it will be +# considered as an explicit return statement and no message will be printed. +never-returning-functions = ["sys.exit", "argparse.parse_error"] + +# Let 'consider-using-join' be raised when the separator to join on would be non- +# empty (resulting in expected fixes of the type: ``"- " + " - ".join(items)``) +suggest-join-with-non-empty-separator = true + +[tool.pylint.reports] +# Python expression which should return a score less than or equal to 10. You +# have access to the variables 'fatal', 'error', 'warning', 'refactor', +# 'convention', and 'info' which contain the number of messages in each category, +# as well as 'statement' which is the total number of statements analyzed. This +# score is used by the global evaluation report (RP0004). +evaluation = "max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))" + +# Template used to display messages. This is a python new-style format string +# used to format the message information. See doc for all details. +# msg-template = + +# Set the output format. Available formats are: text, parseable, colorized, json2 +# (improved json format), json (old json format) and msvs (visual studio). You +# can also give a reporter class, e.g. mypackage.mymodule.MyReporterClass. +# output-format = + +# Tells whether to display a full report or only the messages. +reports = true + +# Activate the evaluation score. +score = true + +[tool.pylint.similarities] +# Comments are removed from the similarity computation +ignore-comments = true + +# Docstrings are removed from the similarity computation +ignore-docstrings = true + +# Imports are removed from the similarity computation +ignore-imports = true + +# Signatures are removed from the similarity computation +ignore-signatures = true + +# Minimum lines number of a similarity. +min-similarity-lines = 4 + +[tool.pylint.spelling] +# Limits count of emitted suggestions for spelling mistakes. +max-spelling-suggestions = 4 + +# Spelling dictionary name. No available dictionaries : You need to install both +# the python package and the system dependency for enchant to work. +# spelling-dict = + +# List of comma separated words that should be considered directives if they +# appear at the beginning of a comment and should not be checked. +spelling-ignore-comment-directives = "fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:" + +# List of comma separated words that should not be checked. +# spelling-ignore-words = + +# A path to a file that contains the private dictionary; one word per line. +# spelling-private-dict-file = + +# Tells whether to store unknown words to the private dictionary (see the +# --spelling-private-dict-file option) instead of raising a message. +# spelling-store-unknown-words = + +[tool.pylint.typecheck] +# List of decorators that produce context managers, such as +# contextlib.contextmanager. Add to this list to register other decorators that +# produce valid context managers. +contextmanager-decorators = ["contextlib.contextmanager"] + +# List of members which are set dynamically and missed by pylint inference +# system, and so shouldn't trigger E1101 when accessed. Python regular +# expressions are accepted. +# generated-members = + +# Tells whether missing members accessed in mixin class should be ignored. A +# class is considered mixin if its name matches the mixin-class-rgx option. +# Tells whether to warn about missing members when the owner of the attribute is +# inferred to be None. +ignore-none = true + +# This flag controls whether pylint should warn about no-member and similar +# checks whenever an opaque object is returned when inferring. The inference can +# return multiple potential results while evaluating a Python object, but some +# branches might not be evaluated, which results in partial inference. In that +# case, it might be useful to still emit no-member and other checks for the rest +# of the inferred objects. +ignore-on-opaque-inference = true + +# List of symbolic message names to ignore for Mixin members. +ignored-checks-for-mixins = ["no-member", "not-async-context-manager", "not-context-manager", "attribute-defined-outside-init"] + +# List of class names for which member attributes should not be checked (useful +# for classes with dynamically set attributes). This supports the use of +# qualified names. +ignored-classes = ["optparse.Values", "thread._local", "_thread._local", "argparse.Namespace"] + +# Show a hint with possible names when a member name was not found. The aspect of +# finding the hint is based on edit distance. +missing-member-hint = true + +# The minimum edit distance a name should have in order to be considered a +# similar match for a missing member name. +missing-member-hint-distance = 1 + +# The total number of similar names that should be taken in consideration when +# showing a hint for a missing member. +missing-member-max-choices = 1 + +# Regex pattern to define which classes are considered mixins. +mixin-class-rgx = ".*[Mm]ixin" + +# List of decorators that change the signature of a decorated function. +# signature-mutators = + +[tool.pylint.variables] +# List of additional names supposed to be defined in builtins. Remember that you +# should avoid defining new builtins when possible. +# additional-builtins = + +# Tells whether unused global variables should be treated as a violation. +allow-global-unused-variables = true + +# List of names allowed to shadow builtins +# allowed-redefined-builtins = + +# List of strings which can identify a callback function by name. A callback name +# must start or end with one of those strings. +callbacks = ["cb_", "_cb"] + +# A regular expression matching the name of dummy variables (i.e. expected to not +# be used). +dummy-variables-rgx = "_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_" + +# Argument names that match this expression will be ignored. +ignored-argument-names = "_.*|^ignored_|^unused_" + +# Tells whether we should check for unused import in __init__ files. +# init-import = + +# List of qualified module names which can have objects that can redefine +# builtins. +redefining-builtins-modules = ["six.moves", "past.builtins", "future.builtins", "builtins", "io"] + + + diff --git a/scripts/ruff.sh b/scripts/ruff.sh new file mode 100755 index 0000000..fae9d2b --- /dev/null +++ b/scripts/ruff.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: Apache-2.0 +set -e + +# wrapper to combine ruff check, ruff format, and isort +# +# "ruff.sh fix" runs fixes and reformats the code +# "ruff.sh check" checks style, format, and isort +# "ruff.sh " passes abitrary args to ruff + +if [ -z "$1" ]; then + echo "USAGE: $0 [check|fix|]" >&2 + exit 2 +fi + +run() { + declare -i err + + echo "RUN: '$*'" + "$@" + err=$? + echo + return $err +} + +case $1 in + "check") + declare -i exitcode=0 + + set +e + run ruff check . + exitcode=$(( exitcode + $? )) + + run ruff format --diff . + exitcode=$(( exitcode + $? )) + + run isort --check --diff . + exitcode=$(( exitcode + $? )) + set -e + + if [ $exitcode -ne 0 ]; then + echo "ERROR: one or more checks have failed." >&2 + echo "Run 'tox -e ruff' to auto-correct all fixable errors." >&2 + exit 3 + fi + ;; + "fix") + run ruff check --fix . + run ruff format . + run isort . + ;; + *) + ruff "$@" +esac diff --git a/src/instructlab/schema/__init__.py b/src/instructlab/schema/__init__.py new file mode 100644 index 0000000..99afb8a --- /dev/null +++ b/src/instructlab/schema/__init__.py @@ -0,0 +1,29 @@ +"""InstructLab Taxonomy Schema""" + +# Standard +from importlib import resources +from typing import List + +# pylint: disable=ungrouped-imports +try: + # Standard + from importlib.resources.abc import Traversable # type: ignore[import-not-found] +except ImportError: # python>=3.9,<3.11 + # Standard + from importlib.abc import Traversable + +__all__ = ["schema_versions"] + + +def schema_versions() -> List[Traversable]: + """Return the sorted list of schema versions. + + Returns: + List[Traversable]: A sorted list of schema versions. + """ + schema_base = resources.files(__package__) + versions = sorted( + (v for v in schema_base.iterdir() if v.name[0] == "v" and v.name[1:].isdigit()), + key=lambda k: int(k.name[1:]), + ) + return versions diff --git a/src/instructlab/schema/py.typed b/src/instructlab/schema/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/src/instructlab/schema/v1/__init__.py b/src/instructlab/schema/v1/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/v1/compositional_skills.json b/src/instructlab/schema/v1/compositional_skills.json similarity index 100% rename from v1/compositional_skills.json rename to src/instructlab/schema/v1/compositional_skills.json diff --git a/v1/knowledge.json b/src/instructlab/schema/v1/knowledge.json similarity index 100% rename from v1/knowledge.json rename to src/instructlab/schema/v1/knowledge.json diff --git a/v1/version.json b/src/instructlab/schema/v1/version.json similarity index 100% rename from v1/version.json rename to src/instructlab/schema/v1/version.json diff --git a/src/instructlab/schema/v2/__init__.py b/src/instructlab/schema/v2/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/v2/compositional_skills.json b/src/instructlab/schema/v2/compositional_skills.json similarity index 100% rename from v2/compositional_skills.json rename to src/instructlab/schema/v2/compositional_skills.json diff --git a/v2/knowledge.json b/src/instructlab/schema/v2/knowledge.json similarity index 100% rename from v2/knowledge.json rename to src/instructlab/schema/v2/knowledge.json diff --git a/v2/version.json b/src/instructlab/schema/v2/version.json similarity index 100% rename from v2/version.json rename to src/instructlab/schema/v2/version.json diff --git a/tests/test_versions.py b/tests/test_versions.py new file mode 100644 index 0000000..1b264bb --- /dev/null +++ b/tests/test_versions.py @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: Apache-2.0 + +# Standard +from importlib import resources +import json + +# Third Party +from referencing import Resource +from referencing.jsonschema import DRAFT202012 + +# First Party +from instructlab.schema import schema_versions + + +class TestVersions: + def test_versions(self): + versions = schema_versions() + assert versions is not None + assert len(versions) > 1 + for i, v in enumerate(versions): + assert v.name == f"v{i+1}" + + def _load_schema(self, path): + text = path.read_text(encoding="utf-8") + assert text + assert len(text) > 1 + contents = json.loads(text) + assert contents + assert len(contents) > 1 + resource = Resource.from_contents( + contents=contents, default_specification=DRAFT202012 + ) + assert resource + assert resource.contents == contents + + def test_import_schema_base(self): + schema_base = resources.files("instructlab.schema") + for i in range(len(schema_versions())): + schema_version = schema_base.joinpath(f"v{i+1}") + for schema_name in ("compositional_skills", "knowledge", "version"): + path = schema_version.joinpath(f"{schema_name}.json") + self._load_schema(path) + + def test_import_schema_versions(self): + for i in range(len(schema_versions())): + schema_version = resources.files(f"instructlab.schema.v{i+1}") + for schema_name in ("compositional_skills", "knowledge", "version"): + path = schema_version.joinpath(f"{schema_name}.json") + self._load_schema(path) diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..b574695 --- /dev/null +++ b/tox.ini @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: Apache-2.0 + +[tox] +# py3-unit runs unit tests with 'python3' +# py311-unit runs the same tests with 'python3.11' +envlist = ruff, lint, mypy, jsonschema, py3-unit +minversion = 4.4 + +[testenv] +description = Run tests (unit) +package = wheel +wheel_build_env = pkg +deps = + pytest + jsonschema +commands = + unit: {envpython} -m pytest {posargs:tests} + +[testenv:lint] +description = Lint with pylint +deps = + pylint + pylint-pydantic + jsonschema +commands = + {envpython} -m pylint --load-plugins pylint_pydantic {posargs:src/instructlab/schema/ tests/} + +[testenv:ruff] +description = Reformat and fix code with Ruff (and isort) +skip_install = True +skipsdist = true +deps = + ruff + isort + jsonschema +commands = + ./scripts/ruff.sh {posargs:fix} +allowlist_externals = ./scripts/ruff.sh + +[testenv:mypy] +description = Python type checking with mypy +namespace_packages = True +explicit_package_bases = True +deps = + mypy + jsonschema +commands = + {envpython} -m mypy {posargs:src tests} + +[testenv:jsonschema] +description = JSON schema file validation with check-jsonschema +skip_install = True +skipsdist = true +deps = + check-jsonschema +commands = + bash -c "{envpython} -m check_jsonschema --verbose --schemafile https://json-schema.org/draft/2020-12/schema {posargs:$(find src/instructlab/schema/v* -name \"*.json\" -print)}" +allowlist_externals = bash + +[gh] +python = + 3.12 = py312-unit + 3.11 = py311-unit + 3.10 = py310-unit + 3.9 = py39-unit