diff --git a/.github/workflows/build-wheel.yml b/.github/workflows/build-wheel.yml new file mode 100644 index 0000000..8518c43 --- /dev/null +++ b/.github/workflows/build-wheel.yml @@ -0,0 +1,51 @@ +name: Build wheels + +on: [push] + +jobs: + build: + name: Build wheels for Python ${{ matrix.python-version }} + runs-on: ubuntu-latest + if: github.ref_name == github.event.repository.default_branch + + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - name: Checkout + uses: actions/checkout@v3 + + - name: Install Poetry + run: | + PIPX_BIN_DIR=/usr/local/bin pipx install poetry + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: poetry + cache-dependency-path: poetry.lock + + - name: Set Poetry environment + run: | + poetry env use ${{ matrix.python-version }} + + - name: Install dependencies + run: | + poetry install + + - name: Build wheels + run: | + poetry build + + - name: Upload wheels + uses: actions/upload-artifact@v2 + with: + path: dist/*.whl + + - name: Upload source distribution + uses: actions/upload-artifact@v2 + if: matrix.python-version == '3.8' + with: + path: dist/*.tar.gz diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml.bak similarity index 100% rename from .github/workflows/release.yml rename to .github/workflows/release.yml.bak diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 59ffca2..6d7bda6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,33 +5,35 @@ on: [push, pull_request] jobs: test: name: Python ${{ matrix.python-version }} tests - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.8] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - - uses: actions/checkout@v2 + - name: Checkout + uses: actions/checkout@v3 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + - name: Install Poetry + run: | + PIPX_BIN_DIR=/usr/local/bin pipx install poetry + + - name: Set up Python + uses: actions/setup-python@v4 with: + python-version: ${{ matrix.python-version }} + cache: poetry + cache-dependency-path: poetry.lock - - name: Cache pip - uses: actions/cache@v1 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} - restore-keys: | - ${{ runner.os }}-pip- + - name: Set Poetry environment + run: | + poetry env use ${{ matrix.python-version }} - - name: Install dependencies (Python ${{ matrix.python-version }}) + - name: Install dependencies run: | - python -m pip install --upgrade pip - pip install typing cython setuptools>=18.0 - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + poetry install - - name: Run tests on Python ${{ matrix.python-version }} - run: make test_local + - name: Run tests + run: | + poetry run pytest -v diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..da257d7 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,43 @@ +ci: + skip: [pytest] + +default_language_version: + python: python3.8 + +repos: + # ruff: linting + formatting + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.4.4 + hooks: + - id: ruff + args: ["--fix"] + - id: ruff-format + + # pytest: testing + - repo: local + hooks: + - id: pytest + name: pytest + entry: poetry run pytest + language: system + types: [python] + pass_filenames: false + + # enforce conventional commit messages + - repo: 
https://github.com/compilerla/conventional-pre-commit + rev: v3.2.0 + hooks: + - id: conventional-pre-commit + stages: [commit-msg] + args: [] + + # # skip poetry check for now, it's large and slow + # # poetry: check lock and generate requirements.txt + # - repo: https://github.com/python-poetry/poetry + # rev: 1.8.3 + # hooks: + # - id: poetry-check + # args: ["--lock"] + # - id: poetry-export + # args: ["-f", "requirements.txt", "--with", "build", "--output", "requirements.txt"] + # verbose: true diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index f9bd145..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include requirements.txt diff --git a/Makefile b/Makefile deleted file mode 100644 index cc8b6a4..0000000 --- a/Makefile +++ /dev/null @@ -1,33 +0,0 @@ -SHELL := /bin/bash - -.PHONY: build -build: - python setup.py build_ext --inplace - -# test ground for github action, requires building local image from -# https://github.com/RalfG/python-wheels-manylinux-build -.PHONY: build_manylinux -build_manylinux: - docker run --rm -e PLAT=manylinux2010_x86_64 -v `pwd`:/github/workspace/ python-wheels-manylinux-build "cp27-cp27m cp36-cp36m" "cython setuptools>=18.0" "bzip2-devel zlib-devel" - -.PHONY: test -test: - $(MAKE) build - PYTHONPATH=. python tests/unit_test.py - PYTHONPATH=. python tests/integration_test.py - -.PHONY: test_local -test_local: - bash -c "source tests/local_test.env; PYTHONPATH=. python tests/unit_test.py" - -.PHONY: clean -clean: - rm -rf oscar.egg-info dist build docs/build ~/.pyxbld/* *.c tests/*.c *.so tests/*.so - find -name "*.pyxbldc" -delete - find -name "*.pyo" -delete - find -name "*.pyc" -delete - find -name __pycache__ -delete - -.PHONY: html -html: - PYTHONPATH=. OSCAR_TEST=1 sphinx-build -M html "docs" "docs/build" diff --git a/README.md b/README.md index eb8b0f1..ac115e9 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,132 @@ -# Python interface for OSCAR data +# python-woc +**python-woc** is the Python interface to the World of Code (WoC) data. +It supersedes the [oscar.py](https://ssc-oscar.github.io/oscar.py) project and is hundreds of times faster than invoking the [lookup](https://github.com/ssc-oscar/lookup) scripts via subprocess. -This is a convenience library to access World of Code data -(WoC; it was referred internally as oscar while development, hence the name). -Since everything is stored in local files it won't work unless you have access -to one of the WoC servers. +## Requirements -### Installation +- Linux with a GNU toolchain (only tested on x86_64, Ubuntu / CentOS) -Normally it is preinstalled on WoC servers. To install manually, -e.g. to a virtual environment not using system packages, just use: +- Python 3.8 or later -```shell -python3 setup.py build_ext -python3 setup.py install --user +## Install python-woc + +### From PyPI + +The latest version of `python-woc` is available on PyPI and can be installed using `pip`: + +```bash +pip3 install python-woc +``` + +### From Source + +To try out the latest features, you may install python-woc from source: + +```bash +git clone https://github.com/ssc-oscar/python-woc.git +cd python-woc +python3 -m pip install -r requirements.txt +python3 +``` + +## Generate Profiles + +One of the major improvements packed in python-woc is profiles. Profiles tell the driver what versions of what maps are available, decoupling the driver from the folder structure of the data. This grants the driver the ability to work with multiple versions of WoC, on a different machine, or even on the cloud. 
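+
+For instance, once a profile has been generated (as described below), the driver can be pointed at it explicitly instead of relying on the default search paths. A minimal sketch, assuming the `WocMapsLocal` driver introduced in the Python API section below accepts an explicit profile path (the argument shown here is illustrative):
+
+```python
+from woc.local import WocMapsLocal
+
+# Load the maps described by a specific profile instead of the default
+# wocprofile.json search locations (the path argument is assumed here).
+woc = WocMapsLocal("./wocprofile.json")
+print(woc.maps)  # map names made available by this profile
+```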
+ +Profiles are generated using the `woc.detect` script. The script takes a list of directories, scans for matched filenames, and generates a profile: + +```bash +python3 woc.detect /path/to/woc/1 /path/to/woc/2 ... > wocprofile.json +``` + +By default, python-woc looks for `wocprofile.json`, `~/.wocprofile.json`, and `/etc/wocprofile.json` for the profile. + +## Use CLI + +python-woc's CLI is a drop-in replacement for the `getValues` and `showCnt` Perl scripts. We expect existing scripts to work just as well with the following: + +```bash +alias getValues='python3 -m woc.get_values' +alias showCnt='python3 -m woc.show_content' ``` -Installing from sources requires extra tools to compile (cython, -manylinux docker image etc), but still possible. Refer to the -[Build page](https://ssc-oscar.github.io/oscar.py) in the reference. +The usage is the same as the original scripts, and the output should be identical: + +```bash +# echo some_key | python3 -m woc.get_values some_map +> echo e4af89166a17785c1d741b8b1d5775f3223f510f | showCnt commit 3 +tree f1b66dcca490b5c4455af319bc961a34f69c72c2 +parent c19ff598808b181f1ab2383ff0214520cb3ec659 +author Audris Mockus 1410029988 -0400 +committer Audris Mockus 1410029988 -0400 + +News for Sep 5 +``` -### Reference +You may find more examples in the [lookup](https://github.com/ssc-oscar/lookup#ov-readme) repository. +If you find any incompatibilities, please [submit an issue report](https://github.com/ssc-oscar/python-woc/issues/new). -Please see for the full reference. +## Use Python API + +The Python API is designed to get rid of the overhead of invoking the Perl scripts via subprocess. It is also more native to Python and provides a more intuitive interface. + +With a `wocprofile.json`, you can create a `WocMapsLocal` object and access the maps in the file system: + +```python +>>> from woc.local import WocMapsLocal +>>> woc = WocMapsLocal() +>>> woc.maps +{'p2c', 'a2b', 'c2ta', 'a2c', 'c2h', 'b2tac', 'a2p', 'a2f', 'c2pc', 'c2dat', 'b2c', 'P2p', 'P2c', 'c2b', 'f2b', 'b2f', 'c2p', 'P2A', 'b2fa', 'c2f', 'p2P', 'f2a', 'p2a', 'c2cc', 'f2c', 'c2r', 'b2P'} +``` + +To query the maps, you can use the `get_values` method: + +```python +>>> woc.get_values("b2fa", "05fe634ca4c8386349ac519f899145c75fff4169") +('1410029988', 'Audris Mockus ', 'e4af89166a17785c1d741b8b1d5775f3223f510f') +>>> woc.get_values("c2b", "e4af89166a17785c1d741b8b1d5775f3223f510f") +['05fe634ca4c8386349ac519f899145c75fff4169'] +>>> woc.get_values("b2tac", "05fe634ca4c8386349ac519f899145c75fff4169") +[('1410029988', 'Audris Mockus ', 'e4af89166a17785c1d741b8b1d5775f3223f510f')] +``` + +Use `show_content` to get the content of a blob, a commit, or a tree: + +```python +>>> woc.show_content("tree", "f1b66dcca490b5c4455af319bc961a34f69c72c2") +[('100644', 'README.md', '05fe634ca4c8386349ac519f899145c75fff4169'), ('100644', 'course.pdf', 'dfcd0359bfb5140b096f69d5fad3c7066f101389')] +>>> woc.show_content("commit", "e4af89166a17785c1d741b8b1d5775f3223f510f") +('f1b66dcca490b5c4455af319bc961a34f69c72c2', ('c19ff598808b181f1ab2383ff0214520cb3ec659',), ('Audris Mockus ', '1410029988', '-0400'), ('Audris Mockus ', '1410029988', '-0400'), 'News for Sep 5') +>>> woc.show_content("blob", "05fe634ca4c8386349ac519f899145c75fff4169") +'# Syllabus for "Fundamentals of Digital Archeology"\n\n## News\n\n* ...' +``` + +Note that the function returns different types for different maps. Please refer to the [documentation](https://ssc-oscar.github.io/python-woc) for details. 
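+
+The maps can also be chained to walk the data. A short sketch, assuming the `p2c` (project to commits) and `c2f` (commit to file names) maps follow the same `get_values` convention as the examples above; the project key below is purely illustrative:
+
+```python
+from woc.local import WocMapsLocal
+
+woc = WocMapsLocal()
+
+# For every commit of a project, list the file names it touched
+# (assumes p2c yields commit SHAs and c2f yields file names).
+for commit_sha in woc.get_values("p2c", "user2589_minicms"):
+    print(commit_sha, woc.get_values("c2f", commit_sha))
+```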
+ +## Use Python Objects API + +The objects API provides a more intuitive way to access the WoC data. +Note that the objects API is not a drop-in replacement for [oscar.py](https://ssc-oscar.github.io/oscar.py), even though it looks much the same: many of the methods have had their signatures changed and have been refactored to be more consistent, intuitive, and performant. Query results are cached, so you can access the same object multiple times without additional overhead. + +Call `init_woc_objects` to initialize the objects API with a WoC instance: + +```python +from woc.local import WocMapsLocal +from woc.objects import init_woc_objects +woc = WocMapsLocal() +init_woc_objects(woc) +``` + +To get the tree of a commit and the projects it belongs to: + +```python +>>> from woc.objects import Commit +>>> c1 = Commit("91f4da4c173e41ffbf0d9ecbe2f07f3a3296933c") +>>> c1.tree +Tree(836f04d5b374033b1608269e2f3aaabae263a0db) +>>> c1.projects[0].url +'https://github.com/woc-hack/thebridge' +``` + +For more, check `woc.objects` in the documentation. diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..dcaab9e --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,2 @@ +*.html +*.js \ No newline at end of file diff --git a/docs/DataFormat.md b/docs/DataFormat.md index 3a25ac6..653e8e4 100644 --- a/docs/DataFormat.md +++ b/docs/DataFormat.md @@ -1,5 +1,3 @@ - - ## Git objects ### Sequential access: diff --git a/docs/_static/custom.css b/docs/_static/custom.css deleted file mode 100644 index 94d2983..0000000 --- a/docs/_static/custom.css +++ /dev/null @@ -1,18 +0,0 @@ -/* This file intentionally left blank. */ - -div.sphinxsidebarwrapper { - position:fixed; -} - -div.document { - width: 940px; - margin: 30px auto 0 30px; -} - -.py.class { - margin: 2em 0; -} - -.py.method { - margin: 1em 0; -} \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 006864b..0000000 --- a/docs/conf.py +++ /dev/null @@ -1,163 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. - -import os -import sys -sys.path.insert(0, os.path.abspath('..')) - - -# -- Project information ----------------------------------------------------- - -project = u'oscar' -author = u'Marat (@cmu.edu)' -copyright = u'2018, ' + author - -# The full version, including alpha/beta/rc tags -release = u'1.3.3' -# The short X.Y version -version = u'.'.join(release.split(u'.', 3)[:2]) - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - # 'sphinx.ext.autosummary', - 'sphinx.ext.autodoc', - 'sphinx.ext.githubpages', - 'sphinx.ext.napoleon', - 'sphinx.ext.viewcode', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. 
-# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path . -exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'alabaster' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -# htmlhelp_basename = 'oscardoc' - - -# -- Options for LaTeX output ------------------------------------------------ - -# latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -# } - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -# latex_documents = [ -# (master_doc, 'oscar.tex', u'oscar Documentation', -# u'Marat (@cmu.edu)', 'manual'), -# ] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -# man_pages = [ -# (master_doc, 'oscar', u'oscar Documentation', -# [author], 1) -# ] - - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. 
List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -# texinfo_documents = [ -# (master_doc, 'oscar', u'oscar Documentation', -# author, 'oscar', 'One line description of project.', -# 'Miscellaneous'), -# ] - - -# -- Extension configuration ------------------------------------------------- diff --git a/docs/contribute.rst b/docs/contributing.md similarity index 67% rename from docs/contribute.rst rename to docs/contributing.md index 2f42175..daf857c 100644 --- a/docs/contribute.rst +++ b/docs/contributing.md @@ -1,21 +1,9 @@ -Starting version 1.3, `oscar` is compiled from Cython, a language closely -resembling Python but with optional strong typing to improve performance. -Building Cython packages requires few extra build steps, as explained below. +## How to Commit -In addition, automation in this project relies on few assumptions which you are -expected to follow. Below you can find their brief description and the motivation -behind. +We follow the standard "fork-and-pull" Git workflow. All the development is done on feature branches, which are then merged to `master` via pull requests. Pull requests are unit tested and linted automatically. - -How to contribute ----------------- - -All the development is done on feature branches, which are then merged to `master` -via pull requests. Every pull request triggers unit testing, every merge triggers -a release. - -To generate release notes, we use `conventional commits `_, +To generate release notes, we use [conventional commits](https://www.conventionalcommits.org), a convention for commit messages. In a nutshell, it means commit messages should be prefixed with one of: @@ -40,9 +28,83 @@ be incremented. As a consequence, **you must never change version number manuall Not following these procedures might take your pull request extra time to review and in some cases will require rewriting the commit history. +## How to ... + +### Setup dev environment + +To make sure everyone is on the same page, we use [poetry](https://python-poetry.org) +to manage dependencies and virtual environments. +If you don't have it installed yet, please follow the [installation guide](https://python-poetry.org/docs/#installation). + +After installing poetry, create a virtual environment and install all dependencies: + +```bash +poetry shell # activate the virtual environment +poetry install # install all dependencies +``` + +The `poetry install` command builds `python-woc` from source as well. + +### Install pre-commit hooks + +Pre-commit hooks ensure that all code is formatted, linted, and tested before it is pushed to GitHub. +This "fail fast, fail early" approach saves time and effort for all of us. 
+ +```bash +pre-commit install # install linter and unit tests to pre-commit hooks +pre-commit install --hook-type commit-msg # install the conventional commits checker +``` + +### Compile changes to Cython code + +```bash +python3 setup.py +``` -About Cython ------------ +### Lint + +```bash +ruff format # format all Python code +ruff check # lint all Python code +ruff check --fix # fix all linting issues +``` + +### Test + +```bash +pytest # run all unit tests +pytest -k test_name # run a specific test +pytest --cov # run all tests and check coverage +``` + +### Add or delete a dependency + +```bash +poetry add package_name # add a new dependency +# or +nano pyproject.toml # add a new dependency manually +poetry lock --no-update # update the lock file +``` + +```bash +poetry check --lock +poetry export -f requirements.txt --with build --output requirements.txt +``` + +### Build + +```bash +poetry build +``` + +### Publish to PyPI + +```bash +poetry publish --build +``` + +## About Cython The reason to use Cython was primarily Python 3 support. WoC data is stored in tokyocabinet (.tch) files, not natively supported by Python. @@ -70,14 +132,11 @@ Cython came a clear winner in this comparison, also helping to speed up some utility functions along the way (e.g. `fnvhash`, a pure Python version of which was previously used). -Compiling and packaging ----------------------- +## Compiling and packaging To compile oscar locally, run: -`python setup.py build_ext --inplace`. To explicitly specify python version, -replace `pyhon`, with the appropriate version, e.g. `python2`. -There shorter alias for this command, `make build`, will always use the default -Python. +`python setup.py`. To explicitly specify a Python version, +replace `python` with the appropriate version, e.g. `python3.8`. If you are building for several Python versions in a row without changing the code (e.g. to check if it compiles at all), make sure you clean up first by @@ -91,20 +150,19 @@ this `.so` just a second ago in this case. Packaging is slightly more complicated than just compiling since oscar needs to support at least Python 2.7 and 3.6 simultaneously, meaning we need to package multiple binaries. Fortunately, `PEP 513 `_ -offers support for such packages. Building is done via `manylinux `_, +offers support for such packages. Building is done via [manylinux](https://github.com/pypa/manylinux), a special Docker image, and is automated via GitHub action. To build package locally, -#. clone the corresponding GitHub action repository, +- clone the corresponding GitHub action repository, `git clone git@github.com:user2589/python-wheels-manylinux-build.git`, -#. check out the desired tag if necessary, e.g. `git checkout v0.3.4` -#. build Docker image: `docker build -t python-wheels-manylinux-build .` -#. run the image: `make build_manylinux` +- check out the desired tag if necessary, e.g. `git checkout v0.3.4` +- build Docker image: `docker build -t python-wheels-manylinux-build .` +- run the image: `make build_manylinux` -Testing ------- +## Testing Every push to oscar repository is automatically tested; on top, you might want to test locally before making a commit to avoid awkward followup fixes and a @@ -121,9 +179,9 @@ oscar paths to point at these fixtures requires setting some environment variables, stored in `tests/local_test.env`. To test locally, -#. set environment variables, `source tests/local_test.env` -#. 
clean up previously compiled binaries to avoid Py2/3 compatibility issues: `make clean` -#. run the test script: `PYTHONPATH=. python tests/unit_test.py`. +- set environment variables, `source tests/local_test.env` +- clean up previously compiled binaries to avoid Py2/3 compatibility issues: `make clean` +- run the test script: `PYTHONPATH=. python tests/unit_test.py`. Don't forget to replace `python` with a specific version if testing against non-default Python) @@ -140,15 +198,15 @@ To avoid manual compilation with `cythonize`, Cython tests are compiled with `pyximport`, an in-place JIT compiler. Thus, at the beginning of every test suite, install `pyximport`: -.. code-block:: python - - import pyximport - pyximport.install( - # build_dir='build', - setup_args={"script_args": ["--force"]}, - inplace=True, - language_level='3str' - ) +```python +import pyximport +pyximport.install( + # build_dir='build', + setup_args={"script_args": ["--force"]}, + inplace=True, + language_level='3str' +) +``` To tell `pyximport` where to find sources and libraries for the main module, there is a special file `oscar.pyxbld`. It is important to keep it consistent diff --git a/docs/favicon.ico b/docs/favicon.ico new file mode 100644 index 0000000..b7c4c45 Binary files /dev/null and b/docs/favicon.ico differ diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index 0a6aead..0000000 --- a/docs/index.rst +++ /dev/null @@ -1,95 +0,0 @@ -.. oscar documentation master file, created by - sphinx-quickstart on Mon May 28 14:28:44 2018. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Reference -=========== - -.. toctree:: - :maxdepth: 2 - - contribute - - -This module provides interface to the objects: - -- :py:class:`.Project` - represents a repository -- :py:class:`.Commit` - represents a commit in a repository -- :py:class:`.Tree` - represents a directory and its content. Each `Commit` has a root tree. -- :py:class:`.File` - represents a file path, including all parent directories/trees -- :py:class:`.Blob` - Binary Large OBject, represents a file content. -- :py:class:`.Author` - represents a combination of author name and email. - -`Commit`, `Tree` and `Blob` are a straightforward representation of -objects used by Git internally. -It will be helpful to read `Chapter 2 `_ -of `Pro Git book `_ (free and Open Source) -for better understanding of these objects. - -Common methods --------------- -.. py:module:: oscar - -All objects have a unique key. -For git objects (`Commit`, `Tree`, `Blob`) -it is the object SHA hash; -for `Project` it is the project URI; -for `File` it is the filename; -for `Author` it is the author name and email. -Objects of the same type and having the same key will be considered equivalent: - - >>> sha = 'f2a7fcdc51450ab03cb364415f14e634fa69b62c' - >>> Commit(sha) == Commit(sha) - True - -It is possible to iterate all objects of a given type using `.all()` - -.. automethod:: _Base.all - - E.g. to iterate all repositories of user2589 on github: - - >>> for project in Project.all(): - ... print project.uri - -GitObject methods ------------------ - -These methods are shared by `Commit`, `Tree`, `Blob`. - -All git objects are instantiated by a 40-byte hex string SHA or a 20-byte binary SHA. -In most cases you will use hex form, the latter way is needed only fore relatively -rare cases you need to interface with binary data. 
- - >>> Commit('f2a7fcdc51450ab03cb364415f14e634fa69b62c') - >>> Commit(b'\xf2\xa7\xfc\xdcQE\n\xb0<\xb3dA_\x14\xe64\xfai\xb6,') - -Whatever form of SHA was used to instantiate the object, it will have properties: - -- `sha` - 40-byte hex string -- `bin_sha` - 20 bytes binary string - -All git objects, when coerced to `str`, will return their internal representation. -It is mostly important for `Blob` to access the file content. - - -Class reference ---------------- - -.. autoclass:: Project - :members: commit_shas, commits, head, tail, commits_fp - -.. autoclass:: Commit - :members: parents, project_names, projects, child_shas, children, blob_shas, blobs - -.. autoclass:: Tree - :members: traverse, files, blob_shas, blobs - -.. autoclass:: File - :members: commit_shas, commits - -.. autoclass:: Blob - :members: data, commit_shas, commits - -.. autoclass:: Author - :members: commit_shas, commits diff --git a/docs/tutorial.md b/docs/tutorial.md index e67a901..8c95e40 100644 --- a/docs/tutorial.md +++ b/docs/tutorial.md @@ -1,8 +1,6 @@ -# Tutorial basics for Hackathon -------- ## List of relevant directories ### da0 Server -#### .{0-31}.tch files in `/data/basemaps/`: +#### `.{0-31}.tch` files in `/data/basemaps/`: (.s) signifies that there are either .s or .gz versions of these files in gz/ subfolder, which can be opened with Python gzip module or Unix zcat. da0 is the only server with these .s/.gz files Keys for identifying letters: diff --git a/filescan.py b/filescan.py deleted file mode 100644 index 36af8ee..0000000 --- a/filescan.py +++ /dev/null @@ -1,180 +0,0 @@ -import os -import re -import logging -from typing import Optional, Set, Union, Dict, Tuple - -_basemap_pat = re.compile(r'^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch$') -def parse_basemap_path(fname: str): - """ - Parse basemap filename into (src, dst, ver, idx) - >>> parse_basemap_path('c2fFullR.3.tch') - ('c', 'f', 'R', '3') - >>> parse_basemap_path('c2fFullR.tch') - ('c', 'f', 'R', None) - """ - m = _basemap_pat.match(fname) - if not m or len(m.groups()) != 4: - raise ValueError(f'Invalid path: {fname}') - return m.groups() - -_sha1map_pat = re.compile(r'^([\w\.]+)_(\d+).(\w+)$') -def parse_sha1map_path(fname: str): - """ - Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext) - >>> parse_sha1map_path('commit_0.tch') - ('commit', '0', 'tch') - >>> parse_sha1map_path('blob_0.idx') - ('blob', '0', 'idx') - >>> parse_sha1map_path('sha1.blob_0.bin') - ('sha1.blob', '0', 'bin') - """ - m = _sha1map_pat.match(fname) - if not m or len(m.groups()) != 3: - raise ValueError(f'Invalid path: {fname}') - return m.groups() - -_short_name_to_full = { - 'a': 'author', - 'A': 'author_dealised', - 'b': 'blob', - 'c': 'commit', - 'cc': 'child_commit', - 'f': 'file', - 'fa': 'first_author', - 't': 'tree', - 'h': 'head', - 'p': 'project', - 'P': 'project_deforked', - 'pc': 'parent_commit', - 'r': 'root_commit', - 'ta': 'time_author', - 'tac': 'time_author_commit', - 'trp': 'torvalds_path', - 'dat': 'colon_seperated_data', - 'tch': 'compressed_data', - 'bin': 'binary_data', - 'idx': 'binary_index' -} - -# match (name)Full(ver).(idx).tch - -_full_name_to_short = {v: k for k, v in _short_name_to_full.items()} - -##### module configuration variables ##### - -# default config values -DEFAULT_BASE_PATH = '/woc' -DEFAULT_STORES = { - 'OSCAR_ALL_BLOBS': 'All.blobs', - 'OSCAR_ALL_SHA1C': 'All.sha1c', - 'OSCAR_ALL_SHA1O': 'All.sha1o', - 'OSCAR_BASEMAPS': 'basemaps', -} - -# tokyo cabinet store paths -PATHS: Dict[Tuple[str, str], Tuple[str, int, 
Optional[str]]] = {} - -# prefixes used by World of Code to identify source project platforms -# See Project.to_url() for more details -# Prefixes have been deprecated by replacing them with the string resembling -# actual URL -URL_PREFIXES = { - b'bitbucket.org': b'bitbucket.org', - b'gitlab.com': b'gitlab.com', - b'android.googlesource.com': b'android.googlesource.com', - b'bioconductor.org': b'bioconductor.org', - b'drupal.com': b'git.drupal.org', - b'git.eclipse.org': b'git.eclipse.org', - b'git.kernel.org': b'git.kernel.org', - b'git.postgresql.org': b'git.postgresql.org', - b'git.savannah.gnu.org': b'git.savannah.gnu.org', - b'git.zx2c4.com': b'git.zx2c4.com', - b'gitlab.gnome.org': b'gitlab.gnome.org', - b'kde.org': b'anongit.kde.org', - b'repo.or.cz': b'repo.or.cz', - b'salsa.debian.org': b'salsa.debian.org', - b'sourceforge.net': b'git.code.sf.net/p' -} -IGNORED_AUTHORS = ( - b'GitHub Merge Button ' -) - -def set_config( - base_path: str = DEFAULT_BASE_PATH, - stores: Optional[Dict[str, str]] = None, - url_prefixes: Optional[Dict[bytes, bytes]] = None, - ignored_authors: Optional[Tuple[bytes]] = None - ): - """Set the configuration for the Oscar module. - :param base_path: path to the woc directory - :param stores: a dictionary of store names (OSCAR_ALL_BLOBS, OSCAR_ALL_SHA1C, OSCAR_ALL_SHA1O, OSCAR_BASEMAPS) - to their relative paths in the woc directory - :param url_prefixes: a BYTES dictionary of url prefixes to their full urls (e.g. b'bitbucket.org' -> b'bitbucket.org') - :param ignored_authors: a BYTES tuple of authors to ignore (e.g. b'GitHub Merge Button ' - """ - - global PATHS, IGNORED_AUTHORS, URL_PREFIXES - - if not os.path.exists(base_path): - raise ValueError(f'Oscar failed to locate {base_path},' - 'please call set_config("/path/to/woc")') - - if stores is None: - stores = {k: os.path.join(base_path, v) for k, v in DEFAULT_STORES.items()} - - # Scan the woc data directory - for store_name, store_path in stores.items(): - for f in os.listdir(store_path): - try: - if store_name == 'OSCAR_BASEMAPS': - src, dst, ver, idx = parse_basemap_path(f) - k = (src, dst) - prefix_len = int(idx).bit_length() if idx else 0 - if k in PATHS: - _, _predix_len, _ver = PATHS[k][0], PATHS[k][1], PATHS[k][2] - if _ver > ver or (_ver == ver and _predix_len >= prefix_len): - continue - PATHS[k] = ( - os.path.join(store_path, - f.replace(idx, '{key}') if idx else f - ), prefix_len, ver) - pass - elif store_name in ('OSCAR_ALL_BLOBS', 'OSCAR_ALL_SHA1C', 'OSCAR_ALL_SHA1O'): - name, idx, ext = parse_sha1map_path(f) - try: - src = _full_name_to_short[name.replace('sha1.','')] - except KeyError: - raise ValueError(f'Invalid file type: {name}') - k = (src, ext) - prefix_len = int(idx).bit_length() if idx else 0 - if k in PATHS: - _, _predix_len = PATHS[k][0], PATHS[k][1] - if _predix_len >= prefix_len: - continue - PATHS[k] = ( - os.path.join(store_path, - f.replace(idx, '{key}') if idx else f - ), prefix_len, None) - else: - raise ValueError(f'Invalid store name: {store_name}, expected one of {DEFAULT_STORES.keys()}') - - except ValueError as e: - logging.warning(f'Cannot parse {f}: {repr(e)} ') - - print(f'Loaded {len(PATHS.keys())} maps: ' - f'{[_short_name_to_full[x[0]] + "->" + _short_name_to_full[x[1]] + ":" + str(PATHS[x]) for x in PATHS.keys()]}') - - if url_prefixes is not None: - URL_PREFIXES = url_prefixes - - if ignored_authors is not None: - IGNORED_AUTHORS = ignored_authors - -# run set_config on import -set_config() - -if __name__ == '__main__': - import doctest - doctest.testmod() - 
- print(PATHS) \ No newline at end of file diff --git a/oscar.pxd b/oscar.pxd deleted file mode 100644 index 5d3449d..0000000 --- a/oscar.pxd +++ /dev/null @@ -1,8 +0,0 @@ - -# cython: language_level=3str -from libc.stdint cimport uint32_t - -# we need this header file for unit tests. -cdef unber(bytes buf) -cdef (int, int) lzf_length(bytes raw_data) -cdef uint32_t fnvhash(bytes data) diff --git a/oscar.pyx b/oscar.pyx deleted file mode 100644 index 30bbf2e..0000000 --- a/oscar.pyx +++ /dev/null @@ -1,1888 +0,0 @@ - -# cython: language_level=3str, wraparound=False, boundscheck=False, nonecheck=False - -import binascii -from datetime import datetime, timedelta, tzinfo, timezone -import difflib -from functools import wraps -import hashlib -import os -import re -from threading import Lock -import time -from typing import Dict, Tuple, Optional, Union -import warnings -import mmap - -from libc.stdint cimport uint8_t, uint32_t, uint64_t -from libc.stdlib cimport free - -# if throws "module 'lzf' has no attribute 'decompress'", -# `pip uninstall lzf && pip install python-lzf` -import lzf -if not hasattr(lzf, 'decompress'): - raise ImportError('python-lzf is required to run Oscar; ' - 'please install it with `pip uninstall lzf && pip install python-lzf`') - -__version__ = '2.2.1' -__author__ = 'marat@cmu.edu' -__license__ = 'GPL v3' - -_basemap_pat = re.compile(r'^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch$') -def parse_basemap_path(fname: str): - """ - Parse basemap filename into (src, dst, ver, idx) - >>> parse_basemap_path('c2fFullR.3.tch') - ('c', 'f', 'R', '3') - >>> parse_basemap_path('c2fFullR.tch') - ('c', 'f', 'R', None) - """ - m = _basemap_pat.match(fname) - if not m or len(m.groups()) != 4: - raise ValueError(f'Invalid path: {fname}') - return m.groups() - -_sha1map_pat = re.compile(r'^([\w\.]+)_(\d+).(\w+)$') -def parse_sha1map_path(fname: str): - """ - Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext) - >>> parse_sha1map_path('commit_0.tch') - ('commit', '0', 'tch') - >>> parse_sha1map_path('blob_0.idx') - ('blob', '0', 'idx') - >>> parse_sha1map_path('sha1.blob_0.bin') - ('sha1.blob', '0', 'bin') - """ - m = _sha1map_pat.match(fname) - if not m or len(m.groups()) != 3: - raise ValueError(f'Invalid path: {fname}') - return m.groups() - -# identities and their shortcut -_short_name_to_full = { - 'a': 'author', - 'A': 'author_dealised', - 'b': 'blob', - 'c': 'commit', - 'cc': 'child_commit', - 'f': 'file', - 'fa': 'first_author', - 't': 'tree', - 'h': 'head', - 'p': 'project', - 'P': 'project_deforked', - 'pc': 'parent_commit', - 'r': 'root_commit', - 'ta': 'time_author', - 'tac': 'time_author_commit', - 'trp': 'torvalds_path', - 'dat': 'colon_seperated_data', - 'tch': 'compressed_data', - 'bin': 'binary_data', - 'idx': 'binary_index' -} -_full_name_to_short = {v: k for k, v in _short_name_to_full.items()} - -##### module configuration variables ##### - -# default config values -DEFAULT_BASE_PATH = '/woc' -DEFAULT_STORES = { - 'OSCAR_ALL_BLOBS': 'All.blobs', - 'OSCAR_ALL_SHA1C': 'All.sha1c', - 'OSCAR_ALL_SHA1O': 'All.sha1o', - 'OSCAR_BASEMAPS': 'basemaps', -} - -# tokyo cabinet store paths -PATHS: Dict[Tuple[str, str], Tuple[str, int, Optional[str]]] = {} - -# prefixes used by World of Code to identify source project platforms -# See Project.to_url() for more details -# Prefixes have been deprecated by replacing them with the string resembling -# actual URL -URL_PREFIXES = { - b'bitbucket.org': b'bitbucket.org', - b'gitlab.com': b'gitlab.com', - b'android.googlesource.com': 
b'android.googlesource.com', - b'bioconductor.org': b'bioconductor.org', - b'drupal.com': b'git.drupal.org', - b'git.eclipse.org': b'git.eclipse.org', - b'git.kernel.org': b'git.kernel.org', - b'git.postgresql.org': b'git.postgresql.org', - b'git.savannah.gnu.org': b'git.savannah.gnu.org', - b'git.zx2c4.com': b'git.zx2c4.com', - b'gitlab.gnome.org': b'gitlab.gnome.org', - b'kde.org': b'anongit.kde.org', - b'repo.or.cz': b'repo.or.cz', - b'salsa.debian.org': b'salsa.debian.org', - b'sourceforge.net': b'git.code.sf.net/p' -} -IGNORED_AUTHORS = ( - b'GitHub Merge Button ' -) - -def set_config( - base_path: str = DEFAULT_BASE_PATH, - stores: Optional[Dict[str, str]] = None, - url_prefixes: Optional[Dict[bytes, bytes]] = None, - ignored_authors: Optional[Tuple[bytes]] = None - ): - """Set the configuration for the Oscar module. - :param base_path: path to the woc directory - :param stores: a dictionary of store names (OSCAR_ALL_BLOBS, OSCAR_ALL_SHA1C, OSCAR_ALL_SHA1O, OSCAR_BASEMAPS) - to their relative paths in the woc directory - :param url_prefixes: a BYTES dictionary of url prefixes to their full urls (e.g. b'bitbucket.org' -> b'bitbucket.org') - :param ignored_authors: a BYTES tuple of authors to ignore (e.g. b'GitHub Merge Button ' - """ - - global PATHS, IGNORED_AUTHORS, URL_PREFIXES - - # add support for environment variables - if os.environ.get('OSCAR_BASE_PATH'): - base_path = os.environ['OSCAR_BASE_PATH'] - - for store_name in DEFAULT_STORES.keys(): - if os.environ.get(store_name): - if stores is None: - stores = {} - stores[store_name] = os.environ[store_name] - - if stores is None: - if not os.path.exists(base_path): - raise FileNotFoundError(base_path) - stores = {k: os.path.join(base_path, v) for k, v in DEFAULT_STORES.items()} - - # Scan the woc data directory - for store_name, store_path in stores.items(): - if not os.path.exists(store_path): - raise FileNotFoundError(f'{store_name}: {store_path}') - - for f in os.listdir(store_path): - try: - if store_name == 'OSCAR_BASEMAPS': - src, dst, ver, idx = parse_basemap_path(f) - k = (src, dst) - prefix_len = int(idx).bit_length() if idx else 0 - if k in PATHS: - _, _predix_len, _ver = PATHS[k][0], PATHS[k][1], PATHS[k][2] - assert _ver is not None, "Should not be here; check store type" - if _ver > ver or (_ver == ver and _predix_len >= prefix_len): - continue - PATHS[k] = ( - os.path.join(store_path, - f.replace(idx, '{key}') if idx else f - ), prefix_len, ver) - pass - elif store_name in ('OSCAR_ALL_BLOBS', 'OSCAR_ALL_SHA1C', 'OSCAR_ALL_SHA1O'): - name, idx, ext = parse_sha1map_path(f) - try: - src = _full_name_to_short[name.replace('sha1.','')] - except KeyError: - raise ValueError(f'Invalid file type: {name}') - k = (src, ext) - prefix_len = int(idx).bit_length() if idx else 0 - if k in PATHS: - _, _predix_len = PATHS[k][0], PATHS[k][1] - if _predix_len >= prefix_len: - continue - PATHS[k] = ( - os.path.join(store_path, - f.replace(idx, '{key}') if idx else f - ), prefix_len, None) - else: - raise ValueError(f'Invalid store name: {store_name}, expected one of {DEFAULT_STORES.keys()}') - - except ValueError as e: - warnings.warn(f'Cannot parse {f}: {repr(e)} ') - - print(f'Loaded {len(PATHS.keys())} maps: ' + \ - str([_short_name_to_full[x[0]] + "->" + _short_name_to_full[x[1]] + \ - ":" + PATHS[x][0].format(key=2**PATHS[x][1]-1) for x in PATHS.keys()]) - ) - - if url_prefixes is not None: - URL_PREFIXES = url_prefixes - - if ignored_authors is not None: - IGNORED_AUTHORS = ignored_authors - -# run set_config on import -try: - 
set_config() -except FileNotFoundError as e: - warnings.warn("Oscar failed to locate /woc. Call oscar.set_config('/path/to/woc') first.") - -### polyfill for @cached_property ### - -def cached_property(func): - """ Classic memoize with @property on top""" - @wraps(func) - def wrapper(self): - key = '_' + func.__name__ - if not hasattr(self, key): - setattr(self, key, func(self)) - return getattr(self, key) - return property(wrapper) - -class ObjectNotFound(KeyError): - pass - - -cdef unber(bytes buf): - r""" Perl BER unpacking. - BER is a way to pack several variable-length ints into one - binary string. Here we do the reverse. - Format definition: from http://perldoc.perl.org/functions/pack.html - (see "w" template description) - - Args: - buf (bytes): a binary string with packed values - - Returns: - str: a list of unpacked values - - >>> unber(b'\x00\x83M') - [0, 461] - >>> unber(b'\x83M\x96\x14') - [461, 2836] - >>> unber(b'\x99a\x89\x12') - [3297, 1170] - """ - # PY: 262ns, Cy: 78ns - cdef: - list res = [] - # blob_offset sizes are getting close to 32-bit integer max - uint64_t acc = 0 - uint8_t b - - for b in buf: - acc = (acc << 7) + (b & 0x7f) - if not b & 0x80: - res.append(acc) - acc = 0 - return res - - -cdef (int, int) lzf_length(bytes raw_data): - r""" Get length of uncompressed data from a header of Compress::LZF - output. Check Compress::LZF sources for the definition of this bit magic - (namely, LZF.xs, decompress_sv) - https://metacpan.org/source/MLEHMANN/Compress-LZF-3.8/LZF.xs - - Args: - raw_data (bytes): data compressed with Perl Compress::LZF - - Returns: - Tuple[int, int]: (header_size, uncompressed_content_length) in bytes - - >>> lzf_length(b'\xc4\x9b') - (2, 283) - >>> lzf_length(b'\xc3\xa4') - (2, 228) - >>> lzf_length(b'\xc3\x8a') - (2, 202) - >>> lzf_length(b'\xca\x87') - (2, 647) - >>> lzf_length(b'\xe1\xaf\xa9') - (3, 7145) - >>> lzf_length(b'\xe0\xa7\x9c') - (3, 2524) - """ - # PY:725us, Cy:194usec - cdef: - # compressed size, header length, uncompressed size - uint32_t csize=len(raw_data), start=1, usize - # first byte, mask, buffer iterator placeholder - uint8_t lower=raw_data[0], mask=0x80, b - - while mask and csize > start and (lower & mask): - mask >>= 1 + (mask == 0x80) - start += 1 - if not mask or csize < start: - raise ValueError('LZF compressed data header is corrupted') - usize = lower & (mask - 1) - for b in raw_data[1:start]: - usize = (usize << 6) + (b & 0x3f) - if not usize: - raise ValueError('LZF compressed data header is corrupted') - return start, usize - - -def decomp(bytes raw_data): - # type: (bytes) -> bytes - """ lzf wrapper to handle perl tweaks in Compress::LZF - This function extracts uncompressed size header - and then does usual lzf decompression. - - Args: - raw_data (bytes): data compressed with Perl Compress::LZF - - Returns: - str: unpacked data - """ - if not raw_data: - return b'' - if raw_data[0] == 0: - return raw_data[1:] - start, usize = lzf_length(raw_data) - # while it is tempting to include liblzf and link statically, there is - # zero advantage comparing to just using python-lzf - return lzf.decompress(raw_data[start:], usize) - - -cdef uint32_t fnvhash(bytes data): - """ - Returns the 32 bit FNV-1a hash value for the given data. 
- >>> hex(fnvhash('foo')) - '0xa9f37ed7' - """ - # PY: 5.8usec Cy: 66.8ns - cdef: - uint32_t hval = 0x811c9dc5 - uint8_t b - for b in data: - hval ^= b - hval *= 0x01000193 - return hval - - -def slice20(bytes raw_data): - """ Slice raw_data into 20-byte chunks and hex encode each of them - It returns tuple in order to be cacheable - """ - if raw_data is None: - return () - return tuple(raw_data[i:i + 20] for i in range(0, len(raw_data), 20)) - - -class CommitTimezone(tzinfo): - # TODO: replace with datetime.timezone once Py2 support is ended - # a lightweight version of pytz._FixedOffset - def __init__(self, hours, minutes): - self.offset = timedelta(hours=hours, minutes=minutes) - - def utcoffset(self, dt): - return self.offset - - def tzname(self, dt): - return 'fixed' - - def dst(self, dt): - # daylight saving time - no info - return timedelta(0) - - def __repr__(self): - h, m = divmod(self.offset.seconds // 60, 60) - return "" % (h, m) - - -DAY_Z = datetime.fromtimestamp(0, CommitTimezone(0, 0)) - - -def parse_commit_date(bytes timestamp, bytes tz): - """ Parse date string of authored_at/commited_at - - git log time is in the original timezone - gitpython - same as git log (also, it has the correct timezone) - unix timestamps (used internally by commit objects) are in UTC - datetime.fromtimestamp without a timezone will convert it to host tz - github api is in UTC (this is what trailing 'Z' means) - - Args: - timestamp (str): Commit.authored_at or Commit.commited_at, - e.g. '1337145807 +1100' - tz (str): timezone - Returns: - Optional[datetime.datetime]: UTC datetime - - >>> parse_commit_date(b'1337145807', b'+1130') - datetime.datetime(2012, 5, 16, 16, 23, 27, tzinfo=) - >>> parse_commit_date(b'3337145807', b'+1100') is None - True - """ - cdef: - int sign = -1 if tz.startswith(b'-') else 1 - uint32_t ts - int hours, minutes - uint8_t tz_len = len(tz) - try: - ts = int(timestamp) - hours = sign * int(tz[tz_len-4:tz_len-2]) - minutes = sign * int(tz[tz_len-2:]) - dt = datetime.fromtimestamp(ts, CommitTimezone(hours, minutes)) - except (ValueError, OverflowError): - # i.e. 
if timestamp or timezone is invalid - return None - - # timestamp is in the future - if ts > time.time(): - return None - - return dt - -cdef extern from 'Python.h': - object PyBytes_FromStringAndSize(char *s, Py_ssize_t len) - - -cdef extern from 'tchdb.h': - ctypedef struct TCHDB: # type of structure for a hash database - pass - - cdef enum: # enumeration for open modes - HDBOREADER = 1 << 0, # open as a reader - HDBONOLCK = 1 << 4, # open without locking - - const char *tchdberrmsg(int ecode) - TCHDB *tchdbnew() - int tchdbecode(TCHDB *hdb) - bint tchdbopen(TCHDB *hdb, const char *path, int omode) - bint tchdbclose(TCHDB *hdb) - void *tchdbget(TCHDB *hdb, const void *kbuf, int ksiz, int *sp) - bint tchdbiterinit(TCHDB *hdb) - void *tchdbiternext(TCHDB *hdb, int *sp) - - -cdef class Hash: - """Object representing a Tokyocabinet Hash table""" - cdef TCHDB* _db - cdef bytes filename - - def __cinit__(self, char *path, nolock=True): - cdef int mode = HDBOREADER - if nolock: - mode |= HDBONOLCK - self._db = tchdbnew() - self.filename = path - if self._db is NULL: - raise MemoryError() - cdef bint result = tchdbopen(self._db, path, mode) - if not result: - raise IOError('Failed to open .tch file "%s": ' % self.filename - + self._error()) - - def _error(self): - cdef int code = tchdbecode(self._db) - cdef bytes msg = tchdberrmsg(code) - return msg.decode('ascii') - - def __iter__(self): - cdef: - bint result = tchdbiterinit(self._db) - char *buf - int sp - bytes key - if not result: - raise IOError('Failed to iterate .tch file "%s": ' % self.filename - + self._error()) - while True: - buf = tchdbiternext(self._db, &sp) - if buf is NULL: - break - key = PyBytes_FromStringAndSize(buf, sp) - free(buf) - yield key - - cdef bytes read(self, bytes key): - cdef: - char *k = key - char *buf - int sp - int ksize=len(key) - buf = tchdbget(self._db, k, ksize, &sp) - if buf is NULL: - raise ObjectNotFound(key) - cdef bytes value = PyBytes_FromStringAndSize(buf, sp) - free(buf) - return value - - def __getitem__(self, bytes key): - return self.read(key) - - def __del__(self): - cdef bint result = tchdbclose(self._db) - if not result: - raise IOError('Failed to close .tch "%s": ' % self.filename - + self._error()) - - def __dealloc__(self): - free(self._db) - - -# Pool of open TokyoCabinet databases to save few milliseconds on opening -cdef dict _TCH_POOL = {} # type: Dict[str, Hash] -TCH_LOCK = Lock() - -def _get_tch(char *path): - """ Cache Hash() objects """ - if path in _TCH_POOL: - return _TCH_POOL[path] - try: - TCH_LOCK.acquire() - # in multithreading environment this can cause race condition, - # so we need a lock - if path not in _TCH_POOL: - _TCH_POOL[path] = Hash(path) - finally: - TCH_LOCK.release() - return _TCH_POOL[path] - -def _get_value( - key: bytes, - dtype: Tuple[str, str], - use_fnv_keys = False -) -> bytes: - """Read value (in BYTES) from tch""" - try: - path_tmpl, prefix_length = PATHS[dtype][0], PATHS[dtype][1] - except KeyError as e: - raise KeyError(f'Invalid dtype: {dtype}, expected one of {PATHS.keys()}') from e - - cdef uint8_t p - if use_fnv_keys: - p = fnvhash(key) - else: - p = key[0] - cdef uint8_t prefix = p & (2**prefix_length - 1) - - path_b = path_tmpl.format(key=prefix).encode('ascii') - - return _get_tch(path_b)[key] - -GIT_DTYPES = ('c', 'cc', 'pc', 'r', 'b', 'h', 't') - -def _decode_value( - value: bytes, - out_dtype: str -): - if out_dtype in GIT_DTYPES: - return tuple( - value[i:i + 20].hex() for i in range(0, len(value), 20)) # type: Tuple[str, ...] 
- elif out_dtype == 'fa': - buf0 = value[0:len(value)-21] - cmt_sha = value[(len(value)-20):len(value)] - (Time, Author) = buf0.decode('utf-8').split(";") - return (Time, Author, cmt_sha.hex()) # type: Tuple[str, str, str] - elif out_dtype == 'tac': - data = decomp(value) - _splited = data.decode('utf-8').split(";") - return tuple( - (_splited[i],_splited[i+1],_splited[i+2]) - for i in range(0, len(_splited), 3) - ) # type: Tuple[Tuple[str, str, str], ...] - elif out_dtype == 'ta': - (Time, Author) = value.decode('utf-8').split(";") - return (Time, Author) # type: Tuple[str, str] - elif out_dtype in ('p', 'P'): - data = decomp(value) - return tuple(project_name.decode('utf-8') - for project_name in data.split(b';') - if project_name and project_name != b'EMPTY') # type: Tuple[str, ...] - elif out_dtype == 'f': - data = decomp(value) - return tuple(file_name.decode('utf-8') - for file_name in (data.split(b";") if data else []) - if file_name and file_name != b'EMPTY') # type: Tuple[str, ...] - elif out_dtype == 'dat': - return tuple(value.decode('utf-8').split(';')) # type: Tuple[str, ...] - elif out_dtype in ('a', 'A'): - data = decomp(value) - return tuple(author.decode('utf-8') for author in (data.split(b';') if data else []) - if author not in IGNORED_AUTHORS) # type: Tuple[str, ...] - raise ValueError(f'Unsupported dtype: {out_dtype}') - -def get_values( - dtype: Union[Tuple[str, str], str], - key: Union[bytes, str], -): - """Eqivalent to getValues in WoC Perl API - >>> get_values('P2c', 'user2589_minicms') # doctest: +SKIP - ... - >>> get_values(('P','c'),'user2589_minicms'): # doctest: +SKIP - ... - """ - # transform dtype -> (src, dst) - if isinstance(dtype, str): - try: - dtype = tuple(dtype.split('2')) - except ValueError as e: - raise ValueError(f'Invalid dtype: {dtype}, expected one of' - f'{list(lambda x: x[0] + "2" + x[1] for x in PATHS.keys())}') from e - - # use fnv hash as shading idx if key is not a git object - _use_fnv_keys = False if dtype[0] in GIT_DTYPES else True - if isinstance(key, str): - # unhexlify if key is a hex string - key = bytes.fromhex(key) \ - if len(key) == 40 and dtype[0] in GIT_DTYPES \ - else key.encode('utf-8') - - _out_raw = _get_value(key, dtype, use_fnv_keys=_use_fnv_keys) - return _decode_value(_out_raw, dtype[1]) - -def _decode_tree( - value: bytes -) -> Tuple[Tuple[str, str, str], ...]: - # Format description: https://stackoverflow.com/questions/14790681/ - # mode (ASCII encoded decimal) - # SPACE (\0x20) - # filename - # NULL (\x00) - # 20-byte binary hash - _out_buf = [] - _file_buf = [] - _curr_buf = bytes() - - # inefficient, but works - i = 0 - while i < len(value): - if value[i] == 0x20: - _file_buf.append(_curr_buf.decode('utf-8')) - _curr_buf = bytes() - elif value[i] == 0x00: - _file_buf.append(_curr_buf.decode('utf-8')) - # take next 20 bytes as a hash - _curr_buf = value[i+1:i+21] - _file_buf.append(_curr_buf.hex()) - _out_buf.append(tuple(_file_buf)) - # clear buffers - _file_buf = [] - _curr_buf = bytes() - i += 20 - else: - _curr_buf += bytes([value[i]]) - i += 1 - - return tuple(_out_buf) - -def _decode_content( - value: bytes, - dtype: Tuple[str, str] -): - if dtype == ('c', 'tch'): - return decomp(value).decode('utf-8') - elif dtype == ('t', 'tch'): - return _decode_tree(decomp(value)) - raise ValueError(f'Unsupported dtype: {dtype}') - -_file_obj_pool = {} -def read_file_with_offset(file_path, offset, length): - _f = _file_obj_pool.setdefault(file_path, open(file_path, "rb")) - with mmap.mmap(_f.fileno(), length=0, 
access=mmap.ACCESS_READ) as _m: - _m.seek(offset) - return _m.read(length) - -def _get_predix( - key: bytes, - dtype: Tuple[str, str], - use_fnv_keys = False -): - """Calculate prefix (in BYTES) from tch""" - try: - path_tmpl, prefix_length = PATHS[dtype][0], PATHS[dtype][1] - except KeyError as e: - raise KeyError(f'Invalid dtype: {dtype}, expected one of {PATHS.keys()}') from e - - cdef uint8_t p - if use_fnv_keys: - p = fnvhash(key) - else: - p = key[0] - prefix = p & (2**prefix_length - 1) - return prefix - -def show_content( - in_dtype: str, - key: Union[bytes, str], -): - """Eqivalent to showCnt in WoC perl API - >>> show_content('tree', '7a374e58c5b9dec5f7508391246c48b73c40d200') # doctest: +SKIP - ... - """ - dtype = (_full_name_to_short[in_dtype], 'tch') - if isinstance(key, str): - key = bytes.fromhex(key) - _out_raw = _get_value(key, dtype, use_fnv_keys=False) - if in_dtype in ('commit', 'tree'): - return _decode_content(_out_raw, dtype) - elif in_dtype == 'blob': - offset, length = unber(_out_raw) - _prefix = _get_predix(key, ('b', 'bin'), use_fnv_keys=False) - _path = PATHS[('b', 'bin')][0].format(key=_prefix) - _out_bin = read_file_with_offset(_path, offset, length) - return decomp(_out_bin).decode('utf-8') - else: - raise ValueError(f'Unsupported dtype: {in_dtype}, expected one of ("commit", "blob", "tree")') - -class _Base(object): - type = 'oscar_base' # type: str - key = b'!Err!' # type: bytes - # fnv keys are used for non-git objects, such as files, projects and authors - use_fnv_keys = True # type: bool - _keys_registry_dtype = ('Err', 'Err') # type: Tuple[str, str] - - def __init__(self, key): - self.key = key - - def __repr__(self): - return '<%s: %s>' % (self.type.capitalize(), self) - - def __hash__(self): - return hash(self.key) - - def __eq__(self, other): - return isinstance(other, type(self)) \ - and self.type == other.type \ - and self.key == other.key - - def __ne__(self, other): - return not self == other - - def __str__(self): - return (binascii.hexlify(self.key).decode('ascii') - if isinstance(self.key, bytes) else self.key) - - def resolve_path(self, dtype: Tuple[str, str]): - """ Get path to a file using data type and object key (for sharding) - """ - path, prefix_length = PATHS[dtype][0], PATHS[dtype][1] - - cdef uint8_t p - if self.use_fnv_keys: - p = fnvhash(self.key) - else: - p = self.key[0] - cdef uint8_t prefix = p & (2**prefix_length - 1) - return path.format(key=prefix) - - def read_tch(self, dtype: Tuple[str, str]): - return _get_value( - self.key, - dtype, - use_fnv_keys=self.use_fnv_keys - ) - - @classmethod - def all_keys(cls): - """ Iterate keys of all objects of the given type - This might be useful to get a list of all projects, or a list of - all file names. 
- - Yields: - bytes: objects key - """ - if not cls._keys_registry_dtype: - raise NotImplemented - - base_path, prefix_length = PATHS[cls._keys_registry_dtype][:2] - for file_prefix in range(2 ** prefix_length): - for key in _get_tch( - base_path.format(key=file_prefix).encode('ascii')): - yield key - - @classmethod - def all(cls): - for key in cls.all_keys(): - yield cls(key) - - -class GitObject(_Base): - use_fnv_keys = False - - @classmethod - def all(cls): - """ Iterate ALL objects of this type (all projects, all times) """ - base_idx_path, prefix_length = PATHS[cls.type,'idx'][:2] - base_bin_path, prefix_length = PATHS[cls.type,'bin'][:2] - for key in range(2**prefix_length): - idx_path = base_idx_path.format(key=key) - bin_path = base_bin_path.format(key=key) - datafile = open(bin_path, "rb") - for line in open(idx_path): - chunks = line.strip().split(";") - offset, comp_length, sha = chunks[1:4] - if len(chunks) > 4: # cls.type == "blob": - # usually, it's true for blobs; - # however, some blobs follow common pattern - sha = chunks[4] - - obj = cls(sha) - # obj.data = decomp(datafile.read(int(comp_length))) - - yield obj - datafile.close() - - def __init__(self, sha): - if isinstance(sha, str) and len(sha) == 40: - self.sha = sha - self.bin_sha = binascii.unhexlify(sha) - elif isinstance(sha, bytes) and len(sha) == 20: - self.bin_sha = sha - self.sha = binascii.hexlify(sha).decode('ascii') - else: - raise ValueError('Invalid SHA1 hash: %s' % sha) - super(GitObject, self).__init__(self.bin_sha) - - @cached_property - def data(self): - # type: () -> bytes - if self.type not in ('commit', 'tree'): - raise NotImplementedError - # default implementation will only work for commits and trees - return decomp(self.read_tch((_full_name_to_short[self.type], 'tch'))) - - @classmethod - def string_sha(cls, data): - # type: (bytes) -> str - """Manually compute blob sha from its content passed as `data`. - The main use case for this method is to identify source of a file. 
- - Blob SHA is computed from a string: - "blob " - - # https://gist.github.com/masak/2415865 - Commit SHAs are computed in a similar way - "commit " - - note that commit content includes committed/authored date - - Args: - data (bytes): content of the GitObject to get hash for - - Returns: - str: 40-byte hex SHA1 hash - """ - sha1 = hashlib.sha1() - sha1.update(b'%s %d\x00' % (cls.type.encode('ascii'), len(data))) - sha1.update(data) - return sha1.hexdigest() - - @classmethod - def file_sha(cls, path): - buffsize = 1024 ** 2 - size = os.stat(path).st_size - with open(path, 'rb') as fh: - sha1 = hashlib.sha1() - sha1.update(b'%s %d\x00' % (cls.type.encode('ascii'), size)) - while True: - data = fh.read(min(size, buffsize)) - if not data: - return sha1.hexdigest() - sha1.update(data) - - -class Blob(GitObject): - type = 'blob' - - def __len__(self): - _, length = self.position - return length - - @cached_property - def position(self): - # type: () -> (int, int) - """ Get offset and length of the blob data in the storage """ - value = self.read_tch(('b', 'tch')) - if value is None: # empty read -> value not found - raise ObjectNotFound('Blob data not found (bad sha?)') - return unber(value) - - @cached_property - def data(self): - """ Content of the blob """ - offset, length = self.position - # no caching here to stay thread-safe - with open(self.resolve_path(('b','bin')), 'rb') as fh: - fh.seek(offset) - return decomp(fh.read(length)) - - @cached_property - def commit_shas(self): - """ SHAs of Commits in which this blob have been - introduced or modified. - - **NOTE: commits removing this blob are not included** - """ - return slice20(self.read_tch(('b','c'))) - - @property - def commits(self): - """ Commits where this blob has been added or changed - - **NOTE: commits removing this blob are not included** - """ - return (Commit(bin_sha) for bin_sha in self.commit_shas) - - @cached_property - def first_author(self): - """ get time, first author and first commit for the blob - """ - buf = self.read_tch(('b','fa')) - buf0 = buf [0:len(buf)-21] - cmt_sha = buf [(len(buf)-20):len(buf)] - (Time, Author) = buf0 .decode('ascii') .split(";") - return (Time, Author, cmt_sha .hex()) - - - -class Tree(GitObject): - """ A representation of git tree object, basically - a directory. - - Trees are iterable. Each element of the iteration is a 3-tuple: - `(mode, filename, sha)` - - - `mode` is an ASCII decimal **string** similar to file mode - in Unix systems. Subtrees always have mode "40000" - - `filename` is a string filename, not including directories - - `sha` is a 40 bytes hex string representing file content Blob SHA - - .. Note:: iteration is not recursive. 
- For a recursive walk, use Tree.traverse() or Tree.files - - Both files and blobs can be checked for membership, - either by their id (filename or SHA) or a corresponding object: - - >>> tree = Tree("d4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d") - >>> '.gitignore' in tree - True - >>> File('.keep') in tree - False - >>> '83d22195edc1473673f1bf35307aea6edf3c37e3' in tree - True - >>> Blob('83d22195edc1473673f1bf35307aea6edf3c37e3') in tree - True - - `len(tree)` returns the number of files under the tree, including files in - subtrees but not the subtrees themselves: - - >>> len(Tree("d4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d")) - 16 - """ - - type = 'tree' - - def __iter__(self): - """ Unpack binary tree structures, yielding 3-tuples of - (mode (ASCII decimal), filename, sha (40 bytes hex)) - - Format description: https://stackoverflow.com/questions/14790681/ - mode (ASCII encoded decimal) - SPACE (\0x20) - filename - NULL (\x00) - 20-byte binary hash - >>> len(list(Tree("d4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d"))) - 6 - >>> all(len(line) == 3 - ... for line in Tree("954829887af5d9071aa92c427133ca2cdd0813cc")) - True - """ - # unfortunately, Py2 cython doesn't know how to instantiate bytes from - # memoryviews. TODO: reuse libgit2 git_tree__parse_raw - data = self.data - - i = 0 - while i < len(data): - # mode - start = i - while i < len(data) and data[i] != 32: # 32 is space - i += 1 - mode = data[start:i] - i += 1 - # file name - start = i - while i < len(data) and data[i] != 0: - i += 1 - fname = data[start:i] - # sha - start = i + 1 - i += 21 - yield mode, fname, data[start:i] - - def __len__(self): - return len(self.files) - - def __contains__(self, item): - if isinstance(item, File): - return item.key in self.files - elif isinstance(item, Blob): - return item.bin_sha in self.blob_shas - elif isinstance(item, str) and len(item) == 40: - item = binascii.unhexlify(item) - elif not isinstance(item, bytes): - return False - - return item in self.blob_shas or item in self.files - - def traverse(self): - """ Recursively traverse the tree - This will generate 3-tuples of the same format as direct tree - iteration, but will recursively include subtrees content. - - Yields: - Tuple[bytes, bytes, bytes]: (mode, filename, blob/tree sha) - - >>> c = Commit(u'1e971a073f40d74a1e72e07c682e1cba0bae159b') - >>> len(list(c.tree.traverse())) - 8 - >>> c = Commit(u'e38126dbca6572912013621d2aa9e6f7c50f36bc') - >>> len(list(c.tree.traverse())) - 36 - """ - for mode, fname, sha in self: - yield mode, fname, sha - # trees are always 40000: - # https://stackoverflow.com/questions/1071241 - if mode == b'40000': - for mode2, fname2, sha2 in Tree(sha).traverse(): - yield mode2, fname + b'/' + fname2, sha2 - - def __str__(self): - """ - >>> print(Tree('954829887af5d9071aa92c427133ca2cdd0813cc')) - 100644 __init__.py ff1f7925b77129b31938e76b5661f0a2c4500556 - 100644 admin.py d05d461b48a8a5b5a9d1ea62b3815e089f3eb79b - 100644 models.py d1d952ee766d616eae5bfbd040c684007a424364 - 40000 templates 7ff5e4c9bd3ce6ab500b754831d231022b58f689 - 40000 templatetags e5e994b0be2c9ce6af6f753275e7d8c29ccf75ce - 100644 urls.py e9cb0c23a7f6683911305efff91dcabadb938794 - 100644 utils.py 2cfbd298f18a75d1f0f51c2f6a1f2fcdf41a9559 - 100644 views.py 973a78a1fe9e69d4d3b25c92b3889f7e91142439 - """ - return b'\n'.join(b' '.join((mode, fname, binascii.hexlify(sha))) - for mode, fname, sha in self).decode('ascii') - - @cached_property - def files(self): - """ A dict of all files and their content/blob sha under this tree. 
- It includes recursive files (i.e. files in subdirectories). - It does NOT include subdirectories themselves. - """ - return {fname: sha for mode, fname, sha in self if mode != b'40000'} - - @property - def blob_shas(self): - """A tuple of all file content shas, including files in subdirectories - """ - return tuple(self.files.values()) - - @property - def blobs(self): - """ A generator of Blob objects with file content. - It does include files in subdirectories. - - >>> tuple(Tree('d20520ef8c1537a42628b72d481b8174c0a1de84').blobs - ... ) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE - (, ..., - ) - """ - return (Blob(sha) for sha in self.blob_shas) - - -class Commit(GitObject): - """ A git commit object. - - Commits have some special properties. - Most of object properties provided by this project are lazy, i.e. they are - computed when you access them for the first time. - The following `Commit` properties will be instantiated all at once on the - first access to *any* of them. - - - :data:`tree`: root `Tree` of the commit - - :data:`parent_shas`: tuple of parent commit sha hashes - - :data:`message`: str, first line of the commit message - - :data:`full_message`: str, full commit message - - :data:`author`: str, Name - - :data:`authored_at`: str, unix_epoch+timezone - - :data:`committer`: str, Name - - :data:`committed_at`: str, unix_epoch+timezone - """ - type = 'commit' - encoding = 'utf8' - - def __getattr__(self, attr): - """ Mimic special properties: - tree: root Tree of the commit - parent_shas: tuple of parent commit sha hashes - message: str, first line of the commit message - full_message: str, full commit message - author: str, Name - authored_at: timezone-aware datetime or None (if invalid) - committer: str, Name - committed_at: timezone-aware datetime or None (if invalid) - signature: str or None, PGP signature - - - Commit: https://github.com/user2589/minicms/commit/e38126db - >>> c = Commit('e38126dbca6572912013621d2aa9e6f7c50f36bc') - >>> c.author.startswith(b'Marat') - True - >>> c.authored_at - datetime.datetime(2012, 5, 19, 1, 14, 8, tzinfo=) - >>> c.tree.sha - '6845f55f47ddfdbe4628a83fdaba35fa4ae3c894' - >>> len(c.parent_shas) - 1 - >>> c.parent_shas[0] - 'ab124ab4baa42cd9f554b7bb038e19d4e3647957' - >>> c.committed_at - datetime.datetime(2012, 5, 19, 1, 14, 8, tzinfo=) - """ - # using libgit2 commit_parse would be a bit faster, but would require - # to face internal git structures with manual memory management. 
- # The probability of introducing bugs and memory leaks isn't worth it - - attrs = ('tree', 'parent_shas', 'message', 'full_message', 'author', - 'committer', 'authored_at', 'committed_at', 'signature') - if attr not in attrs: - raise AttributeError( - '\'%s\'has no attribute \'%s\'' % (self.__class__.__name__, attr)) - - for a in attrs: - setattr(self, a, None) - self._parse() - return getattr(self, attr) - - # def _parse2(self): - # # TODO: port to Cython - # # Py: 22.6usec, Cy: - # cdef: - # const unsigned char[:] data = self.data - # uint32_t data_len = len(self.data) - # uint32_t start, end, sol, eol = 101 - # list parent_shas = [] - # bytes timestamp, timezone - # # fields come in this exact order: - # # tree, parent, author, committer, [gpgsig], [encoding] - # if data[0:5] != b'tree ': raise ValueError('Malformed commit') - # self.tree = Tree(binascii.unhexlify(data[5:5+40])) - # - # if data[45:45+8] != b'\nparent ': raise ValueError('Malformed commit') - # parent_shas.append(binascii.unhexlify(data[53:53+40])) - # - # if data[93:93+8] != b'\nauthor ': raise ValueError('Malformed commit') - # # eol is initialized at 101 already - # while data[eol] != b'\n': eol += 1 - # end = eol - 1 - # start = end - # while data[start] != b' ': start -= 1 - # timezone = data[start+1:end] - # end = start-1 - # start = end - # while data[start] != b' ': start -= 1 - # timestamp = data[start+1:end] - # self.authored_at = parse_commit_date(timestamp, timezone) - # self.author = bytes(data[101:start-1]) - # - # sol = eol - # eol += 1 - # if data[sol:sol+11] != b'\ncommitter ': raise ValueError('Malformed commit') - # while data[eol] != b'\n': eol += 1 - # end = eol - 1 - # start = end - # while data[start] != b' ': start -= 1 - # timezone = data[start+1:end] - # end = start-1 - # start = end - # while data[start] != b' ': start -= 1 - # timestamp = data[start+1:end] - # self.committed_at = parse_commit_date(timestamp, timezone) - # self.committer = bytes(data[101:start-1]) - # - # - # for field, field_len in ((b'tree', 5), (b'parent', 7)): - # for field, field_len in ((b'author', 7), (b'committer', 10)): - # - # - # self.header = bytes(data[0:i]) - # start = i - # self.header, self.full_message = self.data.split(b'\n\n', 1) - # self.message = self.full_message.split(b'\n', 1)[0] - # cdef list parent_shas = [] - # cdef bytes signature = None - # cdef bint reading_signature = False - # for line in self.header.split(b'\n'): - # if reading_signature: - # # examples: - # # 1cc6f4418dcc09f64dcbb0410fec76ceaa5034ab - # # cbbc685c45bdff4da5ea0984f1dd3a73486b4556 - # signature += line - # if line.strip() == b'-----END PGP SIGNATURE-----': - # self.signature = signature - # reading_signature = False - # continue - # - # if line.startswith(b' '): # mergetag object, not supported (yet?) 
- # # example: c1313c68c7f784efaf700fbfb771065840fc260a - # continue - # - # line = line.strip() - # if not line: # sometimes there is an empty line after gpgsig - # continue - # try: - # key, value = line.split(b' ', 1) - # except ValueError: - # raise ValueError('Unexpected header in commit ' + self.sha) - # # fields come in this exact order: - # # tree, parent, author, committer, [gpgsig], [encoding] - # if key == b'tree': - # # value is bytes holding hex values -> need to decode - # self.tree = Tree(binascii.unhexlify(value)) - # elif key == b'parent': # multiple parents possible - # parent_shas.append(binascii.unhexlify(value)) - # elif key == b'author': - # # author name can have arbitrary number of spaces while - # # timestamp is guaranteed to have one, so rsplit - # self.author, timestamp, timezone = value.rsplit(b' ', 2) - # self.authored_at = parse_commit_date(timestamp, timezone) - # elif key == b'committer': - # # same logic as author - # self.committer, timestamp, timezone = value.rsplit(b' ', 2) - # self.committed_at = parse_commit_date(timestamp, timezone) - # elif key == b'gpgsig': - # signature = value - # reading_signature = True - # elif key == b'encoding': - # self.encoding = value.decode('ascii') - # self.parent_shas = tuple(parent_shas) - - def _parse(self): - try: - self.header, self.full_message = self.data.split(b'\n\n', 1) - except ValueError: # Sometimes self.data == b'' - raise ObjectNotFound('Failed to parse output ' + str(self.data)) - self.message = self.full_message.split(b'\n', 1)[0] - cdef list parent_shas = [] - cdef bytes signature = None - cdef bint reading_signature = False - for line in self.header.split(b'\n'): - if reading_signature: - # examples: - # 1cc6f4418dcc09f64dcbb0410fec76ceaa5034ab - # cbbc685c45bdff4da5ea0984f1dd3a73486b4556 - signature += line - if line.strip() == b'-----END PGP SIGNATURE-----': - self.signature = signature - reading_signature = False - continue - - if line.startswith(b' '): # mergetag object, not supported (yet?) - # example: c1313c68c7f784efaf700fbfb771065840fc260a - continue - - line = line.strip() - if not line: # sometimes there is an empty line after gpgsig - continue - try: - key, value = line.split(b' ', 1) - except ValueError: - raise ValueError('Unexpected header in commit ' + self.sha) - # fields come in this exact order: - # tree, parent, author, committer, [gpgsig], [encoding] - if key == b'tree': - # value is bytes holding hex values -> need to decode - self.tree = Tree(binascii.unhexlify(value)) - elif key == b'parent': # multiple parents possible - parent_shas.append(binascii.unhexlify(value)) - elif key == b'author': - # author name can have arbitrary number of spaces while - # timestamp is guaranteed to have one, so rsplit - self.author, timestamp, timezone = value.rsplit(b' ', 2) - self.authored_at = parse_commit_date(timestamp, timezone) - elif key == b'committer': - # same logic as author - self.committer, timestamp, timezone = value.rsplit(b' ', 2) - self.committed_at = parse_commit_date(timestamp, timezone) - elif key == b'gpgsig': - signature = value - reading_signature = True - elif key == b'encoding': - self.encoding = value.decode('ascii') - self.parent_shas = tuple(parent_shas) - - def __sub__(self, parent, threshold=0.5): - """ Compare two Commits. - - Args: - parent (Commit): another commit to compare to. 
- Expected order is `diff = child_commit - parent_commit` - - Yields: - Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]: - 4-tuples: `(old_path, new_path, old_sha, new_sha)` - - Examples: - - a new file 'setup.py' was created: - `(None, 'setup.py', None, 'file_sha')` - - an existing 'setup.py' was deleted: - `('setup.py', None, 'old_file_sha', None)` - - setup.py.old was renamed to setup.py, content unchanged: - `('setup.py.old', 'setup.py', 'file_sha', 'file_sha')` - - setup.py was edited: - `('setup.py', 'setup.py', 'old_file_sha', 'new_file_sha')` - - setup.py.old was edited and renamed to setup.py: - `('setup.py.old', 'setup.py', 'old_file_sha', 'new_file_sha')` - - Detecting the last one is computationally expensive. You can adjust this - behaviour by passing the `threshold` parameter, which is 0.5 by default. - It means that if roughly 50% of the file content is the same, - it is considered a match. `threshold=1` means that only exact - matches are considered, effectively disabling this comparison. - If threshold is set to 0, any pair of deleted and added file will be - considered renamed and edited; this last case doesn't make much sense so - don't set it too low. - """ - if parent.sha not in self.parent_shas: - warnings.warn("Comparing non-adjacent commits might be " - "computationally expensive. Proceed with caution.") - - # filename: (blob sha before, blob sha after) - new_files = self.tree.files - new_paths = set(new_files.keys()) - old_files = parent.tree.files - old_paths = set(old_files.keys()) - - # unchanged_paths - for fname in new_paths.intersection(old_paths): - if new_files[fname] != old_files[fname]: - # i.e. the Blob sha is the same - yield fname, fname, old_files[fname], new_files[fname] - - added_paths = new_paths - old_paths - deleted_paths = old_paths - new_paths - - if threshold >= 1: # i.e. only exact matches are considered - for fname in added_paths: - yield None, fname, None, new_files[fname] - for fname in deleted_paths: - yield fname, None, old_files[fname], None - return - - # search for matches - sm = difflib.SequenceMatcher() - added_blobs = {f: Blob(new_files[f]) for f in added_paths} - deleted_blobs = {f: Blob(old_files[f]) for f in deleted_paths} - # for each added blob, try to find a match in deleted blobs - # if there is a match, signal a rename and remove from deleted - # if there is no match, signal a new file - # unused deleted blobs are indeed deleted - for added_fname, added_blob in added_blobs.items(): - sm.set_seq1(added_blob) - matched = False - for deleted_fname, deleted_blob in deleted_blobs.items(): - sm.set_seq2(deleted_blob) - # use quick checks first (lower bound by length diff) - if sm.real_quick_ratio() > threshold \ - and sm.quick_ratio() > threshold \ - and sm.ratio() > threshold: - yield deleted_fname, added_fname, deleted_blob, added_blob - del(deleted_blobs[deleted_fname]) - matched = True - break - if not matched: # this is a new file - yield None, added_fname, None, added_blob - - for deleted_fname, deleted_blob in deleted_blobs.items(): - yield deleted_fname, None, deleted_blob, None - - @property - def parents(self): - """ A generator of parent commits. 
- If you only need hashes (and not `Commit` objects), - use `.parent_sha` instead - - Commit: https://github.com/user2589/minicms/commit/e38126db - >>> c = Commit('e38126dbca6572912013621d2aa9e6f7c50f36bc') - >>> tuple(c.parents) - (,) - """ - return (Commit(sha) for sha in self.parent_shas) - - @cached_property - def project_names(self): - # type: () -> tuple - """ URIs of projects including this commit. - This property can be used to find all forks of a project - by its first commit. - - Commit: https://github.com/user2589/minicms/commit/f2a7fcdc - >>> c = Commit('f2a7fcdc51450ab03cb364415f14e634fa69b62c') - >>> isinstance(c.project_names, tuple) - True - >>> len(c.project_names) > 0 - True - >>> 'user2589_minicms' in c.project_names - True - """ - data = decomp(self.read_tch(('c','p'))) - return tuple(project_name for project_name in data.split(b';') - if project_name and project_name != 'EMPTY') - - @property - def projects(self): - """ A generator of `Project` s, in which this commit is included. - """ - return (Project(uri) for uri in self.project_names) - - @cached_property - def child_shas(self): - """ Children commit binary sha hashes. - Basically, this is a reverse parent_shas - - Commit: https://github.com/user2589/minicms/commit/1e971a07 - >>> Commit('1e971a073f40d74a1e72e07c682e1cba0bae159b').child_shas - ('9bd02434b834979bb69d0b752a403228f2e385e8',) - """ - return slice20(self.read_tch(('c','cc'))) - - @property - def children(self): - """ A generator of children `Commit` objects - - Commit: https://github.com/user2589/minicms/commit/1e971a07 - >>> tuple(Commit('1e971a073f40d74a1e72e07c682e1cba0bae159b').children) - (,) - """ - return (Commit(sha) for sha in self.child_shas) - - @cached_property - def blob_shas(self): - """ SHA hashes of all blobs in the commit - - >>> Commit('af0048f4aac8f4760bf9b816e01524d7fb20a3fc').blob_shas - ... # doctest: +NORMALIZE_WHITESPACE - ('b2f49ffef1c8d7ce83a004b34035f917713e2766', - 'c92011c5ccc32a9248bd929a6e56f846ac5b8072', - 'bf3c2d2df2ef710f995b590ac3e2c851b592c871') - """ - return self.tree.blob_shas - - @cached_property - def changed_file_names(self): - data = decomp(self.read_tch(('c','f'))) - return tuple((data and data.split(b';')) or []) - - def files_changed(self): - return (File(filename) for filename in self.changed_file_names) - - @property - def blob_shas_rel(self): - """ - This relation is known to miss every first file in all trees. - Consider using Commit.tree.blobs as a slower but more accurate - alternative. - - When this relation passes the test, please replace blob_sha with it - It should be faster but as of now it is not accurate - """ - # still true as of Sep 2020 - warnings.warn( - 'This relation is known to miss every first file in all trees. ' - 'Consider using Commit.tree.blobs as a slower but more accurate ' - 'alternative', DeprecationWarning) - return slice20(self.read_tch(('c','b'))) - - @property - def blobs(self): - """ A generator of `Blob` objects included in this commit - - >>> tuple(Commit('af0048f4aac8f4760bf9b816e01524d7fb20a3fc').blobs) - ... 
  # doctest: +NORMALIZE_WHITESPACE
-        (<Blob: b2f49ffef1c8d7ce83a004b34035f917713e2766>,
-         <Blob: c92011c5ccc32a9248bd929a6e56f846ac5b8072>,
-         <Blob: bf3c2d2df2ef710f995b590ac3e2c851b592c871>)
-        """
-        return (Blob(bin_sha) for bin_sha in self.blob_shas)
-
-    @cached_property
-    def attributes(self):
-        """ Commit attributes, pre-parsed from the binary commit data:
-        author time, author timezone, author, tree SHA and parent SHAs
-        (stored semicolon-separated)
-        >>> oscar.Commit("80b4ca99f8605903d8ac6bd921ebedfdfecdd660").attributes
-        ['1432848535', '-0400', 'Robert Lefebvre ', '8a08c812a15051605da7c594b970cad57ec07e3b', 'd24664ccf959bd6e5bacb8ad2c0ceebcdcc8551c']
-        """
-        return self.read_tch(('c','dat')).decode('ascii').split(";")
-
-    @cached_property
-    def files(self):
-        data = decomp(self.read_tch(('c','f')))
-        return tuple(file_name
-                     for file_name in (data and data.split(b";")) or []
-                     if file_name and file_name != 'EMPTY')
-
-
-class Tag(GitObject):
-    """ Tag doesn't have any functionality associated.
-    You can't really do anything useful with it yet
-    """
-    type = 'tag'
-
-
-class Project(_Base):
-    """
-    Projects are iterable:
-
-    >>> for commit in Project('user2589_minicms'):  # doctest: +SKIP
-    ...     print(commit.sha)
-
-    Commits can be checked for membership in a project, either by their SHA
-    hash or by a Commit object itself:
-
-    Commit: https://github.com/user2589/minicms/commit/e38126db
-    >>> sha = 'e38126dbca6572912013621d2aa9e6f7c50f36bc'
-    >>> sha in Project('user2589_minicms')
-    True
-    >>> Commit(sha) in Project('user2589_minicms')
-    True
-    """
-
-    type = 'project'
-    _keys_registry_dtype = 'p2c'
-
-    def __init__(self, uri):
-        if isinstance(uri, str):
-            uri = uri.encode('ascii')
-        self.uri = uri
-        super(Project, self).__init__(uri)
-
-    def __iter__(self):
-        """ Generator of all commits in the project.
-        Order of commits is not guaranteed
-
-        >>> commits = tuple(Project(b'user2589_minicms'))
-        >>> len(commits) > 60
-        True
-        >>> isinstance(commits[0], Commit)
-        True
-        """
-        for sha in self.commit_shas:
-            try:
-                c = Commit(sha)
-                author = c.author
-            except ObjectNotFound:
-                continue
-            if author not in IGNORED_AUTHORS:
-                yield c
-
-    def __contains__(self, item):
-        if isinstance(item, Commit):
-            key = item.key
-        elif isinstance(item, bytes) and len(item) == 20:
-            key = item
-        elif isinstance(item, str) and len(item) == 40:
-            key = binascii.unhexlify(item)
-        else:
-            return False
-        return key in self.commit_shas
-
-    @cached_property
-    def commit_shas(self):
-        """ SHA1 of all commits in the project
-
-        >>> Project(b'user2589_django-currencies').commit_shas
-        ...     # doctest: +NORMALIZE_WHITESPACE
-        ('2dbcd43f077f2b5511cc107d63a0b9539a6aa2a7',
-         '7572fc070c44f85e2a540f9a5a05a95d1dd2662d')
-        """
-        return slice20(self.read_tch(('p','c')))
-
-    @property
-    def commits(self):
-        """ A generator of all Commit objects in the project.
-        It has the same effect as iterating a `Project` instance itself,
-        with some additional validation of commit dates.
-
-        >>> tuple(Project('user2589_django-currencies').commits)
-        ...     # doctest: +NORMALIZE_WHITESPACE
-        (<Commit: 2dbcd43f077f2b5511cc107d63a0b9539a6aa2a7>,
-         <Commit: 7572fc070c44f85e2a540f9a5a05a95d1dd2662d>)
-        """
-        commits = tuple(c for c in self)
-        tails = tuple(c for c in commits
-                      if not c.parent_shas and c.authored_at is not None)
-        if tails:
-            min_date = min(c.authored_at for c in tails)
-        else:  # i.e.
if all tails have invalid date - min_date = DAY_Z - - for c in commits: - if c.authored_at and c.authored_at < min_date: - c.authored_at = None - yield c - - @cached_property - def head(self): - """ Get the HEAD commit of the repository - - >>> Project('user2589_minicms').head - - >>> Project('RoseTHERESA_SimpleCMS').head - - """ - # Sometimes (very rarely) commit dates are wrong, so the latest commit - # is not actually the head. The magic below is to account for this - commits = {c.sha: c for c in self.commits} - parents = set().union(*(c.parent_shas for c in commits.values())) - heads = set(commits.keys()) - parents - - # it is possible that there is more than one head. - # E.g. it happens when HEAD is moved manually (git reset) - # and continued with a separate chain of commits. - # in this case, let's just use the latest one - # actually, storing refs would make it much simpler - return sorted((commits[sha] for sha in heads), - key=lambda c: c.authored_at or DAY_Z)[len(commits)-1] - - @cached_property - def tail(self): - """ Get the first commit SHA by following first parents - - >>> Project(b'user2589_minicms').tail - '1e971a073f40d74a1e72e07c682e1cba0bae159b' - """ - commits = {c.bin_sha: c for c in self.commits} - pts = set(c.parent_shas[0] for c in commits.values() if c.parent_shas) - for bin_sha, c in commits.items(): - if bin_sha in pts and not c.parent_shas: - return bin_sha - - @property - def commits_fp(self): - """ Get a commit chain by following only the first parent, to mimic - https://git-scm.com/docs/git-log#git-log---first-parent . - Thus, you only get a small subset of the full commit tree: - - >>> p = Project(b'user2589_minicms') - >>> set(c.sha for c in p.commits_fp).issubset(p.commit_shas) - True - - In scenarios where branches are not important, it can save a lot - of computing. - - Yields: - Commit: binary commit shas, following first parent only, - from the latest to the earliest. - """ - # Simplified version of self.head(): - # - slightly less precise, - # - 20% faster - # - # out of 500 randomly sampled projects, 493 had the same head. 
- # In the remaining 7: - # 2 had the same commit chain length, - # 3 had one more commit - # 1 had two more commits - # 1 had three more commits - # Execution time: - # simplified version (argmax): ~153 seconds - # self.head(): ~190 seconds - - # at this point we know all commits are in the dataset - # (validated in __iter___) - result = [] - commits = {c.sha: c for c in self.commits} - commit = max(commits.values(), key=lambda c: c.authored_at or DAY_Z) - - while commit: - try: # here there is no guarantee commit is in the dataset - first_parent = commit.parent_shas and commit.parent_shas[0] - except ObjectNotFound: - break - - yield commit - - if not first_parent: - break - - commit = commits.get(first_parent, Commit(first_parent)) - - @cached_property - def url(self): - """ Get the URL for a given project URI - >>> Project('CS340-19_lectures').url - 'http://github.com/CS340-19/lectures' - """ - prefix, body = self.uri.split(b'_', 1) - if prefix == b'sourceforge.net': - platform = URL_PREFIXES[prefix] - elif prefix in URL_PREFIXES and b'_' in body: - platform = URL_PREFIXES[prefix] - body = body.replace(b'_', b'/', 1) - else: - platform = b'github.com' - body = self.uri.replace(b'_', b'/', 1) - return b'/'.join((b'https:/', platform, body)) - - @cached_property - def author_names(self): - data = decomp(self.read_tch(('p','a'))) - return tuple(author_name - for author_name in (data and data.split(b';')) or [] - if author_name and author_name != 'EMPTY') - - -class File(_Base): - """ - Files are initialized with a path, starting from a commit root tree: - - >>> File(b'.gitignore') # doctest: +SKIP - >>> File(b'docs/Index.rst') # doctest: +SKIP - """ - type = 'file' - _keys_registry_dtype = 'f2c' - - def __init__(self, path): - if isinstance(path, str): - path = path.encode('utf8') - self.path = path - super(File, self).__init__(path) - - """ - deprecated due to current lack of use cases - @cached_property - def author_names(self): - data = decomp(self.read_tch('file_authors')) - return tuple(author for author in (data and data.split(b';')) - if author not in IGNORED_AUTHORS) - """ - - @cached_property - def commit_shas(self): - """ SHA1 of all commits changing this file - - **NOTE: this relation considers only diff with the first parent, - which substantially limits its application** - - >>> commits = File('minicms/templatetags/minicms_tags.py').commit_shas - >>> len(commits) > 0 - True - >>> isinstance(commits, tuple) - True - >>> isinstance(commits[0], str) - True - >>> len(commits[0]) == 40 - True - """ - return slice20(self.read_tch(('f','c'))) - - @property - def commits(self): - """ All commits changing the file - - .. note: this relation considers only diff with the first parent, - which substantially limits its application - - >>> cs = tuple(File('minicms/templatetags/minicms_tags.py').commits) - >>> len(cs) > 0 - True - >>> isinstance(cs[0], Commit) - True - """ - for sha in self.commit_shas: - c = Commit(sha) - try: - author = c.author - except ObjectNotFound: - continue - if author not in IGNORED_AUTHORS: - yield c - - @cached_property - def author_names(self): - data = decomp(self.read_tch(('f','a'))) - return tuple(author_name - for author_name in (data and data.split(b';')) or [] - if author_name and author_name != 'EMPTY') - - def __str__(self): - return super(File, self).__str__().rstrip("\n\r") - - -class Author(_Base): - """ - Authors are initialized with a combination of name and email, as they - appear in git configuration. 
- - >>> Author('John Doe ') # doctest: +SKIP - - At this point we don't have a relation to map all aliases of the same - author, so keep in mind this object represents an alias, not a person. - """ - type = 'author' - _keys_registry_dtype = 'a2c' - - def __init__(self, full_email): - if isinstance(full_email, str): - full_email = full_email.encode('utf8') - self.full_email = full_email - super(Author, self).__init__(full_email) - - @cached_property - def commit_shas(self): - """ SHA1 of all commits authored by the Author - - >>> commits = Author('user2589 ').commit_shas - >>> len(commits) > 50 - True - >>> isinstance(commits, tuple) - True - >>> isinstance(commits[0], str) - True - >>> len(commits[0]) == 40 - True - """ - return slice20(self.read_tch(('a','c'))) - - @property - def commits(self): - """ A generator of all Commit objects authored by the Author - - >>> commits = tuple( - ... Author('user2589 ').commits) - >>> len(commits) > 40 - True - >>> isinstance(commits[0], Commit) - True - """ - return (Commit(sha) for sha in self.commit_shas) - - @cached_property - def file_names(self): - data = decomp(self.read_tch(('a','f'))) - return tuple(fname for fname in (data and data.split(b';'))) - - @cached_property - def project_names(self): - """ URIs of projects where author has committed to - A generator of all Commit objects authored by the Author - """ - data = decomp(self.read_tch(('a','p'))) - return tuple(project_name - for project_name in (data and data.split(b';')) - if project_name and project_name != b'EMPTY') - - # This relation went MIA as of Sep 6 2020 - # @cached_property - # def torvald(self): - # data = decomp(self.read_tch('author_trpath')) - # return tuple(path for path in (data and data.split(";"))) - -# temporary data for local test -# TODO: remove once commit parse -# -# c = Commit("1cc6f4418dcc09f64dcbb0410fec76ceaa5034ab") -# c._data = b'tree 0a2def6627be9bf2f3c7c2a84eb1e2feb0e7c03e\n' \ -# b'parent d55f5fb86e5892dd1673a9c6cf5e3fdff8c5d93b\n' \ -# b'author AlecGoldstein123 <34690976+AlecGoldstein123@users.noreply.github.com> 1513882101 -0500\n' \ -# b'committer GitHub 1513882101 -0500\n' \ -# b'gpgsig -----BEGIN PGP SIGNATURE-----\n' \ -# b' \n' \ -# b' wsBcBAABCAAQBQJaPAH1CRBK7hj4Ov3rIwAAdHIIACYBs+bTOv7clJSYr9NT0gbX\n' \ -# b' zb4XeJJADvDISZUJChwebEENDue5+GX+dX03ILptRizVVnASwNZR30DENeJNcOpw\n' \ -# b' WqXKho+AV0H0C91x8CIbICnDjdgGdcyKFBCWQ8lBV6BjiRwGXFKJU6dyt480lzs8\n' \ -# b' Eu2PqpTg59Xr/msd4vTrQofSoRwu8kW8KXBWou6G1f9KVCoOXWvhRmiLngFupyPV\n' \ -# b' 0jbNLOe6IQ37xrvvSULCiBmemeYfAJSUywMPIPFyUpzZc2+jKDOcxJeKrRxzmQM0\n' \ -# b' XKeHQIqKSQOVPB/SB7i2Pnxf/UBObaa4kiFoDGHp5IjolgMC+4pFuF2mOE5pbcQ=\n' \ -# b' =cWKt\n' \ -# b' -----END PGP SIGNATURE-----\n' \ -# b' \n\n' \ -# b'Add files via upload' diff --git a/oscar.pyxbld b/oscar.pyxbld deleted file mode 100644 index 6605060..0000000 --- a/oscar.pyxbld +++ /dev/null @@ -1,16 +0,0 @@ - -# this file is needed for oscar.pyx to be compiled by tests/unit_test.py -# because pyximport.install() doesn't take libraries argument - -def make_ext(modname, pyxfilename): - from distutils.extension import Extension - return Extension( - name=modname, libraries=['bz2', 'z'], include_dirs=['lib'], - sources=[pyxfilename, - 'lib/tchdb.c', 'lib/myconf.c', 'lib/tcutil.c', 'lib/md5.c'], - extra_compile_args=['-std=gnu11'] - ) - - -def make_setup_args(): - return {"script_args": ["--force"]} diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 0000000..aa51cc1 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,621 @@ +# This file is 
automatically @generated by Poetry 1.8.3 and should not be changed by hand. + +[[package]] +name = "astunparse" +version = "1.6.3" +description = "An AST unparser for Python" +optional = false +python-versions = "*" +files = [ + {file = "astunparse-1.6.3-py2.py3-none-any.whl", hash = "sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8"}, + {file = "astunparse-1.6.3.tar.gz", hash = "sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872"}, +] + +[package.dependencies] +six = ">=1.6.1,<2.0" +wheel = ">=0.23.0,<1.0" + +[[package]] +name = "cfgv" +version = "3.4.0" +description = "Validate configuration and produce human readable error messages." +optional = false +python-versions = ">=3.8" +files = [ + {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"}, + {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"}, +] + +[[package]] +name = "chardet" +version = "5.2.0" +description = "Universal encoding detector for Python 3" +optional = false +python-versions = ">=3.7" +files = [ + {file = "chardet-5.2.0-py3-none-any.whl", hash = "sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970"}, + {file = "chardet-5.2.0.tar.gz", hash = "sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7"}, +] + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "coverage" +version = "7.5.3" +description = "Code coverage measurement for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "coverage-7.5.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a6519d917abb15e12380406d721e37613e2a67d166f9fb7e5a8ce0375744cd45"}, + {file = "coverage-7.5.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:aea7da970f1feccf48be7335f8b2ca64baf9b589d79e05b9397a06696ce1a1ec"}, + {file = "coverage-7.5.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:923b7b1c717bd0f0f92d862d1ff51d9b2b55dbbd133e05680204465f454bb286"}, + {file = "coverage-7.5.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62bda40da1e68898186f274f832ef3e759ce929da9a9fd9fcf265956de269dbc"}, + {file = "coverage-7.5.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8b7339180d00de83e930358223c617cc343dd08e1aa5ec7b06c3a121aec4e1d"}, + {file = "coverage-7.5.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:25a5caf742c6195e08002d3b6c2dd6947e50efc5fc2c2205f61ecb47592d2d83"}, + {file = "coverage-7.5.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:05ac5f60faa0c704c0f7e6a5cbfd6f02101ed05e0aee4d2822637a9e672c998d"}, + {file = "coverage-7.5.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:239a4e75e09c2b12ea478d28815acf83334d32e722e7433471fbf641c606344c"}, + {file = "coverage-7.5.3-cp310-cp310-win32.whl", hash = "sha256:a5812840d1d00eafae6585aba38021f90a705a25b8216ec7f66aebe5b619fb84"}, + {file = "coverage-7.5.3-cp310-cp310-win_amd64.whl", hash = 
"sha256:33ca90a0eb29225f195e30684ba4a6db05dbef03c2ccd50b9077714c48153cac"}, + {file = "coverage-7.5.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f81bc26d609bf0fbc622c7122ba6307993c83c795d2d6f6f6fd8c000a770d974"}, + {file = "coverage-7.5.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7cec2af81f9e7569280822be68bd57e51b86d42e59ea30d10ebdbb22d2cb7232"}, + {file = "coverage-7.5.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55f689f846661e3f26efa535071775d0483388a1ccfab899df72924805e9e7cd"}, + {file = "coverage-7.5.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50084d3516aa263791198913a17354bd1dc627d3c1639209640b9cac3fef5807"}, + {file = "coverage-7.5.3-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:341dd8f61c26337c37988345ca5c8ccabeff33093a26953a1ac72e7d0103c4fb"}, + {file = "coverage-7.5.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ab0b028165eea880af12f66086694768f2c3139b2c31ad5e032c8edbafca6ffc"}, + {file = "coverage-7.5.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:5bc5a8c87714b0c67cfeb4c7caa82b2d71e8864d1a46aa990b5588fa953673b8"}, + {file = "coverage-7.5.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:38a3b98dae8a7c9057bd91fbf3415c05e700a5114c5f1b5b0ea5f8f429ba6614"}, + {file = "coverage-7.5.3-cp311-cp311-win32.whl", hash = "sha256:fcf7d1d6f5da887ca04302db8e0e0cf56ce9a5e05f202720e49b3e8157ddb9a9"}, + {file = "coverage-7.5.3-cp311-cp311-win_amd64.whl", hash = "sha256:8c836309931839cca658a78a888dab9676b5c988d0dd34ca247f5f3e679f4e7a"}, + {file = "coverage-7.5.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:296a7d9bbc598e8744c00f7a6cecf1da9b30ae9ad51c566291ff1314e6cbbed8"}, + {file = "coverage-7.5.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:34d6d21d8795a97b14d503dcaf74226ae51eb1f2bd41015d3ef332a24d0a17b3"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e317953bb4c074c06c798a11dbdd2cf9979dbcaa8ccc0fa4701d80042d4ebf1"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:705f3d7c2b098c40f5b81790a5fedb274113373d4d1a69e65f8b68b0cc26f6db"}, + {file = "coverage-7.5.3-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1196e13c45e327d6cd0b6e471530a1882f1017eb83c6229fc613cd1a11b53cd"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:015eddc5ccd5364dcb902eaecf9515636806fa1e0d5bef5769d06d0f31b54523"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:fd27d8b49e574e50caa65196d908f80e4dff64d7e592d0c59788b45aad7e8b35"}, + {file = "coverage-7.5.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:33fc65740267222fc02975c061eb7167185fef4cc8f2770267ee8bf7d6a42f84"}, + {file = "coverage-7.5.3-cp312-cp312-win32.whl", hash = "sha256:7b2a19e13dfb5c8e145c7a6ea959485ee8e2204699903c88c7d25283584bfc08"}, + {file = "coverage-7.5.3-cp312-cp312-win_amd64.whl", hash = "sha256:0bbddc54bbacfc09b3edaec644d4ac90c08ee8ed4844b0f86227dcda2d428fcb"}, + {file = "coverage-7.5.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f78300789a708ac1f17e134593f577407d52d0417305435b134805c4fb135adb"}, + {file = "coverage-7.5.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b368e1aee1b9b75757942d44d7598dcd22a9dbb126affcbba82d15917f0cc155"}, + {file = 
"coverage-7.5.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f836c174c3a7f639bded48ec913f348c4761cbf49de4a20a956d3431a7c9cb24"}, + {file = "coverage-7.5.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:244f509f126dc71369393ce5fea17c0592c40ee44e607b6d855e9c4ac57aac98"}, + {file = "coverage-7.5.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c4c2872b3c91f9baa836147ca33650dc5c172e9273c808c3c3199c75490e709d"}, + {file = "coverage-7.5.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:dd4b3355b01273a56b20c219e74e7549e14370b31a4ffe42706a8cda91f19f6d"}, + {file = "coverage-7.5.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:f542287b1489c7a860d43a7d8883e27ca62ab84ca53c965d11dac1d3a1fab7ce"}, + {file = "coverage-7.5.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:75e3f4e86804023e991096b29e147e635f5e2568f77883a1e6eed74512659ab0"}, + {file = "coverage-7.5.3-cp38-cp38-win32.whl", hash = "sha256:c59d2ad092dc0551d9f79d9d44d005c945ba95832a6798f98f9216ede3d5f485"}, + {file = "coverage-7.5.3-cp38-cp38-win_amd64.whl", hash = "sha256:fa21a04112c59ad54f69d80e376f7f9d0f5f9123ab87ecd18fbb9ec3a2beed56"}, + {file = "coverage-7.5.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f5102a92855d518b0996eb197772f5ac2a527c0ec617124ad5242a3af5e25f85"}, + {file = "coverage-7.5.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d1da0a2e3b37b745a2b2a678a4c796462cf753aebf94edcc87dcc6b8641eae31"}, + {file = "coverage-7.5.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8383a6c8cefba1b7cecc0149415046b6fc38836295bc4c84e820872eb5478b3d"}, + {file = "coverage-7.5.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9aad68c3f2566dfae84bf46295a79e79d904e1c21ccfc66de88cd446f8686341"}, + {file = "coverage-7.5.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e079c9ec772fedbade9d7ebc36202a1d9ef7291bc9b3a024ca395c4d52853d7"}, + {file = "coverage-7.5.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bde997cac85fcac227b27d4fb2c7608a2c5f6558469b0eb704c5726ae49e1c52"}, + {file = "coverage-7.5.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:990fb20b32990b2ce2c5f974c3e738c9358b2735bc05075d50a6f36721b8f303"}, + {file = "coverage-7.5.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:3d5a67f0da401e105753d474369ab034c7bae51a4c31c77d94030d59e41df5bd"}, + {file = "coverage-7.5.3-cp39-cp39-win32.whl", hash = "sha256:e08c470c2eb01977d221fd87495b44867a56d4d594f43739a8028f8646a51e0d"}, + {file = "coverage-7.5.3-cp39-cp39-win_amd64.whl", hash = "sha256:1d2a830ade66d3563bb61d1e3c77c8def97b30ed91e166c67d0632c018f380f0"}, + {file = "coverage-7.5.3-pp38.pp39.pp310-none-any.whl", hash = "sha256:3538d8fb1ee9bdd2e2692b3b18c22bb1c19ffbefd06880f5ac496e42d7bb3884"}, + {file = "coverage-7.5.3.tar.gz", hash = "sha256:04aefca5190d1dc7a53a4c1a5a7f8568811306d7a8ee231c42fb69215571944f"}, +] + +[package.dependencies] +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} + +[package.extras] +toml = ["tomli"] + +[[package]] +name = "cython" +version = "0.29.37" +description = "The Cython compiler for writing C extensions for the Python language." 
+optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "Cython-0.29.37-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f2d621fe4cb50007446742134a890500b34e3f50abaf7993baaca02634af7e15"}, + {file = "Cython-0.29.37-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:d94caf90ae9cb56116ca6d54cdcbccd3c4df6b0cb7233922b2233ee7fe81d05b"}, + {file = "Cython-0.29.37-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:852cd4378cbc9ade02f53709107ff9fdad55019a3a636e8a27663ba6cfce10b6"}, + {file = "Cython-0.29.37-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:bbce388431a2608a81c8ab13cb14c50611473843ca766031b8b24bb1723faf79"}, + {file = "Cython-0.29.37-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4658499a41255431f6bbdca7e634e9c8d3a4c190bf24b4aa1646dac751d3da4d"}, + {file = "Cython-0.29.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:12192ab269e7185720f2d2f8894587bf1da4276db1b9b869e4622a093f18cae6"}, + {file = "Cython-0.29.37-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:9450e0766ab65947f8a2a36f9e59079fc879c3807ec936c61725a48c97741a52"}, + {file = "Cython-0.29.37-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:177481b0a7e003e5c49e2bf0dda1d6fe610c239f17642a5da9f18c2ad0c5f6b6"}, + {file = "Cython-0.29.37-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b048354fd380278f2fa096e7526973beb6e0491a9d44d7e4e29df52612d25776"}, + {file = "Cython-0.29.37-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ea6d208be1906c5df25b674777d5905c6d8e9ef0b201b830849e0729ba08caba"}, + {file = "Cython-0.29.37-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:af03854571738307a5f30cc6b724081d72db12f907699e7fdfc04c12c839158e"}, + {file = "Cython-0.29.37-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c33508ede9172a6f6f99d5a6dadc7fee23c840423b411ef8b5a403c04e530297"}, + {file = "Cython-0.29.37-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8af5975ecfae254d8c0051204fca995dda8f93cf9f0bbf7571e3cda2b0cef4d"}, + {file = "Cython-0.29.37-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:29415d8eb2fdc1ea518ca4810c50a2d062b387d4c9fbcfb3352346e93db22c6d"}, + {file = "Cython-0.29.37-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fe0eaf6b1e9ee97c5ee7bfc943f00e36cf59d929db16886cb018352bff8208da"}, + {file = "Cython-0.29.37-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:cc1b9ce2b73b9ee8c305e06173b35c7c202d4b82d084a0cd73dcedfd6d310aec"}, + {file = "Cython-0.29.37-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:2618af0b8df26d32ee4e8858d4ad8167546596762620aeade84954ae37194a0e"}, + {file = "Cython-0.29.37-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:ac910a28a2fd3d280faf3077b6fe63b97a4b93994ff05647581846f0e4b2f8d1"}, + {file = "Cython-0.29.37-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:8bf38373773f967cfd793997a6fb96cf972d41a9fce987ace5767349d6f15572"}, + {file = "Cython-0.29.37-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:6cddb567dadb3aa3e280a8a35e5126030915ea744c2812206e9c194b8881475d"}, + {file = "Cython-0.29.37-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:79ecfc48694e156402c05561e0adb0e25a6e9d35ac0b41693733a08219d38c58"}, + {file = "Cython-0.29.37-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:9a455347e20ddfad0c5dfee32a3e855ee96811269e5fd86be622ddc4cb326404"}, + {file = "Cython-0.29.37-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:fa5b6a0f69bf1823c9fd038fa77a2568b78fda2de045a95b48a71dee4d0d578f"}, + {file = "Cython-0.29.37-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:a6164a05440dcd9daa760c6488bc91bdac1380c7b4b3aca38cf307ba66042d54"}, + {file = "Cython-0.29.37-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:562f8f911dbd6f1a1b9be8f6cba097125700355688f613994ccd4406f220557a"}, + {file = "Cython-0.29.37-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8c39c2f5a0fe29bb01de9b1fb449bf65bed6f192317c677f181732791c63fe28"}, + {file = "Cython-0.29.37-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:0a0a6d5972bb3b8c7363cf19a42a988bb0c0bb5ebd9c736c84eca85113ccfdbe"}, + {file = "Cython-0.29.37-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:b82584836e9e7c0d6effee976595e5cd7fa88dbef3e96e900187983c1d4637d1"}, + {file = "Cython-0.29.37-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b6c48f1032b379135a5b4a31976d6c468e02490688acf9254c6c8ed27bd4cbd4"}, + {file = "Cython-0.29.37-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:3f87bef1808d255cf13be378c7ad27ae7c6db6df7732217d32428d1daf4109be"}, + {file = "Cython-0.29.37-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:9e68bafeeb97d5a403fb1f7700bd4a55a1f8989824c323ae02ae8a4fcd88f6a1"}, + {file = "Cython-0.29.37-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e14cd44c830e53cf9d7269c87a6bcc638bb065ec07e24990e338162c7001d3c3"}, + {file = "Cython-0.29.37-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:0544f7a3e4437b89b356baa15387494c18214e03f2ffaddada5a2c71c3dfd24b"}, + {file = "Cython-0.29.37-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2de3e729d25f041036e81e2f15683dd129f977dfb5b06267e30e8d7acec43225"}, + {file = "Cython-0.29.37-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:2ad634dc77a6a74022881826099eccac19c9b79153942cc82e754ffac2bec116"}, + {file = "Cython-0.29.37-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:e841a8b4f9ceefb2916e32dac4f28a895cd519e8ece71505144da1ee355c548a"}, + {file = "Cython-0.29.37-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:6c672089fba6a8f6690b8d7924a58c04477771401ad101d53171a13405ee12cb"}, + {file = "Cython-0.29.37-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0301d4739c6894e012f1d410052082fdda9e63888c815d9e23e0f7f82fff7d79"}, + {file = "Cython-0.29.37-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:af8e7b4397620e2d18259a11f3bfa026eff9846657e397d02616962dd5dd035a"}, + {file = "Cython-0.29.37-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b225d5e2091c224d4ab328165fef224ba3919b3ed44bd9b3241416f523b4d51a"}, + {file = "Cython-0.29.37-py2.py3-none-any.whl", hash = 
"sha256:95f1d6a83ef2729e67b3fa7318c829ce5b07ac64c084cd6af11c228e0364662c"}, + {file = "Cython-0.29.37.tar.gz", hash = "sha256:f813d4a6dd94adee5d4ff266191d1d95bf6d4164a4facc535422c021b2504cfb"}, +] + +[[package]] +name = "distlib" +version = "0.3.8" +description = "Distribution utilities" +optional = false +python-versions = "*" +files = [ + {file = "distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784"}, + {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.1" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.1-py3-none-any.whl", hash = "sha256:5258b9ed329c5bbdd31a309f53cbfb0b155341807f6ff7606a1e801a891b29ad"}, + {file = "exceptiongroup-1.2.1.tar.gz", hash = "sha256:a4785e48b045528f5bfe627b6ad554ff32def154f42372786903b7abcfe1aa16"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "filelock" +version = "3.14.0" +description = "A platform independent file lock." +optional = false +python-versions = ">=3.8" +files = [ + {file = "filelock-3.14.0-py3-none-any.whl", hash = "sha256:43339835842f110ca7ae60f1e1c160714c5a6afd15a2873419ab185334975c0f"}, + {file = "filelock-3.14.0.tar.gz", hash = "sha256:6ea72da3be9b8c82afd3edcf99f2fffbb5076335a5ae4d03248bb5b6c3eae78a"}, +] + +[package.extras] +docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8.0.1)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] +typing = ["typing-extensions (>=4.8)"] + +[[package]] +name = "identify" +version = "2.5.36" +description = "File identification library for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "identify-2.5.36-py2.py3-none-any.whl", hash = "sha256:37d93f380f4de590500d9dba7db359d0d3da95ffe7f9de1753faa159e71e7dfa"}, + {file = "identify-2.5.36.tar.gz", hash = "sha256:e5e00f54165f9047fbebeb4a560f9acfb8af4c88232be60a488e9b68d122745d"}, +] + +[package.extras] +license = ["ukkonen"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "jinja2" +version = "3.1.4" +description = "A very fast and expressive template engine." +optional = false +python-versions = ">=3.7" +files = [ + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + +[[package]] +name = "markupsafe" +version = "2.1.5" +description = "Safely add untrusted strings to HTML/XML markup." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win32.whl", hash = "sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4"}, + {file = "MarkupSafe-2.1.5-cp310-cp310-win_amd64.whl", hash = "sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win32.whl", hash = "sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906"}, + {file = "MarkupSafe-2.1.5-cp311-cp311-win_amd64.whl", hash = "sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4"}, + {file = 
"MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win32.whl", hash = "sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad"}, + {file = "MarkupSafe-2.1.5-cp312-cp312-win_amd64.whl", hash = "sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win32.whl", hash = "sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371"}, + {file = "MarkupSafe-2.1.5-cp37-cp37m-win_amd64.whl", hash = "sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68"}, + {file = 
"MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win32.whl", hash = "sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff"}, + {file = "MarkupSafe-2.1.5-cp38-cp38-win_amd64.whl", hash = "sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win32.whl", hash = "sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf"}, + {file = "MarkupSafe-2.1.5-cp39-cp39-win_amd64.whl", hash = "sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5"}, + {file = "MarkupSafe-2.1.5.tar.gz", hash = "sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b"}, +] + +[[package]] +name = "nodeenv" +version = "1.9.1" +description = "Node.js virtual environment builder" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"}, + {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"}, +] + +[[package]] +name = "packaging" +version = "24.0" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-24.0-py3-none-any.whl", hash = "sha256:2ddfb553fdf02fb784c234c7ba6ccc288296ceabec964ad2eae3777778130bc5"}, + {file = "packaging-24.0.tar.gz", hash = "sha256:eb82c5e3e56209074766e6885bb04b8c38a0c015d0a30036ebe7ece34c9989e9"}, +] + +[[package]] +name = "pdoc" +version = "14.5.0" +description = "API Documentation for Python Projects" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pdoc-14.5.0-py3-none-any.whl", hash = "sha256:9a8a84e19662610c0620fbe9f2e4174e3b090f8b601ed46348786ebb7517c508"}, + {file = 
"pdoc-14.5.0.tar.gz", hash = "sha256:79f534dc8a6494638dd6056b78e17a654df7ed34cc92646553ce3a7ba5a4fa4a"}, +] + +[package.dependencies] +astunparse = {version = "*", markers = "python_version < \"3.9\""} +Jinja2 = ">=2.11.0" +MarkupSafe = "*" +pygments = ">=2.12.0" + +[package.extras] +dev = ["hypothesis", "mypy", "pdoc-pyo3-sample-library (==1.0.11)", "pygments (>=2.14.0)", "pytest", "pytest-cov", "pytest-timeout", "ruff", "tox", "types-pygments"] + +[[package]] +name = "platformdirs" +version = "4.2.2" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." +optional = false +python-versions = ">=3.8" +files = [ + {file = "platformdirs-4.2.2-py3-none-any.whl", hash = "sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee"}, + {file = "platformdirs-4.2.2.tar.gz", hash = "sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3"}, +] + +[package.extras] +docs = ["furo (>=2023.9.10)", "proselint (>=0.13)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1.25.2)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)"] +type = ["mypy (>=1.8)"] + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pre-commit" +version = "3.5.0" +description = "A framework for managing and maintaining multi-language pre-commit hooks." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pre_commit-3.5.0-py2.py3-none-any.whl", hash = "sha256:841dc9aef25daba9a0238cd27984041fa0467b4199fc4852e27950664919f660"}, + {file = "pre_commit-3.5.0.tar.gz", hash = "sha256:5804465c675b659b0862f07907f96295d490822a450c4c40e747d0b1c6ebcb32"}, +] + +[package.dependencies] +cfgv = ">=2.0.0" +identify = ">=1.0.0" +nodeenv = ">=0.11.1" +pyyaml = ">=5.1" +virtualenv = ">=20.10.0" + +[[package]] +name = "pygments" +version = "2.18.0" +description = "Pygments is a syntax highlighting package written in Python." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a"}, + {file = "pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199"}, +] + +[package.extras] +windows-terminal = ["colorama (>=0.4.6)"] + +[[package]] +name = "pytest" +version = "8.2.2" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-8.2.2-py3-none-any.whl", hash = "sha256:c434598117762e2bd304e526244f67bf66bbd7b5d6cf22138be51ff661980343"}, + {file = "pytest-8.2.2.tar.gz", hash = "sha256:de4bb8104e201939ccdc688b27a89a7be2079b22e2bd2b07f806b6ba71117977"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2.0" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "pytest-cov" +version = "5.0.0" +description = "Pytest plugin for measuring coverage." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-cov-5.0.0.tar.gz", hash = "sha256:5837b58e9f6ebd335b0f8060eecce69b662415b16dc503883a02f45dfeb14857"}, + {file = "pytest_cov-5.0.0-py3-none-any.whl", hash = "sha256:4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652"}, +] + +[package.dependencies] +coverage = {version = ">=5.2.1", extras = ["toml"]} +pytest = ">=4.6" + +[package.extras] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] + +[[package]] +name = "python-lzf" +version = "0.2.4" +description = "C Extension for liblzf" +optional = false +python-versions = "*" +files = [ + {file = "python-lzf-0.2.4.tar.gz", hash = "sha256:d1420f1544e612ef1bb41ce0f1d14c2964b3444612f1468f85a886caff3615d1"}, +] + +[[package]] +name = "pyyaml" +version = "6.0.1" +description = "YAML parser and emitter for Python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, + {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, + {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, + {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, + {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, + {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, + {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, + {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, + {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, + {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, + {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, + {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, + {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, + {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, + {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, + {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, + {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, +] + +[[package]] +name = "ruff" +version = "0.4.8" +description = "An extremely fast Python linter and code formatter, written in Rust." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "ruff-0.4.8-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:7663a6d78f6adb0eab270fa9cf1ff2d28618ca3a652b60f2a234d92b9ec89066"}, + {file = "ruff-0.4.8-py3-none-macosx_11_0_arm64.whl", hash = "sha256:eeceb78da8afb6de0ddada93112869852d04f1cd0f6b80fe464fd4e35c330913"}, + {file = "ruff-0.4.8-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aad360893e92486662ef3be0a339c5ca3c1b109e0134fcd37d534d4be9fb8de3"}, + {file = "ruff-0.4.8-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:284c2e3f3396fb05f5f803c9fffb53ebbe09a3ebe7dda2929ed8d73ded736deb"}, + {file = "ruff-0.4.8-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7354f921e3fbe04d2a62d46707e569f9315e1a613307f7311a935743c51a764"}, + {file = "ruff-0.4.8-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:72584676164e15a68a15778fd1b17c28a519e7a0622161eb2debdcdabdc71883"}, + {file = "ruff-0.4.8-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9678d5c9b43315f323af2233a04d747409d1e3aa6789620083a82d1066a35199"}, + {file = "ruff-0.4.8-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704977a658131651a22b5ebeb28b717ef42ac6ee3b11e91dc87b633b5d83142b"}, + {file = "ruff-0.4.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d05f8d6f0c3cce5026cecd83b7a143dcad503045857bc49662f736437380ad45"}, + {file = "ruff-0.4.8-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:6ea874950daca5697309d976c9afba830d3bf0ed66887481d6bca1673fc5b66a"}, + {file = "ruff-0.4.8-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:fc95aac2943ddf360376be9aa3107c8cf9640083940a8c5bd824be692d2216dc"}, + {file = "ruff-0.4.8-py3-none-musllinux_1_2_i686.whl", hash = "sha256:384154a1c3f4bf537bac69f33720957ee49ac8d484bfc91720cc94172026ceed"}, + {file = "ruff-0.4.8-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:e9d5ce97cacc99878aa0d084c626a15cd21e6b3d53fd6f9112b7fc485918e1fa"}, + {file = "ruff-0.4.8-py3-none-win32.whl", hash = "sha256:6d795d7639212c2dfd01991259460101c22aabf420d9b943f153ab9d9706e6a9"}, + {file = "ruff-0.4.8-py3-none-win_amd64.whl", hash = "sha256:e14a3a095d07560a9d6769a72f781d73259655919d9b396c650fc98a8157555d"}, + {file = "ruff-0.4.8-py3-none-win_arm64.whl", hash = "sha256:14019a06dbe29b608f6b7cbcec300e3170a8d86efaddb7b23405cb7f7dcaf780"}, + {file = "ruff-0.4.8.tar.gz", hash = "sha256:16d717b1d57b2e2fd68bd0bf80fb43931b79d05a7131aa477d66fc40fbd86268"}, +] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[[package]] +name = "virtualenv" +version = "20.26.2" +description = "Virtual Python Environment builder" +optional = false +python-versions = ">=3.7" +files = [ + {file = 
"virtualenv-20.26.2-py3-none-any.whl", hash = "sha256:a624db5e94f01ad993d476b9ee5346fdf7b9de43ccaee0e0197012dc838a0e9b"}, + {file = "virtualenv-20.26.2.tar.gz", hash = "sha256:82bf0f4eebbb78d36ddaee0283d43fe5736b53880b8a8cdcd37390a07ac3741c"}, +] + +[package.dependencies] +distlib = ">=0.3.7,<1" +filelock = ">=3.12.2,<4" +platformdirs = ">=3.9.1,<5" + +[package.extras] +docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2,!=7.3)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] +test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] + +[[package]] +name = "wheel" +version = "0.43.0" +description = "A built-package format for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "wheel-0.43.0-py3-none-any.whl", hash = "sha256:55c570405f142630c6b9f72fe09d9b67cf1477fcf543ae5b8dcb1f5b7377da81"}, + {file = "wheel-0.43.0.tar.gz", hash = "sha256:465ef92c69fa5c5da2d1cf8ac40559a8c940886afcef87dcf14b9470862f1d85"}, +] + +[package.extras] +test = ["pytest (>=6.0.0)", "setuptools (>=65)"] + +[metadata] +lock-version = "2.0" +python-versions = "^3.8" +content-hash = "c814018ea216521755e7c6cb1dc2278192f919166580a2ee24133fa4e99f2ee2" diff --git a/pyproject.toml b/pyproject.toml index 2820f39..cd109c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,133 @@ +[tool.poetry] +name = "python-woc" +version = "0.1.0" +description = "Python interface for World of Code" +authors = ["Runzhi He ", "Marat "] +license = "GPL-3.0" +readme = "README.md" +packages = [ + { include = "woc" } +] +include = [ + { path = "woc/**/*.so", format = "wheel" }, + { path = "lib/*", format = "sdist" } +] + +[tool.poetry.build] +script = "setup.py" +generate-setup-file = false + +[tool.poetry.dependencies] +python = "^3.8" +python-lzf = "^0.2.4" +chardet = "^5.2.0" + +[tool.poetry.group.build.dependencies] +cython = "^0.29.0" + +[tool.poetry.group.dev.dependencies] +pytest = "^8.2.2" +pytest-cov = "^5.0.0" +coverage = {extras = ["toml"], version = "^7.5.3"} +# bandit = "^1.7.8" +# docformatter = "^1.7.5" +ruff = "^0.4.4" # <- ruff is powerful enough to be the only linter +# sphinx = "^7.1.2" +# sphinx-rtd-theme = "^1.0.0" +# sphinx-autobuild = "^2021.3.14" +# sphinx-pyproject = "^0.3.0" +# sphinx-autodoc-typehints = "^1.25.3" +pdoc = "^14.5.0" # <- drop sphinx, too many deps and rst is not fun +pre-commit = "^3.5" + [build-system] -requires = ["setuptools>=18.0", "wheel", "Cython"] - -[tool.semantic_release] -version_variable = ['oscar.pyx:__version__', 'docs/conf.py:release'] -# wheel build is performed by manylinux, so just skip it -build_command = '' -# remove_dist deletes all pre-existing files in dist/, including built by manylinux -remove_dist = false \ No newline at end of file +requires = ["poetry-core", "Cython>=0.29.0,<1.0.0", "setuptools>=42"] +build-backend = "poetry.core.masonry.api" + +### Test and coverage ### + +[tool.pytest.ini_options] +addopts = "-ra -q" +testpaths = [ + "tests", +] + +[tool.coverage.report] +show_missing = true +omit = [ "tests/*" ] + +[tool.coverage.run] +plugins = [ "Cython.Coverage" ] + +### Documentation ### + +# [tool.sphinx-autobuild] +# watch = ["woc", "docs"] + + +### Formatting and linting ### + +# [tool.docformatter] +# recursive = true +# 
wrap-summaries = 88 +# wrap-descriptions = 88 +# style = 'sphinx' +# blank = true + +[tool.ruff] +target-version = "py38" +line-length = 90 +extend-exclude = [ + "__pycache__", + "build", + "dist", +] + +[tool.ruff.lint] +# rules ref: https://docs.astral.sh/ruff +extend-select = [ + "C4", + "D201", + "D204", + "D205", + "D206", + "D210", + "D211", + "D213", + "D300", + "D419", + "E", + "F", + "G010", + "I001", + "INP001", + "N805", + "PERF101", + "PERF102", + "PERF401", + "PERF402", + "PGH004", + "PGH005", + "PIE794", + "PIE796", + "PIE807", + "PIE810", + "RUF015", + "RUF100", +] +# disable line length check, as ruff format already handles that +ignore = ["E203", "E501", "E741"] + +[tool.ruff.lint.per-file-ignores] +# loose checks for test files +"tests/**/*.py" = [ + "S101", + "ARG", + "FBT", + "PLR2004", + "S311", + "D", + "F405", + "F403", + "F841" +] diff --git a/requirements.txt b/requirements.txt index 7a33918..0a648d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,48 @@ -python-lzf -typing \ No newline at end of file +chardet==5.2.0 ; python_version >= "3.8" and python_version < "4.0" \ + --hash=sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7 \ + --hash=sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970 +cython==0.29.37 ; python_version >= "3.8" and python_version < "4.0" \ + --hash=sha256:0301d4739c6894e012f1d410052082fdda9e63888c815d9e23e0f7f82fff7d79 \ + --hash=sha256:0544f7a3e4437b89b356baa15387494c18214e03f2ffaddada5a2c71c3dfd24b \ + --hash=sha256:0a0a6d5972bb3b8c7363cf19a42a988bb0c0bb5ebd9c736c84eca85113ccfdbe \ + --hash=sha256:12192ab269e7185720f2d2f8894587bf1da4276db1b9b869e4622a093f18cae6 \ + --hash=sha256:177481b0a7e003e5c49e2bf0dda1d6fe610c239f17642a5da9f18c2ad0c5f6b6 \ + --hash=sha256:2618af0b8df26d32ee4e8858d4ad8167546596762620aeade84954ae37194a0e \ + --hash=sha256:29415d8eb2fdc1ea518ca4810c50a2d062b387d4c9fbcfb3352346e93db22c6d \ + --hash=sha256:2ad634dc77a6a74022881826099eccac19c9b79153942cc82e754ffac2bec116 \ + --hash=sha256:2de3e729d25f041036e81e2f15683dd129f977dfb5b06267e30e8d7acec43225 \ + --hash=sha256:3f87bef1808d255cf13be378c7ad27ae7c6db6df7732217d32428d1daf4109be \ + --hash=sha256:4658499a41255431f6bbdca7e634e9c8d3a4c190bf24b4aa1646dac751d3da4d \ + --hash=sha256:562f8f911dbd6f1a1b9be8f6cba097125700355688f613994ccd4406f220557a \ + --hash=sha256:6c672089fba6a8f6690b8d7924a58c04477771401ad101d53171a13405ee12cb \ + --hash=sha256:6cddb567dadb3aa3e280a8a35e5126030915ea744c2812206e9c194b8881475d \ + --hash=sha256:79ecfc48694e156402c05561e0adb0e25a6e9d35ac0b41693733a08219d38c58 \ + --hash=sha256:852cd4378cbc9ade02f53709107ff9fdad55019a3a636e8a27663ba6cfce10b6 \ + --hash=sha256:8bf38373773f967cfd793997a6fb96cf972d41a9fce987ace5767349d6f15572 \ + --hash=sha256:8c39c2f5a0fe29bb01de9b1fb449bf65bed6f192317c677f181732791c63fe28 \ + --hash=sha256:9450e0766ab65947f8a2a36f9e59079fc879c3807ec936c61725a48c97741a52 \ + --hash=sha256:95f1d6a83ef2729e67b3fa7318c829ce5b07ac64c084cd6af11c228e0364662c \ + --hash=sha256:9a455347e20ddfad0c5dfee32a3e855ee96811269e5fd86be622ddc4cb326404 \ + --hash=sha256:9e68bafeeb97d5a403fb1f7700bd4a55a1f8989824c323ae02ae8a4fcd88f6a1 \ + --hash=sha256:a6164a05440dcd9daa760c6488bc91bdac1380c7b4b3aca38cf307ba66042d54 \ + --hash=sha256:ac910a28a2fd3d280faf3077b6fe63b97a4b93994ff05647581846f0e4b2f8d1 \ + --hash=sha256:af03854571738307a5f30cc6b724081d72db12f907699e7fdfc04c12c839158e \ + --hash=sha256:af8e7b4397620e2d18259a11f3bfa026eff9846657e397d02616962dd5dd035a \ + 
--hash=sha256:b048354fd380278f2fa096e7526973beb6e0491a9d44d7e4e29df52612d25776 \ + --hash=sha256:b225d5e2091c224d4ab328165fef224ba3919b3ed44bd9b3241416f523b4d51a \ + --hash=sha256:b6c48f1032b379135a5b4a31976d6c468e02490688acf9254c6c8ed27bd4cbd4 \ + --hash=sha256:b82584836e9e7c0d6effee976595e5cd7fa88dbef3e96e900187983c1d4637d1 \ + --hash=sha256:bbce388431a2608a81c8ab13cb14c50611473843ca766031b8b24bb1723faf79 \ + --hash=sha256:c33508ede9172a6f6f99d5a6dadc7fee23c840423b411ef8b5a403c04e530297 \ + --hash=sha256:cc1b9ce2b73b9ee8c305e06173b35c7c202d4b82d084a0cd73dcedfd6d310aec \ + --hash=sha256:d94caf90ae9cb56116ca6d54cdcbccd3c4df6b0cb7233922b2233ee7fe81d05b \ + --hash=sha256:e14cd44c830e53cf9d7269c87a6bcc638bb065ec07e24990e338162c7001d3c3 \ + --hash=sha256:e841a8b4f9ceefb2916e32dac4f28a895cd519e8ece71505144da1ee355c548a \ + --hash=sha256:e8af5975ecfae254d8c0051204fca995dda8f93cf9f0bbf7571e3cda2b0cef4d \ + --hash=sha256:ea6d208be1906c5df25b674777d5905c6d8e9ef0b201b830849e0729ba08caba \ + --hash=sha256:f2d621fe4cb50007446742134a890500b34e3f50abaf7993baaca02634af7e15 \ + --hash=sha256:f813d4a6dd94adee5d4ff266191d1d95bf6d4164a4facc535422c021b2504cfb \ + --hash=sha256:fa5b6a0f69bf1823c9fd038fa77a2568b78fda2de045a95b48a71dee4d0d578f \ + --hash=sha256:fe0eaf6b1e9ee97c5ee7bfc943f00e36cf59d929db16886cb018352bff8208da +python-lzf==0.2.4 ; python_version >= "3.8" and python_version < "4.0" \ + --hash=sha256:d1420f1544e612ef1bb41ce0f1d14c2964b3444612f1468f85a886caff3615d1 diff --git a/setup.py b/setup.py index 3061936..ea88c27 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,16 @@ import os -from setuptools import setup, Extension, find_packages + from Cython.Build import cythonize +# Thanks to @tryptofame for proposing an updated snippet +from Cython.Compiler.Options import get_directive_defaults +from setuptools import Extension, find_packages, setup + +directive_defaults = get_directive_defaults() + +directive_defaults["linetrace"] = True +directive_defaults["binding"] = True + ROOT = os.path.abspath(os.path.dirname(__file__)) PACKAGE_ROOT = os.path.join(ROOT, "woc") TCH_ROOT = os.path.join(ROOT, "lib") @@ -9,33 +18,36 @@ ext_modules = [ Extension( - 'woc.local', - libraries=['bz2', 'z'], - include_dirs=['lib'], + "woc.local", + libraries=["bz2", "z"], + include_dirs=["lib"], sources=[ - os.path.join(PACKAGE_ROOT, 'local.pyx'), - ], - extra_compile_args=['-std=gnu11'] + os.path.join(PACKAGE_ROOT, "local.pyx"), + ], + extra_compile_args=["-std=gnu11"], + define_macros=[("CYTHON_TRACE", "1")], ), Extension( - 'woc.tch', - libraries=['bz2', 'z'], - include_dirs=['lib'], + "woc.tch", + libraries=["bz2", "z"], + include_dirs=["lib"], sources=[ - os.path.join(PACKAGE_ROOT, 'tch.pyx'), - os.path.join(TCH_ROOT, 'tchdb.c'), - os.path.join(TCH_ROOT, 'myconf.c'), - os.path.join(TCH_ROOT, 'tcutil.c'), - os.path.join(TCH_ROOT, 'md5.c') - ], - extra_compile_args=['-std=gnu11'] + os.path.join(PACKAGE_ROOT, "tch.pyx"), + os.path.join(TCH_ROOT, "tchdb.c"), + os.path.join(TCH_ROOT, "myconf.c"), + os.path.join(TCH_ROOT, "tcutil.c"), + os.path.join(TCH_ROOT, "md5.c"), + ], + extra_compile_args=["-std=gnu11"], + define_macros=[("CYTHON_TRACE", "1")], ), ] setup( name="python-woc", - ext_modules=cythonize(ext_modules), + ext_modules=cythonize(ext_modules, emit_linenums=True), packages=PACKAGES, package_data={"": ["*.pyx", "*.pxd", "*.pxi"]}, include_package_data=True, -) \ No newline at end of file + script_args=["build_ext", "--inplace"], +) diff --git a/setup.py.bak b/setup.py.bak deleted file mode 100644 index 
8d80e99..0000000 --- a/setup.py.bak +++ /dev/null @@ -1,51 +0,0 @@ - -# cython: language_level=3str -import re -from setuptools import Extension, setup - -# IMPORTANT: update oscar.pyxbld if changing any of the Extension parameters -extensions = [ - Extension( - 'oscar', libraries=['bz2', 'z'], include_dirs=['lib'], - sources=['oscar.pyx', - 'lib/tchdb.c', 'lib/myconf.c', 'lib/tcutil.c', 'lib/md5.c'], extra_compile_args=['-std=gnu11'] - ), -] - -head = open('oscar.pyx').read(2048) -pattern = r"""__%s__\s*=\s*['"]([^'"]*)['"]""" -kwargs = {keyword: re.search(pattern % keyword, head).group(1) - for keyword in ('version', 'author', 'license')} - -requirements = [ - line.strip() - for line in open('requirements.txt') - if line.strip() and not line.strip().startswith('#')] - -# options reference: https://docs.python.org/2/distutils/ -# see also: https://packaging.python.org/tutorials/distributing-packages/ -setup( - name='oscar', - description='A Python interface to OSCAR data', - long_description=open('README.md').read(), - long_description_content_type='text/markdown', - classifiers=[ # full list: https://pypi.org/classifiers/ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Science/Research', - 'Programming Language :: Cython', - 'License :: OSI Approved :: GNU General Public License v3 (GPLv3)', - 'Operating System :: POSIX :: Linux', - 'Topic :: Scientific/Engineering' - ], - # since setuptools 18.0 it is possible to pass Cython sources to extensions - # without `cythonize` - # https://stackoverflow.com/questions/37471313 - setup_requires=['setuptools>=18.0', 'cython'], - python_requires='>2.6, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4', - # py_modules=['oscar.timeline'], - ext_modules=extensions, - author_email=kwargs['author'], - url='https://github.com/ssc-oscar/oscar.py', - install_requires=requirements, - **kwargs -) diff --git a/tests/fixtures/blob_127.bin b/tests/fixtures/__init__.py similarity index 100% rename from tests/fixtures/blob_127.bin rename to tests/fixtures/__init__.py diff --git a/tests/fixtures/a2cFullR.1.tch b/tests/fixtures/a2cFullR.1.tch new file mode 100644 index 0000000..284a9a9 Binary files /dev/null and b/tests/fixtures/a2cFullR.1.tch differ diff --git a/tests/fixtures/a2fFullR.1.tch b/tests/fixtures/a2fFullR.1.tch new file mode 100644 index 0000000..9a7a25f Binary files /dev/null and b/tests/fixtures/a2fFullR.1.tch differ diff --git a/tests/fixtures/sha1.blob_125.tch b/tests/fixtures/b2cFullR.1.tch similarity index 99% rename from tests/fixtures/sha1.blob_125.tch rename to tests/fixtures/b2cFullR.1.tch index 1410acb..20b9606 100644 Binary files a/tests/fixtures/sha1.blob_125.tch and b/tests/fixtures/b2cFullR.1.tch differ diff --git a/tests/fixtures/b2cFullR.31.tch.large.3f2eca18f1bc0f3117748e2cea9251e5182db2f7 b/tests/fixtures/b2cFullR.31.tch.large.3f2eca18f1bc0f3117748e2cea9251e5182db2f7 new file mode 100644 index 0000000..abe1b9a Binary files /dev/null and b/tests/fixtures/b2cFullR.31.tch.large.3f2eca18f1bc0f3117748e2cea9251e5182db2f7 differ diff --git a/tests/fixtures/sha1.blob_3.tch b/tests/fixtures/b2fFullR.1.tch similarity index 99% rename from tests/fixtures/sha1.blob_3.tch rename to tests/fixtures/b2fFullR.1.tch index e1c1e13..f3b25b5 100644 Binary files a/tests/fixtures/sha1.blob_3.tch and b/tests/fixtures/b2fFullR.1.tch differ diff --git a/tests/fixtures/b2faFullU.1.tch b/tests/fixtures/b2faFullU.1.tch new file mode 100644 index 0000000..2a5b3ee Binary files /dev/null and b/tests/fixtures/b2faFullU.1.tch differ diff --git 
a/tests/fixtures/b2tacFullU.1.tch b/tests/fixtures/b2tacFullU.1.tch new file mode 100644 index 0000000..dcc5a77 Binary files /dev/null and b/tests/fixtures/b2tacFullU.1.tch differ diff --git a/tests/fixtures/blob_70.bin b/tests/fixtures/blob_0.bin similarity index 100% rename from tests/fixtures/blob_70.bin rename to tests/fixtures/blob_0.bin diff --git a/tests/fixtures/blob_1.bin b/tests/fixtures/blob_1.bin new file mode 100644 index 0000000..31bd3c0 Binary files /dev/null and b/tests/fixtures/blob_1.bin differ diff --git a/tests/fixtures/blob_125.bin b/tests/fixtures/blob_125.bin deleted file mode 100644 index 6e4a5b3..0000000 Binary files a/tests/fixtures/blob_125.bin and /dev/null differ diff --git a/tests/fixtures/blob_3.bin b/tests/fixtures/blob_3.bin deleted file mode 100644 index 646089c..0000000 Binary files a/tests/fixtures/blob_3.bin and /dev/null differ diff --git a/tests/fixtures/blob_35.bin b/tests/fixtures/blob_35.bin deleted file mode 100644 index d8edeec..0000000 Binary files a/tests/fixtures/blob_35.bin and /dev/null differ diff --git a/tests/fixtures/sha1.blob_70.tch b/tests/fixtures/c2bFullR.0.tch similarity index 99% rename from tests/fixtures/sha1.blob_70.tch rename to tests/fixtures/c2bFullR.0.tch index 2e46abf..a4cfea6 100644 Binary files a/tests/fixtures/sha1.blob_70.tch and b/tests/fixtures/c2bFullR.0.tch differ diff --git a/tests/fixtures/c2datFullU.0.tch b/tests/fixtures/c2datFullU.0.tch new file mode 100644 index 0000000..db8a423 Binary files /dev/null and b/tests/fixtures/c2datFullU.0.tch differ diff --git a/tests/fixtures/sha1.blob_35.tch b/tests/fixtures/c2fFullR.0.tch similarity index 99% rename from tests/fixtures/sha1.blob_35.tch rename to tests/fixtures/c2fFullR.0.tch index 8c3d3d2..98281be 100644 Binary files a/tests/fixtures/sha1.blob_35.tch and b/tests/fixtures/c2fFullR.0.tch differ diff --git a/tests/fixtures/c2pFullU.0.tch b/tests/fixtures/c2pFullU.0.tch new file mode 100644 index 0000000..c2b5718 Binary files /dev/null and b/tests/fixtures/c2pFullU.0.tch differ diff --git a/tests/fixtures/c2rFullR.0.tch b/tests/fixtures/c2rFullR.0.tch new file mode 100644 index 0000000..1239720 Binary files /dev/null and b/tests/fixtures/c2rFullR.0.tch differ diff --git a/tests/fixtures/c2taFullR.0.tch b/tests/fixtures/c2taFullR.0.tch new file mode 100644 index 0000000..9ff541d Binary files /dev/null and b/tests/fixtures/c2taFullR.0.tch differ diff --git a/tests/fixtures/commit_0.tch b/tests/fixtures/commit_0.tch index 271e73e..8308141 100644 Binary files a/tests/fixtures/commit_0.tch and b/tests/fixtures/commit_0.tch differ diff --git a/tests/fixtures/commit_114.tch b/tests/fixtures/commit_1.tch similarity index 99% rename from tests/fixtures/commit_114.tch rename to tests/fixtures/commit_1.tch index efd77b8..2db11b6 100644 Binary files a/tests/fixtures/commit_114.tch and b/tests/fixtures/commit_1.tch differ diff --git a/tests/fixtures/commit_127.tch b/tests/fixtures/commit_127.tch deleted file mode 100644 index e69de29..0000000 diff --git a/tests/fixtures/create_fixtures.py b/tests/fixtures/create_fixtures.py old mode 100755 new mode 100644 index 4a9a404..874f99a --- a/tests/fixtures/create_fixtures.py +++ b/tests/fixtures/create_fixtures.py @@ -1,10 +1,18 @@ -#!/usr/bin/python2 -# IMPORTANT: this script requires python-tokyocabinet, which is Py2 only +import gzip +import json +from typing import List, Union -import binascii -from collections import defaultdict +try: + import lzf -from tokyocabinet import hash as tch + assert lzf.decompress +except ImportError or 
AssertionError: + raise ImportError( + "python-lzf is required to decompress LZF-compressed data: `pip install python-lzf`" + ) + +from woc.local import * +from woc.tch import TCHashDB def ber(*numbers): @@ -13,122 +21,220 @@ def gen(): a = True while a: a, num = divmod(num, 128) - yield chr(a + 0x80 if a else num) - return b''.join(gen()) - - -def unber(s): - res = [] - acc = 0 - for char in s: - b = ord(char) - acc = (acc << 7) + (b & 0x7f) - if not b & 0x80: - res.append(acc) - acc = 0 - return res - - -def shas2prefixes(shas, max_prefix): - # type: (Iterable[str], int) -> Dict[int, str] - prefixes = defaultdict(list) - for sha in shas: - key = binascii.unhexlify(sha) - prefixes[ord(key[0]) & max_prefix].append(key) - return prefixes - - -def create_fixture(shas, input_path, key_length=7, num_records=1000): - # type: (Iterable[str], str, int) -> None - """ Create fixtures for local testing. - Object type is implicitly given in input_fmask - just copy the data, object - structure is not relevant for fixture preparation purposes - - Special cases to handle: - - create a placeholder for max prefix to make key length calculation work - - prefix 0 .tch should contain num_records - - the same prefix 0 .tch should contain a predefined key: value, - b'test_key' -> b'\x00\x01\x02\x03' - - """ - max_prefix = 2**key_length - 1 - prefixes = shas2prefixes(shas, max_prefix) - output_path = input_path.rsplit('/', 1)[-1] - - # - create a placeholder for max prefix to make key length calculation work - with open(output_path.format(key=max_prefix), 'wb') as _: - pass - # - prefix 0 .tch should contain num_records - get enough keys - db = tch.Hash(input_path.format(key=0), tch.HDBOREADER | tch.HDBONOLCK) - # -1 is to reserve a record for the predefined key: value - prefixes[0].extend(db.fwmkeys('')[:num_records-len(prefixes[0]) - 1]) + yield (a + 0x80 if a else num).to_bytes(1, "big") + + return b"".join(gen()) + + +def encode_value(value, dtype: str) -> bytes: + if dtype == "h": # type: list[str] + return b"".join(bytes.fromhex(v) for v in value) + elif dtype == "sh": # type: tuple[str, str, str] + Time, Author, cmt_sha = value + buf0 = f"{Time};{Author}".encode() + cmt_sha_bytes = bytes.fromhex(cmt_sha) + return buf0 + cmt_sha_bytes + elif dtype == "cs3": # type: list[tuple[str, str, str]] + _joined = ";".join(f"{t[0]};{t[1]};{t[2]}" for t in value) + data = _joined.encode + return lzf.compress(data) + elif dtype == "cs": # type: list[str] + _joined = ";".join(v.encode() for v in value if v) + return lzf.compress(_joined.encode()) + elif dtype == "s": # type: list[str] + return b";".join(v.encode() for v in value) + elif dtype == "r": # type: list[str, int] + _hex, _len = value + return bytes.fromhex(_hex) + ber(_len) + elif dtype == "hhwww": + raise NotImplementedError + raise ValueError(f"Unsupported dtype: {dtype}") + + +def write_to_tch( + key: bytes, value: bytes, shards: List[str], sharding_bits: int, use_fnv_keys: bool +): + shard = get_shard(key, sharding_bits, use_fnv_keys) + _path = shards[shard] + db = TCHashDB(_path) + db[key] = value db.close() - for prefix, keys in prefixes.items(): - db = tch.Hash(output_path.format(key=prefix), - tch.HDBOCREAT | tch.HDBOWRITER) - data_db = tch.Hash(input_path.format(key=prefix), - tch.HDBOREADER | tch.HDBONOLCK) - for key in keys: - db.put(key, data_db[key]) - - # prefix 0 .tch should contain a predefined key: value - if not prefix: - db.put(b'test_key', b'\x00\x01\x02\x03') - db.close() - data_db.close() - - -def create_blob_fixture(shas, key_length=7): 
- max_prefix = 2**key_length - 1 - prefixes = shas2prefixes(shas, max_prefix) - - blob_content = b'*.egg-info/\ndist/\nbuild/\n*.pyc\n*.mo\n*.gz\n' - - offset_input_path = '/fast/All.sha1o/sha1.blob_{key}.tch' - offset_output_path = offset_input_path.rsplit('/', 1)[-1] - data_input_path = '/da4_data/All.blobs/blob_{key}.bin' - data_output_path = data_input_path.rsplit('/', 1)[-1] - - with open(offset_output_path.format(key=max_prefix), 'wb') as _: - pass - with open(data_output_path.format(key=max_prefix), 'wb') as _: - pass - - for prefix, keys in prefixes.items(): - offset_out = tch.Hash(offset_output_path.format(key=prefix), - tch.HDBOCREAT | tch.HDBOWRITER) - data_out = open(data_output_path.format(key=prefix), 'wb') - offset_in = tch.Hash(offset_input_path.format(key=prefix), - tch.HDBOREADER | tch.HDBONOLCK) - data_in = open(data_input_path.format(key=prefix), 'rb') - - pos = 0 - for key in keys: - offset, length = unber(offset_in[key]) - data_in.seek(offset, 0) - blob_data = data_in.read(length) - data_out.write(blob_data) - offset_out.put(key, ber(pos, length)) - pos += length - - data_out.close() - offset_out.close() - - -def main(): - # only 83d22195edc1473673f1bf35307aea6edf3c37e3 is actually used: - create_blob_fixture([u'234a57538f15d72f00603bf086b465b0f2cda7b5', - u'83d22195edc1473673f1bf35307aea6edf3c37e3', - u'fda94b84122f6f36473ca3573794a8f2c4f4a58c', - u'46aaf071f1b859c5bf452733c2583c70d92cd0c8']) - create_fixture([u'd4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d'], - '/fast/All.sha1c/tree_{key}.tch') - create_fixture([u'f2a7fcdc51450ab03cb364415f14e634fa69b62c', - u'e38126dbca6572912013621d2aa9e6f7c50f36bc', - u'1cc6f4418dcc09f64dcbb0410fec76ceaa5034ab'], - '/fast/All.sha1c/commit_{key}.tch') - - -if __name__ == '__main__': - main() + +def write_large(path: str, key: bytes, value: bytes, dtype: str): + if dtype == "h": + with open(path, "wb") as f: + f.write(key) + f.write(value[:160]) + else: + # use zlib to decompress + with gzip.open(path, "wb") as f: + f.write(key) + f.write(b"\n") + # run a fast scan to find idx of 3rd ';' in value + idx = 0 + for _ in range(3): + idx = value.find(b";", idx + 1) + f.write(value[:idx]) + + +class WocMapsCopier(WocMapsLocal): + def __init__(self, config1, config2): + super().__init__(config1) + with open(config2) as f: + self.config2 = json.load(f) + + def copy_values(self, map_name, key): + """One large file can only contain one record""" + value, _ = self._get_tch_bytes(map_name, key) + + if map_name in self.config2["maps"]: + _map = self.config2["maps"][map_name][0] + elif map_name in self.config2["objects"]: + _map = self.config2["objects"][map_name] + else: + raise KeyError( + f'Invalid map name: {map_name}, ' + f'expect one of {", ".join(self.config2["maps"].keys())}' + ) + + if _map["dtypes"][0] == "h": + if isinstance(key, str): + _hex = key + key = bytes.fromhex(key) + else: + _hex = bytes(key).hex() + else: + assert isinstance(key, str), "key must be a string for non-hash keys" + _hex = hex(fnvhash(key.encode("utf-8")))[2:] + key = key.encode("utf-8") + + if "larges" in _map and _hex in _map["larges"]: + print( + "writing large", + _map["larges"][_hex], + "key", + key, + "dtype", + _map["dtypes"][1], + ) + return write_large(_map["larges"][_hex], key, value, _map["dtypes"][1]) + else: + # use fnv hash as shading idx if key is not a git sha + print("writing to tch", key, _map["sharding_bits"], _map["dtypes"][0] != "h") + return write_to_tch( + key, + value, + _map["shards"], + _map["sharding_bits"], + _map["dtypes"][0] != "h", + ) 
+ + def copy_content(self, obj: str, key: Union[bytes, str]): + """One blob shard can only contain one record""" + value, _ = self._get_tch_bytes(obj, key) + + if obj == "tree": + _map_obj = self.config2["objects"]["tree.tch"] + print("writing to tch", key, _map_obj["sharding_bits"]) + write_to_tch( + bytes.fromhex(key), + value, + _map_obj["shards"], + _map_obj["sharding_bits"], + use_fnv_keys=False, + ) + + elif obj == "commit": + _map_obj = self.config2["objects"]["commit.tch"] + print("writing to tch", key, _map_obj["sharding_bits"]) + write_to_tch( + bytes.fromhex(key), + value, + _map_obj["shards"], + _map_obj["sharding_bits"], + use_fnv_keys=False, + ) + + elif obj == "blob": + # read blob + key = bytes.fromhex(key) if isinstance(key, str) else key + offset, length = self._get_pos("blob", key) + _map_obj = self.config["objects"]["blob.bin"] + shard = get_shard(key, _map_obj["sharding_bits"], use_fnv_keys=False) + with open(_map_obj["shards"][shard], "rb") as f: + f.seek(offset) + _v = f.read(length) + # write tch + _map_obj = self.config2["objects"]["sha1.blob.tch"] + _idx = ber(0, length) + print("writing to tch", key, _map_obj["sharding_bits"]) + write_to_tch( + key, + _idx, + _map_obj["shards"], + _map_obj["sharding_bits"], + use_fnv_keys=False, + ) + # write blob + _map_obj = self.config2["objects"]["blob.bin"] + shard = get_shard(key, _map_obj["sharding_bits"], use_fnv_keys=False) + print("writing to file", _map_obj["shards"][shard], length) + with open(_map_obj["shards"][shard], "ab") as f: + f.write(_v) + + else: + raise ValueError( + f"Unsupported object type: {obj}, expected one of tree, blob, commit" + ) + + +if __name__ == "__main__": + import glob + import os + + for f in glob.glob("./tests/fixtures/*.tch*") + glob.glob("./tests/fixtures/*.bin"): + print("removing", f) + os.remove(f) + + cp = WocMapsCopier("./wocprofile.json", "./tests/test_profile.json") + cp.copy_values("c2p", "e4af89166a17785c1d741b8b1d5775f3223f510f") + cp.copy_values("c2dat", "e4af89166a17785c1d741b8b1d5775f3223f510f") + cp.copy_values("c2ta", "e4af89166a17785c1d741b8b1d5775f3223f510f") + cp.copy_values("b2tac", "05fe634ca4c8386349ac519f899145c75fff4169") + cp.copy_values("p2a", "ArtiiQ_PocketMine-MP") + cp.copy_values("b2c", "05fe634ca4c8386349ac519f899145c75fff4169") + cp.copy_values("b2c", "3f2eca18f1bc0f3117748e2cea9251e5182db2f7") # large + cp.copy_values("a2c", "Audris Mockus ") + # cp.copy_values('c2cc', 'e4af89166a17785c1d741b8b1d5775f3223f510f') # null + cp.copy_values("a2f", "Audris Mockus ") + cp.copy_values("c2f", "e4af89166a17785c1d741b8b1d5775f3223f510f") + cp.copy_values("c2b", "e4af89166a17785c1d741b8b1d5775f3223f510f") + cp.copy_values("p2c", "ArtiiQ_PocketMine-MP") + cp.copy_values("f2a", "youtube-statistics-analysis.pdf") + cp.copy_values("b2f", "05fe634ca4c8386349ac519f899145c75fff4169") + cp.copy_values("c2r", "e4af89166a17785c1d741b8b1d5775f3223f510f") + cp.copy_values("b2fa", "05fe634ca4c8386349ac519f899145c75fff4169") + cp.copy_content("tree", "f1b66dcca490b5c4455af319bc961a34f69c72c2") + cp.copy_content("commit", "e4af89166a17785c1d741b8b1d5775f3223f510f") + cp.copy_content("blob", "05fe634ca4c8386349ac519f899145c75fff4169") + cp.copy_content("blob", "46aaf071f1b859c5bf452733c2583c70d92cd0c8") + # woc-hack_thebridge + cp.copy_values("p2c", "woc-hack_thebridge") + cp.copy_content("commit", "0d8228bb25ce89c7e731c7410bc8c5a4e2636e52") + cp.copy_content("commit", "34a8662a4f31dacb923e39ae6792f6fc4476a939") + cp.copy_content("commit", "898d5a21241aaf16acf92566aa34103d06cf2ac6") + 
cp.copy_content("commit", "91f4da4c173e41ffbf0d9ecbe2f07f3a3296933c") + cp.copy_content("commit", "ae6e15fa4d8d4d454977ddbb4e97e922ddecebf7") + cp.copy_content("commit", "f249b14a111279faa8d65c29ecf46bb6ce59a139") + cp.copy_content("tree", "706aa4dedb560358bff21c3120a0b09532d3484d") + cp.copy_content("tree", "3ccf6f8320740a1afec68b38b3b9ba46cedef368") + cp.copy_content("tree", "e5798457aebae7c84eff7b80b50c3a938cc4cb63") + cp.copy_content("tree", "836f04d5b374033b1608269e2f3aaabae263a0db") + cp.copy_content("tree", "f54cb5527226aa2096307c08e15c62248b98f763") + cp.copy_content("tree", "da65e1401d11a955686b8a49e46b9a457f3febab") + cp.copy_content("tree", "a28f1558be9867d35cc1fa17477565c08786cf83") + cp.copy_content("tree", "4db2ad30097924cbe5da9c0f2c49350fdc19c3a4") + cp.copy_content("tree", "1cf86145b4a9492ebbe0fa640638504946315ca6") + cp.copy_content("tree", "29a422c19251aeaeb907175e9b3219a9bed6c616") + cp.copy_content("tree", "51968a7a4e67fd2696ffd5ccc041560a4d804f5d") diff --git a/tests/fixtures/commit_99.tch b/tests/fixtures/f2aFullR.1.tch similarity index 99% rename from tests/fixtures/commit_99.tch rename to tests/fixtures/f2aFullR.1.tch index 6cd197a..557795c 100644 Binary files a/tests/fixtures/commit_99.tch and b/tests/fixtures/f2aFullR.1.tch differ diff --git a/tests/fixtures/p2aFullR.1.tch b/tests/fixtures/p2aFullR.1.tch new file mode 100644 index 0000000..a876a5c Binary files /dev/null and b/tests/fixtures/p2aFullR.1.tch differ diff --git a/tests/fixtures/p2cFullR.1.tch b/tests/fixtures/p2cFullR.1.tch new file mode 100644 index 0000000..079c4e8 Binary files /dev/null and b/tests/fixtures/p2cFullR.1.tch differ diff --git a/tests/fixtures/sha1.blob.tch b/tests/fixtures/sha1.blob.tch new file mode 100644 index 0000000..b2366e0 Binary files /dev/null and b/tests/fixtures/sha1.blob.tch differ diff --git a/tests/fixtures/sha1.blob_127.tch b/tests/fixtures/sha1.blob_127.tch deleted file mode 100644 index e69de29..0000000 diff --git a/tests/fixtures/tree_0.tch b/tests/fixtures/tree_0.tch index 808e0e5..3a1b5ee 100644 Binary files a/tests/fixtures/tree_0.tch and b/tests/fixtures/tree_0.tch differ diff --git a/tests/fixtures/commit_28.tch b/tests/fixtures/tree_1.tch similarity index 99% rename from tests/fixtures/commit_28.tch rename to tests/fixtures/tree_1.tch index 1e407ad..4b67e5b 100644 Binary files a/tests/fixtures/commit_28.tch and b/tests/fixtures/tree_1.tch differ diff --git a/tests/fixtures/tree_127.tch b/tests/fixtures/tree_127.tch deleted file mode 100644 index e69de29..0000000 diff --git a/tests/fixtures/tree_84.tch b/tests/fixtures/tree_84.tch deleted file mode 100644 index 6c7acfd..0000000 Binary files a/tests/fixtures/tree_84.tch and /dev/null differ diff --git a/tests/integration_test.py b/tests/integration_test.py deleted file mode 100755 index 6604097..0000000 --- a/tests/integration_test.py +++ /dev/null @@ -1,116 +0,0 @@ -#!python3 -""" -Unit tests - only to check functions do what they are expected to do. -Please avoid checking the integrity of the dataset. 
-""" - -import unittest - -from oscar import * - - -class TestBlob(unittest.TestCase): - def test_commits_shas(self): - # setup.py from minicms - used in at least couple commits - blob = Blob('46aaf071f1b859c5bf452733c2583c70d92cd0c8') - self.assertGreater(len(blob.commit_shas), 1) - self.assertIsInstance(blob.commit_shas[0], (str, bytes)) - - -class TestTree(unittest.TestCase): - # there are no relations in this class, everything is covered by unit tests - pass - - -class TestCommit(unittest.TestCase): - def test_projects(self): - # a commit in numpy from Oct 2009 - present in over 3k projects - c = Commit('4fb4c64cae2ce1ba16082d918e94e845fa2c87f3') - self.assertGreater(len(c.project_names), 3000) - self.assertIsInstance(c.project_names[0], (str, bytes)) - self.assertTrue(any(pname.endswith(b'numpy') - for pname in c.project_names)) - - def test_children(self): - # minicms commit with two children - c = Commit('a443e1e76c39c7b1ad6f38967a75df667b9fed57') - self.assertGreater(len(c.child_shas), 1) - self.assertIsInstance(c.child_shas[0], (str, bytes)) - - def test_changed_files(self): - c = Commit('f2a7fcdc51450ab03cb364415f14e634fa69b62c') - # 3 files changed, 1 deleted - self.assertGreater(len(c.changed_file_names), 2) - self.assertIsInstance(c.changed_file_names[0], (str, bytes)) - - -class TestProject(unittest.TestCase): - def test_commits(self): - p = Project(b'user2589_minicms') - self.assertGreater(len(p.commit_shas), 30) - self.assertIsInstance(p.commit_shas[0], (str, bytes)) - - def test_commits_fp(self): - p = Project(b'user2589_minicms') - commits = set(c.bin_sha for c in p.commits_fp) - self.assertGreater(len(commits), 2) - self.assertLessEqual(len(commits), len(p.commit_shas)) - self.assertTrue(commits.issubset(p.commit_shas)) - - def test_in(self): - p = Project(b'user2589_minicms') - c = Commit('f2a7fcdc51450ab03cb364415f14e634fa69b62c') - self.assertIn(c, p) - self.assertIn(c.sha, p) - self.assertIn(c.bin_sha, p) - - def test_head(self): - self.assertEqual( - Project(b'user2589_minicms').head, - Commit('f2a7fcdc51450ab03cb364415f14e634fa69b62c')) - self.assertEqual( - Project('RoseTHERESA_SimpleCMS').head, - Commit('a47afa002ccfd3e23920f323b172f78c5c970250')) - - def test_tail(self): - self.assertEqual( - Project(b'user2589_minicms').tail, - binascii.unhexlify('1e971a073f40d74a1e72e07c682e1cba0bae159b')) - - def test_authors(self): - p = Project(b'user2589_minicms') - self.assertGreater(len(p.author_names), 1) - self.assertIsInstance(p.author_names[0], (str, bytes)) - - -class TestFile(unittest.TestCase): - def test_authors(self): - f = File(b'minicms/templates/minicms/tags/breadcrumbs.html') - self.assertGreater(len(f.author_names), 1) - self.assertIsInstance(f.author_names[0], (str, bytes)) - - def test_commits(self): - f = File(b'minicms/templates/minicms/tags/breadcrumbs.html') - self.assertGreater(len(f.commit_shas), 1) - self.assertIsInstance(f.commit_shas[0], (str, bytes)) - - -class TestAuthor(unittest.TestCase): - def test_commits(self): - a = Author(b'user2589 ') - self.assertGreater(len(a.commit_shas), 40) - self.assertIsInstance(a.commit_shas[0], (str, bytes)) - - def test_files(self): - a = Author(b'user2589 ') - self.assertGreater(len(a.file_names), 10) - self.assertIsInstance(a.file_names[0], (str, bytes)) - - def test_projects(self): - a = Author(b'user2589 ') - self.assertGreater(len(a.project_names), 10) - self.assertIsInstance(a.project_names[0], (str, bytes)) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/local_test.env 
b/tests/local_test.env deleted file mode 100644 index f72f48b..0000000 --- a/tests/local_test.env +++ /dev/null @@ -1,7 +0,0 @@ -# no shebang line because this file is only expected to be `source`-ed - -export OSCAR_TEST=1 -# it would be cool to set relative to `$(dirname $0)`, but it is sourced -export OSCAR_ALL_BLOBS="tests/fixtures" -export OSCAR_ALL_SHA1C="tests/fixtures" -export OSCAR_ALL_SHA1O="tests/fixtures" diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..03468e2 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,188 @@ +import subprocess + + +def run_show_content(commit_hash, *args): + result = subprocess.run( + ["python3", "-m", "woc.show_content", *args, "-p", "./tests/test_profile.json"], + capture_output=True, + text=True, + input=commit_hash, + ) + return result.returncode, result.stdout.strip(), result.stderr.strip() + + +def run_get_values(input_str, *args): + result = subprocess.run( + ["python3", "-m", "woc.get_values", *args, "-p", "./tests/test_profile.json"], + capture_output=True, + text=True, + input=input_str, + ) + return result.returncode, result.stdout.strip(), result.stderr.strip() + + +def test_cli_commit(): + commit_hash = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output = ( + "e4af89166a17785c1d741b8b1d5775f3223f510f;" + "f1b66dcca490b5c4455af319bc961a34f69c72c2;" + "c19ff598808b181f1ab2383ff0214520cb3ec659;" + "Audris Mockus ;" + "Audris Mockus ;1410029988;1410029988" + ) + actual_output = run_show_content(commit_hash, "commit") + assert actual_output[1] == expected_output, actual_output + + +def test_cli_commit_1(): + commit_hash = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output = ( + "e4af89166a17785c1d741b8b1d5775f3223f510f;1410029988;" + "Audris Mockus " + ) + actual_output = run_show_content(commit_hash, "commit", "1") + assert actual_output[1] == expected_output, actual_output + + +def test_cli_commit_2(): + commit_hash = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output = ( + "e4af89166a17785c1d741b8b1d5775f3223f510f;Audris Mockus ;" + "1410029988;-0400;News for Sep 5" + ) + actual_output = run_show_content(commit_hash, "commit", "2") + assert actual_output[1] == expected_output, actual_output + + +def test_cli_commit_3(): + commit_hash = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output = """tree f1b66dcca490b5c4455af319bc961a34f69c72c2 +parent c19ff598808b181f1ab2383ff0214520cb3ec659 +author Audris Mockus 1410029988 -0400 +committer Audris Mockus 1410029988 -0400 + +News for Sep 5""" + actual_output = run_show_content(commit_hash, "commit", "3") + assert actual_output[1] == expected_output, actual_output + + +def test_cli_commit_4(): + commit_hash = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output = ( + "e4af89166a17785c1d741b8b1d5775f3223f510f;Audris Mockus " + ) + actual_output = run_show_content(commit_hash, "commit", "4") + assert actual_output[1] == expected_output, actual_output + + +def test_cli_commit_5(): + commit_hash = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output = "e4af89166a17785c1d741b8b1d5775f3223f510f;c19ff598808b181f1ab2383ff0214520cb3ec659" + actual_output = run_show_content(commit_hash, "commit", "5") + assert actual_output[1] == expected_output, actual_output + + +def test_cli_commit_6(): + commit_hash = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output_end = "c19ff598808b181f1ab2383ff0214520cb3ec659" + actual_output = run_show_content(commit_hash, "commit", "6") + assert 
actual_output[1].endswith(expected_output_end), actual_output + + +def test_cli_commit_7(): + commit_hash = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output_end = "QHV0ay5lZHU+IDE0MTAwMjk5ODggLTA0MDAKCk5ld3MgZm9yIFNlcCA1\\n" + actual_output = run_show_content(commit_hash, "commit", "7") + assert actual_output[1].endswith(expected_output_end), actual_output + + +def test_cli_tree(): + tree_hash = "f1b66dcca490b5c4455af319bc961a34f69c72c2" + expected_output = "100644;05fe634ca4c8386349ac519f899145c75fff4169;README.md\n100644;dfcd0359bfb5140b096f69d5fad3c7066f101389;course.pdf" + actual_output = run_show_content(tree_hash, "tree") + assert actual_output[1] == expected_output, actual_output + + +def test_cli_blob(): + blob_hash = "05fe634ca4c8386349ac519f899145c75fff4169" + expected_output_start = '# Syllabus for "Fundamentals of Digital Archeology"\n\n## News\n\n* Assignment1 due Monday Sep 8 before 2:30PM' + actual_output = run_show_content(blob_hash, "blob") + assert actual_output[1].startswith(expected_output_start), actual_output + + +def test_cli_a2c(): + input_str = "Audris Mockus " + expected_output_start = ( + "Audris Mockus ;001ec7302de3b07f32669a1f1faed74585c8a8dc" + ) + actual_output = run_get_values(input_str, "a2c") + assert actual_output[1].startswith(expected_output_start), actual_output + + +def test_cli_a2f(): + input_str = "Audris Mockus " + expected_output_start = ( + "Audris Mockus ;.#analyze.sh;.README.md.swp;.Rhistory;.bowerrc" + ) + actual_output = run_get_values(input_str, "a2f") + assert actual_output[1].startswith(expected_output_start), actual_output + + +def test_cli_b2c(): + input_str = "05fe634ca4c8386349ac519f899145c75fff4169" + expected_output = "05fe634ca4c8386349ac519f899145c75fff4169;e4af89166a17785c1d741b8b1d5775f3223f510f" + actual_output = run_get_values(input_str, "b2c") + assert actual_output[1] == expected_output, actual_output + + +def test_cli_c2b(): + input_str = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output = "e4af89166a17785c1d741b8b1d5775f3223f510f;05fe634ca4c8386349ac519f899145c75fff4169" + actual_output = run_get_values(input_str, "c2b") + assert actual_output[1] == expected_output, actual_output + + +def test_cli_c2cc(): # expect error + input_str = "e4af89166a17785c1d741b8b1d5775f3223f510f" + actual_output = run_get_values(input_str, "c2cc") + assert actual_output[2].endswith("shard 0 not found at None"), actual_output + + +def test_cli_c2f(): + input_str = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output = "e4af89166a17785c1d741b8b1d5775f3223f510f;README.md" + assert run_get_values(input_str, "c2f")[1] == expected_output + + +def test_cli_c2p(): + input_str = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output_start = ( + "e4af89166a17785c1d741b8b1d5775f3223f510f;W4D3_news;chumekaboom_news" + ) + actual_output = run_get_values(input_str, "c2p") + assert actual_output[1].startswith(expected_output_start), actual_output + + +def test_cli_c2r(): + input_str = "e4af89166a17785c1d741b8b1d5775f3223f510f" + expected_output = "e4af89166a17785c1d741b8b1d5775f3223f510f;9531fc286ef1f4753ca4be9a3bf76274b929cdeb;27" + actual_output = run_get_values(input_str, "c2r") + assert actual_output[1] == expected_output, actual_output + + +def test_cli_p2a(): + input_str = "ArtiiQ_PocketMine-MP" + expected_output_start = ( + "ArtiiQ_PocketMine-MP;0929hitoshi ;" + ) + actual_output = run_get_values(input_str, "p2a") + assert actual_output[1].startswith(expected_output_start), actual_output + + +def test_cli_p2c(): + 
input_str = "ArtiiQ_PocketMine-MP" + expected_output_start = ( + "ArtiiQ_PocketMine-MP;0000000bab11354f9a759332065be5f066c3398f" + ) + actual_output = run_get_values(input_str, "p2c") + assert actual_output[1].startswith(expected_output_start), actual_output diff --git a/tests/test_local.py b/tests/test_local.py new file mode 100644 index 0000000..1c91586 --- /dev/null +++ b/tests/test_local.py @@ -0,0 +1,126 @@ +import os + +import pytest + +# Import the TCHashDB class +from woc.local import WocMapsLocal + + +@pytest.fixture +def woc(): + _test_pr = os.path.join(os.path.dirname(__file__), "test_profile.json") + woc = WocMapsLocal(_test_pr) + yield woc + + +def test_c2p(woc): + res = woc.get_values("c2p", "e4af89166a17785c1d741b8b1d5775f3223f510f") + assert res[0] == "W4D3_news" + + +def test_c2dat(woc): + res = woc.get_values("c2dat", "e4af89166a17785c1d741b8b1d5775f3223f510f") + assert res[0] == "1410029988" + + +def test_b2tac(woc): + res = woc.get_values("b2tac", "05fe634ca4c8386349ac519f899145c75fff4169") + assert res[0] == ( + "1410029988", + "Audris Mockus ", + "e4af89166a17785c1d741b8b1d5775f3223f510f", + ) + + +def test_p2a(woc): + res = woc.get_values("p2a", "ArtiiQ_PocketMine-MP") + assert res[0] == "0929hitoshi " + + +def test_b2c(woc): + res = woc.get_values("b2c", "05fe634ca4c8386349ac519f899145c75fff4169") + assert res[0] == "e4af89166a17785c1d741b8b1d5775f3223f510f" + + +def test_b2c_large(woc): + res = woc.get_values("b2c", "3f2eca18f1bc0f3117748e2cea9251e5182db2f7") + assert res[0] == "00003a69db53b45a67f76632f33a93691da77197" + + +def test_a2c(woc): + res = woc.get_values("a2c", "Audris Mockus ") + assert res[0] == "001ec7302de3b07f32669a1f1faed74585c8a8dc" + + +def test_c2cc_null_filename(woc): # file name is null + with pytest.raises(AssertionError): + woc.get_values("c2cc", "e4af89166a17785c1d741b8b1d5775f3223f510f") + + +def test_a2f(woc): + res = woc.get_values("a2f", "Audris Mockus ") + assert res[0] == ".#analyze.sh" + + +def test_c2f(woc): + res = woc.get_values("c2f", "e4af89166a17785c1d741b8b1d5775f3223f510f") + assert res[0] == "README.md" + + +def test_c2b(woc): + res = woc.get_values("c2b", "e4af89166a17785c1d741b8b1d5775f3223f510f") + assert res[0] == "05fe634ca4c8386349ac519f899145c75fff4169" + + +def test_p2c(woc): + res = woc.get_values("p2c", "ArtiiQ_PocketMine-MP") + assert res[0] == "0000000bab11354f9a759332065be5f066c3398f" + + +def test_f2a(woc): + res = woc.get_values("f2a", "youtube-statistics-analysis.pdf") + assert res[0] == "Audris Mockus " + + +def test_b2f(woc): + res = woc.get_values("b2f", "05fe634ca4c8386349ac519f899145c75fff4169") + assert res[0] == "README.md" + + +def test_c2r(woc): + res = woc.get_values("c2r", "e4af89166a17785c1d741b8b1d5775f3223f510f") + assert res[0] == "9531fc286ef1f4753ca4be9a3bf76274b929cdeb" + + +def test_b2fa(woc): + res = woc.get_values("b2fa", "05fe634ca4c8386349ac519f899145c75fff4169") + assert res[0] == "1410029988" + + +def test_tree(woc): + res = woc.show_content("tree", "f1b66dcca490b5c4455af319bc961a34f69c72c2") + assert len(res) == 2 + + +def test_commit(woc): + res = woc.show_content("commit", "e4af89166a17785c1d741b8b1d5775f3223f510f") + assert res[-1] == "News for Sep 5" + + +def test_blob_1(woc): + res = woc.show_content("blob", "05fe634ca4c8386349ac519f899145c75fff4169") + assert len(res) == 14194 + + +def test_blob_2(woc): + res = woc.show_content("blob", "46aaf071f1b859c5bf452733c2583c70d92cd0c8") + assert len(res) == 1236 + + +def test_count(woc): + res = woc.count("blob") + assert res == 2 + 
res = woc.count("tree") + assert res == 12 + res = woc.count("commit") + assert res == 7 diff --git a/tests/test_objects.py b/tests/test_objects.py new file mode 100644 index 0000000..4ef1b4f --- /dev/null +++ b/tests/test_objects.py @@ -0,0 +1,305 @@ +import os + +import pytest + +# Import the TCHashDB class +from woc.local import WocMapsLocal +from woc.objects import * + + +@pytest.fixture +def woc(): + _test_pr = os.path.join(os.path.dirname(__file__), "test_profile.json") + woc = WocMapsLocal(_test_pr) + init_woc_objects(woc) + yield woc + + +def test_equal(woc): + assert Author("1") == Author("1") + assert Blob("05fe634ca4c8386349ac519f899145c75fff4169") == Blob( + "05fe634ca4c8386349ac519f899145c75fff4169" + ) + assert Author("1") != Project("2") + + +def test_author_commits(woc): + author = Author("Audris Mockus ") + commits = author.commits + assert all(isinstance(c, Commit) for c in commits) + assert commits[0].key == "001ec7302de3b07f32669a1f1faed74585c8a8dc" + + +def test_author_files(woc): + author = Author("Audris Mockus ") + files = author.files + assert all(isinstance(f, File) for f in files) + assert files[0].key == ".#analyze.sh" + + +def test_author_name_email(woc): + author = Author("Audris Mockus ") + assert author.name == "Audris Mockus" + assert author.email == "audris@utk.edu" + + +def test_blob_data(woc): + blob = Blob("05fe634ca4c8386349ac519f899145c75fff4169") + res = blob.data + assert len(res) == 14194 + + +def test_blob_commits(woc): + blob = Blob("05fe634ca4c8386349ac519f899145c75fff4169") + commits = blob.commits + assert all(isinstance(c, Commit) for c in commits) + assert commits[0].key == "e4af89166a17785c1d741b8b1d5775f3223f510f" + + +def test_blob_commits_large(woc): + blob = Blob("3f2eca18f1bc0f3117748e2cea9251e5182db2f7") + commits = blob.commits + assert all(isinstance(c, Commit) for c in commits) + assert commits[0].key == "00003a69db53b45a67f76632f33a93691da77197" + + +def test_blob_first_author(woc): + blob = Blob("05fe634ca4c8386349ac519f899145c75fff4169") + date, author, commit = blob.first_author + assert isinstance(date, datetime) + assert isinstance(author, Author) + assert isinstance(commit, Commit) + assert date.timestamp() == 1410029988 + assert date.tzinfo is None # naive datetime + assert author.key == "Audris Mockus " + assert commit.key == "e4af89166a17785c1d741b8b1d5775f3223f510f" + + +def test_blob_time_author_commits(woc): + blob = Blob("05fe634ca4c8386349ac519f899145c75fff4169") + date, author, commit = blob.time_author_commits[0] + assert isinstance(date, datetime) + assert isinstance(author, Author) + assert isinstance(commit, Commit) + assert date.timestamp() == 1410029988 + assert date.tzinfo is None # naive datetime + assert author.key == "Audris Mockus " + assert commit.key == "e4af89166a17785c1d741b8b1d5775f3223f510f" + + +def test_blob_files(woc): + blob = Blob("05fe634ca4c8386349ac519f899145c75fff4169") + files = blob.files + assert all(isinstance(f, File) for f in files) + assert files[0].key == "README.md" + + +def test_commit_author(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + author = commit.author + assert isinstance(author, Author) + assert author.key == "Audris Mockus " + + +def test_commit_authored_at(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + authored_at = commit.authored_at + assert isinstance(authored_at, datetime) + assert authored_at.timestamp() == 1410029988 + assert authored_at.tzinfo is not None # aware datetime + + +def test_commit_committer(woc): + commit 
= Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + committer = commit.committer + assert isinstance(committer, Author) + assert committer.key == "Audris Mockus " + + +def test_commit_committed_at(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + committed_at = commit.committed_at + assert isinstance(committed_at, datetime) + assert committed_at.timestamp() == 1410029988 + assert committed_at.tzinfo is not None # aware datetime + + +def test_commit_full_message(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + full_message = commit.full_message + assert full_message == "News for Sep 5" + + +def test_commit_message(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + message = commit.message + assert message == "News for Sep 5" + + +def test_commit_tree(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + tree = commit.tree + assert isinstance(tree, Tree) + assert tree.key == "f1b66dcca490b5c4455af319bc961a34f69c72c2" + + +def test_commit_parents(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + parents = commit.parents + assert all(isinstance(p, Commit) for p in parents) + assert parents[0].key == "c19ff598808b181f1ab2383ff0214520cb3ec659" + + +def test_commit_projects(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + projects = commit.projects + assert all(isinstance(p, Project) for p in projects) + assert projects[0].key == "W4D3_news" + + +def test_commit_children_null_filename(woc): + with pytest.raises(AssertionError): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + children = commit.children + + +def test_commit_files(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + files = commit.files + assert all(isinstance(f, File) for f in files) + assert files[0].key == "README.md" + + +def test_commit_blobs(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + blobs = commit.blobs + assert blobs[0].key == "05fe634ca4c8386349ac519f899145c75fff4169" + assert all(isinstance(b, Blob) for b in blobs) + + +def test_commit_time_author(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + time_author = commit.time_author + assert isinstance(time_author[0], datetime) + assert isinstance(time_author[1], Author) + assert time_author[0].timestamp() == 1410029988 + assert time_author[0].tzinfo is None # naive datetime + + +def test_commit_root(woc): + commit = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f") + root, distance = commit.root + assert isinstance(root, Commit) + assert root.key == "9531fc286ef1f4753ca4be9a3bf76274b929cdeb" + assert distance == 27 + + +def test_file_authors(woc): + file = File("youtube-statistics-analysis.pdf") + authors = file.authors + assert all(isinstance(a, Author) for a in authors) + assert authors[0].key == "Audris Mockus " + + +def test_tree_files(woc): + tree = Tree("f1b66dcca490b5c4455af319bc961a34f69c72c2") + files = tree.files + assert all(isinstance(f, File) for f in files) + assert set(files) == {File("README.md"), File("course.pdf")}, str(files) + + +def test_tree_blobs(woc): + tree = Tree("f1b66dcca490b5c4455af319bc961a34f69c72c2") + blobs = tree.blobs + assert all(isinstance(b, Blob) for b in blobs) + assert Blob("05fe634ca4c8386349ac519f899145c75fff4169") in blobs + + +def test_tree_traverse(woc): + tree = Tree("706aa4dedb560358bff21c3120a0b09532d3484d") + traverse = list(tree.traverse()) + assert all(isinstance(t[0], File) for t in traverse) + assert all(isinstance(t[1], 
Blob) for t in traverse) + + +def test_project_authors(woc): + project = Project("ArtiiQ_PocketMine-MP") + authors = project.authors + assert all(isinstance(a, Author) for a in authors) + assert authors[0].key == "0929hitoshi " + + +def test_project_commits(woc): + project = Project("ArtiiQ_PocketMine-MP") + commits = project.commits + assert all(isinstance(c, Commit) for c in commits) + assert commits[0].key == "0000000bab11354f9a759332065be5f066c3398f" + + +def test_project_url(woc): + project = Project("ArtiiQ_PocketMine-MP") + assert project.url == "https://github.com/ArtiiQ/PocketMine-MP" + project = Project( + "sourceforge.net_peazip" + ) # <- How does sourceforge repos looks like in woc? + assert project.url == "https://git.code.sf.net/p/peazip" + project = Project("gitlab.com_openRGB_openRGB") + assert project.url == "https://gitlab.com/openRGB/openRGB" + + +def test_project_head(woc): + project = Project("woc-hack_thebridge") + head = project.head + assert isinstance(head, Commit) + assert head.key == "f249b14a111279faa8d65c29ecf46bb6ce59a139" + + +def test_project_tail(woc): + project = Project("woc-hack_thebridge") + tail = project.tail + assert isinstance(tail, Commit) + assert tail.key == "ae6e15fa4d8d4d454977ddbb4e97e922ddecebf7" + + +def test_project_earliest(woc): + project = Project("woc-hack_thebridge") + earliest = project.earliest_commit + assert isinstance(earliest, Commit) + assert earliest.key == "ae6e15fa4d8d4d454977ddbb4e97e922ddecebf7" + + +def test_project_latest(woc): + project = Project("woc-hack_thebridge") + latest = project.latest_commit + assert isinstance(latest, Commit) + assert latest.key == "f249b14a111279faa8d65c29ecf46bb6ce59a139" + + +def test_project_walk(woc): + project = Project("woc-hack_thebridge") + commits = list(project.commits_fp()) + assert all(isinstance(c, Commit) for c in commits) + assert len(commits) == 6 + + +def test_commit_compare(woc): + c1 = Commit("91f4da4c173e41ffbf0d9ecbe2f07f3a3296933c") + c2 = Commit("ae6e15fa4d8d4d454977ddbb4e97e922ddecebf7") + + # We can't test fuzz matching because blob storage is broken :( + diff = list(c1.compare(c2, threshold=1)) + modified_files = {f[0].key for f in diff if f[0] is not None} | { + f[1].key for f in diff if f[1] is not None + } + assert all(len(d) == 4 for d in diff) + assert all(isinstance(d[0], File) for d in diff if d[0] is not None) + assert all(isinstance(d[1], File) for d in diff if d[1] is not None) + assert all(isinstance(d[2], Blob) for d in diff if d[2] is not None) + assert all(isinstance(d[3], Blob) for d in diff if d[3] is not None) + assert modified_files == { + "README.md", + "woc_service/requirements.txt", + "woc_service/app.py", + "woc_service/static/.keep", + "woc_service/oscar.py", + } diff --git a/tests/test_profile.json b/tests/test_profile.json new file mode 100644 index 0000000..ac02ed9 --- /dev/null +++ b/tests/test_profile.json @@ -0,0 +1,372 @@ +{ + "wocSchemaVersion": 1, + "entities": { + "a": "author", + "A": "author_unalised", + "b": "blob", + "c": "commit", + "cc": "child_commit", + "f": "file", + "fa": "first_author", + "t": "tree", + "h": "head", + "p": "project", + "P": "project_deforked", + "pc": "parent_commit", + "r": "root_commit", + "ta": "time_author", + "tac": "time_author_commit", + "trp": "torvalds_path", + "dat": "colon_seperated_data", + "tch": "compressed_data", + "bin": "binary_data", + "idx": "binary_index" + }, + "dtypes": { + "h": "hex", + "s": "str", + "cs": "[compressed]str", + "sh": "str_hex", + "hhwww": "hex_hex_url", + "r": 
"hex_berint", + "cs3": "[compressed]str_str_str" + }, + "sites": { + "bitbucket.org": "bitbucket.org", + "gitlab.com": "gitlab.com", + "android.googlesource.com": "android.googlesource.com", + "bioconductor.org": "bioconductor.org", + "drupal.com": "git.drupal.org", + "git.eclipse.org": "git.eclipse.org", + "git.kernel.org": "git.kernel.org", + "git.postgresql.org": "git.postgresql.org", + "git.savannah.gnu.org": "git.savannah.gnu.org", + "git.zx2c4.com": "git.zx2c4.com", + "gitlab.gnome.org": "gitlab.gnome.org", + "kde.org": "anongit.kde.org", + "repo.or.cz": "repo.or.cz", + "salsa.debian.org": "salsa.debian.org", + "sourceforge.net": "git.code.sf.net/p" + }, + "ignoredAuthors": [ + "GitHub Merge Button " + ], + "maps": { + "c2p": [ + { + "version": "U", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/c2pFullU.0.tch", + "./tests/fixtures/c2pFullU.1.tch" + ], + "larges": {}, + "dtypes": [ + "h", + "cs" + ] + } + ], + "c2dat": [ + { + "version": "U", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/c2datFullU.0.tch", + "./tests/fixtures/c2datFullU.1.tch" + ], + "larges": {}, + "dtypes": [ + "h", + "s" + ] + } + ], + "c2ta": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/c2taFullR.0.tch", + "./tests/fixtures/c2taFullR.1.tch" + ], + "larges": {}, + "dtypes": [ + "h", + "s" + ] + } + ], + "b2tac": [ + { + "version": "U", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/b2tacFullU.0.tch", + "./tests/fixtures/b2tacFullU.1.tch" + ], + "larges": { + "a11777cc471a4344702741ab1c8a588998b1311a": "./tests/fixtures/b2tacFullU.1.tch.large.a11777cc471a4344702741ab1c8a588998b1311a" + }, + "dtypes": [ + "h", + "cs3" + ] + } + ], + "p2a": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/p2aFullR.0.tch", + "./tests/fixtures/p2aFullR.1.tch" + ], + "larges": {}, + "dtypes": [ + "s", + "cs" + ] + } + ], + "b2c": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/b2cFullR.0.tch", + "./tests/fixtures/b2cFullR.1.tch" + ], + "larges": { + "3f2eca18f1bc0f3117748e2cea9251e5182db2f7": "./tests/fixtures/b2cFullR.31.tch.large.3f2eca18f1bc0f3117748e2cea9251e5182db2f7" + }, + "dtypes": [ + "h", + "h" + ] + } + ], + "a2c": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/a2cFullR.0.tch", + "./tests/fixtures/a2cFullR.1.tch" + ], + "larges": {}, + "dtypes": [ + "s", + "h" + ] + } + ], + "c2cc": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + null, + null + ], + "larges": {}, + "dtypes": [ + "h", + "h" + ] + } + ], + "a2f": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/a2fFullR.0.tch", + "./tests/fixtures/a2fFullR.1.tch" + ], + "larges": { + }, + "dtypes": [ + "s", + "cs" + ] + } + ], + "c2f": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/c2fFullR.0.tch", + "./tests/fixtures/c2fFullR.1.tch" + ], + "larges": { + }, + "dtypes": [ + "h", + "cs" + ] + } + ], + "c2b": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/c2bFullR.0.tch", + "./tests/fixtures/c2bFullR.1.tch" + ], + "larges": { + }, + "dtypes": [ + "h", + "h" + ] + } + ], + "p2c": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/p2cFullR.0.tch", + "./tests/fixtures/p2cFullR.1.tch" + ], + "larges": {}, + "dtypes": [ + "s", + "h" + ] + } + ], + "f2a": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/f2aFullR.0.tch", + "./tests/fixtures/f2aFullR.1.tch" + ], + 
"larges": {}, + "dtypes": [ + "s", + "cs" + ] + } + ], + "b2f": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/b2fFullR.0.tch", + "./tests/fixtures/b2fFullR.1.tch" + ], + "larges": { + }, + "dtypes": [ + "h", + "cs" + ] + } + ], + "c2r": [ + { + "version": "R", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/c2rFullR.0.tch", + "./tests/fixtures/c2rFullR.1.tch" + ], + "larges": {}, + "dtypes": [ + "h", + "r" + ] + } + ], + "b2fa": [ + { + "version": "U", + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/b2faFullU.0.tch", + "./tests/fixtures/b2faFullU.1.tch" + ], + "larges": {}, + "dtypes": [ + "h", + "sh" + ] + } + ] + }, + "objects": { + "tree.tch": { + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/tree_0.tch", + "./tests/fixtures/tree_1.tch" + ] + }, + "commit.tch": { + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/commit_0.tch", + "./tests/fixtures/commit_1.tch" + ] + }, + "blob.idx": { + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/blob_0.idx", + "./tests/fixtures/blob_1.idx" + ] + }, + "blob.bin": { + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/blob_0.bin", + "./tests/fixtures/blob_1.bin" + ] + }, + "commit.bin": { + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/commit_0.bin", + "./tests/fixtures/commit_1.bin" + ] + }, + "commit.idx": { + "sharding_bits": 1, + "shards": [ + "./tests/fixtures/commit_0.idx", + "./tests/fixtures/commit_1.idx" + ] + }, + "tree.idx": { + "sharding_bits": 0, + "shards": [ + "./tests/fixtures/tree_0.idx" + ] + }, + "tree.bin": { + "sharding_bits": 0, + "shards": [ + "./tests/fixtures/tree_0.bin" + ] + }, + "sha1.blob.tch": { + "sharding_bits": 0, + "shards": [ + "./tests/fixtures/sha1.blob.tch" + ] + }, + "sha1.commit.tch": { + "sharding_bits": 0, + "shards": [ + "./tests/fixtures/sha1.commit.tch" + ] + } + } + } diff --git a/tests/test_tch.py b/tests/test_tch.py new file mode 100644 index 0000000..4e0163d --- /dev/null +++ b/tests/test_tch.py @@ -0,0 +1,94 @@ +import pytest + +# Import the TCHashDB class +from woc.tch import TCHashDB + + +@pytest.fixture +def db(tmp_path): + """Fixture to create and return a TCHashDB instance""" + path = tmp_path / "test_db.tch" + db = TCHashDB(path=str(path)) + yield db + + +def test_put_and_get(db): + key = b"key1" + value = b"value1" + db.put(key, value) + assert db.get(key) == value + + +def test_get_nonexistent_key(db): + with pytest.raises(KeyError): + db.get(b"nonexistent_key") + + +def test_delete(db): + key = b"key2" + value = b"value2" + db.put(key, value) + db.delete(key) + with pytest.raises(KeyError): + db.get(key) + + +def test_drop(db): + db.put(b"key3", b"value3") + db.put(b"key4", b"value4") + db.drop() + assert len(db) == 0 + + +def test_len(db): + db.put(b"key5", b"value5") + db.put(b"key6", b"value6") + assert len(db) == 2 + + +def test_iter(db): + keys = [b"key7", b"key8", b"key9"] + for key in keys: + db.put(key, b"value") + assert set(db) == set(keys) + + +def test_getitem(db): + key = b"key10" + value = b"value10" + db[key] = value + assert db[key] == value + + +def test_setitem(db): + key = b"key11" + value = b"value11" + db[key] = value + assert db.get(key) == value + + +def test_delitem(db): + key = b"key12" + value = b"value12" + db[key] = value + del db[key] + with pytest.raises(KeyError): + db.get(key) + + +def test_open(tmp_path): + """Test that two TCHashDB instances can be opened concurrently""" + path = tmp_path / "test_db.tch" + db = TCHashDB(path=str(path), ro=False) + db[b"key"] = b"value" + # The 
following should yield no error + db.close() + db = TCHashDB(path=str(path), ro=True) + assert db[b"key"] == b"value" + # can't write to a read-only db + with pytest.raises(OSError): + db[b"key"] = b"value" + # tch does not allow opening a db in the same process + # ref: lib/tchdb.c#L370 + with pytest.raises(OSError): + db2 = TCHashDB(path=str(path), ro=True) diff --git a/tests/unit_test.py b/tests/unit_test.py deleted file mode 100755 index 1a3d7a8..0000000 --- a/tests/unit_test.py +++ /dev/null @@ -1,209 +0,0 @@ -#!python3 - -# cython: language_level=3str -""" -Unit tests - only to check functions do what they are expected to do. -Please refrain from checking integrity of the dataset. -""" -from __future__ import unicode_literals - -# Cython caches compiled files, so even if the main file did change but the -# test suite didn't, it won't recompile. More details in this SO answer: -# https://stackoverflow.com/questions/42259741/ -import pyximport -pyximport.install( - # build_dir='build', - setup_args={"script_args": ["--force"]}, - inplace=True, - language_level='3str' -) - -from oscar import * -from unit_test_cy import * - - -class TestBasics(unittest.TestCase): - def test_commit_tz(self): - ctz = CommitTimezone(9, 30) - self.assertEqual(repr(ctz), '') - - def test_parse_commit_date(self): - cdate = parse_commit_date(b'1337145807', b'+1130') - # Things to consider: - # - unix time is always UTC - # - when datetime is rendered, it shows time in the specified timezone, - # at the given UTC time. - # - if no timezone is specified, the server timezone is used - # So, when the timezeon is specified, rendered time should be consistent - self.assertEqual(cdate.strftime('%Y-%m-%d %H:%M:%S %z'), - '2012-05-16 16:53:27 +1130') - cdate = parse_commit_date(b'1337145807', b'-1130') - self.assertEqual(cdate.strftime('%Y-%m-%d %H:%M:%S %z'), - '2012-05-15 17:53:27 -1130') - self.assertIsNone(parse_commit_date(b'3337145807', b'+1100')) - - -class TestHash(unittest.TestCase): - # libtokyocabinet is not thread-safe; you cannot have two open instances of - # the same DB. `unittest` runs multiple tests in threads, so if we use - # `.setUp` and multiple tests, it will fail with "threading error". - # Hence, monolitic test - def test_hash(self): - # setup - # key 114 is from the commit used by TestCommit below, which present - # in both test and production environment. 
- # just a reminder, PATHS[data_type] is a (path, key_length) tuple - db_path = PATHS['commit_random'][0].format(key=0).encode('ascii') - self.db = Hash(db_path) - - # reading a single key - k = b'test_key' - self.assertEqual(self.db[k], b'\x00\x01\x02\x03') - - # reading all keys - # create_fixtures.py adds more commits to this file to make it up to 1K - keys = list(self.db) - self.assertGreaterEqual(len(keys), 1000) - - -class TestBase(unittest.TestCase): - # there is nothing testable at this class right now - pass - - -class TestBlob(unittest.TestCase): - # GitObject: all, instantiate from str/bytes - def test_string_sha(self): - self.assertEqual(Blob.string_sha(b'Hello world!'), - u'6769dd60bdf536a83c9353272157893043e9f7d0') - - def test_file_sha(self): - self.assertEqual(Blob.file_sha('LICENSE'), - u'94a9ed024d3859793618152ea559a168bbcbb5e2') - - def test_len(self): - sha = u'83d22195edc1473673f1bf35307aea6edf3c37e3' - self.assertEqual(len(Blob(sha)), 42) - - def test_data(self): - # blob has a different .data implementation - sha = u'83d22195edc1473673f1bf35307aea6edf3c37e3' - self.assertEqual( - Blob(sha).data, b'*.egg-info/\ndist/\nbuild/\n*.pyc\n*.mo\n*.gz\n') - - -class TestTree(unittest.TestCase): - def test_data(self): - tree = Tree(u'd4ddbae978c9ec2dc3b7b3497c2086ecf7be7d9d') - self.assertEqual(tree.data, ( - b'100755 .gitignore' - b'\x00\x83\xd2!\x95\xed\xc1G6s\xf1\xbf50z\xean\xdf<7\xe3' - b'100644 COPYING' - b'\x00\xfd\xa9K\x84\x12/o6G<\xa3W7\x94\xa8\xf2\xc4\xf4\xa5\x8c' - b'100644 MANIFEST.in' - b'\x00\xb7$\x83\x15\x19\x90N+\xc2SsR;6\x8c]A\xdc6\x8e' - b'100644 README.rst' - b'\x00#JWS\x8f\x15\xd7/\x00`;\xf0\x86\xb4e\xb0\xf2\xcd\xa7\xb5' - b'40000 minicms' - b'\x00\x95H)\x88z\xf5\xd9\x07\x1a\xa9,Bq3\xca,\xdd\x08\x13\xcc' - b'100644 setup.py' - b'\x00F\xaa\xf0q\xf1\xb8Y\xc5\xbfE\'3\xc2X 1375321509 +1100\n' - b'committer Pavel Puchkin 1375321597 +1100\n' - b'\nLicense changed :P\n', data) - - def test_attrs(self): - c = Commit(u'e38126dbca6572912013621d2aa9e6f7c50f36bc') - self.assertTrue(c.author.startswith(b'Marat')) - self.assertTrue(c.committer.startswith(b'Marat')) - self.assertEqual(c.message, b'support no i18n') - parent_sha = b'ab124ab4baa42cd9f554b7bb038e19d4e3647957' - self.assertEqual(c.parent_shas, (binascii.unhexlify(parent_sha),)) - self.assertIsInstance(c.committed_at, datetime) - self.assertIsInstance(c.authored_at, datetime) - self.assertEqual(c.committed_at.strftime('%Y-%m-%d %H:%M:%S %z'), - '2012-05-19 01:14:08 +1100') - self.assertEqual(c.authored_at.strftime('%Y-%m-%d %H:%M:%S %z'), - '2012-05-19 01:14:08 +1100') - self.assertIsInstance(c.tree, Tree) - self.assertEqual(c.tree.sha, u'6845f55f47ddfdbe4628a83fdaba35fa4ae3c894') - self.assertRaises(AttributeError, lambda: c.arbitrary_attr) - self.assertIsNone(c.signature) - - c = Commit(u'1cc6f4418dcc09f64dcbb0410fec76ceaa5034ab') - self.assertIsInstance(c.signature, bytes) - self.assertGreater(len(c.signature), 450) # 454 for this commit - - -class TestProject(unittest.TestCase): - def test_url(self): - self.assertEqual(Project(b'testuser_test_proj').url, - b'https://github.com/testuser/test_proj') - self.assertEqual(Project(b'testuser_test_proj').url, - b'https://github.com/testuser/test_proj') - self.assertEqual(Project(b'sourceforge.net_tes_tproj').url, - b'https://git.code.sf.net/p/tes_tproj') - self.assertEqual(Project(b'drupal.com_testproj').url, - b'https://github.com/drupal.com/testproj') - - -class TestFile(unittest.TestCase): - # this class consists of relations only - nothing to unit test - pass - 
- -class TestAuthor(unittest.TestCase): - # this class consists of relations only - nothing to unit test - pass - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/unit_test_cy.pyx b/tests/unit_test_cy.pyx deleted file mode 100644 index 9c66aec..0000000 --- a/tests/unit_test_cy.pyx +++ /dev/null @@ -1,41 +0,0 @@ - -# cython: language_level=3str -""" -Unit tests - only to check functions do what they are expected to do. -Please refrain from checking integrity of the dataset. -""" - -import unittest - -cimport oscar - -class TestUtils(unittest.TestCase): - # ignored, as they're executed on import anyway: - # _latest_version - # _key_length - # _get_paths - - def test_unber(self): - self.assertEqual(oscar.unber(b'\x00\x83M'), [0, 461]) - self.assertEqual(oscar.unber(b'\x83M\x96\x14'), [461, 2836]) - self.assertEqual(oscar.unber(b'\x99a\x89\x12'), [3297, 1170]) - # test number exceeding 32-bit signed int - self.assertEqual(oscar.unber( - b'\x84\xb0\xfb\x82\xd93*'), [150581849267, 42]) - - def test_lzf_length(self): - self.assertEqual(oscar.lzf_length(b'\xc4\x9b'), (2, 283)) - self.assertEqual(oscar.lzf_length(b'\xc3\xa4'), (2, 228)) - self.assertEqual(oscar.lzf_length(b'\xc3\x8a'), (2, 202)) - self.assertEqual(oscar.lzf_length(b'\xca\x87'), (2, 647)) - self.assertEqual(oscar.lzf_length(b'\xe1\xaf\xa9'), (3, 7145)) - self.assertEqual(oscar.lzf_length(b'\xe0\xa7\x9c'), (3, 2524)) - # test extra bytes don't affect the result - self.assertEqual(oscar.lzf_length(b'\xc4\xa6\x1f100644'), (2, 294)) - - def test_decomp(self): - # TODO: test decomp() - pass - - def test_fnvhash(self): - self.assertEqual(hex(oscar.fnvhash(b'foo')), '0xa9f37ed7') diff --git a/woc/__init__.py b/woc/__init__.py index 63c62bd..33fa6a4 100644 --- a/woc/__init__.py +++ b/woc/__init__.py @@ -2,4 +2,18 @@ # SPDX-License-Identifier: GPL-3.0-or-later # @authors: Runzhi He -# @date: 2024-01-17 \ No newline at end of file +# @date: 2024-01-17 + +""" +# Quickstart +.. include:: ../README.md +:start-after: # python-woc +# World of Code Tutorial +.. include:: ../docs/tutorial.md +# World of Code DataFormat +.. include:: ../docs/DataFormat.md +# Contributing +.. include:: ../docs/contributing.md +""" # noqa: D205 + +__all__ = ["local", "tch", "detect", "objects"] diff --git a/woc/base.py b/woc/base.py index 08ff74d..0477c25 100644 --- a/woc/base.py +++ b/woc/base.py @@ -1,41 +1,85 @@ -import os -from typing import Tuple, Union, Literal +from typing import Iterable, List, Literal, Tuple, Union -WocObjectsWithContent = Literal['tree', 'blob', 'commit', 'tkns', 'tag', 'bdiff'] -WocSupportedProfileVersions = (1, ) +WocObjectsWithContent = Literal["tree", "blob", "commit", "tkns", "tag", "bdiff"] +"""WoC objects stored in stacked binary files.""" + +WocSupportedProfileVersions = (1,) +"""Profile versions supported by the current python-woc.""" class WocMapsBase: + maps: Iterable[str] + """List of basemaps available in the WoC database.""" + objects: Iterable[str] + """List of objects available in the WoC database.""" + def __init__(self, *args, **kwargs): - raise NotImplementedError("WocMapsBase is an abstract class, use WoCMapsLocal instead") + raise NotImplementedError( + "WocMapsBase is an abstract class, use WoCMapsLocal instead" + ) def get_values( self, map_name: str, key: Union[bytes, str], - ): - raise NotImplementedError("WocMapsBase is an abstract class, use WoCMapsLocal instead") + ) -> Union[List[str], Tuple[str, str, str], List[Tuple[str, str, str]]]: + """ + Eqivalent to getValues in WoC Perl API. 
+ + :param map_name: The name of the map, e.g. 'c2p', 'c2r', 'P2c' + :param key: The key of the object. For git objects, it is the SHA-1 hash of the object + (in bytes or hex string). For other objects like Author, it is the name of the object. + :return: The value of the object. Can be a list of strings, a tuple of strings, or a list of tuples of strings. Please refer to the documentation for details. + + >>> self.get_values('P2c', 'user2589_minicms') + ['05cf84081b63cda822ee407e688269b494a642de', ...] + >>> self.get_values('c2r', 'e4af89166a17785c1d741b8b1d5775f3223f510f') + ('9531fc286ef1f4753ca4be9a3bf76274b929cdeb', 27) + >>> self.get_values('b2fa', '05fe634ca4c8386349ac519f899145c75fff4169') + ('1410029988', + 'Audris Mockus ', + 'e4af89166a17785c1d741b8b1d5775f3223f510f') + """ + raise NotImplementedError("WocMapsBase is an abstract class") def show_content( self, - obj: WocObjectsWithContent, + obj_name: WocObjectsWithContent, key: Union[bytes, str], - ): - raise NotImplementedError("WocMapsBase is an abstract class, use WoCMapsLocal instead") - - -class WocKeyError(KeyError): - def __init__(self, - key: bytes, - file_path: str, - ) -> None: - try: - _decoded_key = key.decode('utf-8') - except UnicodeDecodeError: - _decoded_key = key.hex() - _filename = os.path.basename(file_path) - self.message = f"{_decoded_key} in {_filename}" - super().__init__(self.message) - - - \ No newline at end of file + ) -> Union[ + List[Tuple[str, str, str]], + str, + Tuple[str, Tuple[str, str, str], Tuple[str, str, str], str], + ]: + """ + Eqivalent to showCnt in WoC Perl API. + + :param obj_name: The name of the object, e.g. 'blob', 'tree', 'commit' + :param key: The key of the object. It is the SHA-1 hash of the object (in bytes or hex string). + :return: The content of the object. Can be a list of tuples of strings, a string, or a tuple of strings. + + >>> self.show_content('blob', '05fe634ca4c8386349ac519f899145c75fff4169') + 'This is the content of the blob' + Eqivalent to showCnt in WoC perl API + >>> self.show_content('tree', '7a374e58c5b9dec5f7508391246c48b73c40d200') + [('100644', '.gitignore', '8e9e1...'), ...] + >>> self.show_content('commit', 'e4af89166a17785c1d741b8b1d5775f3223f510f') + ('f1b66dcca490b5c4455af319bc961a34f69c72c2', + ('c19ff598808b181f1ab2383ff0214520cb3ec659',), + ('Audris Mockus 1410029988', '1410029988', '-0400'), + ('Audris Mockus ', '1410029988', '-0400'), + 'News for Sep 5, 2014\\n') + """ + raise NotImplementedError("WocMapsBase is an abstract class") + + def count(self, map_name: str) -> int: + """ + Count the number of keys in a map. + + :param map_name: The name of the mapping / object, e.g. 'c2p', 'c2r', 'commit'. + :return: The number of keys in the tch databases plus the number of large files. 
+ + >>> self.count('c2r') + 12345 + """ + raise NotImplementedError("WocMapsBase is an abstract class") diff --git a/woc/detect.py b/woc/detect.py index 1d6bb97..b0daf16 100644 --- a/woc/detect.py +++ b/woc/detect.py @@ -4,23 +4,27 @@ # @authors: Runzhi He # @date: 2024-01-17 -import os +import argparse import json import logging -import argparse +import os import re -from typing import Dict, Iterable, Tuple, Optional from functools import cmp_to_key +from typing import Iterable, Optional, Tuple -DEFAULT_PROFILE = os.path.join(os.path.dirname(__file__), 'wocprofile.default.json') +_default_profile = os.path.join(os.path.dirname(__file__), "wocprofile.default.json") _logger = logging.getLogger(__name__) _logger.setLevel(logging.INFO) MAP_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch$" +"""Filename regex for basemap files""" _map_pat = re.compile(MAP_REGEX) + + def parse_map_fname(fname: str): """ - Parse basemap filename into (src, dst, ver, idx) + Parse basemap filename into (src, dst, ver, idx). + >>> parse_map_fname('c2fFullR.3.tch') ('c', 'f', 'R', '3') >>> parse_map_fname('c2fFullR.tch') @@ -31,11 +35,16 @@ def parse_map_fname(fname: str): return None return m.groups() + LARGE_REGEX = r"^(\w+)2(\w+)Full(\w+)(?:.(\d+))?.tch.large.([0-9a-f]+)$" +"""Filename regex for large basemap files""" _large_pat = re.compile(LARGE_REGEX) + + def parse_large_fname(fname: str): """ - Parse basemap filename into (src, dst, ver, idx, hash) + Parse basemap filename into (src, dst, ver, idx, hash). + >>> parse_large_fname('A2cFullU.15.tch.large.59016a4f') ('A', 'c', 'U', '15', '59016a4f') """ @@ -44,11 +53,16 @@ def parse_large_fname(fname: str): return None return m.groups() + OBJ_REGEX = r"^([\w\.]+)_(\d+).(idx|bin|tch)$" +"""Filename regex for object files""" _obj_pat = re.compile(OBJ_REGEX) + + def parse_obj_fname(fname: str): """ - Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext) + Parse sha1map (sha1o/sha1c/blob) filename into (name, idx, ext). + >>> parse_obj_fname('commit_0.tch') ('commit', '0', 'tch') >>> parse_obj_fname('blob_0.idx') @@ -61,9 +75,11 @@ def parse_obj_fname(fname: str): return None return m.groups() + def compare_woc_version(ver1: str, ver2: str): """ - Compare two woc version strings (A < Z < AA) + Compare two woc version strings (A < Z < AA). + >>> compare_woc_version('S', 'T') > 0 False >>> compare_woc_version('AA', 'U') > 0 @@ -73,9 +89,11 @@ def compare_woc_version(ver1: str, ver2: str): return len(ver1) - len(ver2) return ord(ver1[0]) - ord(ver2[0]) + def infer_dtype(map_name: str) -> Tuple[str, str]: """ - Infer the data types from the map's name (entity -> entity) + Infer the data types from the map's name (entity -> entity). 
+ Should be bug-to-bug compatible with: https://github.com/ssc-oscar/lookup/blob/7289885/getValues.perl#L34 >>> infer_dtype('c2f') @@ -84,105 +102,113 @@ def infer_dtype(map_name: str) -> Tuple[str, str]: ('h', 'cs3') """ ent_all = map_name.lower() - ent_in, ent_out = ent_all.split('2') - - dtype_in, dtype_out = 'h', 'h' - - if ent_in in ('a', 'f', 'p'): - dtype_in = 's' - if ent_out in ('a', 'f', 'p'): - dtype_out = 'cs' - if ent_in in ('c','b','w','ob','td'): - dtype_in = 'h' - if ent_out in ('c','b','cc', 'pc','ob','td'): - dtype_out = 'h' - if ent_all == 'b2fa': - dtype_out = 'sh' - if ent_out in ('ta',): - dtype_out = 's' - if ent_all in ('b2tk', 'td2f'): - dtype_out = 's' - if ent_all in ('c2h', 'c2r'): - dtype_out = 'r' - if ent_in in ('ps', 'pf', 'pfs'): - dtype_in = 's' - if ent_out in ('ps', 'pf', 'pfs'): - dtype_out = 's' - if ent_out in ('rhp',): - dtype_out = 'hhwww' - if ent_all in ('p2p', 'a2a'): - dtype_in, dtype_out = 's', 'cs' - if ent_all in ('b2baddate', 'b2manyp'): - dtype_in, dtype_out = 's', 'h' - if ent_all in ('c2fbb', 'obb2cf', 'bb2cf'): - dtype_in, dtype_out = 'h', 'cs' - if ent_all in ('c2fbb', 'obb2cf', 'bb2cf'): - dtype_in, dtype_out = 'h', 'cs' - if ent_all in ('c2dat',): - dtype_in, dtype_out = 'h', 's' - if ent_all in ('b2tac',): - dtype_in, dtype_out = 'h', 'cs3' + ent_in, ent_out = ent_all.split("2") + + dtype_in, dtype_out = "h", "h" + + if ent_in in ("a", "f", "p"): + dtype_in = "s" + if ent_out in ("a", "f", "p"): + dtype_out = "cs" + if ent_in in ("c", "b", "w", "ob", "td"): + dtype_in = "h" + if ent_out in ("c", "b", "cc", "pc", "ob", "td"): + dtype_out = "h" + if ent_all == "b2fa": + dtype_out = "sh" + if ent_out in ("ta",): + dtype_out = "s" + if ent_all in ("b2tk", "td2f"): + dtype_out = "s" + if ent_all in ("c2h", "c2r"): + dtype_out = "r" + if ent_in in ("ps", "pf", "pfs"): + dtype_in = "s" + if ent_out in ("ps", "pf", "pfs"): + dtype_out = "s" + if ent_out in ("rhp",): + dtype_out = "hhwww" + if ent_all in ("p2p", "a2a"): + dtype_in, dtype_out = "s", "cs" + if ent_all in ("b2baddate", "b2manyp"): + dtype_in, dtype_out = "s", "h" + if ent_all in ("c2fbb", "obb2cf", "bb2cf"): + dtype_in, dtype_out = "h", "cs3" + if ent_all in ("c2dat",): + dtype_in, dtype_out = "h", "s" + if ent_all in ("b2tac",): + dtype_in, dtype_out = "h", "cs3" return dtype_in, dtype_out + def detect_profile( paths: Iterable[str], version: Optional[str] = None, - preset_path: str = DEFAULT_PROFILE, + preset_path: Optional[str] = None, + check_missing: bool = True, ): _maps, _objs = {}, {} + if not preset_path: + preset_path = _default_profile + def _handle_map(src, dst, ver, idx, hash): if version and ver != version: - logging.info(f'Found map {f} with version {ver}, expected {version}') + logging.info(f"Found map {f} with version {ver}, expected {version}") return - _map_name = f'{src}2{dst}' + _map_name = f"{src}2{dst}" if idx is None: idx = "0" - prefix_len = int(idx).bit_length() + prefix_len = int(idx).bit_length() - _map = (_maps - .setdefault(_map_name, {}) - .setdefault(ver, { + _map = _maps.setdefault(_map_name, {}).setdefault( + ver, + { "version": ver, "sharding_bits": prefix_len, "shards": {}, "larges": {}, "dtypes": infer_dtype(_map_name), - }) + }, ) if not hash: - logging.debug(f'Found map {f} with hash {hash} idx {idx}') + logging.debug(f"Found map {f} with hash {hash} idx {idx}") _map["shards"][int(idx)] = os.path.join(root, f) else: - logging.debug(f'Found large map {f} with hash {hash} idx {idx}') + logging.debug(f"Found large map {f} with hash {hash} idx 
{idx}") _map["larges"][hash] = os.path.join(root, f) _map["sharding_bits"] = max(_map["sharding_bits"], prefix_len) - def _handle_obj(name, idx, ext): _map_name = f"{name}.{ext}" prefix_len = int(idx).bit_length() if idx else 0 - _obj = (_objs - .setdefault(_map_name, { + _obj = _objs.setdefault( + _map_name, + { "sharding_bits": prefix_len, "shards": {}, - }) + }, ) - logging.debug(f'Found obj {f} idx {idx}') + logging.debug(f"Found obj {f} idx {idx}") _obj["shards"][int(idx)] = os.path.join(root, f) _obj["sharding_bits"] = max(_obj["sharding_bits"], prefix_len) - for path in paths: # walk the directory for all files for root, _, files in os.walk(path): # only consider .tch, .idx, .bin files - files = [f for f in files if '.tch' in f or (not f.startswith('pack') and f.endswith('.idx')) or f.endswith('.bin')] + files = [ + f + for f in files + if ".tch" in f + or (not f.startswith("pack") and f.endswith(".idx")) + or f.endswith(".bin") + ] for idx, f in enumerate(files): if idx % 1000 == 0: - _logger.info(f'Processing {f} in {path}, {idx+1}/{len(files)}') + _logger.info(f"Processing {f} in {path}, {idx+1}/{len(files)}") _r = parse_map_fname(f) if _r: @@ -195,81 +221,101 @@ def _handle_obj(name, idx, ext): src, dst, ver, idx, hash = _r _handle_map(src, dst, ver, idx, hash) continue - + _r = parse_obj_fname(f) if _r: name, idx, ext = _r _handle_obj(name, idx, ext) continue - _logger.warning(f'Unrecognized file: {f}') + _logger.warning(f"Unrecognized file: {f}") - # transform maps and objs + # transform maps _ls_maps = {} for k, v in _maps.items(): _to_drop = [] for ver, vv in v.items(): # convert shards to list - _ls = [None] * 2**vv['sharding_bits'] - for kkk, vvv in vv['shards'].items(): + _ls = [None] * 2 ** vv["sharding_bits"] + for kkk, vvv in vv["shards"].items(): _ls[kkk] = vvv # see if we can find the None in _ls _nones = [i for i, x in enumerate(_ls) if x is None] - if _nones: - _logger.warning(f'Cannot find shards {", ".join(map(str, _nones))} in map {k} ver {ver}, skipping') + if _nones and check_missing: + _logger.warning( + f'Cannot find shards {", ".join(map(str, _nones))} in map {k} ver {ver}, skipping' + ) _logger.warning(f"Got: {vv['shards']}") _to_drop.append(ver) else: - vv['shards'] = _ls + vv["shards"] = _ls for ver in _to_drop: del v[ver] # move latest maps to the front of the list - _ls_maps[k] = [v for k, v in sorted( - v.items(), - key=cmp_to_key(lambda x, y: compare_woc_version(x[0], y[0])), - reverse=True - )] + if len(v) == 0: + continue + _ls_maps[k] = [ + vv + for _, vv in sorted( + v.items(), + key=cmp_to_key(lambda x, y: compare_woc_version(x[0], y[0])), + reverse=True, + ) + ] + # transform objects _ls_objs = {} for k, v in _objs.items(): # convert shards to list - _ls = [None] * 2**v['sharding_bits'] - for kk, vv in v['shards'].items(): + _ls = [None] * 2 ** v["sharding_bits"] + for kk, vv in v["shards"].items(): _ls[kk] = vv # see if we can find the None in _ls _nones = [i for i, x in enumerate(_ls) if x is None] - if _nones: - _logger.warning(f'Cannot find shards {", ".join(map(str, _nones))} in obj {k}, skipping') + if _nones and check_missing: + _logger.warning( + f'Cannot find shards {", ".join(map(str, _nones))} in obj {k}, skipping' + ) _logger.warning(f"Got: {v['shards']}") else: - v['shards'] = _ls + v["shards"] = _ls _ls_objs[k] = v - # load the preset profile - with open(preset_path, 'r') as f: + with open(preset_path, "r") as f: res = json.load(f) res["maps"] = _ls_maps res["objects"] = _ls_objs return res -parser = 
argparse.ArgumentParser(description='Detect woc profile') -parser.add_argument('paths', metavar='PATH', type=str, nargs='+', help='path to woc directory') -parser.add_argument('--version', type=str, default=None, help='woc mapping version') -parser.add_argument('--preset', type=str, default=DEFAULT_PROFILE, help='path to preset profile') -parser.add_argument('--output', type=str, default=None, help='path to output profile') - -if __name__ == '__main__': +if __name__ == "__main__": import doctest + doctest.testmod() + parser = argparse.ArgumentParser(description="Detect woc profile") + parser.add_argument( + "paths", metavar="PATH", type=str, nargs="+", help="path to woc directory" + ) + parser.add_argument("--version", type=str, default=None, help="woc mapping version") + parser.add_argument( + "--preset", type=str, default=_default_profile, help="path to preset profile" + ) + parser.add_argument("--output", type=str, default=None, help="path to output profile") + parser.add_argument( + "--no-skip-missing", + dest="check_missing", + action="store_false", + help="do not check missing shards", + ) + args = parser.parse_args() - - res = detect_profile(args.paths, args.version, args.preset) + + res = detect_profile(args.paths, args.version, args.preset, args.check_missing) if args.output: - with open(args.output, 'w') as f: + with open(args.output, "w") as f: json.dump(res, f, indent=2) else: - print(json.dumps(res, indent=2)) \ No newline at end of file + print(json.dumps(res, indent=2)) diff --git a/woc/get_values.py b/woc/get_values.py new file mode 100644 index 0000000..a8d90f7 --- /dev/null +++ b/woc/get_values.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 + +# SPDX-License-Identifier: GPL-3.0-or-later +# @authors: Runzhi He +# @date: 2024-05-27 + +from typing import Iterable + +from .local import WocMapsLocal + + +def format_map(key: str, map_objs: Iterable) -> str: + return key + ";" + ";".join(map(str, map_objs)) + + +if __name__ == "__main__": + import argparse + import logging + import os + import sys + + parser = argparse.ArgumentParser(description="Get record of various maps") + parser.add_argument("type", type=str, help="The type of the object") + parser.add_argument( + "-p", "--profile", type=str, help="The path to the profile file", default=None + ) + args = parser.parse_args() + + woc = WocMapsLocal(args.profile) + for line in sys.stdin: + try: + key = line.strip() + obj = woc.get_values(args.type, key) + print(format_map(key, obj)) + except BrokenPipeError: + # ref: https://docs.python.org/3/library/signal.html#note-on-sigpipe + devnull = os.open(os.devnull, os.O_WRONLY) + os.dup2(devnull, sys.stdout.fileno()) + sys.exit(1) # Python exits with error code 1 on EPIPE + except Exception as e: + logging.error(f"Error in {key}: {e}", exc_info=True) + continue diff --git a/woc/local.pxd b/woc/local.pxd new file mode 100644 index 0000000..ca8388a --- /dev/null +++ b/woc/local.pxd @@ -0,0 +1,11 @@ +# cython: language_level=3str, wraparound=False, boundscheck=False, nonecheck=False, profile=True, linetrace=True + +from libc.stdint cimport uint32_t, uint8_t + +# Make utility functions accessible from Python -> easier testing +cpdef uint32_t fnvhash(bytes data) +cpdef unber(bytes buf) +cpdef (int, int) lzf_length(bytes raw_data) +cpdef get_tch(str path) +cpdef uint8_t get_shard(bytes key, uint8_t sharding_bits, bint use_fnv_keys) +# cpdef bytes get_from_tch(bytes key, list shards, int sharding_bits, bint use_fnv_keys) \ No newline at end of file diff --git a/woc/local.pyi b/woc/local.pyi 
index 993c8dd..83f5a5a 100644 --- a/woc/local.pyi +++ b/woc/local.pyi @@ -1,24 +1,54 @@ -from typing import Iterable, Union +from typing import Iterable, List, Tuple, Union -from .base import WocMapsBase, WocObjectsWithContent +from .base import WocMapsBase class WocMapsLocal(WocMapsBase): - def __init__(self, - profile_path: str | Iterable[str] | None = None, - version: str | Iterable[str] | None = None - ) -> None: + def __init__( + self, + profile_path: Union[str, Iterable[str], None] = None, + version: Union[str, Iterable[str], None] = None, + ) -> None: + """ + Initialize local WoC maps with a profile. + + :param profile_path: path to the woc profile. + if not provided, use `./wocprofile.json`, `~/.wocprofile.json`, `/etc/wocprofile.json`. + :param version: version of the profile, default to the latest version. + **NOT IMPLEMENTED YET** + """ ... - def get_values( + def _get_tch_bytes( + self, map_name: str, key: Union[bytes, str] + ) -> Tuple[bytes, str]: ... + def _get_pos( self, - map_name: str, + obj_name: str, key: Union[bytes, str], - ): + ) -> Tuple[int, int]: + """ + Get offset and length of a stacked binary object, currently only support blob. + + Extract this part because it's much cheaper than decode the content. + >>> self._get_pos('blob', bytes.fromhex('7a374e58c5b9dec5f7508391246c48b73c40d200')) + (0, 123) + """ ... - def show_content( - self, - obj: WocObjectsWithContent, - key: Union[bytes, str], - ): - ... \ No newline at end of file +# The following functions are internal and should not be used by the user +# Exposing them here for testing purposes +def fnvhash(data: bytes) -> int: ... +def unber(buf: bytes) -> bytes: ... +def lzf_length(raw_data: bytes) -> Tuple[int, int]: ... +def decomp(data: bytes) -> bytes: ... +def decomp_or_raw(data: bytes) -> bytes: ... +def get_tch(path: str): ... +def get_shard(key: bytes, sharding_bits: int, use_fnv_keys: bool) -> int: ... + +# def get_from_tch(key: bytes, shards: List[bytes], sharding_bits: int, use_fnv_keys: bool) -> bytes: ... +def decode_value(value: bytes, out_dtype: str): ... +def decode_tree(value: bytes) -> List[Tuple[str, str, str]]: ... +def decode_commit( + commit_bin: bytes, +) -> Tuple[str, Tuple[str, str, str], Tuple[str, str, str], str]: ... +def decode_str(raw_data: str, encoding="utf-8"): ... 
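
The type stub above and the Cython implementation in `woc/local.pyx` that follows define the local-backend API exercised by the new `tests/test_local.py` and `tests/test_objects.py`. As a quick orientation, here is a minimal usage sketch; it assumes you run from the repository root so that the `tests/test_profile.json` fixture added above resolves (on a WoC server you would instead rely on the default `wocprofile.json` locations listed in the stub's docstring). All names, hashes, and return values below are taken from the tests in this diff, not a definitive reference:

```python
from woc.local import WocMapsLocal
from woc.objects import Commit, Project, init_woc_objects

# Open the maps described by a profile (here: the PR's test fixtures).
woc = WocMapsLocal("tests/test_profile.json")

# Low-level map lookups, as in tests/test_local.py.
print(woc.get_values("c2p", "e4af89166a17785c1d741b8b1d5775f3223f510f"))   # commit -> projects
print(woc.show_content("commit", "e4af89166a17785c1d741b8b1d5775f3223f510f"))
print(woc.count("commit"))                                                  # keys in the commit shards

# Object-oriented wrappers, as in tests/test_objects.py.
init_woc_objects(woc)
c = Commit("e4af89166a17785c1d741b8b1d5775f3223f510f")
print(c.author, c.tree.key, [p.key for p in c.projects])
print(Project("woc-hack_thebridge").head.key)
```
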
diff --git a/woc/local.pyx b/woc/local.pyx index cb81db0..663db8e 100644 --- a/woc/local.pyx +++ b/woc/local.pyx @@ -1,13 +1,17 @@ -# cython: language_level=3str, wraparound=False, boundscheck=False, nonecheck=False +# cython: language_level=3str, wraparound=False, boundscheck=False, nonecheck=False, profile=True, linetrace=True # SPDX-License-Identifier: GPL-3.0-or-later # @authors: Runzhi He # @date: 2024-01-17 import os import json +import logging +import time from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t -from typing import Tuple, Dict, Iterable, List, Union, Literal -import zlib +from libc.string cimport memchr, strstr, strchr, strlen, strncmp +from threading import Lock +from typing import Tuple, Dict, Iterable, List, Union, Literal, Optional +import gzip try: import lzf @@ -15,21 +19,38 @@ try: except ImportError or AssertionError: raise ImportError('python-lzf is required to decompress LZF-compressed data: `pip install python-lzf`') -from .base import WocMapsBase, WocKeyError, WocObjectsWithContent, WocSupportedProfileVersions -from .tch cimport fnvhash, get_from_tch, get_shard +from .base import WocMapsBase, WocObjectsWithContent, WocSupportedProfileVersions +from .tch cimport TCHashDB -cdef unber(bytes buf): +cdef extern from 'Python.h': + object PyBytes_FromStringAndSize(char *s, Py_ssize_t len) + +### Utility functions ### + +cpdef uint32_t fnvhash(bytes data): + """ + Returns the 32 bit FNV-1a hash value for the given data. + >>> hex(fnvhash('foo')) + '0xa9f37ed7' + """ + # PY: 5.8usec Cy: 66.8ns + cdef: + uint32_t hval = 0x811c9dc5 + uint8_t b + for b in data: + hval ^= b + hval *= 0x01000193 + return hval + +cpdef unber(bytes buf): r""" Perl BER unpacking. BER is a way to pack several variable-length ints into one binary string. Here we do the reverse. Format definition: from http://perldoc.perl.org/functions/pack.html (see "w" template description) - Args: - buf (bytes): a binary string with packed values - - Returns: - str: a list of unpacked values + :param buf: a binary string with packed values + :return: a list of unpacked values >>> unber(b'\x00\x83M') [0, 461] @@ -42,27 +63,25 @@ cdef unber(bytes buf): cdef: list res = [] # blob_offset sizes are getting close to 32-bit integer max - uint64_t acc = 0 + uint64_t acc = 0 uint8_t b for b in buf: acc = (acc << 7) + (b & 0x7f) if not b & 0x80: res.append(acc) - acc = 0 + acc = 0 return res -cdef (int, int) lzf_length(bytes raw_data): - r""" Get length of uncompressed data from a header of Compress::LZF - output. Check Compress::LZF sources for the definition of this bit magic - (namely, LZF.xs, decompress_sv) - https://metacpan.org/source/MLEHMANN/Compress-LZF-3.8/LZF.xs +cpdef (int, int) lzf_length(bytes raw_data): + r""" Get length of uncompressed data from a header of Compress::LZF output. 
- Args: - raw_data (bytes): data compressed with Perl Compress::LZF + Check Compress::LZF sources for the definition of this bit magic: + (namely, LZF.xs, decompress_sv) + https://metacpan.org/source/MLEHMANN/Compress-LZF-3.8/LZF.xs - Returns: - Tuple[int, int]: (header_size, uncompressed_content_length) in bytes + :param raw_data: data compressed with Perl `Compress::LZF` + :return: (header_size, uncompressed_content_length) in bytes >>> lzf_length(b'\xc4\x9b') (2, 283) @@ -98,15 +117,13 @@ cdef (int, int) lzf_length(bytes raw_data): def decomp(bytes raw_data): # type: (bytes) -> bytes - """ lzf wrapper to handle perl tweaks in Compress::LZF + """lzf wrapper to handle perl tweaks in `Compress::LZF` + This function extracts uncompressed size header and then does usual lzf decompression. - Args: - raw_data (bytes): data compressed with Perl Compress::LZF - - Returns: - str: unpacked data + :param raw_data: data compressed with Perl `Compress::LZF` + :return: unpacked data """ if not raw_data: return b'' @@ -115,7 +132,18 @@ def decomp(bytes raw_data): start, usize = lzf_length(raw_data) # while it is tempting to include liblzf and link statically, there is # zero advantage comparing to just using python-lzf - return lzf.decompress(raw_data[start:], usize) + _ret = lzf.decompress(raw_data[start:], usize) + + # NOTE: lzf.decompress may return None if it fails + # e.g. blob b0c0dca2eca2160ec81ff10bec565c790e6b2e97, version R + if _ret is not None: + return _ret + # This case should be exetremely rare and indicates a corrupted file + logging.error(f"Failed to decompress: {len(raw_data) - start} bytes of compressed data " + f"does not fit into {usize} bytes") + raise ValueError(f"Failed to decompress: {len(raw_data) - start} bytes of compressed data " + f"does not fit into {usize} bytes") + def decomp_or_raw(bytes raw_data): """ Try to decompress raw_data, return raw_data if it fails""" @@ -132,12 +160,388 @@ def slice20(bytes raw_data): return () return tuple(raw_data[i:i + 20] for i in range(0, len(raw_data), 20)) +def decode_str(bytes raw_data, str encoding='utf-8'): + """ Decode raw_data, detect the encoding if utf-8 fails """ + try: + return raw_data.decode(encoding) + except UnicodeDecodeError: + import chardet # should be rarely used + _encoding = chardet.detect(raw_data)['encoding'] + _ret = raw_data.decode(_encoding, errors='replace') + if len(_ret) == 0: + logging.error(f"Failed to decode: {raw_data[:20]}... 
with encoding {_encoding}") + return _ret + + +### TCH helpers ### + +# Pool of open TokyoCabinet databases to save few milliseconds on opening +cdef dict _TCH_POOL = {} # type: Dict[str, TCHashDB] +TCH_LOCK = Lock() + +cpdef TCHashDB get_tch(str path): + """ Cache TCHashDB objects """ + if path in _TCH_POOL: + return _TCH_POOL[path] + try: + TCH_LOCK.acquire() + # in multithreading environment this can cause race condition, + # so we need a lock + if path not in _TCH_POOL: + # open database in read-only mode and allow concurrent access + _TCH_POOL[path] = TCHashDB(path, ro=True) + finally: + TCH_LOCK.release() + return _TCH_POOL[path] + +cpdef uint8_t get_shard(bytes key, uint8_t sharding_bits, bint use_fnv_keys): + """ Get shard id """ + cdef uint8_t p + if use_fnv_keys: + p = fnvhash(key) + else: + p = key[0] + cdef uint8_t prefix = p & (2**sharding_bits - 1) + return prefix + +# cpdef bytes get_from_tch(bytes key, list shards, int sharding_bits, bint use_fnv_keys): +# """DEPRECATED""" +# # not 100% necessary but there are cases where some tchs are miserably missing +# _shard = get_shard(key, sharding_bits, use_fnv_keys) +# _path = shards[_shard] +# assert _path and os.path.exists(_path), f"shard {_shard} not found at {_path}" +# return get_tch( +# shards[get_shard(key, sharding_bits, use_fnv_keys)].encode('utf-8') +# )[key] + +### deserializers ### + +def decode_value( + value: bytes, + out_dtype: str +): + """ + Decode values from tch maps. + """ + if out_dtype == 'h': # type: list[str] + return [value[i:i + 20].hex() for i in range(0, len(value), 20)] + elif out_dtype == 'sh': # type: tuple[str, str, str] + buf0 = value[0:len(value)-21] + cmt_sha = value[(len(value)-20):len(value)] + (Time, Author) = decode_str(buf0).split(";") + return (Time, Author, cmt_sha.hex()) + elif out_dtype == 'cs3': # type: list[tuple[str, str, str]] + data = decomp(value) + _splited = decode_str(data).split(";") + return [ + (_splited[i],_splited[i+1],_splited[i+2]) + for i in range(0, len(_splited), 3) + ] + elif out_dtype == 'cs': # type: list[str] + data = decomp(value) + return [decode_str(v) + for v in data.split(b';') + if v and v != b'EMPTY'] + elif out_dtype == 's': # type: list[str] + return [decode_str(v) + for v in value.split(b';')] + elif out_dtype == 'r': # type: list[str, int] + _hex = value[:20].hex() + _len = unber(value[20:])[0] + return (_hex, _len) + elif out_dtype == 'hhwww': + raise NotImplemented + raise ValueError(f'Unsupported dtype: {out_dtype}') + +def decode_tree( + value: bytes +) -> List[Tuple[str, str, str]]: + """ + Decode a tree binary object into tuples. + + Python: 4.77 µs, Cython: 280 ns + Reference: https://stackoverflow.com/questions/14790681/ + + >>> decode_tree(b'100644 .gitignore\\x00\\x8e\\x9e\\x1f...') + [('100644', '.gitignore', '8e9e1...'), ...] 
+ """ + files = [] + + cdef: + const char* tree_cstr = value + const char* end = tree_cstr + len(value) + const char* pos = tree_cstr + const char* mode_start + const char* filename_start + const char* hash_start + uint8_t mode_len + uint16_t filename_len # git filenames can be 4096 chars long + + while pos < end: + mode_start = pos + pos = memchr(pos, b' ', end - pos) + if not pos: + raise ValueError('Invalid tree object: missing space after mode') + + mode_len = pos - mode_start + pos += 1 # Skip the space + + filename_start = pos + pos = memchr(pos, b'\x00', end - pos) + if not pos: + raise ValueError('Invalid tree object: missing null byte after filename') + + filename_len = pos - filename_start + pos += 1 # Skip the null byte + + if pos + 20 > end: + raise ValueError('Invalid tree object: missing or truncated hash') + + hash_start = pos + pos += 20 # Skip the 20-byte hash + + files.append(( + value[mode_start - tree_cstr:mode_start - tree_cstr + mode_len].decode('ascii'), + value[filename_start - tree_cstr:filename_start - tree_cstr + filename_len].decode('utf-8'), + value[hash_start - tree_cstr :hash_start - tree_cstr + 20].hex() + )) + + return files + +# def decode_tree( +# value: bytes +# ) -> List[Tuple[str, str, str]]: +# """ +# Decode a tree binary object into tuples +# Reference: https://stackoverflow.com/questions/14790681/ +# mode (ASCII encoded decimal) +# SPACE (\0x20) +# filename +# NULL (\x00) +# 20-byte binary hash +# """ +# _out_buf = [] +# _file_buf = [] +# _curr_buf = bytes() + +# # TODO: current impl is not efficient, need to optimize +# i = 0 +# while i < len(value): +# if value[i] == 0x20: +# _file_buf.append(decode_str(_curr_buf)) +# _curr_buf = bytes() +# elif value[i] == 0x00: +# _file_buf.append(decode_str(_curr_buf)) +# # take next 20 bytes as a hash +# _curr_buf = value[i+1:i+21] +# _file_buf.append(_curr_buf.hex()) +# _out_buf.append(tuple(_file_buf)) +# # clear buffers +# _file_buf = [] +# _curr_buf = bytes() +# i += 20 +# else: +# _curr_buf += bytes([value[i]]) +# i += 1 + +# return _out_buf + +cdef const char* strrchr2(const char* s, char c, const char* end): + """Like strrchr but with a limit""" + cdef const char* p = NULL + while s and s < end: + if s[0] == c: + p = s + s += 1 + return p + +def decode_commit( + commit_bin: bytes +) -> Tuple[str, Tuple[str, str, str], Tuple[str, str, str], str]: + """ + Decode git commit objects into tuples. 
+ + Python: 2.35 µs, Cython: 855 ns + Reference: https://git-scm.com/book/en/v2/Git-Internals-Git-Objects + + >>> decode_commit(b'tree f1b66dcca490b5c4455af319bc961a34f69c72c2\\n...') + ('f1b66dcca490b5c4455af319bc961a34f69c72c2', + ('c19ff598808b181f1ab2383ff0214520cb3ec659',), + ('Audris Mockus 1410029988', '1410029988', '-0400'), + ('Audris Mockus ', '1410029988', '-0400'), + 'News for Sep 5, 2014\\n') + """ + cdef: + const char* cmt_cstr = commit_bin + const char* header + const char* full_msg + const char* line + const char* next_line + const char* key + const char* value + const char* timestamp + const char* timezone + bint is_reading_pgp = False + int header_len + int line_len + + _parent_shas = [] + _tree = '' + _author_bytes = b'' + _author_timestamp = '' + _author_timezone = '' + _committer_bytes = b'' + _committer_timestamp = '' + _committer_timezone = '' + _encoding = 'utf-8' + + if not cmt_cstr or cmt_cstr[0] == b'\0': + raise ValueError('Empty commit object') + + header = cmt_cstr + full_msg = strstr(cmt_cstr, b"\n\n") + if not full_msg: + raise ValueError('Invalid commit object: no \\n\\n') + + header_len = full_msg - header + full_msg += 2 # Skip the '\n\n' + + line = header + while line < header + header_len: + next_line = strchr(line, b'\n') + if not next_line: + next_line = header + header_len + line_len = next_line - line + + if line_len == 0: + line = next_line + 1 + continue + + key = line + value = strchr(line, b' ') + if not value or value >= next_line: + line = next_line + 1 + continue + value += 1 + + if strncmp(key, "tree ", 5) == 0: + _tree = (value[:line_len - 5]).decode('ascii') + elif strncmp(key, "parent ", 7) == 0: + _parent_shas.append(value[:line_len - 7].decode('ascii')) + elif strncmp(key, "author ", 7) == 0: + timezone = strrchr2(value, b' ', next_line) + if not timezone: + continue + timestamp = strrchr2(value, b' ', timezone - 1) + if not timestamp: + continue + _author_bytes = value[:timestamp - value] + _author_timestamp = (value[timestamp - value + 1: timezone - value]).decode('ascii') + _author_timezone = (value[timezone - value + 1: next_line - value]).decode('ascii') + elif strncmp(key, "committer ", 10) == 0: + timezone = strrchr2(value, b' ', next_line) + if not timezone: + continue + timestamp = strrchr2(value, b' ', timezone - 1) + if not timestamp: + continue + _committer_bytes = value[:timestamp - value] + _committer_timestamp = (value[timestamp - value + 1: timezone - value]).decode('ascii') + _committer_timezone = (value[timezone - value + 1: next_line - value]).decode('ascii') + elif strncmp(key, "gpgsig", 6) == 0: + is_reading_pgp = True + elif is_reading_pgp and strncmp(line, "-----END PGP SIGNATURE-----", 27) == 0: + is_reading_pgp = False + elif strncmp(key, "encoding", 8) == 0: + _encoding = value[:line_len - 8].decode('ascii') + + line = next_line + 1 + + _author = decode_str(_author_bytes, _encoding) + _committer = decode_str(_committer_bytes, _encoding) + _message = decode_str(full_msg, _encoding) + + return ( + _tree, + tuple(_parent_shas), + (_author, _author_timestamp, _author_timezone), + (_committer, _committer_timestamp, _committer_timezone), + _message, + ) + +# def decode_commit(cmt: bytes): +# """ +# Decode git commit objects into tuples +# """ +# cmt = decode_str(cmt) +# if cmt.strip() == '': +# raise ValueError('Empty commit object') +# try: +# header, full_msg = cmt.split('\n\n', 1) +# except ValueError: +# raise ValueError('Invalid commit object: no \\n\\n') + +# tree = '' +# parent = [] +# author, 
author_timestamp, author_timezone = '', '', '' +# committer, committer_timestamp, committer_timezone = '', '', '' +# encoding = 'utf-8' +# # parse the header +# _is_reading_pgp = False +# for line in header.split('\n'): +# line = line.strip() +# if line.startswith('tree'): +# tree = line[5:] +# elif line.startswith('parent'): # merge commits have multiple parents +# parent.append(line[7:]) +# elif line.startswith('author'): +# # res['author'], res['author_timestamp'], res['author_timezone'] = line[7:].rsplit(' ', 2) +# author, timestamp, timezone = line[7:].rsplit(' ', 2) +# elif line.startswith('committer'): +# # res['committer'], res['committer_timestamp'], res['committer_timezone'] = line[10:].rsplit(' ', 2) +# committer, timestamp, timezone = line[10:].rsplit(' ', 2) +# elif line.startswith('gpgsig'): +# _is_reading_pgp = True +# elif _is_reading_pgp and line.strip() == '-----END PGP SIGNATURE-----': +# _is_reading_pgp = False +# elif line.startswith('encoding'): +# encoding = line[8:] + +# return ( +# tree, +# tuple(parent), +# (author, author_timestamp, author_timezone), +# (committer, committer_timestamp, committer_timezone), +# full_msg, +# ) + +def read_large(path: str, dtype: str) -> bytes: + """Read a *.large.* and return its content""" + if dtype == 'h': + with open(path, 'rb') as f: + f.seek(20) # 160 bits of SHA1 + return f.read() + else: + # use zlib to decompress + with gzip.open(path, 'rb') as f: + _uncompressed = f.read() + # find first 256 bytes for b'\n', don't scan the whole document + _idx = _uncompressed[:256].find(b'\n') + if _idx > 0: + return _uncompressed[_idx+1:] # a2f + return _uncompressed # b2tac + class WocMapsLocal(WocMapsBase): - def __init__(self, - profile_path: str | Iterable[str] | None = None, - version: str | Iterable[str] | None = None + def __init__(self, + profile_path: Union[str, Iterable[str], None] = None, + version: Union[str, Iterable[str], None] = None ) -> None: + # init logger + self._logger = logging.getLogger(__name__) + # cache logger level + self._is_debug_enabled = self._logger.isEnabledFor(logging.DEBUG) + # load profile if profile_path is None: profile_path = ( @@ -156,194 +560,262 @@ class WocMapsLocal(WocMapsBase): break else: raise FileNotFoundError("No wocprofile.json found in the following paths: {}, " - "run `python3 -m woc detect` to generate".format(profile_path)) + "run `python3 -m woc.detect` to generate".format(profile_path)) # check profile assert self.config["wocSchemaVersion"] in WocSupportedProfileVersions, \ "Unsupported wocprofile version: {}".format(self.config["wocSchemaVersion"]) - assert self.config["maps"], "Run `python3 -m woc detect` to scan data files and generate wocprofile.json" - - @staticmethod - def _read_large(path: str, dtype: str) -> bytes: - """Read a *.large.* and return its content""" - if dtype == 'h': - with open(path, 'rb') as f: - f.seek(20) # 160 bits of SHA1 - return f.read() - else: - # use zlib to decompress - with open(path, 'rb') as f: - _uncompressed = zlib.decompress(f.read()) - # find first 256 bytes for b'\n', don't scan the whole document - _idx = _uncompressed[:256].find(b'\n') - if _idx > 0: - return _uncompressed[_idx+1:] # a2f - return _uncompressed # b2tac - - def _decode_value( - self, - value: bytes, - out_dtype: str - ): - if out_dtype == 'h': # type: list[str] - return [value[i:i + 20].hex() for i in range(0, len(value), 20)] - elif out_dtype == 'sh': # type: tuple[str, str, str] - buf0 = value[0:len(value)-21] - cmt_sha = value[(len(value)-20):len(value)] - (Time, Author) 
= buf0.decode('utf-8').split(";") - return (Time, Author, cmt_sha.hex()) - elif out_dtype == 'cs3': # type: list[tuple[str, str, str]] - data = decomp(value) - _splited = data.decode('utf-8').split(";") - return [ - (_splited[i],_splited[i+1],_splited[i+2]) - for i in range(0, len(_splited), 3) - ] - elif out_dtype == 'cs': # type: list[str] - data = decomp(value) - return [v.decode('utf-8') - for v in data.split(b';') - if v and v != b'EMPTY'] - elif out_dtype == 's': # type: list[str] - return value.decode('utf-8').split(';') - elif out_dtype == 'r': # type: list[str] - data = decomp(value) - return [author.decode('utf-8') for author in (data.split(b';') if data else []) - if author not in self.config['ignoredAuthors']] - elif out_dtype == 'hhwww': - raise NotImplemented - raise ValueError(f'Unsupported dtype: {out_dtype}') + assert self.config["maps"], "Run `python3 -m woc.detect` to scan data files and generate wocprofile.json" - def get_values( - self, - map_name: str, - key: Union[bytes, str], - ): - """Eqivalent to getValues in WoC Perl API - >>> get_values('P2c', 'user2589_minicms') # doctest: +SKIP - ... - """ + # store name of maps and objects + self.maps = set(self.config["maps"].keys()) + self.objects = set(self.config["objects"].keys()) - if map_name in self.config["maps"]: + def _get_tch_bytes( + self, map_name, key + ) -> Tuple[bytes, str]: + """ + Get value (in bytes) from tch maps, return bytes and dtype + """ + # translate obj_name to map_name + if map_name == 'tree': + map_name = 'tree.tch' + elif map_name == 'commit': + map_name = 'commit.tch' + elif map_name == 'blob': + map_name = 'sha1.blob.tch' + + # find dtype object + if map_name in self.maps: _map = self.config["maps"][map_name][0] - elif map_name in self.config["objects"]: + elif map_name in self.objects: _map = self.config["objects"][map_name] else: raise KeyError(f'Invalid map name: {map_name}, ' - f'expect one of {", ".join(self.config["maps"].keys())}') - - if _map["dtypes"][0] == 'h': + f'expected one of {", ".join(self.maps | self.objects)}') + + in_dtype = _map["dtypes"][0] if "dtypes" in _map else "h" + out_dtype = _map["dtypes"][1] if "dtypes" in _map else "c?" # c? 
means maybe compressed + + if self._is_debug_enabled: + start_time = time.time_ns() + self._logger.debug(f"get from tch: {map_name} {key}") + + if in_dtype == 'h': if isinstance(key, str): - _hex = key + hex_str = key key = bytes.fromhex(key) else: - _hex = bytes.hex(key) + hex_str = bytes(key).hex() else: assert isinstance(key, str), "key must be a string for non-hash keys" - _hex = bytes(fnvhash(key.encode('utf-8'))).hex() + hex_str = hex(fnvhash(key.encode('utf-8')))[2:] key = key.encode('utf-8') - if "larges" in _map and _hex in _map["larges"]: - _bytes = self._read_large(_map["larges"][_hex], _map["dtypes"][0]) + if self._is_debug_enabled: + self._logger.debug(f"hash: hex={hex_str} in {(time.time_ns() - start_time) / 1e6:.2f}ms") + start_time = time.time_ns() + + if "larges" in _map and hex_str in _map["larges"]: + _bytes = read_large(_map["larges"][hex_str], out_dtype) + + if self._is_debug_enabled: + self._logger.debug(f"read large: file={_map['larges'][hex_str]} " + f"in {(time.time_ns() - start_time) / 1e6:.2f}ms") + start_time = time.time_ns() + + # compress string data is not compressed in larges + if out_dtype == 'cs': + out_dtype = 's' else: # use fnv hash as shading idx if key is not a git sha - _bytes = get_from_tch(key, _map["shards"], _map["sharding_bits"], _map["dtypes"][0] != 'h') + _shard = get_shard(key, _map["sharding_bits"], in_dtype != 'h') + _path = _map["shards"][_shard] + assert _path, f"shard {_shard} not found at {_path}" - return self._decode_value(_bytes, _map["dtypes"][1]) + _tch = get_tch(_path) + _bytes = _tch[key] - @staticmethod - def _decode_tree( - value: bytes - ) -> list[tuple[str, str, str]]: - """ - Decode a tree binary object into tuples - Reference: https://stackoverflow.com/questions/14790681/ - mode (ASCII encoded decimal) - SPACE (\0x20) - filename - NULL (\x00) - 20-byte binary hash + if self._is_debug_enabled: + self._logger.debug(f"get from tch: shard={_shard} db={_path} " + f"in {(time.time_ns() - start_time) / 1e6:.2f}ms") + + return _bytes, out_dtype + + def get_values( + self, + map_name: str, + key: Union[bytes, str], + ): + """Eqivalent to getValues in WoC Perl API. + >>> self.get_values('P2c', 'user2589_minicms') + ['05cf84081b63cda822ee407e688269b494a642de', ...] """ - _out_buf = [] - _file_buf = [] - _curr_buf = bytes() - - # TODO: current impl is not efficient, need to optimize - i = 0 - while i < len(value): - if value[i] == 0x20: - _file_buf.append(_curr_buf.decode('utf-8')) - _curr_buf = bytes() - elif value[i] == 0x00: - _file_buf.append(_curr_buf.decode('utf-8')) - # take next 20 bytes as a hash - _curr_buf = value[i+1:i+21] - _file_buf.append(_curr_buf.hex()) - _out_buf.append(tuple(_file_buf)) - # clear buffers - _file_buf = [] - _curr_buf = bytes() - i += 20 - else: - _curr_buf += bytes([value[i]]) - i += 1 + _bytes, decode_dtype = self._get_tch_bytes(map_name, key) + + if self._is_debug_enabled: + start_time = time.time_ns() - return _out_buf + _decoded = decode_value(_bytes, decode_dtype) - @staticmethod - def _read_file_with_offset(file_path, offset, length): - with open(file_path, "rb") as f: - f.seek(offset) - return f.read(length) + if self._is_debug_enabled: + self._logger.debug(f"decode: in {(time.time_ns() - start_time) / 1e6:.2f}ms") + return _decoded + + def _get_pos( + self, + obj: str, + key: Union[bytes, str] + ) -> Tuple[int, int]: + """ + Get offset and length of a stacked binary object, currently only support blob. + Move out this part because it's much cheaper than decode the content. 
+ >>> self._get_pos('blob', bytes.fromhex('7a374e58c5b9dec5f7508391246c48b73c40d200')) + (0, 123) + """ + if obj == 'blob': + r_res = unber(self._get_tch_bytes('blob', key)[0]) + assert len(r_res) == 2, f"Invalid (offset, length) pair: {r_res}" + return r_res[0], r_res[1] + else: + raise ValueError(f'Unsupported object type: {obj}, expected blob') + + # def _show_content_bytes( + # self, + # obj_name: str, + # key: Union[bytes, str], + # ): + # start_time = time.time_ns() + # self._logger.debug(f"show_content: {obj_name} {key}") + + # if isinstance(key, str): + # key = bytes.fromhex(key) + + # self._logger.debug(f"hash: {(time.time_ns() - start_time) / 1e6:.2f}ms") + # start_time = time.time_ns() + + # if obj_name == 'tree': + # _map_obj = self.config['objects']['tree.tch'] + # v = get_from_tch(key, + # shards=_map_obj['shards'], + # sharding_bits=_map_obj['sharding_bits'], + # use_fnv_keys=False + # ) + # self._logger.debug(f"get from tch: {(time.time_ns() - start_time) / 1e6:.2f}ms") + # return decomp_or_raw(v) + + # elif obj_name == 'commit': + # _map_obj = self.config['objects']['commit.tch'] + # v = get_from_tch(key, + # shards=_map_obj['shards'], + # sharding_bits=_map_obj['sharding_bits'], + # use_fnv_keys=False + # ) + # self._logger.debug(f"get from tch: {(time.time_ns() - start_time) / 1e6:.2f}ms") + # return decomp_or_raw(v) + + # elif obj_name == 'blob': + # offset, length = self._get_pos('blob', key) + # self._logger.debug(f"get from tch: offset={offset} len={length} {(time.time_ns() - start_time) / 1e6:.2f}ms") + # start_time = time.time_ns() + + # _map_obj = self.config['objects']['blob.bin'] + # shard = get_shard(key, _map_obj['sharding_bits'], use_fnv_keys=False) + + # with open(_map_obj['shards'][shard], "rb") as f: + # f.seek(offset) + # _out_bin = f.read(length) + # self._logger.debug(f"read blob: {(time.time_ns() - start_time) / 1e6:.2f}ms") + # start_time = time.time_ns() + + # return decomp_or_raw(_out_bin) + + # else: + # raise ValueError(f'Unsupported object type: {obj_name}') def show_content( self, - obj: Literal['tree', 'blob', 'commit', 'tkns', 'tag', 'bdiff'], + obj_name: str, key: Union[bytes, str], ): - """Eqivalent to showCnt in WoC perl API - >>> show_content('tree', '7a374e58c5b9dec5f7508391246c48b73c40d200') # doctest: +SKIP - ... """ - if isinstance(key, str): - key = bytes.fromhex(key) - - if obj == 'tree': - _map_obj = self.config['objects']['tree.tch'] - v = get_from_tch(key, - shards=_map_obj['shards'], - sharding_bits=_map_obj['sharding_bits'], - use_fnv_keys=False - ) - return self._decode_tree(decomp_or_raw(v)) - elif obj == 'commit': - _map_obj = self.config['objects']['commit.tch'] - v = get_from_tch(key, - shards=_map_obj['shards'], - sharding_bits=_map_obj['sharding_bits'], - use_fnv_keys=False - ) - return decomp_or_raw(v).decode('utf-8') - elif obj == 'blob': - _map_obj = self.config['objects']['blob.tch'] - v = get_from_tch(key, - shards=_map_obj['shards'], - sharding_bits=_map_obj['sharding_bits'], - use_fnv_keys=False - ) - offset, length = unber(v) + Eqivalent to showCnt in WoC perl API + >>> self.show_content('tree', '7a374e58c5b9dec5f7508391246c48b73c40d200') + [('100644', '.gitignore', '8e9e1...'), ...] 
+ """ + if self._is_debug_enabled: + start_time = time.time_ns() + + if obj_name == 'tree': + _ret = decode_tree(decomp_or_raw(self._get_tch_bytes(obj_name, key)[0])) + if self._is_debug_enabled: + self._logger.debug(f"decode tree: len={len(_ret)} in {(time.time_ns() - start_time) / 1e6:.2f}ms") + return _ret + + elif obj_name == 'commit': + _ret = decode_commit(decomp_or_raw(self._get_tch_bytes(obj_name, key)[0])) + if self._is_debug_enabled: + self._logger.debug(f"decode commit: len={len(_ret)}items in {(time.time_ns() - start_time) / 1e6:.2f}ms") + return _ret + + elif obj_name == 'blob': + key = bytes.fromhex(key) if isinstance(key, str) else key + offset, length = self._get_pos('blob', key) + if self._is_debug_enabled: + self._logger.debug(f"decode pos: offset={offset} len={length} in {(time.time_ns() - start_time) / 1e6:.2f}ms") + start_time = time.time_ns() + + _map_obj = self.config['objects']['blob.bin'] shard = get_shard(key, _map_obj['sharding_bits'], use_fnv_keys=False) - _out_bin = self._read_file_with_offset( - _map_obj['shards'][shard], - offset, - length - ) - return decomp_or_raw(_out_bin).decode('utf-8') - elif obj == 'tkns': + + with open(_map_obj['shards'][shard], "rb") as f: + f.seek(offset) + _out_bin = f.read(length) + if self._is_debug_enabled: + self._logger.debug(f"read blob: in {(time.time_ns() - start_time) / 1e6:.2f}ms") + + return decode_str(decomp_or_raw(_out_bin)) + + elif obj_name == 'tkns': raise NotImplemented - elif obj == 'tag': + elif obj_name == 'tag': raise NotImplemented - elif obj == 'bdiff': + elif obj_name == 'bdiff': raise NotImplemented else: - raise ValueError(f'Unsupported object type: {obj}, expected one of tree, blob, commit, tkns, tag, bdiff') \ No newline at end of file + raise ValueError(f'Unsupported object type: {obj_name}, expected one of tree, blob, commit, tkns, tag, bdiff') + + def count( + self, map_name + ) -> int: + """ + Count the number of keys in a map (# of larges + # of tch keys) + """ + # translate obj_name to map_name + if map_name == 'tree': + map_name = 'tree.tch' + elif map_name == 'commit': + map_name = 'commit.tch' + elif map_name == 'blob': + map_name = 'sha1.blob.tch' + + if self._is_debug_enabled: + start_time = time.time_ns() + + if map_name in self.maps: + _map = self.config["maps"][map_name][0] + elif map_name in self.objects: + _map = self.config["objects"][map_name] + else: + raise KeyError(f'Invalid map name: {map_name}, ' + f'expect one of {", ".join(self.maps | self.objects)}') + + _count = len(_map["larges"]) if "larges" in _map else 0 + for _shard in _map["shards"]: + _tch = get_tch(_shard) + _count += len(_tch) + + if self._is_debug_enabled: + self._logger.debug(f'count: len={_count} shards={len(_map["shards"])} ' + f'larges={len(_map["larges"])} in {(time.time_ns() - start_time) / 1e6:.2f}ms') + return _count diff --git a/woc/objects.py b/woc/objects.py new file mode 100644 index 0000000..93e8682 --- /dev/null +++ b/woc/objects.py @@ -0,0 +1,764 @@ +import difflib +import re +import warnings +from datetime import datetime, timedelta, timezone +from functools import cached_property, lru_cache +from logging import getLogger +from typing import Dict, Generator, List, Optional, Set, Tuple, Union + +from .base import WocMapsBase +from .local import fnvhash + +_global_woc: Optional[WocMapsBase] = None +_logger = getLogger(__name__) +_DAY_Z = datetime.fromtimestamp(0, tz=None) + + +def init_woc_objects(woc: WocMapsBase): + """ + Stores wocMaps object globally so you don't have to pass it around. 
+ + :param woc: a wocMaps object. + """ + global _global_woc + _global_woc = woc + + +@lru_cache(maxsize=None) +def parse_timezone_offset(offset_str: str) -> timezone: + """ + Parse a timezone offset string in the format '+HHMM' or '-HHMM' into a timezone object. + + >>> parse_timezone_offset('+0530') + timezone(timedelta(seconds=19800)) + """ + match = re.match(r"([+-])(\d{2})(\d{2})", offset_str) + if not match: + raise ValueError(f"Invalid timezone offset format: {offset_str}") + sign, hours, minutes = match.groups() + hours, minutes = int(hours), int(minutes) + offset = timedelta(hours=hours, minutes=minutes) + + if sign == "-": + offset = -offset + + return timezone(offset) + + +class _WocObject: + _ident: str + """Identifier of the object""" + woc: WocMapsBase + """WocMap instance""" + key: str + """Key of the object""" + + def __init__( + self, + *args, + woc: Optional[WocMapsBase] = None, + **kwargs, + ): + self.woc = woc or _global_woc + assert ( + self.woc is not None + ), "WocMaps not initialized: call init_woc_objects() or supply a woc keyword argument" + + def __repr__(self) -> str: + return f"{self.__class__.__name__}({self.key})" + + def __str__(self) -> str: + return self.key + + def __eq__(self, value: object) -> bool: + if not isinstance(value, self.__class__): + return False + return self.key == value.key + + @property + def hash(self) -> str: + return hex(hash(self))[2:] + + def _get_list_values(self, map_name: str): + """A thin wrapper around WocMapsBase.get_values to handle KeyError""" + try: + return self.woc.get_values(map_name, self.key) + except KeyError: + return [] + + +class _GitObject(_WocObject): + """Base class for SHA1-indexed Git objects (commit, tree, blob)""" + + def __init__( + self, + key: str, + *args, + woc: Optional[WocMapsBase] = None, + **kwargs, + ): + super().__init__(*args, woc=woc, **kwargs) + assert len(key) == 40, "SHA1 hash must be 40 characters long" + self.key = key + + @cached_property + def data(self): + obj = self.__class__.__name__.lower() + return self.woc.show_content(obj, self.key) + + def __hash__(self): + return int(self.key, 16) + + @property + def hash(self) -> str: + return self.key + + +class _NamedObject(_WocObject): + """Base class for objects indexed by a string key""" + + def __init__( + self, + key: str, + *args, + woc: Optional[WocMapsBase] = None, + **kwargs, + ): + super().__init__(*args, woc=woc, **kwargs) + self.key = key + + def __hash__(self): + return fnvhash(self.key.encode()) + + +class Author(_NamedObject): + _ident = "a" + + @cached_property + def _username_email(self) -> Tuple[str, str]: + _splited = self.key.split(" <", 1) + if len(_splited) == 1: + return _splited[0], "" + return _splited[0], _splited[1][:-1] + + @property + def name(self) -> str: + return self._username_email[0] + + @property + def email(self) -> str: + return self._username_email[1] + + @cached_property + def blobs(self) -> "List[Blob]": + return [Blob(b) for b in self._get_list_values(f"{self._ident}2b")] + + @cached_property + def commits(self) -> "List[Commit]": + return [Commit(c) for c in self._get_list_values(f"{self._ident}2c")] + + @cached_property + def files(self) -> "List[File]": + return [File(f) for f in self._get_list_values(f"{self._ident}2f")] + + @cached_property + def projects(self) -> "List[Project]": + return [Project(p) for p in self._get_list_values(f"{self._ident}2p")] + + @cached_property + def unique_authors(self) -> List["UniqueAuthor"]: + return [UniqueAuthor(a) for a in 
self._get_list_values(f"{self._ident}2A")] + + @property + def authors(self): + raise NotImplementedError("Author object does not have authors method") + + @cached_property + def first_blobs(self) -> List["Blob"]: + return [Blob(b) for b in self._get_list_values(f"{self._ident}2fb")] + + +class UniqueAuthor(Author): + _ident = "A" + + @property + def unique_authors(self) -> "List[Author]": + raise NotImplementedError( + "UniqueAuthor object does not have unique_authors method" + ) + + @cached_property + def authors(self) -> "List[Author]": + return [Author(a) for a in self._get_list_values(f"{self._ident}2a")] + + +class Blob(_GitObject): + _ident = "b" + + @cached_property + def _pos(self) -> Tuple[int, int]: + return self.woc.get_pos("blob", self.key) + + def __len__(self) -> int: + return self._pos[1] + + def __str__(self) -> str: + return self.data + + @cached_property + def commits(self) -> "List[Commit]": + return [Commit(sha) for sha in self._get_list_values("b2c")] + + @cached_property + def first_author(self) -> "Tuple[datetime, Author, Commit]": + """ + Returns the timestamp, author, and commit of the first author. + + >>> woc.get_values('b2fa', '05fe634ca4c8386349ac519f899145c75fff4169')) + (datetime.datetime(2014, 9, 7, 2, 59, 48), Author(Audris Mockus ), Commit(e4af89166a17785c1d741b8b1d5775f3223f510f)) + """ + _out = self.woc.get_values("b2fa", self.key) + _date = datetime.fromtimestamp(int(_out[0])) + _author = Author(_out[1]) + _commit = Commit(_out[2]) + return _date, _author, _commit + + @cached_property + def time_author_commits(self) -> "List[Tuple[datetime, Author, Commit]]": + _out = self._get_list_values("b2tac") + return [ + (datetime.fromtimestamp(int(d[0])), Author(d[1]), Commit(d[2])) for d in _out + ] + + @cached_property + def files(self) -> "List[File]": + return [File(f) for f in self._get_list_values("b2f")] + + @cached_property + def projects_unique(self) -> "List[RootProject]": + return [RootProject(p) for p in self._get_list_values("b2P")] + + @cached_property + def changed_from(self) -> "List[Tuple[Blob, Commit, File]]": + return [ + (Blob(b), Commit(c), File(f)) for b, c, f in self._get_list_values("bb2cf") + ] + + @cached_property + def changed_to(self) -> "List[Tuple[Blob, Commit, File]]": + return [ + (Blob(b), Commit(c), File(f)) for b, c, f in self._get_list_values("obb2cf") + ] + + +class Commit(_GitObject): + _ident = "c" + + @cached_property + def data_obj(self): + _ret = {} + ( + _ret["tree"], + _ret["parent"], + (_ret["author"], _ret["author_timestamp"], _ret["author_timezone"]), + (_ret["committer"], _ret["committer_timestamp"], _ret["committer_timezone"]), + _ret["message"], + ) = self.data + return _ret + + @property + def author(self) -> Author: + return Author(self.data_obj["author"]) + + @property + def authored_at(self) -> datetime: + tz = parse_timezone_offset(self.data_obj["author_timezone"]) + return datetime.fromtimestamp(int(self.data_obj["author_timestamp"]), tz) + + @property + def committer(self) -> Author: + return Author(self.data_obj["committer"]) + + @property + def committed_at(self) -> datetime: + tz = parse_timezone_offset(self.data_obj["committer_timezone"]) + return datetime.fromtimestamp(int(self.data_obj["committer_timestamp"]), tz) + + @property + def full_message(self) -> str: + """Full message of the commit""" + return self.data_obj["message"] + + @property + def message(self) -> str: + """Short message of the commit""" + return self.data_obj["message"].split("\n", 1)[0] + + @cached_property + def tree(self) -> 
"Tree": + return Tree(self.data_obj["tree"]) + + @property + def _parent_shas(self) -> List[str]: + return self.data_obj["parent"] + + @property + def parents(self) -> List["Commit"]: + """Parent commits of this commit""" + return [Commit(p) for p in self.data_obj["parent"]] + + @cached_property + def projects(self) -> List["Project"]: + """Projects associated with this commit""" + return [Project(p) for p in self._get_list_values("c2p")] + + @cached_property + def root_projects(self) -> List["RootProject"]: + """Root projects associated with this commit""" + return [RootProject(p) for p in self._get_list_values("c2P")] + + @cached_property + def children(self) -> List["Commit"]: + """Children of this commit""" + return [Commit(c) for c in self._get_list_values("c2cc")] + + @cached_property + def _file_names(self) -> List[str]: + return self._get_list_values("c2f") + + @cached_property + def _file_set(self) -> Set[str]: + return set(self._file_names) + + @cached_property + def files(self) -> List["File"]: + """Files changed in this commit""" + return [File(f) for f in self._file_names] + + @cached_property + def _blob_shas(self) -> List[str]: + return self._get_list_values("c2b") + + @cached_property + def _blob_set(self) -> Set[str]: + return set(self._blob_shas) + + @cached_property + def blobs(self) -> List["Blob"]: + """ + Blobs changed in this commit. + + This relation is known to miss every first file in all trees. + Consider using Commit.tree.blobs as a slower but more accurate + alternative. + """ + return [Blob(b) for b in self._get_list_values("c2b")] + + @cached_property + def time_author(self) -> Tuple[datetime, Author]: + """Timestamp and author of the commit""" + res = self.woc.get_values("c2ta", self.key) + return datetime.fromtimestamp(int(res[0])), Author(res[1]) + + @cached_property + def root(self) -> "Tuple[Commit, int]": + """Root commit of the project""" + sha, dis = self.woc.get_values("c2r", self.key) + return Commit(sha), int(dis) + + @cached_property + def changeset(self) -> "List[Tuple[File, Blob, Blob]]": + """Returns changed files, their new and old blobs""" + return [ + (File(f), Blob(new), Blob(old)) + for f, new, old in self._get_list_values("c2fbb") + ] + + def compare( + self, parent: Union["Commit", str], threshold=0.5 + ) -> Generator[ + Tuple[Optional["File"], Optional["File"], Optional["Blob"], Optional["Blob"]], + None, + None, + ]: + """ + Compare two Commits. + + :param parent: another commit to compare to. + Expected order is `diff = child_commit - parent_commit` + + :return: a generator of 4-tuples `(old_path, new_path, old_sha, new_sha)` + + Examples: + - a new file 'setup.py' was created: + `(None, 'setup.py', None, 'file_sha')` + - an existing 'setup.py' was deleted: + `('setup.py', None, 'old_file_sha', None)` + - setup.py.old was renamed to setup.py, content unchanged: + `('setup.py.old', 'setup.py', 'file_sha', 'file_sha')` + - setup.py was edited: + `('setup.py', 'setup.py', 'old_file_sha', 'new_file_sha')` + - setup.py.old was edited and renamed to setup.py: + `('setup.py.old', 'setup.py', 'old_file_sha', 'new_file_sha')` + + Detecting the last one is computationally expensive. You can adjust this + behaviour by passing the `threshold` parameter, which is 0.5 by default. + It means that if roughly 50% of the file content is the same, + it is considered a match. `threshold=1` means that only exact + matches are considered, effectively disabling this comparison. 
+ If threshold is set to 0, any pair of deleted and added file will be + considered renamed and edited; this last case doesn't make much sense so + don't set it too low. + """ + if isinstance(parent, str): + parent = Commit(parent) + if not isinstance(parent, Commit): + raise TypeError("parent must be a Commit or a commit hash") + + # # filename: (blob sha before, blob sha after) + # new_files = self.tree._file_blob_map + # new_paths = self.tree._file_set + # old_files = parent.tree._file_blob_map + # old_paths = parent.tree._file_set + + # !!! We really need to traverse the trees ### + new_files: Dict[File, Blob] = {} + for f, b in self.tree.traverse(): + new_files[f] = b + old_files: Dict[File, Blob] = {} + for f, b in parent.tree.traverse(): + old_files[f] = b + + # unchanged_paths + for f in new_files.keys() & old_files.keys(): + if new_files[f] != old_files[f]: + # i.e. Blob sha Changed! + yield f, f, old_files[f], new_files[f] + + added_paths: Set[File] = new_files.keys() - old_files.keys() + deleted_paths: Set[File] = old_files.keys() - new_files.keys() + + if threshold >= 1: # i.e. only exact matches are considered + for f in added_paths: # add + yield None, f, None, new_files[f] + for f in deleted_paths: + yield f, None, old_files[f], None + return + + if parent.hash not in self._parent_shas: + warnings.warn( + "Comparing non-adjacent commits might be " + "computationally expensive. Proceed with caution." + ) + + # search for matches + sm = difflib.SequenceMatcher() + # for each added blob, try to find a match in deleted blobs + # if there is a match, signal a rename and remove from deleted + # if there is no match, signal a new file + # unused deleted blobs are indeed deleted + for added_file, added_blob in new_files.items(): + sm.set_seq1(added_blob.data) + matched = False + for deleted_file, deleted_blob in old_files.items(): + sm.set_seq2(deleted_blob.data) + # use quick checks first (lower bound by length diff) + if ( + sm.real_quick_ratio() > threshold + and sm.quick_ratio() > threshold + and sm.ratio() > threshold + ): + yield deleted_file, added_file, deleted_blob, added_blob + del old_files[deleted_file] + matched = True + break + if not matched: # this is a new file + yield None, added_file, None, added_blob + + for deleted_file, deleted_blob in old_files.items(): + yield deleted_file, None, deleted_blob, None + + def __sub__(self, parent: "Commit"): + return self.compare(parent) + + +class File(_NamedObject): + _ident = "f" + + @property + def path(self) -> str: + return self.key + + @property + def name(self) -> str: + return self.key.split("/")[-1] + + @cached_property + def authors(self) -> List[Author]: + return [Author(a) for a in self._get_list_values("f2a")] + + @cached_property + def blobs(self) -> List[Blob]: + return [Blob(b) for b in self._get_list_values("f2b")] + + @cached_property + def commits(self) -> List[Commit]: + return [Commit(c) for c in self._get_list_values("f2c")] + + +class Tree(_GitObject): + _ident = "t" + + @cached_property + def data(self) -> str: + return self.woc.show_content("tree", self.key) + + @property + def _file_names(self) -> List[str]: + return [l[1] for l in self.data] + + @cached_property + def _file_set(self) -> Set[str]: + return {l[1] for l in self.data} + + @property + def files(self) -> List["File"]: + return [File(f) for f in self._file_names] + + @property + def _blob_shas(self) -> List[str]: + return [l[2] for l in self.data] + + @cached_property + def _blob_set(self) -> Set[str]: + return {l[2] for l in self.data} + + 
@property + def blobs(self) -> List["Blob"]: + return [Blob(b) for b in self._blob_shas] + + @cached_property + def _file_blob_map(self) -> Dict[str, str]: + return {l[1]: l[2] for l in self.data} + + def _traverse(self) -> "Generator[Tuple[str, str], None, None]": + for mode, fname, sha in self.data: + # trees are always 40000: + # https://stackoverflow.com/questions/1071241 + if mode != "40000": + yield fname, sha + else: + _logger.debug(f"traverse: into {fname} ({sha})") + for _fname, _sha in Tree(sha)._traverse(): + yield fname + "/" + _fname, _sha + + def traverse(self) -> "Generator[Tuple[File, Blob], None, None]": + for fname, sha in self._traverse(): + yield File(fname), Blob(sha) + + def __contains__(self, item: Union[str, File, Blob]) -> bool: + if isinstance(item, str): + return item in self._file_names or item in self._blob_shas + if isinstance(item, File): + return item.text in self._file_names + if isinstance(item, Blob): + return item.hex in self._blob_shas + return False + + def __str__(self) -> str: + return "\n".join([" ".join(l) for l in self.data]) + + def __len__(self) -> int: + return len(self.data) + + def __iter__(self) -> "Generator[Tuple[File, Blob], None, None]": + for l in self.data: + yield File(l[1]), Blob(l[2]) + + +class Project(_NamedObject): + _ident = "p" + + @cached_property + def _platform_repo(self) -> str: + URL_PREFIXES = self.woc.config["sites"] + prefix, body = self.key.split("_", 1) + if prefix == "sourceforge.net": + platform = URL_PREFIXES[prefix] + elif prefix in URL_PREFIXES and "_" in body: + platform = URL_PREFIXES[prefix] + body = body.replace("_", "/", 1) + else: + platform = "github.com" + body = self.key.replace("_", "/", 1) + return platform, body + + @property + def url(self) -> str: + """ + Get the URL for a given project URI. + + >>> Project('CS340-19_lectures').url + 'http://github.com/CS340-19/lectures' + """ + platform, body = self._platform_repo + URL_PREFIXES = self.woc.config["sites"] + if platform in URL_PREFIXES: + return f"https://{URL_PREFIXES[platform]}/{body}" + return f"https://{platform}/{body}" + + @cached_property + def authors(self) -> "List[Author]": + return [Author(a) for a in self._get_list_values(f"{self._ident}2a")] + + @cached_property + def _commit_shas(self) -> "List[str]": + return self._get_list_values(f"{self._ident}2c") + + @cached_property + def _commit_set(self) -> "Set[str]": + return self._commit_map.keys() + + @cached_property + def _commit_map(self) -> "Dict[str, Commit]": + return {c.hash: c for c in self.commits} + + @cached_property + def commits(self) -> "List[Commit]": + return [Commit(c) for c in self._commit_shas] + + @cached_property + def root_projects(self) -> "List[RootProject]": + return [RootProject(p) for p in self._get_list_values(f"{self._ident}2P")] + + def __contains__(self, item: Union[str, Commit]) -> bool: + if isinstance(item, str): + return item in self._commit_set + elif isinstance(item, Commit): + return item.hash in self._commit_set + return False + + @cached_property + def head(self) -> "Commit": + """ + Get the HEAD commit of the repository. + + >>> Project('user2589_minicms').head + Commit(f2a7fcdc51450ab03cb364415f14e634fa69b62c) + >>> Project('RoseTHERESA_SimpleCMS').head + Commit(a47afa002ccfd3e23920f323b172f78c5c970250) + """ + # Sometimes (very rarely) commit dates are wrong, so the latest commit + # is not actually the head. 
The magic below is to account for this + parents = set().union(*(c._parent_shas for c in self.commits)) + heads = [self._commit_map[c] for c in self._commit_set - parents] + + # it is possible that there is more than one head. + # E.g. it happens when HEAD is moved manually (git reset) + # and continued with a separate chain of commits. + # in this case, let's just use the latest one + # actually, storing refs would make it much simpler + _heads_sorted = sorted(heads, key=lambda c: c.authored_at or _DAY_Z, reverse=True) + if len(_heads_sorted) == 0: + raise ValueError("No head commit found") + return _heads_sorted[0] + + @cached_property + def tail(self) -> "Commit": + """ + Get the first commit SHA by following first parents. + + >>> Project(b'user2589_minicms').tail + Commit(1e971a073f40d74a1e72e07c682e1cba0bae159b) + """ + pts = {c._parent_shas[0] for c in self.commits if c._parent_shas} + for c in self.commits: + if c.hash in pts and not c._parent_shas: + return c + + @cached_property + def earliest_commit(self) -> "Commit": + """Get the earliest commit of the repository""" + return min(self.commits, key=lambda c: c.authored_at or _DAY_Z) + + @cached_property + def latest_commit(self) -> "Commit": + """Get the latest commit of the repository""" + return max(self.commits, key=lambda c: c.authored_at or _DAY_Z) + + def commits_fp(self) -> Generator["Commit", None, None]: + """ + Get a commit chain by following only the first parent. + + Mimic https://git-scm.com/docs/git-log#git-log---first-parent. + Thus, you only get a small subset of the full commit tree. + + >>> p = Project(b'user2589_minicms') + >>> set(c.sha for c in p.commits_fp).issubset(p.commit_shas) + True + + In scenarios where branches are not important, it can save a lot + of computing. + + Yields: + Commit: binary commit shas, following first parent only, + from the latest to the earliest. + """ + # Simplified version of self.head(): + # - slightly less precise, + # - 20% faster + # + # out of 500 randomly sampled projects, 493 had the same head. 
+ # In the remaining 7: + # 2 had the same commit chain length, + # 3 had one more commit + # 1 had two more commits + # 1 had three more commits + # Execution time: + # simplified version (argmax): ~153 seconds + # self.head(): ~190 seconds + + # at this point we know all commits are in the dataset + # (validated in __iter___) + commit = self.latest_commit + + while commit: + # no point try-except: the truth value of a list is len(list) + first_parent = commit._parent_shas and commit._parent_shas[0] + yield commit + if not first_parent: + break + commit = self._commit_map.get(first_parent, Commit(first_parent)) + + def __iter__(self) -> "Generator[Commit, None, None]": + for c in self.commits: + try: + if c.author in self.woc.config["ignoredAuthors"]: + continue + yield c + except KeyError: + pass + + @property + def projects(self) -> List["Project"]: + raise NotImplementedError("Project object does not have projects method") + + +class RootProject(Project): + _ident = "P" + + @cached_property + def unique_authors(self) -> "List[Author]": + return [UniqueAuthor(a) for a in self._get_list_values(f"{self._ident}2A")] + + @cached_property + def commits(self) -> "List[Commit]": + return [Commit(c) for c in self._get_list_values(f"{self._ident}2C")] + + @cached_property + def projects(self) -> "List[Project]": + return [Project(p) for p in self._get_list_values(f"{self._ident}2p")] + + @property + def root_projects(self) -> List["RootProject"]: + raise NotImplementedError("RootProject object does not have root_projects method") diff --git a/woc/show_content.py b/woc/show_content.py new file mode 100644 index 0000000..58f03d2 --- /dev/null +++ b/woc/show_content.py @@ -0,0 +1,145 @@ +#!/usr/bin/env python3 + +# SPDX-License-Identifier: GPL-3.0-or-later +# @authors: Runzhi He +# @date: 2024-05-27 + +from .local import WocMapsLocal, decode_commit, decode_str, decomp_or_raw + + +def format_tree(tree_objs: list) -> str: + _out = "" + for line in tree_objs: + _out += f"{line[0]};{line[2]};{line[1]}\n" + return _out + + +def format_commit(sha: str, cmt_bin: bytes, format: int = 0): + if format == 3: # raw + cmt = decode_str(cmt_bin) + return cmt + + if format == 7: # commit sha; base64(raw) + import base64 + + _b64 = base64.b64encode(cmt_bin).decode() + # mock linux's base64, add newline every 76 characters + _b64 = "\\n".join([_b64[i : i + 76] for i in range(0, len(_b64), 76)]) + "\\n" + return sha + ";" + _b64 + + ( + tree_sha, + parents, + (author, author_timestamp, author_timezone), + (committer, committer_timestamp, committer_timezone), + commit_msg, + ) = decode_commit(cmt_bin) + parent_sha = parents[0] # only the first parent + + if ( + format == 0 + ): # commit SHA;tree SHA;parent commit SHA;author;committer;author timestamp;commit timestamp + return ";".join( + [ + sha, + tree_sha, + parent_sha, + author, + committer, + author_timestamp, + committer_timestamp, + ] + ) + + elif format == 1: # commit SHA;author timestamp;author + return ";".join([sha, author_timestamp, author]) + + elif ( + format == 2 + ): # commit SHA;author;author timestamp; author timezone;commit message + return ";".join([sha, author, author_timestamp, author_timezone, commit_msg]) + + elif format == 4: # commit SHA;author + return ";".join([sha, author]) + + elif format == 5: # commit SHA; parent commit SHA + return ";".join([sha, parent_sha]) + + elif ( + format == 6 + ): # commit SHA;author timestamp;author timezone;author;tree sha;parent sha + return ";".join( + [sha, author_timestamp, author_timezone, author, 
tree_sha, parent_sha] + ) + + elif ( + format == 8 + ): # commit sha; author timestamp; commit timestamp; author; committer; parent sha + return ";".join( + [sha, author_timestamp, committer_timestamp, author, committer, parent_sha] + ) + + elif ( + format == 9 + ): # commit sha; tree sha; parent sha; author; committer; author timestamp; commit timestamp; author timezone; committer timezone; commit message + return ";".join( + [ + sha, + tree_sha, + parent_sha, + author, + committer, + author_timestamp, + committer_timestamp, + author_timezone, + committer_timezone, + commit_msg, + ] + ) + + else: + raise ValueError(f"Invalid format {format}") + + +if __name__ == "__main__": + import argparse + import logging + import os + import sys + + parser = argparse.ArgumentParser( + description="See the Content of Git Object", + usage="echo | %(prog)s type (format)", + ) + parser.add_argument("type", type=str, help="The type of the object") + parser.add_argument( + "format", type=int, help="The format of the object", default=0, nargs="?" + ) + parser.add_argument( + "-p", "--profile", type=str, help="The path to the profile file", default=None + ) + args = parser.parse_args() + + woc = WocMapsLocal(args.profile) + for line in sys.stdin: + try: + key = line.strip() + if args.type == "commit": + obj_bin = decomp_or_raw(woc._get_tch_bytes(args.type, key)[0]) + print(format_commit(key, obj_bin, args.format)) + elif args.type == "tree": + obj = woc.show_content(args.type, key) + print(format_tree(obj)) + elif args.type == "blob": + obj = woc.show_content(args.type, key) + print(obj) + else: + raise ValueError(f"Invalid object type {args.type}") + except BrokenPipeError: + # ref: https://docs.python.org/3/library/signal.html#note-on-sigpipe + devnull = os.open(os.devnull, os.O_WRONLY) + os.dup2(devnull, sys.stdout.fileno()) + sys.exit(1) # Python exits with error code 1 on EPIPE + except Exception as e: + logging.error(f"Error in {key}: {e}", exc_info=True) + continue diff --git a/woc/tch.pxd b/woc/tch.pxd index 6b74a64..0170251 100644 --- a/woc/tch.pxd +++ b/woc/tch.pxd @@ -1,7 +1,16 @@ -# cython: language_level=3str, wraparound=False, boundscheck=False, nonecheck=False +# cython: language_level=3str, wraparound=False, boundscheck=False, nonecheck=False, profile=True, linetrace=True -from libc.stdint cimport uint8_t, uint32_t +cdef extern from 'tchdb.h': + ctypedef struct TCHDB: # type of structure for a hash database + pass -cdef uint32_t fnvhash(bytes data) -cpdef uint8_t get_shard(bytes key, uint8_t sharding_bits, bint use_fnv_keys) -cpdef bytes get_from_tch(bytes key, list shards, int sharding_bits, bint use_fnv_keys) \ No newline at end of file +cdef class TCHashDB: + cdef TCHDB* _db + cdef str filename + + """Object representing a Tokyocabinet Hash table""" + cpdef bytes get(self, bytes key) + cpdef void put(self, bytes key, bytes value) except * + cpdef void delete(self, bytes key) except * + cpdef void drop(self) except * + cpdef void close(self) except * diff --git a/woc/tch.pyi b/woc/tch.pyi index a33c82c..c2ea3fa 100644 --- a/woc/tch.pyi +++ b/woc/tch.pyi @@ -1,2 +1,62 @@ -def get_shard(key: bytes, sharding_bits: int, use_fnv_keys: bool) -> int: ... -def get_from_tch(key: bytes, shards: list[str], sharding_bits: int, use_fnv_keys: bool) -> bytes: ... \ No newline at end of file +from typing import Iterator + +class TCHashDB: + """Object representing a TokyoCabinet Hash table""" + + def __init__(self, path: str, ro: bool = False) -> None: + """ + Create a new TokyoCabinet hash table object. 
+ + :param path: path to the database file + :param ro: if True, open in lock-free read-only mode; if False, lock and open in write mode (create if not exists) + :raises OSError: if the database cannot be opened + """ + ... + + def __iter__(self) -> "Iterator[bytes]": ... + def get(self, key: bytes) -> bytes: + """ + Get a record. + + :raises KeyError: if the key is not found + :raises OSError: if the operation fails + """ + ... + + def put(self, key: bytes, value: bytes) -> None: + """ + Upsert a record. + + :raises OSError: if the operation fails + """ + ... + + def delete(self, key: bytes) -> None: + """ + Delete a record from the database. + + :raises OSError: if the operation fails + """ + ... + + def drop(self) -> None: + """ + Delete all records in the database. + + :raises OSError: if the operation fails + """ + ... + + def close(self) -> None: + """ + Close the database. + + :raises OSError: if the operation fails + """ + ... + + def __getitem__(self, key: bytes) -> bytes: ... + def __setitem__(self, key: bytes, value: bytes) -> None: ... + def __delitem__(self, key: bytes) -> None: ... + def __len__(self) -> int: ... + def __del__(self) -> None: ... diff --git a/woc/tch.pyx b/woc/tch.pyx index 43e1122..bc0792a 100644 --- a/woc/tch.pyx +++ b/woc/tch.pyx @@ -1,4 +1,4 @@ -# cython: language_level=3str, wraparound=False, boundscheck=False, nonecheck=False +# cython: language_level=3str, wraparound=False, boundscheck=False, nonecheck=False, profile=True, linetrace=True # SPDX-License-Identifier: GPL-3.0-or-later # @authors: Runzhi He @@ -6,9 +6,6 @@ from libc.stdint cimport uint8_t, uint32_t, uint64_t from libc.stdlib cimport free -from threading import Lock - -from .base import WocKeyError cdef extern from 'Python.h': object PyBytes_FromStringAndSize(char *s, Py_ssize_t len) @@ -18,51 +15,48 @@ cdef extern from 'tchdb.h': pass cdef enum: # enumeration for open modes - HDBOREADER = 1 << 0, # open as a reader - HDBONOLCK = 1 << 4, # open without locking - - const char *tchdberrmsg(int ecode) - TCHDB *tchdbnew() - int tchdbecode(TCHDB *hdb) + HDBOREADER = 1 << 0, # open as a reader + HDBOWRITER = 1 << 1, # open as a writer + HDBOCREAT = 1 << 2, # writer creating + HDBOTRUNC = 1 << 3, # writer truncating + HDBONOLCK = 1 << 4, # open without locking + + const char *tchdberrmsg(int ecode) # Get the message string corresponding to an error code + TCHDB *tchdbnew() # Create a hash database object + int tchdbecode(TCHDB *hdb) # Set the error code of a hash database object bint tchdbopen(TCHDB *hdb, const char *path, int omode) - bint tchdbclose(TCHDB *hdb) + bint tchdbclose(TCHDB *hdb) # Close a hash database object + void tchdbdel(TCHDB *hdb) # Delete a hash database object void *tchdbget(TCHDB *hdb, const void *kbuf, int ksiz, int *sp) - bint tchdbiterinit(TCHDB *hdb) - void *tchdbiternext(TCHDB *hdb, int *sp) - -cdef uint32_t fnvhash(bytes data): - """ - Returns the 32 bit FNV-1a hash value for the given data. 
-    >>> hex(fnvhash('foo'))
-    '0xa9f37ed7'
-    """
-    # PY: 5.8usec Cy: 66.8ns
-    cdef:
-        uint32_t hval = 0x811c9dc5
-        uint8_t b
-    for b in data:
-        hval ^= b
-        hval *= 0x01000193
-    return hval
-
-
-cdef class Hash:
+    bint tchdbiterinit(TCHDB *hdb) # Initialize the iterator of a hash database object
+    void *tchdbiternext(TCHDB *hdb, int *sp) # Get the next key of the iterator of a hash database object
+    bint tchdbput(TCHDB *hdb, const void *kbuf, int ksiz, const void *vbuf, int vsiz) # Store a new record into a hash database object
+    bint tchdbout(TCHDB *hdb, const void *kbuf, int ksiz) # Remove a record of a hash database object
+    uint64_t tchdbrnum(TCHDB *hdb) # Get the number of records of a hash database object
+    bint tchdbvanish(TCHDB *hdb) # Remove all records of a hash database object
+
+cdef class TCHashDB:
     """Object representing a Tokyocabinet Hash table"""
-    cdef TCHDB* _db
-    cdef bytes filename
 
-    def __cinit__(self, char *path, nolock=True):
-        cdef int mode = HDBOREADER
-        if nolock:
+    def __cinit__(self, str path, bint ro=False):
+        self.filename = path
+        _encoded = path.encode()
+        cdef char* dbpath = _encoded
+
+        cdef int mode = 0
+        if not ro: # write mode: create if not exists
+            mode |= HDBOWRITER
+            mode |= HDBOCREAT
+        else: # read mode: disable locks
+            mode |= HDBOREADER
             mode |= HDBONOLCK
+
         self._db = tchdbnew()
-        self.filename = path
         if self._db is NULL:
             raise MemoryError()
-        cdef bint result = tchdbopen(self._db, path, mode)
+        cdef bint result = tchdbopen(self._db, dbpath, mode)
         if not result:
-            raise IOError('Failed to open .tch file "%s": ' % self.filename
-                          + self._error())
+            raise IOError(f'Failed to open {self.filename}: ' + self._error())
 
     def _error(self):
         cdef int code = tchdbecode(self._db)
@@ -76,71 +70,74 @@ cdef class Hash:
             int sp
             bytes key
         if not result:
-            raise IOError('Failed to iterate .tch file "%s": ' % self.filename
-                          + self._error())
+            raise IOError(f'Failed to iterate {self.filename}: ' + self._error())
         while True:
             buf = tchdbiternext(self._db, &sp)
             if buf is NULL:
                 break
-            key = PyBytes_FromStringAndSize(buf, sp)
+            key = PyBytes_FromStringAndSize(buf, sp)
             free(buf)
             yield key
 
-    cdef bytes read(self, bytes key):
+    cpdef bytes get(self, bytes key):
         cdef:
-            char *k = key
+            char *k = key
             char *buf
             int sp
             int ksize=len(key)
         buf = tchdbget(self._db, k, ksize, &sp)
         if buf is NULL:
-            raise WocKeyError(key, self.filename.decode('utf-8'))
-        cdef bytes value = PyBytes_FromStringAndSize(buf, sp)
+            raise KeyError(f'Key {key.hex()} not found in {self.filename}')
+        cdef bytes value = PyBytes_FromStringAndSize(buf, sp)
         free(buf)
         return value
 
-    def __getitem__(self, bytes key):
-        return self.read(key)
+    cpdef void put(self, bytes key, bytes value) except *:
+        cdef:
+            char *k = key
+            int ksize = len(key)
+            char *v = value
+            int vsize = len(value)
+            bint result
+        result = tchdbput(self._db, k, ksize, v, vsize)
+        if not result:
+            raise IOError(f'Failed to put {key.hex()} in {self.filename}: ' + self._error())
 
-    def __del__(self):
+    cpdef void delete(self, bytes key) except *:
+        cdef:
+            char *k = key
+            int ksize = len(key)
+            bint result
+        result = tchdbout(self._db, k, ksize)
+        if not result:
+            raise IOError(f'Failed to delete {key.hex()} in {self.filename}: ' + self._error())
+
+    cpdef void drop(self) except *:
+        cdef:
+            bint result
+        result = tchdbvanish(self._db)
+        if not result:
+            raise IOError(f'Failed to drop all records in {self.filename}: ' + self._error())
+
+    cpdef void close(self) except *:
         cdef bint result = tchdbclose(self._db)
         if not result:
-            raise IOError('Failed to close .tch "%s": ' % self.filename
-                          + self._error())
+            raise IOError(f'Failed to close {self.filename}: ' + self._error())
+
+    def __getitem__(self, bytes key):
+        return self.get(key)
+
+    def __setitem__(self, bytes key, bytes value):
+        self.put(key, value)
+
+    def __delitem__(self, bytes key):
+        self.delete(key)
+
+    def __len__(self):
+        return tchdbrnum(self._db)
+
+    def __del__(self):
+        self.close()
 
     def __dealloc__(self):
-        free(self._db)
-
-
-# Pool of open TokyoCabinet databases to save few milliseconds on opening
-cdef dict _TCH_POOL = {} # type: Dict[str, Hash]
-TCH_LOCK = Lock()
-
-cdef _get_tch(char *path):
-    """ Cache Hash() objects """
-    if path in _TCH_POOL:
-        return _TCH_POOL[path]
-    try:
-        TCH_LOCK.acquire()
-        # in multithreading environment this can cause race condition,
-        # so we need a lock
-        if path not in _TCH_POOL:
-            _TCH_POOL[path] = Hash(path)
-    finally:
-        TCH_LOCK.release()
-    return _TCH_POOL[path]
-
-cpdef uint8_t get_shard(bytes key, uint8_t sharding_bits, bint use_fnv_keys):
-    """ Get shard id """
-    cdef uint8_t p
-    if use_fnv_keys:
-        p = fnvhash(key)
-    else:
-        p = key[0]
-    cdef uint8_t prefix = p & (2**sharding_bits - 1)
-    return prefix
-
-cpdef bytes get_from_tch(bytes key, list shards, int sharding_bits, bint use_fnv_keys):
-    return _get_tch(
-        shards[get_shard(key, sharding_bits, use_fnv_keys)].encode('utf-8')
-    )[key]
+        free(self._db) # it should never be null
diff --git a/woc/wocprofile.default.json b/woc/wocprofile.default.json
index aa3b548..e58780b 100644
--- a/woc/wocprofile.default.json
+++ b/woc/wocprofile.default.json
@@ -20,15 +20,19 @@
     "dat": "colon_seperated_data",
     "tch": "compressed_data",
     "bin": "binary_data",
-    "idx": "binary_index"
+    "idx": "binary_index",
+    "bb": "new_blob",
+    "obb": "old_blob",
+    "fb": "first_blob"
   },
   "dtypes": {
     "h": "hex",
-    "s": "string",
-    "cs": "compressed_string",
-    "sh": "string_hex",
+    "s": "str",
+    "cs": "[compressed]str",
+    "sh": "str_hex",
     "hhwww": "hex_hex_url",
-    "cs3": "compressed_3_strings"
+    "r": "hex_berint",
+    "cs3": "[compressed]str_str_str"
   },
   "sites": {
     "bitbucket.org": "bitbucket.org",