diff --git a/README.md b/README.md index 5a4febc..e1adb2a 100644 --- a/README.md +++ b/README.md @@ -34,16 +34,30 @@ RAGLite is a Python package for Retrieval-Augmented Generation (RAG) with Postgr ## Installing -To install this package (including Metal acceleration if on macOS), run: +First, begin by installing SpaCy's multilingual sentence model: ```sh -pip install raglite +# Install SpaCy's xx_sent_ud_sm: +pip install https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.7.0/xx_sent_ud_sm-3.7.0-py3-none-any.whl +``` + +Next, it is optional but recommended to install [an accelerated llama-cpp-python precompiled binary](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends) with: + +```sh +# Configure which llama-cpp-python precompiled binary to install, choosing exactly one of the `|`-separated values for ACCELERATOR and PLATFORM (⚠️ only v0.2.88 is supported right now): +LLAMA_CPP_PYTHON_VERSION=0.2.88 +PYTHON_VERSION=310 +ACCELERATOR=metal|cu121|cu122|cu123|cu124 +PLATFORM=macosx_11_0_arm64|linux_x86_64|win_amd64 + +# Install llama-cpp-python: +pip install "https://github.com/abetlen/llama-cpp-python/releases/download/v$LLAMA_CPP_PYTHON_VERSION-$ACCELERATOR/llama_cpp_python-$LLAMA_CPP_PYTHON_VERSION-cp$PYTHON_VERSION-cp$PYTHON_VERSION-$PLATFORM.whl" ``` -To add CUDA 12.4 support, use the `cuda124` extra: +Finally, install RAGLite with: ```sh -pip install raglite[cuda124] +pip install raglite ``` To add support for filetypes other than PDF, use the `pandoc` extra: diff --git a/poetry.lock b/poetry.lock index dd0bf9f..bf9ec6c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -581,13 +581,13 @@ colorama = {version = "*", markers = "platform_system == \"Windows\""} [[package]] name = "cloudpathlib" -version = "0.18.1" +version = "0.16.0" description = "pathlib-style classes for cloud storage services." 
optional = false python-versions = ">=3.7" files = [ - {file = "cloudpathlib-0.18.1-py3-none-any.whl", hash = "sha256:20efd5d772c75df91bb2ac52e053be53fd9000f5e9755fd92375a2a9fe6005e0"}, - {file = "cloudpathlib-0.18.1.tar.gz", hash = "sha256:ffd22f324bfbf9c3f2bc1bec6e8372cb372a0feef17c7f2b48030cd6810ea859"}, + {file = "cloudpathlib-0.16.0-py3-none-any.whl", hash = "sha256:f46267556bf91f03db52b5df7a152548596a15aabca1c8731ef32b0b25a1a6a3"}, + {file = "cloudpathlib-0.16.0.tar.gz", hash = "sha256:cdfcd35d46d529587d744154a0bdf962aca953b725c8784cd2ec478354ea63a3"}, ] [package.dependencies] @@ -2373,240 +2373,6 @@ dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-mater server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"] -[[package]] -name = "llama_cpp_python" -version = "0.2.88" -description = "Python bindings for the llama.cpp library" -optional = false -python-versions = ">=3.8" -files = [ - {file = "llama_cpp_python-0.2.88-cp310-cp310-linux_x86_64.whl", hash = "sha256:8a5a107c5c6a76827da223da105a523295f4577cc1f7b57c9abaa69b78eb49b1"}, -] - -[package.dependencies] -diskcache = ">=5.6.1" -jinja2 = ">=2.11.3" -numpy = ">=1.20.0" -typing-extensions = ">=4.5.0" - -[package.extras] -all = ["llama_cpp_python[dev,server,test]"] -dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] -server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] -test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"] - -[package.source] -type = "url" -url = 
"https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp310-cp310-linux_x86_64.whl" - -[[package]] -name = "llama_cpp_python" -version = "0.2.88" -description = "Python bindings for the llama.cpp library" -optional = false -python-versions = ">=3.8" -files = [ - {file = "llama_cpp_python-0.2.88-cp310-cp310-win_amd64.whl", hash = "sha256:8990dd768ec1ac5e6339ba03fe84ec24f108a526ab5b087cdd8509838fa215d3"}, -] - -[package.dependencies] -diskcache = ">=5.6.1" -jinja2 = ">=2.11.3" -numpy = ">=1.20.0" -typing-extensions = ">=4.5.0" - -[package.extras] -all = ["llama_cpp_python[dev,server,test]"] -dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] -server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] -test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"] - -[package.source] -type = "url" -url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp310-cp310-win_amd64.whl" - -[[package]] -name = "llama_cpp_python" -version = "0.2.88" -description = "Python bindings for the llama.cpp library" -optional = false -python-versions = ">=3.8" -files = [ - {file = "llama_cpp_python-0.2.88-cp311-cp311-linux_x86_64.whl", hash = "sha256:96213cd64a5632db850812dcaecdc890c3c6605bdb9e50a933486a1ed53d78b1"}, -] - -[package.dependencies] -diskcache = ">=5.6.1" -jinja2 = ">=2.11.3" -numpy = ">=1.20.0" -typing-extensions = ">=4.5.0" - -[package.extras] -all = ["llama_cpp_python[dev,server,test]"] -dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] -server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", 
"sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] -test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"] - -[package.source] -type = "url" -url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp311-cp311-linux_x86_64.whl" - -[[package]] -name = "llama_cpp_python" -version = "0.2.88" -description = "Python bindings for the llama.cpp library" -optional = false -python-versions = ">=3.8" -files = [ - {file = "llama_cpp_python-0.2.88-cp311-cp311-win_amd64.whl", hash = "sha256:b086e427f7a1262a51f778b4aa4a3758cfef0b4961504203affab374282b708b"}, -] - -[package.dependencies] -diskcache = ">=5.6.1" -jinja2 = ">=2.11.3" -numpy = ">=1.20.0" -typing-extensions = ">=4.5.0" - -[package.extras] -all = ["llama_cpp_python[dev,server,test]"] -dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] -server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] -test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"] - -[package.source] -type = "url" -url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp311-cp311-win_amd64.whl" - -[[package]] -name = "llama_cpp_python" -version = "0.2.88" -description = "Python bindings for the llama.cpp library" -optional = false -python-versions = ">=3.8" -files = [ - {file = "llama_cpp_python-0.2.88-cp312-cp312-linux_x86_64.whl", hash = "sha256:7ce449ab43fe0f540aa922a20638f756d2d8248f700ebf2a65a57f33a37b2935"}, -] - -[package.dependencies] -diskcache = ">=5.6.1" -jinja2 = ">=2.11.3" -numpy = ">=1.20.0" -typing-extensions = ">=4.5.0" - -[package.extras] -all = ["llama_cpp_python[dev,server,test]"] -dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", 
"mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] -server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] -test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"] - -[package.source] -type = "url" -url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp312-cp312-linux_x86_64.whl" - -[[package]] -name = "llama_cpp_python" -version = "0.2.88" -description = "Python bindings for the llama.cpp library" -optional = false -python-versions = ">=3.8" -files = [ - {file = "llama_cpp_python-0.2.88-cp312-cp312-win_amd64.whl", hash = "sha256:4c39946777304c9d661251943bea414991527e22493e3c87d8e997a83f6df418"}, -] - -[package.dependencies] -diskcache = ">=5.6.1" -jinja2 = ">=2.11.3" -numpy = ">=1.20.0" -typing-extensions = ">=4.5.0" - -[package.extras] -all = ["llama_cpp_python[dev,server,test]"] -dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] -server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] -test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"] - -[package.source] -type = "url" -url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp312-cp312-win_amd64.whl" - -[[package]] -name = "llama_cpp_python" -version = "0.2.88" -description = "Python bindings for the llama.cpp library" -optional = false -python-versions = ">=3.8" -files = [ - {file = "llama_cpp_python-0.2.88-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:95f5f1b5e519ed0de5a81edefbd11ef62436c6523020b8d5e41be22920be6472"}, -] - -[package.dependencies] -diskcache = ">=5.6.1" -jinja2 = 
">=2.11.3" -numpy = ">=1.20.0" -typing-extensions = ">=4.5.0" - -[package.extras] -all = ["llama_cpp_python[dev,server,test]"] -dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] -server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] -test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"] - -[package.source] -type = "url" -url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-metal/llama_cpp_python-0.2.88-cp310-cp310-macosx_11_0_arm64.whl" - -[[package]] -name = "llama_cpp_python" -version = "0.2.88" -description = "Python bindings for the llama.cpp library" -optional = false -python-versions = ">=3.8" -files = [ - {file = "llama_cpp_python-0.2.88-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:12c52ecae541ad897b0b5a88007228e07b4b430bfa31339dde65f04d6d8fa193"}, -] - -[package.dependencies] -diskcache = ">=5.6.1" -jinja2 = ">=2.11.3" -numpy = ">=1.20.0" -typing-extensions = ">=4.5.0" - -[package.extras] -all = ["llama_cpp_python[dev,server,test]"] -dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] -server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] -test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"] - -[package.source] -type = "url" -url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-metal/llama_cpp_python-0.2.88-cp311-cp311-macosx_11_0_arm64.whl" - -[[package]] -name = "llama_cpp_python" -version = "0.2.88" -description = "Python bindings for the llama.cpp library" -optional = false -python-versions = ">=3.8" -files = [ - {file 
= "llama_cpp_python-0.2.88-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a67020dc230d37cf6ba90b59173113ef8060400d25094fcc89b37b297c08617e"}, -] - -[package.dependencies] -diskcache = ">=5.6.1" -jinja2 = ">=2.11.3" -numpy = ">=1.20.0" -typing-extensions = ">=4.5.0" - -[package.extras] -all = ["llama_cpp_python[dev,server,test]"] -dev = ["black (>=23.3.0)", "httpx (>=0.24.1)", "mkdocs (>=1.4.3)", "mkdocs-material (>=9.1.18)", "mkdocstrings[python] (>=0.22.0)", "pytest (>=7.4.0)", "twine (>=4.0.2)"] -server = ["PyYAML (>=5.1)", "fastapi (>=0.100.0)", "pydantic-settings (>=2.0.1)", "sse-starlette (>=1.6.1)", "starlette-context (>=0.3.6,<0.4)", "uvicorn (>=0.22.0)"] -test = ["httpx (>=0.24.1)", "pytest (>=7.4.0)", "scipy (>=1.10)"] - -[package.source] -type = "url" -url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-metal/llama_cpp_python-0.2.88-cp312-cp312-macosx_11_0_arm64.whl" - [[package]] name = "llvmlite" version = "0.43.0" @@ -5190,17 +4956,6 @@ files = [ {file = "shellcheck_py-0.10.0.1.tar.gz", hash = "sha256:390826b340b8c19173922b0da5ef7b66ef34d4d087dc48aad3e01f7e77e164d9"}, ] -[[package]] -name = "shellingham" -version = "1.5.4" -description = "Tool to Detect Surrounding Shell" -optional = false -python-versions = ">=3.7" -files = [ - {file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"}, - {file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"}, -] - [[package]] name = "six" version = "1.16.0" @@ -5214,28 +4969,24 @@ files = [ [[package]] name = "smart-open" -version = "7.0.4" +version = "6.4.0" description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)" optional = false -python-versions = "<4.0,>=3.7" +python-versions = ">=3.6,<4.0" files = [ - {file = "smart_open-7.0.4-py3-none-any.whl", hash = 
"sha256:4e98489932b3372595cddc075e6033194775165702887216b65eba760dfd8d47"}, - {file = "smart_open-7.0.4.tar.gz", hash = "sha256:62b65852bdd1d1d516839fcb1f6bc50cd0f16e05b4ec44b52f43d38bcb838524"}, + {file = "smart_open-6.4.0-py3-none-any.whl", hash = "sha256:8d3ef7e6997e8e42dd55c74166ed21e6ac70664caa32dd940b26d54a8f6b4142"}, + {file = "smart_open-6.4.0.tar.gz", hash = "sha256:be3c92c246fbe80ebce8fbacb180494a481a77fcdcb7c1aadb2ea5b9c2bee8b9"}, ] -[package.dependencies] -wrapt = "*" - [package.extras] -all = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "paramiko", "requests", "zstandard"] +all = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "paramiko", "requests"] azure = ["azure-common", "azure-core", "azure-storage-blob"] gcs = ["google-cloud-storage (>=2.6.0)"] http = ["requests"] s3 = ["boto3"] ssh = ["paramiko"] -test = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "moto[server]", "paramiko", "pytest", "pytest-rerunfailures", "requests", "responses", "zstandard"] +test = ["azure-common", "azure-core", "azure-storage-blob", "boto3", "google-cloud-storage (>=2.6.0)", "moto[server]", "paramiko", "pytest", "pytest-rerunfailures", "requests", "responses"] webhdfs = ["requests"] -zst = ["zstandard"] [[package]] name = "smmap" @@ -5938,21 +5689,25 @@ test = ["coverage[toml] (>=7)", "mypy (>=1.2.0)", "pytest (>=7)"] [[package]] name = "typer" -version = "0.12.4" +version = "0.9.4" description = "Typer, build great CLIs. Easy to code. Based on Python type hints." 
optional = false -python-versions = ">=3.7" +python-versions = ">=3.6" files = [ - {file = "typer-0.12.4-py3-none-any.whl", hash = "sha256:819aa03699f438397e876aa12b0d63766864ecba1b579092cc9fe35d886e34b6"}, - {file = "typer-0.12.4.tar.gz", hash = "sha256:c9c1613ed6a166162705b3347b8d10b661ccc5d95692654d0fb628118f2c34e6"}, + {file = "typer-0.9.4-py3-none-any.whl", hash = "sha256:aa6c4a4e2329d868b80ecbaf16f807f2b54e192209d7ac9dd42691d63f7a54eb"}, + {file = "typer-0.9.4.tar.gz", hash = "sha256:f714c2d90afae3a7929fcd72a3abb08df305e1ff61719381384211c4070af57f"}, ] [package.dependencies] -click = ">=8.0.0" -rich = ">=10.11.0" -shellingham = ">=1.3.0" +click = ">=7.1.1,<9.0.0" typing-extensions = ">=3.7.4.3" +[package.extras] +all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] +dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"] +doc = ["cairosvg (>=2.5.2,<3.0.0)", "mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)", "pillow (>=9.3.0,<10.0.0)"] +test = ["black (>=22.3.0,<23.0.0)", "coverage (>=6.2,<7.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.971)", "pytest (>=4.4.0,<8.0.0)", "pytest-cov (>=2.10.0,<5.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<4.0.0)", "rich (>=10.11.0,<14.0.0)", "shellingham (>=1.3.0,<2.0.0)"] + [[package]] name = "types-python-dateutil" version = "2.9.0.20240316" @@ -6079,24 +5834,24 @@ files = [ [[package]] name = "weasel" -version = "0.4.1" +version = "0.3.4" description = "Weasel: A small and easy workflow system" optional = false -python-versions = ">=3.7" +python-versions = ">=3.6" files = [ - {file = "weasel-0.4.1-py3-none-any.whl", hash = "sha256:24140a090ea1ac512a2b2f479cc64192fd1d527a7f3627671268d08ed5ac418c"}, - {file = "weasel-0.4.1.tar.gz", hash = "sha256:aabc210f072e13f6744e5c3a28037f93702433405cd35673f7c6279147085aa9"}, + {file = "weasel-0.3.4-py3-none-any.whl", hash = 
"sha256:ee48a944f051d007201c2ea1661d0c41035028c5d5a8bcb29a0b10f1100206ae"}, + {file = "weasel-0.3.4.tar.gz", hash = "sha256:eb16f92dc9f1a3ffa89c165e3a9acd28018ebb656e0da4da02c0d7d8ae3f6178"}, ] [package.dependencies] -cloudpathlib = ">=0.7.0,<1.0.0" +cloudpathlib = ">=0.7.0,<0.17.0" confection = ">=0.0.4,<0.2.0" packaging = ">=20.0" pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<3.0.0" requests = ">=2.13.0,<3.0.0" -smart-open = ">=5.2.1,<8.0.0" +smart-open = ">=5.2.1,<7.0.0" srsly = ">=2.4.3,<3.0.0" -typer = ">=0.3.0,<1.0.0" +typer = ">=0.3.0,<0.10.0" wasabi = ">=0.9.1,<1.2.0" [[package]] @@ -6110,85 +5865,6 @@ files = [ {file = "widgetsnbextension-4.0.11.tar.gz", hash = "sha256:8b22a8f1910bfd188e596fe7fc05dcbd87e810c8a4ba010bdb3da86637398474"}, ] -[[package]] -name = "wrapt" -version = "1.16.0" -description = "Module for decorators, wrappers and monkey patching." -optional = false -python-versions = ">=3.6" -files = [ - {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"}, - {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"}, - {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"}, - {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"}, - {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"}, - {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"}, - {file = 
"wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"}, - {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"}, - {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"}, - {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"}, - {file = "wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"}, - {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"}, - {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"}, - {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"}, - {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"}, - {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"}, - {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"}, - {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"}, - {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"}, - {file = 
"wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"}, - {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"}, - {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"}, - {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"}, - {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"}, - {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"}, - {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"}, - {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"}, - {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"}, - {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"}, - {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"}, - {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"}, - {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"}, - {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"}, - {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"}, - {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"}, - {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"}, - {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"}, - {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"}, - {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"}, - {file = "wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"}, - {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"}, - {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"}, - {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"}, - {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = 
"sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"}, - {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"}, - {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"}, - {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"}, - {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = "sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"}, - {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"}, - {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"}, - {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"}, - {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"}, - {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"}, - {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"}, - {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"}, - {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"}, - {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = 
"sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"}, - {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"}, - {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"}, - {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"}, - {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"}, - {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"}, - {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"}, - {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"}, - {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"}, - {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, -] 
- [[package]] name = "xx-sent-ud-sm" version = "3.7.0" @@ -6461,11 +6137,10 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", type = ["pytest-mypy"] [extras] -cuda124 = [] pandoc = ["pypandoc-binary"] ragas = ["ragas"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<4.0" -content-hash = "b67e5de29567433c962694b0a54ae8c7843e36988e70ced50d71f7a70eacacf1" +content-hash = "64733cd486454d9ab888604e47ceb8ff2fa4f4f38edb5745daec45165e506efb" diff --git a/pyproject.toml b/pyproject.toml index 6ae1b67..541f38d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,25 +29,11 @@ mdformat-gfm = ">=0.3.6" # Sentence and chunk splitting: numpy = ">=1.26.4" scipy = ">=1.5.0" -spacy = ">=3.7.4" -xx_sent_ud_sm = { url = "https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.7.0/xx_sent_ud_sm-3.7.0-py3-none-any.whl" } +spacy = ">=3.7.0,<3.8.0" # Large Language Models: huggingface-hub = ">=0.22.0" litellm = ">=1.47.1" -llama-cpp-python = [ - { version = ">=0.2.88", markers = "(sys_platform != 'darwin' or platform_machine != 'arm64') and (extra != 'cuda121' and extra != 'cuda122' and extra != 'cuda123' and extra != 'cuda124')" }, - # Metal - { url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-metal/llama_cpp_python-0.2.88-cp310-cp310-macosx_11_0_arm64.whl", markers = "sys_platform == 'darwin' and platform_machine == 'arm64' and python_version == '3.10'" }, - { url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-metal/llama_cpp_python-0.2.88-cp311-cp311-macosx_11_0_arm64.whl", markers = "sys_platform == 'darwin' and platform_machine == 'arm64' and python_version == '3.11'" }, - { url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-metal/llama_cpp_python-0.2.88-cp312-cp312-macosx_11_0_arm64.whl", markers = "sys_platform == 'darwin' and platform_machine == 'arm64' and python_version == '3.12'" }, - # CUDA 12.4 - { url = 
"https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp310-cp310-linux_x86_64.whl", markers = "sys_platform == 'linux' and python_version == '3.10' and extra == 'cuda124'" }, - { url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp310-cp310-win_amd64.whl", markers = "sys_platform == 'win32' and python_version == '3.10' and extra == 'cuda124'" }, - { url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp311-cp311-linux_x86_64.whl", markers = "sys_platform == 'linux' and python_version == '3.11' and extra == 'cuda124'" }, - { url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp311-cp311-win_amd64.whl", markers = "sys_platform == 'win32' and python_version == '3.11' and extra == 'cuda124'" }, - { url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp312-cp312-linux_x86_64.whl", markers = "sys_platform == 'linux' and python_version == '3.12' and extra == 'cuda124'" }, - { url = "https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.88-cu124/llama_cpp_python-0.2.88-cp312-cp312-win_amd64.whl", markers = "sys_platform == 'win32' and python_version == '3.12' and extra == 'cuda124'" }, -] +llama-cpp-python = ">=0.2.88" pydantic = ">=2.7.0" # Approximate Nearest Neighbors: pynndescent = ">=0.5.12" @@ -64,7 +50,6 @@ pandas = ">=2.1.0" ragas = { version = ">=0.1.12", optional = true } [tool.poetry.extras] # https://python-poetry.org/docs/pyproject/#extras -cuda124 = [] pandoc = ["pypandoc-binary"] ragas = ["ragas"] @@ -80,6 +65,7 @@ ruff = ">=0.5.7" safety = ">=3.1.0" shellcheck-py = ">=0.10.0.1" typeguard = ">=4.2.1" +xx_sent_ud_sm = { url = "https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.7.0/xx_sent_ud_sm-3.7.0-py3-none-any.whl" } [tool.poetry.group.dev.dependencies] 
# https://python-poetry.org/docs/master/managing-dependencies/ cruft = ">=2.15.0" diff --git a/src/raglite/_split_sentences.py b/src/raglite/_split_sentences.py index a98bd6a..f2b8911 100644 --- a/src/raglite/_split_sentences.py +++ b/src/raglite/_split_sentences.py @@ -46,7 +46,11 @@ def get_markdown_heading_indexes(doc: str) -> list[tuple[int, int]]: def split_sentences(doc: str, max_len: int | None = None) -> list[str]: """Split a document into sentences.""" # Split sentences with spaCy. - nlp = spacy.load("xx_sent_ud_sm") + try: + nlp = spacy.load("xx_sent_ud_sm") + except OSError as error: + error_message = "Please install `xx_sent_ud_sm` with `pip install https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.7.0/xx_sent_ud_sm-3.7.0-py3-none-any.whl`." + raise ImportError(error_message) from error nlp.add_pipe("_mark_additional_sentence_boundaries", before="senter") sentences = [sent.text_with_ws for sent in nlp(doc).sents if sent.text.strip()] # Apply additional splits on paragraphs and sentences because spaCy's splitting is not perfect.