From 10432a033f99c9d50c170dd5f71d524481dc8e7a Mon Sep 17 00:00:00 2001 From: Alexis VIALARET Date: Tue, 12 Dec 2023 14:10:40 +0100 Subject: [PATCH] init --- .pre-commit-config.yaml | 43 --------------- Makefile | 34 ------------ bin/install_with_conda.sh | 18 ------- bin/install_with_venv.sh | 20 ------- config/config.py | 4 -- config/config.toml | 2 - docs/code.md | 1 - docs/index.md | 3 -- mkdocs.yaml | 42 --------------- notebooks/private/.gitignore | 2 - notebooks/template.ipynb | 79 ---------------------------- pyproject.toml | 65 ----------------------- requirements-developer.txt | 10 ---- requirements.txt | 11 ++-- sandbox_alexis/main.py | 18 +++++++ sandbox_alexis/storage_backend.py | 15 ++++++ sandbox_alexis/vector_store.py | 7 +++ secrets/.gitkeep | 0 tests/data/.gitkeep | 0 tests/integration_tests/.gitkeep | 0 tests/unit_tests/.gitkeep | 0 tests/unit_tests/test_placeholder.py | 6 --- 22 files changed, 48 insertions(+), 332 deletions(-) delete mode 100644 .pre-commit-config.yaml delete mode 100644 Makefile delete mode 100644 bin/install_with_conda.sh delete mode 100644 bin/install_with_venv.sh delete mode 100644 config/config.py delete mode 100644 config/config.toml delete mode 100644 docs/code.md delete mode 100644 docs/index.md delete mode 100644 mkdocs.yaml delete mode 100644 notebooks/private/.gitignore delete mode 100644 notebooks/template.ipynb delete mode 100644 pyproject.toml delete mode 100644 requirements-developer.txt create mode 100644 sandbox_alexis/main.py create mode 100644 sandbox_alexis/storage_backend.py create mode 100644 sandbox_alexis/vector_store.py delete mode 100644 secrets/.gitkeep delete mode 100644 tests/data/.gitkeep delete mode 100644 tests/integration_tests/.gitkeep delete mode 100644 tests/unit_tests/.gitkeep delete mode 100644 tests/unit_tests/test_placeholder.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml deleted file mode 100644 index c285e93..0000000 --- a/.pre-commit-config.yaml +++ /dev/null @@ -1,43 +0,0 @@ -repos: - - repo: "https://github.com/pre-commit/pre-commit-hooks" - rev: v4.4.0 - hooks: - - id: trailing-whitespace - - id: end-of-file-fixer - - id: check-toml - - id: check-yaml - - id: check-json - - id: check-added-large-files - - repo: local - hooks: - - id: ruff-format - name: Formatting (ruff) - entry: ruff format - types: [python] - language: system - - id: ruff-fix - name: Linting & sorting (ruff) - entry: ruff --fix --fixable I001 # allow only to fix unsorted imports - types: [python] - language: system - - id: nbstripout - name: Strip Jupyter notebook output (nbstripout) - entry: nbstripout - types: [file] - files: (.ipynb)$ - language: system - - id: python-bandit-vulnerability-check - name: Security check (bandit) - entry: bandit - types: [python] - args: ["--recursive", "lib/"] - language: system - - id: pytest-check - name: Tests (pytest) - stages: [push] - entry: pytest tests/ - types: [python] - language: system - pass_filenames: false - always_run: true -exclude: ^(.svn|CVS|.bzr|.hg|.git|__pycache__|.tox|.ipynb_checkpoints|assets|tests/assets/|venv/|.venv/) diff --git a/Makefile b/Makefile deleted file mode 100644 index 0d8f173..0000000 --- a/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -USE_CONDA ?= 1 -INSTALL_SCRIPT = install_with_conda.sh -ifeq (false,$(USE_CONDA)) - INSTALL_SCRIPT = install_with_venv.sh -endif - -.DEFAULT_GOAL = help - -# help: help - Display this makefile's help information -.PHONY: help -help: - @grep "^# help\:" Makefile | grep -v grep | sed 's/\# help\: //' | sed 's/\# help\://' - -# help: install - Create a virtual environment and install dependencies -.PHONY: install -install: - @bash bin/$(INSTALL_SCRIPT) - -# help: install_precommit - Install pre-commit hooks -.PHONY: install_precommit -install_precommit: - @pre-commit install -t pre-commit - @pre-commit install -t pre-push - -# help: serve_docs_locally - Serve docs locally on port 8001 -.PHONY: serve_docs_locally -serve_docs_locally: - @mkdocs serve --livereload -a localhost:8001 - -# help: deploy_docs - Deploy documentation to GitHub Pages -.PHONY: deploy_docs -deploy_docs: - @mkdocs build - @mkdocs gh-deploy diff --git a/bin/install_with_conda.sh b/bin/install_with_conda.sh deleted file mode 100644 index a8e1f74..0000000 --- a/bin/install_with_conda.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -e - -read -p "Want to install conda env named 'skaff-rag-accelerator'? (y/n)" answer -if [ "$answer" = "y" ]; then - echo "Installing conda env..." - conda create -n skaff-rag-accelerator python=3.10 -y - source $(conda info --base)/etc/profile.d/conda.sh - conda activate skaff-rag-accelerator - echo "Installing requirements..." - pip install -r requirements-developer.txt - python3 -m ipykernel install --user --name=skaff-rag-accelerator - conda install -c conda-forge --name skaff-rag-accelerator notebook -y - echo "Installing pre-commit..." - make install_precommit - echo "Installation complete!"; -else - echo "Installation of conda env aborted!"; -fi diff --git a/bin/install_with_venv.sh b/bin/install_with_venv.sh deleted file mode 100644 index 1086890..0000000 --- a/bin/install_with_venv.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -e - -read -p "Want to install virtual env named 'venv' in this project ? (y/n)" answer -if [ "$answer" = "y" ]; then - echo "Installing virtual env..." - declare VENV_DIR=$(pwd)/venv - if ! [ -d "$VENV_DIR" ]; then - python3 -m venv $VENV_DIR - fi - - source $VENV_DIR/bin/activate - echo "Installing requirements..." - pip install -r requirements-developer.txt - python3 -m ipykernel install --user --name=venv - echo "Installing pre-commit..." - make install_precommit - echo "Installation complete!"; -else - echo "Installation of virtual env aborted!"; -fi diff --git a/config/config.py b/config/config.py deleted file mode 100644 index 111cc12..0000000 --- a/config/config.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Module for dynamic configuration variables of the project.""" -from pathlib import Path - -ROOT_PATH = Path(__file__).parent.parent diff --git a/config/config.toml b/config/config.toml deleted file mode 100644 index 6b45397..0000000 --- a/config/config.toml +++ /dev/null @@ -1,2 +0,0 @@ -# Write config variables here -# See https://toml.io/en/ for more information diff --git a/docs/code.md b/docs/code.md deleted file mode 100644 index aa45473..0000000 --- a/docs/code.md +++ /dev/null @@ -1 +0,0 @@ -# Code diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 8013429..0000000 --- a/docs/index.md +++ /dev/null @@ -1,3 +0,0 @@ -# Welcome to the documentation! - -For more information, make sure to check the [Material for MkDocs documentation](https://squidfunk.github.io/mkdocs-material/getting-started/) diff --git a/mkdocs.yaml b/mkdocs.yaml deleted file mode 100644 index 218e74c..0000000 --- a/mkdocs.yaml +++ /dev/null @@ -1,42 +0,0 @@ -site_name: skaff-rag-accelerator - -theme: - name: "material" - palette: - - media: "(prefers-color-scheme: dark)" - scheme: default - primary: teal - accent: amber - toggle: - icon: material/moon-waning-crescent - name: Switch to dark mode - - media: "(prefers-color-scheme: light)" - scheme: slate - primary: teal - accent: amber - toggle: - icon: material/white-balance-sunny - name: Switch to light mode - features: - - search.suggest - - search.highlight - - content.tabs.link - - content.code.annotation - - content.code.copy - -markdown_extensions: - - pymdownx.highlight: - anchor_linenums: true - line_spans: __span - pygments_lang_class: true - - pymdownx.inlinehilite - - pymdownx.snippets - - pymdownx.superfences - -plugins: - - mkdocstrings - - search - -nav: - - Home: index.md - - Source code: code.md diff --git a/notebooks/private/.gitignore b/notebooks/private/.gitignore deleted file mode 100644 index d6b7ef3..0000000 --- a/notebooks/private/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/notebooks/template.ipynb b/notebooks/template.ipynb deleted file mode 100644 index 298da4c..0000000 --- a/notebooks/template.ipynb +++ /dev/null @@ -1,79 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2\n", - "\n", - "import pandas as pd\n", - "pd.set_option('display.max_columns', 500)\n", - "pd.options.plotting.backend = \"plotly\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv(\"\")" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# EDA" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index a5fed58..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,65 +0,0 @@ -[build-system] -requires = ["setuptools"] -build-backend = "setuptools.build_meta" - -[project] -name = "skaff-rag-accelerator" -authors = [ - { name = "alexisVLRT", email = "alexis.vialaret@artefact.com" } -] -description = "Deploy RAGs quickly" -version = "0.0.1" -readme = "README.md" -requires-python = ">=3.8" - -[project.urls] -"Homepage" = "https://github.com/artefactory/skaff-rag-accelerator" -"Documentation" = "https://artefactory.github.io/skaff-rag-accelerator" - -[tool.setuptools] -packages = ["lib", "config", "tests"] - -[tool.ruff] -select = [ - "E", - "W", - "F", - "I", - "N", - "D", - "ANN", - "Q", - "RET", - "ARG", - "PTH", - "PD", -] # See: https://beta.ruff.rs/docs/rules/ -ignore = ["D203", "D213", "ANN101", "ANN102"] -line-length = 100 -target-version = "py310" -exclude = [ - ".bzr", - ".direnv", - ".eggs", - ".git", - ".ruff_cache", - ".svn", - ".tox", - ".venv", - "__pypackages__", - "_build", - "buck-out", - "build", - "dist", - "node_modules", - "venv", -] - -[tool.ruff.pydocstyle] -convention = "google" - -[tool.ruff.format] -quote-style = "double" - -[tool.ruff.isort] -known-first-party = ["lib", "config", "tests"] diff --git a/requirements-developer.txt b/requirements-developer.txt deleted file mode 100644 index 20cf8a4..0000000 --- a/requirements-developer.txt +++ /dev/null @@ -1,10 +0,0 @@ --r requirements.txt -ruff==0.1.2 -pre-commit==3.3.3 -pytest==7.3.2 -mkdocs==1.4.3 -mkdocs-material==9.1.15 -mkdocstrings-python==1.1.2 -bandit==1.7.5 -nbstripout==0.6.1 -ipykernel==6.24.0 diff --git a/requirements.txt b/requirements.txt index d23458e..746b00d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,8 @@ --e . -pandas==1.5.3 -numpy==1.24.2 +gcsfs +s3fs +adlfs +universal_pathlib +chromadb +langchain +langchainhub +gpt4all \ No newline at end of file diff --git a/sandbox_alexis/main.py b/sandbox_alexis/main.py new file mode 100644 index 0000000..57a6d23 --- /dev/null +++ b/sandbox_alexis/main.py @@ -0,0 +1,18 @@ +from langchain.vectorstores.chroma import Chroma +from storage_backend import get_storage_root_path, StorageBackend +from langchain.embeddings import GPT4AllEmbeddings + +from text_splitter import load_and_split_document + +data = """One of the most important things I didn't understand about the world when I was a child is the degree to which the returns for performance are superlinear. + +Teachers and coaches implicitly told us the returns were linear. "You get out," I heard a thousand times, "what you put in." They meant well, but this is rarely true. If your product is only half as good as your competitor's, you don't get half as many customers. You get no customers, and you go out of business.It's obviously true that the returns for performance are superlinear in business. Some think this is a flaw of capitalism, and that if we changed the rules it would stop being true. But superlinear returns for performance are a feature of the world, not an artifact of rules we've invented. We see the same pattern in fame, power, military victories, knowledge, and even benefit to humanity. In all of these, the rich get richer. [1]You can't understand the world without understanding the concept of superlinear returns. And if you're ambitious you definitely should, because this will be the wave you surf on. + +It may seem as if there are a lot of different situations with superlinear returns, but as far as I can tell they reduce to two fundamental causes: exponential growth and thresholds.The most obvious case of superlinear returns is when you're working on something that grows exponentially. For example, growing bacterial cultures. When they grow at all, they grow exponentially. But they're tricky to grow. Which means the difference in outcome between someone who's adept at it and someone who's not is very great.Startups can also grow exponentially, and we see the same pattern there. Some manage to achieve high growth rates. Most don't. And as a result you get qualitatively different outcomes: the companies with high growth rates tend to become immensely valuable, while the ones with lower growth rates may not even survive.Y Combinator encourages founders to focus on growth rate rather than absolute numbers. It prevents them from being discouraged early on, when the absolute numbers are still low. It also helps them decide what to focus on: you can use growth rate as a compass to tell you how to evolve the company. But the main advantage is that by focusing on growth rate you tend to get something that grows exponentially.YC doesn't explicitly tell founders that with growth rate "you get out what you put in," but it's not far from the truth. And if growth rate were proportional to performance, then the reward for performance p over time t would be proportional to pt. + +Even after decades of thinking about this, I find that sentence startling.""" + +split_documents = load_and_split_document(text=data) +root_path = get_storage_root_path("", StorageBackend.GCS) +vector_store = Chroma(persist_directory=str(root_path / "chromadb"), embedding_function=GPT4AllEmbeddings()) +db = vector_store.add_documents(split_documents) \ No newline at end of file diff --git a/sandbox_alexis/storage_backend.py b/sandbox_alexis/storage_backend.py new file mode 100644 index 0000000..693f63e --- /dev/null +++ b/sandbox_alexis/storage_backend.py @@ -0,0 +1,15 @@ +from upath import UPath as Path +from enum import Enum + + +class StorageBackend(Enum): + LOCAL = "local" + MEMORY = "memory" + GCS = "gcs" + S3 = "s3" + AZURE = "az" + + +def get_storage_root_path(bucket_name, storage_backend: StorageBackend): + root_path = Path(f"{storage_backend.value}://{bucket_name}") + return root_path diff --git a/sandbox_alexis/vector_store.py b/sandbox_alexis/vector_store.py new file mode 100644 index 0000000..d6e0858 --- /dev/null +++ b/sandbox_alexis/vector_store.py @@ -0,0 +1,7 @@ +import chromadb +from storage_backend import StorageBackend, get_storage_root_path + + +root_path = get_storage_root_path("sample_bucket", StorageBackend.GCS) +client = chromadb.PersistentClient(root_path / "chromadb") +collection = client.get_or_create_collection("embeddings") \ No newline at end of file diff --git a/secrets/.gitkeep b/secrets/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tests/data/.gitkeep b/tests/data/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tests/integration_tests/.gitkeep b/tests/integration_tests/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tests/unit_tests/.gitkeep b/tests/unit_tests/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tests/unit_tests/test_placeholder.py b/tests/unit_tests/test_placeholder.py deleted file mode 100644 index 338a8e0..0000000 --- a/tests/unit_tests/test_placeholder.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Placeholder test file for unit tests. To be replaced with actual tests.""" - - -def test_placeholder() -> None: - """To be replaced with actual tests.""" - pass