From 7c163f215646b9273b7a5a94f7b81b6114ab6fea Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 6 Nov 2024 11:23:36 +0100 Subject: [PATCH] ML/LlamaIndex: Add software tests and CI configuration (#707) * ML/LlamaIndex: Adjustments to make it work with non-Azure OpenAI * ML/LlamaIndex: Add software tests and CI configuration --- .github/dependabot.yml | 5 + .github/workflows/ml-llamaindex.yml | 82 ++++++++++++++ topic/machine-learning/llama-index/README.md | 19 +++- .../llama-index/{env.example => env.azure} | 4 +- .../llama-index/env.standalone | 4 + topic/machine-learning/llama-index/init.sql | 23 ++++ topic/machine-learning/llama-index/main.py | 79 +++++++++---- .../llama-index/pyproject.toml | 27 +++++ .../llama-index/requirements-dev.txt | 3 + .../llama-index/requirements.txt | 107 ++---------------- topic/machine-learning/llama-index/test.py | 40 +++++++ 11 files changed, 263 insertions(+), 130 deletions(-) create mode 100644 .github/workflows/ml-llamaindex.yml rename topic/machine-learning/llama-index/{env.example => env.azure} (64%) create mode 100644 topic/machine-learning/llama-index/env.standalone create mode 100644 topic/machine-learning/llama-index/init.sql create mode 100644 topic/machine-learning/llama-index/pyproject.toml create mode 100644 topic/machine-learning/llama-index/requirements-dev.txt create mode 100644 topic/machine-learning/llama-index/test.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4dd545a4..7c263ae7 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -114,6 +114,11 @@ updates: schedule: interval: "daily" + - directory: "/topic/machine-learning/llama-index" + package-ecosystem: "pip" + schedule: + interval: "daily" + - directory: "/topic/machine-learning/mlops-mlflow" package-ecosystem: "pip" schedule: diff --git a/.github/workflows/ml-llamaindex.yml b/.github/workflows/ml-llamaindex.yml new file mode 100644 index 00000000..67fa34e9 --- /dev/null +++ b/.github/workflows/ml-llamaindex.yml @@ -0,0 +1,82 @@ +name: LlamaIndex + +on: + pull_request: + branches: ~ + paths: + - '.github/workflows/ml-llamaindex.yml' + - 'topic/machine-learning/llama-index/**' + - '/requirements.txt' + push: + branches: [ main ] + paths: + - '.github/workflows/ml-llamaindex.yml' + - 'topic/machine-learning/llama-index/**' + - '/requirements.txt' + + # Allow job to be triggered manually. + workflow_dispatch: + + # Run job each night after CrateDB nightly has been published. + schedule: + - cron: '0 3 * * *' + +# Cancel in-progress jobs when pushing to the same branch. 
+concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +jobs: + test: + name: " + Python: ${{ matrix.python-version }} + CrateDB: ${{ matrix.cratedb-version }} + on ${{ matrix.os }}" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ + 'ubuntu-latest', + ] + python-version: [ + '3.8', + '3.13', + ] + cratedb-version: [ 'nightly' ] + + services: + cratedb: + image: crate/crate:${{ matrix.cratedb-version }} + ports: + - 4200:4200 + - 5432:5432 + env: + CRATE_HEAP_SIZE: 4g + + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + + steps: + + - name: Acquire sources + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + cache: 'pip' + cache-dependency-path: | + requirements.txt + topic/machine-learning/llama-index/requirements.txt + topic/machine-learning/llama-index/requirements-dev.txt + + - name: Install utilities + run: | + pip install -r requirements.txt + + - name: Validate topic/machine-learning/llama-index + run: | + ngr test --accept-no-venv topic/machine-learning/llama-index diff --git a/topic/machine-learning/llama-index/README.md b/topic/machine-learning/llama-index/README.md index 7e339089..2c287e8c 100644 --- a/topic/machine-learning/llama-index/README.md +++ b/topic/machine-learning/llama-index/README.md @@ -4,9 +4,9 @@ This folder contains the codebase for [this tutorial](https://community.cratedb. This has been tested using: -* Python 3.12.2 -* macOS Sequoia 15.0.1 -* CrateDB 5.8.3 running in CrateDB Cloud on AWS Europe (Ireland) +* Python 3.12 +* macOS +* CrateDB 5.8 and higher ## Database Setup @@ -14,6 +14,15 @@ You will need a CrateDB Cloud database: sign up [here](https://console.cratedb.c Make a note of the hostname, username and password for your database. You'll need those when configuring the environment file later. +If you don't use CrateDB Cloud, you can also provide an instance for testing +purposes like this: + +```shell +docker run --rm -it --name=cratedb \ + --publish=4200:4200 --publish=5432:5432 \ + --env=CRATE_HEAP_SIZE=2g crate:latest -Cdiscovery.type=single-node +``` + Create a table in CrateDB: ```sql @@ -61,7 +70,7 @@ pip install -r requirements.txt ## Configure your Environment -To configure your environment, copy the provided [`env.example`](./env.example) file to a new file named `.env`, then open it with a text editor. +To configure your environment, copy the provided [`env.azure`](./env.azure) or [`env.standalone`](./env.standalone) file to a new file named `.env`, then open it with a text editor. 
Set the values in the file as follows: @@ -72,7 +81,7 @@ OPENAI_AZURE_ENDPOINT=https:// EMBEDDING_MODEL_INSTANCE= -CRATEDB_URL="crate://:@:4200/?ssl=true" +CRATEDB_SQLALCHEMY_URL="crate://:@:4200/?ssl=true" CRATEDB_TABLE_NAME=time_series_data ``` diff --git a/topic/machine-learning/llama-index/env.example b/topic/machine-learning/llama-index/env.azure similarity index 64% rename from topic/machine-learning/llama-index/env.example rename to topic/machine-learning/llama-index/env.azure index df23fa73..f97346ef 100644 --- a/topic/machine-learning/llama-index/env.example +++ b/topic/machine-learning/llama-index/env.azure @@ -4,5 +4,5 @@ OPENAI_AZURE_ENDPOINT=https://TODO.openai.azure.com OPENAI_AZURE_API_VERSION=2024-08-01-preview LLM_INSTANCE=TODO EMBEDDING_MODEL_INSTANCE=TODO -CRATEDB_URL="crate://USER:PASSWORD@HOST:4200/?ssl=true" -CRATEDB_TABLE_NAME=time_series_data \ No newline at end of file +CRATEDB_SQLALCHEMY_URL="crate://USER:PASSWORD@HOST:4200/?ssl=true" +CRATEDB_TABLE_NAME=time_series_data diff --git a/topic/machine-learning/llama-index/env.standalone b/topic/machine-learning/llama-index/env.standalone new file mode 100644 index 00000000..9ad450ef --- /dev/null +++ b/topic/machine-learning/llama-index/env.standalone @@ -0,0 +1,4 @@ +# OPENAI_API_KEY=sk-XJZ7pfog5Gp8Kus8D--invalid--0CJ5lyAKSefZLaV1Y9S1 +OPENAI_API_TYPE=openai +CRATEDB_SQLALCHEMY_URL="crate://crate@localhost:4200/" +CRATEDB_TABLE_NAME=time_series_data diff --git a/topic/machine-learning/llama-index/init.sql b/topic/machine-learning/llama-index/init.sql new file mode 100644 index 00000000..e59ad493 --- /dev/null +++ b/topic/machine-learning/llama-index/init.sql @@ -0,0 +1,23 @@ +CREATE TABLE IF NOT EXISTS time_series_data ( + timestamp TIMESTAMP, + value DOUBLE, + location STRING, + sensor_id INT +); + +INSERT INTO time_series_data (timestamp, value, location, sensor_id) +VALUES + ('2023-09-14T00:00:00', 10.5, 'Sensor A', 1), + ('2023-09-14T01:00:00', 15.2, 'Sensor A', 1), + ('2023-09-14T02:00:00', 18.9, 'Sensor A', 1), + ('2023-09-14T03:00:00', 12.7, 'Sensor B', 2), + ('2023-09-14T04:00:00', 17.3, 'Sensor B', 2), + ('2023-09-14T05:00:00', 20.1, 'Sensor B', 2), + ('2023-09-14T06:00:00', 22.5, 'Sensor A', 1), + ('2023-09-14T07:00:00', 18.3, 'Sensor A', 1), + ('2023-09-14T08:00:00', 16.8, 'Sensor A', 1), + ('2023-09-14T09:00:00', 14.6, 'Sensor B', 2), + ('2023-09-14T10:00:00', 13.2, 'Sensor B', 2), + ('2023-09-14T11:00:00', 11.7, 'Sensor B', 2); + +REFRESH TABLE time_series_data; diff --git a/topic/machine-learning/llama-index/main.py b/topic/machine-learning/llama-index/main.py index 0e795545..077f67d5 100644 --- a/topic/machine-learning/llama-index/main.py +++ b/topic/machine-learning/llama-index/main.py @@ -1,4 +1,7 @@ -""" Example code using Azure Open AI and llama-index. """ +""" +Use an LLM to query a database in human language. +Example code using LlamaIndex with vanilla Open AI and Azure Open AI. +""" import os import openai @@ -6,54 +9,84 @@ from dotenv import load_dotenv from langchain_openai import AzureOpenAIEmbeddings +from langchain_openai import OpenAIEmbeddings from llama_index.llms.azure_openai import AzureOpenAI +from llama_index.llms.openai import OpenAI from llama_index.embeddings.langchain import LangchainEmbedding from llama_index.core.utilities.sql_wrapper import SQLDatabase from llama_index.core.query_engine import NLSQLTableQueryEngine from llama_index.core import Settings -if __name__ == "__main__": - load_dotenv() + +def configure_llm(): + """ + Configure LLM. 
Use either vanilla Open AI, or Azure Open AI. + """ openai.api_type = os.getenv("OPENAI_API_TYPE") openai.azure_endpoint = os.getenv("OPENAI_AZURE_ENDPOINT") openai.api_version = os.getenv("OPENAI_AZURE_API_VERSION") openai.api_key = os.getenv("OPENAI_API_KEY") - llm = AzureOpenAI( - engine=os.getenv("LLM_INSTANCE"), - azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"), - api_key = os.getenv("OPENAI_API_KEY"), - api_version = os.getenv("OPENAI_AZURE_API_VERSION"), - temperature=0.0 - ) + if openai.api_type == "openai": + llm = OpenAI( + api_key=os.getenv("OPENAI_API_KEY"), + temperature=0.0 + ) + elif openai.api_type == "azure": + llm = AzureOpenAI( + engine=os.getenv("LLM_INSTANCE"), + azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"), + api_key = os.getenv("OPENAI_API_KEY"), + api_version = os.getenv("OPENAI_AZURE_API_VERSION"), + temperature=0.0 + ) + else: + raise ValueError(f"Open AI API type not defined or invalid: {openai.api_type}") Settings.llm = llm - Settings.embed_model = LangchainEmbedding( - AzureOpenAIEmbeddings( - azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"), - model=os.getenv("EMBEDDING_MODEL_INSTANCE") + if openai.api_type == "openai": + Settings.embed_model = LangchainEmbedding(OpenAIEmbeddings()) + elif openai.api_type == "azure": + Settings.embed_model = LangchainEmbedding( + AzureOpenAIEmbeddings( + azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"), + model=os.getenv("EMBEDDING_MODEL_INSTANCE") + ) ) - ) - print("Creating SQLAlchemy engine...") - engine_crate = sa.create_engine(os.getenv("CRATEDB_URL")) - print("Connecting to CrateDB...") + +def main(): + """ + Use an LLM to query a database in human language. + """ + + # Configure application. + load_dotenv() + configure_llm() + + # Configure database connection and query engine. + print("Connecting to CrateDB") + engine_crate = sa.create_engine(os.getenv("CRATEDB_SQLALCHEMY_URL")) engine_crate.connect() - print("Creating SQLDatabase instance...") + + print("Creating LlamaIndex QueryEngine") sql_database = SQLDatabase(engine_crate, include_tables=[os.getenv("CRATEDB_TABLE_NAME")]) - print("Creating QueryEngine...") query_engine = NLSQLTableQueryEngine( sql_database=sql_database, tables=[os.getenv("CRATEDB_TABLE_NAME")], - llm = llm + llm=Settings.llm ) - print("Running query...") - + # Invoke an inquiry. + print("Running query") QUERY_STR = "What is the average value for sensor 1?" answer = query_engine.query(QUERY_STR) print(answer.get_formatted_sources()) print("Query was:", QUERY_STR) print("Answer was:", answer) print(answer.metadata) + + +if __name__ == "__main__": + main() diff --git a/topic/machine-learning/llama-index/pyproject.toml b/topic/machine-learning/llama-index/pyproject.toml new file mode 100644 index 00000000..d06dcec5 --- /dev/null +++ b/topic/machine-learning/llama-index/pyproject.toml @@ -0,0 +1,27 @@ +[tool.pytest.ini_options] +minversion = "2.0" +addopts = """ + -rfEX -p pytester --strict-markers --verbosity=3 --capture=no + --cov=. 
--cov-report=term-missing --cov-report=xml + """ + +#log_level = "DEBUG" +#log_cli_level = "DEBUG" + +testpaths = [ + "*.py", +] +xfail_strict = true +markers = [ +] + +[tool.coverage.run] +branch = false + +[tool.coverage.report] +fail_under = 0 +show_missing = true +omit = [ + "conftest.py", + "test*.py", +] diff --git a/topic/machine-learning/llama-index/requirements-dev.txt b/topic/machine-learning/llama-index/requirements-dev.txt new file mode 100644 index 00000000..930b8ce0 --- /dev/null +++ b/topic/machine-learning/llama-index/requirements-dev.txt @@ -0,0 +1,3 @@ +cratedb-toolkit +pueblo[testing] +sqlparse diff --git a/topic/machine-learning/llama-index/requirements.txt b/topic/machine-learning/llama-index/requirements.txt index ad580877..ba44e31a 100644 --- a/topic/machine-learning/llama-index/requirements.txt +++ b/topic/machine-learning/llama-index/requirements.txt @@ -1,100 +1,7 @@ -aiohappyeyeballs==2.4.3 -aiohttp==3.10.10 -aiosignal==1.3.1 -annotated-types==0.7.0 -anyio==4.6.2.post1 -attrs==24.2.0 -azure-core==1.31.0 -azure-identity==1.19.0 -beautifulsoup4==4.12.3 -certifi==2024.8.30 -cffi==1.17.1 -charset-normalizer==3.4.0 -click==8.1.7 -crate>=1.0.0.dev2 -cryptography==43.0.3 -dataclasses-json==0.6.7 -Deprecated==1.2.14 -dirtyjson==1.0.8 -distro==1.9.0 -frozenlist==1.4.1 -fsspec==2024.10.0 -geojson==3.1.0 -greenlet==3.1.1 -h11==0.14.0 -httpcore==1.0.6 -httpx==0.27.2 -idna==3.10 -jiter==0.6.1 -joblib==1.4.2 -jsonpatch==1.33 -jsonpointer==3.0.0 -langchain==0.3.4 -langchain-community==0.3.3 -langchain-core==0.3.12 -langchain-openai==0.2.3 -langchain-text-splitters==0.3.0 -langsmith==0.1.136 -llama-cloud==0.1.4 -llama-index==0.11.19 -llama-index-agent-openai==0.3.4 -llama-index-cli==0.3.1 -llama-index-core==0.11.19 -llama-index-embeddings-langchain==0.2.1 -llama-index-embeddings-openai==0.2.5 -llama-index-indices-managed-llama-cloud==0.4.0 -llama-index-legacy==0.9.48.post3 -llama-index-llms-azure-openai==0.2.2 -llama-index-llms-langchain==0.4.2 -llama-index-llms-openai==0.2.15 -llama-index-multi-modal-llms-openai==0.2.2 -llama-index-program-openai==0.2.0 -llama-index-question-gen-openai==0.2.0 -llama-index-readers-file==0.2.2 -llama-index-readers-llama-parse==0.3.0 -llama-parse==0.5.10 -marshmallow==3.23.0 -msal==1.31.0 -msal-extensions==1.2.0 -multidict==6.1.0 -mypy-extensions==1.0.0 -nest-asyncio==1.6.0 -networkx==3.4.2 -nltk==3.9.1 -numpy==1.26.4 -openai==1.52.0 -orjson==3.10.9 -packaging==24.1 -pandas==2.2.3 -pillow==11.0.0 -portalocker==2.10.1 -propcache==0.2.0 -pycparser==2.22 -pydantic==2.9.2 -pydantic-settings==2.6.0 -pydantic_core==2.23.4 -PyJWT==2.9.0 -pypdf==4.3.1 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -pytz==2024.2 -PyYAML==6.0.2 -regex==2024.9.11 -requests==2.32.3 -requests-toolbelt==1.0.0 -six==1.16.0 -sniffio==1.3.1 -soupsieve==2.6 -SQLAlchemy==2.0.36 -sqlalchemy-cratedb>=0.40.0 -striprtf==0.0.26 -tenacity==8.5.0 -tiktoken==0.8.0 -tqdm==4.66.5 -typing-inspect==0.9.0 -typing_extensions==4.12.2 -tzdata==2024.2 -urllib3==2.2.3 -verlib2==0.2.0 -wrapt==1.16.0 -yarl==1.16.0 +langchain-openai<0.3 +llama-index-embeddings-langchain<0.3 +llama-index-embeddings-openai<0.3 +llama-index-llms-azure-openai<0.3 +llama-index-llms-openai<0.3 +python-dotenv +sqlalchemy-cratedb diff --git a/topic/machine-learning/llama-index/test.py b/topic/machine-learning/llama-index/test.py new file mode 100644 index 00000000..3c81566a --- /dev/null +++ b/topic/machine-learning/llama-index/test.py @@ -0,0 +1,40 @@ +from pathlib import Path + +import pytest + +from 
cratedb_toolkit.io.sql import DatabaseAdapter +from dotenv import load_dotenv + +HERE = Path(__file__).parent + + +@pytest.fixture() +def cratedb() -> DatabaseAdapter: + return DatabaseAdapter(dburi="crate://crate@localhost:4200") + + +@pytest.fixture(scope="function", autouse=True) +def init_database(cratedb): + """ + Initialize database. + """ + cratedb.run_sql("DROP TABLE IF EXISTS time_series_data;") + cratedb.run_sql((HERE / "init.sql").read_text()) + + +def test_main(cratedb, capsys): + """ + Execute `main.py` and verify outcome. + """ + + # Load the standalone configuration also for software testing. + # On CI, `OPENAI_API_KEY` will need to be supplied externally. + load_dotenv("env.standalone") + + # Invoke the workload, in-process. + from main import main + main() + + # Verify the outcome. + out = capsys.readouterr().out + assert "Answer was: The average value for sensor 1 is approximately 17.03." in out
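
For local validation outside CI, the new test suite can be exercised roughly as sketched below. This is a minimal sketch only: it assumes Docker is available and that a valid `OPENAI_API_KEY` is supplied, because the key in `env.standalone` is just a commented-out placeholder. On CI, the same suite is driven through `ngr test --accept-no-venv topic/machine-learning/llama-index`, as configured in the workflow file above; locally, a plain `pytest` run should pick up the configuration from `pyproject.toml`, with the test dependencies provided by `pueblo[testing]` from `requirements-dev.txt`.

```shell
# Start a single-node CrateDB instance for testing, using the same command
# as in the README. Run this in a separate terminal, it stays in the foreground.
docker run --rm -it --name=cratedb \
  --publish=4200:4200 --publish=5432:5432 \
  --env=CRATE_HEAP_SIZE=2g crate:latest -Cdiscovery.type=single-node

# Provide a valid OpenAI API key; the value in `env.standalone` is a placeholder.
export OPENAI_API_KEY=<your-openai-api-key>

# Install runtime and test dependencies, then run the test suite.
pip install -r topic/machine-learning/llama-index/requirements.txt \
            -r topic/machine-learning/llama-index/requirements-dev.txt
cd topic/machine-learning/llama-index
pytest
```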