From 7c163f215646b9273b7a5a94f7b81b6114ab6fea Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 6 Nov 2024 11:23:36 +0100 Subject: [PATCH] ML/LlamaIndex: Add software tests and CI configuration (#707) * ML/LlamaIndex: Adjustments to make it work with non-Azure OpenAI * ML/LlamaIndex: Add software tests and CI configuration --- .github/dependabot.yml | 5 + .github/workflows/ml-llamaindex.yml | 82 ++++++++++++++ topic/machine-learning/llama-index/README.md | 19 +++- .../llama-index/{env.example => env.azure} | 4 +- .../llama-index/env.standalone | 4 + topic/machine-learning/llama-index/init.sql | 23 ++++ topic/machine-learning/llama-index/main.py | 79 +++++++++---- .../llama-index/pyproject.toml | 27 +++++ .../llama-index/requirements-dev.txt | 3 + .../llama-index/requirements.txt | 107 ++---------------- topic/machine-learning/llama-index/test.py | 40 +++++++ 11 files changed, 263 insertions(+), 130 deletions(-) create mode 100644 .github/workflows/ml-llamaindex.yml rename topic/machine-learning/llama-index/{env.example => env.azure} (64%) create mode 100644 topic/machine-learning/llama-index/env.standalone create mode 100644 topic/machine-learning/llama-index/init.sql create mode 100644 topic/machine-learning/llama-index/pyproject.toml create mode 100644 topic/machine-learning/llama-index/requirements-dev.txt create mode 100644 topic/machine-learning/llama-index/test.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 4dd545a4..7c263ae7 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -114,6 +114,11 @@ updates: schedule: interval: "daily" + - directory: "/topic/machine-learning/llama-index" + package-ecosystem: "pip" + schedule: + interval: "daily" + - directory: "/topic/machine-learning/mlops-mlflow" package-ecosystem: "pip" schedule: diff --git a/.github/workflows/ml-llamaindex.yml b/.github/workflows/ml-llamaindex.yml new file mode 100644 index 00000000..67fa34e9 --- /dev/null +++ b/.github/workflows/ml-llamaindex.yml @@ -0,0 +1,82 @@ +name: LlamaIndex + +on: + pull_request: + branches: ~ + paths: + - '.github/workflows/ml-llamaindex.yml' + - 'topic/machine-learning/llama-index/**' + - '/requirements.txt' + push: + branches: [ main ] + paths: + - '.github/workflows/ml-llamaindex.yml' + - 'topic/machine-learning/llama-index/**' + - '/requirements.txt' + + # Allow job to be triggered manually. + workflow_dispatch: + + # Run job each night after CrateDB nightly has been published. + schedule: + - cron: '0 3 * * *' + +# Cancel in-progress jobs when pushing to the same branch. 
+concurrency: + cancel-in-progress: true + group: ${{ github.workflow }}-${{ github.ref }} + +jobs: + test: + name: " + Python: ${{ matrix.python-version }} + CrateDB: ${{ matrix.cratedb-version }} + on ${{ matrix.os }}" + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ + 'ubuntu-latest', + ] + python-version: [ + '3.8', + '3.13', + ] + cratedb-version: [ 'nightly' ] + + services: + cratedb: + image: crate/crate:${{ matrix.cratedb-version }} + ports: + - 4200:4200 + - 5432:5432 + env: + CRATE_HEAP_SIZE: 4g + + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + + steps: + + - name: Acquire sources + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + cache: 'pip' + cache-dependency-path: | + requirements.txt + topic/machine-learning/llama-index/requirements.txt + topic/machine-learning/llama-index/requirements-dev.txt + + - name: Install utilities + run: | + pip install -r requirements.txt + + - name: Validate topic/machine-learning/llama-index + run: | + ngr test --accept-no-venv topic/machine-learning/llama-index diff --git a/topic/machine-learning/llama-index/README.md b/topic/machine-learning/llama-index/README.md index 7e339089..2c287e8c 100644 --- a/topic/machine-learning/llama-index/README.md +++ b/topic/machine-learning/llama-index/README.md @@ -4,9 +4,9 @@ This folder contains the codebase for [this tutorial](https://community.cratedb. This has been tested using: -* Python 3.12.2 -* macOS Sequoia 15.0.1 -* CrateDB 5.8.3 running in CrateDB Cloud on AWS Europe (Ireland) +* Python 3.12 +* macOS +* CrateDB 5.8 and higher ## Database Setup @@ -14,6 +14,15 @@ You will need a CrateDB Cloud database: sign up [here](https://console.cratedb.c Make a note of the hostname, username and password for your database. You'll need those when configuring the environment file later. +If you don't use CrateDB Cloud, you can also provide an instance for testing +purposes like this: + +```shell +docker run --rm -it --name=cratedb \ + --publish=4200:4200 --publish=5432:5432 \ + --env=CRATE_HEAP_SIZE=2g crate:latest -Cdiscovery.type=single-node +``` + Create a table in CrateDB: ```sql @@ -61,7 +70,7 @@ pip install -r requirements.txt ## Configure your Environment -To configure your environment, copy the provided [`env.example`](./env.example) file to a new file named `.env`, then open it with a text editor. +To configure your environment, copy the provided [`env.azure`](./env.azure) or [`env.standalone`](./env.standalone) file to a new file named `.env`, then open it with a text editor. 
Set the values in the file as follows: @@ -72,7 +81,7 @@ OPENAI_AZURE_ENDPOINT=https:// EMBEDDING_MODEL_INSTANCE= -CRATEDB_URL="crate://:@:4200/?ssl=true" +CRATEDB_SQLALCHEMY_URL="crate://:@:4200/?ssl=true" CRATEDB_TABLE_NAME=time_series_data ``` diff --git a/topic/machine-learning/llama-index/env.example b/topic/machine-learning/llama-index/env.azure similarity index 64% rename from topic/machine-learning/llama-index/env.example rename to topic/machine-learning/llama-index/env.azure index df23fa73..f97346ef 100644 --- a/topic/machine-learning/llama-index/env.example +++ b/topic/machine-learning/llama-index/env.azure @@ -4,5 +4,5 @@ OPENAI_AZURE_ENDPOINT=https://TODO.openai.azure.com OPENAI_AZURE_API_VERSION=2024-08-01-preview LLM_INSTANCE=TODO EMBEDDING_MODEL_INSTANCE=TODO -CRATEDB_URL="crate://USER:PASSWORD@HOST:4200/?ssl=true" -CRATEDB_TABLE_NAME=time_series_data \ No newline at end of file +CRATEDB_SQLALCHEMY_URL="crate://USER:PASSWORD@HOST:4200/?ssl=true" +CRATEDB_TABLE_NAME=time_series_data diff --git a/topic/machine-learning/llama-index/env.standalone b/topic/machine-learning/llama-index/env.standalone new file mode 100644 index 00000000..9ad450ef --- /dev/null +++ b/topic/machine-learning/llama-index/env.standalone @@ -0,0 +1,4 @@ +# OPENAI_API_KEY=sk-XJZ7pfog5Gp8Kus8D--invalid--0CJ5lyAKSefZLaV1Y9S1 +OPENAI_API_TYPE=openai +CRATEDB_SQLALCHEMY_URL="crate://crate@localhost:4200/" +CRATEDB_TABLE_NAME=time_series_data diff --git a/topic/machine-learning/llama-index/init.sql b/topic/machine-learning/llama-index/init.sql new file mode 100644 index 00000000..e59ad493 --- /dev/null +++ b/topic/machine-learning/llama-index/init.sql @@ -0,0 +1,23 @@ +CREATE TABLE IF NOT EXISTS time_series_data ( + timestamp TIMESTAMP, + value DOUBLE, + location STRING, + sensor_id INT +); + +INSERT INTO time_series_data (timestamp, value, location, sensor_id) +VALUES + ('2023-09-14T00:00:00', 10.5, 'Sensor A', 1), + ('2023-09-14T01:00:00', 15.2, 'Sensor A', 1), + ('2023-09-14T02:00:00', 18.9, 'Sensor A', 1), + ('2023-09-14T03:00:00', 12.7, 'Sensor B', 2), + ('2023-09-14T04:00:00', 17.3, 'Sensor B', 2), + ('2023-09-14T05:00:00', 20.1, 'Sensor B', 2), + ('2023-09-14T06:00:00', 22.5, 'Sensor A', 1), + ('2023-09-14T07:00:00', 18.3, 'Sensor A', 1), + ('2023-09-14T08:00:00', 16.8, 'Sensor A', 1), + ('2023-09-14T09:00:00', 14.6, 'Sensor B', 2), + ('2023-09-14T10:00:00', 13.2, 'Sensor B', 2), + ('2023-09-14T11:00:00', 11.7, 'Sensor B', 2); + +REFRESH TABLE time_series_data; diff --git a/topic/machine-learning/llama-index/main.py b/topic/machine-learning/llama-index/main.py index 0e795545..077f67d5 100644 --- a/topic/machine-learning/llama-index/main.py +++ b/topic/machine-learning/llama-index/main.py @@ -1,4 +1,7 @@ -""" Example code using Azure Open AI and llama-index. """ +""" +Use an LLM to query a database in human language. +Example code using LlamaIndex with vanilla Open AI and Azure Open AI. +""" import os import openai @@ -6,54 +9,84 @@ from dotenv import load_dotenv from langchain_openai import AzureOpenAIEmbeddings +from langchain_openai import OpenAIEmbeddings from llama_index.llms.azure_openai import AzureOpenAI +from llama_index.llms.openai import OpenAI from llama_index.embeddings.langchain import LangchainEmbedding from llama_index.core.utilities.sql_wrapper import SQLDatabase from llama_index.core.query_engine import NLSQLTableQueryEngine from llama_index.core import Settings -if __name__ == "__main__": - load_dotenv() + +def configure_llm(): + """ + Configure LLM. 
Use either vanilla Open AI, or Azure Open AI. + """ openai.api_type = os.getenv("OPENAI_API_TYPE") openai.azure_endpoint = os.getenv("OPENAI_AZURE_ENDPOINT") openai.api_version = os.getenv("OPENAI_AZURE_API_VERSION") openai.api_key = os.getenv("OPENAI_API_KEY") - llm = AzureOpenAI( - engine=os.getenv("LLM_INSTANCE"), - azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"), - api_key = os.getenv("OPENAI_API_KEY"), - api_version = os.getenv("OPENAI_AZURE_API_VERSION"), - temperature=0.0 - ) + if openai.api_type == "openai": + llm = OpenAI( + api_key=os.getenv("OPENAI_API_KEY"), + temperature=0.0 + ) + elif openai.api_type == "azure": + llm = AzureOpenAI( + engine=os.getenv("LLM_INSTANCE"), + azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"), + api_key = os.getenv("OPENAI_API_KEY"), + api_version = os.getenv("OPENAI_AZURE_API_VERSION"), + temperature=0.0 + ) + else: + raise ValueError(f"Open AI API type not defined or invalid: {openai.api_type}") Settings.llm = llm - Settings.embed_model = LangchainEmbedding( - AzureOpenAIEmbeddings( - azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"), - model=os.getenv("EMBEDDING_MODEL_INSTANCE") + if openai.api_type == "openai": + Settings.embed_model = LangchainEmbedding(OpenAIEmbeddings()) + elif openai.api_type == "azure": + Settings.embed_model = LangchainEmbedding( + AzureOpenAIEmbeddings( + azure_endpoint=os.getenv("OPENAI_AZURE_ENDPOINT"), + model=os.getenv("EMBEDDING_MODEL_INSTANCE") + ) ) - ) - print("Creating SQLAlchemy engine...") - engine_crate = sa.create_engine(os.getenv("CRATEDB_URL")) - print("Connecting to CrateDB...") + +def main(): + """ + Use an LLM to query a database in human language. + """ + + # Configure application. + load_dotenv() + configure_llm() + + # Configure database connection and query engine. + print("Connecting to CrateDB") + engine_crate = sa.create_engine(os.getenv("CRATEDB_SQLALCHEMY_URL")) engine_crate.connect() - print("Creating SQLDatabase instance...") + + print("Creating LlamaIndex QueryEngine") sql_database = SQLDatabase(engine_crate, include_tables=[os.getenv("CRATEDB_TABLE_NAME")]) - print("Creating QueryEngine...") query_engine = NLSQLTableQueryEngine( sql_database=sql_database, tables=[os.getenv("CRATEDB_TABLE_NAME")], - llm = llm + llm=Settings.llm ) - print("Running query...") - + # Invoke an inquiry. + print("Running query") QUERY_STR = "What is the average value for sensor 1?" answer = query_engine.query(QUERY_STR) print(answer.get_formatted_sources()) print("Query was:", QUERY_STR) print("Answer was:", answer) print(answer.metadata) + + +if __name__ == "__main__": + main() diff --git a/topic/machine-learning/llama-index/pyproject.toml b/topic/machine-learning/llama-index/pyproject.toml new file mode 100644 index 00000000..d06dcec5 --- /dev/null +++ b/topic/machine-learning/llama-index/pyproject.toml @@ -0,0 +1,27 @@ +[tool.pytest.ini_options] +minversion = "2.0" +addopts = """ + -rfEX -p pytester --strict-markers --verbosity=3 --capture=no + --cov=. 
--cov-report=term-missing --cov-report=xml + """ + +#log_level = "DEBUG" +#log_cli_level = "DEBUG" + +testpaths = [ + "*.py", +] +xfail_strict = true +markers = [ +] + +[tool.coverage.run] +branch = false + +[tool.coverage.report] +fail_under = 0 +show_missing = true +omit = [ + "conftest.py", + "test*.py", +] diff --git a/topic/machine-learning/llama-index/requirements-dev.txt b/topic/machine-learning/llama-index/requirements-dev.txt new file mode 100644 index 00000000..930b8ce0 --- /dev/null +++ b/topic/machine-learning/llama-index/requirements-dev.txt @@ -0,0 +1,3 @@ +cratedb-toolkit +pueblo[testing] +sqlparse diff --git a/topic/machine-learning/llama-index/requirements.txt b/topic/machine-learning/llama-index/requirements.txt index ad580877..ba44e31a 100644 --- a/topic/machine-learning/llama-index/requirements.txt +++ b/topic/machine-learning/llama-index/requirements.txt @@ -1,100 +1,7 @@ -aiohappyeyeballs==2.4.3 -aiohttp==3.10.10 -aiosignal==1.3.1 -annotated-types==0.7.0 -anyio==4.6.2.post1 -attrs==24.2.0 -azure-core==1.31.0 -azure-identity==1.19.0 -beautifulsoup4==4.12.3 -certifi==2024.8.30 -cffi==1.17.1 -charset-normalizer==3.4.0 -click==8.1.7 -crate>=1.0.0.dev2 -cryptography==43.0.3 -dataclasses-json==0.6.7 -Deprecated==1.2.14 -dirtyjson==1.0.8 -distro==1.9.0 -frozenlist==1.4.1 -fsspec==2024.10.0 -geojson==3.1.0 -greenlet==3.1.1 -h11==0.14.0 -httpcore==1.0.6 -httpx==0.27.2 -idna==3.10 -jiter==0.6.1 -joblib==1.4.2 -jsonpatch==1.33 -jsonpointer==3.0.0 -langchain==0.3.4 -langchain-community==0.3.3 -langchain-core==0.3.12 -langchain-openai==0.2.3 -langchain-text-splitters==0.3.0 -langsmith==0.1.136 -llama-cloud==0.1.4 -llama-index==0.11.19 -llama-index-agent-openai==0.3.4 -llama-index-cli==0.3.1 -llama-index-core==0.11.19 -llama-index-embeddings-langchain==0.2.1 -llama-index-embeddings-openai==0.2.5 -llama-index-indices-managed-llama-cloud==0.4.0 -llama-index-legacy==0.9.48.post3 -llama-index-llms-azure-openai==0.2.2 -llama-index-llms-langchain==0.4.2 -llama-index-llms-openai==0.2.15 -llama-index-multi-modal-llms-openai==0.2.2 -llama-index-program-openai==0.2.0 -llama-index-question-gen-openai==0.2.0 -llama-index-readers-file==0.2.2 -llama-index-readers-llama-parse==0.3.0 -llama-parse==0.5.10 -marshmallow==3.23.0 -msal==1.31.0 -msal-extensions==1.2.0 -multidict==6.1.0 -mypy-extensions==1.0.0 -nest-asyncio==1.6.0 -networkx==3.4.2 -nltk==3.9.1 -numpy==1.26.4 -openai==1.52.0 -orjson==3.10.9 -packaging==24.1 -pandas==2.2.3 -pillow==11.0.0 -portalocker==2.10.1 -propcache==0.2.0 -pycparser==2.22 -pydantic==2.9.2 -pydantic-settings==2.6.0 -pydantic_core==2.23.4 -PyJWT==2.9.0 -pypdf==4.3.1 -python-dateutil==2.9.0.post0 -python-dotenv==1.0.1 -pytz==2024.2 -PyYAML==6.0.2 -regex==2024.9.11 -requests==2.32.3 -requests-toolbelt==1.0.0 -six==1.16.0 -sniffio==1.3.1 -soupsieve==2.6 -SQLAlchemy==2.0.36 -sqlalchemy-cratedb>=0.40.0 -striprtf==0.0.26 -tenacity==8.5.0 -tiktoken==0.8.0 -tqdm==4.66.5 -typing-inspect==0.9.0 -typing_extensions==4.12.2 -tzdata==2024.2 -urllib3==2.2.3 -verlib2==0.2.0 -wrapt==1.16.0 -yarl==1.16.0 +langchain-openai<0.3 +llama-index-embeddings-langchain<0.3 +llama-index-embeddings-openai<0.3 +llama-index-llms-azure-openai<0.3 +llama-index-llms-openai<0.3 +python-dotenv +sqlalchemy-cratedb diff --git a/topic/machine-learning/llama-index/test.py b/topic/machine-learning/llama-index/test.py new file mode 100644 index 00000000..3c81566a --- /dev/null +++ b/topic/machine-learning/llama-index/test.py @@ -0,0 +1,40 @@ +from pathlib import Path + +import pytest + +from 
cratedb_toolkit.io.sql import DatabaseAdapter +from dotenv import load_dotenv + +HERE = Path(__file__).parent + + +@pytest.fixture() +def cratedb() -> DatabaseAdapter: + return DatabaseAdapter(dburi="crate://crate@localhost:4200") + + +@pytest.fixture(scope="function", autouse=True) +def init_database(cratedb): + """ + Initialize database. + """ + cratedb.run_sql("DROP TABLE IF EXISTS time_series_data;") + cratedb.run_sql((HERE / "init.sql").read_text()) + + +def test_main(cratedb, capsys): + """ + Execute `main.py` and verify outcome. + """ + + # Load the standalone configuration also for software testing. + # On CI, `OPENAI_API_KEY` will need to be supplied externally. + load_dotenv("env.standalone") + + # Invoke the workload, in-process. + from main import main + main() + + # Verify the outcome. + out = capsys.readouterr().out + assert "Answer was: The average value for sensor 1 is approximately 17.03." in out
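
For local validation outside CI, the new test suite can be exercised roughly as sketched below. This is a minimal sketch only: it assumes Docker is available and that a valid `OPENAI_API_KEY` is supplied, because the key in `env.standalone` is just a commented-out placeholder. On CI, the same suite is driven through `ngr test --accept-no-venv topic/machine-learning/llama-index`, as configured in the workflow file above; locally, a plain `pytest` run should pick up the configuration from `pyproject.toml`, with the test dependencies provided by `pueblo[testing]` from `requirements-dev.txt`.

```shell
# Start a single-node CrateDB instance for testing, using the same command
# as in the README. Run this in a separate terminal, it stays in the foreground.
docker run --rm -it --name=cratedb \
  --publish=4200:4200 --publish=5432:5432 \
  --env=CRATE_HEAP_SIZE=2g crate:latest -Cdiscovery.type=single-node

# Provide a valid OpenAI API key; the value in `env.standalone` is a placeholder.
export OPENAI_API_KEY=<your-openai-api-key>

# Install runtime and test dependencies, then run the test suite.
pip install -r topic/machine-learning/llama-index/requirements.txt \
            -r topic/machine-learning/llama-index/requirements-dev.txt
cd topic/machine-learning/llama-index
pytest
```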