From 9768ae380fee939d13abf5c3c324ef623b0eb2ee Mon Sep 17 00:00:00 2001 From: davidmezzetti <561939+davidmezzetti@users.noreply.github.com> Date: Tue, 7 Nov 2023 16:57:02 -0500 Subject: [PATCH] Update documentation to note SQL bind parameters, closes #596 --- docs/embeddings/query.md | 19 +- .../13_Similarity_search_with_images.ipynb | 884 ++++++++++++------ 2 files changed, 598 insertions(+), 305 deletions(-) diff --git a/docs/embeddings/query.md b/docs/embeddings/query.md index 53246503c..2e215cc9d 100644 --- a/docs/embeddings/query.md +++ b/docs/embeddings/query.md @@ -32,7 +32,6 @@ The similar clause is a txtai function that enables similarity searches with SQL ```sql SELECT id, text, score FROM txtai WHERE similar('feel good story') -SELECT id, text, score FROM txtai WHERE similar('feel good story') ``` The similar clause takes the following arguments: @@ -92,6 +91,20 @@ SELECT text FROM txtai WHERE [parent.child element] = 'abc' Note the bracket statement escaping the nested column with spaces in the name. +### Bind parameters + +txtai has support for SQL bind parameters. + +```python +# Query with a bind parameter for similar clause +query = "SELECT id, text, score FROM txtai WHERE similar(:x)" +results = embeddings.search(query, parameters={"x": "feel good story"}) + +# Query with a bind parameter for column filter +query = "select text, flag, actiondate from txtai where flag = :x" +results = embeddings.search(query, parameters={"x": 1}) +``` + ### Aggregation queries The goal of txtai's query language is to closely support all functions in the underlying database engine. The main challenge is ensuring dynamic columns are properly escaped into the engines native query function. @@ -123,6 +136,10 @@ embeddings.index([("txtai", {"text": "txtai executes machine-learning workflows. # Query txtai and get associated object query = "select object from txtai where similar('machine learning') limit 1" result = embeddings.search(query)[0]["object"] + +# Query binary content with a bind parameter +query = "select object from txtai where similar(:x) limit 1" +results = embeddings.search(query, parameters={"x": request.read()}) ``` ## Custom SQL functions diff --git a/examples/13_Similarity_search_with_images.ipynb b/examples/13_Similarity_search_with_images.ipynb index 362dce415..3bf1ce6cf 100644 --- a/examples/13_Similarity_search_with_images.ipynb +++ b/examples/13_Similarity_search_with_images.ipynb @@ -1,16 +1,4 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, "cells": [ { "cell_type": "markdown", @@ -36,9 +24,11 @@ }, { "cell_type": "code", + "execution_count": 7, "metadata": { "id": "XMQuuun2R06J" }, + "outputs": [], "source": [ "%%capture\n", "!pip install torchvision ipyplot git+https://github.com/neuml/txtai#egg=txtai[similarity]\n", @@ -46,9 +36,7 @@ "# Get test data\n", "!wget -N https://github.com/neuml/txtai/releases/download/v3.5.0/tests.tar.gz\n", "!tar -xvzf tests.tar.gz" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", @@ -66,9 +54,11 @@ }, { "cell_type": "code", + "execution_count": 8, "metadata": { "id": "nTDwXOUeTH2-" }, + "outputs": [], "source": [ "%%capture\n", "\n", @@ -86,75 +76,74 @@ " for path in glob.glob('txtai/*jpg'):\n", " # Add image object along with image metadata\n", " image = Image.open(path)\n", + "\n", " yield (path, {\"object\": image, \"format\": image.format, \"width\": image.width, \"height\": image.height, \"caption\": caption(image)}, None)\n", "\n", "# Index with content and objects\n", "embeddings = Embeddings({\"method\": \"sentence-transformers\", \"path\": \"sentence-transformers/clip-ViT-B-32\", \"content\": True, \"objects\": \"image\"})\n", "embeddings.index(images())" - ], - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "Next let's query and see what's available in the index." - ], "metadata": { "id": "PTZbRHiE5_l3" - } + }, + "source": [ + "Next let's query and see what's available in the index." + ] }, { "cell_type": "code", - "source": [ - "embeddings.search(\"select id, object, format, width, height, caption from txtai\")" - ], + "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "040r95YG1w3J", - "outputId": "cffa34d0-983a-4086-9bb1-5cf9aa791b6b" + "outputId": "65a2b2b2-6153-4f3d-e32c-5e4e661956ab" }, - "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ - "[{'caption': 'a book shelf filled with many books',\n", + "[{'id': 'txtai/books.jpg',\n", + " 'object': ,\n", " 'format': 'JPEG',\n", + " 'width': 1024,\n", " 'height': 682,\n", - " 'id': 'txtai/books.jpg',\n", - " 'object': ,\n", - " 'width': 1024},\n", - " {'caption': 'a large building with many windows in a city',\n", + " 'caption': 'a book shelf filled with books and a stack of books'},\n", + " {'id': 'txtai/buildings.jpg',\n", + " 'object': ,\n", " 'format': 'JPEG',\n", + " 'width': 700,\n", " 'height': 466,\n", - " 'id': 'txtai/buildings.jpg',\n", - " 'object': ,\n", - " 'width': 700},\n", - " {'caption': 'a bird is perched on top of a tree branch',\n", + " 'caption': 'a city skyline with buildings and a sky background'},\n", + " {'id': 'txtai/chop.jpg',\n", + " 'object': ,\n", " 'format': 'JPEG',\n", + " 'width': 700,\n", " 'height': 466,\n", - " 'id': 'txtai/chop.jpg',\n", - " 'object': ,\n", - " 'width': 700}]" + " 'caption': 'a tree branch with a person holding a stick'}]" ] }, "metadata": {}, - "execution_count": 23 + "execution_count": 9 } + ], + "source": [ + "embeddings.search(\"select id, object, format, width, height, caption from txtai\")" ] }, { "cell_type": "markdown", - "source": [ - "The query above shows the metadata that was added in addition to the image object. These fields can be retrieved on search and/or used to filter results." - ], "metadata": { "id": "r5GjmdCA6IPJ" - } + }, + "source": [ + "The query above shows the metadata that was added in addition to the image object. These fields can be retrieved on search and/or used to filter results." + ] }, { "cell_type": "markdown", @@ -169,43 +158,33 @@ }, { "cell_type": "code", + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", - "height": 698 + "height": 600 }, "id": "WHTq86MG9UBF", - "outputId": "d3c01ec4-001a-4c62-b471-d13109766ffe" + "outputId": "b679e3d3-a82a-43bb-c59e-c7e9a3ce0577" }, - "source": [ - "import ipyplot\n", - "from PIL import Image\n", - "\n", - "images, labels = [], []\n", - "for query in [\"Walking into the office\", \"Saturday cleaning the yard\", \"Working on the latest analysis\", \"Working on my homework\", \"Watching an exciting race\",\n", - " \"The universe is massive\", \"Time lapse video of traffic\", \"Relaxing Thanksgiving day\"]:\n", - " result = embeddings.search(f\"select object from txtai where similar(\\\"{query}\\\")\", 1)[0]\n", - " images.append(result[\"object\"])\n", - " labels.append(query)\n", - "\n", - "ipyplot.plot_images(images, labels, img_width=425, force_b64=True)" - ], - "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { + "text/plain": [ + "" + ], "text/html": [ "\n", " \n", "
\n", - " \n", - " \n", - " \n", "
\n", " " - ], - "text/plain": [ - "" ] }, "metadata": {} @@ -430,10 +406,13 @@ { "output_type": "display_data", "data": { + "text/plain": [ + "" + ], "text/html": [ "\n", " \n", - "
\n", - "
\n", - "
\n", + "
\n", + "
\n", + "
\n", "

Walking into the office

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Saturday cleaning the yard

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Working on the latest analysis

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Working on my homework

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Watching an exciting race

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

The universe is massive

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Time lapse video of traffic

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Relaxing Thanksgiving day

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", "
" + ] + }, + "metadata": {} + } + ], + "source": [ + "import ipyplot\n", + "from PIL import Image\n", + "\n", + "def resize(images):\n", + " results = []\n", + " for image in images:\n", + " results.append(image.resize((350, int(image.height * (350 / image.width))), Image.Resampling.LANCZOS))\n", + "\n", + " return results\n", + "\n", + "images, labels = [], []\n", + "for query in [\"Walking into the office\", \"Saturday cleaning the yard\", \"Working on the latest analysis\", \"Working on my homework\", \"Watching an exciting race\",\n", + " \"The universe is massive\", \"Time lapse video of traffic\", \"Relaxing Thanksgiving day\"]:\n", + " result = embeddings.search(f\"select object from txtai where similar(\\\"{query}\\\")\", 1)[0]\n", + " images.append(result[\"object\"])\n", + " labels.append(query)\n", + "\n", + "ipyplot.plot_images(resize(images), labels, img_width=350, force_b64=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8BYDpAeoiOvt" + }, + "source": [ + "# Search with SQL\n", + "\n", + "txtai has support for SQL bind parameters, which enables similarity search with binary content." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 317 + }, + "id": "a2V0wE84iWkh", + "outputId": "f98abb71-1679-40fb-d3b4-98726b10dfe6" + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "" ], + "text/html": [ + "\n", + " \n", + "
\n", + " \n", + " \n", + " \n", + "
\n", + " " + ] + }, + "metadata": {} + }, + { + "output_type": "display_data", + "data": { "text/plain": [ "" + ], + "text/html": [ + "\n", + " \n", + "
\n", + "
\n", + "
\n", + "

Result

\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
\n", + "
" ] }, "metadata": {} } + ], + "source": [ + "result = embeddings.search(f\"select object from txtai where similar(:x)\", 1, parameters={\"x\": Image.open(\"txtai/books.jpg\")})[0]\n", + "\n", + "ipyplot.plot_images(resize([result[\"object\"]]), [\"Result\"], img_width=350, force_b64=True)" ] }, { @@ -630,55 +895,33 @@ }, { "cell_type": "code", + "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/", - "height": 728 + "height": 630 }, "id": "e8BxURU6gZV3", - "outputId": "486f8450-5a52-4218-a2e4-f0c1e03bebde" + "outputId": "e58335f5-25ee-4c9a-bc90-b0027f21cdef" }, - "source": [ - "import ipyplot\n", - "from PIL import Image\n", - "\n", - "from txtai.pipeline import Translation\n", - "\n", - "# Update model at query time to support multilingual queries\n", - "embeddings.config[\"path\"] = \"sentence-transformers/clip-ViT-B-32-multilingual-v1\"\n", - "embeddings.model = embeddings.loadvectors()\n", - "\n", - "# Translate queries to German\n", - "queries = [\"Walking into the office\", \"Saturday cleaning the yard\", \"Working on the latest analysis\", \"Working on my homework\", \"Watching an exciting race\",\n", - " \"The universe is massive\", \"Time lapse video of traffic\", \"Relaxing Thanksgiving day\"]\n", - "translate = Translation()\n", - "translated = translate(queries, \"de\")\n", - "\n", - "images, labels = [], []\n", - "for x, query in enumerate(translated):\n", - " result = embeddings.search(f\"select object from txtai where similar(\\\"{query}\\\")\", 1)[0]\n", - "\n", - " images.append(result[\"object\"])\n", - " labels.append(\"%s
(%s)\" % (query, queries[x]))\n", - "\n", - "ipyplot.plot_images(images, labels, img_width=425, force_b64=True)" - ], - "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { + "text/plain": [ + "" + ], "text/html": [ "\n", " \n", "
\n", - " \n", - " \n", - " \n", "
\n", " " - ], - "text/plain": [ - "" ] }, "metadata": {} @@ -903,10 +1143,13 @@ { "output_type": "display_data", "data": { + "text/plain": [ + "" + ], "text/html": [ "\n", " \n", - "
\n", - "
\n", - "
\n", + "
\n", + "
\n", + "
\n", "

Zu Fuß ins Büro
(Walking into the office)

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Samstag Reinigung des Hofes
(Saturday cleaning the yard)

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Arbeiten an der neuesten Analyse
(Working on the latest analysis)

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Arbeiten an meinen Hausaufgaben
(Working on my homework)

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Ein spannendes Rennen beobachten
(Watching an exciting race)

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Das Universum ist riesig
(The universe is massive)

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", - "

Zeitraffer-Video des
(Time lapse video of traffic)

\n", - " \n", + "
\n", + "
\n", + "

Zeitraffer Video des Verkehrs
(Time lapse video of traffic)

\n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", " \n", - "
\n", - "
\n", + "
\n", + "
\n", "

Entspannender Thanksgiving-Tag
(Relaxing Thanksgiving day)

\n", - " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", " \n", " \n", "
\n", "
\n", "
" - ], - "text/plain": [ - "" ] }, "metadata": {} } + ], + "source": [ + "import ipyplot\n", + "\n", + "from txtai.pipeline import Translation\n", + "\n", + "# Update model at query time to support multilingual queries\n", + "embeddings.config[\"path\"] = \"sentence-transformers/clip-ViT-B-32-multilingual-v1\"\n", + "embeddings.model = embeddings.loadvectors()\n", + "\n", + "# Translate queries to German\n", + "queries = [\"Walking into the office\", \"Saturday cleaning the yard\", \"Working on the latest analysis\", \"Working on my homework\", \"Watching an exciting race\",\n", + " \"The universe is massive\", \"Time lapse video of traffic\", \"Relaxing Thanksgiving day\"]\n", + "translate = Translation()\n", + "translated = translate(queries, \"de\")\n", + "\n", + "images, labels = [], []\n", + "for x, query in enumerate(translated):\n", + " result = embeddings.search(f\"select object from txtai where similar(:x)\", 1, parameters={\"x\": query})[0]\n", + "\n", + " images.append(result[\"object\"])\n", + " labels.append(\"%s
(%s)\" % (query, queries[x]))\n", + "\n", + "ipyplot.plot_images(resize(images), labels, img_width=350, force_b64=True)" ] } - ] + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file