From adbdd15b102ebc5ea8a0ca5c3d80d25e3bd5ab5b Mon Sep 17 00:00:00 2001 From: bilgeyucel Date: Thu, 2 Jan 2025 14:40:58 +0300 Subject: [PATCH] Update the warnings * Add more explanation for 2.x tutorials * Put alternatives for 1.x tutorials --- tutorials/01_Basic_QA_Pipeline.ipynb | 2 +- .../02_Finetune_a_model_on_your_data.ipynb | 4 + tutorials/03_Scalable_QA_System.ipynb | 4 + tutorials/04_FAQ_style_QA.ipynb | 1439 ++++----- tutorials/05_Evaluation.ipynb | 4 + ...er_Retrieval_via_Embedding_Retrieval.ipynb | 2 +- tutorials/07_RAG_Generator.ipynb | 4 +- tutorials/08_Preprocessing.ipynb | 2 +- tutorials/09_DPR_training.ipynb | 4 + tutorials/10_Knowledge_Graph.ipynb | 4 +- tutorials/11_Pipelines.ipynb | 6 +- tutorials/12_LFQA.ipynb | 4 +- tutorials/13_Question_generation.ipynb | 4 + tutorials/14_Query_Classifier.ipynb | 4 + tutorials/15_TableQA.ipynb | 4 + ...16_Document_Classifier_at_Index_Time.ipynb | 4 + tutorials/17_Audio.ipynb | 11 +- tutorials/18_GPL.ipynb | 4 + ...h_pipeline_with_MultiModal_Retriever.ipynb | 9 +- .../20_Using_Haystack_with_REST_API.ipynb | 4 + tutorials/21_Customizing_PromptNode.ipynb | 10 +- tutorials/22_Pipeline_with_PromptNode.ipynb | 13 +- ...ering_Multihop_Questions_with_Agents.ipynb | 4 + tutorials/24_Building_Chat_App.ipynb | 4 + tutorials/25_Customizing_Agent.ipynb | 4 + tutorials/26_Hybrid_Retrieval.ipynb | 13 +- tutorials/27_First_RAG_Pipeline.ipynb | 2689 ++++++++--------- .../28_Structured_Output_With_Loop.ipynb | 972 +++--- tutorials/29_Serializing_Pipelines.ipynb | 47 +- ...le_Type_Preprocessing_Index_Pipeline.ipynb | 5 +- tutorials/31_Metadata_Filtering.ipynb | 4 +- ...ng_Documents_and_Queries_by_Language.ipynb | 10 +- tutorials/33_Hybrid_Retrieval.ipynb | 4 +- tutorials/34_Extractive_QA_Pipeline.ipynb | 2 +- tutorials/35_Evaluating_RAG_Pipelines.ipynb | 137 +- ...g_Fallbacks_with_Conditional_Routing.ipynb | 1071 +++---- ...ing_Pipeline_Inputs_with_Multiplexer.ipynb | 4 +- ...ding_Metadata_for_Improved_Retrieval.ipynb | 67 +- 
...at_Application_with_Function_Calling.ipynb | 4 +- 39 files changed, 3159 insertions(+), 3428 deletions(-) diff --git a/tutorials/01_Basic_QA_Pipeline.ipynb b/tutorials/01_Basic_QA_Pipeline.ipynb index 7000b94c..7162d354 100644 --- a/tutorials/01_Basic_QA_Pipeline.ipynb +++ b/tutorials/01_Basic_QA_Pipeline.ipynb @@ -7,7 +7,7 @@ "source": [ "# Tutorial: Build Your First Question Answering System\n", "\n", - "> This tutorial is based on Haystack 1.x. If you're using Haystack 2.0 and would like to follow the updated version of this tutorial, check out [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline) and [Build an Extractive QA Pipeline](https://haystack.deepset.ai/tutorials/34_extractive_qa_pipeline). \n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`) and would like to follow the updated version of this tutorial, check out [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline) and [Build an Extractive QA Pipeline](https://haystack.deepset.ai/tutorials/34_extractive_qa_pipeline). \n", ">\n", "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", "\n", diff --git a/tutorials/02_Finetune_a_model_on_your_data.ipynb b/tutorials/02_Finetune_a_model_on_your_data.ipynb index 3620e168..07d2bece 100644 --- a/tutorials/02_Finetune_a_model_on_your_data.ipynb +++ b/tutorials/02_Finetune_a_model_on_your_data.ipynb @@ -7,6 +7,10 @@ "source": [ "# Tutorial: Fine-Tuning a Model on Your Own Data\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). 
If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "- **Level**: Intermediate\n", "- **Time to complete**: 15 minutes\n", "- **Nodes Used**: `FARMReader`\n", diff --git a/tutorials/03_Scalable_QA_System.ipynb b/tutorials/03_Scalable_QA_System.ipynb index 810889f7..ecd098da 100644 --- a/tutorials/03_Scalable_QA_System.ipynb +++ b/tutorials/03_Scalable_QA_System.ipynb @@ -7,6 +7,10 @@ "source": [ "# Tutorial: Build a Scalable Question Answering System\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "- **Level**: Beginner\n", "- **Time to complete**: 20 minutes\n", "- **Nodes Used**: `ElasticsearchDocumentStore`, `BM25Retriever`, `FARMReader`\n", diff --git a/tutorials/04_FAQ_style_QA.ipynb b/tutorials/04_FAQ_style_QA.ipynb index 4bb2cf38..382463c0 100644 --- a/tutorials/04_FAQ_style_QA.ipynb +++ b/tutorials/04_FAQ_style_QA.ipynb @@ -1,736 +1,741 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "jUbPUmtaozIP" - }, - "source": [ - "# Utilizing existing FAQs for Question Answering\n", - "- **Level**: Beginner\n", - "- **Time to complete**: 15 minutes\n", - "- **Nodes Used**: `InMemoryDocumentStore`, `EmbeddingRetriever`\n", - "- **Goal**: Learn how to use the `EmbeddingRetriever` in a `FAQPipeline` to answer incoming questions by matching them to the most similar questions in 
your existing FAQ.\n", - "\n", - "# Overview\n", - "While *extractive Question Answering* works on pure texts and is therefore more generalizable, there's also a common alternative that utilizes existing FAQ data.\n", - "\n", - "**Pros**:\n", - "\n", - "- Very fast at inference time\n", - "- Utilize existing FAQ data\n", - "- Quite good control over answers\n", - "\n", - "**Cons**:\n", - "\n", - "- Generalizability: We can only answer questions that are similar to existing ones in FAQ\n", - "\n", - "In some use cases, a combination of extractive QA and FAQ-style can also be an interesting option." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "collapsed": false, - "id": "zBOtphIMozIT" - }, - "source": [ - "\n", - "## Preparing the Colab Environment\n", - "\n", - "- [Enable GPU Runtime](https://docs.haystack.deepset.ai/docs/enabling-gpu-acceleration#enabling-the-gpu-in-colab)\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "ENpLjBejozIW" - }, - "source": [ - "## Installing Haystack\n", - "\n", - "To start, let's install the latest release of Haystack with `pip`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "q_y78_4LozIW" - }, - "outputs": [], - "source": [ - "%%bash\n", - "\n", - "pip install --upgrade pip\n", - "pip install farm-haystack[colab,inference]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Enabling Telemetry \n", - "Knowing you're using this tutorial helps us decide where to invest our efforts to build a better product but you can always opt out by commenting the following line. See [Telemetry](https://docs.haystack.deepset.ai/docs/telemetry) for more details." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from haystack.telemetry import tutorial_running\n", - "\n", - "tutorial_running(4)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "collapsed": false, - "id": "Wl9Q6E3hozIW", - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Set the logging level to INFO:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "id": "Edvocv1ZozIX", - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "import logging\n", - "\n", - "logging.basicConfig(format=\"%(levelname)s - %(name)s - %(message)s\", level=logging.WARNING)\n", - "logging.getLogger(\"haystack\").setLevel(logging.INFO)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "noVtM20ZozIX" - }, - "source": [ - "### Create a simple DocumentStore\n", - "The InMemoryDocumentStore is good for quick development and prototyping. For more scalable options, check-out the [docs](https://docs.haystack.deepset.ai/docs/document_store)." - ] + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "jUbPUmtaozIP" + }, + "source": [ + "# Utilizing existing FAQs for Question Answering\n", + "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). 
If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", + "- **Level**: Beginner\n", + "- **Time to complete**: 15 minutes\n", + "- **Nodes Used**: `InMemoryDocumentStore`, `EmbeddingRetriever`\n", + "- **Goal**: Learn how to use the `EmbeddingRetriever` in a `FAQPipeline` to answer incoming questions by matching them to the most similar questions in your existing FAQ.\n", + "\n", + "# Overview\n", + "While *extractive Question Answering* works on pure texts and is therefore more generalizable, there's also a common alternative that utilizes existing FAQ data.\n", + "\n", + "**Pros**:\n", + "\n", + "- Very fast at inference time\n", + "- Utilize existing FAQ data\n", + "- Quite good control over answers\n", + "\n", + "**Cons**:\n", + "\n", + "- Generalizability: We can only answer questions that are similar to existing ones in FAQ\n", + "\n", + "In some use cases, a combination of extractive QA and FAQ-style can also be an interesting option." 
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "id": "zBOtphIMozIT" + }, + "source": [ + "\n", + "## Preparing the Colab Environment\n", + "\n", + "- [Enable GPU Runtime](https://docs.haystack.deepset.ai/docs/enabling-gpu-acceleration#enabling-the-gpu-in-colab)\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "ENpLjBejozIW" + }, + "source": [ + "## Installing Haystack\n", + "\n", + "To start, let's install the latest release of Haystack with `pip`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "q_y78_4LozIW" + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "pip install --upgrade pip\n", + "pip install farm-haystack[colab,inference]" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Enabling Telemetry \n", + "Knowing you're using this tutorial helps us decide where to invest our efforts to build a better product but you can always opt out by commenting the following line. See [Telemetry](https://docs.haystack.deepset.ai/docs/telemetry) for more details." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from haystack.telemetry import tutorial_running\n", + "\n", + "tutorial_running(4)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "id": "Wl9Q6E3hozIW", + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Set the logging level to INFO:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "Edvocv1ZozIX", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "logging.basicConfig(format=\"%(levelname)s - %(name)s - %(message)s\", level=logging.WARNING)\n", + "logging.getLogger(\"haystack\").setLevel(logging.INFO)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "noVtM20ZozIX" + }, + "source": [ + "### Create a simple DocumentStore\n", + "The InMemoryDocumentStore is good for quick development and prototyping. For more scalable options, check-out the [docs](https://docs.haystack.deepset.ai/docs/document_store)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zeVfvRLZozIY" + }, + "outputs": [], + "source": [ + "from haystack.document_stores import InMemoryDocumentStore\n", + "\n", + "document_store = InMemoryDocumentStore()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "id": "zHevRxxaozIa" + }, + "source": [ + "### Create a Retriever using embeddings\n", + "Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones).\n", + "We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for the embeddings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oFNXb3kIozIb", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from haystack.nodes import EmbeddingRetriever\n", + "\n", + "retriever = EmbeddingRetriever(\n", + " document_store=document_store,\n", + " embedding_model=\"sentence-transformers/all-MiniLM-L6-v2\",\n", + " use_gpu=True,\n", + " scale_score=False,\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "id": "uLv8ysluozIb" + }, + "source": [ + "### Prepare & Index FAQ data\n", + "We create a pandas dataframe containing some FAQ data (i.e curated pairs of question + answer) and index those in our documentstore.\n", + "Here: We download some question-answer pairs related to COVID-19" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AHiSltp4ozIb", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from haystack.utils import fetch_archive_from_http\n", + "\n", + "\n", + "# Download\n", + "doc_dir = \"data/tutorial4\"\n", + "s3_url = \"https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/small_faq_covid.csv.zip\"\n", + "fetch_archive_from_http(url=s3_url, output_dir=doc_dir)\n", + "\n", + "# Get dataframe with columns \"question\", \"answer\" and some custom metadata\n", + "df = pd.read_csv(f\"{doc_dir}/small_faq_covid.csv\")\n", + "# Minimal cleaning\n", + "df.fillna(value=\"\", inplace=True)\n", + "df[\"question\"] = df[\"question\"].apply(lambda x: x.strip())\n", + "print(df.head())\n", + "\n", + "# Create embeddings for our questions from the FAQs\n", + "# In contrast to most other search use cases, we don't create the embeddings here from the content of our documents,\n", + "# but rather from the additional text field \"question\" as we want to match \"incoming question\" <-> \"stored question\".\n", + "questions = 
list(df[\"question\"].values)\n", + "df[\"embedding\"] = retriever.embed_queries(queries=questions).tolist()\n", + "df = df.rename(columns={\"question\": \"content\"})\n", + "\n", + "# Convert Dataframe to list of dicts and index them in our DocumentStore\n", + "docs_to_index = df.to_dict(orient=\"records\")\n", + "document_store.write_documents(docs_to_index)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "collapsed": false, + "id": "MXteNgYRozIb" + }, + "source": [ + "### Ask questions\n", + "Initialize a Pipeline (this time without a reader) and ask questions" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "id": "F5O7r3poozIb", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "from haystack.pipelines import FAQPipeline\n", + "\n", + "pipe = FAQPipeline(retriever=retriever)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 709, + "referenced_widgets": [ + "070f7d6a12804647b2c4f5ec98241ced", + "8678507de5e748219ba28bb7970c0e63", + "35855d91133f474092381950bdbfce58", + "0656e34a277141d184aef005e4d39f88", + "612af309a6a94477b56dcea22c7a0940", + "dc9c54def7bf47d39819a97b7ceed839", + "09f4ba018a514f1ca2929ece4d0335e2", + "58f3458cddc747d7b1a1c05f8f0664ed", + "cd2614a0933c48a391966cb572044710", + "52abbb2d8eb043a0924d705a99577303", + "04495cdbd0e04e02a91ae3b026ef4c46" + ] }, + "id": "QX6qbic2ozIc", + "outputId": "af0a8eda-f7f6-4c97-cda7-13566ff888b1", + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zeVfvRLZozIY" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "070f7d6a12804647b2c4f5ec98241ced", + "version_major": 2, + "version_minor": 0 }, - "outputs": [], - "source": [ - "from haystack.document_stores import InMemoryDocumentStore\n", - "\n", - "document_store = 
InMemoryDocumentStore()" + "text/plain": [ + "Batches: 0%| | 0/1 [00:00 \"stored question\".\n", - "questions = list(df[\"question\"].values)\n", - "df[\"embedding\"] = retriever.embed_queries(queries=questions).tolist()\n", - "df = df.rename(columns={\"question\": \"content\"})\n", - "\n", - "# Convert Dataframe to list of dicts and index them in our DocumentStore\n", - "docs_to_index = df.to_dict(orient=\"records\")\n", - "document_store.write_documents(docs_to_index)" - ] + "09f4ba018a514f1ca2929ece4d0335e2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "collapsed": false, - "id": "MXteNgYRozIb" - }, - "source": [ - "### Ask questions\n", - "Initialize a Pipeline (this time without a reader) and ask questions" - ] + "35855d91133f474092381950bdbfce58": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_58f3458cddc747d7b1a1c05f8f0664ed", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cd2614a0933c48a391966cb572044710", + "value": 1 + } }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": { - "id": "F5O7r3poozIb", - 
"pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "from haystack.pipelines import FAQPipeline\n", - "\n", - "pipe = FAQPipeline(retriever=retriever)" - ] + "52abbb2d8eb043a0924d705a99577303": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 709, - "referenced_widgets": [ - "070f7d6a12804647b2c4f5ec98241ced", - "8678507de5e748219ba28bb7970c0e63", - "35855d91133f474092381950bdbfce58", - "0656e34a277141d184aef005e4d39f88", - "612af309a6a94477b56dcea22c7a0940", - "dc9c54def7bf47d39819a97b7ceed839", - "09f4ba018a514f1ca2929ece4d0335e2", - "58f3458cddc747d7b1a1c05f8f0664ed", - "cd2614a0933c48a391966cb572044710", - "52abbb2d8eb043a0924d705a99577303", - 
"04495cdbd0e04e02a91ae3b026ef4c46" - ] - }, - "id": "QX6qbic2ozIc", - "outputId": "af0a8eda-f7f6-4c97-cda7-13566ff888b1", - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "070f7d6a12804647b2c4f5ec98241ced", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Batches: 0%| | 0/1 [00:00 This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`) and would like to follow the updated version of this tutorial, check out [Evaluating RAG Pipelines](https://haystack.deepset.ai/tutorials/35_evaluating_rag_pipelines). \n", + ">\n", + "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "To be able to make a statement about the quality of results a question-answering pipeline or any other pipeline in haystack produces, it is important to evaluate it. Furthermore, evaluation allows determining which components of the pipeline can be improved.\n", "The results of the evaluation can be saved as CSV files, which contain all the information to calculate additional metrics later on or inspect individual predictions." ] diff --git a/tutorials/06_Better_Retrieval_via_Embedding_Retrieval.ipynb b/tutorials/06_Better_Retrieval_via_Embedding_Retrieval.ipynb index 8263ba60..db82999d 100644 --- a/tutorials/06_Better_Retrieval_via_Embedding_Retrieval.ipynb +++ b/tutorials/06_Better_Retrieval_via_Embedding_Retrieval.ipynb @@ -9,7 +9,7 @@ "source": [ "# Better Retrieval via \"Embedding Retrieval\"\n", "\n", - "> This tutorial is based on Haystack 1.x. 
If you're using Haystack 2.0 and would like to follow the updated version of this tutorial, check out [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline) and [Build an Extractive QA Pipeline](https://haystack.deepset.ai/tutorials/34_extractive_qa_pipeline). \n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`) and would like to follow the updated version of this tutorial, check out [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline) and [Build an Extractive QA Pipeline](https://haystack.deepset.ai/tutorials/34_extractive_qa_pipeline). \n", ">\n", "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", "\n", diff --git a/tutorials/07_RAG_Generator.ipynb b/tutorials/07_RAG_Generator.ipynb index 2b239d31..b7dffe25 100644 --- a/tutorials/07_RAG_Generator.ipynb +++ b/tutorials/07_RAG_Generator.ipynb @@ -15,7 +15,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> As of version 1.16, `RAGenerator` has been deprecated in Haystack and completely removed from Haystack as of v1.18. We recommend following the tutorial on [Creating a Generative QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode) instead. For more details about this deprecation, check out [our announcement](https://github.com/deepset-ai/haystack/discussions/4816) on Github." + "> As of version 1.16 (`farm-haystack`), `RAGenerator` has been deprecated in Haystack and completely removed from Haystack as of v1.18. We recommend using Haystack 2.x (`haystack-ai`) and following the tutorial on [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline) instead. 
\n", + ">\n", + "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release)." ] }, { diff --git a/tutorials/08_Preprocessing.ipynb b/tutorials/08_Preprocessing.ipynb index a0be3bca..a24168b3 100644 --- a/tutorials/08_Preprocessing.ipynb +++ b/tutorials/08_Preprocessing.ipynb @@ -8,7 +8,7 @@ "source": [ "# Preprocessing\n", "\n", - "> This tutorial is based on Haystack 1.x. If you're using Haystack 2.0 and would like to follow the updated version of this tutorial, check out [Preprocessing Different File Types](https://haystack.deepset.ai/tutorials/30_file_type_preprocessing_index_pipeline). \n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`) and would like to follow the updated version of this tutorial, check out [Preprocessing Different File Types](https://haystack.deepset.ai/tutorials/30_file_type_preprocessing_index_pipeline). \n", ">\n", "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", "\n", diff --git a/tutorials/09_DPR_training.ipynb b/tutorials/09_DPR_training.ipynb index 611450ae..90ac0445 100644 --- a/tutorials/09_DPR_training.ipynb +++ b/tutorials/09_DPR_training.ipynb @@ -12,6 +12,10 @@ "source": [ "# Training Your Own \"Dense Passage Retrieval\" Model\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). 
If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "Haystack contains all the tools needed to train your own Dense Passage Retrieval model.\n", "This tutorial will guide you through the steps required to create a retriever that is specifically tailored to your domain." ] diff --git a/tutorials/10_Knowledge_Graph.ipynb b/tutorials/10_Knowledge_Graph.ipynb index 5f728a64..198aba3e 100644 --- a/tutorials/10_Knowledge_Graph.ipynb +++ b/tutorials/10_Knowledge_Graph.ipynb @@ -11,7 +11,9 @@ "source": [ "# Question Answering on a Knowledge Graph\n", "\n", - "> Starting from version 1.15, `BaseKnowledgeGraph`, `GraphDBKnowledgeGraph`, `InMemoryKnowledgeGraph`, and `Text2SparqlRetriever` are being deprecated and will be removed from Haystack as of version 1.17. For more details about this deprecation, check out [our announcement](https://github.com/deepset-ai/haystack/discussions/4882) on Github. \n", + "> Starting from version 1.15 (`farm-haystack`), `BaseKnowledgeGraph`, `GraphDBKnowledgeGraph`, `InMemoryKnowledgeGraph`, and `Text2SparqlRetriever` are being deprecated and will be removed from Haystack as of version 1.17. For more details about this deprecation, check out [our announcement](https://github.com/deepset-ai/haystack/discussions/4882) on Github. \n", + ">\n", + "> If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook). 
For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", "\n", "Haystack allows storing and querying knowledge graphs with the help of pre-trained models that translate text queries to SPARQL queries.\n", "This tutorial demonstrates how to load an existing knowledge graph into haystack, load a pre-trained retriever, and execute text queries on the knowledge graph.\n", diff --git a/tutorials/11_Pipelines.ipynb b/tutorials/11_Pipelines.ipynb index 86a9f7cb..381b9064 100644 --- a/tutorials/11_Pipelines.ipynb +++ b/tutorials/11_Pipelines.ipynb @@ -11,7 +11,11 @@ } }, "source": [ - "# Tutorial: How to Use Pipelines\n", + "# How to Use Pipelines\n", + "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`) and would like to follow the updated version of this tutorial, check out [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline). \n", + ">\n", + "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", "\n", "In this tutorial, you will learn how the `Pipeline` connects the different components in Haystack. Whether you are using a Reader, Summarizer\n", "or Retriever (or 2), the `Pipeline` class will help you build a Directed Acyclic Graph (DAG) that\n", diff --git a/tutorials/12_LFQA.ipynb b/tutorials/12_LFQA.ipynb index c70f96c9..787bc9b5 100644 --- a/tutorials/12_LFQA.ipynb +++ b/tutorials/12_LFQA.ipynb @@ -15,7 +15,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> As of version 1.16, `Seq2SeqGenerator` has been deprecated in Haystack and completely removed from Haystack as of v1.18. We recommend following the tutorial on [Creating a Generative QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode) instead. 
For more details about this deprecation, check out [our announcement](https://github.com/deepset-ai/haystack/discussions/4816) on Github." + "> As of version 1.16 (`farm-haystack`), `Seq2SeqGenerator` has been deprecated in Haystack and completely removed from Haystack as of v1.18. We recommend using Haystack 2.x (`haystack-ai`) and following the tutorial on [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline) instead. \n", + ">\n", + "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release)." ] }, { diff --git a/tutorials/13_Question_generation.ipynb b/tutorials/13_Question_generation.ipynb index 1f50b5d6..0a78d8d7 100644 --- a/tutorials/13_Question_generation.ipynb +++ b/tutorials/13_Question_generation.ipynb @@ -9,6 +9,10 @@ "source": [ "# Question Generation\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`) and would like to follow the updated version of this tutorial, check out [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline). \n", + ">\n", + "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "This is a bare bones tutorial showing what is possible with the QuestionGenerator Nodes and Pipelines which automatically\n", "generate questions which the question generation model thinks can be answered by a given document." ] diff --git a/tutorials/14_Query_Classifier.ipynb b/tutorials/14_Query_Classifier.ipynb index a2a4da9d..09d8798b 100644 --- a/tutorials/14_Query_Classifier.ipynb +++ b/tutorials/14_Query_Classifier.ipynb @@ -9,6 +9,10 @@ "source": [ "# Tutorial: Query Classifier\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). 
If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "- **Level**: Intermediate\n", "- **Time to complete**: 15 minutes\n", "- **Nodes Used**: `TransformersQueryClassifier`, `InMemoryDocumentStore`, `BM25Retriever`, `EmbeddingRetriever`, `FARMReader`\n", diff --git a/tutorials/15_TableQA.ipynb b/tutorials/15_TableQA.ipynb index ed8fea8a..77bc51f7 100644 --- a/tutorials/15_TableQA.ipynb +++ b/tutorials/15_TableQA.ipynb @@ -9,6 +9,10 @@ "source": [ "# Open-Domain QA on Tables\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "This tutorial shows you how to perform question-answering on tables using the `EmbeddingRetriever` or `BM25Retriever` as retriever node and the `TableReader` as reader node." ] }, diff --git a/tutorials/16_Document_Classifier_at_Index_Time.ipynb b/tutorials/16_Document_Classifier_at_Index_Time.ipynb index 57b23caf..6f132c4e 100644 --- a/tutorials/16_Document_Classifier_at_Index_Time.ipynb +++ b/tutorials/16_Document_Classifier_at_Index_Time.ipynb @@ -7,6 +7,10 @@ "source": [ "# Extending your Metadata using DocumentClassifiers at Index Time\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). 
If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "With DocumentClassifier it's possible to automatically enrich your documents with categories, sentiments, topics or whatever metadata you like. This metadata could be used for efficient filtering or further processing. Say you have some categories your users typically filter on. If the documents are tagged manually with these categories, you could automate this process by training a model. Or you can leverage the full power and flexibility of zero shot classification. All you need to do is pass your categories to the classifier, no labels required. This tutorial shows how to integrate it in your indexing pipeline." ] }, diff --git a/tutorials/17_Audio.ipynb b/tutorials/17_Audio.ipynb index cd15a2f1..f460f808 100644 --- a/tutorials/17_Audio.ipynb +++ b/tutorials/17_Audio.ipynb @@ -9,7 +9,9 @@ "source": [ "# Tutorial: Make Your QA Pipelines Talk!\n", "\n", - ">⚠️**Update:** This tutorial is now outdated and we recommend moving to Haystack >= 2.0 and checking out the new tutorials [here](https://haystack.deepset.ai/tutorials). AnswerToSpeech lives in the [text2speech](https://github.com/deepset-ai/haystack-extras/tree/main/nodes/text2speech) package. Main [Haystack](https://github.com/deepset-ai/haystack) repository doesn't include it anymore.\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). 
If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", "\n", "- **Level**: Intermediate\n", "- **Time to complete**: 15 minutes\n", @@ -319,7 +321,7 @@ }, "gpuClass": "standard", "kernelspec": { - "display_name": "Python 3.9.6 64-bit", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -334,11 +336,6 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" - }, - "vscode": { - "interpreter": { - "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" - } } }, "nbformat": 4, diff --git a/tutorials/18_GPL.ipynb b/tutorials/18_GPL.ipynb index 9ca92467..187a757a 100644 --- a/tutorials/18_GPL.ipynb +++ b/tutorials/18_GPL.ipynb @@ -12,6 +12,10 @@ "source": [ "# Generative Pseudo Labeling for Domain Adaptation of Dense Retrievals\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "*Note: Adapted to Haystack from Nils Reimers' original [notebook](https://colab.research.google.com/gist/jamescalam/d2c888775c87f9882bb7c379a96adbc8/gpl-domain-adaptation.ipynb#scrollTo=183ff7ab)\n", "\n", "The NLP models we use every day were trained on a corpus of data that reflects the world from the past. In the meantime, we've experienced world-changing events, like the COVID pandemics, and we'd like our models to know about them. 
Training a model from scratch is tedious work but what if we could just update the models with new data? Generative Pseudo Labeling comes to the rescue.\n", diff --git a/tutorials/19_Text_to_Image_search_pipeline_with_MultiModal_Retriever.ipynb b/tutorials/19_Text_to_Image_search_pipeline_with_MultiModal_Retriever.ipynb index 24386806..52ef2702 100644 --- a/tutorials/19_Text_to_Image_search_pipeline_with_MultiModal_Retriever.ipynb +++ b/tutorials/19_Text_to_Image_search_pipeline_with_MultiModal_Retriever.ipynb @@ -9,18 +9,17 @@ "source": [ "# Text-To-Image Search Pipeline with Multimodal Retriever\n", "\n", - "**Level**: Intermediate\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", "\n", + "**Level**: Intermediate\n", "**Time to complete**: 20 minutes\n", - "\n", "**Prerequisites**: This tutorial assumes basic knowledge of Haystack Retrievers and Pipelines. If you want to learn about them, have a look at our tutorials on [Build Your First QA System](https://github.com/deepset-ai/haystack-tutorials/blob/main/tutorials/01_Basic_QA_Pipeline.ipynb) and [Fine-Tuning a Model on Your Own Data](https://github.com/deepset-ai/haystack-tutorials/blob/main/tutorials/02_Finetune_a_model_on_your_data.ipynb).\n", "\n", "Prepare the Colab environment (see links below).\n", - "\n", "**Nodes Used**: InMemoryDocumentStore, MultiModalRetriever\n", - "\n", "**Goal**: After completing this tutorial, you will have built a search system that retrieves images as answers to a text query.\n", - "\n", "**Description**: In this tutorial, you'll download a set of images that you'll then turn into embeddings using a transformers model, OpenAI CLIP. 
You'll then use the same model to embed the text query. Finally, you'll perform a nearest neighbor search to retrieve the images relevant to the text query.\n", "\n", "Let's build a text-to-image search pipeline using a small animal dataset!" diff --git a/tutorials/20_Using_Haystack_with_REST_API.ipynb b/tutorials/20_Using_Haystack_with_REST_API.ipynb index 55d66296..57a79eda 100644 --- a/tutorials/20_Using_Haystack_with_REST_API.ipynb +++ b/tutorials/20_Using_Haystack_with_REST_API.ipynb @@ -7,6 +7,10 @@ "source": [ "# Tutorial: Using Haystack with REST API\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "- **Level**: Advanced\n", "- **Time to complete**: 30 minutes\n", "- **Prerequisites**: Basic understanding of Docker and basic knowledge of Haystack pipelines. \n", diff --git a/tutorials/21_Customizing_PromptNode.ipynb b/tutorials/21_Customizing_PromptNode.ipynb index e031a8f3..078bacbb 100644 --- a/tutorials/21_Customizing_PromptNode.ipynb +++ b/tutorials/21_Customizing_PromptNode.ipynb @@ -9,14 +9,14 @@ "source": [ "# Tutorial: Customizing PromptNode for NLP Tasks\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). 
If you're using Haystack 2.x (`haystack-ai`) and would like to follow the updated version of this tutorial, check out [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline). \n", + ">\n", + "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "- **Level**: Intermediate\n", "- **Time to complete**: 20 minutes\n", "- **Nodes Used**: `PromptNode`, `PromptTemplate`\n", - "- **Goal**: After completing this tutorial, you will have learned the basics of using PromptNode and PromptTemplates and you'll have added titles to articles from The Guardian and categorized them. \n", - "\n", - "> This tutorial is based on Haystack 1.x. If you're using Haystack 2.0 and would like to follow the updated version of this tutorial, check out [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline). \n", - ">\n", - "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release)." + "- **Goal**: After completing this tutorial, you will have learned the basics of using PromptNode and PromptTemplates and you'll have added titles to articles from The Guardian and categorized them. " ] }, { diff --git a/tutorials/22_Pipeline_with_PromptNode.ipynb b/tutorials/22_Pipeline_with_PromptNode.ipynb index fedbf40c..d7ea329d 100644 --- a/tutorials/22_Pipeline_with_PromptNode.ipynb +++ b/tutorials/22_Pipeline_with_PromptNode.ipynb @@ -9,21 +9,16 @@ "source": [ "# Tutorial: Creating a Generative QA Pipeline with Retrieval-Augmentation\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). 
If you're using Haystack 2.x (`haystack-ai`) and would like to follow the updated version of this tutorial, check out [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline). \n", + ">\n", + "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "- **Level**: Intermediate\n", "- **Time to complete**: 15 minutes\n", "- **Nodes Used**: `InMemoryDocumentStore`, `BM25Retriever`, `PromptNode`, `PromptTemplate`\n", "- **Goal**: After completing this tutorial, you'll have created a generative question answering search system that uses a large language model through PromptNode with PromptTemplate." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> This tutorial is based on Haystack 1.x. If you're using Haystack 2.0 and would like to follow the updated version of this tutorial, check out [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline). \n", - ">\n", - "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release)." - ] - }, { "attachments": {}, "cell_type": "markdown", diff --git a/tutorials/23_Answering_Multihop_Questions_with_Agents.ipynb b/tutorials/23_Answering_Multihop_Questions_with_Agents.ipynb index 12f7b5f8..86275a91 100644 --- a/tutorials/23_Answering_Multihop_Questions_with_Agents.ipynb +++ b/tutorials/23_Answering_Multihop_Questions_with_Agents.ipynb @@ -9,6 +9,10 @@ "source": [ "# Tutorial: Answering Multihop Questions with Agents\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). 
If you're using Haystack 2.x (`haystack-ai`) and would like to follow the updated version of this tutorial, check out [Cookbook: Newsletter Sending Agent](https://haystack.deepset.ai/cookbook/newsletter-agent) or refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "- **Level**: Intermediate\n", "- **Time to complete**: 10 minutes\n", "- **Nodes Used**: `Agent`, `PromptNode`, `InMemoryDocumentStore`, `FARMReader` and `ExtractiveQAPipeline`\n", diff --git a/tutorials/24_Building_Chat_App.ipynb b/tutorials/24_Building_Chat_App.ipynb index ee15a6a9..b6399b30 100644 --- a/tutorials/24_Building_Chat_App.ipynb +++ b/tutorials/24_Building_Chat_App.ipynb @@ -8,6 +8,10 @@ "source": [ "# Tutorial: Building a Conversational Chat App\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`) and would like to follow the updated version of this tutorial, check out [Building a Chat Application with Function Calling](https://haystack.deepset.ai/tutorials/40_building_chat_application_with_function_calling).\n", + ">\n", + "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "- **Level**: Intermediate\n", "- **Time to complete**: 10 minutes\n", "- **Nodes Used**: `PromptNode`, `ConversationalAgent` and `ConversationSummaryMemory`\n", diff --git a/tutorials/25_Customizing_Agent.ipynb b/tutorials/25_Customizing_Agent.ipynb index cd076d69..08d592ea 100644 --- a/tutorials/25_Customizing_Agent.ipynb +++ b/tutorials/25_Customizing_Agent.ipynb @@ -9,6 +9,10 @@ "source": [ "# Tutorial: Customizing Agent to Chat with Your Documents\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). 
If you're using Haystack 2.x (`haystack-ai`), refer to the [Haystack 2.x tutorials](https://haystack.deepset.ai/tutorials) or [Haystack Cookbook](https://haystack.deepset.ai/cookbook).\n", + ">\n", + "> For more information on Haystack 2.x, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "- **Level**: Advanced\n", "- **Time to complete**: 20 minutes\n", "- **Nodes Used**: `BM25Retriever`, `PromptNode`, `Agent`, and `Memory`\n", diff --git a/tutorials/26_Hybrid_Retrieval.ipynb b/tutorials/26_Hybrid_Retrieval.ipynb index 0ef23f7c..d56fbfbb 100644 --- a/tutorials/26_Hybrid_Retrieval.ipynb +++ b/tutorials/26_Hybrid_Retrieval.ipynb @@ -8,21 +8,16 @@ "source": [ "# Tutorial: Creating a Hybrid Retrieval Pipeline\n", "\n", + "> This tutorial is based on Haystack 1.x (`farm-haystack`). If you're using Haystack 2.x (`haystack-ai`) and would like to follow the updated version of this tutorial, check out [Creating a Hybrid Pipeline](https://haystack.deepset.ai/tutorials/33_hybrid_retrieval). \n", + ">\n", + "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release).\n", + "\n", "- **Level**: Intermediate\n", "- **Time to complete**: 15 minutes\n", "- **Nodes Used**: `EmbeddingRetriever`, `BM25Retriever`, `JoinDocuments`, `SentenceTransformersRanker` and `InMemoryDocumentStore`\n", "- **Goal**: After completing this tutorial, you will have learned about creating your first hybrid retrieval and when it's useful." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "> This tutorial is based on Haystack 1.x. If you're using Haystack 2.0 and would like to follow the updated version of this tutorial, check out [Creating a Hybrid Pipeline](https://haystack.deepset.ai/tutorials/33_hybrid_retrieval). \n", - ">\n", - "> For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release)." 
- ] - }, { "cell_type": "markdown", "metadata": { diff --git a/tutorials/27_First_RAG_Pipeline.ipynb b/tutorials/27_First_RAG_Pipeline.ipynb index d9467ff3..943d1c70 100644 --- a/tutorials/27_First_RAG_Pipeline.ipynb +++ b/tutorials/27_First_RAG_Pipeline.ipynb @@ -1,1446 +1,1343 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "2OvkPji9O-qX" - }, - "source": [ - "# Tutorial: Creating Your First QA Pipeline with Retrieval-Augmentation\n", - "\n", - "- **Level**: Beginner\n", - "- **Time to complete**: 10 minutes\n", - "- **Components Used**: [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore), [`SentenceTransformersDocumentEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder), [`SentenceTransformersTextEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder), [`InMemoryEmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever), [`PromptBuilder`](https://docs.haystack.deepset.ai/docs/promptbuilder), [`OpenAIGenerator`](https://docs.haystack.deepset.ai/docs/openaigenerator)\n", - "- **Prerequisites**: You must have an [OpenAI API Key](https://platform.openai.com/api-keys).\n", - "- **Goal**: After completing this tutorial, you'll have learned the new prompt syntax and how to use PromptBuilder and OpenAIGenerator to build a generative question-answering pipeline with retrieval-augmentation.\n", - "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro)." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LFqHcXYPO-qZ" - }, - "source": [ - "## Overview\n", - "\n", - "This tutorial shows you how to create a generative question-answering pipeline using the retrieval-augmentation ([RAG](https://www.deepset.ai/blog/llms-retrieval-augmentation)) approach with Haystack 2.0. The process involves four main components: [SentenceTransformersTextEmbedder](https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder) for creating an embedding for the user query, [InMemoryBM25Retriever](https://docs.haystack.deepset.ai/docs/inmemorybm25retriever) for fetching relevant documents, [PromptBuilder](https://docs.haystack.deepset.ai/docs/promptbuilder) for creating a template prompt, and [OpenAIGenerator](https://docs.haystack.deepset.ai/docs/openaigenerator) for generating responses.\n", - "\n", - "For this tutorial, you'll use the Wikipedia pages of [Seven Wonders of the Ancient World](https://en.wikipedia.org/wiki/Wonders_of_the_World) as Documents, but you can replace them with any text you want.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QXjVlbPiO-qZ" - }, - "source": [ - "## Preparing the Colab Environment\n", - "\n", - "- [Enable GPU Runtime in Colab](https://docs.haystack.deepset.ai/docs/enabling-gpu-acceleration)\n", - "- [Set logging level to INFO](https://docs.haystack.deepset.ai/docs/logging)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Kww5B_vXO-qZ" - }, - "source": [ - "## Installing Haystack\n", - "\n", - "Install Haystack 2.0 and other required packages with `pip`:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "UQbU8GUfO-qZ", - "outputId": "c33579e9-5557-43bd-a3c5-63b8373770c7" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: haystack-ai in /usr/local/lib/python3.10/dist-packages 
(2.0.0b8)\n", - "Requirement already satisfied: boilerpy3 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.0.7)\n", - "Requirement already satisfied: haystack-bm25 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.0.2)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.1.3)\n", - "Requirement already satisfied: lazy-imports in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (0.3.1)\n", - "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (10.1.0)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.2.1)\n", - "Requirement already satisfied: openai>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.13.3)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.5.3)\n", - "Requirement already satisfied: posthog in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.5.0)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (6.0.1)\n", - "Requirement already satisfied: tenacity in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (8.2.3)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.66.2)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.10.0)\n", - "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (3.7.1)\n", - "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai>=1.1.0->haystack-ai) (1.7.0)\n", - "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (0.27.0)\n", - "Requirement already 
satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (2.6.3)\n", - "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (1.3.1)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from haystack-bm25->haystack-ai) (1.25.2)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->haystack-ai) (2.1.5)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2023.4)\n", - "Requirement already satisfied: requests<3.0,>=2.7 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (2.31.0)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (1.16.0)\n", - "Requirement already satisfied: monotonic>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (1.6)\n", - "Requirement already satisfied: backoff>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (2.2.1)\n", - "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai>=1.1.0->haystack-ai) (3.6)\n", - "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai>=1.1.0->haystack-ai) (1.2.0)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (2024.2.2)\n", - "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (1.0.4)\n", - "Requirement already satisfied: h11<0.15,>=0.13 in 
/usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (0.14.0)\n", - "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai>=1.1.0->haystack-ai) (0.6.0)\n", - "Requirement already satisfied: pydantic-core==2.16.3 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai>=1.1.0->haystack-ai) (2.16.3)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.7->posthog->haystack-ai) (3.3.2)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.7->posthog->haystack-ai) (2.0.7)\n", - "Requirement already satisfied: datasets>=2.6.1 in /usr/local/lib/python3.10/dist-packages (2.18.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (3.13.1)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (1.25.2)\n", - "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (14.0.2)\n", - "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (0.6)\n", - "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (0.3.8)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (1.5.3)\n", - "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (2.31.0)\n", - "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (4.66.2)\n", - "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) 
(3.4.1)\n", - "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (0.70.16)\n", - "Requirement already satisfied: fsspec[http]<=2024.2.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (2023.6.0)\n", - "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (3.9.3)\n", - "Requirement already satisfied: huggingface-hub>=0.19.4 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (0.20.3)\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (23.2)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (6.0.1)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (1.3.1)\n", - "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (23.2.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (1.4.1)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (6.0.5)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (1.9.4)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (4.0.3)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.19.4->datasets>=2.6.1) (4.10.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.6.1) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in 
/usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.6.1) (3.6)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.6.1) (2.0.7)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.6.1) (2024.2.2)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets>=2.6.1) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets>=2.6.1) (2023.4)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets>=2.6.1) (1.16.0)\n", - "Requirement already satisfied: sentence-transformers>=2.2.0 in /usr/local/lib/python3.10/dist-packages (2.5.1)\n", - "Requirement already satisfied: transformers<5.0.0,>=4.32.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers>=2.2.0) (4.38.2)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from sentence-transformers>=2.2.0) (4.66.2)\n", - "Requirement already satisfied: torch>=1.11.0 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers>=2.2.0) (2.1.0+cu121)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers>=2.2.0) (1.25.2)\n", - "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.10/dist-packages (from sentence-transformers>=2.2.0) (1.2.2)\n", - "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from sentence-transformers>=2.2.0) (1.11.4)\n", - "Requirement already satisfied: huggingface-hub>=0.15.1 in /usr/local/lib/python3.10/dist-packages (from sentence-transformers>=2.2.0) (0.20.3)\n", - "Requirement already satisfied: Pillow in 
/usr/local/lib/python3.10/dist-packages (from sentence-transformers>=2.2.0) (9.4.0)\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers>=2.2.0) (3.13.1)\n", - "Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers>=2.2.0) (2023.6.0)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers>=2.2.0) (2.31.0)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers>=2.2.0) (6.0.1)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers>=2.2.0) (4.10.0)\n", - "Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.15.1->sentence-transformers>=2.2.0) (23.2)\n", - "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers>=2.2.0) (1.12)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers>=2.2.0) (3.2.1)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers>=2.2.0) (3.1.3)\n", - "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.11.0->sentence-transformers>=2.2.0) (2.1.0)\n", - "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.32.0->sentence-transformers>=2.2.0) (2023.12.25)\n", - "Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from 
transformers<5.0.0,>=4.32.0->sentence-transformers>=2.2.0) (0.15.2)\n", - "Requirement already satisfied: safetensors>=0.4.1 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.32.0->sentence-transformers>=2.2.0) (0.4.2)\n", - "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers>=2.2.0) (1.3.2)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn->sentence-transformers>=2.2.0) (3.3.0)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.11.0->sentence-transformers>=2.2.0) (2.1.5)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers>=2.2.0) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers>=2.2.0) (3.6)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers>=2.2.0) (2.0.7)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->huggingface-hub>=0.15.1->sentence-transformers>=2.2.0) (2024.2.2)\n", - "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.11.0->sentence-transformers>=2.2.0) (1.3.0)\n" - ] - } - ], - "source": [ - "%%bash\n", - "\n", - "pip install haystack-ai\n", - "pip install \"datasets>=2.6.1\"\n", - "pip install \"sentence-transformers>=3.0.0\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Wl_jYERtO-qa" - }, - "source": [ - "### Enabling Telemetry\n", - "\n", - "Knowing you're using this tutorial helps us decide where to invest our efforts to build a 
better product but you can always opt out by commenting the following line. See [Telemetry](https://docs.haystack.deepset.ai/docs/enabling-telemetry) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "id": "A76B4S49O-qa" - }, - "outputs": [], - "source": [ - "from haystack.telemetry import tutorial_running\n", - "\n", - "tutorial_running(27)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_lvfew16O-qa" - }, - "source": [ - "## Fetching and Indexing Documents\n", - "\n", - "You'll start creating your question answering system by downloading the data and indexing the data with its embeddings to a DocumentStore. \n", - "\n", - "In this tutorial, you will take a simple approach to writing documents and their embeddings into the DocumentStore. For a full indexing pipeline with preprocessing, cleaning and splitting, check out our tutorial on [Preprocessing Different File Types](https://haystack.deepset.ai/tutorials/30_file_type_preprocessing_index_pipeline).\n", - "\n", - "\n", - "### Initializing the DocumentStore\n", - "\n", - "Initialize a DocumentStore to index your documents. A DocumentStore stores the Documents that the question answering system uses to find answers to your questions. In this tutorial, you'll be using the `InMemoryDocumentStore`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "CbVN-s5LO-qa" - }, - "outputs": [], - "source": [ - "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", - "\n", - "document_store = InMemoryDocumentStore()" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "2OvkPji9O-qX" + }, + "source": [ + "# Tutorial: Creating Your First QA Pipeline with Retrieval-Augmentation\n", + "\n", + "- **Level**: Beginner\n", + "- **Time to complete**: 10 minutes\n", + "- **Components Used**: [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore), [`SentenceTransformersDocumentEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder), [`SentenceTransformersTextEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder), [`InMemoryEmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever), [`PromptBuilder`](https://docs.haystack.deepset.ai/docs/promptbuilder), [`OpenAIGenerator`](https://docs.haystack.deepset.ai/docs/openaigenerator)\n", + "- **Prerequisites**: You must have an [OpenAI API Key](https://platform.openai.com/api-keys).\n", + "- **Goal**: After completing this tutorial, you'll have learned the new prompt syntax and how to use PromptBuilder and OpenAIGenerator to build a generative question-answering pipeline with retrieval-augmentation.\n", + "\n", + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LFqHcXYPO-qZ" + }, + "source": [ + "## Overview\n", + "\n", + "This tutorial shows you how to create a generative question-answering pipeline using the retrieval-augmentation ([RAG](https://www.deepset.ai/blog/llms-retrieval-augmentation)) approach with Haystack 2.0. The process involves four main components: [SentenceTransformersTextEmbedder](https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder) for creating an embedding for the user query, [InMemoryEmbeddingRetriever](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever) for fetching relevant documents, [PromptBuilder](https://docs.haystack.deepset.ai/docs/promptbuilder) for creating a template prompt, and [OpenAIGenerator](https://docs.haystack.deepset.ai/docs/openaigenerator) for generating responses.\n", + "\n", + "For this tutorial, you'll use the Wikipedia pages of [Seven Wonders of the Ancient World](https://en.wikipedia.org/wiki/Wonders_of_the_World) as Documents, but you can replace them with any text you want.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QXjVlbPiO-qZ" + }, + "source": [ + "## Preparing the Colab Environment\n", + "\n", + "- [Enable GPU Runtime in Colab](https://docs.haystack.deepset.ai/docs/enabling-gpu-acceleration)\n", + "- [Set logging level to INFO](https://docs.haystack.deepset.ai/docs/logging)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Kww5B_vXO-qZ" + }, + "source": [ + "## Installing Haystack\n", + "\n", + "Install Haystack and other required packages with `pip`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "metadata": { - "id": "yL8nuJdWO-qa" - }, - "source": [ - "> `InMemoryDocumentStore` is the simplest DocumentStore to get started with. 
It requires no external dependencies and it's a good option for smaller projects and debugging. But it doesn't scale up so well to larger Document collections, so it's not a good choice for production systems. To learn more about the different types of external databases that Haystack supports, see [DocumentStore Integrations](https://haystack.deepset.ai/integrations?type=Document+Store)." - ] + "id": "UQbU8GUfO-qZ", + "outputId": "c33579e9-5557-43bd-a3c5-63b8373770c7" + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "pip install haystack-ai\n", + "pip install \"datasets>=2.6.1\"\n", + "pip install \"sentence-transformers>=3.0.0\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Wl_jYERtO-qa" + }, + "source": [ + "### Enabling Telemetry\n", + "\n", + "Knowing you're using this tutorial helps us decide where to invest our efforts to build a better product but you can always opt out by commenting the following line. See [Telemetry](https://docs.haystack.deepset.ai/docs/enabling-telemetry) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "A76B4S49O-qa" + }, + "outputs": [], + "source": [ + "from haystack.telemetry import tutorial_running\n", + "\n", + "tutorial_running(27)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_lvfew16O-qa" + }, + "source": [ + "## Fetching and Indexing Documents\n", + "\n", + "You'll start creating your question answering system by downloading the data and indexing the data with its embeddings to a DocumentStore. \n", + "\n", + "In this tutorial, you will take a simple approach to writing documents and their embeddings into the DocumentStore. 
For a full indexing pipeline with preprocessing, cleaning and splitting, check out our tutorial on [Preprocessing Different File Types](https://haystack.deepset.ai/tutorials/30_file_type_preprocessing_index_pipeline).\n", + "\n", + "\n", + "### Initializing the DocumentStore\n", + "\n", + "Initialize a DocumentStore to index your documents. A DocumentStore stores the Documents that the question answering system uses to find answers to your questions. In this tutorial, you'll be using the `InMemoryDocumentStore`." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "CbVN-s5LO-qa" + }, + "outputs": [], + "source": [ + "from haystack.document_stores.in_memory import InMemoryDocumentStore\n", + "\n", + "document_store = InMemoryDocumentStore()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yL8nuJdWO-qa" + }, + "source": [ + "> `InMemoryDocumentStore` is the simplest DocumentStore to get started with. It requires no external dependencies and it's a good option for smaller projects and debugging. But it doesn't scale up so well to larger Document collections, so it's not a good choice for production systems. To learn more about the different types of external databases that Haystack supports, see [DocumentStore Integrations](https://haystack.deepset.ai/integrations?type=Document+Store)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XvLVaFHTO-qb" + }, + "source": [ + "The DocumentStore is now ready. Now it's time to fill it with some Documents." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HryYZP9ZO-qb" + }, + "source": [ + "### Fetch the Data\n", + "\n", + "You'll use the Wikipedia pages of [Seven Wonders of the Ancient World](https://en.wikipedia.org/wiki/Wonders_of_the_World) as Documents. We preprocessed the data and uploaded it to the Hugging Face Hub as a dataset: [Seven Wonders](https://huggingface.co/datasets/bilgeyucel/seven-wonders). 
Thus, you don't need to perform any additional cleaning or splitting.\n", + "\n", + "Fetch the data and convert it into Haystack Documents:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "INdC3WvLO-qb", + "outputId": "1af43d0f-2999-4de4-d152-b3cca9fb49e6" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "XvLVaFHTO-qb" - }, - "source": [ - "The DocumentStore is now ready. Now it's time to fill it with some Documents." - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from datasets import load_dataset\n", + "from haystack import Document\n", + "\n", + "dataset = load_dataset(\"bilgeyucel/seven-wonders\", split=\"train\")\n", + "docs = [Document(content=doc[\"content\"], meta=doc[\"meta\"]) for doc in dataset]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "czMjWwnxPA-3" + }, + "source": [ + "### Initialize a Document Embedder\n", + "\n", + "To store your data in the DocumentStore with embeddings, initialize a [SentenceTransformersDocumentEmbedder](https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder) with the model name and call `warm_up()` to download the embedding model.\n", + "\n", + "> If you'd like, you can use a different [Embedder](https://docs.haystack.deepset.ai/docs/embedders) for your documents." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "EUmAH9sEn3R7", + "outputId": "ee54b59b-4d4a-45eb-c1a9-0b7b248f1dd4" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "HryYZP9ZO-qb" - }, - "source": [ - "### Fetch the Data\n", - "\n", - "You'll use the Wikipedia pages of [Seven Wonders of the Ancient World](https://en.wikipedia.org/wiki/Wonders_of_the_World) as Documents. We preprocessed the data and uploaded to a Hugging Face Space: [Seven Wonders](https://huggingface.co/datasets/bilgeyucel/seven-wonders). Thus, you don't need to perform any additional cleaning or splitting.\n", - "\n", - "Fetch the data and convert it into Haystack Documents:" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n" + ] + } + ], + "source": [ + "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n", + "\n", + "doc_embedder = SentenceTransformersDocumentEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")\n", + "doc_embedder.warm_up()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9y4iJE_SrS4K" + }, + "source": [ + "### Write Documents to the DocumentStore\n", + "\n", + "Run the `doc_embedder` with the Documents. The embedder will create embeddings for each document and save these embeddings in Document object's `embedding` field. Then, you can write the Documents to the DocumentStore with `write_documents()` method." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 66, + "referenced_widgets": [ + "7d482188c12d4a7886f20a65d3402c59", + "2a3ec74419ae4a02ac0210db66133415", + "ddeff9a822404adbbc3cad97a939bc0c", + "36d341ab3a044709b5af2e8ab97559bc", + "88fc33e1ab78405e911b5eafa512c935", + "91e5d4b0ede848319ef0d3b558d57d19", + "d2428c21707d43f2b6f07bfafbace8bb", + "7fdb2c859e454e72888709a835f7591e", + "6b8334e071a3438397ba6435aac69f58", + "5f5cfa425cac4d37b2ea29e53b4ed900", + "3c59a82dac5c476b9a3e3132094e1702" + ] }, + "id": "ETpQKftLplqh", + "outputId": "b9c8658c-90c8-497c-e765-97487c0daf8e" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "INdC3WvLO-qb", - "outputId": "1af43d0f-2999-4de4-d152-b3cca9fb49e6" + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7d482188c12d4a7886f20a65d3402c59", + "version_major": 2, + "version_minor": 0 }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", - "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", - "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", - "You will be able to reuse this secret in all of your notebooks.\n", - "Please note that authentication is recommended but still optional to access public models or datasets.\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "from datasets import load_dataset\n", - "from haystack import Document\n", - "\n", - "dataset = load_dataset(\"bilgeyucel/seven-wonders\", split=\"train\")\n", - "docs = [Document(content=doc[\"content\"], meta=doc[\"meta\"]) for doc in dataset]" + "text/plain": [ + "Batches: 0%| | 
0/5 [00:00 If you'd like, you can use a different [Embedder](https://docs.haystack.deepset.ai/docs/embedders) for your documents." + "data": { + "text/plain": [ + "151" ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs_with_embeddings = doc_embedder.run(docs)\n", + "document_store.write_documents(docs_with_embeddings[\"documents\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IdojTxg6uubn" + }, + "source": [ + "## Building the RAG Pipeline\n", + "\n", + "The next step is to build a [Pipeline](https://docs.haystack.deepset.ai/docs/pipelines) to generate answers for the user query following the RAG approach. To create the pipeline, you first need to initialize each component, add them to your pipeline, and connect them." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0uyV6-u-u56P" + }, + "source": [ + "### Initialize a Text Embedder\n", + "\n", + "Initialize a text embedder to create an embedding for the user query. The created embedding will later be used by the Retriever to retrieve relevant documents from the DocumentStore.\n", + "\n", + "> ⚠️ Notice that you used `sentence-transformers/all-MiniLM-L6-v2` model to create embeddings for your documents before. This is why you need to use the same model to embed the user queries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "LyJY2yW628dl" + }, + "outputs": [], + "source": [ + "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", + "\n", + "text_embedder = SentenceTransformersTextEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0_cj-5m-O-qb" + }, + "source": [ + "### Initialize the Retriever\n", + "\n", + "Initialize a [InMemoryEmbeddingRetriever](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever) and make it use the InMemoryDocumentStore you initialized earlier in this tutorial. This Retriever will get the relevant documents to the query." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "-uo-6fjiO-qb" + }, + "outputs": [], + "source": [ + "from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever\n", + "\n", + "retriever = InMemoryEmbeddingRetriever(document_store)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6CEuQpB7O-qb" + }, + "source": [ + "### Define a Template Prompt\n", + "\n", + "Create a custom prompt for a generative question answering task using the RAG approach. The prompt should take in two parameters: `documents`, which are retrieved from a document store, and a `question` from the user. Use the Jinja2 looping syntax to combine the content of the retrieved documents in the prompt.\n", + "\n", + "Next, initialize a [PromptBuilder](https://docs.haystack.deepset.ai/docs/promptbuilder) instance with your prompt template. The PromptBuilder, when given the necessary values, will automatically fill in the variable values and generate a complete prompt. This approach allows for a more tailored and effective question-answering experience." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "ObahTh45FqOT" + }, + "outputs": [], + "source": [ + "from haystack.components.builders import PromptBuilder\n", + "\n", + "template = \"\"\"\n", + "Given the following information, answer the question.\n", + "\n", + "Context:\n", + "{% for document in documents %}\n", + " {{ document.content }}\n", + "{% endfor %}\n", + "\n", + "Question: {{question}}\n", + "Answer:\n", + "\"\"\"\n", + "\n", + "prompt_builder = PromptBuilder(template=template)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HR14lbfcFtXj" + }, + "source": [ + "### Initialize a Generator\n", + "\n", + "\n", + "Generators are the components that interact with large language models (LLMs). Now, set `OPENAI_API_KEY` environment variable and initialize a [OpenAIGenerator](https://docs.haystack.deepset.ai/docs/OpenAIGenerator) that can communicate with OpenAI GPT models. As you initialize, provide a model name:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "SavE_FAqfApo", + "outputId": "1afbf2e8-ae63-41ff-c37f-5123b2103356" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "EUmAH9sEn3R7", - "outputId": "ee54b59b-4d4a-45eb-c1a9-0b7b248f1dd4" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", - " return self.fget.__get__(instance, owner)()\n" - ] - } - ], - "source": [ - "from haystack.components.embedders import SentenceTransformersDocumentEmbedder\n", - "\n", - "doc_embedder = SentenceTransformersDocumentEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")\n", - "doc_embedder.warm_up()" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Enter OpenAI API key: ··········\n" + ] + } + ], + "source": [ + "import os\n", + "from getpass import getpass\n", + "from haystack.components.generators import OpenAIGenerator\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")\n", + "generator = OpenAIGenerator(model=\"gpt-4o-mini\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nenbo2SvycHd" + }, + "source": [ + "> You can replace `OpenAIGenerator` in your pipeline with another `Generator`. Check out the full list of generators [here](https://docs.haystack.deepset.ai/docs/generators)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1bfHwOQwycHe" + }, + "source": [ + "### Build the Pipeline\n", + "\n", + "To build a pipeline, add all components to your pipeline and connect them. Create connections from `text_embedder`'s \"embedding\" output to \"query_embedding\" input of `retriever`, from `retriever` to `prompt_builder` and from `prompt_builder` to `llm`. Explicitly connect the output of `retriever` with \"documents\" input of the `prompt_builder` to make the connection obvious as `prompt_builder` has two inputs (\"documents\" and \"question\").\n", + "\n", + "For more information on pipelines and creating connections, refer to [Creating Pipelines](https://docs.haystack.deepset.ai/docs/creating-pipelines) documentation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, + "id": "f6NFmpjEO-qb", + "outputId": "89fd1b48-5189-4401-9cf8-15f55c503676" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "9y4iJE_SrS4K" - }, - "source": [ - "### Write Documents to the DocumentStore\n", - "\n", - "Run the `doc_embedder` with the Documents. The embedder will create embeddings for each document and save these embeddings in Document object's `embedding` field. Then, you can write the Documents to the DocumentStore with `write_documents()` method." + "data": { + "image/jpeg": "", + "text/plain": [ + "" ] + }, + "metadata": {}, + "output_type": "display_data" }, { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 66, - "referenced_widgets": [ - "7d482188c12d4a7886f20a65d3402c59", - "2a3ec74419ae4a02ac0210db66133415", - "ddeff9a822404adbbc3cad97a939bc0c", - "36d341ab3a044709b5af2e8ab97559bc", - "88fc33e1ab78405e911b5eafa512c935", - "91e5d4b0ede848319ef0d3b558d57d19", - "d2428c21707d43f2b6f07bfafbace8bb", - "7fdb2c859e454e72888709a835f7591e", - "6b8334e071a3438397ba6435aac69f58", - "5f5cfa425cac4d37b2ea29e53b4ed900", - "3c59a82dac5c476b9a3e3132094e1702" - ] - }, - "id": "ETpQKftLplqh", - "outputId": "b9c8658c-90c8-497c-e765-97487c0daf8e" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7d482188c12d4a7886f20a65d3402c59", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Batches: 0%| | 0/5 [00:00 ⚠️ Notice that you used `sentence-transformers/all-MiniLM-L6-v2` model to create embeddings for your documents before. This is why you need to use the same model to embed the user queries." 
- ] + "name": "stdout", + "output_type": "stream", + "text": [ + "The Rhodes Statue was a 33-meter tall statue of the Greek sun-god Helios, featuring a structure built with iron tie bars covered in brass plates to form the skin. The head of the statue was described as having curly hair with spikes of bronze or silver flame radiating, similar to contemporary Rhodian coins.\n" + ] + } + ], + "source": [ + "question = \"What does Rhodes Statue look like?\"\n", + "\n", + "response = basic_rag_pipeline.run({\"text_embedder\": {\"text\": question}, \"prompt_builder\": {\"question\": question}})\n", + "\n", + "print(response[\"llm\"][\"replies\"][0])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IWQN-aoGO-qc" + }, + "source": [ + "Here are some other example questions to test:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "_OHUQ5xxO-qc" + }, + "outputs": [], + "source": [ + "examples = [\n", + " \"Where is Gardens of Babylon?\",\n", + " \"Why did people build Great Pyramid of Giza?\",\n", + " \"What does Rhodes Statue look like?\",\n", + " \"Why did people visit the Temple of Artemis?\",\n", + " \"What is the importance of Colossus of Rhodes?\",\n", + " \"What happened to the Tomb of Mausolus?\",\n", + " \"How did Colossus of Rhodes collapse?\",\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XueCK3y4O-qc" + }, + "source": [ + "## What's next\n", + "\n", + "🎉 Congratulations! 
You've learned how to create a generative QA system for your documents with the RAG approach.\n", + "\n", + "If you liked this tutorial, you may also enjoy:\n", + "- [Filtering Documents with Metadata](https://haystack.deepset.ai/tutorials/31_metadata_filtering)\n", + "- [Preprocessing Different File Types](https://haystack.deepset.ai/tutorials/30_file_type_preprocessing_index_pipeline)\n", + "- [Creating a Hybrid Retrieval Pipeline](https://haystack.deepset.ai/tutorials/33_hybrid_retrieval)\n", + "\n", + "To stay up to date on the latest Haystack developments, you can [subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates) and [join Haystack discord community](https://discord.gg/haystack).\n", + "\n", + "Thanks for reading!" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.6" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" + } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "1a820c06a7a049d8b6c9ff300284d06e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d0cfe5dacdfc431a91b4c4741123e2d0", + "placeholder": "​", + "style": "IPY_MODEL_e7f1e1a14bb740d18827dd78bbe7b2e3", + "value": "Batches: 100%" + } }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "id": "LyJY2yW628dl" - }, - "outputs": [], - "source": 
[ - "from haystack.components.embedders import SentenceTransformersTextEmbedder\n", - "\n", - "text_embedder = SentenceTransformersTextEmbedder(model=\"sentence-transformers/all-MiniLM-L6-v2\")" - ] + "2a3ec74419ae4a02ac0210db66133415": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_91e5d4b0ede848319ef0d3b558d57d19", + "placeholder": "​", + "style": "IPY_MODEL_d2428c21707d43f2b6f07bfafbace8bb", + "value": "Batches: 100%" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "0_cj-5m-O-qb" - }, - "source": [ - "### Initialize the Retriever\n", - "\n", - "Initialize a [InMemoryEmbeddingRetriever](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever) and make it use the InMemoryDocumentStore you initialized earlier in this tutorial. This Retriever will get the relevant documents to the query." 
- ] + "2bc341a780f7498ba9cd475468841bb5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": { - "id": "-uo-6fjiO-qb" - }, - "outputs": [], - "source": [ - "from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever\n", - "\n", - "retriever = InMemoryEmbeddingRetriever(document_store)" - ] + "36d341ab3a044709b5af2e8ab97559bc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5f5cfa425cac4d37b2ea29e53b4ed900", + "placeholder": "​", + "style": "IPY_MODEL_3c59a82dac5c476b9a3e3132094e1702", + "value": " 5/5 [00:01<00:00,  3.35it/s]" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "6CEuQpB7O-qb" - }, - "source": [ - "### Define a Template Prompt\n", - "\n", - "Create a custom prompt for a generative question answering task using the RAG approach. The prompt should take in two parameters: `documents`, which are retrieved from a document store, and a `question` from the user. 
Use the Jinja2 looping syntax to combine the content of the retrieved documents in the prompt.\n", - "\n", - "Next, initialize a [PromptBuilder](https://docs.haystack.deepset.ai/docs/promptbuilder) instance with your prompt template. The PromptBuilder, when given the necessary values, will automatically fill in the variable values and generate a complete prompt. This approach allows for a more tailored and effective question-answering experience." - ] + "39a68d9a5c274e2dafaa2d1f86eea768": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "id": "ObahTh45FqOT" - }, - "outputs": [], - "source": [ - "from haystack.components.builders import PromptBuilder\n", - "\n", - "template = \"\"\"\n", - "Given the following information, answer the question.\n", - "\n", - 
"Context:\n", - "{% for document in documents %}\n", - " {{ document.content }}\n", - "{% endfor %}\n", - "\n", - "Question: {{question}}\n", - "Answer:\n", - "\"\"\"\n", - "\n", - "prompt_builder = PromptBuilder(template=template)" - ] + "3c59a82dac5c476b9a3e3132094e1702": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "HR14lbfcFtXj" - }, - "source": [ - "### Initialize a Generator\n", - "\n", - "\n", - "Generators are the components that interact with large language models (LLMs). Now, set `OPENAI_API_KEY` environment variable and initialize a [OpenAIGenerator](https://docs.haystack.deepset.ai/docs/OpenAIGenerator) that can communicate with OpenAI GPT models. 
As you initialize, provide a model name:" - ] + "3fda06f905b445a488efdd2dd08c0939": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "SavE_FAqfApo", - "outputId": "1afbf2e8-ae63-41ff-c37f-5123b2103356" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Enter OpenAI API key: ··········\n" - ] - } + "4e6e97b6d54f4f80bb7e8b25aba8e616": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + 
"_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1a820c06a7a049d8b6c9ff300284d06e", + "IPY_MODEL_58ff4e0603a74978a134f63533859be5", + "IPY_MODEL_8bdb8bfae31d4f4cb6c3b0bf43120eed" ], - "source": [ - "import os\n", - "from getpass import getpass\n", - "from haystack.components.generators import OpenAIGenerator\n", - "\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")\n", - "generator = OpenAIGenerator(model=\"gpt-4o-mini\")" - ] + "layout": "IPY_MODEL_39a68d9a5c274e2dafaa2d1f86eea768" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "nenbo2SvycHd" - }, - "source": [ - "> You can replace `OpenAIGenerator` in your pipeline with another `Generator`. Check out the full list of generators [here](https://docs.haystack.deepset.ai/docs/generators)." - ] + "58ff4e0603a74978a134f63533859be5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3fda06f905b445a488efdd2dd08c0939", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2bc341a780f7498ba9cd475468841bb5", + "value": 1 + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "1bfHwOQwycHe" - }, - "source": [ - "### Build the Pipeline\n", - "\n", - "To build a pipeline, add all components to your pipeline and connect them. 
Create connections from `text_embedder`'s \"embedding\" output to \"query_embedding\" input of `retriever`, from `retriever` to `prompt_builder` and from `prompt_builder` to `llm`. Explicitly connect the output of `retriever` with \"documents\" input of the `prompt_builder` to make the connection obvious as `prompt_builder` has two inputs (\"documents\" and \"question\").\n", - "\n", - "For more information on pipelines and creating connections, refer to [Creating Pipelines](https://docs.haystack.deepset.ai/docs/creating-pipelines) documentation." - ] + "5f5cfa425cac4d37b2ea29e53b4ed900": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "f6NFmpjEO-qb", - "outputId": 
"89fd1b48-5189-4401-9cf8-15f55c503676" - }, - "outputs": [ - { - "data": { - "image/jpeg": "", - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } + "6b8334e071a3438397ba6435aac69f58": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "7d482188c12d4a7886f20a65d3402c59": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2a3ec74419ae4a02ac0210db66133415", + "IPY_MODEL_ddeff9a822404adbbc3cad97a939bc0c", + "IPY_MODEL_36d341ab3a044709b5af2e8ab97559bc" ], - "source": [ - "from haystack import Pipeline\n", - "\n", - "basic_rag_pipeline = Pipeline()\n", - "# Add components to your pipeline\n", - "basic_rag_pipeline.add_component(\"text_embedder\", text_embedder)\n", - "basic_rag_pipeline.add_component(\"retriever\", retriever)\n", - "basic_rag_pipeline.add_component(\"prompt_builder\", prompt_builder)\n", - "basic_rag_pipeline.add_component(\"llm\", generator)\n", - "\n", - "# Now, connect the components to each other\n", - "basic_rag_pipeline.connect(\"text_embedder.embedding\", \"retriever.query_embedding\")\n", - 
"basic_rag_pipeline.connect(\"retriever\", \"prompt_builder.documents\")\n", - "basic_rag_pipeline.connect(\"prompt_builder\", \"llm\")" - ] + "layout": "IPY_MODEL_88fc33e1ab78405e911b5eafa512c935" + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "6NqyLhx7O-qc" - }, - "source": [ - "That's it! Your RAG pipeline is ready to generate answers to questions!" - ] + "7fdb2c859e454e72888709a835f7591e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "markdown", - "metadata": { - "id": "DBAyF5tVO-qc" - }, - "source": [ - "## Asking a Question\n", - "\n", - "When asking a question, use the `run()` method of the pipeline. Make sure to provide the question to both the `text_embedder` and the `prompt_builder`. 
This ensures that the `{{question}}` variable in the template prompt gets replaced with your specific question." - ] + "88fc33e1ab78405e911b5eafa512c935": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 86, - "referenced_widgets": [ - "4e6e97b6d54f4f80bb7e8b25aba8e616", - "1a820c06a7a049d8b6c9ff300284d06e", - "58ff4e0603a74978a134f63533859be5", - "8bdb8bfae31d4f4cb6c3b0bf43120eed", - "39a68d9a5c274e2dafaa2d1f86eea768", - "d0cfe5dacdfc431a91b4c4741123e2d0", - "e7f1e1a14bb740d18827dd78bbe7b2e3", - "3fda06f905b445a488efdd2dd08c0939", - "2bc341a780f7498ba9cd475468841bb5", - "d7218475e23b420a8c03d00ca4ab8718", - "a694abaf765f4d1b82fa0138e59c6793" - ] - }, - "id": "Vnt283M5O-qc", - "outputId": 
"d2843a73-3ad5-4daa-8d1e-a58de7aa2bb0" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4e6e97b6d54f4f80bb7e8b25aba8e616", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Batches: 0%| | 0/1 [00:00 This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro)..\n", - "\n", - "## Overview\n", - "This tutorial demonstrates how to use Haystack 2.0's advanced [looping pipelines](https://docs.haystack.deepset.ai/docs/pipelines#loops) with LLMs for more dynamic and flexible data processing. You'll learn how to extract structured data from unstructured data using an LLM, and to validate the generated output against a predefined schema.\n", - "\n", - "This tutorial uses `gpt-4o-mini` to change unstructured passages into JSON outputs that follow the [Pydantic](https://github.com/pydantic/pydantic) schema. It uses a custom OutputValidator component to validate the JSON and loop back to make corrections, if necessary." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jmiAHh1oGsKI" - }, - "source": [ - "## Preparing the Colab Environment\n", - "\n", - "Enable the debug mode of logging:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Vor9IHuNRvEh" - }, - "outputs": [], - "source": [ - "import logging\n", - "\n", - "logging.basicConfig()\n", - "logging.getLogger(\"canals.pipeline.pipeline\").setLevel(logging.DEBUG)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ljbWiyJkKiPw" - }, - "source": [ - "## Installing Dependencies\n", - "Install Haystack and [colorama](https://pypi.org/project/colorama/) with pip:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kcc1AlLQd_jI", - "outputId": "efc4bbab-a9fe-46ee-d8af-9d86edacaf04" - }, - "outputs": [], - "source": [ - "%%bash\n", - "\n", - "pip install haystack-ai\n", - "pip install colorama" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nTA5fdvCLMKD" - }, - "source": [ - "### Enabling Telemetry\n", - "\n", - "Enable telemetry to let us know you're using this tutorial. (You can always opt out by commenting out this line). For details, see [Telemetry](https://docs.haystack.deepset.ai/docs/telemetry)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Apay3QSQLKdM" - }, - "outputs": [], - "source": [ - "from haystack.telemetry import tutorial_running\n", - "\n", - "tutorial_running(28)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Cmjfa8CiCeFl" - }, - "source": [ - "## Defining a Schema to Parse the JSON Object\n", - "\n", - "Define a simple JSON schema for the data you want to extract from a text passsage using the LLM. As the first step, define two [Pydantic models](https://docs.pydantic.dev/1.10/usage/models/), `City` and `CitiesData`, with suitable fields and types." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xwKrDOOGdaAz" - }, - "outputs": [], - "source": [ - "from typing import List\n", - "from pydantic import BaseModel\n", - "\n", - "\n", - "class City(BaseModel):\n", - " name: str\n", - " country: str\n", - " population: int\n", - "\n", - "\n", - "class CitiesData(BaseModel):\n", - " cities: List[City]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zv-6-l_PCeFl" - }, - "source": [ - "> You can change these models according to the format you wish to extract from the text." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ouk1mAOUCeFl" - }, - "source": [ - "Then, generate a JSON schema from Pydantic models using `schema_json()`. You will later on use this schema in the prompt to instruct the LLM.\n", - "\n", - "To learn more about the JSON schemas, visit [Pydantic Schema](https://docs.pydantic.dev/1.10/usage/schema/). " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8Lg9_72jCeFl" - }, - "outputs": [], - "source": [ - "json_schema = CitiesData.schema_json(indent=2)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KvNhg0bP7kfg" - }, - "source": [ - "## Creating a Custom Component: OutputValidator\n", - "\n", - "`OutputValidator` is a custom component that validates if the JSON object the LLM generates complies with the provided [Pydantic model](https://docs.pydantic.dev/1.10/usage/models/). If it doesn't, OutputValidator returns an error message along with the incorrect JSON object to get it fixed in the next loop.\n", - "\n", - "For more details about custom components, see [Creating Custom Components](https://docs.haystack.deepset.ai/docs/custom-components)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yr6D8RN2d7Vy" - }, - "outputs": [], - "source": [ - "import json\n", - "import random\n", - "import pydantic\n", - "from pydantic import ValidationError\n", - "from typing import Optional, List\n", - "from colorama import Fore\n", - "from haystack import component\n", - "\n", - "# Define the component input parameters\n", - "@component\n", - "class OutputValidator:\n", - " def __init__(self, pydantic_model: pydantic.BaseModel):\n", - " self.pydantic_model = pydantic_model\n", - " self.iteration_counter = 0\n", - "\n", - " # Define the component output\n", - " @component.output_types(valid_replies=List[str], invalid_replies=Optional[List[str]], error_message=Optional[str])\n", - " def run(self, replies: List[str]):\n", - "\n", - " self.iteration_counter += 1\n", - "\n", - " ## Try to parse the LLM's reply ##\n", - " # If the LLM's reply is a valid object, return `\"valid_replies\"`\n", - " try:\n", - " output_dict = json.loads(replies[0])\n", - " self.pydantic_model.parse_obj(output_dict)\n", - " print(\n", - " Fore.GREEN\n", - " + f\"OutputValidator at Iteration {self.iteration_counter}: Valid JSON from LLM - No need for looping: {replies[0]}\"\n", - " )\n", - " return {\"valid_replies\": replies}\n", - "\n", - " # If the LLM's reply is corrupted or not valid, return \"invalid_replies\" and the \"error_message\" for LLM to try again\n", - " except (ValueError, ValidationError) as e:\n", - " print(\n", - " Fore.RED\n", - " + f\"OutputValidator at Iteration {self.iteration_counter}: Invalid JSON from LLM - Let's try again.\\n\"\n", - " f\"Output from LLM:\\n {replies[0]} \\n\"\n", - " f\"Error from OutputValidator: {e}\"\n", - " )\n", - " return {\"invalid_replies\": replies, \"error_message\": str(e)}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vQ_TfSBkCeFm" - }, - "source": [ - "Then, create an OutputValidator instance with `CitiesData` that you have 
created before." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "bhPCLCBCCeFm" - }, - "outputs": [], - "source": [ - "output_validator = OutputValidator(pydantic_model=CitiesData)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xcIWKjW4k42r" - }, - "source": [ - "## Creating the Prompt\n", - "\n", - "Write instructions for the LLM for converting a passage into a JSON format. Ensure the instructions explain how to identify and correct errors if the JSON doesn't match the required schema. Once you create the prompt, initialize PromptBuilder to use it. \n", - "\n", - "For information about Jinja2 template and PromptBuilder, see [PromptBuilder](https://docs.haystack.deepset.ai/docs/promptbuilder)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ohPpNALjdVKt" - }, - "outputs": [], - "source": [ - "from haystack.components.builders import PromptBuilder\n", - "\n", - "prompt_template = \"\"\"\n", - "Create a JSON object from the information present in this passage: {{passage}}.\n", - "Only use information that is present in the passage. Follow this JSON schema, but only return the actual instances without any additional schema definition:\n", - "{{schema}}\n", - "Make sure your response is a dict and not a list.\n", - "{% if invalid_replies and error_message %}\n", - " You already created the following output in a previous attempt: {{invalid_replies}}\n", - " However, this doesn't comply with the format requirements from above and triggered this Python exception: {{error_message}}\n", - " Correct the output and try again. 
Just return the corrected output without any extra explanations.\n", - "{% endif %}\n", - "\"\"\"\n", - "prompt_builder = PromptBuilder(template=prompt_template)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KM9-Zq2FL7Nn" - }, - "source": [ - "## Initalizing the Generator\n", - "\n", - "[OpenAIGenerator](https://docs.haystack.deepset.ai/docs/openaigenerator) generates\n", - "text using OpenAI's `gpt-4o-mini` model by default. Set the `OPENAI_API_KEY` variable and provide a model name to the Generator." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Z4cQteIgunUR" - }, - "outputs": [], - "source": [ - "import os\n", - "from getpass import getpass\n", - "\n", - "from haystack.components.generators import OpenAIGenerator\n", - "\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")\n", - "generator = OpenAIGenerator()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zbotIOgXHkC5" - }, - "source": [ - "## Building the Pipeline\n", - "\n", - "Add all components to your pipeline and connect them. Add connections from `output_validator` back to the `prompt_builder` for cases where the produced JSON doesn't comply with the JSON schema. Set `max_runs_per_component` to avoid infinite looping." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eFglN9YEv-1W" - }, - "outputs": [], - "source": [ - "from haystack import Pipeline\n", - "\n", - "pipeline = Pipeline(max_runs_per_component=5)\n", - "\n", - "# Add components to your pipeline\n", - "pipeline.add_component(instance=prompt_builder, name=\"prompt_builder\")\n", - "pipeline.add_component(instance=generator, name=\"llm\")\n", - "pipeline.add_component(instance=output_validator, name=\"output_validator\")\n", - "\n", - "# Now, connect the components to each other\n", - "pipeline.connect(\"prompt_builder\", \"llm\")\n", - "pipeline.connect(\"llm\", \"output_validator\")\n", - "# If a component has more than one output or input, explicitly specify the connections:\n", - "pipeline.connect(\"output_validator.invalid_replies\", \"prompt_builder.invalid_replies\")\n", - "pipeline.connect(\"output_validator.error_message\", \"prompt_builder.error_message\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-UKW5wtIIT7w" - }, - "source": [ - "### Visualize the Pipeline\n", - "\n", - "Draw the pipeline with the [`draw()`](https://docs.haystack.deepset.ai/docs/drawing-pipeline-graphs) method to confirm the connections are correct. You can find the diagram in the Files section of this Colab." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RZJg6YHId300" - }, - "outputs": [], - "source": [ - "pipeline.draw(\"auto-correct-pipeline.png\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kV_kexTjImpo" - }, - "source": [ - "## Testing the Pipeline\n", - "\n", - "Run the pipeline with an example passage that you want to convert into a JSON format and the `json_schema` you have created for `CitiesData`. 
For the given example passage, the generated JSON object should be like:\n", - "```json\n", - "{\n", - " \"cities\": [\n", - " {\n", - " \"name\": \"Berlin\",\n", - " \"country\": \"Germany\",\n", - " \"population\": 3850809\n", - " },\n", - " {\n", - " \"name\": \"Paris\",\n", - " \"country\": \"France\",\n", - " \"population\": 2161000\n", - " },\n", - " {\n", - " \"name\": \"Lisbon\",\n", - " \"country\": \"Portugal\",\n", - " \"population\": 504718\n", - " }\n", - " ]\n", - "}\n", - "```\n", - "The output of the LLM should be compliant with the `json_schema`. If the LLM doesn't generate the correct JSON object, it will loop back and try again." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "yIoMedb6eKia", - "outputId": "4a9ef924-cf26-4908-d83f-b0bc0dc03b54" - }, - "outputs": [], - "source": [ - "passage = \"Berlin is the capital of Germany. It has a population of 3,850,809. Paris, France's capital, has 2.161 million residents. Lisbon is the capital and the largest city of Portugal with the population of 504,718.\"\n", - "result = pipeline.run({\"prompt_builder\": {\"passage\": passage, \"schema\": json_schema}})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WWxmPgADS_Fa" - }, - "source": [ - "> If you encounter `PipelineMaxLoops: Maximum loops count (5) exceeded for component 'prompt_builder'.` error, consider increasing the maximum loop count or simply rerun the pipeline." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eWPawSjgSJAM" - }, - "source": [ - "### Print the Correct JSON\n", - "If you didn't get any error, you can now print the corrected JSON." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "BVO47gXQQnDC", - "outputId": "460a10d4-a69a-49cd-bbb2-fc4980907299" - }, - "outputs": [], - "source": [ - "valid_reply = result[\"output_validator\"][\"valid_replies\"][0]\n", - "valid_json = json.loads(valid_reply)\n", - "print(valid_json)" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "AVBtOVlNJ51C" + }, + "source": [ + "# Tutorial: Generating Structured Output with Loop-Based Auto-Correction\n", + "\n", + "- **Level**: Intermediate\n", + "- **Time to complete**: 15 minutes\n", + "- **Prerequisites**: You must have an API key from an active OpenAI account as this tutorial is using the gpt-4o-mini model by OpenAI.\n", + "- **Components Used**: `PromptBuilder`, `OpenAIGenerator`, `OutputValidator` (Custom component)\n", + "- **Goal**: After completing this tutorial, you will have built a system that extracts unstructured data, puts it in a JSON schema, and automatically corrects errors in the JSON output from a large language model (LLM) to make sure it follows the specified structure.\n", + "\n", + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro).\n", + "\n", + "## Overview\n", + "This tutorial demonstrates how to use Haystack 2.0's advanced [looping pipelines](https://docs.haystack.deepset.ai/docs/pipelines#loops) with LLMs for more dynamic and flexible data processing. 
You'll learn how to extract structured data from unstructured data using an LLM, and to validate the generated output against a predefined schema.\n", + "\n", + "This tutorial uses `gpt-4o-mini` to change unstructured passages into JSON outputs that follow the [Pydantic](https://github.com/pydantic/pydantic) schema. It uses a custom OutputValidator component to validate the JSON and loop back to make corrections, if necessary." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jmiAHh1oGsKI" + }, + "source": [ + "## Preparing the Colab Environment\n", + "\n", + "Enable the debug mode of logging:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "Vor9IHuNRvEh" + }, + "outputs": [], + "source": [ + "import logging\n", + "\n", + "logging.basicConfig()\n", + "logging.getLogger(\"canals.pipeline.pipeline\").setLevel(logging.DEBUG)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ljbWiyJkKiPw" + }, + "source": [ + "## Installing Dependencies\n", + "Install Haystack and [colorama](https://pypi.org/project/colorama/) with pip:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "kcc1AlLQd_jI", + "outputId": "efc4bbab-a9fe-46ee-d8af-9d86edacaf04" + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "pip install -U haystack-ai\n", + "pip install colorama" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nTA5fdvCLMKD" + }, + "source": [ + "### Enabling Telemetry\n", + "\n", + "Enable telemetry to let us know you're using this tutorial. (You can always opt out by commenting out this line). For details, see [Telemetry](https://docs.haystack.deepset.ai/docs/telemetry)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Cmjfa8CiCeFl" + }, + "source": [ + "## Defining a Schema to Parse the JSON Object\n", + "\n", + "Define a simple JSON schema for the data you want to extract from a text passage using the LLM. As the first step, define two [Pydantic models](https://docs.pydantic.dev/1.10/usage/models/), `City` and `CitiesData`, with suitable fields and types." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "xwKrDOOGdaAz" + }, + "outputs": [], + "source": [ + "from typing import List\n", + "from pydantic import BaseModel\n", + "\n", + "\n", + "class City(BaseModel):\n", + " person: List[str]\n", + " country: List[str]\n", + " city: List[str]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zv-6-l_PCeFl" + }, + "source": [ + "> You can change these models according to the format you wish to extract from the text." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ouk1mAOUCeFl" + }, + "source": [ + "Then, generate a JSON schema from Pydantic models using `schema_json()`. You will later on use this schema in the prompt to instruct the LLM.\n", + "\n", + "To learn more about the JSON schemas, visit [Pydantic Schema](https://docs.pydantic.dev/1.10/usage/schema/). " + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "8Lg9_72jCeFl" + }, + "outputs": [ {
- ] + "ename": "ValidationError", + "evalue": "3 validation errors for City\nperson\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.6/v/missing\ncountry\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.6/v/missing\ncity\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.6/v/missing", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m json_schema \u001b[38;5;241m=\u001b[39m \u001b[43mCity\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mmodel_dump_json(indent\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n", + "File \u001b[0;32m~/Desktop/deepset/haystack-tutorials/.venv/lib/python3.9/site-packages/pydantic/main.py:171\u001b[0m, in \u001b[0;36mBaseModel.__init__\u001b[0;34m(self, **data)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;66;03m# `__tracebackhide__` tells pytest and some other tools to omit this function from tracebacks\u001b[39;00m\n\u001b[1;32m 170\u001b[0m __tracebackhide__ \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m--> 171\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m__pydantic_validator__\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mvalidate_python\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mself_instance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\n", + "\u001b[0;31mValidationError\u001b[0m: 3 validation errors for City\nperson\n Field required 
[type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.6/v/missing\ncountry\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.6/v/missing\ncity\n Field required [type=missing, input_value={}, input_type=dict]\n For further information visit https://errors.pydantic.dev/2.6/v/missing" + ] } - ], - "metadata": { - "accelerator": "GPU", + ], + "source": [ + "json_schema = City().model_dump_json(indent=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KvNhg0bP7kfg" + }, + "source": [ + "## Creating a Custom Component: OutputValidator\n", + "\n", + "`OutputValidator` is a custom component that validates if the JSON object the LLM generates complies with the provided [Pydantic model](https://docs.pydantic.dev/1.10/usage/models/). If it doesn't, OutputValidator returns an error message along with the incorrect JSON object to get it fixed in the next loop.\n", + "\n", + "For more details about custom components, see [Creating Custom Components](https://docs.haystack.deepset.ai/docs/custom-components)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yr6D8RN2d7Vy" + }, + "outputs": [], + "source": [ + "import json\n", + "import random\n", + "import pydantic\n", + "from pydantic import ValidationError\n", + "from typing import Optional, List\n", + "from colorama import Fore\n", + "from haystack import component\n", + "\n", + "# Define the component input parameters\n", + "@component\n", + "class OutputValidator:\n", + " def __init__(self, pydantic_model: pydantic.BaseModel):\n", + " self.pydantic_model = pydantic_model\n", + " self.iteration_counter = 0\n", + "\n", + " # Define the component output\n", + " @component.output_types(valid_replies=List[str], invalid_replies=Optional[List[str]], error_message=Optional[str])\n", + " def run(self, replies: List[str]):\n", + "\n", + " self.iteration_counter += 1\n", + "\n", + " ## Try to parse the LLM's reply ##\n", + " # If the LLM's reply is a valid object, return `\"valid_replies\"`\n", + " try:\n", + " output_dict = json.loads(replies[0])\n", + " self.pydantic_model.parse_obj(output_dict)\n", + " print(\n", + " Fore.GREEN\n", + " + f\"OutputValidator at Iteration {self.iteration_counter}: Valid JSON from LLM - No need for looping: {replies[0]}\"\n", + " )\n", + " return {\"valid_replies\": replies}\n", + "\n", + " # If the LLM's reply is corrupted or not valid, return \"invalid_replies\" and the \"error_message\" for LLM to try again\n", + " except (ValueError, ValidationError) as e:\n", + " print(\n", + " Fore.RED\n", + " + f\"OutputValidator at Iteration {self.iteration_counter}: Invalid JSON from LLM - Let's try again.\\n\"\n", + " f\"Output from LLM:\\n {replies[0]} \\n\"\n", + " f\"Error from OutputValidator: {e}\"\n", + " )\n", + " return {\"invalid_replies\": replies, \"error_message\": str(e)}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vQ_TfSBkCeFm" + }, + "source": [ + "Then, create an OutputValidator instance with `CitiesData` that you have 
created before." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bhPCLCBCCeFm" + }, + "outputs": [], + "source": [ + "output_validator = OutputValidator(pydantic_model=City)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xcIWKjW4k42r" + }, + "source": [ + "## Creating the Prompt\n", + "\n", + "Write instructions for the LLM for converting a passage into a JSON format. Ensure the instructions explain how to identify and correct errors if the JSON doesn't match the required schema. Once you create the prompt, initialize PromptBuilder to use it. \n", + "\n", + "For information about Jinja2 template and PromptBuilder, see [PromptBuilder](https://docs.haystack.deepset.ai/docs/promptbuilder)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ohPpNALjdVKt" + }, + "outputs": [], + "source": [ + "from haystack.components.builders import PromptBuilder\n", + "\n", + "prompt_template = \"\"\"\n", + "Create a JSON object from the information present in this passage: {{passage}}.\n", + "Only use information that is present in the passage. Follow this JSON schema, but only return the actual instances without any additional schema definition:\n", + "{{schema}}\n", + "Make sure your response is a dict and not a list.\n", + "{% if invalid_replies and error_message %}\n", + " You already created the following output in a previous attempt: {{invalid_replies}}\n", + " However, this doesn't comply with the format requirements from above and triggered this Python exception: {{error_message}}\n", + " Correct the output and try again. 
Just return the corrected output without any extra explanations.\n", + "{% endif %}\n", + "\"\"\"\n", + "prompt_builder = PromptBuilder(template=prompt_template)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KM9-Zq2FL7Nn" + }, + "source": [ + "## Initializing the Generator\n", + "\n", + "[OpenAIGenerator](https://docs.haystack.deepset.ai/docs/openaigenerator) generates\n", + "text using OpenAI's `gpt-4o-mini` model by default. Set the `OPENAI_API_KEY` variable and provide a model name to the Generator." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Z4cQteIgunUR" + }, + "outputs": [], + "source": [ + "import os\n", + "from getpass import getpass\n", + "\n", + "from haystack.components.generators import OpenAIGenerator\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")\n", + "generator = OpenAIGenerator()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zbotIOgXHkC5" + }, + "source": [ + "## Building the Pipeline\n", + "\n", + "Add all components to your pipeline and connect them. Add connections from `output_validator` back to the `prompt_builder` for cases where the produced JSON doesn't comply with the JSON schema. Set `max_runs_per_component` to avoid infinite looping."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eFglN9YEv-1W" + }, + "outputs": [], + "source": [ + "from haystack import Pipeline\n", + "\n", + "pipeline = Pipeline(max_runs_per_component=5)\n", + "\n", + "# Add components to your pipeline\n", + "pipeline.add_component(instance=prompt_builder, name=\"prompt_builder\")\n", + "pipeline.add_component(instance=generator, name=\"llm\")\n", + "pipeline.add_component(instance=output_validator, name=\"output_validator\")\n", + "\n", + "# Now, connect the components to each other\n", + "pipeline.connect(\"prompt_builder\", \"llm\")\n", + "pipeline.connect(\"llm\", \"output_validator\")\n", + "# If a component has more than one output or input, explicitly specify the connections:\n", + "pipeline.connect(\"output_validator.invalid_replies\", \"prompt_builder.invalid_replies\")\n", + "pipeline.connect(\"output_validator.error_message\", \"prompt_builder.error_message\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-UKW5wtIIT7w" + }, + "source": [ + "### Visualize the Pipeline\n", + "\n", + "Draw the pipeline with the [`draw()`](https://docs.haystack.deepset.ai/docs/drawing-pipeline-graphs) method to confirm the connections are correct. You can find the diagram in the Files section of this Colab." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RZJg6YHId300" + }, + "outputs": [], + "source": [ + "pipeline.draw(\"auto-correct-pipeline.png\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kV_kexTjImpo" + }, + "source": [ + "## Testing the Pipeline\n", + "\n", + "Run the pipeline with an example passage that you want to convert into a JSON format and the `json_schema` you have created for `CitiesData`. 
For the given example passage, the generated JSON object should be like:\n", + "```json\n", + "{\n", + " \"cities\": [\n", + " {\n", + " \"name\": \"Berlin\",\n", + " \"country\": \"Germany\",\n", + " \"population\": 3850809\n", + " },\n", + " {\n", + " \"name\": \"Paris\",\n", + " \"country\": \"France\",\n", + " \"population\": 2161000\n", + " },\n", + " {\n", + " \"name\": \"Lisbon\",\n", + " \"country\": \"Portugal\",\n", + " \"population\": 504718\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "The output of the LLM should be compliant with the `json_schema`. If the LLM doesn't generate the correct JSON object, it will loop back and try again." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "gpuType": "T4", - "provenance": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" + "id": "yIoMedb6eKia", + "outputId": "4a9ef924-cf26-4908-d83f-b0bc0dc03b54" + }, + "outputs": [], + "source": [ + "passage = \"Berlin is the capital of Germany. It has a population of 3,850,809. Paris, France's capital, has 2.161 million residents. Lisbon is the capital and the largest city of Portugal with the population of 504,718.\"\n", + "result = pipeline.run({\"prompt_builder\": {\"passage\": passage, \"schema\": json_schema}})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WWxmPgADS_Fa" + }, + "source": [ + "> If you encounter `PipelineMaxLoops: Maximum loops count (5) exceeded for component 'prompt_builder'.` error, consider increasing the maximum loop count or simply rerun the pipeline." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eWPawSjgSJAM" + }, + "source": [ + "### Print the Correct JSON\n", + "If you didn't get any error, you can now print the corrected JSON." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - "language_info": { - "name": "python" - } + "id": "BVO47gXQQnDC", + "outputId": "460a10d4-a69a-49cd-bbb2-fc4980907299" + }, + "outputs": [], + "source": [ + "valid_reply = result[\"output_validator\"][\"valid_replies\"][0]\n", + "valid_json = json.loads(valid_reply)\n", + "print(valid_json)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Egz_4h2vI_QL" + }, + "source": [ + "## What's next\n", + "\n", + "🎉 Congratulations! You've built a system that generates structured JSON out of unstructured text passages, and auto-corrects it by using the looping functionality of Haystack pipelines.\n", + "\n", + "To stay up to date on the latest Haystack developments, you can [subscribe to our newsletter](https://landing.deepset.ai/haystack-community-updates) and [join Haystack discord community](https://discord.gg/haystack).\n", + "\n", + "Thanks for reading!" 
+ ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/tutorials/29_Serializing_Pipelines.ipynb b/tutorials/29_Serializing_Pipelines.ipynb index 9e6d3493..a95ebe62 100644 --- a/tutorials/29_Serializing_Pipelines.ipynb +++ b/tutorials/29_Serializing_Pipelines.ipynb @@ -14,7 +14,7 @@ "- **Prerequisites**: None\n", "- **Goal**: After completing this tutorial, you'll understand how to serialize and deserialize between YAML and Python code.\n", "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro)." + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro)." 
] }, { @@ -52,7 +52,7 @@ "source": [ "## Installing Haystack\n", "\n", - "Install Haystack 2.0 with `pip`:" + "Install Haystack with `pip`:" ] }, { @@ -65,48 +65,7 @@ "id": "CagzMFdkeBBp", "outputId": "e304450a-24e3-4ef8-e642-1fbb573e7d55" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: haystack-ai in /usr/local/lib/python3.10/dist-packages (2.0.0b5)\n", - "Requirement already satisfied: boilerpy3 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.0.7)\n", - "Requirement already satisfied: haystack-bm25 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.0.2)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.1.3)\n", - "Requirement already satisfied: lazy-imports in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (0.3.1)\n", - "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (10.1.0)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.2.1)\n", - "Requirement already satisfied: openai>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.10.0)\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (1.5.3)\n", - "Requirement already satisfied: posthog in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (3.3.3)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (6.0.1)\n", - "Requirement already satisfied: tenacity in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (8.2.3)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.66.1)\n", - "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from haystack-ai) (4.9.0)\n", - "Requirement already 
satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (3.7.1)\n", - "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai>=1.1.0->haystack-ai) (1.7.0)\n", - "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (0.26.0)\n", - "Requirement already satisfied: pydantic<3,>=1.9.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (1.10.14)\n", - "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai) (1.3.0)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from haystack-bm25->haystack-ai) (1.23.5)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->haystack-ai) (2.1.4)\n", - "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai) (2023.3.post1)\n", - "Requirement already satisfied: requests<3.0,>=2.7 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (2.31.0)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (1.16.0)\n", - "Requirement already satisfied: monotonic>=1.5 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (1.6)\n", - "Requirement already satisfied: backoff>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from posthog->haystack-ai) (2.2.1)\n", - "Requirement already satisfied: idna>=2.8 in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai>=1.1.0->haystack-ai) (3.6)\n", - "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from 
anyio<5,>=3.5.0->openai>=1.1.0->haystack-ai) (1.2.0)\n", - "Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (2023.11.17)\n", - "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (1.0.2)\n", - "Requirement already satisfied: h11<0.15,>=0.13 in /usr/local/lib/python3.10/dist-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (0.14.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.7->posthog->haystack-ai) (3.3.2)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0,>=2.7->posthog->haystack-ai) (2.0.7)\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "\n", diff --git a/tutorials/30_File_Type_Preprocessing_Index_Pipeline.ipynb b/tutorials/30_File_Type_Preprocessing_Index_Pipeline.ipynb index 5349636c..3857962d 100644 --- a/tutorials/30_File_Type_Preprocessing_Index_Pipeline.ipynb +++ b/tutorials/30_File_Type_Preprocessing_Index_Pipeline.ipynb @@ -12,7 +12,7 @@ "- **Time to complete**: 15 minutes\n", "- **Goal**: After completing this tutorial, you'll have learned how to build an indexing pipeline that will preprocess files based on their file type, using the `FileTypeRouter`.\n", "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro).\n", + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). 
For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro).\n", "\n", "> 💡 (Optional): After creating the indexing pipeline in this tutorial, there is an optional section that shows you how to create a RAG pipeline on top of the document store you just created. You must have a [Hugging Face API Key](https://huggingface.co/settings/tokens) for this section\n", "\n", @@ -467,7 +467,8 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": ".venv", + "language": "python", "name": "python3" }, "language_info": { diff --git a/tutorials/31_Metadata_Filtering.ipynb b/tutorials/31_Metadata_Filtering.ipynb index 4c5d05f5..d46f67f6 100644 --- a/tutorials/31_Metadata_Filtering.ipynb +++ b/tutorials/31_Metadata_Filtering.ipynb @@ -14,7 +14,7 @@ "- **Prerequisites**: None\n", "- **Goal**: Filter documents in a document store based on given metadata\n", "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro)." + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro)." 
] }, { @@ -50,7 +50,7 @@ "source": [ "## Installing Haystack\n", "\n", - "Install Haystack 2.0 with `pip`:" + "Install Haystack with `pip`:" ] }, { diff --git a/tutorials/32_Classifying_Documents_and_Queries_by_Language.ipynb b/tutorials/32_Classifying_Documents_and_Queries_by_Language.ipynb index c4bdb5b3..6de8b6f0 100644 --- a/tutorials/32_Classifying_Documents_and_Queries_by_Language.ipynb +++ b/tutorials/32_Classifying_Documents_and_Queries_by_Language.ipynb @@ -14,7 +14,7 @@ "- **Goal**: After completing this tutorial, you'll have learned how to build a Haystack pipeline to classify documents based on the (human) language they were written in.\n", "- Optionally, at the end you'll also incorporate language clasification and query routing into a RAG pipeline, so you can query documents based on the language a question was written in.\n", "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro).\n" + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). 
For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro).\n" ] }, { @@ -298,9 +298,11 @@ "outputs": [], "source": [ "language_classifier = DocumentLanguageClassifier(languages=[\"en\", \"fr\", \"es\"])\n", - "router_rules = {\"en\": {\"field\": \"meta.language\", \"operator\": \"==\", \"value\": \"en\"}, \n", - " \"fr\": {\"field\": \"meta.language\", \"operator\": \"==\", \"value\": \"fe\"}, \n", - " \"es\": {\"field\": \"meta.language\", \"operator\": \"==\", \"value\": \"es\"}}\n", + "router_rules = {\n", + " \"en\": {\"field\": \"meta.language\", \"operator\": \"==\", \"value\": \"en\"},\n", + " \"fr\": {\"field\": \"meta.language\", \"operator\": \"==\", \"value\": \"fr\"},\n", + " \"es\": {\"field\": \"meta.language\", \"operator\": \"==\", \"value\": \"es\"},\n", + "}\n", "router = MetadataRouter(rules=router_rules)" ] }, diff --git a/tutorials/33_Hybrid_Retrieval.ipynb b/tutorials/33_Hybrid_Retrieval.ipynb index 4813d3df..6c136ab5 100644 --- a/tutorials/33_Hybrid_Retrieval.ipynb +++ b/tutorials/33_Hybrid_Retrieval.ipynb @@ -14,7 +14,7 @@ "- **Prerequisites**: None\n", "- **Goal**: After completing this tutorial, you will have learned about creating a hybrid retrieval and when it's useful.\n", "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro)." + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`).
] }, { @@ -50,7 +50,7 @@ "source": [ "## Installing Haystack\n", "\n", - "Install Haystack 2.0 and other required packages with `pip`:" + "Install Haystack and other required packages with `pip`:" ] }, { diff --git a/tutorials/34_Extractive_QA_Pipeline.ipynb b/tutorials/34_Extractive_QA_Pipeline.ipynb index 7f21ce7e..7c1222aa 100644 --- a/tutorials/34_Extractive_QA_Pipeline.ipynb +++ b/tutorials/34_Extractive_QA_Pipeline.ipynb @@ -13,7 +13,7 @@ "- **Components Used**: [`ExtractiveReader`](https://docs.haystack.deepset.ai/docs/extractivereader), [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore), [`InMemoryEmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever), [`DocumentWriter`](https://docs.haystack.deepset.ai/docs/documentwriter), [`SentenceTransformersDocumentEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder), [`SentenceTransformersTextEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder)\n", "- **Goal**: After completing this tutorial, you'll have learned how to build a Haystack pipeline that uses an extractive model to display where the answer to your query is.\n", "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro).\n", + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). 
For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro).\n", "\n" ] }, diff --git a/tutorials/35_Evaluating_RAG_Pipelines.ipynb b/tutorials/35_Evaluating_RAG_Pipelines.ipynb index 0f1d860e..bcea2f3d 100644 --- a/tutorials/35_Evaluating_RAG_Pipelines.ipynb +++ b/tutorials/35_Evaluating_RAG_Pipelines.ipynb @@ -14,7 +14,7 @@ "- **Prerequisites**: You must have an API key from an active OpenAI account as this tutorial is using the gpt-4o-mini model by OpenAI: https://platform.openai.com/api-keys\n", "- **Goal**: After completing this tutorial, you'll have learned how to evaluate your RAG pipelines both with model-based, and statistical metrics available in the Haystack evaluation offering. You'll also see which other evaluation frameworks are integrated with Haystack.\n", "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro)." + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro)." 
] }, { @@ -75,12 +75,12 @@ "source": [ "## Installing Haystack\n", "\n", - "Install Haystack 2.0 and [datasets](https://pypi.org/project/datasets/) with `pip`:" + "Install Haystack and [datasets](https://pypi.org/project/datasets/) with `pip`:" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -88,136 +88,7 @@ "id": "UQbU8GUfO-qZ", "outputId": "80fe52ea-108b-4bb4-cb1d-fe79373c86f3" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting git+https://github.com/deepset-ai/haystack.git@main\n", - " Cloning https://github.com/deepset-ai/haystack.git (to revision main) to /tmp/pip-req-build-83hiigdl\n", - " Resolved https://github.com/deepset-ai/haystack.git to commit 2509eeea7e82ef52ef65ccce00bfdcc6c1e8c1c2\n", - " Installing build dependencies: started\n", - " Installing build dependencies: finished with status 'done'\n", - " Getting requirements to build wheel: started\n", - " Getting requirements to build wheel: finished with status 'done'\n", - " Preparing metadata (pyproject.toml): started\n", - " Preparing metadata (pyproject.toml): finished with status 'done'\n", - "Collecting boilerpy3 (from haystack-ai==2.1.0rc0)\n", - " Downloading boilerpy3-1.0.7-py3-none-any.whl (22 kB)\n", - "Collecting haystack-bm25 (from haystack-ai==2.1.0rc0)\n", - " Downloading haystack_bm25-1.0.2-py2.py3-none-any.whl (8.8 kB)\n", - "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from haystack-ai==2.1.0rc0) (3.1.3)\n", - "Collecting lazy-imports (from haystack-ai==2.1.0rc0)\n", - " Downloading lazy_imports-0.3.1-py3-none-any.whl (12 kB)\n", - "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from haystack-ai==2.1.0rc0) (10.1.0)\n", - "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from haystack-ai==2.1.0rc0) (3.3)\n", - "Requirement already 
satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from haystack-ai==2.1.0rc0) (1.25.2)\n", - "Collecting openai>=1.1.0 (from haystack-ai==2.1.0rc0)\n", - " Downloading openai-1.25.0-py3-none-any.whl (312 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 312.9/312.9 kB 9.8 MB/s eta 0:00:00\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from haystack-ai==2.1.0rc0) (2.0.3)\n", - "Collecting posthog (from haystack-ai==2.1.0rc0)\n", - " Downloading posthog-3.5.0-py2.py3-none-any.whl (41 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.3/41.3 kB 4.4 MB/s eta 0:00:00\n", - "Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from haystack-ai==2.1.0rc0) (2.8.2)\n", - "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from haystack-ai==2.1.0rc0) (6.0.1)\n", - "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from haystack-ai==2.1.0rc0) (2.31.0)\n", - "Requirement already satisfied: tenacity in /usr/local/lib/python3.10/dist-packages (from haystack-ai==2.1.0rc0) (8.2.3)\n", - "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from haystack-ai==2.1.0rc0) (4.66.2)\n", - "Requirement already satisfied: typing-extensions>=4.7 in /usr/local/lib/python3.10/dist-packages (from haystack-ai==2.1.0rc0) (4.11.0)\n", - "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai==2.1.0rc0) (3.7.1)\n", - "Requirement already satisfied: distro<2,>=1.7.0 in /usr/lib/python3/dist-packages (from openai>=1.1.0->haystack-ai==2.1.0rc0) (1.7.0)\n", - "Collecting httpx<1,>=0.23.0 (from openai>=1.1.0->haystack-ai==2.1.0rc0)\n", - " Downloading httpx-0.27.0-py3-none-any.whl (75 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 75.6/75.6 kB 7.4 MB/s eta 0:00:00\n", - "Requirement already satisfied: pydantic<3,>=1.9.0 in 
/usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai==2.1.0rc0) (2.7.1)\n", - "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai>=1.1.0->haystack-ai==2.1.0rc0) (1.3.1)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->haystack-ai==2.1.0rc0) (2.1.5)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai==2.1.0rc0) (2023.4)\n", - "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->haystack-ai==2.1.0rc0) (2024.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil->haystack-ai==2.1.0rc0) (1.16.0)\n", - "Collecting monotonic>=1.5 (from posthog->haystack-ai==2.1.0rc0)\n", - " Downloading monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n", - "Collecting backoff>=1.10.0 (from posthog->haystack-ai==2.1.0rc0)\n", - " Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai==2.1.0rc0) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai==2.1.0rc0) (3.7)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai==2.1.0rc0) (2.0.7)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->haystack-ai==2.1.0rc0) (2024.2.2)\n", - "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai>=1.1.0->haystack-ai==2.1.0rc0) (1.2.1)\n", - "Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai==2.1.0rc0)\n", - " Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)\n", - " 
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 77.9/77.9 kB 12.3 MB/s eta 0:00:00\n", - "Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai==2.1.0rc0)\n", - " Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.3/58.3 kB 10.2 MB/s eta 0:00:00\n", - "Requirement already satisfied: annotated-types>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai>=1.1.0->haystack-ai==2.1.0rc0) (0.6.0)\n", - "Requirement already satisfied: pydantic-core==2.18.2 in /usr/local/lib/python3.10/dist-packages (from pydantic<3,>=1.9.0->openai>=1.1.0->haystack-ai==2.1.0rc0) (2.18.2)\n", - "Building wheels for collected packages: haystack-ai\n", - " Building wheel for haystack-ai (pyproject.toml): started\n", - " Building wheel for haystack-ai (pyproject.toml): finished with status 'done'\n", - " Created wheel for haystack-ai: filename=haystack_ai-2.1.0rc0-py3-none-any.whl size=316211 sha256=aee4b70fda05260e7466d477508440735cfe4d5c3b9a15a7003773a7fa01bd0c\n", - " Stored in directory: /tmp/pip-ephem-wheel-cache-faxhntm2/wheels/23/e0/55/004621325804423c8026b4b5008ddb11f337bf73284d1b9caf\n", - "Successfully built haystack-ai\n", - "Installing collected packages: monotonic, lazy-imports, haystack-bm25, h11, boilerpy3, backoff, posthog, httpcore, httpx, openai, haystack-ai\n", - "Successfully installed backoff-2.2.1 boilerpy3-1.0.7 h11-0.14.0 haystack-ai-2.1.0rc0 haystack-bm25-1.0.2 httpcore-1.0.5 httpx-0.27.0 lazy-imports-0.3.1 monotonic-1.6 openai-1.25.0 posthog-3.5.0\n", - "Collecting datasets>=2.6.1\n", - " Downloading datasets-2.19.0-py3-none-any.whl (542 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 542.0/542.0 kB 9.3 MB/s eta 0:00:00\n", - "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (3.13.4)\n", - "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) 
(1.25.2)\n", - "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (14.0.2)\n", - "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (0.6)\n", - "Collecting dill<0.3.9,>=0.3.0 (from datasets>=2.6.1)\n", - " Downloading dill-0.3.8-py3-none-any.whl (116 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 116.3/116.3 kB 11.3 MB/s eta 0:00:00\n", - "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (2.0.3)\n", - "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (2.31.0)\n", - "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (4.66.2)\n", - "Collecting xxhash (from datasets>=2.6.1)\n", - " Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 194.1/194.1 kB 12.5 MB/s eta 0:00:00\n", - "Collecting multiprocess (from datasets>=2.6.1)\n", - " Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 134.8/134.8 kB 11.9 MB/s eta 0:00:00\n", - "Requirement already satisfied: fsspec[http]<=2024.3.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (2023.6.0)\n", - "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (3.9.5)\n", - "Collecting huggingface-hub>=0.21.2 (from datasets>=2.6.1)\n", - " Downloading huggingface_hub-0.22.2-py3-none-any.whl (388 kB)\n", - " ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 388.9/388.9 kB 17.1 MB/s eta 0:00:00\n", - "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets>=2.6.1) (24.0)\n", - "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from 
datasets>=2.6.1) (6.0.1)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (1.3.1)\n", - "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (23.2.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (1.4.1)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (6.0.5)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (1.9.4)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.6.1) (4.0.3)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.21.2->datasets>=2.6.1) (4.11.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.6.1) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.6.1) (3.7)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.6.1) (2.0.7)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.6.1) (2024.2.2)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets>=2.6.1) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets>=2.6.1) (2023.4)\n", - "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages 
(from pandas->datasets>=2.6.1) (2024.1)\n", - "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->datasets>=2.6.1) (1.16.0)\n", - "Installing collected packages: xxhash, dill, multiprocess, huggingface-hub, datasets\n", - " Attempting uninstall: huggingface-hub\n", - " Found existing installation: huggingface-hub 0.20.3\n", - " Uninstalling huggingface-hub-0.20.3:\n", - " Successfully uninstalled huggingface-hub-0.20.3\n", - "Successfully installed datasets-2.19.0 dill-0.3.8 huggingface-hub-0.22.2 multiprocess-0.70.16 xxhash-3.4.1\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " Running command git clone --filter=blob:none --quiet https://github.com/deepset-ai/haystack.git /tmp/pip-req-build-83hiigdl\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "\n", diff --git a/tutorials/36_Building_Fallbacks_with_Conditional_Routing.ipynb b/tutorials/36_Building_Fallbacks_with_Conditional_Routing.ipynb index 9e91f81c..2bfd8b2e 100644 --- a/tutorials/36_Building_Fallbacks_with_Conditional_Routing.ipynb +++ b/tutorials/36_Building_Fallbacks_with_Conditional_Routing.ipynb @@ -1,551 +1,552 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "IR5wivW8THt7" - }, - "source": [ - "# Tutorial: Building Fallbacks to Websearch with Conditional Routing\n", - "\n", - "- **Level**: Intermediate\n", - "- **Time to complete**: 10 minutes\n", - "- **Components Used**: [`ConditionalRouter`](https://docs.haystack.deepset.ai/docs/conditionalrouter), [`SerperDevWebSearch`](https://docs.haystack.deepset.ai/docs/serperdevwebsearch), [`PromptBuilder`](https://docs.haystack.deepset.ai/docs/promptbuilder), [`OpenAIGenerator`](https://docs.haystack.deepset.ai/docs/openaigenerator)\n", - "- **Prerequisites**: You must have an [OpenAI API Key](https://platform.openai.com/api-keys) and a [Serper API Key](https://serper.dev/api-key) for this tutorial\n", - "- **Goal**: After 
completing this tutorial, you'll have learned how to create a pipeline with conditional routing that can fallback to websearch if the answer is not present in your dataset.\n", - "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro).\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F-a-MAMVat-o" - }, - "source": [ - "## Overview\n", - "\n", - "When developing applications using **retrieval augmented generation ([RAG](https://www.deepset.ai/blog/llms-retrieval-augmentation))**, the retrieval step plays a critical role. It serves as the primary information source for **large language models (LLMs)** to generate responses. However, if your database lacks the necessary information, the retrieval step's effectiveness is limited. In such scenarios, it may be practical to use the web as a fallback data source for your RAG application. By implementing a conditional routing mechanism in your system, you gain complete control over the data flow, enabling you to design a system that can leverage the web as its data source under some conditions.\n", - "\n", - "In this tutorial, you will learn how to create a pipeline with conditional routing that directs the query to a **web-based RAG** route if the answer is not found in the initially given documents." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LSwNKkeKeq0f" - }, - "source": [ - "## Development Environment" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eGJ7GmCBas4R" - }, - "source": [ - "### Prepare the Colab Environment\n", - "\n", - "- [Enable GPU Runtime in Colab](https://docs.haystack.deepset.ai/docs/enabling-gpu-acceleration)\n", - "- [Set logging level to INFO](https://docs.haystack.deepset.ai/docs/setting-the-log-level)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FwIgIpE2XqpO" - }, - "source": [ - "### Install Haystack\n", - "\n", - "Install Haystack 2.0 with `pip`:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uba0mntlqs_O" - }, - "outputs": [], - "source": [ - "%%bash\n", - "\n", - "pip install haystack-ai" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WBkJ7d3hZkOJ" - }, - "source": [ - "### Enable Telemetry\n", - "\n", - "Knowing you're using this tutorial helps us decide where to invest our efforts to build a better product but you can always opt out by commenting the following line. See [Telemetry](https://docs.haystack.deepset.ai/docs/telemetry) for more details." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HvrOixzzZmMi" - }, - "outputs": [], - "source": [ - "from haystack.telemetry import tutorial_running\n", - "\n", - "tutorial_running(36)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QfECEAy2Jdqs" - }, - "source": [ - "### Enter API Keys\n", - "\n", - "Enter API keys required for this tutorial." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "13U7Z_k3yE-F", - "outputId": "6ec48553-12d2-4c89-ca13-fc5d34fbc625" - }, - "outputs": [], - "source": [ - "from getpass import getpass\n", - "import os\n", - "\n", - "if \"OPENAI_API_KEY\" not in os.environ:\n", - " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")\n", - "if \"SERPERDEV_API_KEY\" not in os.environ:\n", - " os.environ[\"SERPERDEV_API_KEY\"] = getpass(\"Enter Serper Api key: \")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i_AlhPv1T-4t" - }, - "source": [ - "## Creating a Document\n", - "\n", - "Create a Document about Munich, where the answer to your question will be initially searched:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5CHbQlLMyVbg" - }, - "outputs": [], - "source": [ - "from haystack.dataclasses import Document\n", - "\n", - "documents = [\n", - " Document(\n", - " content=\"\"\"Munich, the vibrant capital of Bavaria in southern Germany, exudes a perfect blend of rich cultural\n", - " heritage and modern urban sophistication. Nestled along the banks of the Isar River, Munich is renowned\n", - " for its splendid architecture, including the iconic Neues Rathaus (New Town Hall) at Marienplatz and\n", - " the grandeur of Nymphenburg Palace. The city is a haven for art enthusiasts, with world-class museums like the\n", - " Alte Pinakothek housing masterpieces by renowned artists. Munich is also famous for its lively beer gardens, where\n", - " locals and tourists gather to enjoy the city's famed beers and traditional Bavarian cuisine. 
The city's annual\n", - " Oktoberfest celebration, the world's largest beer festival, attracts millions of visitors from around the globe.\n", - " Beyond its cultural and culinary delights, Munich offers picturesque parks like the English Garden, providing a\n", - " serene escape within the heart of the bustling metropolis. Visitors are charmed by Munich's warm hospitality,\n", - " making it a must-visit destination for travelers seeking a taste of both old-world charm and contemporary allure.\"\"\"\n", - " )\n", - "]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zMNy0tjtUh_L" - }, - "source": [ - "## Creating the Initial Pipeline Components\n", - "\n", - "First, define a prompt instructing the LLM to respond with the text `\"no_answer\"` if the provided documents do not offer enough context to answer the query. Next, initialize a [PromptBuilder](https://docs.haystack.deepset.ai/docs/promptbuilder) with that prompt. It's crucial that the LLM replies with `\"no_answer\"` as you will use this keyword to indicate that the query should be directed to the fallback web search route.\n", - "\n", - "As the LLM, you will use an [OpenAIGenerator](https://docs.haystack.deepset.ai/docs/openaigenerator) with the `gpt-4o-mini` model.\n", - "\n", - "> The provided prompt works effectively with the `gpt-4o-mini` model. If you prefer to use a different [Generator](https://docs.haystack.deepset.ai/docs/generators), you may need to update the prompt to provide clear instructions to your model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "id": "nzhn2kDfqvbs" - }, - "outputs": [], - "source": [ - "from haystack.components.builders.prompt_builder import PromptBuilder\n", - "from haystack.components.generators import OpenAIGenerator\n", - "\n", - "prompt_template = \"\"\"\n", - "Answer the following query given the documents.\n", - "If the answer is not contained within the documents reply with 'no_answer'\n", - "Query: {{query}}\n", - "Documents:\n", - "{% for document in documents %}\n", - " {{document.content}}\n", - "{% endfor %}\n", - "\"\"\"\n", - "\n", - "prompt_builder = PromptBuilder(template=prompt_template)\n", - "llm = OpenAIGenerator(model=\"gpt-4o-mini\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LepACkkWPsBx" - }, - "source": [ - "## Initializing the Web Search Components\n", - "\n", - "Initialize the necessary components for a web-based RAG application. Along with a `PromptBuilder` and an `OpenAIGenerator`, you will need a [SerperDevWebSearch](https://docs.haystack.deepset.ai/docs/serperdevwebsearch) to retrieve relevant documents for the query from the web.\n", - "\n", - "> If desired, you can use a different [Generator](https://docs.haystack.deepset.ai/docs/generators) for the web-based RAG branch of the pipeline." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "id": "VEYchFgQPxZ_" - }, - "outputs": [], - "source": [ - "from haystack.components.builders.prompt_builder import PromptBuilder\n", - "from haystack.components.generators import OpenAIGenerator\n", - "from haystack.components.websearch.serper_dev import SerperDevWebSearch\n", - "\n", - "prompt_for_websearch = \"\"\"\n", - "Answer the following query given the documents retrieved from the web.\n", - "Your answer shoud indicate that your answer was generated from websearch.\n", - "\n", - "Query: {{query}}\n", - "Documents:\n", - "{% for document in documents %}\n", - " {{document.content}}\n", - "{% endfor %}\n", - "\"\"\"\n", - "\n", - "websearch = SerperDevWebSearch()\n", - "prompt_builder_for_websearch = PromptBuilder(template=prompt_for_websearch)\n", - "llm_for_websearch = OpenAIGenerator(model=\"gpt-4o-mini\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "vnacak_tVWqv" - }, - "source": [ - "## Creating the ConditionalRouter\n", - "\n", - "[ConditionalRouter](https://docs.haystack.deepset.ai/docs/conditionalrouter) is the component that handles data routing on specific conditions. You need to define a `condition`, an `output`, an `output_name` and an `output_type` for each route. Each route that the `ConditionalRouter` creates acts as the output of this component and can be connected to other components in the same pipeline. \n", - "\n", - "In this case, you need to define two routes:\n", - "- If the LLM replies with the `\"no_answer\"` keyword, the pipeline should perform web search. It means that you will put the original `query` in the output value to pass to the next component (in this case the next component will be the `SerperDevWebSearch`) and the output name will be `go_to_websearch`.\n", - "- Otherwise, the given documents are enough for an answer and pipeline execution ends here. Return the LLM reply in the output named `answer`." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "id": "qyE9rGcawX3F" - }, - "outputs": [], - "source": [ - "from haystack.components.routers import ConditionalRouter\n", - "\n", - "routes = [\n", - " {\n", - " \"condition\": \"{{'no_answer' in replies[0]}}\",\n", - " \"output\": \"{{query}}\",\n", - " \"output_name\": \"go_to_websearch\",\n", - " \"output_type\": str,\n", - " },\n", - " {\n", - " \"condition\": \"{{'no_answer' not in replies[0]}}\",\n", - " \"output\": \"{{replies[0]}}\",\n", - " \"output_name\": \"answer\",\n", - " \"output_type\": str,\n", - " },\n", - "]\n", - "\n", - "router = ConditionalRouter(routes)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Wdyko78oXb5a" - }, - "source": [ - "## Building the Pipeline\n", - "\n", - "Add all components to your pipeline and connect them. `go_to_websearch` output of the `router` should be connected to the `websearch` to retrieve documents from the web and also to `prompt_builder_for_websearch` to use in the prompt." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4sCyBwc0oTVs", - "outputId": "fd2347d4-9363-45e0-e734-87e4a160f741" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from haystack import Pipeline\n", - "\n", - "pipe = Pipeline()\n", - "pipe.add_component(\"prompt_builder\", prompt_builder)\n", - "pipe.add_component(\"llm\", llm)\n", - "pipe.add_component(\"router\", router)\n", - "pipe.add_component(\"websearch\", websearch)\n", - "pipe.add_component(\"prompt_builder_for_websearch\", prompt_builder_for_websearch)\n", - "pipe.add_component(\"llm_for_websearch\", llm_for_websearch)\n", - "\n", - "pipe.connect(\"prompt_builder\", \"llm\")\n", - "pipe.connect(\"llm.replies\", \"router.replies\")\n", - "pipe.connect(\"router.go_to_websearch\", \"websearch.query\")\n", - "pipe.connect(\"router.go_to_websearch\", \"prompt_builder_for_websearch.query\")\n", - "pipe.connect(\"websearch.documents\", \"prompt_builder_for_websearch.documents\")\n", - "pipe.connect(\"prompt_builder_for_websearch\", \"llm_for_websearch\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "d0HmdbUJKJ_9" - }, - "source": [ - "### Visualize the Pipeline\n", - "\n", - "To understand how you formed this pipeline with conditional routing, use [draw()](https://docs.haystack.deepset.ai/docs/drawing-pipeline-graphs) method of the pipeline. If you're running this notebook on Google Colab, the generated file will be saved in \\\"Files\\\" section on the sidebar." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "svF_SUK4rFwv", - "outputId": "60894eea-2cec-4be8-d13c-83d2c81656f4" - }, - "outputs": [], - "source": [ - "pipe.draw(\"pipe.png\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jgk1z6GGYH6J" - }, - "source": [ - "## Running the Pipeline!\n", - "\n", - "In the `run()`, pass the query to the `prompt_builder` and the `router`. In real life applications, `documents` will be provided by a [Retriever](https://docs.haystack.deepset.ai/docs/retrievers) but to keep this example simple, you will provide the defined `documents` to the `prompt_builder`." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "d_l4rYmCoVki", - "outputId": "3bd7956a-7612-4bc1-c3e5-a7a51be8981f" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Munich is in southern Germany.\n" - ] - } - ], - "source": [ - "query = \"Where is Munich?\"\n", - "\n", - "result = pipe.run({\"prompt_builder\": {\"query\": query, \"documents\": documents}, \"router\": {\"query\": query}})\n", - "\n", - "# Print the `answer` coming from the ConditionalRouter\n", - "print(result[\"router\"][\"answer\"])" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "IR5wivW8THt7" + }, + "source": [ + "# Tutorial: Building Fallbacks to Websearch with Conditional Routing\n", + "\n", + "- **Level**: Intermediate\n", + "- **Time to complete**: 10 minutes\n", + "- **Components Used**: [`ConditionalRouter`](https://docs.haystack.deepset.ai/docs/conditionalrouter), [`SerperDevWebSearch`](https://docs.haystack.deepset.ai/docs/serperdevwebsearch), [`PromptBuilder`](https://docs.haystack.deepset.ai/docs/promptbuilder), [`OpenAIGenerator`](https://docs.haystack.deepset.ai/docs/openaigenerator)\n", + "- **Prerequisites**: 
You must have an [OpenAI API Key](https://platform.openai.com/api-keys) and a [Serper API Key](https://serper.dev/api-key) for this tutorial\n", + "- **Goal**: After completing this tutorial, you'll have learned how to create a pipeline with conditional routing that can fall back to websearch if the answer is not present in your dataset.\n", + "\n", + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F-a-MAMVat-o" + }, + "source": [ + "## Overview\n", + "\n", + "When developing applications using **retrieval augmented generation ([RAG](https://www.deepset.ai/blog/llms-retrieval-augmentation))**, the retrieval step plays a critical role. It serves as the primary information source for **large language models (LLMs)** to generate responses. However, if your database lacks the necessary information, the retrieval step's effectiveness is limited. In such scenarios, it may be practical to use the web as a fallback data source for your RAG application. By implementing a conditional routing mechanism in your system, you gain complete control over the data flow, enabling you to design a system that can leverage the web as its data source under some conditions.\n", + "\n", + "In this tutorial, you will learn how to create a pipeline with conditional routing that directs the query to a **web-based RAG** route if the answer is not found in the initially given documents." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LSwNKkeKeq0f" + }, + "source": [ + "## Development Environment" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eGJ7GmCBas4R" + }, + "source": [ + "### Prepare the Colab Environment\n", + "\n", + "- [Enable GPU Runtime in Colab](https://docs.haystack.deepset.ai/docs/enabling-gpu-acceleration)\n", + "- [Set logging level to INFO](https://docs.haystack.deepset.ai/docs/setting-the-log-level)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "FwIgIpE2XqpO" + }, + "source": [ + "### Install Haystack\n", + "\n", + "Install Haystack with `pip`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uba0mntlqs_O" + }, + "outputs": [], + "source": [ + "%%bash\n", + "\n", + "pip install haystack-ai" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WBkJ7d3hZkOJ" + }, + "source": [ + "### Enable Telemetry\n", + "\n", + "Knowing you're using this tutorial helps us decide where to invest our efforts to build a better product but you can always opt out by commenting out the following line. See [Telemetry](https://docs.haystack.deepset.ai/docs/telemetry) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HvrOixzzZmMi" + }, + "outputs": [], + "source": [ + "from haystack.telemetry import tutorial_running\n", + "\n", + "tutorial_running(36)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QfECEAy2Jdqs" + }, + "source": [ + "### Enter API Keys\n", + "\n", + "Enter API keys required for this tutorial." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, - { - "cell_type": "markdown", - "metadata": { - "id": "dBN8eLSKgb16" - }, - "source": [ - "✅ The answer to this query can be found in the defined document.\n", - "\n", - "Now, try a different query that doesn't have an answer in the given document and test if the web search works as expected:" - ] + "id": "13U7Z_k3yE-F", + "outputId": "6ec48553-12d2-4c89-ca13-fc5d34fbc625" + }, + "outputs": [], + "source": [ + "from getpass import getpass\n", + "import os\n", + "\n", + "if \"OPENAI_API_KEY\" not in os.environ:\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter OpenAI API key:\")\n", + "if \"SERPERDEV_API_KEY\" not in os.environ:\n", + " os.environ[\"SERPERDEV_API_KEY\"] = getpass(\"Enter Serper Api key: \")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i_AlhPv1T-4t" + }, + "source": [ + "## Creating a Document\n", + "\n", + "Create a Document about Munich, where the answer to your question will be initially searched:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5CHbQlLMyVbg" + }, + "outputs": [], + "source": [ + "from haystack.dataclasses import Document\n", + "\n", + "documents = [\n", + " Document(\n", + " content=\"\"\"Munich, the vibrant capital of Bavaria in southern Germany, exudes a perfect blend of rich cultural\n", + " heritage and modern urban sophistication. Nestled along the banks of the Isar River, Munich is renowned\n", + " for its splendid architecture, including the iconic Neues Rathaus (New Town Hall) at Marienplatz and\n", + " the grandeur of Nymphenburg Palace. The city is a haven for art enthusiasts, with world-class museums like the\n", + " Alte Pinakothek housing masterpieces by renowned artists. 
Munich is also famous for its lively beer gardens, where\n", + " locals and tourists gather to enjoy the city's famed beers and traditional Bavarian cuisine. The city's annual\n", + " Oktoberfest celebration, the world's largest beer festival, attracts millions of visitors from around the globe.\n", + " Beyond its cultural and culinary delights, Munich offers picturesque parks like the English Garden, providing a\n", + " serene escape within the heart of the bustling metropolis. Visitors are charmed by Munich's warm hospitality,\n", + " making it a must-visit destination for travelers seeking a taste of both old-world charm and contemporary allure.\"\"\"\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zMNy0tjtUh_L" + }, + "source": [ + "## Creating the Initial Pipeline Components\n", + "\n", + "First, define a prompt instructing the LLM to respond with the text `\"no_answer\"` if the provided documents do not offer enough context to answer the query. Next, initialize a [PromptBuilder](https://docs.haystack.deepset.ai/docs/promptbuilder) with that prompt. It's crucial that the LLM replies with `\"no_answer\"` as you will use this keyword to indicate that the query should be directed to the fallback web search route.\n", + "\n", + "As the LLM, you will use an [OpenAIGenerator](https://docs.haystack.deepset.ai/docs/openaigenerator) with the `gpt-4o-mini` model.\n", + "\n", + "> The provided prompt works effectively with the `gpt-4o-mini` model. If you prefer to use a different [Generator](https://docs.haystack.deepset.ai/docs/generators), you may need to update the prompt to provide clear instructions to your model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "nzhn2kDfqvbs" + }, + "outputs": [], + "source": [ + "from haystack.components.builders.prompt_builder import PromptBuilder\n", + "from haystack.components.generators import OpenAIGenerator\n", + "\n", + "prompt_template = \"\"\"\n", + "Answer the following query given the documents.\n", + "If the answer is not contained within the documents reply with 'no_answer'\n", + "Query: {{query}}\n", + "Documents:\n", + "{% for document in documents %}\n", + " {{document.content}}\n", + "{% endfor %}\n", + "\"\"\"\n", + "\n", + "prompt_builder = PromptBuilder(template=prompt_template)\n", + "llm = OpenAIGenerator(model=\"gpt-4o-mini\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LepACkkWPsBx" + }, + "source": [ + "## Initializing the Web Search Components\n", + "\n", + "Initialize the necessary components for a web-based RAG application. Along with a `PromptBuilder` and an `OpenAIGenerator`, you will need a [SerperDevWebSearch](https://docs.haystack.deepset.ai/docs/serperdevwebsearch) to retrieve relevant documents for the query from the web.\n", + "\n", + "> If desired, you can use a different [Generator](https://docs.haystack.deepset.ai/docs/generators) for the web-based RAG branch of the pipeline." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "VEYchFgQPxZ_" + }, + "outputs": [], + "source": [ + "from haystack.components.builders.prompt_builder import PromptBuilder\n", + "from haystack.components.generators import OpenAIGenerator\n", + "from haystack.components.websearch.serper_dev import SerperDevWebSearch\n", + "\n", + "prompt_for_websearch = \"\"\"\n", + "Answer the following query given the documents retrieved from the web.\n", + "Your answer shoud indicate that your answer was generated from websearch.\n", + "\n", + "Query: {{query}}\n", + "Documents:\n", + "{% for document in documents %}\n", + " {{document.content}}\n", + "{% endfor %}\n", + "\"\"\"\n", + "\n", + "websearch = SerperDevWebSearch()\n", + "prompt_builder_for_websearch = PromptBuilder(template=prompt_for_websearch)\n", + "llm_for_websearch = OpenAIGenerator(model=\"gpt-4o-mini\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vnacak_tVWqv" + }, + "source": [ + "## Creating the ConditionalRouter\n", + "\n", + "[ConditionalRouter](https://docs.haystack.deepset.ai/docs/conditionalrouter) is the component that handles data routing on specific conditions. You need to define a `condition`, an `output`, an `output_name` and an `output_type` for each route. Each route that the `ConditionalRouter` creates acts as the output of this component and can be connected to other components in the same pipeline. \n", + "\n", + "In this case, you need to define two routes:\n", + "- If the LLM replies with the `\"no_answer\"` keyword, the pipeline should perform web search. It means that you will put the original `query` in the output value to pass to the next component (in this case the next component will be the `SerperDevWebSearch`) and the output name will be `go_to_websearch`.\n", + "- Otherwise, the given documents are enough for an answer and pipeline execution ends here. Return the LLM reply in the output named `answer`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "qyE9rGcawX3F" + }, + "outputs": [], + "source": [ + "from haystack.components.routers import ConditionalRouter\n", + "\n", + "routes = [\n", + " {\n", + " \"condition\": \"{{'no_answer' in replies[0]}}\",\n", + " \"output\": \"{{query}}\",\n", + " \"output_name\": \"go_to_websearch\",\n", + " \"output_type\": str,\n", + " },\n", + " {\n", + " \"condition\": \"{{'no_answer' not in replies[0]}}\",\n", + " \"output\": \"{{replies[0]}}\",\n", + " \"output_name\": \"answer\",\n", + " \"output_type\": str,\n", + " },\n", + "]\n", + "\n", + "router = ConditionalRouter(routes)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Wdyko78oXb5a" + }, + "source": [ + "## Building the Pipeline\n", + "\n", + "Add all components to your pipeline and connect them. `go_to_websearch` output of the `router` should be connected to the `websearch` to retrieve documents from the web and also to `prompt_builder_for_websearch` to use in the prompt." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "4sCyBwc0oTVs", + "outputId": "fd2347d4-9363-45e0-e734-87e4a160f741" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 10, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_v-WdlSy365M", - "outputId": "603c9346-8718-427e-d232-4cc71799a2bb" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['According to the documents retrieved from the web, the population of Munich is approximately 1.47 million as of 2019. However, the most recent estimates suggest that the population has grown to about 1.58 million as of May 31, 2022. 
Additionally, the current estimated population of Munich is around 1.46 million, with the urban area being much larger at 2.65 million.']\n" - ] - } - ], - "source": [ - "query = \"How many people live in Munich?\"\n", - "\n", - "result = pipe.run({\"prompt_builder\": {\"query\": query, \"documents\": documents}, \"router\": {\"query\": query}})\n", - "\n", - "# Print the `replies` generated using the web searched Documents\n", - "print(result[\"llm_for_websearch\"][\"replies\"])" + "data": { + "text/plain": [ + "" ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from haystack import Pipeline\n", + "\n", + "pipe = Pipeline()\n", + "pipe.add_component(\"prompt_builder\", prompt_builder)\n", + "pipe.add_component(\"llm\", llm)\n", + "pipe.add_component(\"router\", router)\n", + "pipe.add_component(\"websearch\", websearch)\n", + "pipe.add_component(\"prompt_builder_for_websearch\", prompt_builder_for_websearch)\n", + "pipe.add_component(\"llm_for_websearch\", llm_for_websearch)\n", + "\n", + "pipe.connect(\"prompt_builder\", \"llm\")\n", + "pipe.connect(\"llm.replies\", \"router.replies\")\n", + "pipe.connect(\"router.go_to_websearch\", \"websearch.query\")\n", + "pipe.connect(\"router.go_to_websearch\", \"prompt_builder_for_websearch.query\")\n", + "pipe.connect(\"websearch.documents\", \"prompt_builder_for_websearch.documents\")\n", + "pipe.connect(\"prompt_builder_for_websearch\", \"llm_for_websearch\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "d0HmdbUJKJ_9" + }, + "source": [ + "### Visualize the Pipeline\n", + "\n", + "To understand how you formed this pipeline with conditional routing, use the [draw()](https://docs.haystack.deepset.ai/docs/drawing-pipeline-graphs) method of the pipeline. If you're running this notebook on Google Colab, the generated file will be saved in the \\\"Files\\\" section on the sidebar."
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000 }, - { - "cell_type": "markdown", - "metadata": { - "id": "wUkuXoWnHa5c" - }, - "source": [ - "If you check the whole result, you will see that `websearch` component also provides links to Documents retrieved from the web:" - ] + "id": "svF_SUK4rFwv", + "outputId": "60894eea-2cec-4be8-d13c-83d2c81656f4" + }, + "outputs": [], + "source": [ + "pipe.draw(\"pipe.png\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jgk1z6GGYH6J" + }, + "source": [ + "## Running the Pipeline!\n", + "\n", + "In the `run()`, pass the query to the `prompt_builder` and the `router`. In real life applications, `documents` will be provided by a [Retriever](https://docs.haystack.deepset.ai/docs/retrievers) but to keep this example simple, you will provide the defined `documents` to the `prompt_builder`." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" }, + "id": "d_l4rYmCoVki", + "outputId": "3bd7956a-7612-4bc1-c3e5-a7a51be8981f" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "_EYLZguZGznY", - "outputId": "df49a576-9961-44b4-e89d-2c5195869360" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'llm': {'meta': [{'model': 'gpt-4o-mini-2024-07-18',\n", - " 'index': 0,\n", - " 'finish_reason': 'stop',\n", - " 'usage': {'completion_tokens': 2,\n", - " 'prompt_tokens': 271,\n", - " 'total_tokens': 273}}]},\n", - " 'websearch': {'links': ['https://en.wikipedia.org/wiki/Munich',\n", - " 'https://worldpopulationreview.com/world-cities/munich-population',\n", - " 'https://en.wikipedia.org/wiki/Demographics_of_Munich',\n", - " 'https://www.macrotrends.net/cities/204371/munich/population',\n", - " 
'https://www.britannica.com/place/Munich-Bavaria-Germany',\n", - " 'https://www.statista.com/statistics/519723/munich-population-by-age-group/',\n", - " 'https://www.citypopulation.de/en/germany/bayern/m%C3%BCnchen_stadt/09162000__m%C3%BCnchen/',\n", - " 'https://www.quora.com/How-many-people-live-in-Munich',\n", - " 'https://earth.esa.int/web/earth-watching/image-of-the-week/content/-/article/munich-germany/']},\n", - " 'llm_for_websearch': {'replies': ['According to the documents retrieved from the web, the population of Munich is approximately 1.47 million as of 2019. However, the most recent estimates suggest that the population has grown to about 1.58 million as of May 31, 2022. Additionally, the current estimated population of Munich is around 1.46 million, with the urban area being much larger at 2.65 million.'],\n", - " 'meta': [{'model': 'gpt-4o-mini-2024-07-18',\n", - " 'index': 0,\n", - " 'finish_reason': 'stop',\n", - " 'usage': {'completion_tokens': 85,\n", - " 'prompt_tokens': 436,\n", - " 'total_tokens': 521}}]}}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "result" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Munich is in southern Germany.\n" + ] + } + ], + "source": [ + "query = \"Where is Munich?\"\n", + "\n", + "result = pipe.run({\"prompt_builder\": {\"query\": query, \"documents\": documents}, \"router\": {\"query\": query}})\n", + "\n", + "# Print the `answer` coming from the ConditionalRouter\n", + "print(result[\"router\"][\"answer\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dBN8eLSKgb16" + }, + "source": [ + "✅ The answer to this query can be found in the defined document.\n", + "\n", + "Now, try a different query that doesn't have an answer in the given document and test if the web search works as expected:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": 
"https://localhost:8080/" }, + "id": "_v-WdlSy365M", + "outputId": "603c9346-8718-427e-d232-4cc71799a2bb" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "6nhdYK-vHpNM" - }, - "source": [ - "## What's next\n", - "\n", - "🎉 Congratulations! You've built a pipeline with conditional routing! You can now customize the condition for your specific use case and create a custom Haystack 2.0 pipeline to meet your needs.\n", - "\n", - "If you liked this tutorial, there's more to learn about Haystack 2.0:\n", - "- [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline)\n", - "- [Model-Based Evaluation of RAG Pipelines](https://haystack.deepset.ai/tutorials/35_model_based_evaluation_of_rag_pipelines)\n", - "\n", - "To stay up to date on the latest Haystack developments, you can [sign up for our newsletter](https://landing.deepset.ai/haystack-community-updates) or [join Haystack discord community](https://discord.gg/haystack).\n", - "\n", - "Thanks for reading!" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "['According to the documents retrieved from the web, the population of Munich is approximately 1.47 million as of 2019. However, the most recent estimates suggest that the population has grown to about 1.58 million as of May 31, 2022. 
Additionally, the current estimated population of Munich is around 1.46 million, with the urban area being much larger at 2.65 million.']\n" + ] } - ], - "metadata": { + ], + "source": [ + "query = \"How many people live in Munich?\"\n", + "\n", + "result = pipe.run({\"prompt_builder\": {\"query\": query, \"documents\": documents}, \"router\": {\"query\": query}})\n", + "\n", + "# Print the `replies` generated using the web searched Documents\n", + "print(result[\"llm_for_websearch\"][\"replies\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wUkuXoWnHa5c" + }, + "source": [ + "If you check the whole result, you will see that `websearch` component also provides links to Documents retrieved from the web:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { "colab": { - "provenance": [] + "base_uri": "https://localhost:8080/" }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.6" + "id": "_EYLZguZGznY", + "outputId": "df49a576-9961-44b4-e89d-2c5195869360" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'llm': {'meta': [{'model': 'gpt-4o-mini-2024-07-18',\n", + " 'index': 0,\n", + " 'finish_reason': 'stop',\n", + " 'usage': {'completion_tokens': 2,\n", + " 'prompt_tokens': 271,\n", + " 'total_tokens': 273}}]},\n", + " 'websearch': {'links': ['https://en.wikipedia.org/wiki/Munich',\n", + " 'https://worldpopulationreview.com/world-cities/munich-population',\n", + " 'https://en.wikipedia.org/wiki/Demographics_of_Munich',\n", + " 'https://www.macrotrends.net/cities/204371/munich/population',\n", + " 'https://www.britannica.com/place/Munich-Bavaria-Germany',\n", + " 'https://www.statista.com/statistics/519723/munich-population-by-age-group/',\n", + " 
'https://www.citypopulation.de/en/germany/bayern/m%C3%BCnchen_stadt/09162000__m%C3%BCnchen/',\n", + " 'https://www.quora.com/How-many-people-live-in-Munich',\n", + " 'https://earth.esa.int/web/earth-watching/image-of-the-week/content/-/article/munich-germany/']},\n", + " 'llm_for_websearch': {'replies': ['According to the documents retrieved from the web, the population of Munich is approximately 1.47 million as of 2019. However, the most recent estimates suggest that the population has grown to about 1.58 million as of May 31, 2022. Additionally, the current estimated population of Munich is around 1.46 million, with the urban area being much larger at 2.65 million.'],\n", + " 'meta': [{'model': 'gpt-4o-mini-2024-07-18',\n", + " 'index': 0,\n", + " 'finish_reason': 'stop',\n", + " 'usage': {'completion_tokens': 85,\n", + " 'prompt_tokens': 436,\n", + " 'total_tokens': 521}}]}}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" } + ], + "source": [ + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6nhdYK-vHpNM" + }, + "source": [ + "## What's next\n", + "\n", + "🎉 Congratulations! You've built a pipeline with conditional routing! You can now customize the condition for your specific use case and create a custom Haystack 2.0 pipeline to meet your needs.\n", + "\n", + "If you liked this tutorial, there's more to learn about Haystack 2.0:\n", + "- [Creating Your First QA Pipeline with Retrieval-Augmentation](https://haystack.deepset.ai/tutorials/27_first_rag_pipeline)\n", + "- [Model-Based Evaluation of RAG Pipelines](https://haystack.deepset.ai/tutorials/35_model_based_evaluation_of_rag_pipelines)\n", + "\n", + "To stay up to date on the latest Haystack developments, you can [sign up for our newsletter](https://landing.deepset.ai/haystack-community-updates) or [join Haystack discord community](https://discord.gg/haystack).\n", + "\n", + "Thanks for reading!" 
+ ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/tutorials/37_Simplifying_Pipeline_Inputs_with_Multiplexer.ipynb b/tutorials/37_Simplifying_Pipeline_Inputs_with_Multiplexer.ipynb index 1f7f04e4..91552ee8 100644 --- a/tutorials/37_Simplifying_Pipeline_Inputs_with_Multiplexer.ipynb +++ b/tutorials/37_Simplifying_Pipeline_Inputs_with_Multiplexer.ipynb @@ -17,7 +17,7 @@ "\n", "> As of version 2.2.0, `Multiplexer` has been deprecated in Haystack and will be completely removed from Haystack as of v2.4.0. We recommend using [BranchJoiner](https://docs.haystack.deepset.ai/docs/branchjoiner) instead. For more details about this deprecation, check out [Haystack 2.2.0 release notes](https://github.com/deepset-ai/haystack/releases/tag/v2.2.0) on Github.\n", "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro)." + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro)." 
] }, { @@ -56,7 +56,7 @@ "source": [ "### Install Haystack\n", "\n", - "Install Haystack 2.0 with `pip`:" + "Install Haystack with `pip`:" ] }, { diff --git a/tutorials/39_Embedding_Metadata_for_Improved_Retrieval.ipynb b/tutorials/39_Embedding_Metadata_for_Improved_Retrieval.ipynb index c591c1e4..7b48edc0 100644 --- a/tutorials/39_Embedding_Metadata_for_Improved_Retrieval.ipynb +++ b/tutorials/39_Embedding_Metadata_for_Improved_Retrieval.ipynb @@ -12,7 +12,7 @@ "- **Components Used**: [`InMemoryDocumentStore`](https://docs.haystack.deepset.ai/docs/inmemorydocumentstore), [`InMemoryEmbeddingRetriever`](https://docs.haystack.deepset.ai/docs/inmemoryembeddingretriever), [`SentenceTransformersDocumentEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformersdocumentembedder), [`SentenceTransformersTextEmbedder`](https://docs.haystack.deepset.ai/docs/sentencetransformerstextembedder)\n", "- **Goal**: After completing this tutorial, you'll have learned how to embed metadata information while indexing documents, to improve retrieval.\n", "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro).\n", + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro).\n", "\n", "> ⚠️ Note of caution: The method showcased in this tutorial is not always the right approach for all types of metadata. This method works best when the embedded metadata is meaningful. For example, here we're showcasing embedding the \"title\" meta field, which can also provide good context for the embedding model." 
] @@ -45,73 +45,14 @@ "source": [ "### Install Haystack\n", "\n", - "Install Haystack 2.0 and other required packages with `pip`:" + "Install Haystack and other required packages with `pip`:" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: haystack-ai in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (2.0.0b7)\n", - "Collecting wikipedia\n", - " Downloading wikipedia-1.4.0.tar.gz (27 kB)\n", - " Preparing metadata (setup.py): started\n", - " Preparing metadata (setup.py): finished with status 'done'\n", - "Requirement already satisfied: boilerpy3 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (1.0.7)\n", - "Requirement already satisfied: haystack-bm25 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (1.0.2)\n", - "Requirement already satisfied: jinja2 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (3.1.3)\n", - "Requirement already satisfied: lazy-imports in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (0.3.1)\n", - "Requirement already satisfied: more-itertools in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (10.2.0)\n", - "Requirement already satisfied: networkx in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (3.2.1)\n", - "Requirement already satisfied: openai>=1.1.0 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (1.12.0)\n", - "Requirement already satisfied: pandas in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (2.2.0)\n", - "Requirement already satisfied: posthog in 
/Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (3.4.1)\n", - "Requirement already satisfied: pyyaml in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (6.0.1)\n", - "Requirement already satisfied: tenacity in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (8.2.3)\n", - "Requirement already satisfied: tqdm in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (4.66.2)\n", - "Requirement already satisfied: typing-extensions in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-ai) (4.9.0)\n", - "Collecting beautifulsoup4 (from wikipedia)\n", - " Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)\n", - "Requirement already satisfied: requests<3.0.0,>=2.0.0 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from wikipedia) (2.31.0)\n", - "Requirement already satisfied: anyio<5,>=3.5.0 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from openai>=1.1.0->haystack-ai) (4.2.0)\n", - "Requirement already satisfied: distro<2,>=1.7.0 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from openai>=1.1.0->haystack-ai) (1.9.0)\n", - "Requirement already satisfied: httpx<1,>=0.23.0 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from openai>=1.1.0->haystack-ai) (0.26.0)\n", - "Requirement already satisfied: pydantic<3,>=1.9.0 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from openai>=1.1.0->haystack-ai) (1.10.9)\n", - "Requirement already satisfied: sniffio in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from openai>=1.1.0->haystack-ai) (1.3.0)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in 
/Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from requests<3.0.0,>=2.0.0->wikipedia) (3.3.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from requests<3.0.0,>=2.0.0->wikipedia) (3.6)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from requests<3.0.0,>=2.0.0->wikipedia) (2.2.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from requests<3.0.0,>=2.0.0->wikipedia) (2024.2.2)\n", - "Collecting soupsieve>1.2 (from beautifulsoup4->wikipedia)\n", - " Using cached soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)\n", - "Requirement already satisfied: numpy in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from haystack-bm25->haystack-ai) (1.26.4)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from jinja2->haystack-ai) (2.1.5)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from pandas->haystack-ai) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from pandas->haystack-ai) (2024.1)\n", - "Requirement already satisfied: tzdata>=2022.7 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from pandas->haystack-ai) (2024.1)\n", - "Requirement already satisfied: six>=1.5 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from posthog->haystack-ai) (1.16.0)\n", - "Requirement already satisfied: monotonic>=1.5 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from posthog->haystack-ai) (1.6)\n", - "Requirement already satisfied: 
backoff>=1.10.0 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from posthog->haystack-ai) (2.2.1)\n", - "Requirement already satisfied: httpcore==1.* in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (1.0.2)\n", - "Requirement already satisfied: h11<0.15,>=0.13 in /Users/tuanacelik/opt/anaconda3/envs/mistral/lib/python3.12/site-packages (from httpcore==1.*->httpx<1,>=0.23.0->openai>=1.1.0->haystack-ai) (0.14.0)\n", - "Using cached beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)\n", - "Using cached soupsieve-2.5-py3-none-any.whl (36 kB)\n", - "Building wheels for collected packages: wikipedia\n", - " Building wheel for wikipedia (setup.py): started\n", - " Building wheel for wikipedia (setup.py): finished with status 'done'\n", - " Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11678 sha256=17926b00d77f1d294e927f20b0e52e7a137fcd6b219ca85e63570f3b5c7d58f3\n", - " Stored in directory: /Users/tuanacelik/Library/Caches/pip/wheels/63/47/7c/a9688349aa74d228ce0a9023229c6c0ac52ca2a40fe87679b8\n", - "Successfully built wikipedia\n", - "Installing collected packages: soupsieve, beautifulsoup4, wikipedia\n", - "Successfully installed beautifulsoup4-4.12.3 soupsieve-2.5 wikipedia-1.4.0\n" - ] - } - ], + "outputs": [], "source": [ "%%bash\n", "\n", diff --git a/tutorials/40_Building_Chat_Application_with_Function_Calling.ipynb b/tutorials/40_Building_Chat_Application_with_Function_Calling.ipynb index 08d87ba7..1febda65 100644 --- a/tutorials/40_Building_Chat_Application_with_Function_Calling.ipynb +++ b/tutorials/40_Building_Chat_Application_with_Function_Calling.ipynb @@ -14,7 +14,7 @@ "- **Prerequisites**: You must have an [OpenAI API Key](https://platform.openai.com/api-keys) and be familiar with [creating pipelines](https://docs.haystack.deepset.ai/docs/creating-pipelines)\n", "- **Goal**: After completing this tutorial, you will have 
learned how to build chat applications that demonstrate agent-like behavior using OpenAI's function calling feature.\n", "\n", - "> This tutorial uses Haystack 2.0. To learn more, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack 2.0 Documentation](https://docs.haystack.deepset.ai/docs/intro).\n" + "> This tutorial uses the latest version of Haystack 2.x (`haystack-ai`). For more information on Haystack 2.0, read the [Haystack 2.0 announcement](https://haystack.deepset.ai/blog/haystack-2-release) or visit the [Haystack Documentation](https://docs.haystack.deepset.ai/docs/intro).\n" ] }, { @@ -45,7 +45,7 @@ "source": [ "## Setting up the Development Environment\n", "\n", - "Install Haystack 2.0 and [sentence-transformers](https://pypi.org/project/sentence-transformers/) using pip:" + "Install Haystack and [sentence-transformers](https://pypi.org/project/sentence-transformers/) using pip:" ] }, {