Improve notebook (#44)

ml6team · Jan 8, 2024 · 02259e5 · 02259e5
1 parent 2a282c6
commit 02259e5
Show file tree

Hide file tree

Showing 7 changed files with 123 additions and 34 deletions.
diff --git a/README.md b/README.md
@@ -37,7 +37,7 @@ informed choices on which parameters to try.
 > ⚠️ **Prerequisites:**
 >
 > - A Python version between 3.8 and 3.10 installed on your system.
-> - Docker and docker compose installed and configured on your system.
+> - Docker and docker compose installed and configured on your system. More info [here](https://fondant.ai/en/latest/guides/installation/#docker-installation).
 > - A GPU is recommended to run the model-based components of the pipeline.
 
 ### Cloning the repository
@@ -70,5 +70,5 @@ fondant --help
 
 There are two options to run the pipeline:
 
-- [Via python files and the Fondant CLI](./src/README.md): how you should run Fondant in production
-- [Via a Jupyter notebook](./src/pipeline.ipynb): ideal to learn about Fondant
+- [**Via Python files and the Fondant CLI**](https://fondant.ai/en/latest/pipeline/#running-a-pipeline): how you should run Fondant in production
+- [**Via a Jupyter notebook**](./src/pipeline.ipynb): ideal to learn about Fondant
diff --git a/src/evaluation.ipynb b/src/evaluation.ipynb
diff --git a/src/parameter_search.ipynb b/src/parameter_search.ipynb
@@ -405,9 +405,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "scrolled": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# all results\n",
@@ -583,7 +581,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,

diff --git a/src/pipeline.ipynb b/src/pipeline.ipynb
@@ -36,7 +36,7 @@
     "  a Weaviate database.\n",
     "\n",
     "## Environment\n",
-    "### This section checks the prerequisites of your environment. Read any errors or warnings carefully.\n",
+    "### This section checks the prerequisites of your environment. Read any errors or warnings carefully. \n",
     "\n",
     "**Ensure a Python version between 3.8 and 3.10 is available**"
    ]
@@ -69,6 +69,33 @@
     "!docker info >/dev/null"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Check if GPU is available**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "import subprocess\n",
+    "\n",
+    "try:\n",
+    "    subprocess.check_output('nvidia-smi')\n",
+    "    logging.info(\"Found GPU, using it!\")\n",
+    "    number_of_accelerators = 1\n",
+    "    accelerator_name = \"GPU\"\n",
+    "except Exception:\n",
+    "    logging.warning(\"We recommend to run this pipeline on a GPU, but none could be found, using CPU instead\")\n",
+    "    number_of_accelerators = None\n",
+    "    accelerator_name = None"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -101,7 +128,7 @@
    "outputs": [],
    "source": [
     "from pathlib import Path\n",
-    "from fondant.pipeline import Pipeline\n",
+    "from fondant.pipeline import Pipeline, Resources\n",
     "\n",
     "BASE_PATH = \"./data-dir\"\n",
     "Path(BASE_PATH).mkdir(parents=True, exist_ok=True)\n",
@@ -133,7 +160,6 @@
     "    arguments={\n",
     "        # Add arguments\n",
     "        \"dataset_name\": \"wikitext@~parquet\",\n",
-    "        \"column_name_mapping\": {\"text\": \"text\"},\n",
     "        \"n_rows_to_load\": 1000,\n",
     "    },\n",
     "    produces={\n",
@@ -155,6 +181,9 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import utils\n",
+    "\n",
+    "\n",
     "chunks = text.apply(\n",
     "    \"chunk_text\",\n",
     "    arguments={\n",
@@ -167,16 +196,22 @@
     "    \"embed_text\",\n",
     "    arguments={\n",
     "        \"model_provider\": \"huggingface\",\n",
-    "        \"model\": \"all-MiniLM-L6-v2\",\n",
-    "    }\n",
+    "        \"model\": \"all-MiniLM-L6-v2\"\n",
+    "    },\n",
+    "    resources=Resources(\n",
+    "        accelerator_number=number_of_accelerators,\n",
+    "        accelerator_name=accelerator_name,\n",
+    "    ),\n",
+    "    cluster_type=\"local\"\n",
     ")\n",
     "\n",
     "embeddings.write(\n",
     "    \"index_weaviate\",\n",
     "    arguments={\n",
-    "        \"weaviate_url\": \"http://host.docker.internal:8080\",\n",
+    "        \"weaviate_url\": f\"http://{utils.get_host_ip()}:8081\",\n",
     "        \"class_name\": \"index\",\n",
-    "    }\n",
+    "    },\n",
+    "    cache=False\n",
     ")"
    ]
   },
@@ -217,7 +252,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!docker compose -f weaviate/docker-compose.yaml up --detach"
+    "!docker compose -f weaviate/docker-compose.yaml up --detach --quiet-pull"
    ]
   },
   {
@@ -431,7 +466,6 @@
     "    \"load_from_hf_hub\",\n",
     "    arguments={\n",
     "        \"dataset_name\": \"wikitext@~parquet\",\n",
-    "        \"column_name_mapping\": {\"text\": \"text\"},\n",
     "        \"n_rows_to_load\": 1000,\n",
     "    },\n",
     "    produces={\n",
@@ -462,7 +496,7 @@
     "embeddings.write(\n",
     "    \"index_weaviate\",\n",
     "    arguments={\n",
-    "        \"weaviate_url\": \"http://host.docker.internal:8080\",\n",
+    "        \"weaviate_url\": f\"http://{utils.get_host_ip()}:8081\",\n",
     "        \"class_name\": \"index\",\n",
     "    },\n",
     ")"
@@ -488,7 +522,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "If you check the logs, you will see th\n",
+    "If you check the logs, you will see that components executed with the same parameters are cached, which enables faster pipeline iteration.\n",
     "\n",
     "If you restart the Explorer, you'll see that you can now select a second pipeline and inspect your new dataset."
    ]

diff --git a/src/pipeline_eval.py b/src/pipeline_eval.py
@@ -1,7 +1,7 @@
 """Fondant pipeline to evaluate a RAG pipeline."""
 
 import pyarrow as pa
-from fondant.pipeline import Pipeline
+from fondant.pipeline import Pipeline, Resources
 
 
 def create_pipeline(
@@ -19,6 +19,8 @@ def create_pipeline(
     evaluation_llm: str = "OpenAI",
     evaluation_llm_kwargs: dict = {"model_name": "gpt-3.5-turbo"},
     evaluation_metrics: list = ["context_precision", "context_relevancy"],
+    number_of_accelerators=None,
+    accelerator_name=None,
 ):
     """Create a Fondant pipeline based on the provided arguments."""
     evaluation_pipeline = Pipeline(
@@ -48,6 +50,11 @@ def create_pipeline(
         consumes={
             "text": "question",
         },
+        resources=Resources(
+            accelerator_number=number_of_accelerators,
+            accelerator_name=accelerator_name,
+        ),
+        cluster_type="local",
     )
 
     retrieve_chunks = embed_text_op.apply(

diff --git a/src/pipeline_index.py b/src/pipeline_index.py
@@ -1,6 +1,6 @@
 """Fondant pipeline to index a RAG system."""
 import pyarrow as pa
-from fondant.pipeline import Pipeline
+from fondant.pipeline import Pipeline, Resources
 
 
 def create_pipeline(
@@ -15,6 +15,8 @@ def create_pipeline(
     embed_api_key: dict = {},
     chunk_size: int = 512,
     chunk_overlap: int = 32,
+    number_of_accelerators=None,
+    accelerator_name=None,
 ):
     """Create a Fondant pipeline based on the provided arguments."""
     indexing_pipeline = Pipeline(
@@ -50,6 +52,11 @@ def create_pipeline(
             "model": embed_model,
             "api_keys": embed_api_key,
         },
+        resources=Resources(
+            accelerator_number=number_of_accelerators,
+            accelerator_name=accelerator_name,
+        ),
+        cluster_type="local",
     )
 
     embeddings.write(

diff --git a/src/weaviate/docker-compose.yaml b/src/weaviate/docker-compose.yaml
@@ -3,7 +3,7 @@ services:
   weaviate:
     image: semitechnologies/weaviate:1.20.5
     ports:
-      - 8080:8080
+      - 8081:8080
     environment:
       CONTEXTIONARY_URL: contextionary:9999
       QUERY_DEFAULTS_LIMIT: 25