From 06dedd6cb2ebeef8515b54acccd556f0e25c5efd Mon Sep 17 00:00:00 2001 From: Matthias Richter Date: Wed, 6 Mar 2024 22:32:44 +0100 Subject: [PATCH] Update to use new retrieve from faiss component (#12) Updates notebook and pipeline to use the new faiss component. --------- Co-authored-by: Robbe Sneyders --- requirements.txt | 2 +- src/pipeline.ipynb | 22 +++++++--------------- src/pipeline.py | 6 +++--- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/requirements.txt b/requirements.txt index b3e4c0c..4bce09f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,2 @@ -fondant==0.10.1 +fondant==0.11.0 notebook==7.0.6 diff --git a/src/pipeline.ipynb b/src/pipeline.ipynb index 682ad01..3e02340 100644 --- a/src/pipeline.ipynb +++ b/src/pipeline.ipynb @@ -19,7 +19,8 @@ "\n", "1. [**Prompt Generation**](components/generate_prompts.py): This component generates a set of seed prompts using a rule-based approach that combines various rooms and styles together, like “a photo of a {room_type} in the style of {style_type}”. As input, it takes in a list of room types (bedroom, kitchen, laundry room, ..), a list of room styles (contemporary, minimalist, art deco, ...) and a list of prefixes (comfortable, luxurious, simple). These lists can be easily adapted to other domains. The output of this component is a list of seed prompts.\n", "\n", - "2. [**Image URL Retrieval**](https://github.com/ml6team/fondant/tree/main/components/prompt_based_laion_retrieval): This component retrieves images from the [LAION-5B](https://laion.ai/blog/laion-5b/) dataset based on the seed prompts. The retrieval itself is done based on CLIP embeddings similarity between the prompt sentences and the captions in the LAION dataset. This component doesn’t return the actual images yet, only the URLs. The next component in the pipeline will then download these images.\n", + "2. 
[**Image URL Retrieval**](https://fondant.ai/en/latest/components/hub/#retrieve_from_faiss_by_prompt): This component retrieves images from an image dataset based on the seed prompts. The retrieval itself is done based on CLIP embeddings similarity between the prompt sentences and the captions in the image dataset. This component doesn’t return the actual images yet, only the URLs. The next component in the pipeline will then download these images.\n", +    "The LAION dataset is currently unavailable. Therefore, we have published a similar dataset based on the [DataComp 12M dataset](https://www.datacomp.ai/).\n", "\n", "3. [**Download Images**](https://github.com/ml6team/fondant/tree/main/components/download_images): This component downloads the actual images based on the URLs retrieved by the previous component. It takes in the URLs as input and returns the actual images, along with some metadata (like their height and width).\n", "\n", @@ -139,9 +140,7 @@ "\n", "from fondant.pipeline import Pipeline, Resources\n", "\n", - "BASE_PATH = \"./data_dir\"\n", - "Path(BASE_PATH).mkdir(parents=True, exist_ok=True)\n", - "\n", + "BASE_PATH = \"./artifacts\"\n", "pipeline = Pipeline(\n", " name=\"controlnet-pipeline\",\n", " description=\"Pipeline that collects data to train ControlNet\",\n", @@ -330,13 +329,6 @@ "We will use components available on the [Fondant Hub](https://fondant.ai/en/latest/components/hub/), we chain the components together by calling `.apply()` on the previous component." 
] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "*NOTE: The `retrieve_laion_by_prompt` component uses a public CLIP service which can only handle a few requets at a time, if you run into [timeout issues](https://github.com/rom1504/clip-retrieval/issues/267), you might want to host your own clip service following this [guide](https://github.com/rom1504/clip-retrieval/blob/main/docs/laion5B_h14_back.md)*" - ] - }, { "cell_type": "code", "execution_count": null, @@ -344,11 +336,11 @@ "outputs": [], "source": [ "image_urls = prompts.apply(\n", - " \"retrieve_laion_by_prompt\",\n", + " \"retrieve_from_faiss_by_prompt\",\n", " arguments={\n", - " \"num_images\": 2,\n", - " \"aesthetic_score\": 9,\n", - " \"aesthetic_weight\": 0.5,\n", + " \"url_mapping_path\":\"hf://datasets/fondant-ai/datacomp-small-clip/id_mapping\",\n", + " \"faiss_index_path\":\"hf://datasets/fondant-ai/datacomp-small-clip/faiss\",\n", + " \"num_images\": 2\n", " },\n", ")\n", "\n", diff --git a/src/pipeline.py b/src/pipeline.py index 2161dbb..3f7a919 100644 --- a/src/pipeline.py +++ b/src/pipeline.py @@ -26,11 +26,11 @@ ) image_urls = prompts.apply( - "retrieve_laion_by_prompt", + "retrieve_from_faiss_by_prompt", arguments={ + "url_mapping_path": "hf://datasets/fondant-ai/datacomp-small-clip/id_mapping", + "faiss_index_path": "hf://datasets/fondant-ai/datacomp-small-clip/faiss", "num_images": 2, - "aesthetic_score": 9, - "aesthetic_weight": 0.5, }, )