Skip to content

Commit

Permalink
Improve notebook (#44)
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilippeMoussalli authored Jan 8, 2024
1 parent 2a282c6 commit 02259e5
Show file tree
Hide file tree
Showing 7 changed files with 123 additions and 34 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ informed choices on which parameters to try.
> ⚠️ **Prerequisites:**
>
> - A Python version between 3.8 and 3.10 installed on your system.
> - Docker and docker compose installed and configured on your system.
> - Docker and docker compose installed and configured on your system. More info [here](https://fondant.ai/en/latest/guides/installation/#docker-installation).
> - A GPU is recommended to run the model-based components of the pipeline.
### Cloning the repository
Expand Down Expand Up @@ -70,5 +70,5 @@ fondant --help

There are two options to run the pipeline:

- [Via python files and the Fondant CLI](./src/README.md): how you should run Fondant in production
- [Via a Jupyter notebook](./src/pipeline.ipynb): ideal to learn about Fondant
- [**Via Python files and the Fondant CLI**](https://fondant.ai/en/latest/pipeline/#running-a-pipeline): how you should run Fondant in production
- [**Via a Jupyter notebook**](./src/pipeline.ipynb): ideal to learn about Fondant
69 changes: 56 additions & 13 deletions src/evaluation.ipynb

Large diffs are not rendered by default.

6 changes: 2 additions & 4 deletions src/parameter_search.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -405,9 +405,7 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"metadata": {},
"outputs": [],
"source": [
"# all results\n",
Expand Down Expand Up @@ -583,7 +581,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
"version": "3.10.12"
}
},
"nbformat": 4,
Expand Down
56 changes: 45 additions & 11 deletions src/pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
" a Weaviate database.\n",
"\n",
"## Environment\n",
"### This section checks the prerequisites of your environment. Read any errors or warnings carefully.\n",
"### This section checks the prerequisites of your environment. Read any errors or warnings carefully. \n",
"\n",
"**Ensure a Python between version 3.8 and 3.10 is available**"
]
Expand Down Expand Up @@ -69,6 +69,33 @@
"!docker info >/dev/null"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Check if GPU is available**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import subprocess\n",
"\n",
"try:\n",
" subprocess.check_output('nvidia-smi')\n",
" logging.info(\"Found GPU, using it!\")\n",
" number_of_accelerators = 1\n",
" accelerator_name = \"GPU\"\n",
"except Exception:\n",
" logging.warning(\"We recommend to run this pipeline on a GPU, but none could be found, using CPU instead\")\n",
" number_of_accelerators = None\n",
" accelerator_name = None"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down Expand Up @@ -101,7 +128,7 @@
"outputs": [],
"source": [
"from pathlib import Path\n",
"from fondant.pipeline import Pipeline\n",
"from fondant.pipeline import Pipeline, Resources\n",
"\n",
"BASE_PATH = \"./data-dir\"\n",
"Path(BASE_PATH).mkdir(parents=True, exist_ok=True)\n",
Expand Down Expand Up @@ -133,7 +160,6 @@
" arguments={\n",
" # Add arguments\n",
" \"dataset_name\": \"wikitext@~parquet\",\n",
" \"column_name_mapping\": {\"text\": \"text\"},\n",
" \"n_rows_to_load\": 1000,\n",
" },\n",
" produces={\n",
Expand All @@ -155,6 +181,9 @@
"metadata": {},
"outputs": [],
"source": [
"import utils\n",
"\n",
"\n",
"chunks = text.apply(\n",
" \"chunk_text\",\n",
" arguments={\n",
Expand All @@ -167,16 +196,22 @@
" \"embed_text\",\n",
" arguments={\n",
" \"model_provider\": \"huggingface\",\n",
" \"model\": \"all-MiniLM-L6-v2\",\n",
" }\n",
" \"model\": \"all-MiniLM-L6-v2\"\n",
" },\n",
" resources=Resources(\n",
" accelerator_number=number_of_accelerators,\n",
" accelerator_name=accelerator_name,\n",
" ),\n",
" cluster_type=\"local\"\n",
")\n",
"\n",
"embeddings.write(\n",
" \"index_weaviate\",\n",
" arguments={\n",
" \"weaviate_url\": \"http://host.docker.internal:8080\",\n",
" \"weaviate_url\": f\"http://{utils.get_host_ip()}:8081\",\n",
" \"class_name\": \"index\",\n",
" }\n",
" },\n",
" cache=False\n",
")"
]
},
Expand Down Expand Up @@ -217,7 +252,7 @@
"metadata": {},
"outputs": [],
"source": [
"!docker compose -f weaviate/docker-compose.yaml up --detach"
"!docker compose -f weaviate/docker-compose.yaml up --detach --quiet-pull"
]
},
{
Expand Down Expand Up @@ -431,7 +466,6 @@
" \"load_from_hf_hub\",\n",
" arguments={\n",
" \"dataset_name\": \"wikitext@~parquet\",\n",
" \"column_name_mapping\": {\"text\": \"text\"},\n",
" \"n_rows_to_load\": 1000,\n",
" },\n",
" produces={\n",
Expand Down Expand Up @@ -462,7 +496,7 @@
"embeddings.write(\n",
" \"index_weaviate\",\n",
" arguments={\n",
" \"weaviate_url\": \"http://host.docker.internal:8080\",\n",
" \"weaviate_url\": f\"http://{utils.get_host_ip()}:8081\",\n",
" \"class_name\": \"index\",\n",
" },\n",
")"
Expand All @@ -488,7 +522,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"If you check the logs, you will see th\n",
"If you check the logs, you will see that components executed with the same parameters which enables faster pipeline iteration.\n",
"\n",
"If you restart the Explorer, you'll see that you can now select a second pipeline and inspect your new dataset."
]
Expand Down
9 changes: 8 additions & 1 deletion src/pipeline_eval.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Fondant pipeline to evaluate a RAG pipeline."""

import pyarrow as pa
from fondant.pipeline import Pipeline
from fondant.pipeline import Pipeline, Resources


def create_pipeline(
Expand All @@ -19,6 +19,8 @@ def create_pipeline(
evaluation_llm: str = "OpenAI",
evaluation_llm_kwargs: dict = {"model_name": "gpt-3.5-turbo"},
evaluation_metrics: list = ["context_precision", "context_relevancy"],
number_of_accelerators=None,
accelerator_name=None,
):
"""Create a Fondant pipeline based on the provided arguments."""
evaluation_pipeline = Pipeline(
Expand Down Expand Up @@ -48,6 +50,11 @@ def create_pipeline(
consumes={
"text": "question",
},
resources=Resources(
accelerator_number=number_of_accelerators,
accelerator_name=accelerator_name,
),
cluster_type="local",
)

retrieve_chunks = embed_text_op.apply(
Expand Down
9 changes: 8 additions & 1 deletion src/pipeline_index.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Fondant pipeline to index a RAG system."""
import pyarrow as pa
from fondant.pipeline import Pipeline
from fondant.pipeline import Pipeline, Resources


def create_pipeline(
Expand All @@ -15,6 +15,8 @@ def create_pipeline(
embed_api_key: dict = {},
chunk_size: int = 512,
chunk_overlap: int = 32,
number_of_accelerators=None,
accelerator_name=None,
):
"""Create a Fondant pipeline based on the provided arguments."""
indexing_pipeline = Pipeline(
Expand Down Expand Up @@ -50,6 +52,11 @@ def create_pipeline(
"model": embed_model,
"api_keys": embed_api_key,
},
resources=Resources(
accelerator_number=number_of_accelerators,
accelerator_name=accelerator_name,
),
cluster_type="local",
)

embeddings.write(
Expand Down
2 changes: 1 addition & 1 deletion src/weaviate/docker-compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ services:
weaviate:
image: semitechnologies/weaviate:1.20.5
ports:
- 8080:8080
- 8081:8080
environment:
CONTEXTIONARY_URL: contextionary:9999
QUERY_DEFAULTS_LIMIT: 25
Expand Down

0 comments on commit 02259e5

Please sign in to comment.