From 6001589c7205f1d56b5952e91b69bb108a4a5c6e Mon Sep 17 00:00:00 2001
From: Tibor Mach <56956998+tibor-mach@users.noreply.github.com>
Date: Wed, 18 Sep 2024 09:09:21 +0200
Subject: [PATCH] add embeddings/gen example (#362)

---
 .../unstructured-embeddings-gen.py            | 76 +++++++++++++++++++
 ...ed-text.py => unstructured-summary-map.py} | 10 ++-
 examples/multimodal/hf_pipeline.py            |  8 +-
 examples/multimodal/openai_image_desc_lib.py  |  2 -
 pyproject.toml                                |  5 +-
 5 files changed, 93 insertions(+), 8 deletions(-)
 create mode 100644 examples/llm_and_nlp/unstructured-embeddings-gen.py
 rename examples/llm_and_nlp/{unstructured-text.py => unstructured-summary-map.py} (95%)

diff --git a/examples/llm_and_nlp/unstructured-embeddings-gen.py b/examples/llm_and_nlp/unstructured-embeddings-gen.py
new file mode 100644
index 000000000..0c0dc5f0e
--- /dev/null
+++ b/examples/llm_and_nlp/unstructured-embeddings-gen.py
@@ -0,0 +1,76 @@
+"""
+To install the required dependencies:
+
+    pip install datachain[examples]
+
+"""
+
+from collections.abc import Iterator
+
+from unstructured.cleaners.core import (
+    clean,
+    group_broken_paragraphs,
+    replace_unicode_quotes,
+)
+from unstructured.embed.huggingface import (
+    HuggingFaceEmbeddingConfig,
+    HuggingFaceEmbeddingEncoder,
+)
+from unstructured.partition.pdf import partition_pdf
+
+from datachain import C, DataChain, DataModel, File
+
+source = "gs://datachain-demo/neurips/1987/"
+
+
+# Define the output as a DataModel class
+class Chunk(DataModel):
+    key: str
+    text: str
+    embeddings: list[float]
+
+
+# Define the embedding encoder
+
+embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
+
+
+# Use signatures to define UDF input/output;
+# these can be Pydantic models or regular Python types
+def process_pdf(file: File) -> Iterator[Chunk]:
+    # Ingest the file
+    with file.open() as f:
+        chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
+
+    # Clean the chunks and add new columns
+    for chunk in chunks:
+        chunk.apply(
+            lambda text: clean(
+                text, bullets=True, extra_whitespace=True, trailing_punctuation=True
+            )
+        )
+        chunk.apply(replace_unicode_quotes)
+        chunk.apply(group_broken_paragraphs)
+
+    # Create the embeddings
+    chunks_embedded = embedding_encoder.embed_documents(chunks)
+
+    # Add new rows to DataChain
+    for chunk in chunks_embedded:
+        yield Chunk(
+            key=file.path,
+            text=chunk.text,
+            embeddings=chunk.embeddings,
+        )
+
+
+dc = (
+    DataChain.from_storage(source)
+    .settings(parallel=-1)
+    .filter(C.file.path.glob("*.pdf"))
+    .gen(document=process_pdf)
+)
+
+dc.save("embedded-documents")
+
+DataChain.from_dataset("embedded-documents").show()
diff --git a/examples/llm_and_nlp/unstructured-text.py b/examples/llm_and_nlp/unstructured-summary-map.py
similarity index 95%
rename from examples/llm_and_nlp/unstructured-text.py
rename to examples/llm_and_nlp/unstructured-summary-map.py
index 1e3f0ac2b..c765b6255 100644
--- a/examples/llm_and_nlp/unstructured-text.py
+++ b/examples/llm_and_nlp/unstructured-summary-map.py
@@ -1,6 +1,10 @@
-#
-# pip install unstructured[pdf] huggingface_hub[hf_transfer]
-#
+"""
+To install the required dependencies:
+
+    pip install datachain[examples]
+
+"""
+
 import os
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
diff --git a/examples/multimodal/hf_pipeline.py b/examples/multimodal/hf_pipeline.py
index dc5dc0a77..2df499bcb 100644
--- a/examples/multimodal/hf_pipeline.py
+++ b/examples/multimodal/hf_pipeline.py
@@ -1,4 +1,10 @@
-# pip install scipy torch transformers huggingface_hub[hf_transfer]
+"""
+To install the required dependencies:
+
+    pip install datachain[examples]
+
+"""
+
 # NOTE: also need to install ffmpeg binary
 import json
 import os
diff --git a/examples/multimodal/openai_image_desc_lib.py b/examples/multimodal/openai_image_desc_lib.py
index cdd4f2400..611ec98a5 100644
--- a/examples/multimodal/openai_image_desc_lib.py
+++ b/examples/multimodal/openai_image_desc_lib.py
@@ -1,5 +1,3 @@
-# pip install Pillow
-
 import base64
 import os
 
diff --git a/pyproject.toml b/pyproject.toml
index 222ddb0b7..46a0fb783 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -104,9 +104,10 @@ examples = [
     "numpy>=1,<2",
     "defusedxml",
     "accelerate",
-    "unstructured[pdf]",
+    "unstructured[pdf, embed-huggingface]",
     "pdfplumber==0.11.4",
-    "huggingface_hub[hf_transfer]"
+    "huggingface_hub[hf_transfer]",
+    "onnx==1.16.1"
 ]
 
 [project.urls]
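
Follow-up usage note: a minimal sketch of how the saved "embedded-documents" dataset could be queried for chunks similar to a free-text query. It reuses the HuggingFace encoder from the new example; the embed_query() call, the to_pandas() export, and the flattened column names "document.text" / "document.embeddings" are assumptions about the unstructured and datachain APIs rather than something this patch adds.

# Usage sketch (assumptions noted above): nearest-neighbour lookup over the
# dataset written by unstructured-embeddings-gen.py.
import numpy as np

from datachain import DataChain
from unstructured.embed.huggingface import (
    HuggingFaceEmbeddingConfig,
    HuggingFaceEmbeddingEncoder,
)

encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())

# Pull the embedded chunks into a DataFrame (column names assumed)
df = DataChain.from_dataset("embedded-documents").to_pandas()

# Embed an illustrative query with the same model used for the chunks
query_vec = np.asarray(encoder.embed_query("stochastic gradient descent"))
doc_vecs = np.stack(df["document.embeddings"].to_numpy())

# Cosine similarity between the query and every stored chunk
scores = doc_vecs @ query_vec / (
    np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(query_vec)
)

# Show the five most similar chunks
for i in np.argsort(scores)[::-1][:5]:
    print(f"{scores[i]:.3f}  {df['document.text'].iloc[i][:80]}")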