From 6001589c7205f1d56b5952e91b69bb108a4a5c6e Mon Sep 17 00:00:00 2001
From: Tibor Mach <56956998+tibor-mach@users.noreply.github.com>
Date: Wed, 18 Sep 2024 09:09:21 +0200
Subject: [PATCH] add embeddings/gen example (#362)

---
 .../unstructured-embeddings-gen.py            | 76 +++++++++++++++++++
 ...ed-text.py => unstructured-summary-map.py} | 10 ++-
 examples/multimodal/hf_pipeline.py            |  8 +-
 examples/multimodal/openai_image_desc_lib.py  |  2 -
 pyproject.toml                                |  5 +-
 5 files changed, 93 insertions(+), 8 deletions(-)
 create mode 100644 examples/llm_and_nlp/unstructured-embeddings-gen.py
 rename examples/llm_and_nlp/{unstructured-text.py => unstructured-summary-map.py} (95%)

diff --git a/examples/llm_and_nlp/unstructured-embeddings-gen.py b/examples/llm_and_nlp/unstructured-embeddings-gen.py
new file mode 100644
index 000000000..0c0dc5f0e
--- /dev/null
+++ b/examples/llm_and_nlp/unstructured-embeddings-gen.py
@@ -0,0 +1,76 @@
+"""
+To install the required dependencies:
+
+    pip install datachain[examples]
+
+"""
+
+from collections.abc import Iterator
+
+from unstructured.cleaners.core import (
+    clean,
+    group_broken_paragraphs,
+    replace_unicode_quotes,
+)
+from unstructured.embed.huggingface import (
+    HuggingFaceEmbeddingConfig,
+    HuggingFaceEmbeddingEncoder,
+)
+from unstructured.partition.pdf import partition_pdf
+
+from datachain import C, DataChain, DataModel, File
+
+source = "gs://datachain-demo/neurips/1987/"
+
+
+# Define the output as a DataModel class
+class Chunk(DataModel):
+    key: str
+    text: str
+    embeddings: list[float]
+
+
+# Define the embedding encoder
+
+embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())
+
+
+# Use signatures to define UDF input/output;
+# these can be Pydantic models or regular Python types
+def process_pdf(file: File) -> Iterator[Chunk]:
+    # Ingest the file
+    with file.open() as f:
+        chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
+
+    # Clean the chunks and add new columns
+    for chunk in chunks:
+        chunk.apply(
+            lambda text: clean(
+                text, bullets=True, extra_whitespace=True, trailing_punctuation=True
+            )
+        )
+        chunk.apply(replace_unicode_quotes)
+        chunk.apply(group_broken_paragraphs)
+
+    # Create the embeddings
+    chunks_embedded = embedding_encoder.embed_documents(chunks)
+
+    # Add new rows to DataChain
+    for chunk in chunks_embedded:
+        yield Chunk(
+            key=file.path,
+            text=chunk.text,
+            embeddings=chunk.embeddings,
+        )
+
+
+dc = (
+    DataChain.from_storage(source)
+    .settings(parallel=-1)
+    .filter(C.file.path.glob("*.pdf"))
+    .gen(document=process_pdf)
+)
+
+dc.save("embedded-documents")
+
+DataChain.from_dataset("embedded-documents").show()
diff --git a/examples/llm_and_nlp/unstructured-text.py b/examples/llm_and_nlp/unstructured-summary-map.py
similarity index 95%
rename from examples/llm_and_nlp/unstructured-text.py
rename to examples/llm_and_nlp/unstructured-summary-map.py
index 1e3f0ac2b..c765b6255 100644
--- a/examples/llm_and_nlp/unstructured-text.py
+++ b/examples/llm_and_nlp/unstructured-summary-map.py
@@ -1,6 +1,10 @@
-#
-# pip install unstructured[pdf] huggingface_hub[hf_transfer]
-#
+"""
+To install the required dependencies:
+
+    pip install datachain[examples]
+
+"""
+
 import os
 
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
diff --git a/examples/multimodal/hf_pipeline.py b/examples/multimodal/hf_pipeline.py
index dc5dc0a77..2df499bcb 100644
--- a/examples/multimodal/hf_pipeline.py
+++ b/examples/multimodal/hf_pipeline.py
@@ -1,4 +1,10 @@
-# pip install scipy torch transformers huggingface_hub[hf_transfer]
+"""
+To install the required dependencies:
+
+    pip install datachain[examples]
+
+"""
+
 # NOTE: also need to install ffmpeg binary
 import json
 import os
diff --git a/examples/multimodal/openai_image_desc_lib.py b/examples/multimodal/openai_image_desc_lib.py
index cdd4f2400..611ec98a5 100644
--- a/examples/multimodal/openai_image_desc_lib.py
+++ b/examples/multimodal/openai_image_desc_lib.py
@@ -1,5 +1,3 @@
-# pip install Pillow
-
 import base64
 import os
 
diff --git a/pyproject.toml b/pyproject.toml
index 222ddb0b7..46a0fb783 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -104,9 +104,10 @@ examples = [
     "numpy>=1,<2",
     "defusedxml",
     "accelerate",
-    "unstructured[pdf]",
+    "unstructured[pdf, embed-huggingface]",
     "pdfplumber==0.11.4",
-    "huggingface_hub[hf_transfer]"
+    "huggingface_hub[hf_transfer]",
+    "onnx==1.16.1"
 ]
 
 [project.urls]
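
Follow-up usage note: a minimal sketch of how the saved "embedded-documents" dataset could be queried for chunks similar to a free-text query. It reuses the HuggingFace encoder from the new example; the embed_query() call, the to_pandas() export, and the flattened column names "document.text" / "document.embeddings" are assumptions about the unstructured and datachain APIs rather than something this patch adds.

# Usage sketch (assumptions noted above): nearest-neighbour lookup over the
# dataset written by unstructured-embeddings-gen.py.
import numpy as np

from datachain import DataChain
from unstructured.embed.huggingface import (
    HuggingFaceEmbeddingConfig,
    HuggingFaceEmbeddingEncoder,
)

encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())

# Pull the embedded chunks into a DataFrame (column names assumed)
df = DataChain.from_dataset("embedded-documents").to_pandas()

# Embed an illustrative query with the same model used for the chunks
query_vec = np.asarray(encoder.embed_query("stochastic gradient descent"))
doc_vecs = np.stack(df["document.embeddings"].to_numpy())

# Cosine similarity between the query and every stored chunk
scores = doc_vecs @ query_vec / (
    np.linalg.norm(doc_vecs, axis=1) * np.linalg.norm(query_vec)
)

# Show the five most similar chunks
for i in np.argsort(scores)[::-1][:5]:
    print(f"{scores[i]:.3f}  {df['document.text'].iloc[i][:80]}")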