Skip to content

Commit

Permalink
add embeddings/gen example (#362)
Browse files Browse the repository at this point in the history
  • Loading branch information
tibor-mach authored Sep 18, 2024
1 parent 80f4fbe commit 6001589
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 8 deletions.
76 changes: 76 additions & 0 deletions examples/llm_and_nlp/unstructured-embeddings-gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""
To install the required dependencies:
pip install datachain[examples]
"""

from collections.abc import Iterator

from unstructured.cleaners.core import (
clean,
group_broken_paragraphs,
replace_unicode_quotes,
)
from unstructured.embed.huggingface import (
HuggingFaceEmbeddingConfig,
HuggingFaceEmbeddingEncoder,
)
from unstructured.partition.pdf import partition_pdf

from datachain import C, DataChain, DataModel, File

# Storage URI of the input documents (public demo bucket of PDFs).
source = "gs://datachain-demo/neurips/1987/"


# Define the output as a DataModel class
class Chunk(DataModel):
    # Path of the source file this chunk was extracted from.
    key: str
    # Cleaned text content of the chunk.
    text: str
    # Embedding vector computed for `text` by the HuggingFace encoder.
    embeddings: list[float]


# Embedding encoder shared by all UDF calls; default HuggingFace config
# (model choice is whatever HuggingFaceEmbeddingConfig defaults to).
embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())


# Use signatures to define UDF input/output
# these can be pydantic model or regular Python types
def process_pdf(file: File) -> Iterator[Chunk]:
    """Partition a PDF into cleaned text chunks and yield them with embeddings.

    Yields one ``Chunk`` per partitioned element; ``key`` is the source
    file path shared by all chunks of the same document.
    """
    # Partition the PDF into elements, grouping them by title sections.
    with file.open() as stream:
        elements = partition_pdf(
            file=stream, chunking_strategy="by_title", strategy="fast"
        )

    # Normalize each element's text in place: drop bullets, collapse
    # whitespace, strip trailing punctuation, fix quotes, rejoin paragraphs.
    for element in elements:
        element.apply(
            lambda text: clean(
                text, bullets=True, extra_whitespace=True, trailing_punctuation=True
            )
        )
        element.apply(replace_unicode_quotes)
        element.apply(group_broken_paragraphs)

    # Embed all cleaned elements in one batch, then emit one row per element.
    for embedded in embedding_encoder.embed_documents(elements):
        yield Chunk(
            key=file.path,
            text=embedded.text,
            embeddings=embedded.embeddings,
        )


# Build the pipeline: list the bucket, keep only PDFs, and expand each
# file into Chunk rows via the process_pdf generator UDF.
dc = (
    DataChain.from_storage(source)
    # NOTE(review): parallel=-1 presumably means "use all workers" —
    # confirm against datachain's settings() documentation.
    .settings(parallel=-1)
    .filter(C.file.path.glob("*.pdf"))
    .gen(document=process_pdf)
)

# Persist the results as a named dataset.
dc.save("embedded-documents")

# Re-load the saved dataset and print it.
DataChain.from_dataset("embedded-documents").show()
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#
# pip install unstructured[pdf] huggingface_hub[hf_transfer]
#
"""
To install the required dependencies:
pip install datachain[examples]
"""

import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
Expand Down
8 changes: 7 additions & 1 deletion examples/multimodal/hf_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
# pip install scipy torch transformers huggingface_hub[hf_transfer]
"""
To install the required dependencies:
pip install datachain[examples]
"""

# NOTE: also need to install ffmpeg binary
import json
import os
Expand Down
2 changes: 0 additions & 2 deletions examples/multimodal/openai_image_desc_lib.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# pip install Pillow

import base64
import os

Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,10 @@ examples = [
"numpy>=1,<2",
"defusedxml",
"accelerate",
"unstructured[pdf]",
"unstructured[pdf, embed-huggingface]",
"pdfplumber==0.11.4",
"huggingface_hub[hf_transfer]"
"huggingface_hub[hf_transfer]",
"onnx==1.16.1"
]

[project.urls]
Expand Down

0 comments on commit 6001589

Please sign in to comment.