Skip to content

Commit

Permalink
add embeddings/gen example (#362)
Browse files Browse the repository at this point in the history
  • Loading branch information
tibor-mach authored Sep 18, 2024
1 parent 80f4fbe commit 6001589
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 8 deletions.
76 changes: 76 additions & 0 deletions examples/llm_and_nlp/unstructured-embeddings-gen.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
"""
To install the required dependencies:
pip install datachain[examples]
"""

from collections.abc import Iterator

from unstructured.cleaners.core import (
clean,
group_broken_paragraphs,
replace_unicode_quotes,
)
from unstructured.embed.huggingface import (
HuggingFaceEmbeddingConfig,
HuggingFaceEmbeddingEncoder,
)
from unstructured.partition.pdf import partition_pdf

from datachain import C, DataChain, DataModel, File

# Storage URI of the input documents (public demo bucket of PDFs).
source = "gs://datachain-demo/neurips/1987/"


# Define the output as a DataModel class
class Chunk(DataModel):
    # Path of the source file this chunk was extracted from.
    key: str
    # Cleaned text content of the chunk.
    text: str
    # Embedding vector computed for `text` by the HuggingFace encoder.
    embeddings: list[float]


# Embedding encoder shared by all UDF calls; default HuggingFace config
# (model choice is whatever HuggingFaceEmbeddingConfig defaults to).
embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())


# Use signatures to define UDF input/output
# these can be pydantic model or regular Python types
def process_pdf(file: File) -> Iterator[Chunk]:
    """Partition a PDF into cleaned text chunks and yield them with embeddings.

    Yields one ``Chunk`` per partitioned element; ``key`` is the source
    file path shared by all chunks of the same document.
    """
    # Partition the PDF into elements, grouping them by title sections.
    with file.open() as stream:
        elements = partition_pdf(
            file=stream, chunking_strategy="by_title", strategy="fast"
        )

    # Normalize each element's text in place: drop bullets, collapse
    # whitespace, strip trailing punctuation, fix quotes, rejoin paragraphs.
    for element in elements:
        element.apply(
            lambda text: clean(
                text, bullets=True, extra_whitespace=True, trailing_punctuation=True
            )
        )
        element.apply(replace_unicode_quotes)
        element.apply(group_broken_paragraphs)

    # Embed all cleaned elements in one batch, then emit one row per element.
    for embedded in embedding_encoder.embed_documents(elements):
        yield Chunk(
            key=file.path,
            text=embedded.text,
            embeddings=embedded.embeddings,
        )


# Build the pipeline: list the bucket, keep only PDFs, and expand each
# file into Chunk rows via the process_pdf generator UDF.
dc = (
    DataChain.from_storage(source)
    # NOTE(review): parallel=-1 presumably means "use all workers" —
    # confirm against datachain's settings() documentation.
    .settings(parallel=-1)
    .filter(C.file.path.glob("*.pdf"))
    .gen(document=process_pdf)
)

# Persist the results as a named dataset.
dc.save("embedded-documents")

# Re-load the saved dataset and print it.
DataChain.from_dataset("embedded-documents").show()
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#
# pip install unstructured[pdf] huggingface_hub[hf_transfer]
#
"""
To install the required dependencies:
pip install datachain[examples]
"""

import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
Expand Down
8 changes: 7 additions & 1 deletion examples/multimodal/hf_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
# pip install scipy torch transformers huggingface_hub[hf_transfer]
"""
To install the required dependencies:
pip install datachain[examples]
"""

# NOTE: also need to install ffmpeg binary
import json
import os
Expand Down
2 changes: 0 additions & 2 deletions examples/multimodal/openai_image_desc_lib.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
# pip install Pillow

import base64
import os

Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,10 @@ examples = [
"numpy>=1,<2",
"defusedxml",
"accelerate",
"unstructured[pdf]",
"unstructured[pdf, embed-huggingface]",
"pdfplumber==0.11.4",
"huggingface_hub[hf_transfer]"
"huggingface_hub[hf_transfer]",
"onnx==1.16.1"
]

[project.urls]
Expand Down

0 comments on commit 6001589

Please sign in to comment.