Merge pull request #498 from chaojun-zhang/op_readme

[V1.2] [ISSUES-497] Update readme for document loader, document split and document ingestion operators
intel · Dec 20, 2023 · c275a74 · c275a74
2 parents 6ea53f5 + 77609b2
commit c275a74
Show file tree

Hide file tree

Showing 7 changed files with 2,983 additions and 104 deletions.
diff --git a/RecDP/examples/notebooks/llmutils/document_extract.ipynb b/RecDP/examples/notebooks/llmutils/document_extract.ipynb
diff --git a/RecDP/examples/notebooks/llmutils/document_ingestion.ipynb b/RecDP/examples/notebooks/llmutils/document_ingestion.ipynb
diff --git a/RecDP/examples/notebooks/llmutils/document_split.ipynb b/RecDP/examples/notebooks/llmutils/document_split.ipynb
diff --git a/RecDP/pyrecdp/LLM/README.md b/RecDP/pyrecdp/LLM/README.md
@@ -9,8 +9,11 @@ RecDP LLM is a set of python components that enables quick and easy establish of
 
 | Type                                                                                                                       | notebook                                                                                                                                                                                                   | Description                                               | supports                                             | Verified dataset & size               |
 | -------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------- | ---------------------------------------------------- | ------------------------------------- |
-| [ DocumentExtract ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/llmutils/document_extractor.py)                      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/document_extract.ipynb)            | extract text from unstructured format                          | jpg, png, pdf, docx,                                 | RefinedWeb - 1.7 TB                   |
-| [ Reader ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_reader.py#L16)                      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/reader.ipynb)            | Read data from directory                            | jsonl, parquet,                                 | RefinedWeb - 1.7 TB                   |
+| [ Directory Loader ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/doc_loader.py#L77)                      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/document_extract.ipynb)            | extract text from a directory                          | jpg, png, pdf, docx,                                 | RefinedWeb - 1.7 TB                   |
+| [ Document Loader ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/doc_loader.py#L15)                      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/document_extract.ipynb)            | extract text from unstructured format                       | all [document loaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/) provided in [langchain](https://python.langchain.com/)                                 | RefinedWeb - 1.7 TB                   |
+| [ Text Reader ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_reader.py#L16)                      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/reader.ipynb)            | Read data from directory                            | jsonl, parquet,                                 | RefinedWeb - 1.7 TB                   |
+| [ Document Split ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_split.py)                      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/document_split.ipynb)            | split documents                           |  [text splitters](https://python.langchain.com/docs/modules/data_connection/document_transformers/#text-splitters) provided in [langchain](https://python.langchain.com/) and [custom document splitter](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_split.py#L278)                          | RefinedWeb - 1.7 TB                   |
+| [ Document Ingestion ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_ingestion.py)                      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/document_ingestion.ipynb)            | embed documents and store them in a vector database                        |  chroma, faiss, elasticsearch                  | RefinedWeb - 1.7 TB                   |
 | [ Converter ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_converter.py)                      | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/convert.ipynb)            | Read and convert unstructured data to a unified format                           | html, document, image, pdf, ...                                 | RefinedWeb - 1.7 TB                   |
 | [ Filter ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/filter.py)                         | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/filter.ipynb)              | Filter out document based on condition                    | profanity-based, black-list, url_based, length_based | RedPajama - 2 TB                      |
 | [ Text Bytesize ](https://github.com/intel/e2eAIOK/blob/main/RecDP/pyrecdp/primitives/operations/text_bytesize.py)                         | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/intel/e2eAIOK/blob/main/RecDP/examples/notebooks/llmutils/bytesize.ipynb)              | Get text bytes size                    |  | RedPajama - 2 TB                      |

diff --git a/RecDP/pyrecdp/primitives/llmutils/document/reader.py b/RecDP/pyrecdp/primitives/llmutils/document/reader.py
@@ -312,17 +312,7 @@ def _load_file(self, input_file: Path, pbar):
                 loader = self.file_extractor[file_suffix]
                 return loader.load()
             else:
-                from pyrecdp.core.import_utils import import_langchain
-                import_langchain()
-                from langchain.document_loaders import UnstructuredFileLoader
-                loader = UnstructuredFileLoader(str(input_file))
-                docs = [Document(text=doc.text, metadata=doc.metadata) for doc in loader.load()]
-                docs = list(filter(lambda d: (d.pa.strip() != ""), docs))
-                if self.single_text_per_document:
-                    text = self.page_separator.join([doc.text for doc in docs])
-                    return [Document(text=text, metadata={"source": str(input_file)})]
-                else:
-                    return docs
+                return []
         finally:
             if pbar:
                 pbar.update(1)
@@ -336,7 +326,8 @@ def load(self) -> List[Document]:
                 from concurrent.futures import ThreadPoolExecutor
                 with ThreadPoolExecutor(self.max_concurrency) as executor:
                     for docs in executor.map(lambda i: self._load_file(i, pbar), self.input_files):
-                        docs_result.extend(docs)
+                        if len(docs)>0:
+                            docs_result.extend(docs)
             else:
                 for file in self.input_files:
                     docs = self._load_file(file, pbar)

diff --git a/RecDP/pyrecdp/primitives/operations/text_ingestion.py b/RecDP/pyrecdp/primitives/operations/text_ingestion.py
@@ -157,6 +157,7 @@ def do_persist(self, ds, **kwargs):
         check_availability_and_install(["chromadb==0.4.15", "langchain"])
         chroma = self.vector_store_args["db_handler"]
 
+
         collection_name = self.vector_store_args.get("collection_name", 'langchain')
         rows = ds.iter_rows() if isinstance(ds, Dataset) else ds.collect()
         texts = [row[self.text_column] for row in rows]
@@ -165,9 +166,15 @@ def do_persist(self, ds, **kwargs):
         if chroma is not None:
             chroma.add_texts(texts)
             return chroma
-        if "output_dir" not in self.vector_store_args:
-            raise ValueError(f"You must have `output_dir` option specify for Chroma vector store")
-        persist_directory = self.vector_store_args["output_dir"]
+        if "output_dir" not in self.vector_store_args and 'persist_directory' not in self.vector_store_args:
+            raise ValueError(
+                f"You must have `output_dir` or `persist_directory` option specify for Chroma vector store")
+
+        if 'output_dir' in self.vector_store_args:
+            persist_directory = self.vector_store_args["output_dir"]
+        else:
+            persist_directory = self.vector_store_args["persist_directory"]
+
         if not self.override and os.path.exists(persist_directory):
             chroma = Chroma(collection_name=collection_name,
                             persist_directory=persist_directory,

diff --git a/RecDP/tests/test_llmutils_operations.py b/RecDP/tests/test_llmutils_operations.py
@@ -231,11 +231,16 @@ def test_gopherqualityfilter_ray(self):
         with RayContext("tests/data/llm_data/tiny_c4_sample.jsonl") as ctx:
             ctx.show(op.process_rayds(ctx.ds))
 
-    def test_document_load_ray(self):
+    def test_document_load_pdf_ray(self):
         op = DirectoryLoader("tests/data/llm_data/document", glob="**/*.pdf")
         with RayContext("tests/data/llm_data/tiny_c4_sample.jsonl") as ctx:
             ctx.show(op.process_rayds())
 
+    def test_document_load_ray(self):
+        op = DirectoryLoader("data/llm_data/document")
+        with RayContext("data/llm_data/tiny_c4_sample.jsonl") as ctx:
+            ctx.show(op.process_rayds())
+
     def test_url_load_ray(self):
         op = UrlLoader(["https://www.intc.com/news-events/press-releases?year=2023&category=all"], max_depth=1)
         with RayContext("tests/data/llm_data/tiny_c4_sample.jsonl") as ctx: