call markdown chunker after context-aware chunking
Signed-off-by: Khaled Sulayman <[email protected]>
khaledsulayman committed Nov 5, 2024
1 parent 4941d64 commit 96ca459
Showing 2 changed files with 67 additions and 36 deletions.
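
In brief, the commit routes the output of the context-aware (docling) chunker back through the markdown splitter: semantic chunks are fused, a character budget is derived from the user's chunk_word_count, and the fused texts are re-split by the new chunk_markdowns helper. The lines below are an illustrative restatement of that flow, not part of the diff; `semantic_chunks` is a placeholder name for the docling-derived chunks produced earlier in _process_parsed_docling_json.

    # Illustrative only: restates the flow added in _process_parsed_docling_json.
    fused_texts = self.fuse_texts(semantic_chunks, 200)             # merge short fragments into neighbors
    num_tokens_per_doc = self._num_tokens_from_words(self.chunk_word_count)
    chunk_size = self._num_chars_from_tokens(num_tokens_per_doc)    # words -> tokens -> characters
    final_chunks = chunk_markdowns(fused_texts, chunk_size)         # markdown-aware re-split
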
99 changes: 65 additions & 34 deletions src/instructlab/sdg/utils/chunkers.py
@@ -62,7 +62,17 @@ def __new__(
chunker class for the provided filetype
"""
documents = leaf_node[0]["documents"]
assert type(documents) == list

if isinstance(documents, str):
documents = [documents]
logger.info(
"Converted a single string into a list of one string. The string passed in is assumed to be the document; normally, chunk_document() takes a list as input."
)
elif not isinstance(documents, list):
raise TypeError(
"Expected documents to be a list, but got {}".format(type(documents))
)

filepaths = leaf_node[0]["filepaths"]
leaf_node_path = Path(leaf_node[0]["taxonomy_path"].replace("->", "/"))

@@ -84,9 +94,18 @@
filepaths,
DEFAULT_TAXONOMY_PATH / leaf_node_path / "qna.yaml",
output_dir,
chunk_word_count,
tokenizer_model_name,
)

@staticmethod
def _num_tokens_from_words(num_words) -> int:
return int(num_words * 1.3)  # 1 word ~ 1.3 tokens

@staticmethod
def _num_chars_from_tokens(num_tokens) -> int:
return int(num_tokens * 4)  # 1 token ~ 4 English characters

@staticmethod
def _split_docs_by_filetype(documents: List[str], filepaths: List[Path]) -> defaultdict[any, List]:
"""Separate documents into lists based on their filetype.
@@ -128,7 +147,7 @@ def __init__(
self.chunk_word_count = chunk_word_count
self.output_dir = output_dir

def chunk_documents(self) -> Dataset:
def chunk_documents(self) -> List:
"""Naively chunk markdown documents based on the word count provided by the user.
Returns:
List[str]: List of chunked documents.
@@ -145,36 +164,8 @@ def chunk_documents(self) -> Dataset:
if self.document_contents == []:
return []

# Placeholder for params
content = []
chunk_size = self._num_chars_from_tokens(num_tokens_per_doc)
chunk_overlap = _DEFAULT_CHUNK_OVERLAP

# Using Markdown as default, document-specific chunking will be implemented in separate pr.
md_text_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.MARKDOWN,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)

# Determine file type for heuristics, default with markdown
for doc in self.document_contents:
# Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
doc = re.sub(r"-{2,}\|", "-|", doc)
# Remove unnecessary spaces in front of pipe characters in a markdown table.
doc = re.sub(r"\ +\|", " |", doc)
temp = md_text_splitter.create_documents([doc])
content.extend([item.page_content for item in temp])

return content

@staticmethod
def _num_tokens_from_words(num_words) -> int:
return int(num_words * 1.3) # 1 word ~ 1.3 token

@staticmethod
def _num_chars_from_tokens(num_tokens) -> int:
return int(num_tokens * 4) # 1 token ~ 4 English character
return chunk_markdowns(self.document_contents, chunk_size)


class ContextAwareChunker(ChunkerBase):
@@ -184,12 +175,14 @@ def __init__(
filepaths,
leaf_node_path,
output_dir: Path,
chunk_word_count: int,
tokenizer_model_name=None,
):
self.document_paths = document_paths
self.filepaths = filepaths
self.leaf_node_path = leaf_node_path
self.output_dir = self._path_validator(output_dir)
self.chunk_word_count = chunk_word_count
if tokenizer_model_name is None:
self.tokenizer_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
else:
@@ -202,7 +195,7 @@ def __init__(
tokenizer_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
self.tokenizer = self.create_tokenizer(tokenizer_model_name)

def chunk_documents(self) -> Dataset:
def chunk_documents(self) -> List:
"""Semantically chunk PDF documents.
Returns:
@@ -270,9 +263,13 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
max_token_per_chunk=500,
tokenizer=self.tokenizer,
)
return self.fuse_texts(chunks, 200)
fused_texts = self.fuse_texts(chunks, 200)

num_tokens_per_doc = self._num_tokens_from_words(self.chunk_word_count)
chunk_size = self._num_chars_from_tokens(num_tokens_per_doc)
return chunk_markdowns(fused_texts, chunk_size)

def fuse_texts(self, text_list, short_length_threshold=100):
def fuse_texts(self, text_list: List, short_length_threshold: int = 100):
"""
Fuse short texts with preceding longer texts if their word count is below the threshold.
Args:
@@ -561,3 +558,37 @@ def export_documents(self, converted_docs: Iterable[ConvertedDocument]):
)

return docling_artifacts_path


def chunk_markdowns(documents: List | Dataset, chunk_size: int) -> List:
"""
Iterate over the documents and split each into markdown-aware chunks of at most chunk_size characters.
Args:
documents (List | Dataset): Documents to chunk (may also consist of a single document).
chunk_size (int): Maximum chunk size, in characters.
Returns:
List[str]: List of chunked documents.
"""

content = []
chunk_overlap = _DEFAULT_CHUNK_OVERLAP

# Markdown is used as the default; document-specific chunking will be implemented in a separate PR.
text_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.MARKDOWN,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)

# Normalize markdown tables before splitting.
for doc in documents:
# Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
doc = re.sub(r"-{2,}\|", "-|", doc)
# Remove unnecessary spaces in front of pipe characters in a markdown table.
doc = re.sub(r"\ +\|", " |", doc)
temp = text_splitter.create_documents([doc])
content.extend([item.page_content for item in temp])
return content
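
For illustration only (not part of the commit), a minimal call to the new helper from within chunkers.py; the chunk_size arithmetic follows the word/token/character heuristics defined earlier in the file, and the sample document string is hypothetical:

    docs = ["# Heading\n\nSome markdown body with | a | table |"]
    chunk_word_count = 1000
    num_tokens = int(chunk_word_count * 1.3)     # 1 word ~ 1.3 tokens -> 1300
    chunk_size = int(num_tokens * 4)             # 1 token ~ 4 characters -> 5200
    chunks = chunk_markdowns(docs, chunk_size)   # list of markdown-aware text chunks
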
4 changes: 2 additions & 2 deletions src/instructlab/sdg/utils/taxonomy.py
@@ -308,7 +308,7 @@ def read_taxonomy(
taxonomy: str | Path,
taxonomy_base: str,
yaml_rules: str | None = None,
document_output_dir: Path = Path(),
document_output_dir: Path | None = None,
):
yamllint_config = None # If no custom rules file, use default config
if yaml_rules is not None: # user attempted to pass custom rules file
@@ -364,7 +364,7 @@ def read_taxonomy(
return seed_instruction_data


def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules, document_output_dir):
def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules, document_output_dir=None):
seed_instruction_data = read_taxonomy(
taxonomy, taxonomy_base, yaml_rules, document_output_dir
)
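
For context, an illustrative call (not part of the diff): with the new defaults, document_output_dir can simply be omitted and falls back to None rather than Path().

    leaf_nodes = read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules)
    # equivalent to passing document_output_dir=None
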
