call markdown chunker after context-aware chunking
Signed-off-by: Khaled Sulayman <[email protected]>
khaledsulayman committed Nov 5, 2024
1 parent 4941d64 commit 96ca459
Showing 2 changed files with 67 additions and 36 deletions.
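
In brief, the commit routes the output of the context-aware (docling) chunker back through the markdown splitter: semantic chunks are fused, a character budget is derived from the user's chunk_word_count, and the fused texts are re-split by the new chunk_markdowns helper. The lines below are an illustrative restatement of that flow, not part of the diff; `semantic_chunks` is a placeholder name for the docling-derived chunks produced earlier in _process_parsed_docling_json.

    # Illustrative only: restates the flow added in _process_parsed_docling_json.
    fused_texts = self.fuse_texts(semantic_chunks, 200)             # merge short fragments into neighbors
    num_tokens_per_doc = self._num_tokens_from_words(self.chunk_word_count)
    chunk_size = self._num_chars_from_tokens(num_tokens_per_doc)    # words -> tokens -> characters
    final_chunks = chunk_markdowns(fused_texts, chunk_size)         # markdown-aware re-split
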
99 changes: 65 additions & 34 deletions src/instructlab/sdg/utils/chunkers.py
@@ -62,7 +62,17 @@ def __new__(
chunker class for the provided filetype
"""
documents = leaf_node[0]["documents"]
assert type(documents) == list

if isinstance(documents, str):
documents = [documents]
logger.info(
"Converted a single string into a list of one string. The string passed in is assumed to be the document; normally, chunk_document() takes a list as input."
)
elif not isinstance(documents, list):
raise TypeError(
"Expected documents to be a list, but got {}".format(type(documents))
)

filepaths = leaf_node[0]["filepaths"]
leaf_node_path = Path(leaf_node[0]["taxonomy_path"].replace("->", "/"))

@@ -84,9 +94,18 @@
filepaths,
DEFAULT_TAXONOMY_PATH / leaf_node_path / "qna.yaml",
output_dir,
chunk_word_count,
tokenizer_model_name,
)

@staticmethod
def _num_tokens_from_words(num_words) -> int:
return int(num_words * 1.3)  # 1 word ~ 1.3 tokens

@staticmethod
def _num_chars_from_tokens(num_tokens) -> int:
return int(num_tokens * 4)  # 1 token ~ 4 English characters

@staticmethod
def _split_docs_by_filetype(documents: List[str], filepaths: List[Path]) -> defaultdict[any, List]:
"""Separate documents into lists based on their filetype.
@@ -128,7 +147,7 @@ def __init__(
self.chunk_word_count = chunk_word_count
self.output_dir = output_dir

def chunk_documents(self) -> Dataset:
def chunk_documents(self) -> List:
"""Naively chunk markdown documents based on the word count provided by the user.
Returns:
List[str]: List of chunked documents.
@@ -145,36 +164,8 @@ def chunk_documents(self) -> Dataset:
if self.document_contents == []:
return []

# Placeholder for params
content = []
chunk_size = self._num_chars_from_tokens(num_tokens_per_doc)
chunk_overlap = _DEFAULT_CHUNK_OVERLAP

# Using Markdown as default, document-specific chunking will be implemented in separate pr.
md_text_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.MARKDOWN,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)

# Determine file type for heuristics, default with markdown
for doc in self.document_contents:
# Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
doc = re.sub(r"-{2,}\|", "-|", doc)
# Remove unnecessary spaces in front of pipe characters in a markdown table.
doc = re.sub(r"\ +\|", " |", doc)
temp = md_text_splitter.create_documents([doc])
content.extend([item.page_content for item in temp])

return content

@staticmethod
def _num_tokens_from_words(num_words) -> int:
return int(num_words * 1.3) # 1 word ~ 1.3 token

@staticmethod
def _num_chars_from_tokens(num_tokens) -> int:
return int(num_tokens * 4) # 1 token ~ 4 English character
return chunk_markdowns(self.document_contents, chunk_size)


class ContextAwareChunker(ChunkerBase):
@@ -184,12 +175,14 @@ def __init__(
filepaths,
leaf_node_path,
output_dir: Path,
chunk_word_count: int,
tokenizer_model_name=None,
):
self.document_paths = document_paths
self.filepaths = filepaths
self.leaf_node_path = leaf_node_path
self.output_dir = self._path_validator(output_dir)
self.chunk_word_count = chunk_word_count
if tokenizer_model_name is None:
self.tokenizer_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
else:
@@ -202,7 +195,7 @@ def __init__(
tokenizer_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
self.tokenizer = self.create_tokenizer(tokenizer_model_name)

def chunk_documents(self) -> Dataset:
def chunk_documents(self) -> List:
"""Semantically chunk PDF documents.
Returns:
@@ -270,9 +263,13 @@ def _process_parsed_docling_json(self, json_fp: Path) -> Dataset:
max_token_per_chunk=500,
tokenizer=self.tokenizer,
)
return self.fuse_texts(chunks, 200)
fused_texts = self.fuse_texts(chunks, 200)

num_tokens_per_doc = self._num_tokens_from_words(self.chunk_word_count)
chunk_size = self._num_chars_from_tokens(num_tokens_per_doc)
return chunk_markdowns(fused_texts, chunk_size)

def fuse_texts(self, text_list, short_length_threshold=100):
def fuse_texts(self, text_list: List, short_length_threshold: int = 100):
"""
Fuse short texts with preceding longer texts if their word count is below the threshold.
Args:
@@ -561,3 +558,37 @@ def export_documents(self, converted_docs: Iterable[ConvertedDocument]):
)

return docling_artifacts_path


def chunk_markdowns(documents: List | Dataset, chunk_size: int) -> List:
"""
Iterate over the documents and split each into markdown-aware chunks of at most chunk_size characters.
Args:
documents (List | Dataset): Documents to chunk (may also consist of a single document).
chunk_size (int): Maximum chunk size, in characters.
Returns:
List[str]: List of chunked documents.
"""

content = []
chunk_overlap = _DEFAULT_CHUNK_OVERLAP

# Markdown is used as the default; document-specific chunking will be implemented in a separate PR.
text_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.MARKDOWN,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)

# Normalize markdown tables before splitting.
for doc in documents:
# Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
doc = re.sub(r"-{2,}\|", "-|", doc)
# Remove unnecessary spaces in front of pipe characters in a markdown table.
doc = re.sub(r"\ +\|", " |", doc)
temp = text_splitter.create_documents([doc])
content.extend([item.page_content for item in temp])
return content
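
For illustration only (not part of the commit), a minimal call to the new helper from within chunkers.py; the chunk_size arithmetic follows the word/token/character heuristics defined earlier in the file, and the sample document string is hypothetical:

    docs = ["# Heading\n\nSome markdown body with | a | table |"]
    chunk_word_count = 1000
    num_tokens = int(chunk_word_count * 1.3)     # 1 word ~ 1.3 tokens -> 1300
    chunk_size = int(num_tokens * 4)             # 1 token ~ 4 characters -> 5200
    chunks = chunk_markdowns(docs, chunk_size)   # list of markdown-aware text chunks
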
4 changes: 2 additions & 2 deletions src/instructlab/sdg/utils/taxonomy.py
@@ -308,7 +308,7 @@ def read_taxonomy(
taxonomy: str | Path,
taxonomy_base: str,
yaml_rules: str | None = None,
document_output_dir: Path = Path(),
document_output_dir: Path | None = None,
):
yamllint_config = None # If no custom rules file, use default config
if yaml_rules is not None: # user attempted to pass custom rules file
@@ -364,7 +364,7 @@ def read_taxonomy(
return seed_instruction_data


def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules, document_output_dir):
def read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules, document_output_dir=None):
seed_instruction_data = read_taxonomy(
taxonomy, taxonomy_base, yaml_rules, document_output_dir
)
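
For context, an illustrative call (not part of the diff): with the new defaults, document_output_dir can simply be omitted and falls back to None rather than Path().

    leaf_nodes = read_taxonomy_leaf_nodes(taxonomy, taxonomy_base, yaml_rules)
    # equivalent to passing document_output_dir=None
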
