Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated chunking_document. #65

Merged
merged 1 commit into from
Jul 6, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 32 additions & 7 deletions src/instructlab/sdg/utils/chunking.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,16 @@

# Standard
from typing import List
import logging
import re

# Third Party
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

_DEFAULT_CHUNK_OVERLAP = 100

logger = logging.getLogger(__name__)


def _num_tokens_from_words(num_words) -> int:
return int(num_words * 1.3) # 1 word ~ 1.3 token
Expand All @@ -21,12 +25,24 @@ def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[s
"""
Iterates over the documents and splits them into chunks based on the word count provided by the user.
Args:
documents (dict): List of documents retrieved from git (can also consist of a single document).
documents (list): List of documents retrieved from git (can also consist of a single document).
server_ctx_size (int): Context window size of server.
chunk_word_count (int): Maximum number of words to chunk a document.
Returns:
List[str]: List of chunked documents.
"""

# Checks for input type error
if isinstance(documents, str):
PalmPalm7 marked this conversation as resolved.
Show resolved Hide resolved
documents = [documents]
logger.info(
PalmPalm7 marked this conversation as resolved.
Show resolved Hide resolved
"Converted single string into a list of string. Assumed the string passed in is the document. Normally, chunk_document() should take a list as input."
)
elif not isinstance(documents, list):
raise TypeError(
"Expected: documents to be a list, but got {}".format(type(documents))
)

no_tokens_per_doc = _num_tokens_from_words(chunk_word_count)
if no_tokens_per_doc > int(server_ctx_size - 1024):
raise ValueError(
Expand All @@ -36,15 +52,24 @@ def chunk_document(documents: List, server_ctx_size, chunk_word_count) -> List[s
)
)
)
# Placeholder for params
content = []
text_splitter = RecursiveCharacterTextSplitter(
separators=["\n\n", "\n", " "],
chunk_size=_num_chars_from_tokens(no_tokens_per_doc),
chunk_overlap=_DEFAULT_CHUNK_OVERLAP,
chunk_size = _num_chars_from_tokens(no_tokens_per_doc)
chunk_overlap = _DEFAULT_CHUNK_OVERLAP

    # Using Markdown as default; document-specific chunking will be implemented in a separate PR.
text_splitter = RecursiveCharacterTextSplitter.from_language(
language=Language.MARKDOWN,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)

# Determine file type for heuristics, default with markdown
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks like this comment may be out of date, but not a big deal

for docs in documents:
# Use regex to remove unnecessary dashes in front of pipe characters in a markdown table.
docs = re.sub(r"-{2,}\|", "-|", docs)
# Remove unnecessary spaces in front of pipe characters in a markdown table.
docs = re.sub(r"\ +\|", " |", docs)
temp = text_splitter.create_documents([docs])
content.extend([item.page_content for item in temp])

return content