From a2ef45e13f57976f84887f7825afbd49dc0b6440 Mon Sep 17 00:00:00 2001
From: jayantp2003
Date: Fri, 11 Oct 2024 17:08:04 +0530
Subject: [PATCH 1/2] Fix #859: Resolve issue with large zip breaking stream endpoint

---
 application/parser/file/rst_parser.py | 53 ++++++++++++++++++++++++++-
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/application/parser/file/rst_parser.py b/application/parser/file/rst_parser.py
index 633ec8445..eb9043b2e 100644
--- a/application/parser/file/rst_parser.py
+++ b/application/parser/file/rst_parser.py
@@ -91,6 +91,48 @@ def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
         ]
         return rst_tups

+    def chunk_by_token_count(self, text: str, max_tokens: int = 100) -> List[str]:
+        """Chunk text by token count."""
+        # words = text.split()
+        # chunks = []
+        # current_chunk = []
+        # current_token_count = 0
+
+        # for word in words:
+        #     word_token_len = len(word.split())  # Token count
+        #     if current_token_count + word_token_len > max_tokens:
+        #         chunks.append(" ".join(current_chunk))
+        #         current_chunk = []
+        #         current_token_count = 0
+        #     current_chunk.append(word)
+        #     current_token_count += word_token_len
+
+        # if current_chunk:
+        #     chunks.append(" ".join(current_chunk))
+
+        # return chunks
+
+
+        avg_token_length = 5
+
+        # Calculate approximate chunk size in characters
+        chunk_size = max_tokens * avg_token_length
+
+        # Split text into chunks
+        chunks = []
+        for i in range(0, len(text), chunk_size):
+            chunk = text[i:i+chunk_size]
+
+            # Adjust chunk to end at a word boundary
+            if i + chunk_size < len(text):
+                last_space = chunk.rfind(' ')
+                if last_space != -1:
+                    chunk = chunk[:last_space]
+
+            chunks.append(chunk.strip())
+
+        return chunks
+
     def remove_images(self, content: str) -> str:
         pattern = r"\.\. image:: (.*)"
         content = re.sub(pattern, "", content)
@@ -136,7 +178,7 @@ def _init_parser(self) -> Dict:
         return {}

     def parse_tups(
-        self, filepath: Path, errors: str = "ignore"
+        self, filepath: Path, errors: str = "ignore",max_tokens: Optional[int] = 1000
     ) -> List[Tuple[Optional[str], str]]:
         """Parse file into tuples."""
         with open(filepath, "r") as f:
@@ -156,6 +198,15 @@ def parse_tups(
             rst_tups = self.remove_whitespaces_excess(rst_tups)
         if self._remove_characters_excess:
             rst_tups = self.remove_characters_excess(rst_tups)
+
+        # Apply chunking if max_tokens is provided
+        if max_tokens is not None:
+            chunked_tups = []
+            for header, text in rst_tups:
+                chunks = self.chunk_by_token_count(text, max_tokens)
+                for idx, chunk in enumerate(chunks):
+                    chunked_tups.append((f"{header} - Chunk {idx + 1}", chunk))
+            return chunked_tups
         return rst_tups

     def parse_file(

From 3db07f3a26c56d29e6d5a7d99c097206ac64cdce Mon Sep 17 00:00:00 2001
From: jayantp2003
Date: Fri, 11 Oct 2024 17:10:12 +0530
Subject: [PATCH 2/2] Fix #859: Resolve issue with large zip breaking stream endpoint

---
 application/parser/file/rst_parser.py | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/application/parser/file/rst_parser.py b/application/parser/file/rst_parser.py
index eb9043b2e..d39a0837f 100644
--- a/application/parser/file/rst_parser.py
+++ b/application/parser/file/rst_parser.py
@@ -93,37 +93,14 @@ def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:

     def chunk_by_token_count(self, text: str, max_tokens: int = 100) -> List[str]:
         """Chunk text by token count."""
-        # words = text.split()
-        # chunks = []
-        # current_chunk = []
-        # current_token_count = 0
-
-        # for word in words:
-        #     word_token_len = len(word.split())  # Token count
-        #     if current_token_count + word_token_len > max_tokens:
-        #         chunks.append(" ".join(current_chunk))
-        #         current_chunk = []
-        #         current_token_count = 0
-        #     current_chunk.append(word)
-        #     current_token_count += word_token_len
-
-        # if current_chunk:
-        #     chunks.append(" ".join(current_chunk))
-
-        # return chunks
-

         avg_token_length = 5

-        # Calculate approximate chunk size in characters
         chunk_size = max_tokens * avg_token_length
-
-        # Split text into chunks
+
         chunks = []
         for i in range(0, len(text), chunk_size):
             chunk = text[i:i+chunk_size]
-
-            # Adjust chunk to end at a word boundary
             if i + chunk_size < len(text):
                 last_space = chunk.rfind(' ')
                 if last_space != -1:
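For reference only, and not part of the patch series above: the sketch below re-implements the character-based approximation that PATCH 1 introduces (and PATCH 2 trims down) as a standalone function, assuming a token averages about five characters. The sample text and the max_tokens=50 value are arbitrary illustrations, not values used anywhere in the repository.

from typing import List


def chunk_by_token_count(text: str, max_tokens: int = 100) -> List[str]:
    """Approximate token-based chunking: a token is treated as ~5 characters."""
    avg_token_length = 5
    chunk_size = max_tokens * avg_token_length  # approximate chunk size in characters

    chunks = []
    for i in range(0, len(text), chunk_size):
        chunk = text[i:i + chunk_size]
        # Trim every chunk except the last back to the previous space
        # so that words are not cut in half.
        if i + chunk_size < len(text):
            last_space = chunk.rfind(' ')
            if last_space != -1:
                chunk = chunk[:last_space]
        chunks.append(chunk.strip())
    return chunks


if __name__ == "__main__":
    sample_text = "lorem ipsum dolor sit amet " * 100  # arbitrary sample input
    for idx, chunk in enumerate(chunk_by_token_count(sample_text, max_tokens=50), start=1):
        print(f"Chunk {idx}: {len(chunk)} characters")

One behavioural note on this approach: because the loop always advances by the full chunk_size, any characters trimmed off at the word boundary are dropped rather than carried into the next chunk, whereas the word-accumulating variant left commented out in PATCH 1 (and deleted in PATCH 2) kept every word.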