Skip to content

Commit

Permalink
Merge pull request #1303 from jayantp2003/bugfix/859-large-zip-breaking-stream-endpoint
Browse files Browse the repository at this point in the history

Bugfix/859 large zip breaking stream endpoint
  • Loading branch information
dartpain authored Nov 21, 2024
2 parents a818975 + 3db07f3 commit a0a05b6
Showing 1 changed file with 29 additions and 1 deletion.
30 changes: 29 additions & 1 deletion application/parser/file/rst_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,25 @@ def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
]
return rst_tups

def chunk_by_token_count(self, text: str, max_tokens: int = 100) -> List[str]:
    """Split *text* into chunks of roughly ``max_tokens`` tokens each.

    Token count is estimated with a crude heuristic of 5 characters per
    token, so each chunk is at most ``max_tokens * 5`` characters long.
    Whenever possible a chunk is cut at the last space inside the window
    so words are not split mid-way.

    Args:
        text: The text to split. An empty string yields no chunks.
        max_tokens: Approximate maximum tokens per chunk (default 100).

    Returns:
        List of non-empty, whitespace-stripped chunks. Unlike a naive
        fixed-stride split, no characters are lost when a chunk is
        shortened to a word boundary: the cursor advances only by the
        amount of text actually consumed.
    """
    avg_token_length = 5  # rough chars-per-token estimate
    chunk_size = max_tokens * avg_token_length

    chunks: List[str] = []
    pos = 0
    n = len(text)
    while pos < n:
        end = pos + chunk_size
        if end >= n:
            # Last window: take everything that remains.
            piece = text[pos:]
            pos = n
        else:
            window = text[pos:end]
            cut = window.rfind(' ')
            if cut > 0:
                # Cut at the last space so no word is split; resume
                # AFTER the delimiter so the dropped tail is not lost
                # (the original advanced by chunk_size and silently
                # discarded the characters past the cut point).
                piece = window[:cut]
                pos += cut + 1
            else:
                # No usable space (single long token): hard split.
                piece = window
                pos = end
        piece = piece.strip()
        if piece:  # skip whitespace-only fragments
            chunks.append(piece)

    return chunks

def remove_images(self, content: str) -> str:
pattern = r"\.\. image:: (.*)"
content = re.sub(pattern, "", content)
Expand Down Expand Up @@ -136,7 +155,7 @@ def _init_parser(self) -> Dict:
return {}

def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore",max_tokens: Optional[int] = 1000
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:
Expand All @@ -156,6 +175,15 @@ def parse_tups(
rst_tups = self.remove_whitespaces_excess(rst_tups)
if self._remove_characters_excess:
rst_tups = self.remove_characters_excess(rst_tups)

# Apply chunking if max_tokens is provided
if max_tokens is not None:
chunked_tups = []
for header, text in rst_tups:
chunks = self.chunk_by_token_count(text, max_tokens)
for idx, chunk in enumerate(chunks):
chunked_tups.append((f"{header} - Chunk {idx + 1}", chunk))
return chunked_tups
return rst_tups

def parse_file(
Expand Down

0 comments on commit a0a05b6

Please sign in to comment.