Skip to content

Commit

Permalink
Fix #859: Resolve issue with large zip breaking stream endpoint
Browse files: browse the repository at this point in the history
  • Loading branch information
jayantp2003 committed Oct 11, 2024
1 parent a2ef45e commit 3db07f3
Showing 1 changed file with 1 addition and 24 deletions.
25 changes: 1 addition & 24 deletions application/parser/file/rst_parser.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -93,37 +93,14 @@ def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:

def chunk_by_token_count(self, text: str, max_tokens: int = 100) -> List[str]:
"""Chunk text by token count."""
# words = text.split()
# chunks = []
# current_chunk = []
# current_token_count = 0

# for word in words:
# word_token_len = len(word.split()) # Token count
# if current_token_count + word_token_len > max_tokens:
# chunks.append(" ".join(current_chunk))
# current_chunk = []
# current_token_count = 0
# current_chunk.append(word)
# current_token_count += word_token_len

# if current_chunk:
# chunks.append(" ".join(current_chunk))

# return chunks


avg_token_length = 5

# Calculate approximate chunk size in characters
chunk_size = max_tokens * avg_token_length

# Split text into chunks

chunks = []
for i in range(0, len(text), chunk_size):
chunk = text[i:i+chunk_size]

# Adjust chunk to end at a word boundary
if i + chunk_size < len(text):
last_space = chunk.rfind(' ')
if last_space != -1:
Expand Down

0 comments on commit 3db07f3

Please sign in to comment.