truncation to max length
michaelfeil committed Feb 27, 2024
1 parent d85a1e0 commit e7905c2
Showing 2 changed files with 2 additions and 1 deletion.
@@ -92,6 +92,6 @@ def tokenize_lengths(self, sentences: List[str]) -> List[int]:
             return_attention_mask=False,
             return_length=False,
             # max_length=self._infinity_tokenizer.model_max_length,
-            # truncation="longest_first",
+            truncation="longest_first",
         ).encodings
         return [len(t.tokens) for t in tks]
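
Uncommenting truncation="longest_first" means this tokenizer call now cuts each input at the model's maximum sequence length even though max_length itself stays commented out. A minimal sketch of that behavior, assuming a Hugging Face transformers fast tokenizer (the model name and token counts are illustrative, not taken from the commit):

# Illustrative sketch (not from the commit): effect of truncation="longest_first"
# on a Hugging Face fast tokenizer when max_length is not passed explicitly.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # example model
long_text = "hello " * 1000  # far longer than the 512-token model limit

# Without truncation the encoding keeps every token (the library only warns).
untruncated = tokenizer([long_text], truncation=False).encodings
# With truncation="longest_first" and no max_length, the encoding is cut down
# to tokenizer.model_max_length (512 for this model).
truncated = tokenizer([long_text], truncation="longest_first").encodings

print(len(untruncated[0].tokens))  # e.g. 1002
print(len(truncated[0].tokens))    # 512

This matches the commit title: reported token lengths reflect inputs truncated to the model maximum rather than their full, untruncated length.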
@@ -72,5 +72,6 @@ def encode_post(self, embedding: np.ndarray) -> EmbeddingReturnType:
     def tokenize_lengths(self, sentences: List[str]) -> List[int]:
         tks = self._infinity_tokenizer.encode_batch(
             sentences,
+            truncation=True,
         )
         return [len(t.tokens) for t in tks]
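
The second file passes truncation=True to encode_batch so that length counting here also reflects the truncated token count. For comparison, a sketch of how truncation to a fixed maximum is configured on a standalone huggingface tokenizers object, where the limit is enabled up front (an assumption about the backend for illustration only, not taken from the commit):

# Illustrative sketch (not from the commit): truncating encode_batch output
# with the standalone `tokenizers` library by enabling truncation up front.
from tokenizers import Tokenizer

tok = Tokenizer.from_pretrained("bert-base-uncased")  # example tokenizer
tok.enable_truncation(max_length=512)  # cap every encoding at 512 tokens

encodings = tok.encode_batch(["hello " * 1000, "short sentence"])
print([len(e.tokens) for e in encodings])  # e.g. [512, 4]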
