Skip to content

Commit

Permalink
just testing some stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
ArthurZucker committed Jun 20, 2024
1 parent e5976a6 commit 1e027dc
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 3 deletions.
4 changes: 2 additions & 2 deletions tokenizers/benches/bpe_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ fn create_gpt2_tokenizer(bpe: BPE) -> Tokenizer {
let mut tokenizer = Tokenizer::new(bpe);
tokenizer.with_pre_tokenizer(ByteLevel::default());
tokenizer.with_decoder(ByteLevel::default());
tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]);
tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]);
// tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]);
// tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]);
tokenizer
}

Expand Down
4 changes: 3 additions & 1 deletion tokenizers/src/pre_tokenizers/byte_level.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,8 @@ impl PreTokenizer for ByteLevel {
.map(|(i, b)| (BYTES_CHAR[b], isize::from(i > 0))),
);
}
normalized.transform(transformations, 0);
// normalized.transform(transformations, 0); // TODO: what would happen here if we ignore
// alignments?
Ok(())
})
}
Expand Down Expand Up @@ -199,6 +200,7 @@ impl Decoder for ByteLevel {
}
}

// TODO this is also somewhere we want to just skip if we are fast
/// As a `PostProcessor`, `ByteLevel` is in charge of trimming the offsets if necessary.
impl PostProcessor for ByteLevel {
fn added_tokens(&self, _is_pair: bool) -> usize {
Expand Down

0 comments on commit 1e027dc

Please sign in to comment.