add tokenizer prefixes for fine-tuned models
JB Griesner authored and zurawiki committed Dec 22, 2023
1 parent 4afb9d3 commit a9b1b9b
Showing 1 changed file with 10 additions and 0 deletions.
tiktoken-rs/src/tokenizer.rs: 10 additions, 0 deletions
@@ -31,6 +31,12 @@ const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
     // chat
     ("gpt-4-", Tokenizer::Cl100kBase),
     ("gpt-3.5-turbo-", Tokenizer::Cl100kBase),
+    ("gpt-35-turbo-", Tokenizer::Cl100kBase),
+    // fine-tuned
+    ("ft:gpt-4", Tokenizer::Cl100kBase),
+    ("ft:gpt-3.5-turbo", Tokenizer::Cl100kBase),
+    ("ft:davinci-002", Tokenizer::Cl100kBase),
+    ("ft:babbage-002", Tokenizer::Cl100kBase),
 ];
 
 const MODEL_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
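
Note: OpenAI fine-tuned model ids take the form ft:<base-model>:<org>:<suffix>, so an exact-name lookup cannot cover them; matching on the ft:<base-model> prefix maps the whole family to cl100k_base. A minimal standalone sketch of this kind of prefix lookup (the enum, table, and lookup_by_prefix function below are illustrative assumptions, not the crate's exact internals):

// Illustrative sketch only: the first matching prefix wins,
// and unknown model names yield None.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Tokenizer {
    Cl100kBase,
}

const MODEL_PREFIX_TO_TOKENIZER: &[(&str, Tokenizer)] = &[
    ("ft:gpt-4", Tokenizer::Cl100kBase),
    ("ft:gpt-3.5-turbo", Tokenizer::Cl100kBase),
];

fn lookup_by_prefix(model: &str) -> Option<Tokenizer> {
    MODEL_PREFIX_TO_TOKENIZER
        .iter()
        .find(|(prefix, _)| model.starts_with(prefix))
        .map(|&(_, tokenizer)| tokenizer)
}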
@@ -136,6 +142,10 @@ mod tests {
             Some(Tokenizer::Cl100kBase)
         );
         assert_eq!(get_tokenizer("gpt-3.5-turbo"), Some(Tokenizer::Cl100kBase));
+        assert_eq!(
+            get_tokenizer("ft:gpt-3.5-turbo:XXXXXX:2023-11-11"),
+            Some(Tokenizer::Cl100kBase)
+        );
         assert_eq!(
             get_tokenizer("gpt-3.5-turbo-0301"),
             Some(Tokenizer::Cl100kBase)
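
With these prefixes in place, a fine-tuned model id resolves like any other model name. A usage sketch mirroring the new test, assuming the module is exposed as tiktoken_rs::tokenizer (XXXXXX and the date suffix are placeholder org/suffix segments taken from the test above):

use tiktoken_rs::tokenizer::{get_tokenizer, Tokenizer};

fn main() {
    // Fine-tuned ids follow ft:<base-model>:<org>:<suffix>; the prefix
    // table maps the whole ft:gpt-3.5-turbo family to Cl100kBase.
    let tokenizer = get_tokenizer("ft:gpt-3.5-turbo:XXXXXX:2023-11-11");
    assert_eq!(tokenizer, Some(Tokenizer::Cl100kBase));
}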
