Skip to content

Commit

Permalink
Add web binding Tokenizer.tokenToId()
Browse files Browse the repository at this point in the history
  • Loading branch information
grf53 committed Aug 6, 2024
1 parent c0fab1e commit bdab02e
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 2 deletions.
8 changes: 7 additions & 1 deletion src/sentencepiece_tokenizer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,13 @@ class SentencePieceTokenizer : public Tokenizer {

// Return the piece (token string) for `id` by delegating to
// sentence_piece_.IdToPiece(); the id is passed through without validation here.
std::string IdToToken(int32_t id) final { return sentence_piece_.IdToPiece(id); }

// Return the id for `token` by delegating to sentence_piece_.PieceToId(); the
// result is returned unmodified.
int32_t TokenToId(const std::string& token) final { return sentence_piece_.PieceToId(token); }
// Look up the id of `token` in the SentencePiece vocabulary.
//
// PieceToId() answers with unk_id() for a piece it does not know, so that value
// alone cannot tell "not in the vocabulary" apart from "the caller asked for the
// unknown piece itself". When the lookup lands on unk_id(), round-trip the id
// through IdToPiece() and report -1 only if the stored piece differs from the
// requested token.
//
// Returns the id of `token`, or -1 if the token is not in the vocabulary.
int32_t TokenToId(const std::string& token) final {
  const int32_t id = sentence_piece_.PieceToId(token);
  if (id != sentence_piece_.unk_id()) {
    return id;
  }
  // Guard the reverse lookup against a negative id before indexing the vocabulary.
  if (id < 0 || sentence_piece_.IdToPiece(id) != token) {
    return -1;
  }
  // The caller asked for the unknown piece itself, which does exist.
  return id;
}

private:
// the tokenizer
Expand Down
11 changes: 11 additions & 0 deletions web/src/tokenizers.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,17 @@ export class Tokenizer {
return res;
}

/**
 * Look up the id of a single token string.
 *
 * @param token the token string to look up.
 * @returns The id of the token, or -1 if the token does not exist.
 */
tokenToId(token: string): number {
  return this.handle.TokenToId(token.slice());
}

/**
* Create a tokenizer from jsonArrayBuffer
*
Expand Down
3 changes: 2 additions & 1 deletion web/src/tokenizers_binding.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,6 @@ EMSCRIPTEN_BINDINGS(tokenizers) {
.function("Encode", &tokenizers::Tokenizer::Encode)
.function("Decode", &tokenizers::Tokenizer::Decode)
.function("GetVocabSize", &tokenizers::Tokenizer::GetVocabSize)
.function("IdToToken", &tokenizers::Tokenizer::IdToToken);
.function("IdToToken", &tokenizers::Tokenizer::IdToToken)
.function("TokenToId", &tokenizers::Tokenizer::TokenToId);
}
12 changes: 12 additions & 0 deletions web/tests/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,18 @@ async function testJSONTokenizer() {
if (tok49407 !== "<|endoftext|>") {
throw Error("Expect token 49407 to be <|endoftext|>");
}

const id0 = tok.tokenToId("!");
console.log("id0=" + id0);
if (id0 !== 0) {
throw Error("Expect token 0 to be !");
}

const id49407 = tok.tokenToId("<|endoftext|>");
console.log("id49407=" + id49407);
if (id49407 !== 49407) {
throw Error("Expect token 49407 to be <|endoftext|>");
}
}

async function testLlamaTokenizer() {
Expand Down

0 comments on commit bdab02e

Please sign in to comment.