From da02d76f10810cd6c380566a7045dbfb227f2528 Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sat, 20 Apr 2024 18:52:08 +0900
Subject: [PATCH 1/4] fix unknown token at gpt_tokenize

---
 examples/common.cpp | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 9ab162a5b..4b72d64e9 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -315,22 +315,44 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
     // find the longest token that forms each word in words:
     std::vector<gpt_vocab::id> tokens;
+    // unknown token
+    std::vector<char> unknown;
+    unknown.clear();
     for (const auto & word : words) {
         for (int i = 0; i < (int) word.size(); ){
             for (int j = word.size() - 1; j >= i; j--){
                 auto cand = word.substr(i, j-i+1);
                 auto it = vocab.token_to_id.find(cand);
                 if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
+                    if (!unknown.empty()){
+                        unknown.push_back(0); // terminator
+                        std::string unkstr(unknown.begin(), unknown.end());
+                        fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data());
+                        unknown.clear();
+                    }
                     tokens.push_back(it->second);
+                    //fprintf(stderr, "%s: known token '%s'(%5d)\n", __func__, cand.data(), it->second);
                     i = j + 1;
                     break;
                 }
                 else if (j == i){ // word.substr(i, 1) has no matching
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
+                    //fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
+                    auto unk = word.substr(i, 1).data();
+                    //fprintf(stderr, "%s: unknown token '%s'(%02x)\n", __func__, unk, *unk);
+                    unknown.push_back(*unk);
                     i++;
                 }
+                else {
+                    //fprintf(stderr, "%s: UNKNOWN token '%s'(%02x)\n", __func__, cand.data(), *(cand.data()));
+                }
             }
         }
+        if (!unknown.empty()){
+            unknown.push_back(0); // terminator
+            std::string unkstr(unknown.begin(), unknown.end());
+            fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data());
+            unknown.clear();
+        }
     }
 
     return tokens;

From 93f1733961d366c8ca02e82565bcf6de0236521e Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sat, 4 May 2024 11:39:31 +0900
Subject: [PATCH 2/4] delete same debug code

---
 examples/common.cpp | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 2c0cdf082..b91309d80 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -320,22 +320,38 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
     // find the longest token that forms each word in words:
     std::vector<gpt_vocab::id> tokens;
+    // unknown token
+    std::vector<char> unknown;
+    unknown.clear();
     for (const auto & word : words) {
         for (int i = 0; i < (int) word.size(); ){
             for (int j = word.size() - 1; j >= i; j--){
                 auto cand = word.substr(i, j-i+1);
                 auto it = vocab.token_to_id.find(cand);
                 if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
+                    if (!unknown.empty()){
+                        unknown.push_back(0); // terminator
+                        std::string unkstr(unknown.begin(), unknown.end());
+                        fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data());
+                        unknown.clear();
+                    }
                     tokens.push_back(it->second);
                     i = j + 1;
                     break;
                 }
                 else if (j == i){ // word.substr(i, 1) has no matching
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
+                    auto unk = word.substr(i, 1).data();
+                    unknown.push_back(*unk);
                     i++;
                 }
             }
         }
+        if (!unknown.empty()){
+            unknown.push_back(0); // terminator
+            std::string unkstr(unknown.begin(), unknown.end());
+            fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data());
+            unknown.clear();
+        }
     }
 
     return tokens;

From 3a8a694d65ba708d4e89570c300360cdee8fe0fe Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sat, 4 May 2024 12:01:03 +0900
Subject: [PATCH 3/4] delete same debug code

---
 examples/common.cpp | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 2c0cdf082..b91309d80 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -320,22 +320,38 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
     // find the longest token that forms each word in words:
     std::vector<gpt_vocab::id> tokens;
+    // unknown token
+    std::vector<char> unknown;
+    unknown.clear();
     for (const auto & word : words) {
         for (int i = 0; i < (int) word.size(); ){
             for (int j = word.size() - 1; j >= i; j--){
                 auto cand = word.substr(i, j-i+1);
                 auto it = vocab.token_to_id.find(cand);
                 if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
+                    if (!unknown.empty()){
+                        unknown.push_back(0); // terminator
+                        std::string unkstr(unknown.begin(), unknown.end());
+                        fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data());
+                        unknown.clear();
+                    }
                     tokens.push_back(it->second);
                     i = j + 1;
                     break;
                 }
                 else if (j == i){ // word.substr(i, 1) has no matching
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
+                    auto unk = word.substr(i, 1).data();
+                    unknown.push_back(*unk);
                     i++;
                 }
             }
         }
+        if (!unknown.empty()){
+            unknown.push_back(0); // terminator
+            std::string unkstr(unknown.begin(), unknown.end());
+            fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data());
+            unknown.clear();
+        }
     }
 
     return tokens;

From 24f8912a8abf712b6eee093338215af6d30f3de7 Mon Sep 17 00:00:00 2001
From: katsu560
Date: Sat, 4 May 2024 12:01:03 +0900
Subject: [PATCH 4/4] delete some debug code

---
 examples/common.cpp | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/examples/common.cpp b/examples/common.cpp
index 2c0cdf082..b91309d80 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -320,22 +320,38 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
     // find the longest token that forms each word in words:
     std::vector<gpt_vocab::id> tokens;
+    // unknown token
+    std::vector<char> unknown;
+    unknown.clear();
     for (const auto & word : words) {
        for (int i = 0; i < (int) word.size(); ){
             for (int j = word.size() - 1; j >= i; j--){
                 auto cand = word.substr(i, j-i+1);
                 auto it = vocab.token_to_id.find(cand);
                 if (it != vocab.token_to_id.end()){ // word.substr(i, j-i+1) in vocab
+                    if (!unknown.empty()){
+                        unknown.push_back(0); // terminator
+                        std::string unkstr(unknown.begin(), unknown.end());
+                        fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data());
+                        unknown.clear();
+                    }
                     tokens.push_back(it->second);
                     i = j + 1;
                     break;
                 }
                 else if (j == i){ // word.substr(i, 1) has no matching
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data());
+                    auto unk = word.substr(i, 1).data();
+                    unknown.push_back(*unk);
                     i++;
                 }
             }
         }
+        if (!unknown.empty()){
+            unknown.push_back(0); // terminator
+            std::string unkstr(unknown.begin(), unknown.end());
+            fprintf(stderr, "%s: unknown token '%s'\n", __func__, unkstr.data());
+            unknown.clear();
+        }
     }
 
     return tokens;
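
Note (illustration only, not part of the patch series above): the change buffers consecutive unmatched bytes in "unknown" and prints them as a single "unknown token" message once a known token is matched or the word ends, instead of one message per byte. Below is a minimal standalone C++ sketch of that buffering idea; the toy vocabulary and input word are hypothetical stand-ins for the real gpt_vocab and tokenizer input, and the loop uses a "matched" flag instead of the patch's "else if (j == i)" branch.

// sketch: greedy longest-match tokenization with buffered unknown-byte reporting
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
    // hypothetical toy vocabulary standing in for gpt_vocab::token_to_id
    std::map<std::string, int> token_to_id = { {"he", 1}, {"llo", 2}, {"o", 3} };
    std::string word = "he@@llo"; // hypothetical input word

    std::vector<int> tokens;
    std::vector<char> unknown; // buffer of consecutive unmatched bytes

    // print and clear any pending run of unmatched bytes as one message
    auto flush_unknown = [&]() {
        if (!unknown.empty()) {
            std::string unkstr(unknown.begin(), unknown.end());
            fprintf(stderr, "unknown token '%s'\n", unkstr.c_str());
            unknown.clear();
        }
    };

    for (int i = 0; i < (int) word.size(); ) {
        bool matched = false;
        // greedy longest match, as in gpt_tokenize
        for (int j = (int) word.size() - 1; j >= i; j--) {
            auto it = token_to_id.find(word.substr(i, j - i + 1));
            if (it != token_to_id.end()) {
                flush_unknown();              // report any pending unknown run first
                tokens.push_back(it->second);
                i = j + 1;
                matched = true;
                break;
            }
        }
        if (!matched) {
            unknown.push_back(word[i]);       // buffer the unmatched byte
            i++;
        }
    }
    flush_unknown();                          // report a trailing unknown run, if any

    for (int id : tokens) printf("%d ", id);
    printf("\n"); // stdout: 1 2   stderr: unknown token '@@'
    return 0;
}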