Skip to content

Commit

Permalink
Fix compiler complaints
Browse files Browse the repository at this point in the history
  • Loading branch information
jaime-m-p committed Aug 5, 2024
1 parent 674f0fa commit 2ca3138
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 9 deletions.
18 changes: 10 additions & 8 deletions src/unicode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -694,7 +694,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
case codepoint_categ::P: return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex);
case codepoint_categ::S: return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex);
case codepoint_categ::Z: return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex);
default: GGML_ASSERT(false); return COLLAPSE_CPT_RANGE_FIRST;
default: GGML_ABORT("invalid category");
}
};

Expand All @@ -709,6 +709,8 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
};

GGML_ASSERT(sizeof(wchar_t) == sizeof(u_int32_t));

const auto cpts = unicode_cpts_from_utf8(text);

std::vector<size_t> bpe_offsets = { cpts.size() };
Expand Down Expand Up @@ -756,15 +758,15 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
wregex_whitespaces += L"\\s";
for (uint32_t cpt : unicode_vec_whitespace) {
if (cpt >= 0x80) { // non-ASCII whitespaces
if (wregex_whitespaces.back() + 1 == cpt) {
if (wregex_whitespaces.back() + 1 == (wchar_t) cpt) {
if (*(wregex_whitespaces.end() - 2) == '-') {
wregex_whitespaces.back() = cpt;
} else {
wregex_whitespaces += '-';
wregex_whitespaces += cpt;
}
} else {
wregex_whitespaces += cpt;
wregex_whitespaces += (wchar_t) cpt;
}
}
}
Expand Down Expand Up @@ -847,15 +849,15 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
}
// (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
categ.set_flag(codepoint_categ::WHITESPACE, inside_square); //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
regex_expr_categs.emplace_back(i, categ);
regex_expr_categs.emplace_back((uint32_t)i, categ);
i += cpts_regex[i + 4] == '}' ? 4 : 5;
continue;
}

if (cpt == '\\') {
if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') { // \s \S
// (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
regex_expr_categs.emplace_back(i, categ_whitespace);
regex_expr_categs.emplace_back((uint32_t)i, categ_whitespace);
//NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square);
i += 1;
Expand All @@ -875,9 +877,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
case 't': ++i; cpt = '\t'; break;
case 'r': ++i; cpt = '\r'; break;
case 'n': ++i; cpt = '\n'; break;
case 'x': GGML_ABORT("TODO"); break; //TODO: hex values
case 'u': GGML_ABORT("TODO"); break; //TODO: unicode values
case 'U': GGML_ABORT("TODO"); break; //TODO: unicode values
case 'x': GGML_ABORT("TODO"); //TODO: hex values
case 'u': GGML_ABORT("TODO"); //TODO: unicode values
case 'U': GGML_ABORT("TODO"); //TODO: unicode values
default: // escaped character
GGML_ASSERT(!is_cpt_range);
cpt = cpts_regex[++i];
Expand Down
2 changes: 1 addition & 1 deletion src/unicode.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ struct codepoint_categ {
return 0;
}
const char * p = strchr(subcategs, subcateg);
return p ? (p - subcategs + 1) : 0;
return (uint16_t) (p ? (p - subcategs + 1) : 0);
};
switch(categ) {
case 'C': if(subcateg == 'n') return 0; // undefined
Expand Down

0 comments on commit 2ca3138

Please sign in to comment.