From 2ca313830e56d293719040d09733bc574f59923e Mon Sep 17 00:00:00 2001
From: jaime-m-p <>
Date: Mon, 5 Aug 2024 23:55:17 +0200
Subject: [PATCH] Fix compiler complaints

---
 src/unicode.cpp | 18 ++++++++++--------
 src/unicode.h   |  2 +-
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/unicode.cpp b/src/unicode.cpp
index ae36d2b43c828..725476600f2ff 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -694,7 +694,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             case codepoint_categ::P:     return COLLAPSE_CPT_RANGE_FIRST + ((5 << 3) | subindex);
             case codepoint_categ::S:     return COLLAPSE_CPT_RANGE_FIRST + ((6 << 3) | subindex);
             case codepoint_categ::Z:     return COLLAPSE_CPT_RANGE_FIRST + ((7 << 3) | subindex);
-            default: GGML_ASSERT(false); return COLLAPSE_CPT_RANGE_FIRST;
+            default:                     GGML_ABORT("invalid category");
         }
     };
 
@@ -709,6 +709,8 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         return std::pair<uint32_t, uint32_t>(collapsed, collapsed + range);
     };
 
+    GGML_ASSERT(sizeof(wchar_t) == sizeof(uint32_t));  // codepoints are cast to wchar_t below; use the standard <cstdint> type, not BSD u_int32_t
+
     const auto cpts = unicode_cpts_from_utf8(text);
 
     std::vector<size_t> bpe_offsets = { cpts.size() };
@@ -756,7 +758,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
         wregex_whitespaces += L"\\s";
         for (uint32_t cpt : unicode_vec_whitespace) {
             if (cpt >= 0x80) {  // non-ASCII whitespaces
-                if (wregex_whitespaces.back() + 1 == cpt) {
+                if (wregex_whitespaces.back() + 1 == (wchar_t) cpt) {
                     if (*(wregex_whitespaces.end() - 2) == '-') {
                         wregex_whitespaces.back() = cpt;
                     } else {
@@ -764,7 +766,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                         wregex_whitespaces += cpt;
                     }
                 } else {
-                    wregex_whitespaces += cpt;
+                    wregex_whitespaces += (wchar_t) cpt;
                 }
             }
         }
@@ -847,7 +849,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                 }
                 // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
                 categ.set_flag(codepoint_categ::WHITESPACE, inside_square);  //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
-                regex_expr_categs.emplace_back(i, categ);
+                regex_expr_categs.emplace_back((uint32_t)i, categ);
                 i += cpts_regex[i + 4] == '}' ? 4 : 5;
                 continue;
             }
@@ -855,7 +857,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
             if (cpt == '\\') {
                 if (cpts_regex[i + 1] == 's' || cpts_regex[i + 1] == 'S') {  // \s \S
                     // (2) Build a list of codepoint ranges. (2.2) [Optimization] Only build lists of ranges present in the regex.
-                    regex_expr_categs.emplace_back(i, categ_whitespace);
+                    regex_expr_categs.emplace_back((uint32_t)i, categ_whitespace);
                     //NOTE: reusing flag 'WHITESPACE' to store 'inside square brackets'
                     regex_expr_categs.back().second.set_flag(codepoint_categ::WHITESPACE, inside_square);
                     i += 1;
@@ -875,9 +877,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
                     case 't':  ++i;  cpt = '\t';  break;
                     case 'r':  ++i;  cpt = '\r';  break;
                     case 'n':  ++i;  cpt = '\n';  break;
-                    case 'x':  GGML_ABORT("TODO");  break;  //TODO: hex values
-                    case 'u':  GGML_ABORT("TODO");  break;  //TODO: unicode values
-                    case 'U':  GGML_ABORT("TODO");  break;  //TODO: unicode values
+                    case 'x':  GGML_ABORT("TODO");  //TODO: hex values
+                    case 'u':  GGML_ABORT("TODO");  //TODO: unicode values
+                    case 'U':  GGML_ABORT("TODO");  //TODO: unicode values
                     default:  // escaped character
                         GGML_ASSERT(!is_cpt_range);
                         cpt = cpts_regex[++i];
diff --git a/src/unicode.h b/src/unicode.h
index 75cdb3f4a596f..8a3f4078ca79b 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -149,7 +149,7 @@ struct codepoint_categ {
                 return 0;
             }
             const char * p = strchr(subcategs, subcateg);
-            return p ? (p - subcategs + 1) : 0;
+            return (uint16_t) (p ? (p - subcategs + 1) : 0);
         };
         switch(categ) {
             case 'C':  if(subcateg == 'n') return 0;  // undefined