From a199c529bcb6b3962af440a79968a74e7a910dd2 Mon Sep 17 00:00:00 2001
From: "Node.js GitHub Bot" <github-bot@iojs.org>
Date: Mon, 12 Aug 2024 20:54:02 -0400
Subject: [PATCH] deps: update simdutf to 5.3.4

PR-URL: https://github.com/nodejs/node/pull/54312
Reviewed-By: Marco Ippolito <marcoippolito54@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
---
 deps/simdutf/simdutf.cpp | 851 ++++++++++++++++++++++-----------------
 deps/simdutf/simdutf.h   | 126 ++++--
 2 files changed, 568 insertions(+), 409 deletions(-)
diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp
index 9597a3727cd11f..2e685aab872f93 100644
--- a/deps/simdutf/simdutf.cpp
+++ b/deps/simdutf/simdutf.cpp
@@ -1,4 +1,4 @@
-/* auto-generated on 2024-07-31 17:22:10 -0400. Do not edit! */
+/* auto-generated on 2024-08-09 09:52:29 -0400. Do not edit! */
 /* begin file src/simdutf.cpp */
 #include "simdutf.h"
 // We include base64_tables once.
@@ -5598,6 +5598,14 @@ bool is_ascii_white_space(char_type c) {
   return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
 }
 
+template <class char_type>
+bool is_eight_byte(char_type c) { // Reports whether c can be represented in a single 8-bit value.
+  if(sizeof(char_type) == 1) { // One-byte character types always qualify.
+    return true;
+  }
+  return uint8_t(c) == c; // Wider types qualify only if truncating to uint8_t leaves the value unchanged.
+}
+
 // Returns true upon success. The destination buffer must be large enough.
 // This functions assumes that the padding (=) has been removed.
 template <class char_type>
@@ -5617,7 +5625,7 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length, base64
   size_t idx;
   uint8_t buffer[4];
   while (true) {
-    while (src + 4 <= srcend &&
+    while (src + 4 <= srcend && is_eight_byte(src[0]) && is_eight_byte(src[1]) && is_eight_byte(src[2]) && is_eight_byte(src[3]) &&
            (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
                 d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
       if(match_system(endianness::BIG)) {
@@ -5633,9 +5641,9 @@ result base64_tail_decode(char *dst, const char_type *src, size_t length, base64
       char_type c = *src;
       uint8_t code = to_base64[uint8_t(c)];
       buffer[idx] = uint8_t(code);
-      if (code <= 63) {
+      if (is_eight_byte(c) && code <= 63) {
         idx++;
-      } else if (code > 64) {
+      } else if (code > 64 || !is_eight_byte(c)) {
         return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
       } else {
         // We have a space or a newline. We ignore it.
@@ -5715,7 +5723,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
   size_t idx;
   uint8_t buffer[4];
   while (true) {
-    while (src + 4 <= srcend &&
+    while (src + 4 <= srcend && is_eight_byte(src[0]) && is_eight_byte(src[1]) && is_eight_byte(src[2]) && is_eight_byte(src[3]) &&
            (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] |
                 d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) {
       if(match_system(endianness::BIG)) {
@@ -5737,9 +5745,9 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
       char_type c = *src;
       uint8_t code = to_base64[uint8_t(c)];
       buffer[idx] = uint8_t(code);
-      if (code <= 63) {
+      if (is_eight_byte(c) && code <= 63) {
         idx++;
-      } else if (code > 64) {
+      } else if (code > 64 || !is_eight_byte(c)) {
         outlen = size_t(dst - dstinit);
         return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)};
       } else {
@@ -5766,7 +5774,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
         dst += 1;
 
       } else if (idx == 3) {
-        if(dst + 2 >= dstend) {
+        if(dstend - dst < 2) {
           outlen = size_t(dst - dstinit);
           return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
         }
@@ -5789,7 +5797,7 @@ result base64_tail_decode_safe(char *dst, size_t& outlen, const char_type *src,
       outlen = size_t(dst - dstinit);
       return {SUCCESS, size_t(dst - dstinit)};
     }
-    if(dstend - dst <= 3) {
+    if(dstend - dst < 3) {
       outlen = size_t(dst - dstinit);
       return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)};
     }
@@ -13247,9 +13255,14 @@ inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf
   bool found_leading_bytes{false};
   // important: it is i <= how_far_back and not 'i < how_far_back'.
   for(size_t i = 0; i <= how_far_back; i++) {
-    unsigned char byte = buf[0-i];
+    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
     found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
     if(found_leading_bytes) {
+      if(i > 0 && byte < 128) {
+        // If we had to go back and the leading byte is ascii
+        // then we can stop right away.
+        return result(error_code::TOO_LONG, 0-i+1);
+      }
       buf -= i;
       extra_len = i;
       break;
@@ -13523,9 +13536,14 @@ inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf
   bool found_leading_bytes{false};
   // important: it is i <= how_far_back and not 'i < how_far_back'.
   for(size_t i = 0; i <= how_far_back; i++) {
-    unsigned char byte = buf[0-i];
+    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
     found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
     if(found_leading_bytes) {
+      if(i > 0 && byte < 128) {
+        // If we had to go back and the leading byte is ascii
+        // then we can stop right away.
+        return result(error_code::TOO_LONG, 0-i+1);
+      }
       buf -= i;
       extra_len = i;
       break;
@@ -13818,9 +13836,14 @@ inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf
   bool found_leading_bytes{false};
   // important: it is i <= how_far_back and not 'i < how_far_back'.
   for(size_t i = 0; i <= how_far_back; i++) {
-    unsigned char byte = buf[0-i];
+    unsigned char byte = buf[-static_cast<std::ptrdiff_t>(i)];
     found_leading_bytes = ((byte & 0b11000000) != 0b10000000);
     if(found_leading_bytes) {
+      if(i > 0 && byte < 128) {
+        // If we had to go back and the leading byte is ascii
+        // then we can stop right away.
+        return result(error_code::TOO_LONG, 0-i+1);
+      }
       buf -= i;
       extra_len = i;
       break;
@@ -13908,7 +13931,7 @@ inline result convert_with_errors(const char16_t* buf, size_t len, char* latin_o
       if (!match_system(big_endian)) { v1 = (v1 >> 8) | (v1 << (64 - 8)); }
       if (!match_system(big_endian)) { v2 = (v2 >> 8) | (v2 << (64 - 8)); }
       if (!match_system(big_endian)) { v3 = (v3 >> 8) | (v3 << (64 - 8)); }
-      if (!match_system(big_endian)) { v4 = (v1 >> 8) | (v4 << (64 - 8)); }
+      if (!match_system(big_endian)) { v4 = (v4 >> 8) | (v4 << (64 - 8)); }
 
       if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) {
         size_t final_pos = pos + 16;
@@ -14269,7 +14292,7 @@ int arm_detect_encodings(const char * buf, size_t len) {
 
     checker check{};
 
-    while(buf + 64 <= end) {
+    while(end - buf >= 64) {
         uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t*>(buf));
         uint16x8_t secondin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + simd16<uint16_t>::SIZE / sizeof(char16_t));
         uint16x8_t thirdin = vld1q_u16(reinterpret_cast<const uint16_t*>(buf) + 2*simd16<uint16_t>::SIZE / sizeof(char16_t));
@@ -14464,7 +14487,7 @@ const char16_t* arm_validate_utf16(const char16_t* input, size_t size) {
     const auto v_f8 = simd8<uint8_t>::splat(0xf8);
     const auto v_fc = simd8<uint8_t>::splat(0xfc);
     const auto v_dc = simd8<uint8_t>::splat(0xdc);
-    while (input + 16 < end) {
+    while (end - input >= 16) {
         // 0. Load data: since the validation takes into account only higher
         //    byte of each word, we compress the two vectors into one which
         //    consists only the higher bytes.
@@ -14606,7 +14629,7 @@ const char32_t* arm_validate_utf32le(const char32_t* input, size_t size) {
     uint32x4_t currentmax = vmovq_n_u32(0x0);
     uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
 
-    while (input + 4 < end) {
+    while (end - input >= 4) {
         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
         currentmax = vmaxq_u32(in,currentmax);
         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
@@ -14637,7 +14660,7 @@ const result arm_validate_utf32le_with_errors(const char32_t* input, size_t size
     uint32x4_t currentmax = vmovq_n_u32(0x0);
     uint32x4_t currentoffsetmax = vmovq_n_u32(0x0);
 
-    while (input + 4 < end) {
+    while (end - input >= 4) {
         const uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t*>(input));
         currentmax = vmaxq_u32(in,currentmax);
         currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax);
@@ -14672,7 +14695,7 @@ arm_convert_latin1_to_utf8(const char *latin1_input, size_t len,
   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
   // We always write 16 bytes, of which more than the first 8 bytes
   // are valid. A safety margin of 8 is more than sufficient.
-  while (latin1_input + 16 + 8 <= end) {
+  while (end - latin1_input >= 16 + 8) {
     uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(latin1_input));
     if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!!
       vst1q_u8(utf8_output, in8);
@@ -14736,7 +14759,7 @@ template <endianness big_endian>
 std::pair<const char*, char16_t*> arm_convert_latin1_to_utf16(const char* buf, size_t len, char16_t* utf16_output) {
     const char* end = buf + len;
 
-    while (buf + 16 <= end) {
+    while (end - buf >= 16) {
         uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
         uint16x8_t inlow = vmovl_u8(vget_low_u8(in8));
         if (!match_system(big_endian)) { inlow = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inlow))); }
@@ -14755,7 +14778,7 @@ std::pair<const char*, char16_t*> arm_convert_latin1_to_utf16(const char* buf, s
 std::pair<const char*, char32_t*> arm_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) {
     const char* end = buf + len;
 
-    while (buf + 16 <= end) {
+    while (end - buf >= 16) {
         uint8x16_t in8 = vld1q_u8(reinterpret_cast<const uint8_t *>(buf));
         uint16x8_t in8low = vmovl_u8(vget_low_u8(in8));
         uint32x4_t in16lowlow = vmovl_u16(vget_low_u16(in8low));
@@ -14948,8 +14971,10 @@ size_t convert_masked_utf8_to_utf16(const char *input,
       if (!match_system(big_endian)) {
         composed = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed)));
       }
-      vst1q_u16(reinterpret_cast<uint16_t *>(utf16_output), composed);
-      utf16_output += 6; // We 3 32-bit surrogate pairs.
+      uint16_t buffer[8];
+      vst1q_u16(reinterpret_cast<uint16_t *>(buffer), composed);
+      for(int k = 0; k < 6; k++) { utf16_output[k] = buffer[k]; } // the loop might compile to a couple of instructions.
+      utf16_output += 6; // We wrote 3 32-bit surrogate pairs.
       return 12; // We consumed 12 bytes.
     }
     // 3 1-4 byte sequences
@@ -15287,7 +15312,7 @@ size_t convert_masked_utf8_to_latin1(const char *input,
 template <endianness big_endian>
 std::pair<const char16_t*, char*> arm_convert_utf16_to_latin1(const char16_t* buf, size_t len, char* latin1_output) {
   const char16_t* end = buf + len;
-  while (buf + 8 <= end) {
+  while (end - buf >= 8) {
     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
     if (vmaxvq_u16(in) <= 0xff) {
@@ -15309,7 +15334,7 @@ template <endianness big_endian>
 std::pair<result, char*> arm_convert_utf16_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) {
   const char16_t* start = buf;
   const char16_t* end = buf + len;
-  while (buf + 8 <= end) {
+  while (end - buf >= 8) {
     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
     if (vmaxvq_u16(in) <= 0xff) {
@@ -15397,7 +15422,7 @@ std::pair<const char16_t*, char*> arm_convert_utf16_to_utf8(const char16_t* buf,
   const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
-  while (buf + 16 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
     if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
@@ -15651,7 +15676,7 @@ std::pair<result, char*> arm_convert_utf16_to_utf8_with_errors(const char16_t* b
   const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080);
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
     if(vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!!
@@ -15948,7 +15973,7 @@ std::pair<const char16_t*, char32_t*> arm_convert_utf16_to_utf32(const char16_t*
   const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
   const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
 
-  while (buf + 8 <= end) {
+  while (end - buf >= 8) {
     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
 
@@ -16006,7 +16031,7 @@ std::pair<result, char32_t*> arm_convert_utf16_to_utf32_with_errors(const char16
   const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800);
   const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800);
 
-  while (buf + 8 <= end) {
+  while ((end - buf) >= 8) {
     uint16x8_t in = vld1q_u16(reinterpret_cast<const uint16_t *>(buf));
     if (!match_system(big_endian)) { in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); }
 
@@ -16052,7 +16077,7 @@ std::pair<result, char32_t*> arm_convert_utf16_to_utf32_with_errors(const char16
 /* begin file src/arm64/arm_convert_utf32_to_latin1.cpp */
 std::pair<const char32_t*, char*> arm_convert_utf32_to_latin1(const char32_t* buf, size_t len, char* latin1_output) {
   const char32_t* end = buf + len;
-  while (buf + 8 <= end) {
+  while (end - buf >= 8) {
     uint32x4_t in1 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
     uint32x4_t in2 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
 
@@ -16077,7 +16102,7 @@ std::pair<result, char*> arm_convert_utf32_to_latin1_with_errors(const char32_t*
   const char32_t* start = buf;
   const char32_t* end = buf + len;
 
-  while (buf + 8 <= end) {
+  while (end - buf >= 8) {
     uint32x4_t in1 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
     uint32x4_t in2 = vld1q_u32(reinterpret_cast<const uint32_t *>(buf+4));
 
@@ -16585,7 +16610,7 @@ std::pair<const char32_t*, char16_t*> arm_convert_utf32_to_utf16(const char32_t*
 
   uint16x4_t forbidden_bytemask = vmov_n_u16(0x0);
 
-  while(buf + 4 <= end) {
+  while(end - buf >= 4) {
     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
 
     // Check if no bits set above 16th
@@ -16643,7 +16668,7 @@ std::pair<result, char16_t*> arm_convert_utf32_to_utf16_with_errors(const char32
   const char32_t* start = buf;
   const char32_t* end = buf + len;
 
-  while(buf + 4 <= end) {
+  while(end - buf >= 4) {
     uint32x4_t in = vld1q_u32(reinterpret_cast<const uint32_t *>(buf));
 
     // Check if no bits set above 16th
@@ -18879,6 +18904,9 @@ simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, s
 }
 
 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
   result res = arm_validate_utf16_with_errors<endianness::LITTLE>(buf, len);
   if (res.count != len) {
     result scalar_res = scalar::utf16::validate_with_errors<endianness::LITTLE>(buf + res.count, len - res.count);
@@ -18889,6 +18917,9 @@ simdutf_warn_unused result implementation::validate_utf16le_with_errors(const ch
 }
 
 simdutf_warn_unused result implementation::validate_utf16be_with_errors(const char16_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
   result res = arm_validate_utf16_with_errors<endianness::BIG>(buf, len);
   if (res.count != len) {
     result scalar_res = scalar::utf16::validate_with_errors<endianness::BIG>(buf + res.count, len - res.count);
@@ -18908,6 +18939,9 @@ simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, siz
 }
 
 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
   result res = arm_validate_utf32le_with_errors(buf, len);
   if (res.count != len) {
     result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count);
@@ -19164,6 +19198,9 @@ simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8(const c
 }
 
 simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return 0;
+  }
   std::pair<const char32_t*, char*> ret = arm_convert_utf32_to_utf8(buf, len, utf8_output);
   if (ret.first == nullptr) { return 0; }
   size_t saved_bytes = ret.second - utf8_output;
@@ -19177,6 +19214,9 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf8(const char32_t*
 }
 
 simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return result(error_code::SUCCESS, 0);
+  }
   // ret.first.count is always the position in the buffer, not the number of code units written even if finished
   std::pair<result, char*> ret = arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output);
   if (ret.first.count != len) {
@@ -20831,7 +20871,7 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
      * We check for ptr + 64 + 64 <= end because
      * we want to be do maskless writes without overruns.
      */
-    while (ptr + 64 + 64 <= end) {
+    while (end - ptr  >= 64 + 4) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
@@ -20853,12 +20893,12 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
             vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
         } else {
             vec0 = expand_utf8_to_utf32(vec0);
             vec1 = expand_utf8_to_utf32(vec1);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
         }
         const __m512i lane3 = broadcast_epi128<3>(utf8);
         int valid_count2;
@@ -20872,17 +20912,17 @@ std::pair<const char*, OUTPUT*> valid_utf8_to_fixed_length(const char* str, size
             vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
         } else {
             vec2 = expand_utf8_to_utf32(vec2);
             vec3 = expand_utf8_to_utf32(vec3);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
         }
         ptr += 4*16;
     }
 
-    if (ptr + 64 <= end) {
+    if (end - ptr >= 64) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
         const __m512i v_80 = _mm512_set1_epi8(char(0x80));
         const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80);
@@ -21048,7 +21088,6 @@ simdutf_really_inline __m512i check_special_cases(__m512i input, const __m512i p
     simdutf_really_inline bool errors() const {
         return _mm512_test_epi8_mask(this->error, this->error) != 0;
     }
-
   }; // struct avx512_utf8_checker
 /* end file src/icelake/icelake_utf8_validation.inl.cpp */
 /* begin file src/icelake/icelake_from_utf8.inl.cpp */
@@ -21068,7 +21107,7 @@ utf8_to_utf16_result fast_avx512_convert_utf8_to_utf16(const char *in, size_t le
   const char *const final_in = in + len;
   bool result = true;
   while (result) {
-    if (in + 64 <= final_in) {
+    if (final_in - in >= 64 ) {
         result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
     } else if(in < final_in) {
         result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
@@ -21085,13 +21124,23 @@ simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, si
   const char *const final_in = in + len;
   bool  result = true;
   while (result) {
-    if (in + 64 <= final_in) {
+    if (final_in - in >= 64 ) {
         result = process_block_utf8_to_utf16<SIMDUTF_FULL, big_endian>(in, out, final_in - in);
     } else if(in < final_in) {
         result = process_block_utf8_to_utf16<SIMDUTF_TAIL, big_endian>(in, out, final_in - in);
     } else { break; }
   }
   if(!result) {
+    size_t pos = size_t(in - init_in);
+    if (pos < len && (init_in[pos] & 0xc0) == 0x80 && pos >= 64) {
+      // We must check whether we are the fourth continuation byte
+      bool c1 = (init_in[pos - 1] & 0xc0) == 0x80;
+      bool c2 = (init_in[pos - 2] & 0xc0) == 0x80;
+      bool c3 = (init_in[pos - 3] & 0xc0) == 0x80;
+      if(c1 && c2 && c3) {
+        return {simdutf::TOO_LONG, pos};
+      }
+    }
     // rewind_and_convert_with_errors will seek a potential error from in onward,
     // with the ability to go back up to in - init_in bytes, and read final_in - in bytes forward.
     simdutf::result res = scalar::utf8_to_utf16::rewind_and_convert_with_errors<big_endian>(in - init_in, in, final_in - in, out);
@@ -21104,6 +21153,7 @@ simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, si
 
 
 template <endianness big_endian, typename OUTPUT>
+// todo: replace with the utf-8 to utf-16 routine adapted to utf-32. This code is legacy.
 std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str, size_t len, OUTPUT* dwords) {
     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
     constexpr bool UTF16 = std::is_same<OUTPUT, char16_t>::value;
@@ -21127,10 +21177,9 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
     /**
      * In the main loop, we consume 64 bytes per iteration,
      * but we access 64 + 4 bytes.
-     * We check for ptr + 64 + 64 <= end because
-     * we want to be do maskless writes without overruns.
+     * We use masked writes to avoid overruns, see https://github.com/simdutf/simdutf/issues/471
      */
-    while (ptr + 64 + 64 <= end) {
+    while (end - ptr >= 64 + 4) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
         if(checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
@@ -21149,12 +21198,12 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
             vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
         } else {
             vec0 = expand_utf8_to_utf32(vec0);
             vec1 = expand_utf8_to_utf32(vec1);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
         }
         const __m512i lane3 = broadcast_epi128<3>(utf8);
         int valid_count2;
@@ -21168,12 +21217,12 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
             vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
         } else {
             vec2 = expand_utf8_to_utf32(vec2);
             vec3 = expand_utf8_to_utf32(vec3);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
         }
         ptr += 4*16;
     }
@@ -21181,7 +21230,7 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
 
     // For the final pass, we validate 64 bytes, but we only transcode
     // 3*16 bytes, so we may end up double-validating 16 bytes.
-    if (ptr + 64 <= end) {
+    if (end - ptr >= 64) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
         if(checker.check_next_input(utf8)) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
@@ -21214,8 +21263,8 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
         }
         validatedptr += 4*16;
     }
-    {
-       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
+    if (end != validatedptr) {
+       const __m512i utf8 = _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)), (const __m512i*)validatedptr);
        checker.check_next_input(utf8);
     }
     checker.check_eof();
@@ -21226,6 +21275,7 @@ std::pair<const char*, OUTPUT*> validating_utf8_to_fixed_length(const char* str,
 }
 
 // Like validating_utf8_to_fixed_length but returns as soon as an error is identified
+// todo: replace with the utf-8 to utf-16 routine adapted to utf-32. This code is legacy.
 template <endianness big_endian, typename OUTPUT>
 std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_constant_checks(const char* str, size_t len, OUTPUT* dwords) {
     constexpr bool UTF32 = std::is_same<OUTPUT, uint32_t>::value;
@@ -21250,20 +21300,19 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
     /**
      * In the main loop, we consume 64 bytes per iteration,
      * but we access 64 + 4 bytes.
-     * We check for ptr + 64 + 64 <= end because
-     * we want to be do maskless writes without overruns.
      */
-    while (ptr + 64 + 64 <= end) {
+    while (end - ptr >= 4 + 64) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if(checker.check_next_input(utf8)) {
+        bool ascii = checker.check_next_input(utf8);
+        if(checker.errors()) {
+            return {ptr, output, false}; // We found an error.
+        }
+        if(ascii) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
             continue;
         }
-        if(checker.errors()) {
-            return {ptr, output, false}; // We found an error.
-        }
         const __m512i lane0 = broadcast_epi128<0>(utf8);
         const __m512i lane1 = broadcast_epi128<1>(utf8);
         int valid_count0;
@@ -21275,12 +21324,12 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
             vec0 = _mm512_mask_expand_epi32(vec0, __mmask16(((1<<valid_count1)-1)<<valid_count0), vec1);
             valid_count0 += valid_count1;
             vec0 = expand_utf8_to_utf32(vec0);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
         } else {
             vec0 = expand_utf8_to_utf32(vec0);
             vec1 = expand_utf8_to_utf32(vec1);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, false)
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true)
         }
         const __m512i lane3 = broadcast_epi128<3>(utf8);
         int valid_count2;
@@ -21294,12 +21343,12 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
             vec2 = _mm512_mask_expand_epi32(vec2, __mmask16(((1<<valid_count3)-1)<<valid_count2), vec3);
             valid_count2 += valid_count3;
             vec2 = expand_utf8_to_utf32(vec2);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
         } else {
             vec2 = expand_utf8_to_utf32(vec2);
             vec3 = expand_utf8_to_utf32(vec3);
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, false)
-            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, false)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true)
+            SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true)
         }
         ptr += 4*16;
     }
@@ -21307,14 +21356,16 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
 
     // For the final pass, we validate 64 bytes, but we only transcode
     // 3*16 bytes, so we may end up double-validating 16 bytes.
-    if (ptr + 64 <= end) {
+    if (end - ptr >= 64) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
-        if(checker.check_next_input(utf8)) {
+        bool ascii = checker.check_next_input(utf8);
+        if(checker.errors()) {        
+            return {ptr, output, false}; // We found an error.
+        }
+        if(ascii) {
             SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output)
             output += 64;
             ptr += 64;
-        } else if(checker.errors()) {
-            return {ptr, output, false}; // We found an error.
         } else {
             const __m512i lane0 = broadcast_epi128<0>(utf8);
             const __m512i lane1 = broadcast_epi128<1>(utf8);
@@ -21342,8 +21393,8 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
         }
         validatedptr += 4*16;
     }
-    {
-       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - validatedptr))-1, (const __m512i*)validatedptr);
+    if (end != validatedptr) {
+       const __m512i utf8 = _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)), (const __m512i*)validatedptr);
        checker.check_next_input(utf8);
     }
     checker.check_eof();
@@ -21358,7 +21409,6 @@ std::tuple<const char*, OUTPUT*, bool> validating_utf8_to_fixed_length_with_cons
 
 // File contains conversion procedure from possibly invalid UTF-8 strings.
 
-// template <bool is_remaining, bool use_masked_store>
 template <bool is_remaining>
 simdutf_really_inline size_t process_block_from_utf8_to_latin1(const char *buf, size_t len,
                                            char *latin_output, __m512i minus64,
@@ -21369,15 +21419,17 @@ simdutf_really_inline size_t process_block_from_utf8_to_latin1(const char *buf,
       is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL;
   __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf);
   __mmask64 nonascii = _mm512_movepi8_mask(input);
-
   if (nonascii == 0) {
+    if(*next_leading_ptr) { // If we ended with a leading byte, it is an error.
+      return 0; // Indicates error
+    }
     is_remaining
         ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input)
         : _mm512_storeu_si512((__m512i *)latin_output, input);
     return len;
   }
 
-  __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);
+  const __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64);
 
   __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62));
   __mmask64 invalid_leading_bytes =
@@ -21388,38 +21440,34 @@ simdutf_really_inline size_t process_block_from_utf8_to_latin1(const char *buf,
   }
 
   __mmask64 leading_shift = (leading << 1) | *next_leading_ptr;
-  *next_leading_ptr = leading >> 63;
 
   if ((nonascii ^ leading) != leading_shift) {
     return 0; // Indicates error
   }
 
-  __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one);
+  const __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one);
   input =
       _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64);
-  *next_bit6_ptr = bit6 >> 63;
 
   __mmask64 retain = ~leading & load_mask;
   __m512i output = _mm512_maskz_compress_epi8(retain, input);
   int64_t written_out = count_ones(retain);
-  __mmask64 store_mask = (1ULL << written_out) - 1;
-
-  // ***************************
-  //  Possible optimization? (Nick Nuon)
-  //  This commented out line is 5% faster but sadly it'll also write past
-  //  memory bounds for latin1_output: is_remaining ?
-  //  _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output) :
-  //  _mm512_storeu_si512((__m512i *)latin_output, output); I tried using
-  //  _mm512_storeu_si512 and have the next process_block start from the
-  //  "written_out" point but the compiler shuffles memory in such a way that it
-  //  is significantly slower...
-  // ****************************
+  if(written_out == 0) {
+    return 0; // Indicates error
+  }
+  *next_bit6_ptr = bit6 >> 63;
+  *next_leading_ptr = leading >> 63;
+
+  __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out);
+
   _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
 
   return written_out;
 }
 
-size_t utf8_to_latin1_avx512(const char *buf, size_t len, char *latin_output) {
+size_t utf8_to_latin1_avx512(const char *&inbuf, size_t len, char *&inlatin_output) {
+  const char *buf = inbuf;
+  char *latin_output = inlatin_output;
   char *start = latin_output;
   size_t pos = 0;
   __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000
@@ -21431,7 +21479,9 @@ size_t utf8_to_latin1_avx512(const char *buf, size_t len, char *latin_output) {
     size_t written = process_block_from_utf8_to_latin1<false>(buf + pos, 64, latin_output, minus64,
                                           one, &next_leading, &next_bit6);
     if (written == 0) {
-      return 0; // Indicates error
+      inlatin_output = latin_output;
+      inbuf = buf + pos - next_leading;
+      return 0; // Indicates error at pos or after, or just before pos (too short error)
     }
     latin_output += written;
     pos += 64;
@@ -21443,12 +21493,20 @@ size_t utf8_to_latin1_avx512(const char *buf, size_t len, char *latin_output) {
         process_block_from_utf8_to_latin1<true>(buf + pos, remaining, latin_output, minus64, one,
                             &next_leading, &next_bit6);
     if (written == 0) {
-      return 0; // Indicates error
+      inbuf = buf + pos - next_leading;
+      inlatin_output = latin_output;
+      return 0; // Indicates error at pos or after, or just before pos (too short error)
     }
     latin_output += written;
   }
-
-  return (size_t)(latin_output - start);
+  if(next_leading) {
+    inbuf = buf + len - next_leading;
+    inlatin_output = latin_output;
+    return 0; // Indicates error at end of buffer
+  }
+  inlatin_output = latin_output;
+  inbuf += len;
+  return size_t(latin_output - start);
 }
 /* end file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */
 /* begin file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */
@@ -21488,7 +21546,10 @@ simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1(const char
   __mmask64 retain = ~leading & load_mask;
   __m512i output = _mm512_maskz_compress_epi8(retain, input);
   int64_t written_out = count_ones(retain);
-  __mmask64 store_mask = (1ULL << written_out) - 1;
+  if(written_out == 0) {
+    return 0; // Indicates error
+  }
+  __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out);
   // Optimization opportunity: sometimes, masked writes are not needed.
   _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
   return written_out;
@@ -21536,7 +21597,7 @@ size_t icelake_convert_utf16_to_latin1(const char16_t *buf, size_t len,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
       36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
-  while (buf + 32 <= end) {
+  while (end - buf >= 32) {
     __m512i in = _mm512_loadu_si512((__m512i *)buf);
     if (big_endian) {
       in = _mm512_shuffle_epi8(in, byteflip);
@@ -21581,7 +21642,7 @@ icelake_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38,
       36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0);
-  while (buf + 32 <= end) {
+  while (end - buf >= 32) {
     __m512i in = _mm512_loadu_si512((__m512i *)buf);
     if (big_endian) {
       in = _mm512_shuffle_epi8(in, byteflip);
@@ -21652,7 +21713,7 @@ size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
         );
   const char16_t * const inbuf_orig = inbuf;
   const unsigned char * const outbuf_orig = outbuf;
-  size_t adjust = 0;
+  int adjust = 0;
   int carry = 0;
 
   while (inlen >= 32) {
@@ -21809,7 +21870,7 @@ size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
     _mm512_mask_storeu_epi8(outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), outhi);
     outbuf += advlo + advhi;
   }
-  outbuf -= adjust;
+  outbuf += -adjust;
 
 tail:
   if (inlen != 0) {
@@ -21817,7 +21878,7 @@ size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen,
     inmask = _cvtu32_mask32((1U << inlen) - 1);
     in = _mm512_maskz_loadu_epi16(inmask, inbuf);
     if(big_endian) { in = _mm512_shuffle_epi8(in, byteflip); }
-    adjust = inlen - 31;
+    adjust = (int)inlen - 31;
     inlen = 0;
     goto lastiteration;
   }
@@ -21946,7 +22007,7 @@ size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
       56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
-  while (buf + 16 <= end) {
+  while (end - buf >= 16) {
     __m512i in = _mm512_loadu_si512((__m512i *)buf);
     if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
       return 0;
@@ -21979,7 +22040,7 @@ icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60,
       56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0);
-  while (buf + 16 <= end) {
+  while (end - buf >= 16) {
     __m512i in = _mm512_loadu_si512((__m512i *)buf);
     if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) {
       while (uint32_t(*buf) <= 0xff) {
@@ -22027,7 +22088,7 @@ std::pair<const char32_t*, char*> avx512_convert_utf32_to_utf8(const char32_t* b
 
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
     running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
@@ -22268,7 +22329,7 @@ std::pair<result, char*> avx512_convert_utf32_to_utf8_with_errors(const char32_t
 
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
     // Check for too large input
@@ -22507,7 +22568,7 @@ std::pair<const char32_t*, char16_t*> avx512_convert_utf32_to_utf16(const char32
   __m256i forbidden_bytemask = _mm256_setzero_si256();
 
 
-  while (buf + 8 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
     const __m256i v_00000000 = _mm256_setzero_si256();
@@ -22572,7 +22633,7 @@ std::pair<result, char16_t*> avx512_convert_utf32_to_utf16_with_errors(const cha
 
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 8 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
     const __m256i v_00000000 = _mm256_setzero_si256();
@@ -22636,7 +22697,7 @@ bool validate_ascii(const char* buf, size_t len) {
   const char* end = buf + len;
   const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
   __m512i running_or = _mm512_setzero_si512();
-  for (; buf + 64 <= end; buf += 64) {
+  for (; end - buf >= 64; buf += 64) {
     const __m512i utf8 = _mm512_loadu_si512((const __m512i*)buf);
     running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, 0xf8); // running_or | (utf8 & ascii)
   }
@@ -22651,7 +22712,8 @@ bool validate_ascii(const char* buf, size_t len) {
 // file included directly
 
 const char32_t* validate_utf32(const char32_t* buf, size_t len) {
-    const char32_t* end = len >= 16 ? buf + len - 16 : nullptr;
+    if(len < 16) { return buf; }
+    const char32_t* end = buf + len - 16;
 
     const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000);
     __m512i currentmax = _mm512_setzero_si512();
@@ -23225,7 +23287,7 @@ implementation::detect_encodings(const char *input,
 
     avx512_utf8_checker checker{};
     __m512i currentmax = _mm512_setzero_si512();
-    while (buf + 64 <= end) {
+    while (end - buf >= 64) {
       __m512i in = _mm512_loadu_si512((__m512i *)buf);
       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
       __mmask32 surrogates =
@@ -23299,8 +23361,8 @@ implementation::detect_encodings(const char *input,
     if (is_utf8) {
       size_t current_length = static_cast<size_t>(buf - start);
       if (current_length != length) {
-        const __m512i utf8 = _mm512_maskz_loadu_epi8(
-            (1ULL << (length - current_length)) - 1, (const __m512i *)buf);
+                const __m512i utf8 = _mm512_maskz_loadu_epi8(
+            (UINT64_C(1) << (length - current_length)) - 1, (const __m512i *)buf);
         checker.check_next_input(utf8);
       }
       checker.check_eof();
@@ -23316,16 +23378,22 @@ implementation::detect_encodings(const char *input,
     }
 
     if (is_utf32 && (length % 4 == 0)) {
+      size_t leftover = length - static_cast<size_t>(buf - start);
       currentmax = _mm512_max_epu32(
           _mm512_maskz_loadu_epi8(
-              (1ULL << (length - static_cast<size_t>(buf - start))) - 1,
+              (UINT64_C(1) << leftover) - 1,
               (const __m512i *)buf),
           currentmax);
       __mmask16 outside_range = _mm512_cmp_epu32_mask(currentmax, _mm512_set1_epi32(0x10ffff),
                                 _MM_CMPINT_GT);
       if (outside_range == 0) {
+          out |= simdutf::encoding_type::UTF32_LE;
+        } else {
+        }
+        //}
+      /*} else {
         out |= simdutf::encoding_type::UTF32_LE;
-      }
+      }*/
     }
 
     return out;
@@ -23337,15 +23405,18 @@ implementation::detect_encodings(const char *input,
 }
 
 simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return true;
+  }
     avx512_utf8_checker checker{};
     const char* ptr = buf;
     const char* end = ptr + len;
-    for (; ptr + 64 <= end; ptr += 64) {
+    for (; end - ptr >= 64 ; ptr += 64) {
         const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
         checker.check_next_input(utf8);
     }
-    {
-       const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
+    if(end != ptr) {
+       const __m512i utf8 = _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i*)ptr);
        checker.check_next_input(utf8);
     }
     checker.check_eof();
@@ -23353,11 +23424,14 @@ simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t l
 }
 
 simdutf_warn_unused result implementation::validate_utf8_with_errors(const char *buf, size_t len) const noexcept {
+    if (simdutf_unlikely(len == 0)) {
+       return result(error_code::SUCCESS, len);
+    }
     avx512_utf8_checker checker{};
     const char* ptr = buf;
     const char* end = ptr + len;
     size_t count{0};
-    for (; ptr + 64 <= end; ptr += 64) {
+    for (; end - ptr >= 64 ; ptr += 64) {
       const __m512i utf8 = _mm512_loadu_si512((const __m512i*)ptr);
       checker.check_next_input(utf8);
       if(checker.errors()) {
@@ -23368,18 +23442,18 @@ simdutf_warn_unused result implementation::validate_utf8_with_errors(const char
       }
       count += 64;
     }
-    {
-      const __m512i utf8 = _mm512_maskz_loadu_epi8((1ULL<<(end - ptr))-1, (const __m512i*)ptr);
+    if (end != ptr) {
+      const __m512i utf8 = _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i*)ptr);
       checker.check_next_input(utf8);
-      if(checker.errors()) {
-        if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
-        result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf), reinterpret_cast<const char*>(buf + count), len - count);
-        res.count += count;
-        return res;
-      } else {
-        return result(error_code::SUCCESS, len);
-      }
     }
+    checker.check_eof();
+    if(checker.errors()) {
+      if (count != 0) { count--; } // Sometimes the error is only detected in the next chunk
+      result res = scalar::utf8::rewind_and_validate_with_errors(reinterpret_cast<const char*>(buf), reinterpret_cast<const char*>(buf + count), len - count);
+      res.count += count;
+      return res;
+    }
+    return result(error_code::SUCCESS, len);
 }
 
 simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept {
@@ -23390,15 +23464,15 @@ simdutf_warn_unused result implementation::validate_ascii_with_errors(const char
   const char* buf_orig = buf;
   const char* end = buf + len;
   const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80);
-  for (; buf + 64 <= end; buf += 64) {
+  for (; end - buf >= 64 ; buf += 64) {
     const __m512i input = _mm512_loadu_si512((const __m512i*)buf);
     __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
     if(notascii) {
       return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
     }
   }
-  {
-    const __m512i input = _mm512_maskz_loadu_epi8((1ULL<<(end - buf))-1, (const __m512i*)buf);
+  if (end != buf) {
+    const __m512i input = _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - buf)), (const __m512i*)buf);
     __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT);
     if(notascii) {
       return result(error_code::TOO_LARGE, buf - buf_orig + _tzcnt_u64(notascii));
@@ -23410,7 +23484,7 @@ simdutf_warn_unused result implementation::validate_ascii_with_errors(const char
 simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept {
     const char16_t *end = buf + len;
 
-    for(;buf + 32 <= end; ) {
+    for(;end - buf >= 32; ) {
       __m512i in = _mm512_loadu_si512((__m512i*)buf);
       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
@@ -23459,7 +23533,7 @@ simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, s
             0x0607040502030001,
             0x0e0f0c0d0a0b0809
         );
-    for(;buf + 32 <= end; ) {
+    for(;end - buf >= 32; ) {
       __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
@@ -23499,7 +23573,7 @@ simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, s
 simdutf_warn_unused result implementation::validate_utf16le_with_errors(const char16_t *buf, size_t len) const noexcept {
     const char16_t *start_buf = buf;
     const char16_t *end = buf + len;
-    for(;buf + 32 <= end; ) {
+    for(;end - buf >= 32; ) {
       __m512i in = _mm512_loadu_si512((__m512i*)buf);
       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
@@ -23553,7 +23627,7 @@ simdutf_warn_unused result implementation::validate_utf16be_with_errors(const ch
             0x0607040502030001,
             0x0e0f0c0d0a0b0809
         );
-    for(;buf + 32 <= end; ) {
+    for(;end - buf >= 32; ) {
       __m512i in = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)buf), byteflip);
       __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800)));
       __mmask32 surrogates = _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800)));
@@ -23604,31 +23678,33 @@ simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, siz
 }
 
 simdutf_warn_unused result implementation::validate_utf32_with_errors(const char32_t *buf, size_t len) const noexcept {
-  const char32_t* end = len >= 16 ? buf + len - 16 : nullptr;
   const char32_t* buf_orig = buf;
-  while (buf <= end) {
-    __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
-    __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
-                              _MM_CMPINT_GT);
+  if(len >= 16) {
+    const char32_t* end = buf + len - 16;
+    while (buf <= end) {
+      __m512i utf32 = _mm512_loadu_si512((const __m512i*)buf);
+      __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
+                                _MM_CMPINT_GT);
 
-    __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
+      __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000));
 
-    __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
-                              _MM_CMPINT_GT);
-    if((outside_range | surrogate_range)) {
-      auto outside_idx = _tzcnt_u32(outside_range);
-      auto surrogate_idx = _tzcnt_u32(surrogate_range);
+      __mmask16 surrogate_range = _mm512_cmp_epu32_mask(utf32_off, _mm512_set1_epi32(0xfffff7ff),
+                                _MM_CMPINT_GT);
+      if((outside_range | surrogate_range)) {
+        auto outside_idx = _tzcnt_u32(outside_range);
+        auto surrogate_idx = _tzcnt_u32(surrogate_range);
 
-      if (outside_idx < surrogate_idx) {
-        return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
+        if (outside_idx < surrogate_idx) {
+          return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx);
+        }
+
+        return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
       }
 
-      return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx);
+      buf += 16;
     }
-
-    buf += 16;
   }
-  if(buf < buf_orig + len) {
+  if(len > 0) {
     __m512i utf32 = _mm512_maskz_loadu_epi32(__mmask16((1U<<(buf_orig + len - buf))-1),(const __m512i*)buf);
     __mmask16 outside_range = _mm512_cmp_epu32_mask(utf32, _mm512_set1_epi32(0x10ffff),
                               _MM_CMPINT_GT);
@@ -23683,24 +23759,18 @@ simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char* bu
 
 
 simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(const char* buf, size_t len, char* latin1_output) const noexcept {
-  // Initialize output length and input length counters
-  size_t inlen = 0;
-
   // First, try to convert as much as possible using the SIMD implementation.
-  inlen = icelake::utf8_to_latin1_avx512(buf, len, latin1_output);
+  const char * obuf = buf;
+  char * olatin1_output = latin1_output;
+  size_t written = icelake::utf8_to_latin1_avx512(obuf, len, olatin1_output);
 
   // If we have completely converted the string
-  if(inlen == len) {
-    return {simdutf::SUCCESS, len};
+  if(obuf == buf + len) {
+    return {simdutf::SUCCESS, written};
   }
-
-  // Else if there are remaining bytes, use the scalar function to process them.
-  // Note: This is assuming scalar::utf8_to_latin1::convert_with_errors is a function that takes
-  // the input buffer, length, and output buffer, and returns a result object with an error code
-  // and the number of characters processed.
-  result res = scalar::utf8_to_latin1::convert_with_errors(buf + inlen, len - inlen, latin1_output + inlen);
-  res.count += inlen; // Add the number of characters processed by the SIMD implementation
-
+  size_t pos = obuf - buf;
+  result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors(pos, buf + pos, len - pos, olatin1_output);
+  res.count += pos;
   return res;
 }
 
@@ -23805,10 +23875,9 @@ simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf
   //       continuation bytes lie outside 16-byte window.
   //       It means, we have to skip continuation bytes from
   //       the beginning ret.first, as they were already consumed.
-  while (ret.first != end and ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
+  while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) {
       ret.first += 1;
   }
-
   if (ret.first != end) {
     const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert(
                                         ret.first, len - (ret.first - buf), utf32_out + saved_bytes);
@@ -23820,15 +23889,33 @@ simdutf_warn_unused size_t implementation::convert_utf8_to_utf32(const char* buf
 }
 
 simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors(const char* buf, size_t len, char32_t* utf32) const noexcept {
+  if (simdutf_unlikely(len == 0)) {
+    return {error_code::SUCCESS, 0};
+  }
   uint32_t * utf32_output = reinterpret_cast<uint32_t *>(utf32);
   auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks<endianness::LITTLE, uint32_t>(buf, len, utf32_output);
+
   if (!std::get<2>(ret)) {
-    auto new_buf = std::get<0>(ret);
-    // rewind_and_convert_with_errors will seek a potential error from new_buf onward,
-    // with the ability to go back up to new_buf - buf bytes, and read len - (new_buf - buf) bytes forward.
-    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(new_buf - buf, new_buf, len - (new_buf - buf), reinterpret_cast<char32_t *>(std::get<1>(ret)));
-    res.count += (std::get<0>(ret) - buf);
-    return res;
+    size_t pos = std::get<0>(ret) - buf;
+    // We might have an error that occurs right before  pos.
+    // This is only a concern if buf[pos] is not a continuation byte.
+    if((buf[pos] & 0xc0) != 0x80 && pos >= 64) {
+      pos -= 1;
+    } else if ((buf[pos] & 0xc0) == 0x80 && pos >= 64) {
+      // We must check whether we are the fourth continuation byte
+      bool c1 = (buf[pos - 1] & 0xc0) == 0x80;
+      bool c2 = (buf[pos - 2] & 0xc0) == 0x80;
+      bool c3 = (buf[pos - 3] & 0xc0) == 0x80;
+      if(c1 && c2 && c3) {
+        return {simdutf::TOO_LONG, pos};
+      }
+    }
+    // todo: we reset the output to utf32 instead of using std::get<1>(ret) as you'd expect.
+    // that is because validating_utf8_to_fixed_length_with_constant_checks may have processed
+    // data beyond the error.
+    result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors(pos, buf + pos, len - pos, utf32);
+      res.count += pos;
+      return res;
   }
   size_t saved_bytes = std::get<1>(ret) - utf32_output;
   const char* end = buf + len;
@@ -23933,7 +24020,7 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors(c
   size_t outlen;
   size_t inlen = utf16_to_utf8_avx512i<endianness::LITTLE>(buf, len, (unsigned char*)utf8_output, &outlen);
   if(inlen != len) {
-    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf + inlen, len - outlen, utf8_output + outlen);
+    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::LITTLE>(buf + inlen, len - inlen, utf8_output + outlen);
     res.count += inlen;
     return res;
   }
@@ -23944,7 +24031,7 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors(c
   size_t outlen;
   size_t inlen = utf16_to_utf8_avx512i<endianness::BIG>(buf, len, (unsigned char*)utf8_output, &outlen);
   if(inlen != len) {
-    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf + inlen, len - outlen, utf8_output + outlen);
+    result res = scalar::utf16_to_utf8::convert_with_errors<endianness::BIG>(buf + inlen, len - inlen, utf8_output + outlen);
     res.count += inlen;
     return res;
   }
@@ -24200,47 +24287,53 @@ void implementation::change_endianness_utf16(const char16_t * input, size_t leng
 
 
 simdutf_warn_unused size_t implementation::count_utf16le(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
   const char16_t* ptr = input;
+  size_t count{0};
 
-  const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
-  const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+  if(length >= 32) {
+    const char16_t* end = input + length - 32;
+
+    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
 
-  size_t count{0};
 
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 32;
-    uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
-    count += count_ones(not_high_surrogate);
+    while (ptr <= end) {
+      __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+      ptr += 32;
+      uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
+      count += count_ones(not_high_surrogate);
+    }
   }
 
   return count + scalar::utf16::count_code_points<endianness::LITTLE>(ptr, length - (ptr - input));
 }
 
 simdutf_warn_unused size_t implementation::count_utf16be(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
   const char16_t* ptr = input;
+  size_t count{0};
+  if(length >= 32) {
 
-  const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
-  const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+    const char16_t* end = input + length - 32;
 
-  size_t count{0};
-  const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)ptr), byteflip);
-    ptr += 32;
-    uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
-    count += count_ones(not_high_surrogate);
+    const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00);
+    const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff);
+
+    const __m512i byteflip = _mm512_setr_epi64(
+              0x0607040502030001,
+              0x0e0f0c0d0a0b0809,
+              0x0607040502030001,
+              0x0e0f0c0d0a0b0809,
+              0x0607040502030001,
+              0x0e0f0c0d0a0b0809,
+              0x0607040502030001,
+              0x0e0f0c0d0a0b0809
+          );
+    while (ptr <= end) {
+      __m512i utf16 = _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i*)ptr), byteflip);
+      ptr += 32;
+      uint64_t not_high_surrogate = static_cast<uint64_t>(_mm512_cmpgt_epu16_mask(utf16, high) | _mm512_cmplt_epu16_mask(utf16, low));
+      count += count_ones(not_high_surrogate);
+    }
   }
 
   return count + scalar::utf16::count_code_points<endianness::BIG>(ptr, length - (ptr - input));
@@ -24319,69 +24412,74 @@ simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t lengt
 }
 
 simdutf_warn_unused size_t implementation::utf8_length_from_utf16le(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
   const char16_t* ptr = input;
+  size_t count{0};
+  if(length >= 32) {
+    const char16_t* end = input + length - 32;
 
-  const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
-  const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
-  const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
-  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
+    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
+    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
 
-  size_t count{0};
 
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 32;
-    __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
-    __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
-    __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
-    __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+    while (ptr <= end) {
+      __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+      ptr += 32;
+      __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
+      __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
+      __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
+      __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
 
-    size_t ascii_count = count_ones(ascii_bitmask);
-    size_t two_bytes_count = count_ones(two_bytes_bitmask);
-    size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
-    size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
+      size_t ascii_count = count_ones(ascii_bitmask);
+      size_t two_bytes_count = count_ones(two_bytes_bitmask);
+      size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
+      size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
 
-    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
+      count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
+    }
   }
 
   return count + scalar::utf16::utf8_length_from_utf16<endianness::LITTLE>(ptr, length - (ptr - input));
 }
 
 simdutf_warn_unused size_t implementation::utf8_length_from_utf16be(const char16_t * input, size_t length) const noexcept {
-  const char16_t* end = length >= 32 ? input + length - 32 : nullptr;
   const char16_t* ptr = input;
+  size_t count{0};
 
-  const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
-  const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
-  const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
-  const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
+  if(length >= 32) {
+    const char16_t* end = input + length - 32;
 
-  size_t count{0};
-  const __m512i byteflip = _mm512_setr_epi64(
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809,
-            0x0607040502030001,
-            0x0e0f0c0d0a0b0809
-        );
-  while (ptr <= end) {
-    __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
-    utf16 = _mm512_shuffle_epi8(utf16, byteflip);
-    ptr += 32;
-    __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
-    __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
-    __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
-    __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+    const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f);
+    const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff);
+    const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff);
+    const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800);
 
-    size_t ascii_count = count_ones(ascii_bitmask);
-    size_t two_bytes_count = count_ones(two_bytes_bitmask);
-    size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
-    size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
-    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
+    const __m512i byteflip = _mm512_setr_epi64(
+              0x0607040502030001,
+              0x0e0f0c0d0a0b0809,
+              0x0607040502030001,
+              0x0e0f0c0d0a0b0809,
+              0x0607040502030001,
+              0x0e0f0c0d0a0b0809,
+              0x0607040502030001,
+              0x0e0f0c0d0a0b0809
+          );
+    while (ptr <= end) {
+      __m512i utf16 = _mm512_loadu_si512((const __m512i*)ptr);
+      utf16 = _mm512_shuffle_epi8(utf16, byteflip);
+      ptr += 32;
+      __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f);
+      __mmask32 two_bytes_bitmask = _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff);
+      __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask);
+      __mmask32 surrogates_bitmask = _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800);
+
+      size_t ascii_count = count_ones(ascii_bitmask);
+      size_t two_bytes_count = count_ones(two_bytes_bitmask);
+      size_t surrogate_bytes_count = count_ones(surrogates_bitmask);
+      size_t three_bytes_count = 32 - ascii_count - two_bytes_count - surrogate_bytes_count;
+      count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 2*surrogate_bytes_count;
+    }
   }
 
   return count + scalar::utf16::utf8_length_from_utf16<endianness::BIG>(ptr, length - (ptr - input));
@@ -24471,7 +24569,7 @@ simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * i
     // This algorithm could no doubt be improved!
     for(;pos + 64 <= length; pos += 64) {
       __m512i utf8 = _mm512_loadu_si512((const __m512i*)(input+pos));
-      uint64_t utf8_continuation_mask = _mm512_cmple_epi8_mask(utf8, _mm512_set1_epi8(-65+1));
+      uint64_t utf8_continuation_mask = _mm512_cmplt_epi8_mask(utf8, _mm512_set1_epi8(-65+1));
       // We count one word for anything that is not a continuation (so
       // leading bytes).
       count += 64 - count_ones(utf8_continuation_mask);
@@ -24482,46 +24580,52 @@ simdutf_warn_unused size_t implementation::utf16_length_from_utf8(const char * i
 }
 
 simdutf_warn_unused size_t implementation::utf8_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
   const char32_t* ptr = input;
+  size_t count{0};
 
-  const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
-  const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
-  const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
+  if(length >= 16) {
+    const char32_t* end = input + length - 16;
+
+    const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f);
+    const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff);
+    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
 
-  size_t count{0};
 
-  while (ptr <= end) {
-    __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 16;
-    __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
-    __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff);
-    __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff);
+    while (ptr <= end) {
+      __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
+      ptr += 16;
+      __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f);
+      __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(ascii_bitmask), utf32, v_0000_07ff);
+      __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask(_knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, v_0000_ffff);
 
-    size_t ascii_count = count_ones(ascii_bitmask);
-    size_t two_bytes_count = count_ones(two_bytes_bitmask);
-    size_t three_bytes_count = count_ones(three_bytes_bitmask);
-    size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count;
-    count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 4*four_bytes_count;
+      size_t ascii_count = count_ones(ascii_bitmask);
+      size_t two_bytes_count = count_ones(two_bytes_bitmask);
+      size_t three_bytes_count = count_ones(three_bytes_bitmask);
+      size_t four_bytes_count = 16 - ascii_count - two_bytes_count - three_bytes_count;
+      count += ascii_count + 2*two_bytes_count + 3*three_bytes_count + 4*four_bytes_count;
+    }
   }
 
   return count + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input));
 }
 
 simdutf_warn_unused size_t implementation::utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept {
-  const char32_t* end = length >= 16 ? input + length - 16 : nullptr;
   const char32_t* ptr = input;
+  size_t count{0};
 
-  const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
+  if(length >= 16) {
+    const char32_t* end = input + length - 16;
+
+    const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff);
 
-  size_t count{0};
 
-  while (ptr <= end) {
-    __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
-    ptr += 16;
-    __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
+    while (ptr <= end) {
+      __m512i utf32 = _mm512_loadu_si512((const __m512i*)ptr);
+      ptr += 16;
+      __mmask16 surrogates_bitmask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff);
 
-    count += 16 + count_ones(surrogates_bitmask);
+      count += 16 + count_ones(surrogates_bitmask);
+    }
   }
 
   return count + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input));
@@ -24641,7 +24745,7 @@ int avx2_detect_encodings(const char * buf, size_t len) {
 
     checker check{};
 
-    while(buf + 64 <= end) {
+    while(end - buf >= 64) {
         __m256i in = _mm256_loadu_si256((__m256i*)buf);
         __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
 
@@ -24858,6 +24962,9 @@ int avx2_detect_encodings(const char * buf, size_t len) {
 */
 template <endianness big_endian>
 const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
+    if (simdutf_unlikely(size == 0)) {
+        return input;
+    }
     const char16_t* end = input + size;
 
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -24935,7 +25042,10 @@ const char16_t* avx2_validate_utf16(const char16_t* input, size_t size) {
 
 template <endianness big_endian>
 const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size) {
-    const char16_t* start = input;
+    if (simdutf_unlikely(size == 0)) {
+        return result(error_code::SUCCESS, 0);
+    }
+    const char16_t *start = input;
     const char16_t* end = input + size;
 
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -25016,6 +25126,9 @@ const result avx2_validate_utf16_with_errors(const char16_t* input, size_t size)
    - nullptr if an error was detected.
 */
 const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
+    if (simdutf_unlikely(size == 0)) {
+        return input;
+    }
     const char32_t* end = input + size;
 
     const __m256i standardmax = _mm256_set1_epi32(0x10ffff);
@@ -25045,6 +25158,9 @@ const char32_t* avx2_validate_utf32le(const char32_t* input, size_t size) {
 
 
 const result avx2_validate_utf32le_with_errors(const char32_t* input, size_t size) {
+    if (simdutf_unlikely(size == 0)) {
+        return result(error_code::SUCCESS, 0);
+    }
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -25084,7 +25200,7 @@ std::pair<const char *, char *> avx2_convert_latin1_to_utf8(const char *latin1_i
   const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
   const size_t safety_margin = 12;
 
-  while (latin1_input + 16 + safety_margin <= end) {
+  while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) {
     __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input);
     // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes
     const __m128i v_80 = _mm_set1_epi8((char)0x80);
@@ -25530,7 +25646,7 @@ std::pair<const char16_t *, char *>
 avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len,
                              char *latin1_output) {
   const char16_t *end = buf + len;
-  while (buf + 16 <= end) {
+  while (end - buf >= 16) {
     // Load 16 UTF-16 characters into 256-bit AVX2 register
     __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
 
@@ -25568,10 +25684,10 @@ avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len,
                                          char *latin1_output) {
   const char16_t *start = buf;
   const char16_t *end = buf + len;
-  while (buf + 16 <= end) {
+  while (end - buf >= 16) {
     __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
 
-    if (!big_endian) {
+    if (!match_system(big_endian)) {
       const __m256i swap = _mm256_setr_epi8(
           1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18,
           21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30);
@@ -25675,7 +25791,7 @@ std::pair<const char16_t*, char*> avx2_convert_utf16_to_utf8(const char16_t* buf
   const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
+  while (end -  buf >= std::ptrdiff_t(16 + safety_margin)) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
     if (big_endian) {
       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
@@ -25918,7 +26034,7 @@ std::pair<result, char*> avx2_convert_utf16_to_utf8_with_errors(const char16_t*
   const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080);
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
     if (big_endian) {
       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
@@ -26204,7 +26320,7 @@ std::pair<const char16_t*, char32_t*> avx2_convert_utf16_to_utf32(const char16_t
   const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
   const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
 
-  while (buf + 16 <= end) {
+  while (end - buf >= 16) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
     if (big_endian) {
       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
@@ -26272,7 +26388,7 @@ std::pair<result, char32_t*> avx2_convert_utf16_to_utf32_with_errors(const char1
   const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
   const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
 
-  while (buf + 16 <= end) {
+  while (end - buf >= 16) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
     if (big_endian) {
       const __m256i swap = _mm256_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14,
@@ -26435,7 +26551,7 @@ std::pair<const char32_t*, char*> avx2_convert_utf32_to_utf8(const char32_t* buf
 
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
     running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin);
@@ -26676,7 +26792,7 @@ std::pair<result, char*> avx2_convert_utf32_to_utf8_with_errors(const char32_t*
 
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
     __m256i nextin = _mm256_loadu_si256((__m256i*)buf+1);
     // Check for too large input
@@ -26912,7 +27028,7 @@ std::pair<const char32_t*, char16_t*> avx2_convert_utf32_to_utf16(const char32_t
   __m256i forbidden_bytemask = _mm256_setzero_si256();
 
 
-  while (buf + 8 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
     const __m256i v_00000000 = _mm256_setzero_si256();
@@ -26977,7 +27093,7 @@ std::pair<result, char16_t*> avx2_convert_utf32_to_utf16_with_errors(const char3
 
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 8 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(8 + safety_margin)) {
     __m256i in = _mm256_loadu_si256((__m256i*)buf);
 
     const __m256i v_00000000 = _mm256_setzero_si256();
@@ -32227,6 +32343,25 @@ simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, size_t l
     vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl);
     vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl);
 
+     /* remove prefix from leading bytes
+      *
+      * We could also use vrgather here, but it increases register pressure,
+      * and its performance varies widely on current platforms. It might be
+      * worth reconsidering, though, once there is more hardware available.
+      * Same goes for the __riscv_vsrl_vv_u32m4 correction step.
+      *
+      * We shift left and then right by the number of bits in the prefix,
+      * which can be calculated as follows:
+      *         x                                max(x-10, 0)
+      * 0xxx -> 0000-0111 -> shift by 0 or 1  -> 0
+      * 10xx -> 1000-1011 -> don't care
+      * 110x -> 1100,1101 -> shift by 3       -> 2,3
+      * 1110 -> 1110      -> shift by 4       -> 4
+      * 1111 -> 1111      -> shift by 5       -> 5
+      *
+      * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we
+      * just need to manually detect and handle the one special case:
+      */
     #define SIMDUTF_RVV_UTF8_TO_COMMON_M1(idx) \
       vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx); \
       vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx); \
@@ -32235,26 +32370,7 @@ simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, size_t l
       /* remove prefix from trailing bytes */ \
       c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut); \
       c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut); \
-      c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut); \
-      /* remove prefix from leading bytes
-       *
-       * We could also use vrgather here, but it increases register pressure,
-       * and its performance varies widely on current platforms. It might be
-       * worth reconsidering, though, once there is more hardware available.
-       * Same goes for the __riscv_vsrl_vv_u32m4 correction step.
-       *
-       * We shift left and then right by the number of bytes in the prefix,
-       * which can be calculated as follows:
-       *         x                                max(x-10, 0)
-       * 0xxx -> 0000-0111 -> sift by 0 or 1   -> 0
-       * 10xx -> 1000-1011 -> don't care
-       * 110x -> 1100,1101 -> sift by 3        -> 2,3
-       * 1110 -> 1110      -> sift by 4        -> 4
-       * 1111 -> 1111      -> sift by 5        -> 5
-       *
-       * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we
-       * just need to manually detect and handle the one special case:
-       */ \
+      c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut);  \
       vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut); \
       shift = __riscv_vmerge_vxm_u8m1(__riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3, __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), vlOut); \
       c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut); \
@@ -32314,23 +32430,41 @@ simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, size_t l
 
 simdutf_warn_unused size_t implementation::convert_utf8_to_latin1(const char *src, size_t len, char *dst) const noexcept {
   const char *beg = dst;
-  uint8_t last = 0b10000000;
+  uint8_t last = 0;
   for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut, last = src[-1]) {
     vl = __riscv_vsetvl_e8m2(len);
     vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t*)src, vl);
-    vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
-    vlOut = __riscv_vcpop_m_b4(m, vl);
-    if (vlOut != vl || last > 0b01111111) {
+    // check which bytes are ASCII
+    vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
+    // count ASCII bytes
+    vlOut = __riscv_vcpop_m_b4(ascii, vl);
+    // The original code only entered the next block after this check:
+    //   vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
+    //   vlOut = __riscv_vcpop_m_b4(m, vl);
+    //   if (vlOut != vl || last > 0b01111111) {...}
+    // So when everything was ASCII or continuation bytes, it proceeded
+    // without any processing, going straight to __riscv_vse8_v_u8m2.
+    // But we need the __riscv_vslide1up_vx_u8m2 whenever there is a non-ASCII byte.
+    if (vlOut != vl) { // If not pure ASCII
+      // Non-ASCII characters
+      // We now want to mark the ascii and continuation bytes
+      vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
+      // We count them, that's our new vlOut (output vector length)
+      vlOut = __riscv_vcpop_m_b4(m, vl);
+
       vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl);
 
       vbool4_t leading0  = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b10111111, vl);
       vbool4_t trailing1 = __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), (uint8_t)0b11000000, vl);
+      // -62 is 0b11000010, so we check whether any of v0 is too big
       vbool4_t tobig = __riscv_vmand_mm_b4(leading0, __riscv_vmsgtu_vx_u8m2_b4(__riscv_vxor_vx_u8m2(v0, (uint8_t)-62, vl), 1, vl), vl);
       if (__riscv_vfirst_m_b4(__riscv_vmor_mm_b4(tobig, __riscv_vmxor_mm_b4(leading0, trailing1, vl), vl), vl) >= 0)
         return 0;
 
       v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), v1, v1, 0b01000000, vl);
       v1 = __riscv_vcompress_vm_u8m2(v1, m, vl);
+    } else if (last >= 0b11000000) { // If last byte is a leading  byte and we got only ASCII, error!
+      return 0;
     }
     __riscv_vse8_v_u8m2((uint8_t*)dst, v1, vlOut);
   }
@@ -32347,13 +32481,15 @@ simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors(co
 
 simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1(const char *src, size_t len, char *dst) const noexcept {
   const char *beg = dst;
-  uint8_t last = 0b11000000;
+  uint8_t last = 0;
   for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut, last = src[-1]) {
     vl = __riscv_vsetvl_e8m2(len);
     vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t*)src, vl);
-    vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
-    vlOut = __riscv_vcpop_m_b4(m, vl);
-    if (vlOut != vl || last > 0b01111111) {
+    vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl);
+    vlOut = __riscv_vcpop_m_b4(ascii, vl);
+    if (vlOut != vl) { // If not pure ASCII
+      vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl);
+      vlOut = __riscv_vcpop_m_b4(m, vl);
       vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl);
       v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), v1, v1, 0b01000000, vl);
       v1 = __riscv_vcompress_vm_u8m2(v1, m, vl);
@@ -33240,7 +33376,7 @@ int sse_detect_encodings(const char * buf, size_t len) {
 
     checker check{};
 
-    while(buf + 64 <= end) {
+    while(end - buf >= 64) {
         __m128i in = _mm_loadu_si128((__m128i*)buf);
         __m128i secondin = _mm_loadu_si128((__m128i*)buf+1);
         __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
@@ -33477,6 +33613,9 @@ int sse_detect_encodings(const char * buf, size_t len) {
 */
 template <endianness big_endian>
 const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
+    if (simdutf_unlikely(size == 0)) {
+        return input;
+    }
     const char16_t* end = input + size;
 
     const auto v_d8 = simd8<uint8_t>::splat(0xd8);
@@ -33553,6 +33692,9 @@ const char16_t* sse_validate_utf16(const char16_t* input, size_t size) {
 
 template <endianness big_endian>
 const result sse_validate_utf16_with_errors(const char16_t* input, size_t size) {
+    if (simdutf_unlikely(size == 0)) {
+        return result(error_code::SUCCESS, 0);
+    }
     const char16_t* start = input;
     const char16_t* end = input + size;
 
@@ -33634,6 +33776,9 @@ const result sse_validate_utf16_with_errors(const char16_t* input, size_t size)
    - nullptr if an error was detected.
 */
 const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
+    if (size == 0) {
+        return input;
+    }
     const char32_t* end = input + size;
 
     const __m128i standardmax = _mm_set1_epi32(0x10ffff);
@@ -33663,6 +33808,9 @@ const char32_t* sse_validate_utf32le(const char32_t* input, size_t size) {
 
 
 const result sse_validate_utf32le_with_errors(const char32_t* input, size_t size) {
+    if (size == 0) {
+        return result(error_code::SUCCESS, 0);
+    }
     const char32_t* start = input;
     const char32_t* end = input + size;
 
@@ -33732,7 +33880,7 @@ std::pair<const char* const, char* const> sse_convert_latin1_to_utf8(
   // slow path writes useful 8-15 bytes twice (eagerly writes 16 bytes and then adjust the pointer)
   // so the last write can exceed the utf8_output size by 8-1 bytes
   // by reserving 8 extra input bytes, we expect the output to have 8-16 bytes free
-  while (latin_input + 16 + 8 <= end) {
+  while (end - latin_input >= 16 + 8) {
     // Load 16 Latin1 characters (16 bytes) into a 128-bit register
     __m128i v_latin = _mm_loadu_si128((__m128i*)latin_input);
 
@@ -33757,7 +33905,7 @@ std::pair<const char* const, char* const> sse_convert_latin1_to_utf8(
     latin_input += 16;
   }
 
-  if (latin_input + 16 <= end) {
+  if (end - latin_input >= 16) {
     // Load 16 Latin1 characters (16 bytes) into a 128-bit register
     __m128i v_latin = _mm_loadu_si128((__m128i*)latin_input);
 
@@ -33801,7 +33949,7 @@ std::pair<const char*, char16_t*> sse_convert_latin1_to_utf16(const char *latin1
 std::pair<const char*, char32_t*> sse_convert_latin1_to_utf32(const char* buf, size_t len, char32_t* utf32_output) {
     const char* end = buf + len;
 
-    while (buf + 16 <= end) {
+    while (end - buf >= 16) {
         // Load 16 Latin1 characters (16 bytes) into a 128-bit register
         __m128i in = _mm_loadu_si128((__m128i*)buf);
 
@@ -34206,7 +34354,7 @@ size_t convert_masked_utf8_to_latin1(const char *input,
 template <endianness big_endian>
 std::pair<const char16_t*, char*> sse_convert_utf16_to_latin1(const char16_t* buf, size_t len, char* latin1_output) {
   const char16_t* end = buf + len;
-  while (buf + 8 <= end) {
+  while (end - buf >= 8) {
     // Load 8 UTF-16 characters into 128-bit SSE register
     __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buf));
 
@@ -34234,10 +34382,10 @@ template <endianness big_endian>
 std::pair<result, char*> sse_convert_utf16_to_latin1_with_errors(const char16_t* buf, size_t len, char* latin1_output) {
   const char16_t* start = buf;
   const char16_t* end = buf + len;
-  while (buf + 8 <= end) {
+  while (end - buf >= 8) {
     __m128i in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(buf));
 
-    if (!big_endian) {
+    if (!match_system(big_endian)) {
       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
       in = _mm_shuffle_epi8(in, swap);
     }
@@ -34328,7 +34476,7 @@ std::pair<const char16_t*, char*> sse_convert_utf16_to_utf8(const char16_t* buf,
   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
     __m128i in = _mm_loadu_si128((__m128i*)buf);
     if (big_endian) {
       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
@@ -34532,7 +34680,7 @@ std::pair<result, char*> sse_convert_utf16_to_utf8_with_errors(const char16_t* b
   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
     __m128i in = _mm_loadu_si128((__m128i*)buf);
     if (big_endian) {
       const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14);
@@ -34780,7 +34928,7 @@ std::pair<const char16_t*, char32_t*> sse_convert_utf16_to_utf32(const char16_t*
   const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
 
-  while (buf + 8 <= end) {
+  while (end - buf >= 8) {
     __m128i in = _mm_loadu_si128((__m128i*)buf);
 
     if (big_endian) {
@@ -34848,7 +34996,7 @@ std::pair<result, char32_t*> sse_convert_utf16_to_utf32_with_errors(const char16
   const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800);
   const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800);
 
-  while (buf + 8 <= end) {
+  while (end - buf >= 8) {
     __m128i in = _mm_loadu_si128((__m128i*)buf);
 
     if (big_endian) {
@@ -34996,7 +35144,7 @@ std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf,
   __m128i forbidden_bytemask = _mm_setzero_si128();
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) { //buf is a char32_t pointer, each char32_t has 4 bytes or 32 bits, thus buf + 16 * char_32t = 512 bits = 64 bytes
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { // buf is a char32_t pointer, each char32_t has 4 bytes or 32 bits, thus buf + 16 * char_32t = 512 bits = 64 bytes
     // We load two 16 bytes registers for a total of 32 bytes or 16 characters.
     __m128i in = _mm_loadu_si128((__m128i*)buf);
     __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);//These two values can hold only 8 UTF32 chars
@@ -35243,7 +35391,6 @@ std::pair<const char32_t*, char*> sse_convert_utf32_to_utf8(const char32_t* buf,
 
 
 std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* buf, size_t len, char* utf8_output) {
-
   const char32_t* end = buf + len;
   const char32_t* start = buf;
 
@@ -35257,11 +35404,10 @@ std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* b
 
   const size_t safety_margin = 12; // to avoid overruns, see issue https://github.com/simdutf/simdutf/issues/92
 
-  while (buf + 16 + safety_margin <= end) {
-    // We load two 16 bytes registers for a total of 32 bytes or 16 characters.
+  while (end - buf >= std::ptrdiff_t(16 + safety_margin)) {
+    // We load two 16 bytes registers for a total of 32 bytes or 8 characters.
     __m128i in = _mm_loadu_si128((__m128i*)buf);
     __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
-
     // Check for too large input
     __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff);
     if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) {
@@ -35275,14 +35421,6 @@ std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* b
 
     // Check for ASCII fast path
     if(_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!!
-      // We eagerly load another 32 bytes, hoping that they will be ASCII too.
-      // The intuition is that we try to collect 16 ASCII characters which requires
-      // a total of 64 bytes of input. If we fail, we just pass thirdin and fourthin
-      // as our new inputs.
-      __m128i thirdin = _mm_loadu_si128((__m128i*)buf+2);
-      __m128i fourthin = _mm_loadu_si128((__m128i*)buf+3);
-      __m128i nextin_16 = _mm_packus_epi32(_mm_and_si128(thirdin, v_7fffffff), _mm_and_si128(fourthin, v_7fffffff));
-      if(!_mm_testz_si128(nextin_16, v_ff80)) {
         // 1. pack the bytes
         // obviously suboptimal.
         const __m128i utf8_packed = _mm_packus_epi16(in_16,in_16);
@@ -35291,25 +35429,7 @@ std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* b
         // 3. adjust pointers
         buf += 8;
         utf8_output += 8;
-        // Proceed with next input
-        in_16 = nextin_16;
-        __m128i next_max_input = _mm_max_epu32(_mm_max_epu32(thirdin, fourthin), v_10ffff);
-        if(static_cast<uint16_t>(_mm_movemask_epi8(_mm_cmpeq_epi32(next_max_input, v_10ffff))) != 0xffff) {
-          return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output);
-        }
-        // We need to update in and nextin because they are used later.
-        in = thirdin;
-        nextin = fourthin;
-      } else {
-        // 1. pack the bytes
-        const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16);
-        // 2. store (16 bytes)
-        _mm_storeu_si128((__m128i*)utf8_output, utf8_packed);
-        // 3. adjust pointers
-        buf += 16;
-        utf8_output += 16;
-        continue; // we are done for this round!
-      }
+        continue;
     }
 
     // no bits set above 7th bit
@@ -35491,7 +35611,6 @@ std::pair<result, char*> sse_convert_utf32_to_utf8_with_errors(const char32_t* b
       buf += k;
     }
   } // while
-
   return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output);
 }
 /* end file src/westmere/sse_convert_utf32_to_utf8.cpp */
@@ -35505,7 +35624,7 @@ std::pair<const char32_t*, char16_t*> sse_convert_utf32_to_utf16(const char32_t*
   const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
   __m128i forbidden_bytemask = _mm_setzero_si128();
 
-  while (buf + 8 <= end) {
+  while (end - buf >= 8) {
     __m128i in = _mm_loadu_si128((__m128i*)buf);
     __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
     const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
@@ -35571,7 +35690,7 @@ std::pair<result, char16_t*> sse_convert_utf32_to_utf16_with_errors(const char32
   const __m128i v_0000 = _mm_setzero_si128();
   const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000);
 
-  while (buf + 8 <= end) {
+  while (end - buf >= 8) {
     __m128i in = _mm_loadu_si128((__m128i*)buf);
     __m128i nextin = _mm_loadu_si128((__m128i*)buf+1);
     const __m128i saturation_bytemask = _mm_cmpeq_epi32(_mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000);
diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h
index 3b206d69bcb034..b656861346e927 100644
--- a/deps/simdutf/simdutf.h
+++ b/deps/simdutf/simdutf.h
@@ -1,4 +1,4 @@
-/* auto-generated on 2024-07-31 17:22:10 -0400. Do not edit! */
+/* auto-generated on 2024-08-09 09:52:29 -0400. Do not edit! */
 /* begin file include/simdutf.h */
 #ifndef SIMDUTF_H
 #define SIMDUTF_H
@@ -433,8 +433,11 @@
   #define SIMDUTF_POP_DISABLE_WARNINGS __pragma(warning( pop ))
 
 #else // SIMDUTF_REGULAR_VISUAL_STUDIO
-
+#if defined(__OPTIMIZE__) || defined(NDEBUG)
   #define simdutf_really_inline inline __attribute__((always_inline))
+#else
+  #define simdutf_really_inline inline
+#endif
   #define simdutf_never_inline inline __attribute__((noinline))
 
   #define simdutf_unused __attribute__((unused))
@@ -594,7 +597,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS
 #define SIMDUTF_SIMDUTF_VERSION_H
 
 /** The version of simdutf being used (major.minor.revision) */
-#define SIMDUTF_VERSION "5.3.1"
+#define SIMDUTF_VERSION "5.3.4"
 
 namespace simdutf {
 enum {
@@ -609,7 +612,7 @@ enum {
   /**
    * The revision (major.minor.REVISION) of simdutf being used.
    */
-  SIMDUTF_VERSION_REVISION = 1
+  SIMDUTF_VERSION_REVISION = 4
 };
 } // namespace simdutf
 
@@ -1314,7 +1317,7 @@ simdutf_warn_unused result convert_utf8_to_utf32_with_errors(const char * input,
     /**
    * Convert valid UTF-8 string into latin1 string.
    *
-   * This function assumes that the input string is valid UTF-8.
+   * This function assumes that the input string is valid UTF-8 and that it can be represented as Latin1.
    *
    * This function is not BOM-aware.
    *
@@ -1387,7 +1390,8 @@ simdutf_warn_unused size_t utf8_length_from_latin1(const char * input, size_t le
 /**
  * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-8 strings but in such cases
+ * the result is implementation defined.
  *
  * This function is not BOM-aware.
  *
@@ -1400,7 +1404,8 @@ simdutf_warn_unused size_t latin1_length_from_utf8(const char * input, size_t le
 /**
  * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format.
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-8 strings but in such cases
+ * the result is implementation defined.
  *
  * This function is not BOM-aware.
  *
@@ -1415,7 +1420,8 @@ simdutf_warn_unused size_t utf16_length_from_utf8(const char * input, size_t len
  *
  * This function is equivalent to count_utf8
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-8 strings but in such cases
+ * the result is implementation defined.
  *
  * This function is not BOM-aware.
  *
@@ -1628,7 +1634,7 @@ simdutf_warn_unused size_t convert_valid_utf16_to_utf8(const char16_t * input, s
 /**
  * Using native endianness, convert UTF-16 string into Latin1 string.
  *
- * This function assumes that the input string is valid UTF-8.
+ * This function assumes that the input string is valid UTF-16 and that it can be represented as Latin1.
  *
  * This function is not BOM-aware.
  *
@@ -1642,7 +1648,7 @@ simdutf_warn_unused size_t convert_valid_utf16_to_latin1(const char16_t * input,
 /**
  * Convert valid UTF-16LE string into Latin1 string.
  *
- * This function assumes that the input string is valid UTF-16LE.
+ * This function assumes that the input string is valid UTF-16LE and that it can be represented as Latin1.
  *
  * This function is not BOM-aware.
  *
@@ -1656,7 +1662,7 @@ simdutf_warn_unused size_t convert_valid_utf16le_to_latin1(const char16_t * inpu
 /**
  * Convert valid UTF-16BE string into Latin1 string.
  *
- * This function assumes that the input string is valid UTF-16BE.
+ * This function assumes that the input string is valid UTF-16BE and that it can be represented as Latin1.
  *
  * This function is not BOM-aware.
  *
@@ -1671,7 +1677,7 @@ simdutf_warn_unused size_t convert_valid_utf16be_to_latin1(const char16_t * inpu
 /**
  * Convert valid UTF-16LE string into UTF-8 string.
  *
- * This function assumes that the input string is valid UTF-16LE.
+ * This function assumes that the input string is valid UTF-16LE.
  *
  * This function is not BOM-aware.
  *
@@ -1833,7 +1839,8 @@ simdutf_warn_unused size_t convert_valid_utf16be_to_utf32(const char16_t * input
 /*
  * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
  *
  * This function is not BOM-aware.
  *
@@ -1847,7 +1854,8 @@ simdutf_warn_unused size_t latin1_length_from_utf16(size_t length) noexcept;
  * Using native endianness; Compute the number of bytes that this UTF-16
  * string would require in UTF-8 format.
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
  *
  * @param input         the UTF-16 string to convert
  * @param length        the length of the string in 2-byte code units (char16_t)
@@ -1858,7 +1866,8 @@ simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t * input, size_t
 /**
  * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
  *
  * @param input         the UTF-16LE string to convert
  * @param length        the length of the string in 2-byte code units (char16_t)
@@ -1869,7 +1878,8 @@ simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t * input, size
 /**
  * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
  *
  * @param input         the UTF-16BE string to convert
  * @param length        the length of the string in 2-byte code units (char16_t)
@@ -1986,7 +1996,7 @@ simdutf_warn_unused result convert_utf32_to_latin1_with_errors(const char32_t *
 /**
  * Convert valid UTF-32 string into Latin1 string.
  *
- * This function assumes that the input string is valid UTF-32.
+ * This function assumes that the input string is valid UTF-32 and that it can be represented as Latin1.
  *
  * This function is not BOM-aware.
  *
@@ -2117,7 +2127,8 @@ void change_endianness_utf16(const char16_t * input, size_t length, char16_t * o
 /**
  * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-32 strings but in such cases
+ * the result is implementation defined.
  *
  * @param input         the UTF-32 string to convert
  * @param length        the length of the string in 4-byte code units (char32_t)
@@ -2128,7 +2139,8 @@ simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t * input, size_t
 /**
  * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format.
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-32 strings but in such cases
+ * the result is implementation defined.
  *
  * @param input         the UTF-32 string to convert
  * @param length        the length of the string in 4-byte code units (char32_t)
@@ -2142,7 +2154,8 @@ simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t * input, size_
  *
  * This function is equivalent to count_utf16.
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
  *
  * This function is not BOM-aware.
  *
@@ -2157,7 +2170,8 @@ simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t * input, size_
  *
  * This function is equivalent to count_utf16le.
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
  *
  * This function is not BOM-aware.
  *
@@ -2172,7 +2186,8 @@ simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t * input, siz
  *
  * This function is equivalent to count_utf16be.
  *
- * This function does not validate the input.
+ * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
  *
  * This function is not BOM-aware.
  *
@@ -2187,6 +2202,8 @@ simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t * input, siz
  * it is valid.
  *
  * This function assumes that the input string is valid UTF-16 (native endianness).
+ * It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
  *
  * This function is not BOM-aware.
  *
@@ -2201,6 +2218,8 @@ simdutf_warn_unused size_t count_utf16(const char16_t * input, size_t length) no
  * it is valid.
  *
  * This function assumes that the input string is valid UTF-16LE.
+ * It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
  *
  * This function is not BOM-aware.
  *
@@ -2215,6 +2234,8 @@ simdutf_warn_unused size_t count_utf16le(const char16_t * input, size_t length)
  * it is valid.
  *
  * This function assumes that the input string is valid UTF-16BE.
+ * It is acceptable to pass invalid UTF-16 strings but in such cases
+ * the result is implementation defined.
  *
  * This function is not BOM-aware.
  *
@@ -2229,6 +2250,8 @@ simdutf_warn_unused size_t count_utf16be(const char16_t * input, size_t length)
  * it is valid.
  *
  * This function assumes that the input string is valid UTF-8.
+ * It is acceptable to pass invalid UTF-8 strings but in such cases
+ * the result is implementation defined.
  *
  * @param input         the UTF-8 string to process
  * @param length        the length of the string in bytes
@@ -2739,10 +2762,10 @@ class implementation {
    */
   simdutf_warn_unused virtual result convert_utf8_to_latin1_with_errors(const char * input, size_t length, char* latin1_output) const noexcept = 0;
 
-    /**
+  /**
    * Convert valid UTF-8 string into latin1 string.
    *
-   * This function assumes that the input string is valid UTF-8.
+   * This function assumes that the input string is valid UTF-8 and that it can be represented as Latin1.
    *
    * This function is not BOM-aware.
    *
@@ -2871,7 +2894,8 @@ class implementation {
   /**
    * Compute the number of 2-byte code units that this UTF-8 string would require in UTF-16LE format.
    *
-   * This function does not validate the input.
+   * This function does not validate the input. It is acceptable to pass invalid UTF-8 strings but in such cases
+   * the result is implementation defined.
    *
    * @param input         the UTF-8 string to process
    * @param length        the length of the string in bytes
@@ -2882,7 +2906,8 @@ class implementation {
    /**
    * Compute the number of 4-byte code units that this UTF-8 string would require in UTF-32 format.
    *
-   * This function is equivalent to count_utf8.
+   * This function is equivalent to count_utf8. It is acceptable to pass invalid UTF-8 strings but in such cases
+   * the result is implementation defined.
    *
    * This function does not validate the input.
    *
@@ -2957,7 +2982,7 @@ class implementation {
   /**
    * Convert valid UTF-16LE string into Latin1 string.
    *
-   * This function assumes that the input string is valid UTF-8.
+   * This function assumes that the input string is valid UTF-16LE and that it can be represented as Latin1.
 
    * This function is not BOM-aware.
    *
@@ -2971,7 +2996,7 @@ class implementation {
   /**
    * Convert valid UTF-16BE string into Latin1 string.
    *
-   * This function assumes that the input string is valid UTF-8.
+   * This function assumes that the input string is valid UTF-16BE and that it can be represented as Latin1.
    *
    * This function is not BOM-aware.
    *
@@ -3161,7 +3186,8 @@ class implementation {
   /**
    * Compute the number of bytes that this UTF-16LE string would require in UTF-8 format.
    *
-   * This function does not validate the input.
+   * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+   * the result is implementation defined.
    *
    * This function is not BOM-aware.
    *
@@ -3174,7 +3200,8 @@ class implementation {
   /**
    * Compute the number of bytes that this UTF-16BE string would require in UTF-8 format.
    *
-   * This function does not validate the input.
+   * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+   * the result is implementation defined.
    *
    * This function is not BOM-aware.
    *
@@ -3275,7 +3302,7 @@ class implementation {
   simdutf_warn_unused virtual size_t convert_valid_utf32_to_utf8(const char32_t * input, size_t length, char* utf8_buffer) const noexcept = 0;
 
 
-    /**
+  /**
    * Return the number of bytes that this UTF-16 string would require in Latin1 format.
    *
    *
@@ -3399,7 +3426,8 @@ class implementation {
   /**
    * Compute the number of bytes that this UTF-32 string would require in UTF-8 format.
    *
-   * This function does not validate the input.
+   * This function does not validate the input. It is acceptable to pass invalid UTF-32 strings but in such cases
+   * the result is implementation defined.
    *
    * @param input         the UTF-32 string to convert
    * @param length        the length of the string in 4-byte code units (char32_t)
@@ -3410,7 +3438,8 @@ class implementation {
   /**
    * Compute the number of bytes that this UTF-32 string would require in Latin1 format.
    *
-   * This function does not validate the input.
+   * This function does not validate the input. It is acceptable to pass invalid UTF-32 strings but in such cases
+   * the result is implementation defined.
    *
    * @param length        the length of the string in 4-byte code units (char32_t)
    * @return the number of bytes required to encode the UTF-32 string as Latin1
@@ -3420,7 +3449,8 @@ class implementation {
   /**
    * Compute the number of bytes that this UTF-8 string would require in Latin1 format.
    *
-   * This function does not validate the input.
+   * This function does not validate the input. It is acceptable to pass invalid UTF-8 strings but in such cases
+   * the result is implementation defined.
    *
    * @param input         the UTF-8 string to convert
    * @param length        the length of the string in byte
@@ -3431,7 +3461,8 @@ class implementation {
   /*
    * Compute the number of bytes that this UTF-16LE/BE string would require in Latin1 format.
    *
-   * This function does not validate the input.
+   * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+   * the result is implementation defined.
    *
    * This function is not BOM-aware.
    *
@@ -3444,7 +3475,8 @@ class implementation {
   /**
    * Compute the number of two-byte code units that this UTF-32 string would require in UTF-16 format.
    *
-   * This function does not validate the input.
+   * This function does not validate the input. It is acceptable to pass invalid UTF-32 strings but in such cases
+   * the result is implementation defined.
    *
    * @param input         the UTF-32 string to convert
    * @param length        the length of the string in 4-byte code units (char32_t)
@@ -3453,11 +3485,9 @@ class implementation {
   simdutf_warn_unused virtual size_t utf16_length_from_utf32(const char32_t * input, size_t length) const noexcept = 0;
 
 
-    /**
+  /**
    * Return the number of bytes that this UTF-32 string would require in Latin1 format.
    *
-   * This function does not validate the input.
-   *
    * @param input         the UTF-32 string to convert
    * @param length        the length of the string in 4-byte code units (char32_t)
    * @return the number of bytes required to encode the UTF-32 string as Latin1
@@ -3469,7 +3499,8 @@ class implementation {
    *
    * This function is equivalent to count_utf16le.
    *
-   * This function does not validate the input.
+   * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+   * the result is implementation defined.
    *
    * This function is not BOM-aware.
    *
@@ -3484,7 +3515,8 @@ class implementation {
    *
    * This function is equivalent to count_utf16be.
    *
-   * This function does not validate the input.
+   * This function does not validate the input. It is acceptable to pass invalid UTF-16 strings but in such cases
+   * the result is implementation defined.
    *
    * This function is not BOM-aware.
    *
@@ -3499,6 +3531,8 @@ class implementation {
    * it is valid.
    *
    * This function assumes that the input string is valid UTF-16LE.
+   * It is acceptable to pass invalid UTF-16 strings but in such cases
+   * the result is implementation defined.
    *
    * This function is not BOM-aware.
    *
@@ -3513,6 +3547,8 @@ class implementation {
    * it is valid.
    *
    * This function assumes that the input string is valid UTF-16BE.
+   * It is acceptable to pass invalid UTF-16 strings but in such cases
+   * the result is implementation defined.
    *
    * This function is not BOM-aware.
    *
@@ -3528,6 +3564,8 @@ class implementation {
    * it is valid.
    *
    * This function assumes that the input string is valid UTF-8.
+   * It is acceptable to pass invalid UTF-8 strings but in such cases
+   * the result is implementation defined.
    *
    * @param input         the UTF-8 string to process
    * @param length        the length of the string in bytes
@@ -3538,7 +3576,8 @@ class implementation {
   /**
    * Provide the maximal binary length in bytes given the base64 input.
    * In general, if the input contains ASCII spaces, the result will be less than
-   * the maximum length.
+   * the maximum length. It is acceptable to pass invalid base64 strings but in such cases
+   * the result is implementation defined.
    *
    * @param input         the base64 input to process
    * @param length        the length of the base64 input in bytes
@@ -3549,7 +3588,8 @@ class implementation {
   /**
    * Provide the maximal binary length in bytes given the base64 input.
    * In general, if the input contains ASCII spaces, the result will be less than
-   * the maximum length.
+   * the maximum length. It is acceptable to pass invalid base64 strings but in such cases
+   * the result is implementation defined.
    *
    * @param input         the base64 input to process, in ASCII stored as 16-bit units
    * @param length        the length of the base64 input in 16-bit units