diff --git a/.github/label-pr-config.yml b/.github/label-pr-config.yml index ea3f1d145e414c..809ec49eb029e7 100644 --- a/.github/label-pr-config.yml +++ b/.github/label-pr-config.yml @@ -125,6 +125,8 @@ exlusiveLabels: /^test\/es-module\//: test, esm /^test\/fixtures\/wpt\/streams\//: test, web streams /^test\/fixtures\/typescript/: test, strip-types + /^test\/module-hooks\//: test, module, loaders + /^test\/fixtures/module-hooks\//: test, module, loaders /^test\//: test diff --git a/.github/workflows/create-release-proposal.yml b/.github/workflows/create-release-proposal.yml index d3ffa3ad49b5e2..f3add22090cbc0 100644 --- a/.github/workflows/create-release-proposal.yml +++ b/.github/workflows/create-release-proposal.yml @@ -1,5 +1,4 @@ # This action requires the following secrets to be set on the repository: -# GH_USER_NAME: GitHub user whose Jenkins and GitHub token are defined below # GH_USER_TOKEN: GitHub user token, to be used by ncu and to push changes name: Create Release Proposal @@ -52,20 +51,18 @@ jobs: run: | ncu-config set branch "${RELEASE_BRANCH}" ncu-config set upstream origin - ncu-config set username "$USERNAME" + ncu-config set username "$GITHUB_ACTOR" ncu-config set token "$GH_TOKEN" ncu-config set repo "$(echo "$GITHUB_REPOSITORY" | cut -d/ -f2)" ncu-config set owner "${GITHUB_REPOSITORY_OWNER}" env: - USERNAME: ${{ secrets.JENKINS_USER }} GH_TOKEN: ${{ github.token }} - name: Set up ghauth config (Ubuntu) run: | mkdir -p "${XDG_CONFIG_HOME:-~/.config}/changelog-maker" - echo '{}' | jq '{user: env.USERNAME, token: env.TOKEN}' > "${XDG_CONFIG_HOME:-~/.config}/changelog-maker/config.json" + echo '{}' | jq '{user: env.GITHUB_ACTOR, token: env.TOKEN}' > "${XDG_CONFIG_HOME:-~/.config}/changelog-maker/config.json" env: - USERNAME: ${{ secrets.JENKINS_USER }} TOKEN: ${{ github.token }} - name: Setup git author @@ -78,7 +75,7 @@ jobs: run: | git update-index --assume-unchanged tools/actions/create-release.sh curl -fsSLo tools/actions/create-release.sh https://github.com/${GITHUB_REPOSITORY}/raw/${GITHUB_SHA}/tools/actions/create-release.sh - ./tools/actions/create-release.sh "${RELEASE_DATE}" "${RELEASE_LINE}" + ./tools/actions/create-release.sh "${RELEASE_DATE}" "${RELEASE_LINE}" "${GITHUB_ACTOR}" env: GH_TOKEN: ${{ github.token }} # We want the bot to push the push the release commit so CI runs on it. diff --git a/.github/workflows/lint-release-proposal.yml b/.github/workflows/lint-release-proposal.yml index bc2ac2d0127865..1ea2b4b1b173e2 100644 --- a/.github/workflows/lint-release-proposal.yml +++ b/.github/workflows/lint-release-proposal.yml @@ -43,7 +43,7 @@ jobs: PR_HEAD="$(gh pr view "$PR_URL" --json headRefOid -q .headRefOid)" echo "Head of $PR_URL: $PR_HEAD" echo "Current commit: $GITHUB_SHA" - [[ "$PR_HEAD" == "$GITHUB_SHA" ]] + [ "$PR_HEAD" = "$GITHUB_SHA" ] env: GH_TOKEN: ${{ github.token }} - name: Validate CHANGELOG @@ -53,7 +53,10 @@ jobs: echo "Expected CHANGELOG section title: $EXPECTED_CHANGELOG_TITLE_INTRO" CHANGELOG_TITLE="$(grep "$EXPECTED_CHANGELOG_TITLE_INTRO" "doc/changelogs/CHANGELOG_V${COMMIT_SUBJECT:20:2}.md")" echo "Actual: $CHANGELOG_TITLE" - [[ "${CHANGELOG_TITLE%@*}@" == "$EXPECTED_CHANGELOG_TITLE_INTRO" ]] + [ "${CHANGELOG_TITLE%%@*}@" = "$EXPECTED_CHANGELOG_TITLE_INTRO" ] - name: Verify NODE_VERSION_IS_RELEASE bit is correctly set run: | grep -q '^#define NODE_VERSION_IS_RELEASE 1$' src/node_version.h + - name: Check for placeholders in documentation + run: | + ! 
grep "REPLACEME" doc/api/*.md diff --git a/.github/workflows/major-release.yml b/.github/workflows/major-release.yml new file mode 100644 index 00000000000000..a90be1798fac85 --- /dev/null +++ b/.github/workflows/major-release.yml @@ -0,0 +1,48 @@ +name: Major Release + +on: + schedule: + - cron: 0 0 15 2,8 * # runs at midnight UTC every 15 February and 15 August + +permissions: + contents: read + +jobs: + create-issue: + runs-on: ubuntu-latest + permissions: + issues: write + steps: + - name: Check for release schedule + id: check-date + run: | + # Get the current month and day + MONTH=$(date +'%m') + DAY=$(date +'%d') + # We'll create the reminder issue two months prior the release + if [[ "$MONTH" == "02" || "$MONTH" == "08" ]] && [[ "$DAY" == "15" ]]; then + echo "create_issue=true" >> "$GITHUB_ENV" + fi + - name: Retrieve next major release info from nodejs/Release + if: env.create_issue == 'true' + run: | + curl -L https://github.com/nodejs/Release/raw/HEAD/schedule.json | \ + jq -r 'to_entries | map(select(.value.start | strptime("%Y-%m-%d") | mktime > now)) | first | "VERSION=" + .key + "\nRELEASE_DATE=" + .value.start' >> "$GITHUB_ENV" + - name: Compute max date for landing semver-major PRs + if: env.create_issue == 'true' + run: | + echo "PR_MAX_DATE=$(date -d "$RELEASE_DATE -1 month" +%Y-%m-%d)" >> "$GITHUB_ENV" + - name: Create release announcement issue + if: env.create_issue == 'true' + run: | + gh issue create --repo "${GITHUB_REPOSITORY}" \ + --title "Upcoming Node.js Major Release ($VERSION)" \ + --body-file -< temp-output - # cat temp-output - # tail -n1 temp-output | grep "NEW_VERSION=" >> "$GITHUB_ENV" || true - # rm temp-output + - id: libuv + subsystem: deps + label: dependencies + run: | + ./tools/dep_updaters/update-libuv.sh > temp-output + cat temp-output + tail -n1 temp-output | grep "NEW_VERSION=" >> "$GITHUB_ENV" || true + rm temp-output - id: llhttp subsystem: deps label: dependencies diff --git a/BUILDING.md b/BUILDING.md index 1ccd594f00b311..ee42aea5401fc1 100644 --- a/BUILDING.md +++ b/BUILDING.md @@ -683,7 +683,7 @@ easily. These files will install the following To install Node.js prerequisites from Powershell Terminal: ```powershell -winget configure .\configuration.dsc.yaml +winget configure .\configurations\configuration.dsc.yaml ``` Alternatively, you can use [Dev Home](https://learn.microsoft.com/en-us/windows/dev-home/) diff --git a/CHANGELOG.md b/CHANGELOG.md index ced3dfd153426c..0a7bf83dc44961 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,7 +39,8 @@ release. -23.4.0
+23.5.0
+23.4.0
23.3.0
23.2.0
23.1.0
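
Note on the `lint-release-proposal.yml` hunk above, which switches the CHANGELOG title check from `${CHANGELOG_TITLE%@*}` to `${CHANGELOG_TITLE%%@*}`: `%` strips the shortest matching suffix (cutting at the last `@`) while `%%` strips the longest (cutting at the first `@`), which only differs once the grepped title contains more than one `@`. A minimal sketch of that difference; the sample title below is invented purely for illustration:

```bash
title='## 2024-12-19, Version 23.5.0 (Current), @releaser, prepared by @another'

echo "${title%@*}@"   # shortest suffix removed: cuts at the LAST '@'
                      # -> '## 2024-12-19, Version 23.5.0 (Current), @releaser, prepared by @'

echo "${title%%@*}@"  # longest suffix removed: cuts at the FIRST '@'
                      # -> '## 2024-12-19, Version 23.5.0 (Current), @'
```
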
diff --git a/Makefile b/Makefile index 4fa275a1b338a5..b043fdb2bb5389 100644 --- a/Makefile +++ b/Makefile @@ -294,6 +294,7 @@ coverage-report-js: ## Report JavaScript coverage results. cctest: all ## Run the C++ tests using the built `cctest` executable. @out/$(BUILDTYPE)/$@ --gtest_filter=$(GTEST_FILTER) $(NODE) ./test/embedding/test-embedding.js + $(NODE) ./test/sqlite/test-sqlite-extensions.mjs .PHONY: list-gtests list-gtests: ## List all available C++ gtests. @@ -574,6 +575,7 @@ test-ci: | clear-stalled bench-addons-build build-addons build-js-native-api-tes --mode=$(BUILDTYPE_LOWER) --flaky-tests=$(FLAKY_TESTS) \ $(TEST_CI_ARGS) $(CI_JS_SUITES) $(CI_NATIVE_SUITES) $(CI_DOC) $(NODE) ./test/embedding/test-embedding.js + $(NODE) ./test/sqlite/test-sqlite-extensions.mjs $(info Clean up any leftover processes, error if found.) ps awwx | grep Release/node | grep -v grep | cat @PS_OUT=`ps awwx | grep Release/node | grep -v grep | awk '{print $$1}'`; \ @@ -932,6 +934,9 @@ else ifeq ($(findstring riscv64,$(UNAME_M)),riscv64) DESTCPU ?= riscv64 else +ifeq ($(findstring loongarch64,$(UNAME_M)),loongarch64) +DESTCPU ?= loong64 +else DESTCPU ?= x86 endif endif @@ -945,6 +950,7 @@ endif endif endif endif +endif ifeq ($(DESTCPU),x64) ARCH=x64 else @@ -969,6 +975,9 @@ else ifeq ($(DESTCPU),riscv64) ARCH=riscv64 else +ifeq ($(DESTCPU),loong64) +ARCH=loong64 +else ARCH=x86 endif endif @@ -978,6 +987,7 @@ endif endif endif endif +endif # node and v8 use different arch names (e.g. node 'x86' vs v8 'ia32'). # pass the proper v8 arch name to $V8_ARCH based on user-specified $DESTCPU. @@ -1432,6 +1442,7 @@ LINT_CPP_FILES = $(filter-out $(LINT_CPP_EXCLUDE), $(wildcard \ test/cctest/*.h \ test/embedding/*.cc \ test/embedding/*.h \ + test/sqlite/*.c \ test/fixtures/*.c \ test/js-native-api/*/*.cc \ test/node-api/*/*.cc \ diff --git a/SECURITY.md b/SECURITY.md index fc95e1941698e6..19e876939f0f55 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -218,7 +218,7 @@ as any other stable feature. Security notifications will be distributed via the following methods. 
* -* +* ## Comments on this policy diff --git a/benchmark/fs/readfile-permission-enabled.js b/benchmark/fs/readfile-permission-enabled.js index 46f20be6a0b06e..c688e9eecb0e00 100644 --- a/benchmark/fs/readfile-permission-enabled.js +++ b/benchmark/fs/readfile-permission-enabled.js @@ -17,7 +17,7 @@ const bench = common.createBenchmark(main, { concurrent: [1, 10], }, { flags: [ - '--experimental-permission', + '--permission', '--allow-fs-read=*', '--allow-fs-write=*', '--allow-child-process', diff --git a/benchmark/permission/permission-processhas-fs-read.js b/benchmark/permission/permission-processhas-fs-read.js index c2c90636aa1f62..ea06aed4bc27a9 100644 --- a/benchmark/permission/permission-processhas-fs-read.js +++ b/benchmark/permission/permission-processhas-fs-read.js @@ -11,7 +11,7 @@ const rootPath = path.resolve(__dirname, '../../..'); const options = { flags: [ - '--experimental-permission', + '--permission', `--allow-fs-read=${rootPath}`, '--allow-child-process', '--no-warnings', diff --git a/benchmark/permission/permission-startup.js b/benchmark/permission/permission-startup.js index 08326909aa4e41..6a197cdff56111 100644 --- a/benchmark/permission/permission-startup.js +++ b/benchmark/permission/permission-startup.js @@ -48,7 +48,7 @@ function spawnProcess(script, bench, state) { function main({ count, script, nFiles, prefixPath }) { script = path.resolve(__dirname, '../../', `${script}.js`); const optionsWithScript = [ - '--experimental-permission', + '--permission', `--allow-fs-read=${script}`, ...mockFiles(nFiles, prefixPath).map((file) => '--allow-fs-read=' + file), script, diff --git a/common.gypi b/common.gypi index 23196aae451f6a..a6a79adcc2fb4f 100644 --- a/common.gypi +++ b/common.gypi @@ -36,7 +36,7 @@ # Reset this number to 0 on major V8 upgrades. # Increment by one for each non-official patch applied to deps/v8. 
- 'v8_embedder_string': '-node.11', + 'v8_embedder_string': '-node.12', ##### V8 defaults for Node.js ##### diff --git a/configure.py b/configure.py index e2b12d8823bb64..c361676637c1cb 100755 --- a/configure.py +++ b/configure.py @@ -1685,6 +1685,9 @@ def configure_v8(o, configs): raise Exception( 'Only one of the --v8-enable-object-print or --v8-disable-object-print options ' 'can be specified at a time.') + if sys.platform != 'darwin': + if o['variables']['v8_enable_webassembly'] and o['variables']['target_arch'] == 'x64': + o['variables']['v8_enable_wasm_simd256_revec'] = 1 def configure_openssl(o): variables = o['variables'] diff --git a/deps/cares/CMakeLists.txt b/deps/cares/CMakeLists.txt index f6560d56b08ddd..139defd8ffd159 100644 --- a/deps/cares/CMakeLists.txt +++ b/deps/cares/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (C) The c-ares project and its contributors # SPDX-License-Identifier: MIT -CMAKE_MINIMUM_REQUIRED (VERSION 3.5.0) +CMAKE_MINIMUM_REQUIRED (VERSION 3.5.0...3.10.0) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") @@ -12,7 +12,7 @@ INCLUDE (CheckCSourceCompiles) INCLUDE (CheckStructHasMember) INCLUDE (CheckLibraryExists) -PROJECT (c-ares LANGUAGES C VERSION "1.34.3" ) +PROJECT (c-ares LANGUAGES C VERSION "1.34.4" ) # Set this version before release SET (CARES_VERSION "${PROJECT_VERSION}") @@ -30,7 +30,7 @@ INCLUDE (GNUInstallDirs) # include this *AFTER* PROJECT(), otherwise paths are w # For example, a version of 4:0:2 would generate output such as: # libname.so -> libname.so.2 # libname.so.2 -> libname.so.2.2.0 -SET (CARES_LIB_VERSIONINFO "21:2:19") +SET (CARES_LIB_VERSIONINFO "21:3:19") OPTION (CARES_STATIC "Build as a static library" OFF) @@ -271,6 +271,8 @@ ELSEIF (CMAKE_SYSTEM_NAME STREQUAL "AIX") LIST (APPEND SYSFLAGS -D_ALL_SOURCE -D_XOPEN_SOURCE=700 -D_USE_IRS) ELSEIF (CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") # Don't define _XOPEN_SOURCE on FreeBSD, it actually reduces visibility instead of increasing it +ELSEIF (CMAKE_SYSTEM_NAME STREQUAL "QNX") + LIST (APPEND SYSFLAGS -D_QNX_SOURCE) ELSEIF (WIN32) LIST (APPEND SYSFLAGS -DWIN32_LEAN_AND_MEAN -D_CRT_SECURE_NO_DEPRECATE -D_CRT_NONSTDC_NO_DEPRECATE -D_WIN32_WINNT=0x0602) ENDIF () @@ -406,6 +408,7 @@ ENDIF () CHECK_STRUCT_HAS_MEMBER("struct sockaddr_in6" sin6_scope_id "${CMAKE_EXTRA_INCLUDE_FILES}" HAVE_STRUCT_SOCKADDR_IN6_SIN6_SCOPE_ID LANGUAGE C) +CHECK_SYMBOL_EXISTS (strnlen "${CMAKE_EXTRA_INCLUDE_FILES}" HAVE_STRNLEN) CHECK_SYMBOL_EXISTS (memmem "${CMAKE_EXTRA_INCLUDE_FILES}" HAVE_MEMMEM) CHECK_SYMBOL_EXISTS (closesocket "${CMAKE_EXTRA_INCLUDE_FILES}" HAVE_CLOSESOCKET) CHECK_SYMBOL_EXISTS (CloseSocket "${CMAKE_EXTRA_INCLUDE_FILES}" HAVE_CLOSESOCKET_CAMEL) diff --git a/deps/cares/Makefile.am b/deps/cares/Makefile.am index e99161a45f7883..51b5f6be32be78 100644 --- a/deps/cares/Makefile.am +++ b/deps/cares/Makefile.am @@ -3,17 +3,24 @@ # Copyright (C) the Massachusetts Institute of Technology. # Copyright (C) Daniel Stenberg # -# Permission to use, copy, modify, and distribute this -# software and its documentation for any purpose and without -# fee is hereby granted, provided that the above copyright -# notice appear in all copies and that both that copyright -# notice and this permission notice appear in supporting -# documentation, and that the name of M.I.T. not be used in -# advertising or publicity pertaining to distribution of the -# software without specific, written prior permission. -# M.I.T. makes no representations about the suitability of -# this software for any purpose. 
It is provided "as is" -# without express or implied warranty. +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. # # SPDX-License-Identifier: MIT # diff --git a/deps/cares/Makefile.in b/deps/cares/Makefile.in index ba78cb77cbe335..2342125d136526 100644 --- a/deps/cares/Makefile.in +++ b/deps/cares/Makefile.in @@ -19,17 +19,24 @@ # Copyright (C) the Massachusetts Institute of Technology. # Copyright (C) Daniel Stenberg # -# Permission to use, copy, modify, and distribute this -# software and its documentation for any purpose and without -# fee is hereby granted, provided that the above copyright -# notice appear in all copies and that both that copyright -# notice and this permission notice appear in supporting -# documentation, and that the name of M.I.T. not be used in -# advertising or publicity pertaining to distribution of the -# software without specific, written prior permission. -# M.I.T. makes no representations about the suitability of -# this software for any purpose. It is provided "as is" -# without express or implied warranty. +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. # # SPDX-License-Identifier: MIT # @@ -111,7 +118,9 @@ build_triplet = @build@ host_triplet = @host@ subdir = . 
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ares_check_user_namespace.m4 \ + $(top_srcdir)/m4/ares_check_uts_namespace.m4 \ + $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_ac_print_to_file.m4 \ $(top_srcdir)/m4/ax_add_am_macro_static.m4 \ $(top_srcdir)/m4/ax_am_macros_static.m4 \ @@ -121,8 +130,6 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_check_compile_flag.m4 \ $(top_srcdir)/m4/ax_check_gnu_make.m4 \ $(top_srcdir)/m4/ax_check_link_flag.m4 \ - $(top_srcdir)/m4/ax_check_user_namespace.m4 \ - $(top_srcdir)/m4/ax_check_uts_namespace.m4 \ $(top_srcdir)/m4/ax_code_coverage.m4 \ $(top_srcdir)/m4/ax_compiler_vendor.m4 \ $(top_srcdir)/m4/ax_cxx_compile_stdcxx.m4 \ diff --git a/deps/cares/Makefile.msvc b/deps/cares/Makefile.msvc index 8395d1a7d67728..3266db415e09fe 100644 --- a/deps/cares/Makefile.msvc +++ b/deps/cares/Makefile.msvc @@ -1,17 +1,24 @@ # Copyright (C) 2009-2013 by Daniel Stenberg # -# Permission to use, copy, modify, and distribute this -# software and its documentation for any purpose and without -# fee is hereby granted, provided that the above copyright -# notice appear in all copies and that both that copyright -# notice and this permission notice appear in supporting -# documentation, and that the name of M.I.T. not be used in -# advertising or publicity pertaining to distribution of the -# software without specific, written prior permission. -# M.I.T. makes no representations about the suitability of -# this software for any purpose. It is provided "as is" -# without express or implied warranty. +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. # # SPDX-License-Identifier: MIT diff --git a/deps/cares/RELEASE-NOTES.md b/deps/cares/RELEASE-NOTES.md index f9d58d278432f1..19a204b3ea96bd 100644 --- a/deps/cares/RELEASE-NOTES.md +++ b/deps/cares/RELEASE-NOTES.md @@ -1,97 +1,25 @@ -## c-ares version 1.34.3 - November 9 2024 +## c-ares version 1.34.4 - December 14 2024 This is a bugfix release. Changes: -* Build the release package in an automated way so we can provide - provenance as per [SLSA3](https://slsa.dev/). - [PR #906](https://github.com/c-ares/c-ares/pull/906) +* QNX Port: Port to QNX 8, add primary config reading support, add CI build. 
[PR #934](https://github.com/c-ares/c-ares/pull/934), [PR #937](https://github.com/c-ares/c-ares/pull/937), [PR #938](https://github.com/c-ares/c-ares/pull/938) Bugfixes: -* Some upstream servers are non-compliant with EDNS options, resend queries - without EDNS. [Issue #911](https://github.com/c-ares/c-ares/issues/911) -* Android: <=7 needs sys/system_properties.h - [a70637c](https://github.com/c-ares/c-ares/commit/a70637c) -* Android: CMake needs `-D_GNU_SOURCE` and others. - [PR #915](https://github.com/c-ares/c-ares/pull/914) -* TSAN warns on missing lock, but lock isn't actually necessary. - [PR #915](https://github.com/c-ares/c-ares/pull/915) -* `ares_getaddrinfo()` for `AF_UNSPEC` should retry IPv4 if only IPv6 is - received. [765d558](https://github.com/c-ares/c-ares/commit/765d558) -* `ares_send()` shouldn't return `ARES_EBADRESP`, its `ARES_EBADQUERY`. - [91519e7](https://github.com/c-ares/c-ares/commit/91519e7) -* Fix typos in man pages. [PR #905](https://github.com/c-ares/c-ares/pull/905) +* Empty TXT records were not being preserved. [PR #922](https://github.com/c-ares/c-ares/pull/922) +* docs: update deprecation notices for `ares_create_query()` and `ares_mkquery()`. [PR #910](https://github.com/c-ares/c-ares/pull/910) +* license: some files weren't properly updated. [PR #920](https://github.com/c-ares/c-ares/pull/920) +* Fix bind local device regression from 1.34.0. [PR #929](https://github.com/c-ares/c-ares/pull/929), [PR #931](https://github.com/c-ares/c-ares/pull/931), [PR #935](https://github.com/c-ares/c-ares/pull/935) +* CMake: set policy version to prevent deprecation warnings. [PR #932](https://github.com/c-ares/c-ares/pull/932) +* CMake: shared and static library names should be the same on unix platforms like autotools uses. [PR #933](https://github.com/c-ares/c-ares/pull/933) +* Update to latest autoconf archive macros for enhanced system compatibility. [PR #936](https://github.com/c-ares/c-ares/pull/936) Thanks go to these friendly people for their efforts and contributions for this release: * Brad House (@bradh352) -* Jiwoo Park (@jimmy-park) - - -## c-ares version 1.34.2 - October 15 2024 - -This release contains a fix for downstream packages detecting the c-ares -version based on the contents of the header file rather than the -distributed pkgconf or cmake files. - -## c-ares version 1.34.1 - October 9 2024 - -This release fixes a packaging issue. - - -## c-ares version 1.34.0 - October 9 2024 - -This is a feature and bugfix release. - -Features: -* adig: read arguments from adigrc. - [PR #856](https://github.com/c-ares/c-ares/pull/856) -* Add new pending write callback optimization via `ares_set_pending_write_cb`. - [PR #857](https://github.com/c-ares/c-ares/pull/857) -* New function `ares_process_fds()`. - [PR #875](https://github.com/c-ares/c-ares/pull/875) -* Failed servers should be probed rather than redirecting queries which could - cause unexpected latency. - [PR #877](https://github.com/c-ares/c-ares/pull/877) -* adig: rework command line arguments to mimic dig from bind. - [PR #890](https://github.com/c-ares/c-ares/pull/890) -* Add new method for overriding network functions - `ares_set_socket_function_ex()` to properly support all new functionality. - [PR #894](https://github.com/c-ares/c-ares/pull/894) -* Fix regression with custom socket callbacks due to DNS cookie support. 
- [PR #895](https://github.com/c-ares/c-ares/pull/895) -* ares_socket: set IP_BIND_ADDRESS_NO_PORT on ares_set_local_ip* tcp sockets - [PR #887](https://github.com/c-ares/c-ares/pull/887) -* URI parser/writer for ares_set_servers_csv()/ares_get_servers_csv(). - [PR #882](https://github.com/c-ares/c-ares/pull/882) - -Changes: -* Connection handling modularization. - [PR #857](https://github.com/c-ares/c-ares/pull/857), - [PR #876](https://github.com/c-ares/c-ares/pull/876) -* Expose library/utility functions to tools. - [PR #860](https://github.com/c-ares/c-ares/pull/860) -* Remove `ares__` prefix, just use `ares_` for internal functions. - [PR #872](https://github.com/c-ares/c-ares/pull/872) - - -Bugfixes: -* fix: potential WIN32_LEAN_AND_MEAN redefinition. - [PR #869](https://github.com/c-ares/c-ares/pull/869) -* Fix googletest v1.15 compatibility. - [PR #874](https://github.com/c-ares/c-ares/pull/874) -* Fix pkgconfig thread dependencies. - [PR #884](https://github.com/c-ares/c-ares/pull/884) - - -Thanks go to these friendly people for their efforts and contributions for this -release: - -* Brad House (@bradh352) -* Cristian Rodríguez (@crrodriguez) -* Georg (@tacerus) -* @lifenjoiner -* Shelley Vohr (@codebytere) -* 前进,前进,进 (@leleliu008) - +* Daniel Stenberg (@bagder) +* Gregor Jasny (@gjasny) +* @marcovsz +* Nikolaos Chatzikonstantinou (@createyourpersonalaccount) +* @vlasovsoft1979 diff --git a/deps/cares/aclocal.m4 b/deps/cares/aclocal.m4 index ce7ad1c8a86a43..04f8786c9c0c89 100644 --- a/deps/cares/aclocal.m4 +++ b/deps/cares/aclocal.m4 @@ -1221,6 +1221,8 @@ AC_SUBST([am__tar]) AC_SUBST([am__untar]) ]) # _AM_PROG_TAR +m4_include([m4/ares_check_user_namespace.m4]) +m4_include([m4/ares_check_uts_namespace.m4]) m4_include([m4/ax_ac_append_to_file.m4]) m4_include([m4/ax_ac_print_to_file.m4]) m4_include([m4/ax_add_am_macro_static.m4]) @@ -1231,8 +1233,6 @@ m4_include([m4/ax_append_link_flags.m4]) m4_include([m4/ax_check_compile_flag.m4]) m4_include([m4/ax_check_gnu_make.m4]) m4_include([m4/ax_check_link_flag.m4]) -m4_include([m4/ax_check_user_namespace.m4]) -m4_include([m4/ax_check_uts_namespace.m4]) m4_include([m4/ax_code_coverage.m4]) m4_include([m4/ax_compiler_vendor.m4]) m4_include([m4/ax_cxx_compile_stdcxx.m4]) diff --git a/deps/cares/aminclude_static.am b/deps/cares/aminclude_static.am index b83549f81adde4..ec7a86a43e6829 100644 --- a/deps/cares/aminclude_static.am +++ b/deps/cares/aminclude_static.am @@ -1,6 +1,6 @@ # aminclude_static.am generated automatically by Autoconf -# from AX_AM_MACROS_STATIC on Sat Nov 9 17:40:37 UTC 2024 +# from AX_AM_MACROS_STATIC on Sat Dec 14 15:15:44 UTC 2024 # Code coverage @@ -66,7 +66,7 @@ code_coverage_v_lcov_cap_ = $(code_coverage_v_lcov_cap_$(AM_DEFAULT_VERBOSITY)) code_coverage_v_lcov_cap_0 = @echo " LCOV --capture" $(CODE_COVERAGE_OUTPUT_FILE); code_coverage_v_lcov_ign = $(code_coverage_v_lcov_ign_$(V)) code_coverage_v_lcov_ign_ = $(code_coverage_v_lcov_ign_$(AM_DEFAULT_VERBOSITY)) -code_coverage_v_lcov_ign_0 = @echo " LCOV --remove /tmp/*" $(CODE_COVERAGE_IGNORE_PATTERN); +code_coverage_v_lcov_ign_0 = @echo " LCOV --remove" "$(CODE_COVERAGE_OUTPUT_FILE).tmp" $(CODE_COVERAGE_IGNORE_PATTERN); code_coverage_v_genhtml = $(code_coverage_v_genhtml_$(V)) code_coverage_v_genhtml_ = $(code_coverage_v_genhtml_$(AM_DEFAULT_VERBOSITY)) code_coverage_v_genhtml_0 = @echo " GEN " "$(CODE_COVERAGE_OUTPUT_DIRECTORY)"; @@ -85,7 +85,7 @@ check-code-coverage: # Capture code coverage data code-coverage-capture: code-coverage-capture-hook 
$(code_coverage_v_lcov_cap)$(LCOV) $(code_coverage_quiet) $(addprefix --directory ,$(CODE_COVERAGE_DIRECTORY)) --capture --output-file "$(CODE_COVERAGE_OUTPUT_FILE).tmp" --test-name "$(call code_coverage_sanitize,$(PACKAGE_NAME)-$(PACKAGE_VERSION))" --no-checksum --compat-libtool $(CODE_COVERAGE_LCOV_SHOPTS) $(CODE_COVERAGE_LCOV_OPTIONS) - $(code_coverage_v_lcov_ign)$(LCOV) $(code_coverage_quiet) $(addprefix --directory ,$(CODE_COVERAGE_DIRECTORY)) --remove "$(CODE_COVERAGE_OUTPUT_FILE).tmp" "/tmp/*" $(CODE_COVERAGE_IGNORE_PATTERN) --output-file "$(CODE_COVERAGE_OUTPUT_FILE)" $(CODE_COVERAGE_LCOV_SHOPTS) $(CODE_COVERAGE_LCOV_RMOPTS) + $(code_coverage_v_lcov_ign)$(LCOV) $(code_coverage_quiet) $(addprefix --directory ,$(CODE_COVERAGE_DIRECTORY)) --remove "$(CODE_COVERAGE_OUTPUT_FILE).tmp" $(CODE_COVERAGE_IGNORE_PATTERN) --output-file "$(CODE_COVERAGE_OUTPUT_FILE)" $(CODE_COVERAGE_LCOV_SHOPTS) $(CODE_COVERAGE_LCOV_RMOPTS) -@rm -f "$(CODE_COVERAGE_OUTPUT_FILE).tmp" $(code_coverage_v_genhtml)LANG=C $(GENHTML) $(code_coverage_quiet) $(addprefix --prefix ,$(CODE_COVERAGE_DIRECTORY)) --output-directory "$(CODE_COVERAGE_OUTPUT_DIRECTORY)" --title "$(PACKAGE_NAME)-$(PACKAGE_VERSION) Code Coverage" --legend --show-details "$(CODE_COVERAGE_OUTPUT_FILE)" $(CODE_COVERAGE_GENHTML_OPTIONS) @echo "file://$(abs_builddir)/$(CODE_COVERAGE_OUTPUT_DIRECTORY)/index.html" diff --git a/deps/cares/configure b/deps/cares/configure index 76b0ddf39c136a..d02f117d2f0b64 100755 --- a/deps/cares/configure +++ b/deps/cares/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for c-ares 1.34.3. +# Generated by GNU Autoconf 2.71 for c-ares 1.34.4. # # Report bugs to . # @@ -621,8 +621,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='c-ares' PACKAGE_TARNAME='c-ares' -PACKAGE_VERSION='1.34.3' -PACKAGE_STRING='c-ares 1.34.3' +PACKAGE_VERSION='1.34.4' +PACKAGE_STRING='c-ares 1.34.4' PACKAGE_BUGREPORT='c-ares mailing list: http://lists.haxx.se/listinfo/c-ares' PACKAGE_URL='' @@ -853,6 +853,7 @@ with_gcov enable_code_coverage enable_largefile enable_libgcc +enable_tests_crossbuild ' ac_precious_vars='build_alias host_alias @@ -1423,7 +1424,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures c-ares 1.34.3 to adapt to many kinds of systems. +\`configure' configures c-ares 1.34.4 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1494,7 +1495,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of c-ares 1.34.3:";; + short | recursive ) echo "Configuration of c-ares 1.34.4:";; esac cat <<\_ACEOF @@ -1525,6 +1526,8 @@ Optional Features: --enable-code-coverage Whether to enable code coverage support --disable-largefile omit support for large files --enable-libgcc use libgcc when linking + --enable-tests-crossbuild + Enable test building even when cross building Optional Packages: --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] @@ -1634,7 +1637,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -c-ares configure 1.34.3 +c-ares configure 1.34.4 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. 
@@ -2258,7 +2261,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by c-ares $as_me 1.34.3, which was +It was created by c-ares $as_me 1.34.4, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3232,7 +3235,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu -CARES_VERSION_INFO="21:2:19" +CARES_VERSION_INFO="21:3:19" @@ -4891,7 +4894,17 @@ else $as_nop // MSVC always sets __cplusplus to 199711L in older versions; newer versions // only set it correctly if /Zc:__cplusplus is specified as well as a // /std:c++NN switch: +// // https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ +// +// The value __cplusplus ought to have is available in _MSVC_LANG since +// Visual Studio 2015 Update 3: +// +// https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros +// +// This was also the first MSVC version to support C++14 so we can't use the +// value of either __cplusplus or _MSVC_LANG to quickly rule out MSVC having +// C++11 or C++14 support, but we can check _MSVC_LANG for C++17 and later. #elif __cplusplus < 201103L && !defined _MSC_VER #error "This is not a C++11 compiler" @@ -5914,7 +5927,7 @@ fi # Define the identity of the package. PACKAGE='c-ares' - VERSION='1.34.3' + VERSION='1.34.4' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -19525,10 +19538,52 @@ then : fi + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for _gcov_init in -lgcov" >&5 +printf %s "checking for _gcov_init in -lgcov... " >&6; } +if test ${ac_cv_lib_gcov__gcov_init+y} +then : + printf %s "(cached) " >&6 +else $as_nop + ac_check_lib_save_LIBS=$LIBS +LIBS="-lgcov $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +char _gcov_init (); +int +main (void) +{ +return _gcov_init (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO" +then : + ac_cv_lib_gcov__gcov_init=yes +else $as_nop + ac_cv_lib_gcov__gcov_init=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_gcov__gcov_init" >&5 +printf "%s\n" "$ac_cv_lib_gcov__gcov_init" >&6; } +if test "x$ac_cv_lib_gcov__gcov_init" = xyes +then : + CODE_COVERAGE_LIBS="-lgcov" +else $as_nop + CODE_COVERAGE_LIBS="" +fi + + CODE_COVERAGE_CPPFLAGS="-DNDEBUG" CODE_COVERAGE_CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" CODE_COVERAGE_CXXFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" - CODE_COVERAGE_LIBS="-lgcov" @@ -19805,27 +19860,37 @@ eval ac_res=\$$as_CACHEVAR printf "%s\n" "$ac_res" >&6; } if eval test \"x\$"$as_CACHEVAR"\" = x"yes" then : - if test ${LDFLAGS+y} + +if test ${LDFLAGS+y} then : - case " $LDFLAGS " in - *" $flag "*) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : LDFLAGS already contains \$flag"; } >&5 + + case " $LDFLAGS " in #( + *" $flag "*) : + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : LDFLAGS already contains \$flag"; } >&5 (: LDFLAGS already contains $flag) 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 - test $ac_status = 0; } - ;; - *) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : LDFLAGS=\"\$LDFLAGS \$flag\""; } >&5 - (: LDFLAGS="$LDFLAGS $flag") 2>&5 + test $ac_status = 0; } ;; #( + *) : + + as_fn_append LDFLAGS " $flag" + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : LDFLAGS=\"\$LDFLAGS\""; } >&5 + (: LDFLAGS="$LDFLAGS") 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } - LDFLAGS="$LDFLAGS $flag" - ;; - esac + ;; +esac + else $as_nop - LDFLAGS="$flag" + + LDFLAGS=$flag + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : LDFLAGS=\"\$LDFLAGS\""; } >&5 + (: LDFLAGS="$LDFLAGS") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + fi else $as_nop @@ -19870,27 +19935,37 @@ if test "x$enable_shared" = "xno" -a "x$enable_static" = "xyes" ; then { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we need CARES_STATICLIB definition" >&5 printf %s "checking whether we need CARES_STATICLIB definition... " >&6; } if test "$ac_cv_native_windows" = "yes" ; then - if test ${AM_CPPFLAGS+y} + +if test ${AM_CPPFLAGS+y} then : - case " $AM_CPPFLAGS " in - *" -DCARES_STATICLIB "*) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CPPFLAGS already contains -DCARES_STATICLIB"; } >&5 + + case " $AM_CPPFLAGS " in #( + *" -DCARES_STATICLIB "*) : + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CPPFLAGS already contains -DCARES_STATICLIB"; } >&5 (: AM_CPPFLAGS already contains -DCARES_STATICLIB) 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } - ;; - *) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CPPFLAGS=\"\$AM_CPPFLAGS -DCARES_STATICLIB\""; } >&5 - (: AM_CPPFLAGS="$AM_CPPFLAGS -DCARES_STATICLIB") 2>&5 + test $ac_status = 0; } ;; #( + *) : + + as_fn_append AM_CPPFLAGS " -DCARES_STATICLIB" + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CPPFLAGS=\"\$AM_CPPFLAGS\""; } >&5 + (: AM_CPPFLAGS="$AM_CPPFLAGS") 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } - AM_CPPFLAGS="$AM_CPPFLAGS -DCARES_STATICLIB" - ;; - esac + ;; +esac + else $as_nop - AM_CPPFLAGS="-DCARES_STATICLIB" + + AM_CPPFLAGS=-DCARES_STATICLIB + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CPPFLAGS=\"\$AM_CPPFLAGS\""; } >&5 + (: AM_CPPFLAGS="$AM_CPPFLAGS") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + fi PKGCONFIG_CFLAGS="-DCARES_STATICLIB" @@ -19910,57 +19985,24 @@ if test "$symbol_hiding" != "no" ; then else case "$ax_cv_c_compiler_vendor" in clang|gnu|intel) - { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts " >&5 -printf %s "checking whether C compiler accepts ... " >&6; } -if test ${ax_cv_check_cflags__+y} -then : - printf %s "(cached) " >&6 -else $as_nop - ax_check_save_flags=$CFLAGS - CFLAGS="$CFLAGS " - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. 
*/ - -int -main (void) -{ - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO" -then : - ax_cv_check_cflags__=yes -else $as_nop - ax_cv_check_cflags__=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext - CFLAGS=$ax_check_save_flags -fi -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ax_cv_check_cflags__" >&5 -printf "%s\n" "$ax_cv_check_cflags__" >&6; } -if test x"$ax_cv_check_cflags__" = xyes -then : - : -else $as_nop - : -fi for flag in -fvisibility=hidden; do as_CACHEVAR=`printf "%s\n" "ax_cv_check_cflags__$flag" | $as_tr_sh` -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts $flag" >&5 -printf %s "checking whether C compiler accepts $flag... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the C compiler accepts $flag" >&5 +printf %s "checking whether the C compiler accepts $flag... " >&6; } if eval test \${$as_CACHEVAR+y} then : printf %s "(cached) " >&6 else $as_nop ax_check_save_flags=$CFLAGS - CFLAGS="$CFLAGS $flag" + if test x"$GCC" = xyes ; then + add_gnu_werror="-Werror" + fi + CFLAGS="$CFLAGS $flag $add_gnu_werror" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -19984,29 +20026,39 @@ fi eval ac_res=\$$as_CACHEVAR { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 printf "%s\n" "$ac_res" >&6; } -if test x"`eval 'as_val=${'$as_CACHEVAR'};printf "%s\n" "$as_val"'`" = xyes +if eval test \"x\$"$as_CACHEVAR"\" = x"yes" then : - if test ${CARES_SYMBOL_HIDING_CFLAG+y} + +if test ${CARES_SYMBOL_HIDING_CFLAG+y} then : - case " $CARES_SYMBOL_HIDING_CFLAG " in - *" $flag "*) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : CARES_SYMBOL_HIDING_CFLAG already contains \$flag"; } >&5 + + case " $CARES_SYMBOL_HIDING_CFLAG " in #( + *" $flag "*) : + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : CARES_SYMBOL_HIDING_CFLAG already contains \$flag"; } >&5 (: CARES_SYMBOL_HIDING_CFLAG already contains $flag) 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } - ;; - *) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : CARES_SYMBOL_HIDING_CFLAG=\"\$CARES_SYMBOL_HIDING_CFLAG \$flag\""; } >&5 - (: CARES_SYMBOL_HIDING_CFLAG="$CARES_SYMBOL_HIDING_CFLAG $flag") 2>&5 + test $ac_status = 0; } ;; #( + *) : + + as_fn_append CARES_SYMBOL_HIDING_CFLAG " $flag" + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : CARES_SYMBOL_HIDING_CFLAG=\"\$CARES_SYMBOL_HIDING_CFLAG\""; } >&5 + (: CARES_SYMBOL_HIDING_CFLAG="$CARES_SYMBOL_HIDING_CFLAG") 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } - CARES_SYMBOL_HIDING_CFLAG="$CARES_SYMBOL_HIDING_CFLAG $flag" - ;; - esac + ;; +esac + else $as_nop - CARES_SYMBOL_HIDING_CFLAG="$flag" + + CARES_SYMBOL_HIDING_CFLAG=$flag + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : CARES_SYMBOL_HIDING_CFLAG=\"\$CARES_SYMBOL_HIDING_CFLAG\""; } >&5 + (: CARES_SYMBOL_HIDING_CFLAG="$CARES_SYMBOL_HIDING_CFLAG") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + fi else $as_nop @@ -20022,17 +20074,22 @@ done sun) + + for flag in -xldscope=hidden; do as_CACHEVAR=`printf "%s\n" "ax_cv_check_cflags__$flag" | $as_tr_sh` -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts $flag" >&5 -printf %s "checking whether C compiler accepts $flag... 
" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the C compiler accepts $flag" >&5 +printf %s "checking whether the C compiler accepts $flag... " >&6; } if eval test \${$as_CACHEVAR+y} then : printf %s "(cached) " >&6 else $as_nop ax_check_save_flags=$CFLAGS - CFLAGS="$CFLAGS $flag" + if test x"$GCC" = xyes ; then + add_gnu_werror="-Werror" + fi + CFLAGS="$CFLAGS $flag $add_gnu_werror" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -20056,29 +20113,39 @@ fi eval ac_res=\$$as_CACHEVAR { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 printf "%s\n" "$ac_res" >&6; } -if test x"`eval 'as_val=${'$as_CACHEVAR'};printf "%s\n" "$as_val"'`" = xyes +if eval test \"x\$"$as_CACHEVAR"\" = x"yes" then : - if test ${CARES_SYMBOL_HIDING_CFLAG+y} + +if test ${CARES_SYMBOL_HIDING_CFLAG+y} then : - case " $CARES_SYMBOL_HIDING_CFLAG " in - *" $flag "*) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : CARES_SYMBOL_HIDING_CFLAG already contains \$flag"; } >&5 + + case " $CARES_SYMBOL_HIDING_CFLAG " in #( + *" $flag "*) : + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : CARES_SYMBOL_HIDING_CFLAG already contains \$flag"; } >&5 (: CARES_SYMBOL_HIDING_CFLAG already contains $flag) 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } - ;; - *) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : CARES_SYMBOL_HIDING_CFLAG=\"\$CARES_SYMBOL_HIDING_CFLAG \$flag\""; } >&5 - (: CARES_SYMBOL_HIDING_CFLAG="$CARES_SYMBOL_HIDING_CFLAG $flag") 2>&5 + test $ac_status = 0; } ;; #( + *) : + + as_fn_append CARES_SYMBOL_HIDING_CFLAG " $flag" + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : CARES_SYMBOL_HIDING_CFLAG=\"\$CARES_SYMBOL_HIDING_CFLAG\""; } >&5 + (: CARES_SYMBOL_HIDING_CFLAG="$CARES_SYMBOL_HIDING_CFLAG") 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } - CARES_SYMBOL_HIDING_CFLAG="$CARES_SYMBOL_HIDING_CFLAG $flag" - ;; - esac + ;; +esac + else $as_nop - CARES_SYMBOL_HIDING_CFLAG="$flag" + + CARES_SYMBOL_HIDING_CFLAG=$flag + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : CARES_SYMBOL_HIDING_CFLAG=\"\$CARES_SYMBOL_HIDING_CFLAG\""; } >&5 + (: CARES_SYMBOL_HIDING_CFLAG="$CARES_SYMBOL_HIDING_CFLAG") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + fi else $as_nop @@ -20120,17 +20187,22 @@ fi if test "$enable_warnings" = "yes"; then + + for flag in -Wall -Wextra -Waggregate-return -Wcast-align -Wcast-qual -Wconversion -Wdeclaration-after-statement -Wdouble-promotion -Wfloat-equal -Wformat-security -Winit-self -Wjump-misses-init -Wlogical-op -Wmissing-braces -Wmissing-declarations -Wmissing-format-attribute -Wmissing-include-dirs -Wmissing-prototypes -Wnested-externs -Wno-coverage-mismatch -Wold-style-definition -Wpacked -Wpedantic -Wpointer-arith -Wredundant-decls -Wshadow -Wsign-conversion -Wstrict-overflow -Wstrict-prototypes -Wtrampolines -Wundef -Wunreachable-code -Wunused -Wvariadic-macros -Wvla -Wwrite-strings -Werror=implicit-int -Werror=implicit-function-declaration -Werror=partial-availability -Wno-long-long ; do as_CACHEVAR=`printf "%s\n" "ax_cv_check_cflags_-Werror_$flag" | $as_tr_sh` -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts $flag" >&5 -printf %s "checking whether C compiler accepts $flag... 
" >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the C compiler accepts $flag" >&5 +printf %s "checking whether the C compiler accepts $flag... " >&6; } if eval test \${$as_CACHEVAR+y} then : printf %s "(cached) " >&6 else $as_nop ax_check_save_flags=$CFLAGS - CFLAGS="$CFLAGS -Werror $flag" + if test x"$GCC" = xyes ; then + add_gnu_werror="-Werror" + fi + CFLAGS="$CFLAGS -Werror $flag $add_gnu_werror" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -20154,29 +20226,39 @@ fi eval ac_res=\$$as_CACHEVAR { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 printf "%s\n" "$ac_res" >&6; } -if test x"`eval 'as_val=${'$as_CACHEVAR'};printf "%s\n" "$as_val"'`" = xyes +if eval test \"x\$"$as_CACHEVAR"\" = x"yes" then : - if test ${AM_CFLAGS+y} + +if test ${AM_CFLAGS+y} then : - case " $AM_CFLAGS " in - *" $flag "*) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS already contains \$flag"; } >&5 + + case " $AM_CFLAGS " in #( + *" $flag "*) : + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS already contains \$flag"; } >&5 (: AM_CFLAGS already contains $flag) 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } - ;; - *) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS \$flag\""; } >&5 - (: AM_CFLAGS="$AM_CFLAGS $flag") 2>&5 + test $ac_status = 0; } ;; #( + *) : + + as_fn_append AM_CFLAGS " $flag" + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS\""; } >&5 + (: AM_CFLAGS="$AM_CFLAGS") 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } - AM_CFLAGS="$AM_CFLAGS $flag" - ;; - esac + ;; +esac + else $as_nop - AM_CFLAGS="$flag" + + AM_CFLAGS=$flag + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS\""; } >&5 + (: AM_CFLAGS="$AM_CFLAGS") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + fi else $as_nop @@ -20185,22 +20267,28 @@ fi done +fi + +case $host_os in + *qnx*|*android*) + - case $host_os in - *android*) for flag in -std=c99; do as_CACHEVAR=`printf "%s\n" "ax_cv_check_cflags_-Werror_$flag" | $as_tr_sh` -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts $flag" >&5 -printf %s "checking whether C compiler accepts $flag... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the C compiler accepts $flag" >&5 +printf %s "checking whether the C compiler accepts $flag... " >&6; } if eval test \${$as_CACHEVAR+y} then : printf %s "(cached) " >&6 else $as_nop ax_check_save_flags=$CFLAGS - CFLAGS="$CFLAGS -Werror $flag" + if test x"$GCC" = xyes ; then + add_gnu_werror="-Werror" + fi + CFLAGS="$CFLAGS -Werror $flag $add_gnu_werror" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ @@ -20224,29 +20312,39 @@ fi eval ac_res=\$$as_CACHEVAR { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 printf "%s\n" "$ac_res" >&6; } -if test x"`eval 'as_val=${'$as_CACHEVAR'};printf "%s\n" "$as_val"'`" = xyes +if eval test \"x\$"$as_CACHEVAR"\" = x"yes" then : - if test ${AM_CFLAGS+y} + +if test ${AM_CFLAGS+y} then : - case " $AM_CFLAGS " in - *" $flag "*) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS already contains \$flag"; } >&5 + + case " $AM_CFLAGS " in #( + *" $flag "*) : + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS already contains \$flag"; } >&5 (: AM_CFLAGS already contains $flag) 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } - ;; - *) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS \$flag\""; } >&5 - (: AM_CFLAGS="$AM_CFLAGS $flag") 2>&5 + test $ac_status = 0; } ;; #( + *) : + + as_fn_append AM_CFLAGS " $flag" + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS\""; } >&5 + (: AM_CFLAGS="$AM_CFLAGS") 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } - AM_CFLAGS="$AM_CFLAGS $flag" - ;; - esac + ;; +esac + else $as_nop - AM_CFLAGS="$flag" + + AM_CFLAGS=$flag + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS\""; } >&5 + (: AM_CFLAGS="$AM_CFLAGS") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + fi else $as_nop @@ -20255,21 +20353,26 @@ fi done - ;; - *) + ;; + *) + + for flag in -std=c90; do as_CACHEVAR=`printf "%s\n" "ax_cv_check_cflags_-Werror_$flag" | $as_tr_sh` -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts $flag" >&5 -printf %s "checking whether C compiler accepts $flag... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the C compiler accepts $flag" >&5 +printf %s "checking whether the C compiler accepts $flag... " >&6; } if eval test \${$as_CACHEVAR+y} then : printf %s "(cached) " >&6 else $as_nop ax_check_save_flags=$CFLAGS - CFLAGS="$CFLAGS -Werror $flag" + if test x"$GCC" = xyes ; then + add_gnu_werror="-Werror" + fi + CFLAGS="$CFLAGS -Werror $flag $add_gnu_werror" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -20293,29 +20396,39 @@ fi eval ac_res=\$$as_CACHEVAR { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 printf "%s\n" "$ac_res" >&6; } -if test x"`eval 'as_val=${'$as_CACHEVAR'};printf "%s\n" "$as_val"'`" = xyes +if eval test \"x\$"$as_CACHEVAR"\" = x"yes" then : - if test ${AM_CFLAGS+y} + +if test ${AM_CFLAGS+y} then : - case " $AM_CFLAGS " in - *" $flag "*) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS already contains \$flag"; } >&5 + + case " $AM_CFLAGS " in #( + *" $flag "*) : + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS already contains \$flag"; } >&5 (: AM_CFLAGS already contains $flag) 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } - ;; - *) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS \$flag\""; } >&5 - (: AM_CFLAGS="$AM_CFLAGS $flag") 2>&5 + test $ac_status = 0; } ;; #( + *) : + + as_fn_append AM_CFLAGS " $flag" + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS\""; } >&5 + (: AM_CFLAGS="$AM_CFLAGS") 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 test $ac_status = 0; } - AM_CFLAGS="$AM_CFLAGS $flag" - ;; - esac + ;; +esac + else $as_nop - AM_CFLAGS="$flag" + + AM_CFLAGS=$flag + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS\""; } >&5 + (: AM_CFLAGS="$AM_CFLAGS") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + fi else $as_nop @@ -20324,24 +20437,115 @@ fi done - ;; - esac + ;; +esac + +case $host_os in + *qnx*) + + + + +for flag in -D_QNX_SOURCE; do + as_CACHEVAR=`printf "%s\n" "ax_cv_check_cflags_-Werror_$flag" | $as_tr_sh` +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the C compiler accepts $flag" >&5 +printf %s "checking whether the C compiler accepts $flag... " >&6; } +if eval test \${$as_CACHEVAR+y} +then : + printf %s "(cached) " >&6 +else $as_nop + + ax_check_save_flags=$CFLAGS + if test x"$GCC" = xyes ; then + add_gnu_werror="-Werror" + fi + CFLAGS="$CFLAGS -Werror $flag $add_gnu_werror" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main (void) +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO" +then : + eval "$as_CACHEVAR=yes" +else $as_nop + eval "$as_CACHEVAR=no" +fi +rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext + CFLAGS=$ax_check_save_flags +fi +eval ac_res=\$$as_CACHEVAR + { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +printf "%s\n" "$ac_res" >&6; } +if eval test \"x\$"$as_CACHEVAR"\" = x"yes" +then : + +if test ${AM_CPPFLAGS+y} +then : + + case " $AM_CPPFLAGS " in #( + *" $flag "*) : + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CPPFLAGS already contains \$flag"; } >&5 + (: AM_CPPFLAGS already contains $flag) 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } ;; #( + *) : + + as_fn_append AM_CPPFLAGS " $flag" + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CPPFLAGS=\"\$AM_CPPFLAGS\""; } >&5 + (: AM_CPPFLAGS="$AM_CPPFLAGS") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + ;; +esac + +else $as_nop + + AM_CPPFLAGS=$flag + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CPPFLAGS=\"\$AM_CPPFLAGS\""; } >&5 + (: AM_CPPFLAGS="$AM_CPPFLAGS") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + +fi + +else $as_nop + : fi +done + + ;; +esac + if test "$ax_cv_c_compiler_vendor" = "intel"; then + + for flag in -shared-intel; do as_CACHEVAR=`printf "%s\n" "ax_cv_check_cflags__$flag" | $as_tr_sh` -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether C compiler accepts $flag" >&5 -printf %s "checking whether C compiler accepts $flag... " >&6; } +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the C compiler accepts $flag" >&5 +printf %s "checking whether the C compiler accepts $flag... " >&6; } if eval test \${$as_CACHEVAR+y} then : printf %s "(cached) " >&6 else $as_nop ax_check_save_flags=$CFLAGS - CFLAGS="$CFLAGS $flag" + if test x"$GCC" = xyes ; then + add_gnu_werror="-Werror" + fi + CFLAGS="$CFLAGS $flag $add_gnu_werror" cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ @@ -20365,29 +20569,39 @@ fi eval ac_res=\$$as_CACHEVAR { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 printf "%s\n" "$ac_res" >&6; } -if test x"`eval 'as_val=${'$as_CACHEVAR'};printf "%s\n" "$as_val"'`" = xyes +if eval test \"x\$"$as_CACHEVAR"\" = x"yes" then : - if test ${AM_CFLAGS+y} + +if test ${AM_CFLAGS+y} then : - case " $AM_CFLAGS " in - *" $flag "*) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS already contains \$flag"; } >&5 + + case " $AM_CFLAGS " in #( + *" $flag "*) : + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS already contains \$flag"; } >&5 (: AM_CFLAGS already contains $flag) 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } - ;; - *) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS \$flag\""; } >&5 - (: AM_CFLAGS="$AM_CFLAGS $flag") 2>&5 + test $ac_status = 0; } ;; #( + *) : + + as_fn_append AM_CFLAGS " $flag" + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS\""; } >&5 + (: AM_CFLAGS="$AM_CFLAGS") 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } - AM_CFLAGS="$AM_CFLAGS $flag" - ;; - esac + ;; +esac + else $as_nop - AM_CFLAGS="$flag" + + AM_CFLAGS=$flag + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : AM_CFLAGS=\"\$AM_CFLAGS\""; } >&5 + (: AM_CFLAGS="$AM_CFLAGS") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + fi else $as_nop @@ -20708,27 +20922,37 @@ eval ac_res=\$$as_CACHEVAR printf "%s\n" "$ac_res" >&6; } if eval test \"x\$"$as_CACHEVAR"\" = x"yes" then : - if test ${XNET_LIBS+y} + +if test ${XNET_LIBS+y} then : - case " $XNET_LIBS " in - *" $flag "*) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : XNET_LIBS already contains \$flag"; } >&5 + + case " $XNET_LIBS " in #( + *" $flag "*) : + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : XNET_LIBS already contains \$flag"; } >&5 (: XNET_LIBS already contains $flag) 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 - test $ac_status = 0; } - ;; - *) - { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : XNET_LIBS=\"\$XNET_LIBS \$flag\""; } >&5 - (: XNET_LIBS="$XNET_LIBS $flag") 2>&5 + test $ac_status = 0; } ;; #( + *) : + + as_fn_append XNET_LIBS " $flag" + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : XNET_LIBS=\"\$XNET_LIBS\""; } >&5 + (: XNET_LIBS="$XNET_LIBS") 2>&5 ac_status=$? printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 test $ac_status = 0; } - XNET_LIBS="$XNET_LIBS $flag" - ;; - esac + ;; +esac + else $as_nop - XNET_LIBS="$flag" + + XNET_LIBS=$flag + { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: : XNET_LIBS=\"\$XNET_LIBS\""; } >&5 + (: XNET_LIBS="$XNET_LIBS") 2>&5 + ac_status=$? + printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + fi else $as_nop @@ -22131,6 +22355,14 @@ fi +ac_fn_check_decl "$LINENO" "strnlen" "ac_cv_have_decl_strnlen" "$cares_all_includes +" "$ac_c_undeclared_builtin_options" "CFLAGS" +if test "x$ac_cv_have_decl_strnlen" = xyes +then : + +printf "%s\n" "#define HAVE_STRNLEN 1" >>confdefs.h + +fi ac_fn_check_decl "$LINENO" "memmem" "ac_cv_have_decl_memmem" "$cares_all_includes " "$ac_c_undeclared_builtin_options" "CFLAGS" if test "x$ac_cv_have_decl_memmem" = xyes @@ -23708,6 +23940,15 @@ printf "%s\n" "$as_me: WARNING: cannot build tests when cross compiling" >&2;} as_fn_error $? 
"*** Tests not supported when cross compiling" "$LINENO" 5 fi fi + +# Check whether --enable-tests-crossbuild was given. +if test ${enable_tests_crossbuild+y} +then : + enableval=$enable_tests_crossbuild; build_tests="$enableval" + +fi + + if test "x$build_tests" != "xno" ; then @@ -23993,7 +24234,7 @@ fi if test "x$have_gmock_v112" = "xyes" ; then { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether user namespaces are supported" >&5 printf %s "checking whether user namespaces are supported... " >&6; } -if test ${ax_cv_user_namespace+y} +if test ${ares_cv_user_namespace+y} then : printf %s "(cached) " >&6 else $as_nop @@ -24006,7 +24247,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu if test "$cross_compiling" = yes then : - ax_cv_user_namespace=no + ares_cv_user_namespace=no else $as_nop cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -24046,9 +24287,9 @@ int main() { _ACEOF if ac_fn_c_try_run "$LINENO" then : - ax_cv_user_namespace=yes + ares_cv_user_namespace=yes else $as_nop - ax_cv_user_namespace=no + ares_cv_user_namespace=no fi rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ conftest.$ac_objext conftest.beam conftest.$ac_ext @@ -24062,9 +24303,9 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu fi -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ax_cv_user_namespace" >&5 -printf "%s\n" "$ax_cv_user_namespace" >&6; } - if test "$ax_cv_user_namespace" = yes; then +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ares_cv_user_namespace" >&5 +printf "%s\n" "$ares_cv_user_namespace" >&6; } + if test "$ares_cv_user_namespace" = yes; then printf "%s\n" "#define HAVE_USER_NAMESPACE 1" >>confdefs.h @@ -24072,7 +24313,7 @@ printf "%s\n" "#define HAVE_USER_NAMESPACE 1" >>confdefs.h { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether UTS namespaces are supported" >&5 printf %s "checking whether UTS namespaces are supported... " >&6; } -if test ${ax_cv_uts_namespace+y} +if test ${ares_cv_uts_namespace+y} then : printf %s "(cached) " >&6 else $as_nop @@ -24085,7 +24326,7 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu if test "$cross_compiling" = yes then : - ax_cv_uts_namespace=no + ares_cv_uts_namespace=no else $as_nop cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. 
*/ @@ -24145,9 +24386,9 @@ int main() { _ACEOF if ac_fn_c_try_run "$LINENO" then : - ax_cv_uts_namespace=yes + ares_cv_uts_namespace=yes else $as_nop - ax_cv_uts_namespace=no + ares_cv_uts_namespace=no fi rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ conftest.$ac_objext conftest.beam conftest.$ac_ext @@ -24161,9 +24402,9 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu fi -{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ax_cv_uts_namespace" >&5 -printf "%s\n" "$ax_cv_uts_namespace" >&6; } - if test "$ax_cv_uts_namespace" = yes; then +{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ares_cv_uts_namespace" >&5 +printf "%s\n" "$ares_cv_uts_namespace" >&6; } + if test "$ares_cv_uts_namespace" = yes; then printf "%s\n" "#define HAVE_UTS_NAMESPACE 1" >>confdefs.h @@ -24218,7 +24459,17 @@ else $as_nop // MSVC always sets __cplusplus to 199711L in older versions; newer versions // only set it correctly if /Zc:__cplusplus is specified as well as a // /std:c++NN switch: +// // https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ +// +// The value __cplusplus ought to have is available in _MSVC_LANG since +// Visual Studio 2015 Update 3: +// +// https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros +// +// This was also the first MSVC version to support C++14 so we can't use the +// value of either __cplusplus or _MSVC_LANG to quickly rule out MSVC having +// C++11 or C++14 support, but we can check _MSVC_LANG for C++17 and later. #elif __cplusplus < 201103L && !defined _MSC_VER #error "This is not a C++11 compiler" @@ -26007,7 +26258,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by c-ares $as_me 1.34.3, which was +This file was extended by c-ares $as_me 1.34.4, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -26075,7 +26326,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -c-ares config.status 1.34.3 +c-ares config.status 1.34.4 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/deps/cares/configure.ac b/deps/cares/configure.ac index 5f848c28598a95..9dacf1fb2e4a40 100644 --- a/deps/cares/configure.ac +++ b/deps/cares/configure.ac @@ -2,10 +2,10 @@ dnl Copyright (C) The c-ares project and its contributors dnl SPDX-License-Identifier: MIT AC_PREREQ([2.69]) -AC_INIT([c-ares], [1.34.3], +AC_INIT([c-ares], [1.34.4], [c-ares mailing list: http://lists.haxx.se/listinfo/c-ares]) -CARES_VERSION_INFO="21:2:19" +CARES_VERSION_INFO="21:3:19" dnl This flag accepts an argument of the form current[:revision[:age]]. So, dnl passing -version-info 3:12:1 sets current to 3, revision to 12, and age to dnl 1. 
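The `CARES_VERSION_INFO` bump above follows the libtool `current:revision:age` scheme explained in the `dnl` comment directly before this point, so this patch release only advances the revision field (21:2:19 to 21:3:19) and the installed shared library remains ABI-compatible. To confirm at run time which c-ares an application actually linked against, the long-standing public `ares_version()` call can be used; a minimal sketch, using only the `<ares.h>` macros that appear elsewhere in this diff (the program and its output format are illustrative only):

```c
#include <stdio.h>
#include <ares.h>

int main(void)
{
  int packed;                                  /* (major << 16) | (minor << 8) | patch */
  const char *running = ares_version(&packed); /* e.g. "1.34.4" after this update */

  printf("compiled against c-ares %s, running %s (0x%06x)\n",
         ARES_VERSION_STR, running, packed);
  return 0;
}
```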
@@ -245,18 +245,25 @@ AC_SUBST(CARES_SYMBOL_HIDING_CFLAG) if test "$enable_warnings" = "yes"; then AX_APPEND_COMPILE_FLAGS([-Wall -Wextra -Waggregate-return -Wcast-align -Wcast-qual -Wconversion -Wdeclaration-after-statement -Wdouble-promotion -Wfloat-equal -Wformat-security -Winit-self -Wjump-misses-init -Wlogical-op -Wmissing-braces -Wmissing-declarations -Wmissing-format-attribute -Wmissing-include-dirs -Wmissing-prototypes -Wnested-externs -Wno-coverage-mismatch -Wold-style-definition -Wpacked -Wpedantic -Wpointer-arith -Wredundant-decls -Wshadow -Wsign-conversion -Wstrict-overflow -Wstrict-prototypes -Wtrampolines -Wundef -Wunreachable-code -Wunused -Wvariadic-macros -Wvla -Wwrite-strings -Werror=implicit-int -Werror=implicit-function-declaration -Werror=partial-availability -Wno-long-long ], [AM_CFLAGS], [-Werror]) - - dnl Android requires c99, all others should use c90 - case $host_os in - *android*) - AX_APPEND_COMPILE_FLAGS([-std=c99], [AM_CFLAGS], [-Werror]) - ;; - *) - AX_APPEND_COMPILE_FLAGS([-std=c90], [AM_CFLAGS], [-Werror]) - ;; - esac fi +dnl Android and QNX require c99, all others should use c90 +case $host_os in + *qnx*|*android*) + AX_APPEND_COMPILE_FLAGS([-std=c99], [AM_CFLAGS], [-Werror]) + ;; + *) + AX_APPEND_COMPILE_FLAGS([-std=c90], [AM_CFLAGS], [-Werror]) + ;; +esac + +dnl QNX needs -D_QNX_SOURCE +case $host_os in + *qnx*) + AX_APPEND_COMPILE_FLAGS([-D_QNX_SOURCE], [AM_CPPFLAGS], [-Werror]) + ;; +esac + if test "$ax_cv_c_compiler_vendor" = "intel"; then AX_APPEND_COMPILE_FLAGS([-shared-intel], [AM_CFLAGS]) fi @@ -543,6 +550,7 @@ dnl https://mailman.videolan.org/pipermail/vlc-devel/2015-March/101802.html dnl which would require we check each individually and provide function arguments dnl for the test. +AC_CHECK_DECL(strnlen, [AC_DEFINE([HAVE_STRNLEN], 1, [Define to 1 if you have `strnlen`] )], [], $cares_all_includes) AC_CHECK_DECL(memmem, [AC_DEFINE([HAVE_MEMMEM], 1, [Define to 1 if you have `memmem`] )], [], $cares_all_includes) AC_CHECK_DECL(recv, [AC_DEFINE([HAVE_RECV], 1, [Define to 1 if you have `recv`] )], [], $cares_all_includes) AC_CHECK_DECL(recvfrom, [AC_DEFINE([HAVE_RECVFROM], 1, [Define to 1 if you have `recvfrom`] )], [], $cares_all_includes) @@ -813,6 +821,13 @@ if test "x$build_tests" != "xno" -a "x$cross_compiling" = "xyes" ; then AC_MSG_ERROR([*** Tests not supported when cross compiling]) fi fi + +dnl Forces compiling of tests even when cross-compiling. 
+AC_ARG_ENABLE(tests-crossbuild, + AS_HELP_STRING([--enable-tests-crossbuild], [Enable test building even when cross building]), + [build_tests="$enableval"] +) + if test "x$build_tests" != "xno" ; then PKG_CHECK_MODULES([GMOCK], [gmock], [ have_gmock=yes ], [ have_gmock=no ]) if test "x$have_gmock" = "xno" ; then @@ -825,8 +840,8 @@ if test "x$build_tests" != "xno" ; then else PKG_CHECK_MODULES([GMOCK112], [gmock >= 1.12.0], [ have_gmock_v112=yes ], [ have_gmock_v112=no ]) if test "x$have_gmock_v112" = "xyes" ; then - AX_CHECK_USER_NAMESPACE - AX_CHECK_UTS_NAMESPACE + ARES_CHECK_USER_NAMESPACE + ARES_CHECK_UTS_NAMESPACE fi fi fi diff --git a/deps/cares/docs/Makefile.in b/deps/cares/docs/Makefile.in index 6b7bb8e30d1a20..0d1873c9662c92 100644 --- a/deps/cares/docs/Makefile.in +++ b/deps/cares/docs/Makefile.in @@ -92,7 +92,9 @@ build_triplet = @build@ host_triplet = @host@ subdir = docs ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ares_check_user_namespace.m4 \ + $(top_srcdir)/m4/ares_check_uts_namespace.m4 \ + $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_ac_print_to_file.m4 \ $(top_srcdir)/m4/ax_add_am_macro_static.m4 \ $(top_srcdir)/m4/ax_am_macros_static.m4 \ @@ -102,8 +104,6 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_check_compile_flag.m4 \ $(top_srcdir)/m4/ax_check_gnu_make.m4 \ $(top_srcdir)/m4/ax_check_link_flag.m4 \ - $(top_srcdir)/m4/ax_check_user_namespace.m4 \ - $(top_srcdir)/m4/ax_check_uts_namespace.m4 \ $(top_srcdir)/m4/ax_code_coverage.m4 \ $(top_srcdir)/m4/ax_compiler_vendor.m4 \ $(top_srcdir)/m4/ax_cxx_compile_stdcxx.m4 \ diff --git a/deps/cares/docs/ares_create_query.3 b/deps/cares/docs/ares_create_query.3 index a54eec3e2a6bd1..3af6ba4cc3dc5b 100644 --- a/deps/cares/docs/ares_create_query.3 +++ b/deps/cares/docs/ares_create_query.3 @@ -19,6 +19,9 @@ int ares_create_query(const char *\fIname\fP, int \fImax_udp_size\fP) .fi .SH DESCRIPTION +This function is deprecated as of c-ares 1.22, please use +\fIares_dns_record_create(3)\fP instead. + The \fIares_create_query(3)\fP function composes a DNS query with a single question. The parameter \fIname\fP gives the query name as a NUL-terminated C string of period-separated labels optionally ending with a period; periods and diff --git a/deps/cares/docs/ares_mkquery.3 b/deps/cares/docs/ares_mkquery.3 index 0e7b5edbb89353..2f42d169210fef 100644 --- a/deps/cares/docs/ares_mkquery.3 +++ b/deps/cares/docs/ares_mkquery.3 @@ -14,7 +14,8 @@ int ares_mkquery(const char *\fIname\fP, int \fIdnsclass\fP, int \fItype\fP, int *\fIbuflen\fP) .fi .SH DESCRIPTION -Deprecated function. See \fIares_create_query(3)\fP instead! +This function is deprecated as of c-ares 1.10, please use +\fIares_dns_record_create(3)\fP instead. The .B ares_mkquery diff --git a/deps/cares/docs/ares_send.3 b/deps/cares/docs/ares_send.3 index f6ea9140e2510c..df3e3bbe4136b0 100644 --- a/deps/cares/docs/ares_send.3 +++ b/deps/cares/docs/ares_send.3 @@ -113,6 +113,9 @@ is being destroyed; the query will not be completed. .B ARES_ENOSERVER The query will not be completed because no DNS servers were configured on the channel. +.TP 19 +.B ARES_EBADQUERY +Misformatted DNS query. 
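The new `ARES_EBADQUERY` entry above documents a status that `ares_send(3)` can pass to its completion callback when the submitted query buffer is malformed. A hedged sketch of a callback that singles this case out; the callback name is illustrative, and the signature is the classic `ares_callback` described in the same manual page:

```c
#include <stdio.h>
#include <ares.h>

/* Completion callback for ares_send(); abuf/alen carry the raw DNS reply. */
static void on_query_done(void *arg, int status, int timeouts,
                          unsigned char *abuf, int alen)
{
  (void)arg;
  (void)timeouts;

  if (status == ARES_EBADQUERY) {
    /* The query buffer handed to ares_send() could not be parsed as DNS. */
    fprintf(stderr, "query rejected as malformed: %s\n", ares_strerror(status));
    return;
  }
  if (status != ARES_SUCCESS) {
    fprintf(stderr, "query failed: %s\n", ares_strerror(status));
    return;
  }

  /* Parse abuf/alen here, e.g. with the ares_dns_record(3) parsing API. */
  (void)abuf;
  (void)alen;
}
```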
.PP The callback argument diff --git a/deps/cares/include/Makefile.in b/deps/cares/include/Makefile.in index 0beee44a22bb22..7dc40eb08fab9c 100644 --- a/deps/cares/include/Makefile.in +++ b/deps/cares/include/Makefile.in @@ -90,7 +90,9 @@ build_triplet = @build@ host_triplet = @host@ subdir = include ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ares_check_user_namespace.m4 \ + $(top_srcdir)/m4/ares_check_uts_namespace.m4 \ + $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_ac_print_to_file.m4 \ $(top_srcdir)/m4/ax_add_am_macro_static.m4 \ $(top_srcdir)/m4/ax_am_macros_static.m4 \ @@ -100,8 +102,6 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_check_compile_flag.m4 \ $(top_srcdir)/m4/ax_check_gnu_make.m4 \ $(top_srcdir)/m4/ax_check_link_flag.m4 \ - $(top_srcdir)/m4/ax_check_user_namespace.m4 \ - $(top_srcdir)/m4/ax_check_uts_namespace.m4 \ $(top_srcdir)/m4/ax_code_coverage.m4 \ $(top_srcdir)/m4/ax_compiler_vendor.m4 \ $(top_srcdir)/m4/ax_cxx_compile_stdcxx.m4 \ diff --git a/deps/cares/include/ares.h b/deps/cares/include/ares.h index 139c6d66ee90df..7fe3ec78f4e651 100644 --- a/deps/cares/include/ares.h +++ b/deps/cares/include/ares.h @@ -74,7 +74,7 @@ #if defined(_AIX) || defined(__NOVELL_LIBC__) || defined(__NetBSD__) || \ defined(__minix) || defined(__SYMBIAN32__) || defined(__INTEGRITY) || \ defined(ANDROID) || defined(__ANDROID__) || defined(__OpenBSD__) || \ - defined(__QNXNTO__) || defined(__MVS__) || defined(__HAIKU__) + defined(__QNX__) || defined(__MVS__) || defined(__HAIKU__) # include #endif diff --git a/deps/cares/include/ares_version.h b/deps/cares/include/ares_version.h index 9cb8084dd56bc9..782046bd79d844 100644 --- a/deps/cares/include/ares_version.h +++ b/deps/cares/include/ares_version.h @@ -32,8 +32,8 @@ #define ARES_VERSION_MAJOR 1 #define ARES_VERSION_MINOR 34 -#define ARES_VERSION_PATCH 3 -#define ARES_VERSION_STR "1.34.3" +#define ARES_VERSION_PATCH 4 +#define ARES_VERSION_STR "1.34.4" /* NOTE: We cannot make the version string a C preprocessor stringify operation * due to assumptions made by integrators that aren't properly using diff --git a/deps/cares/m4/ax_check_user_namespace.m4 b/deps/cares/m4/ares_check_user_namespace.m4 similarity index 82% rename from deps/cares/m4/ax_check_user_namespace.m4 rename to deps/cares/m4/ares_check_user_namespace.m4 index aca721626f2e89..a26b384fda5c54 100644 --- a/deps/cares/m4/ax_check_user_namespace.m4 +++ b/deps/cares/m4/ares_check_user_namespace.m4 @@ -2,7 +2,7 @@ # SYNOPSIS # -# AX_CHECK_USER_NAMESPACE +# ARES_CHECK_USER_NAMESPACE # # DESCRIPTION # @@ -12,9 +12,9 @@ # Copyright (C) The c-ares team # SPDX-License-Identifier: MIT -AC_DEFUN([AX_CHECK_USER_NAMESPACE],[dnl +AC_DEFUN([ARES_CHECK_USER_NAMESPACE],[dnl AC_CACHE_CHECK([whether user namespaces are supported], - ax_cv_user_namespace,[ + ares_cv_user_namespace,[ AC_LANG_PUSH([C]) AC_RUN_IFELSE([AC_LANG_SOURCE([[ #define _GNU_SOURCE @@ -48,10 +48,10 @@ int main() { if (!WIFEXITED(status)) return 1; return WEXITSTATUS(status); } - ]])],[ax_cv_user_namespace=yes],[ax_cv_user_namespace=no],[ax_cv_user_namespace=no]) + ]])],[ares_cv_user_namespace=yes],[ares_cv_user_namespace=no],[ares_cv_user_namespace=no]) AC_LANG_POP([C]) ]) - if test "$ax_cv_user_namespace" = yes; then + if test "$ares_cv_user_namespace" = yes; then AC_DEFINE([HAVE_USER_NAMESPACE],[1],[Whether user namespaces are available]) fi -]) # 
AX_CHECK_USER_NAMESPACE +]) # ARES_CHECK_USER_NAMESPACE diff --git a/deps/cares/m4/ax_check_uts_namespace.m4 b/deps/cares/m4/ares_check_uts_namespace.m4 similarity index 87% rename from deps/cares/m4/ax_check_uts_namespace.m4 rename to deps/cares/m4/ares_check_uts_namespace.m4 index 5708acf1b9f376..0aeefe4a9b7b8b 100644 --- a/deps/cares/m4/ax_check_uts_namespace.m4 +++ b/deps/cares/m4/ares_check_uts_namespace.m4 @@ -2,7 +2,7 @@ # SYNOPSIS # -# AX_CHECK_UTS_NAMESPACE +# ARES_CHECK_UTS_NAMESPACE # # DESCRIPTION # @@ -14,9 +14,9 @@ # Copyright (C) The c-ares team # SPDX-License-Identifier: MIT -AC_DEFUN([AX_CHECK_UTS_NAMESPACE],[dnl +AC_DEFUN([ARES_CHECK_UTS_NAMESPACE],[dnl AC_CACHE_CHECK([whether UTS namespaces are supported], - ax_cv_uts_namespace,[ + ares_cv_uts_namespace,[ AC_LANG_PUSH([C]) AC_RUN_IFELSE([AC_LANG_SOURCE([[ #define _GNU_SOURCE @@ -70,10 +70,10 @@ int main() { return WEXITSTATUS(status); } ]]) - ],[ax_cv_uts_namespace=yes],[ax_cv_uts_namespace=no],[ax_cv_uts_namespace=no]) + ],[ares_cv_uts_namespace=yes],[ares_cv_uts_namespace=no],[ares_cv_uts_namespace=no]) AC_LANG_POP([C]) ]) - if test "$ax_cv_uts_namespace" = yes; then + if test "$ares_cv_uts_namespace" = yes; then AC_DEFINE([HAVE_UTS_NAMESPACE],[1],[Whether UTS namespaces are available]) fi -]) # AX_CHECK_UTS_NAMESPACE +]) # ARES_CHECK_UTS_NAMESPACE diff --git a/deps/cares/m4/ax_append_compile_flags.m4 b/deps/cares/m4/ax_append_compile_flags.m4 index 1f8e70845c20d9..9c856356c0cda6 100644 --- a/deps/cares/m4/ax_append_compile_flags.m4 +++ b/deps/cares/m4/ax_append_compile_flags.m4 @@ -1,10 +1,10 @@ -# =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_append_compile_flags.html -# =========================================================================== +# ============================================================================ +# https://www.gnu.org/software/autoconf-archive/ax_append_compile_flags.html +# ============================================================================ # # SYNOPSIS # -# AX_APPEND_COMPILE_FLAGS([FLAG1 FLAG2 ...], [FLAGS-VARIABLE], [EXTRA-FLAGS]) +# AX_APPEND_COMPILE_FLAGS([FLAG1 FLAG2 ...], [FLAGS-VARIABLE], [EXTRA-FLAGS], [INPUT]) # # DESCRIPTION # @@ -20,6 +20,8 @@ # the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to # force the compiler to issue an error when a bad flag is given. # +# INPUT gives an alternative input source to AC_COMPILE_IFELSE. +# # NOTE: This macro depends on the AX_APPEND_FLAG and # AX_CHECK_COMPILE_FLAG. Please keep this macro in sync with # AX_APPEND_LINK_FLAGS. @@ -28,38 +30,17 @@ # # Copyright (c) 2011 Maarten Bosmans # -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, see . -# -# As a special exception, the respective Autoconf Macro's copyright owner -# gives unlimited permission to copy, distribute and modify the configure -# scripts that are the output of Autoconf when processing the Macro. 
You -# need not follow the terms of the GNU General Public License when using -# or distributing such scripts, even though portions of the text of the -# Macro appear in them. The GNU General Public License (GPL) does govern -# all other use of the material that constitutes the Autoconf Macro. -# -# This special exception to the GPL applies to versions of the Autoconf -# Macro released by the Autoconf Archive. When you make and distribute a -# modified version of the Autoconf Macro, you may extend this special -# exception to the GPL to apply to your modified version as well. +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. -#serial 3 +#serial 7 AC_DEFUN([AX_APPEND_COMPILE_FLAGS], -[AC_REQUIRE([AX_CHECK_COMPILE_FLAG]) -AC_REQUIRE([AX_APPEND_FLAG]) +[AX_REQUIRE_DEFINED([AX_CHECK_COMPILE_FLAG]) +AX_REQUIRE_DEFINED([AX_APPEND_FLAG]) for flag in $1; do - AX_CHECK_COMPILE_FLAG([$flag], [AX_APPEND_FLAG([$flag], [$2])], [], [$3]) + AX_CHECK_COMPILE_FLAG([$flag], [AX_APPEND_FLAG([$flag], [$2])], [], [$3], [$4]) done ])dnl AX_APPEND_COMPILE_FLAGS diff --git a/deps/cares/m4/ax_append_flag.m4 b/deps/cares/m4/ax_append_flag.m4 index 1d38b76fb8e157..dd6d8b61406c32 100644 --- a/deps/cares/m4/ax_append_flag.m4 +++ b/deps/cares/m4/ax_append_flag.m4 @@ -1,5 +1,5 @@ # =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_append_flag.html +# https://www.gnu.org/software/autoconf-archive/ax_append_flag.html # =========================================================================== # # SYNOPSIS @@ -23,47 +23,28 @@ # Copyright (c) 2008 Guido U. Draheim # Copyright (c) 2011 Maarten Bosmans # -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, see . -# -# As a special exception, the respective Autoconf Macro's copyright owner -# gives unlimited permission to copy, distribute and modify the configure -# scripts that are the output of Autoconf when processing the Macro. You -# need not follow the terms of the GNU General Public License when using -# or distributing such scripts, even though portions of the text of the -# Macro appear in them. The GNU General Public License (GPL) does govern -# all other use of the material that constitutes the Autoconf Macro. -# -# This special exception to the GPL applies to versions of the Autoconf -# Macro released by the Autoconf Archive. When you make and distribute a -# modified version of the Autoconf Macro, you may extend this special -# exception to the GPL to apply to your modified version as well. +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. 
-#serial 2 +#serial 8 AC_DEFUN([AX_APPEND_FLAG], -[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX -AS_VAR_PUSHDEF([FLAGS], [m4_default($2,_AC_LANG_PREFIX[FLAGS])])dnl -AS_VAR_SET_IF(FLAGS, - [case " AS_VAR_GET(FLAGS) " in - *" $1 "*) - AC_RUN_LOG([: FLAGS already contains $1]) - ;; - *) - AC_RUN_LOG([: FLAGS="$FLAGS $1"]) - AS_VAR_SET(FLAGS, ["AS_VAR_GET(FLAGS) $1"]) - ;; - esac], - [AS_VAR_SET(FLAGS,["$1"])]) +[dnl +AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_SET_IF +AS_VAR_PUSHDEF([FLAGS], [m4_default($2,_AC_LANG_PREFIX[FLAGS])]) +AS_VAR_SET_IF(FLAGS,[ + AS_CASE([" AS_VAR_GET(FLAGS) "], + [*" $1 "*], [AC_RUN_LOG([: FLAGS already contains $1])], + [ + AS_VAR_APPEND(FLAGS,[" $1"]) + AC_RUN_LOG([: FLAGS="$FLAGS"]) + ]) + ], + [ + AS_VAR_SET(FLAGS,[$1]) + AC_RUN_LOG([: FLAGS="$FLAGS"]) + ]) AS_VAR_POPDEF([FLAGS])dnl ])dnl AX_APPEND_FLAG diff --git a/deps/cares/m4/ax_check_compile_flag.m4 b/deps/cares/m4/ax_check_compile_flag.m4 index c3a8d695a1bcda..54191c55353ee5 100644 --- a/deps/cares/m4/ax_check_compile_flag.m4 +++ b/deps/cares/m4/ax_check_compile_flag.m4 @@ -1,10 +1,10 @@ # =========================================================================== -# http://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html +# https://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html # =========================================================================== # # SYNOPSIS # -# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS]) +# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) # # DESCRIPTION # @@ -19,6 +19,8 @@ # the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to # force the compiler to issue an error when a bad flag is given. # +# INPUT gives an alternative input source to AC_COMPILE_IFELSE. +# # NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this # macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. # @@ -27,45 +29,34 @@ # Copyright (c) 2008 Guido U. Draheim # Copyright (c) 2011 Maarten Bosmans # -# This program is free software: you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation, either version 3 of the License, or (at your -# option) any later version. -# -# This program is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General -# Public License for more details. -# -# You should have received a copy of the GNU General Public License along -# with this program. If not, see . -# -# As a special exception, the respective Autoconf Macro's copyright owner -# gives unlimited permission to copy, distribute and modify the configure -# scripts that are the output of Autoconf when processing the Macro. You -# need not follow the terms of the GNU General Public License when using -# or distributing such scripts, even though portions of the text of the -# Macro appear in them. The GNU General Public License (GPL) does govern -# all other use of the material that constitutes the Autoconf Macro. -# -# This special exception to the GPL applies to versions of the Autoconf -# Macro released by the Autoconf Archive. When you make and distribute a -# modified version of the Autoconf Macro, you may extend this special -# exception to the GPL to apply to your modified version as well. 
+# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. -#serial 2 +#serial 11 AC_DEFUN([AX_CHECK_COMPILE_FLAG], -[AC_PREREQ(2.59)dnl for _AC_LANG_PREFIX +[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl -AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ +AC_CACHE_CHECK([whether the _AC_LANG compiler accepts $1], CACHEVAR, [ ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS - _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" - AC_COMPILE_IFELSE([AC_LANG_PROGRAM()], + if test x"m4_case(_AC_LANG, + [C], [$GCC], + [C++], [$GXX], + [Fortran], [$GFC], + [Fortran 77], [$G77], + [Objective C], [$GOBJC], + [Objective C++], [$GOBJCXX], + [no])" = xyes ; then + add_gnu_werror="-Werror" + fi + _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1 $add_gnu_werror" + AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], [AS_VAR_SET(CACHEVAR,[yes])], [AS_VAR_SET(CACHEVAR,[no])]) _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) -AS_IF([test x"AS_VAR_GET(CACHEVAR)" = xyes], +AS_VAR_IF(CACHEVAR,yes, [m4_default([$2], :)], [m4_default([$3], :)]) AS_VAR_POPDEF([CACHEVAR])dnl diff --git a/deps/cares/m4/ax_code_coverage.m4 b/deps/cares/m4/ax_code_coverage.m4 index ad4063305ebcdd..216708a41f10c9 100644 --- a/deps/cares/m4/ax_code_coverage.m4 +++ b/deps/cares/m4/ax_code_coverage.m4 @@ -74,7 +74,7 @@ # You should have received a copy of the GNU Lesser General Public License # along with this program. If not, see . -#serial 34 +#serial 37 m4_define(_AX_CODE_COVERAGE_RULES,[ AX_ADD_AM_MACRO_STATIC([ @@ -144,7 +144,7 @@ code_coverage_v_lcov_cap_ = \$(code_coverage_v_lcov_cap_\$(AM_DEFAULT_VERBOSITY) code_coverage_v_lcov_cap_0 = @echo \" LCOV --capture\" \$(CODE_COVERAGE_OUTPUT_FILE); code_coverage_v_lcov_ign = \$(code_coverage_v_lcov_ign_\$(V)) code_coverage_v_lcov_ign_ = \$(code_coverage_v_lcov_ign_\$(AM_DEFAULT_VERBOSITY)) -code_coverage_v_lcov_ign_0 = @echo \" LCOV --remove /tmp/*\" \$(CODE_COVERAGE_IGNORE_PATTERN); +code_coverage_v_lcov_ign_0 = @echo \" LCOV --remove\" \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" \$(CODE_COVERAGE_IGNORE_PATTERN); code_coverage_v_genhtml = \$(code_coverage_v_genhtml_\$(V)) code_coverage_v_genhtml_ = \$(code_coverage_v_genhtml_\$(AM_DEFAULT_VERBOSITY)) code_coverage_v_genhtml_0 = @echo \" GEN \" \"\$(CODE_COVERAGE_OUTPUT_DIRECTORY)\"; @@ -163,7 +163,7 @@ check-code-coverage: # Capture code coverage data code-coverage-capture: code-coverage-capture-hook \$(code_coverage_v_lcov_cap)\$(LCOV) \$(code_coverage_quiet) \$(addprefix --directory ,\$(CODE_COVERAGE_DIRECTORY)) --capture --output-file \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" --test-name \"\$(call code_coverage_sanitize,\$(PACKAGE_NAME)-\$(PACKAGE_VERSION))\" --no-checksum --compat-libtool \$(CODE_COVERAGE_LCOV_SHOPTS) \$(CODE_COVERAGE_LCOV_OPTIONS) - \$(code_coverage_v_lcov_ign)\$(LCOV) \$(code_coverage_quiet) \$(addprefix --directory ,\$(CODE_COVERAGE_DIRECTORY)) --remove \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" \"/tmp/*\" \$(CODE_COVERAGE_IGNORE_PATTERN) --output-file \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \$(CODE_COVERAGE_LCOV_SHOPTS) \$(CODE_COVERAGE_LCOV_RMOPTS) + \$(code_coverage_v_lcov_ign)\$(LCOV) \$(code_coverage_quiet) \$(addprefix --directory ,\$(CODE_COVERAGE_DIRECTORY)) --remove \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" \$(CODE_COVERAGE_IGNORE_PATTERN) 
--output-file \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \$(CODE_COVERAGE_LCOV_SHOPTS) \$(CODE_COVERAGE_LCOV_RMOPTS) -@rm -f \"\$(CODE_COVERAGE_OUTPUT_FILE).tmp\" \$(code_coverage_v_genhtml)LANG=C \$(GENHTML) \$(code_coverage_quiet) \$(addprefix --prefix ,\$(CODE_COVERAGE_DIRECTORY)) --output-directory \"\$(CODE_COVERAGE_OUTPUT_DIRECTORY)\" --title \"\$(PACKAGE_NAME)-\$(PACKAGE_VERSION) Code Coverage\" --legend --show-details \"\$(CODE_COVERAGE_OUTPUT_FILE)\" \$(CODE_COVERAGE_GENHTML_OPTIONS) @echo \"file://\$(abs_builddir)/\$(CODE_COVERAGE_OUTPUT_DIRECTORY)/index.html\" @@ -206,14 +206,14 @@ code-coverage-capture-hook: ]) AC_DEFUN([_AX_CODE_COVERAGE_ENABLED],[ - AX_CHECK_GNU_MAKE([],AC_MSG_ERROR([not using GNU make that is needed for coverage])) + AX_CHECK_GNU_MAKE([],[AC_MSG_ERROR([not using GNU make that is needed for coverage])]) AC_REQUIRE([AX_ADD_AM_MACRO_STATIC]) # check for gcov AC_CHECK_TOOL([GCOV], [$_AX_CODE_COVERAGE_GCOV_PROG_WITH], [:]) AS_IF([test "X$GCOV" = "X:"], - AC_MSG_ERROR([gcov is needed to do coverage])) + [AC_MSG_ERROR([gcov is needed to do coverage])]) AC_SUBST([GCOV]) dnl Check if gcc is being used @@ -232,12 +232,13 @@ AC_DEFUN([_AX_CODE_COVERAGE_ENABLED],[ AC_MSG_ERROR([Could not find genhtml from the lcov package]) ]) + AC_CHECK_LIB([gcov], [_gcov_init], [CODE_COVERAGE_LIBS="-lgcov"], [CODE_COVERAGE_LIBS=""]) + dnl Build the code coverage flags dnl Define CODE_COVERAGE_LDFLAGS for backwards compatibility CODE_COVERAGE_CPPFLAGS="-DNDEBUG" CODE_COVERAGE_CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" CODE_COVERAGE_CXXFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" - CODE_COVERAGE_LIBS="-lgcov" AC_SUBST([CODE_COVERAGE_CPPFLAGS]) AC_SUBST([CODE_COVERAGE_CFLAGS]) diff --git a/deps/cares/m4/ax_cxx_compile_stdcxx.m4 b/deps/cares/m4/ax_cxx_compile_stdcxx.m4 index 8edf5152ec7a91..fe6ae17e6c4d32 100644 --- a/deps/cares/m4/ax_cxx_compile_stdcxx.m4 +++ b/deps/cares/m4/ax_cxx_compile_stdcxx.m4 @@ -10,8 +10,8 @@ # # Check for baseline language coverage in the compiler for the specified # version of the C++ standard. If necessary, add switches to CXX and -# CXXCPP to enable support. VERSION may be '11', '14', '17', or '20' for -# the respective C++ standard version. +# CXXCPP to enable support. VERSION may be '11', '14', '17', '20', or +# '23' for the respective C++ standard version. # # The second argument, if specified, indicates whether you insist on an # extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g. @@ -36,14 +36,15 @@ # Copyright (c) 2016, 2018 Krzesimir Nowak # Copyright (c) 2019 Enji Cooper # Copyright (c) 2020 Jason Merrill -# Copyright (c) 2021 Jörn Heusipp +# Copyright (c) 2021, 2024 Jörn Heusipp +# Copyright (c) 2015, 2022, 2023, 2024 Olly Betts # # Copying and distribution of this file, with or without modification, are # permitted in any medium without royalty provided the copyright notice # and this notice are preserved. This file is offered as-is, without any # warranty. -#serial 18 +#serial 25 dnl This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro dnl (serial version number 13). 
@@ -53,6 +54,7 @@ AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl [$1], [14], [ax_cxx_compile_alternatives="14 1y"], [$1], [17], [ax_cxx_compile_alternatives="17 1z"], [$1], [20], [ax_cxx_compile_alternatives="20"], + [$1], [23], [ax_cxx_compile_alternatives="23"], [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl m4_if([$2], [], [], [$2], [ext], [], @@ -159,31 +161,41 @@ AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl dnl Test body for checking C++11 support m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11], - _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 + [_AX_CXX_COMPILE_STDCXX_testbody_new_in_11] ) dnl Test body for checking C++14 support m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14], - _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 - _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 + [_AX_CXX_COMPILE_STDCXX_testbody_new_in_11 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_14] ) dnl Test body for checking C++17 support m4_define([_AX_CXX_COMPILE_STDCXX_testbody_17], - _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 - _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 - _AX_CXX_COMPILE_STDCXX_testbody_new_in_17 + [_AX_CXX_COMPILE_STDCXX_testbody_new_in_11 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_17] ) dnl Test body for checking C++20 support m4_define([_AX_CXX_COMPILE_STDCXX_testbody_20], - _AX_CXX_COMPILE_STDCXX_testbody_new_in_11 - _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 - _AX_CXX_COMPILE_STDCXX_testbody_new_in_17 - _AX_CXX_COMPILE_STDCXX_testbody_new_in_20 + [_AX_CXX_COMPILE_STDCXX_testbody_new_in_11 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_17 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_20] +) + +dnl Test body for checking C++23 support + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_23], + [_AX_CXX_COMPILE_STDCXX_testbody_new_in_11 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_14 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_17 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_20 + _AX_CXX_COMPILE_STDCXX_testbody_new_in_23] ) @@ -201,7 +213,17 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[ // MSVC always sets __cplusplus to 199711L in older versions; newer versions // only set it correctly if /Zc:__cplusplus is specified as well as a // /std:c++NN switch: +// // https://devblogs.microsoft.com/cppblog/msvc-now-correctly-reports-__cplusplus/ +// +// The value __cplusplus ought to have is available in _MSVC_LANG since +// Visual Studio 2015 Update 3: +// +// https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros +// +// This was also the first MSVC version to support C++14 so we can't use the +// value of either __cplusplus or _MSVC_LANG to quickly rule out MSVC having +// C++11 or C++14 support, but we can check _MSVC_LANG for C++17 and later. #elif __cplusplus < 201103L && !defined _MSC_VER #error "This is not a C++11 compiler" @@ -617,7 +639,7 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_17], [[ #error "This is not a C++ compiler" -#elif __cplusplus < 201703L && !defined _MSC_VER +#elif (defined _MSVC_LANG ? _MSVC_LANG : __cplusplus) < 201703L #error "This is not a C++17 compiler" @@ -983,7 +1005,7 @@ namespace cxx17 } // namespace cxx17 -#endif // __cplusplus < 201703L && !defined _MSC_VER +#endif // (defined _MSVC_LANG ? _MSVC_LANG : __cplusplus) < 201703L ]]) @@ -996,7 +1018,7 @@ m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_20], [[ #error "This is not a C++ compiler" -#elif __cplusplus < 202002L && !defined _MSC_VER +#elif (defined _MSVC_LANG ? 
_MSVC_LANG : __cplusplus) < 202002L #error "This is not a C++20 compiler" @@ -1013,6 +1035,36 @@ namespace cxx20 } // namespace cxx20 -#endif // __cplusplus < 202002L && !defined _MSC_VER +#endif // (defined _MSVC_LANG ? _MSVC_LANG : __cplusplus) < 202002L + +]]) + + +dnl Tests for new features in C++23 + +m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_23], [[ + +#ifndef __cplusplus + +#error "This is not a C++ compiler" + +#elif (defined _MSVC_LANG ? _MSVC_LANG : __cplusplus) < 202302L + +#error "This is not a C++23 compiler" + +#else + +#include + +namespace cxx23 +{ + +// As C++23 supports feature test macros in the standard, there is no +// immediate need to actually test for feature availability on the +// Autoconf side. + +} // namespace cxx23 + +#endif // (defined _MSVC_LANG ? _MSVC_LANG : __cplusplus) < 202302L ]]) diff --git a/deps/cares/src/Makefile.in b/deps/cares/src/Makefile.in index 0c3c0864d4460a..1f286880247aa1 100644 --- a/deps/cares/src/Makefile.in +++ b/deps/cares/src/Makefile.in @@ -89,7 +89,9 @@ build_triplet = @build@ host_triplet = @host@ subdir = src ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ares_check_user_namespace.m4 \ + $(top_srcdir)/m4/ares_check_uts_namespace.m4 \ + $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_ac_print_to_file.m4 \ $(top_srcdir)/m4/ax_add_am_macro_static.m4 \ $(top_srcdir)/m4/ax_am_macros_static.m4 \ @@ -99,8 +101,6 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_check_compile_flag.m4 \ $(top_srcdir)/m4/ax_check_gnu_make.m4 \ $(top_srcdir)/m4/ax_check_link_flag.m4 \ - $(top_srcdir)/m4/ax_check_user_namespace.m4 \ - $(top_srcdir)/m4/ax_check_uts_namespace.m4 \ $(top_srcdir)/m4/ax_code_coverage.m4 \ $(top_srcdir)/m4/ax_compiler_vendor.m4 \ $(top_srcdir)/m4/ax_cxx_compile_stdcxx.m4 \ diff --git a/deps/cares/src/lib/CMakeLists.txt b/deps/cares/src/lib/CMakeLists.txt index 9956fd625b2ad6..9d4e10924d0adb 100644 --- a/deps/cares/src/lib/CMakeLists.txt +++ b/deps/cares/src/lib/CMakeLists.txt @@ -92,11 +92,23 @@ IF (CARES_STATIC) SET_TARGET_PROPERTIES (${LIBNAME} PROPERTIES EXPORT_NAME cares${STATIC_SUFFIX} - OUTPUT_NAME cares${STATIC_SUFFIX} COMPILE_PDB_NAME cares${STATIC_SUFFIX} C_STANDARD 90 ) + # On Windows, the output name should have a static suffix since otherwise + # we would have conflicting output names (libcares.lib) for the link + # library. + # However on Unix-like systems, we typically have something like + # libcares.so for shared libraries and libcares.a for static + # libraries, so these don't conflict. 
+ # This behavior better emulates what happens with autotools builds + IF (WIN32) + SET_TARGET_PROPERTIES(${LIBNAME} PROPERTIES OUTPUT_NAME cares${STATIC_SUFFIX}) + ELSE () + SET_TARGET_PROPERTIES(${LIBNAME} PROPERTIES OUTPUT_NAME cares) + ENDIF() + IF (ANDROID) SET_TARGET_PROPERTIES (${LIBNAME} PROPERTIES C_STANDARD 99) ENDIF () diff --git a/deps/cares/src/lib/Makefile.in b/deps/cares/src/lib/Makefile.in index 4aff043b26a310..a45fc10b544755 100644 --- a/deps/cares/src/lib/Makefile.in +++ b/deps/cares/src/lib/Makefile.in @@ -15,7 +15,7 @@ @SET_MAKE@ # aminclude_static.am generated automatically by Autoconf -# from AX_AM_MACROS_STATIC on Sat Nov 9 17:40:37 UTC 2024 +# from AX_AM_MACROS_STATIC on Sat Dec 14 15:15:44 UTC 2024 # Copyright (C) The c-ares project and its contributors # SPDX-License-Identifier: MIT @@ -100,7 +100,9 @@ host_triplet = @host@ subdir = src/lib SUBDIRS = ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ares_check_user_namespace.m4 \ + $(top_srcdir)/m4/ares_check_uts_namespace.m4 \ + $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_ac_print_to_file.m4 \ $(top_srcdir)/m4/ax_add_am_macro_static.m4 \ $(top_srcdir)/m4/ax_am_macros_static.m4 \ @@ -110,8 +112,6 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_check_compile_flag.m4 \ $(top_srcdir)/m4/ax_check_gnu_make.m4 \ $(top_srcdir)/m4/ax_check_link_flag.m4 \ - $(top_srcdir)/m4/ax_check_user_namespace.m4 \ - $(top_srcdir)/m4/ax_check_uts_namespace.m4 \ $(top_srcdir)/m4/ax_code_coverage.m4 \ $(top_srcdir)/m4/ax_compiler_vendor.m4 \ $(top_srcdir)/m4/ax_cxx_compile_stdcxx.m4 \ @@ -629,7 +629,7 @@ libcares_la_CPPFLAGS_EXTRA = -DCARES_BUILDING_LIBRARY $(am__append_3) \ @CODE_COVERAGE_ENABLED_TRUE@code_coverage_v_lcov_cap_0 = @echo " LCOV --capture" $(CODE_COVERAGE_OUTPUT_FILE); @CODE_COVERAGE_ENABLED_TRUE@code_coverage_v_lcov_ign = $(code_coverage_v_lcov_ign_$(V)) @CODE_COVERAGE_ENABLED_TRUE@code_coverage_v_lcov_ign_ = $(code_coverage_v_lcov_ign_$(AM_DEFAULT_VERBOSITY)) -@CODE_COVERAGE_ENABLED_TRUE@code_coverage_v_lcov_ign_0 = @echo " LCOV --remove /tmp/*" $(CODE_COVERAGE_IGNORE_PATTERN); +@CODE_COVERAGE_ENABLED_TRUE@code_coverage_v_lcov_ign_0 = @echo " LCOV --remove" "$(CODE_COVERAGE_OUTPUT_FILE).tmp" $(CODE_COVERAGE_IGNORE_PATTERN); @CODE_COVERAGE_ENABLED_TRUE@code_coverage_v_genhtml = $(code_coverage_v_genhtml_$(V)) @CODE_COVERAGE_ENABLED_TRUE@code_coverage_v_genhtml_ = $(code_coverage_v_genhtml_$(AM_DEFAULT_VERBOSITY)) @CODE_COVERAGE_ENABLED_TRUE@code_coverage_v_genhtml_0 = @echo " GEN " "$(CODE_COVERAGE_OUTPUT_DIRECTORY)"; @@ -2328,7 +2328,7 @@ uninstall-am: uninstall-libLTLIBRARIES # Capture code coverage data @CODE_COVERAGE_ENABLED_TRUE@code-coverage-capture: code-coverage-capture-hook @CODE_COVERAGE_ENABLED_TRUE@ $(code_coverage_v_lcov_cap)$(LCOV) $(code_coverage_quiet) $(addprefix --directory ,$(CODE_COVERAGE_DIRECTORY)) --capture --output-file "$(CODE_COVERAGE_OUTPUT_FILE).tmp" --test-name "$(call code_coverage_sanitize,$(PACKAGE_NAME)-$(PACKAGE_VERSION))" --no-checksum --compat-libtool $(CODE_COVERAGE_LCOV_SHOPTS) $(CODE_COVERAGE_LCOV_OPTIONS) -@CODE_COVERAGE_ENABLED_TRUE@ $(code_coverage_v_lcov_ign)$(LCOV) $(code_coverage_quiet) $(addprefix --directory ,$(CODE_COVERAGE_DIRECTORY)) --remove "$(CODE_COVERAGE_OUTPUT_FILE).tmp" "/tmp/*" $(CODE_COVERAGE_IGNORE_PATTERN) --output-file "$(CODE_COVERAGE_OUTPUT_FILE)" $(CODE_COVERAGE_LCOV_SHOPTS) $(CODE_COVERAGE_LCOV_RMOPTS) 
+@CODE_COVERAGE_ENABLED_TRUE@ $(code_coverage_v_lcov_ign)$(LCOV) $(code_coverage_quiet) $(addprefix --directory ,$(CODE_COVERAGE_DIRECTORY)) --remove "$(CODE_COVERAGE_OUTPUT_FILE).tmp" $(CODE_COVERAGE_IGNORE_PATTERN) --output-file "$(CODE_COVERAGE_OUTPUT_FILE)" $(CODE_COVERAGE_LCOV_SHOPTS) $(CODE_COVERAGE_LCOV_RMOPTS) @CODE_COVERAGE_ENABLED_TRUE@ -@rm -f "$(CODE_COVERAGE_OUTPUT_FILE).tmp" @CODE_COVERAGE_ENABLED_TRUE@ $(code_coverage_v_genhtml)LANG=C $(GENHTML) $(code_coverage_quiet) $(addprefix --prefix ,$(CODE_COVERAGE_DIRECTORY)) --output-directory "$(CODE_COVERAGE_OUTPUT_DIRECTORY)" --title "$(PACKAGE_NAME)-$(PACKAGE_VERSION) Code Coverage" --legend --show-details "$(CODE_COVERAGE_OUTPUT_FILE)" $(CODE_COVERAGE_GENHTML_OPTIONS) @CODE_COVERAGE_ENABLED_TRUE@ @echo "file://$(abs_builddir)/$(CODE_COVERAGE_OUTPUT_DIRECTORY)/index.html" diff --git a/deps/cares/src/lib/ares_config.h.cmake b/deps/cares/src/lib/ares_config.h.cmake index 051b97f494fd32..51744fe143868c 100644 --- a/deps/cares/src/lib/ares_config.h.cmake +++ b/deps/cares/src/lib/ares_config.h.cmake @@ -257,6 +257,9 @@ /* Define to 1 if you have the header file. */ #cmakedefine HAVE_SIGNAL_H 1 +/* Define to 1 if you have the strnlen function. */ +#cmakedefine HAVE_STRNLEN 1 + /* Define to 1 if your struct sockaddr_in6 has sin6_scope_id. */ #cmakedefine HAVE_STRUCT_SOCKADDR_IN6_SIN6_SCOPE_ID 1 diff --git a/deps/cares/src/lib/ares_config.h.in b/deps/cares/src/lib/ares_config.h.in index d1f09d694db68e..a62e17089358aa 100644 --- a/deps/cares/src/lib/ares_config.h.in +++ b/deps/cares/src/lib/ares_config.h.in @@ -309,6 +309,9 @@ /* Define to 1 if you have `strnicmp` */ #undef HAVE_STRNICMP +/* Define to 1 if you have `strnlen` */ +#undef HAVE_STRNLEN + /* Define to 1 if the system has the type `struct addrinfo'. 
*/ #undef HAVE_STRUCT_ADDRINFO diff --git a/deps/cares/src/lib/ares_private.h b/deps/cares/src/lib/ares_private.h index ce8c3f2ddc2f6c..e6d44e8b8640f9 100644 --- a/deps/cares/src/lib/ares_private.h +++ b/deps/cares/src/lib/ares_private.h @@ -388,8 +388,23 @@ ares_status_t ares_sysconfig_set_options(ares_sysconfig_t *sysconfig, ares_status_t ares_init_by_environment(ares_sysconfig_t *sysconfig); + +typedef ares_status_t (*ares_sysconfig_line_cb_t)(const ares_channel_t *channel, + ares_sysconfig_t *sysconfig, + ares_buf_t *line); + +ares_status_t ares_sysconfig_parse_resolv_line(const ares_channel_t *channel, + ares_sysconfig_t *sysconfig, + ares_buf_t *line); + +ares_status_t ares_sysconfig_process_buf(const ares_channel_t *channel, + ares_sysconfig_t *sysconfig, + ares_buf_t *buf, + ares_sysconfig_line_cb_t cb); + ares_status_t ares_init_sysconfig_files(const ares_channel_t *channel, - ares_sysconfig_t *sysconfig); + ares_sysconfig_t *sysconfig, + ares_bool_t process_resolvconf); #ifdef __APPLE__ ares_status_t ares_init_sysconfig_macos(const ares_channel_t *channel, ares_sysconfig_t *sysconfig); diff --git a/deps/cares/src/lib/ares_set_socket_functions.c b/deps/cares/src/lib/ares_set_socket_functions.c index 143c491174fdba..7216ffa933fc07 100644 --- a/deps/cares/src/lib/ares_set_socket_functions.c +++ b/deps/cares/src/lib/ares_set_socket_functions.c @@ -288,7 +288,9 @@ static int default_asetsockopt(ares_socket_t sock, ares_socket_opt_t opt, return setsockopt(sock, SOL_SOCKET, SO_RCVBUF, val, val_size); case ARES_SOCKET_OPT_BIND_DEVICE: - if (!ares_str_isprint(val, (size_t)val_size)) { + /* Count the number of characters before NULL terminator then + * validate those are all printable */ + if (!ares_str_isprint(val, ares_strnlen(val, (size_t)val_size))) { SET_SOCKERRNO(EINVAL); return -1; } diff --git a/deps/cares/src/lib/ares_socket.c b/deps/cares/src/lib/ares_socket.c index df02fd61b60b14..516852a84abfb8 100644 --- a/deps/cares/src/lib/ares_socket.c +++ b/deps/cares/src/lib/ares_socket.c @@ -263,7 +263,8 @@ ares_status_t ares_socket_configure(ares_channel_t *channel, int family, * compatibility */ (void)channel->sock_funcs.asetsockopt( fd, ARES_SOCKET_OPT_BIND_DEVICE, channel->local_dev_name, - sizeof(channel->local_dev_name), channel->sock_func_cb_data); + (ares_socklen_t)ares_strlen(channel->local_dev_name), + channel->sock_func_cb_data); } /* Bind to ip address if configured */ diff --git a/deps/cares/src/lib/ares_sysconfig.c b/deps/cares/src/lib/ares_sysconfig.c index 9f0d7e5061ffe0..286db60328f45b 100644 --- a/deps/cares/src/lib/ares_sysconfig.c +++ b/deps/cares/src/lib/ares_sysconfig.c @@ -260,6 +260,94 @@ static ares_status_t ares_init_sysconfig_android(const ares_channel_t *channel, } #endif +#if defined(__QNX__) +static ares_status_t + ares_init_sysconfig_qnx(const ares_channel_t *channel, + ares_sysconfig_t *sysconfig) +{ + /* QNX: + * 1. use confstr(_CS_RESOLVE, ...) as primary resolv.conf data, replacing + * "_" with " ". If that is empty, then do normal /etc/resolv.conf + * processing. + * 2. We want to process /etc/nsswitch.conf as normal. + * 3. if confstr(_CS_DOMAIN, ...) this is the domain name. Use this as + * preference over anything else found. + */ + ares_buf_t *buf = ares_buf_create(); + unsigned char *data = NULL; + size_t data_size = 0; + ares_bool_t process_resolvconf = ARES_TRUE; + ares_status_t status = ARES_SUCCESS; + + /* Prefer confstr(_CS_RESOLVE, ...) 
*/ + buf = ares_buf_create(); + if (buf == NULL) { + status = ARES_ENOMEM; + goto done; + } + + data_size = 1024; + data = ares_buf_append_start(buf, &data_size); + if (data == NULL) { + status = ARES_ENOMEM; + goto done; + } + + data_size = confstr(_CS_RESOLVE, (char *)data, data_size); + if (data_size > 1) { + /* confstr returns byte for NULL terminator, strip */ + data_size--; + + ares_buf_append_finish(buf, data_size); + /* Its odd, this uses _ instead of " " between keywords, otherwise the + * format is the same as resolv.conf, replace. */ + ares_buf_replace(buf, (const unsigned char *)"_", 1, + (const unsigned char *)" ", 1); + + status = ares_sysconfig_process_buf(channel, sysconfig, buf, + ares_sysconfig_parse_resolv_line); + if (status != ARES_SUCCESS) { + /* ENOMEM is really the only error we'll get here */ + goto done; + } + + /* don't read resolv.conf if we processed *any* nameservers */ + if (ares_llist_len(sysconfig->sconfig) != 0) { + process_resolvconf = ARES_FALSE; + } + } + + /* Process files */ + status = ares_init_sysconfig_files(channel, sysconfig, process_resolvconf); + if (status != ARES_SUCCESS) { + goto done; + } + + /* Read confstr(_CS_DOMAIN, ...), but if we had a search path specified with + * more than one domain, lets prefer that instead. Its not exactly clear + * the best way to handle this. */ + if (sysconfig->ndomains <= 1) { + char domain[256]; + size_t domain_len; + + domain_len = confstr(_CS_DOMAIN, domain, sizeof(domain_len)); + if (domain_len != 0) { + ares_strsplit_free(sysconfig->domains, sysconfig->ndomains); + sysconfig->domains = ares_strsplit(domain, ", ", &sysconfig->ndomains); + if (sysconfig->domains == NULL) { + status = ARES_ENOMEM; + goto done; + } + } + } + +done: + ares_buf_destroy(buf); + + return status; +} +#endif + #if defined(CARES_USE_LIBRESOLV) static ares_status_t ares_init_sysconfig_libresolv(const ares_channel_t *channel, @@ -516,8 +604,10 @@ ares_status_t ares_init_by_sysconfig(ares_channel_t *channel) status = ares_init_sysconfig_macos(channel, &sysconfig); #elif defined(CARES_USE_LIBRESOLV) status = ares_init_sysconfig_libresolv(channel, &sysconfig); +#elif defined(__QNX__) + status = ares_init_sysconfig_qnx(channel, &sysconfig); #else - status = ares_init_sysconfig_files(channel, &sysconfig); + status = ares_init_sysconfig_files(channel, &sysconfig, ARES_TRUE); #endif if (status != ARES_SUCCESS) { diff --git a/deps/cares/src/lib/ares_sysconfig_files.c b/deps/cares/src/lib/ares_sysconfig_files.c index 49bc330d9d346d..a6c2a8e62bb34f 100644 --- a/deps/cares/src/lib/ares_sysconfig_files.c +++ b/deps/cares/src/lib/ares_sysconfig_files.c @@ -549,9 +549,9 @@ ares_status_t ares_init_by_environment(ares_sysconfig_t *sysconfig) /* This function will only return ARES_SUCCESS or ARES_ENOMEM. Any other * conditions are ignored. Users may mess up config files, but we want to * process anything we can. 
*/ -static ares_status_t parse_resolvconf_line(const ares_channel_t *channel, - ares_sysconfig_t *sysconfig, - ares_buf_t *line) +ares_status_t ares_sysconfig_parse_resolv_line(const ares_channel_t *channel, + ares_sysconfig_t *sysconfig, + ares_buf_t *line) { char option[32]; char value[512]; @@ -726,9 +726,38 @@ static ares_status_t parse_svcconf_line(const ares_channel_t *channel, return status; } -typedef ares_status_t (*line_callback_t)(const ares_channel_t *channel, - ares_sysconfig_t *sysconfig, - ares_buf_t *line); + +ares_status_t ares_sysconfig_process_buf(const ares_channel_t *channel, + ares_sysconfig_t *sysconfig, + ares_buf_t *buf, + ares_sysconfig_line_cb_t cb) +{ + ares_array_t *lines = NULL; + size_t num; + size_t i; + ares_status_t status; + + status = ares_buf_split(buf, (const unsigned char *)"\n", 1, + ARES_BUF_SPLIT_TRIM, 0, &lines); + if (status != ARES_SUCCESS) { + goto done; + } + + num = ares_array_len(lines); + for (i = 0; i < num; i++) { + ares_buf_t **bufptr = ares_array_at(lines, i); + ares_buf_t *line = *bufptr; + + status = cb(channel, sysconfig, line); + if (status != ARES_SUCCESS) { + goto done; + } + } + +done: + ares_array_destroy(lines); + return status; +} /* Should only return: * ARES_ENOTFOUND - file not found @@ -737,16 +766,13 @@ typedef ares_status_t (*line_callback_t)(const ares_channel_t *channel, * ARES_SUCCESS - file processed, doesn't necessarily mean it was a good * file, but we're not erroring out if we can't parse * something (or anything at all) */ -static ares_status_t process_config_lines(const ares_channel_t *channel, - const char *filename, - ares_sysconfig_t *sysconfig, - line_callback_t cb) +static ares_status_t process_config_lines(const ares_channel_t *channel, + const char *filename, + ares_sysconfig_t *sysconfig, + ares_sysconfig_line_cb_t cb) { ares_status_t status = ARES_SUCCESS; - ares_array_t *lines = NULL; ares_buf_t *buf = NULL; - size_t num; - size_t i; buf = ares_buf_create(); if (buf == NULL) { @@ -759,43 +785,30 @@ static ares_status_t process_config_lines(const ares_channel_t *channel, goto done; } - status = ares_buf_split(buf, (const unsigned char *)"\n", 1, - ARES_BUF_SPLIT_TRIM, 0, &lines); - if (status != ARES_SUCCESS) { - goto done; - } - - num = ares_array_len(lines); - for (i = 0; i < num; i++) { - ares_buf_t **bufptr = ares_array_at(lines, i); - ares_buf_t *line = *bufptr; - - status = cb(channel, sysconfig, line); - if (status != ARES_SUCCESS) { - goto done; - } - } + status = ares_sysconfig_process_buf(channel, sysconfig, buf, cb); done: ares_buf_destroy(buf); - ares_array_destroy(lines); return status; } ares_status_t ares_init_sysconfig_files(const ares_channel_t *channel, - ares_sysconfig_t *sysconfig) + ares_sysconfig_t *sysconfig, + ares_bool_t process_resolvconf) { ares_status_t status = ARES_SUCCESS; /* Resolv.conf */ - status = process_config_lines(channel, - (channel->resolvconf_path != NULL) - ? channel->resolvconf_path - : PATH_RESOLV_CONF, - sysconfig, parse_resolvconf_line); - if (status != ARES_SUCCESS && status != ARES_ENOTFOUND) { - goto done; + if (process_resolvconf) { + status = process_config_lines(channel, + (channel->resolvconf_path != NULL) + ? 
channel->resolvconf_path + : PATH_RESOLV_CONF, + sysconfig, ares_sysconfig_parse_resolv_line); + if (status != ARES_SUCCESS && status != ARES_ENOTFOUND) { + goto done; + } } /* Nsswitch.conf */ diff --git a/deps/cares/src/lib/event/ares_event_configchg.c b/deps/cares/src/lib/event/ares_event_configchg.c index e3e665bd165523..5ecc6888ab719f 100644 --- a/deps/cares/src/lib/event/ares_event_configchg.c +++ b/deps/cares/src/lib/event/ares_event_configchg.c @@ -558,14 +558,24 @@ static ares_status_t config_change_check(ares_htable_strvp_t *filestat, const char *resolvconf_path) { size_t i; - const char *configfiles[5]; + const char *configfiles[16]; ares_bool_t changed = ARES_FALSE; + size_t cnt = 0; - configfiles[0] = resolvconf_path; - configfiles[1] = "/etc/nsswitch.conf"; - configfiles[2] = "/etc/netsvc.conf"; - configfiles[3] = "/etc/svc.conf"; - configfiles[4] = NULL; + memset(configfiles, 0, sizeof(configfiles)); + + configfiles[cnt++] = resolvconf_path; + configfiles[cnt++] = "/etc/nsswitch.conf"; +#ifdef _AIX + configfiles[cnt++] = "/etc/netsvc.conf"; +#endif +#ifdef __osf /* Tru64 */ + configfiles[cnt++] = "/etc/svc.conf"; +#endif +#ifdef __QNX__ + configfiles[cnt++] = "/etc/net.cfg"; +#endif + configfiles[cnt++] = NULL; for (i = 0; configfiles[i] != NULL; i++) { fileinfo_t *fi = ares_htable_strvp_get_direct(filestat, configfiles[i]); diff --git a/deps/cares/src/lib/include/ares_buf.h b/deps/cares/src/lib/include/ares_buf.h index 7836a313e066d1..10d29eaf83bd8e 100644 --- a/deps/cares/src/lib/include/ares_buf.h +++ b/deps/cares/src/lib/include/ares_buf.h @@ -219,6 +219,26 @@ CARES_EXTERN unsigned char *ares_buf_finish_bin(ares_buf_t *buf, size_t *len); */ CARES_EXTERN char *ares_buf_finish_str(ares_buf_t *buf, size_t *len); +/*! Replace the given search byte sequence with the replacement byte sequence. + * This is only valid for allocated buffers, not const buffers. Will replace + * all byte sequences starting at the current offset to the end of the buffer. + * + * \param[in] buf Initialized buffer object. Can not be a "const" buffer. + * \param[in] srch Search byte sequence, must not be NULL. + * \param[in] srch_size Size of byte sequence, must not be zero. + * \param[in] rplc Byte sequence to use as replacement. May be NULL if + * rplc_size is zero. + * \param[in] rplc_size Size of replacement byte sequence, may be 0. + * \return ARES_SUCCESS on success, otherwise on may return failure only on + * memory allocation failure or misuse. Will not return indication + * if any replacements occurred + */ +CARES_EXTERN ares_status_t ares_buf_replace(ares_buf_t *buf, + const unsigned char *srch, + size_t srch_size, + const unsigned char *rplc, + size_t rplc_size); + /*! Tag a position to save in the buffer in case parsing needs to rollback, * such as if insufficient data is available, but more data may be added in * the future. Only a single tag can be set per buffer object. Setting a diff --git a/deps/cares/src/lib/include/ares_str.h b/deps/cares/src/lib/include/ares_str.h index ea75b3b3e7441d..4ee339510bf026 100644 --- a/deps/cares/src/lib/include/ares_str.h +++ b/deps/cares/src/lib/include/ares_str.h @@ -29,6 +29,20 @@ CARES_EXTERN char *ares_strdup(const char *s1); +/*! Scan up to maxlen bytes for the first NULL character and return + * its index, or maxlen if not found. The function only returns + * maxlen if the first maxlen bytes were not NULL characters; it + * makes no guarantee for what \c str[maxlen] (if defined) is, and + * does not access it. 
It is behaving like the POSIX \c strnlen() + * function, except that it returns 0 if the \p str pointer is \c + * NULL. + * + * \param[in] str The string to scan for the NULL character + * \param[in] maxlen The maximum number of bytes to scan + * \return Index of first NULL byte. Between 0 and maxlen (inclusive). + */ +CARES_EXTERN size_t ares_strnlen(const char *str, size_t maxlen); + CARES_EXTERN size_t ares_strlen(const char *str); /*! Copy string from source to destination with destination buffer size diff --git a/deps/cares/src/lib/record/ares_dns_multistring.c b/deps/cares/src/lib/record/ares_dns_multistring.c index 57c0d1c0a803ec..44fcaccd65bb6a 100644 --- a/deps/cares/src/lib/record/ares_dns_multistring.c +++ b/deps/cares/src/lib/record/ares_dns_multistring.c @@ -146,6 +146,18 @@ ares_status_t ares_dns_multistring_add_own(ares_dns_multistring_t *strs, return status; } + /* Issue #921, ares_dns_multistring_get() doesn't have a way to indicate + * success or fail on a zero-length string which is actually valid. So we + * are going to allocate a 1-byte buffer to use as a placeholder in this + * case */ + if (str == NULL) { + str = ares_malloc_zero(1); + if (str == NULL) { + ares_array_remove_last(strs->strs); + return ARES_ENOMEM; + } + } + data->data = str; data->len = len; @@ -252,36 +264,38 @@ ares_status_t ares_dns_multistring_parse_buf(ares_buf_t *buf, break; /* LCOV_EXCL_LINE: DefensiveCoding */ } - if (len) { - /* When used by the _str() parser, it really needs to be validated to - * be a valid printable ascii string. Do that here */ - if (validate_printable && ares_buf_len(buf) >= len) { - size_t mylen; - const char *data = (const char *)ares_buf_peek(buf, &mylen); - if (!ares_str_isprint(data, len)) { - status = ARES_EBADSTR; - break; - } + + /* When used by the _str() parser, it really needs to be validated to + * be a valid printable ascii string. 
Do that here */ + if (len && validate_printable && ares_buf_len(buf) >= len) { + size_t mylen; + const char *data = (const char *)ares_buf_peek(buf, &mylen); + if (!ares_str_isprint(data, len)) { + status = ARES_EBADSTR; + break; } + } - if (strs != NULL) { - unsigned char *data = NULL; + if (strs != NULL) { + unsigned char *data = NULL; + if (len) { status = ares_buf_fetch_bytes_dup(buf, len, ARES_TRUE, &data); if (status != ARES_SUCCESS) { break; } - status = ares_dns_multistring_add_own(*strs, data, len); - if (status != ARES_SUCCESS) { - ares_free(data); - break; - } - } else { - status = ares_buf_consume(buf, len); - if (status != ARES_SUCCESS) { - break; - } + } + status = ares_dns_multistring_add_own(*strs, data, len); + if (status != ARES_SUCCESS) { + ares_free(data); + break; + } + } else { + status = ares_buf_consume(buf, len); + if (status != ARES_SUCCESS) { + break; } } + } if (status != ARES_SUCCESS && strs != NULL) { diff --git a/deps/cares/src/lib/str/ares_buf.c b/deps/cares/src/lib/str/ares_buf.c index 69e6b38aac849e..63acc6cf7714d3 100644 --- a/deps/cares/src/lib/str/ares_buf.c +++ b/deps/cares/src/lib/str/ares_buf.c @@ -1104,6 +1104,72 @@ const unsigned char *ares_buf_peek(const ares_buf_t *buf, size_t *len) return ares_buf_fetch(buf, len); } +ares_status_t ares_buf_replace(ares_buf_t *buf, const unsigned char *srch, + size_t srch_size, const unsigned char *rplc, + size_t rplc_size) +{ + size_t processed_len = 0; + ares_status_t status; + + if (buf->alloc_buf == NULL || srch == NULL || srch_size == 0 || + (rplc == NULL && rplc_size != 0)) { + return ARES_EFORMERR; + } + + while (1) { + unsigned char *ptr = buf->alloc_buf + buf->offset + processed_len; + size_t remaining_len = buf->data_len - buf->offset - processed_len; + size_t found_offset = 0; + size_t move_data_len; + + /* Find pattern */ + ptr = ares_memmem(ptr, remaining_len, srch, srch_size); + if (ptr == NULL) { + break; + } + + /* Store the offset this was found because our actual pointer might be + * switched out from under us by the call to ensure_space() if the + * replacement pattern is larger than the search pattern */ + found_offset = (size_t)(ptr - (size_t)(buf->alloc_buf + buf->offset)); + if (rplc_size > srch_size) { + status = ares_buf_ensure_space(buf, rplc_size - srch_size); + if (status != ARES_SUCCESS) { + return status; + } + } + + /* Impossible, but silence clang */ + if (buf->alloc_buf == NULL) { + return ARES_ENOMEM; + } + + /* Recalculate actual pointer */ + ptr = buf->alloc_buf + buf->offset + found_offset; + + /* Move the data */ + move_data_len = buf->data_len - buf->offset - found_offset - srch_size; + memmove(ptr + rplc_size, + ptr + srch_size, + move_data_len); + + /* Copy in the replacement data */ + if (rplc != NULL && rplc_size > 0) { + memcpy(ptr, rplc, rplc_size); + } + + if (rplc_size > srch_size) { + buf->data_len += rplc_size - srch_size; + } else { + buf->data_len -= srch_size - rplc_size; + } + + processed_len = found_offset + rplc_size; + } + + return ARES_SUCCESS; +} + ares_status_t ares_buf_peek_byte(const ares_buf_t *buf, unsigned char *b) { size_t remaining_len = 0; diff --git a/deps/cares/src/lib/str/ares_str.c b/deps/cares/src/lib/str/ares_str.c index f6bfabf11f4467..0eda1ab9f15783 100644 --- a/deps/cares/src/lib/str/ares_str.c +++ b/deps/cares/src/lib/str/ares_str.c @@ -32,6 +32,23 @@ # include #endif +size_t ares_strnlen(const char *str, size_t maxlen) { + const char *p = NULL; + if (str == NULL) { + return 0; + } +#ifdef HAVE_STRNLEN + (void)p; + return strnlen(str, 
maxlen); +#else + if ((p = memchr(str, 0, maxlen)) == NULL) { + return maxlen; + } else { + return (size_t)(p - str); + } +#endif /* HAVE_STRNLEN */ +} + size_t ares_strlen(const char *str) { if (str == NULL) { diff --git a/deps/cares/src/tools/Makefile.in b/deps/cares/src/tools/Makefile.in index 9a96a74fa6957d..19e99a253378c7 100644 --- a/deps/cares/src/tools/Makefile.in +++ b/deps/cares/src/tools/Makefile.in @@ -91,7 +91,9 @@ host_triplet = @host@ noinst_PROGRAMS = $(am__EXEEXT_1) subdir = src/tools ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 -am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ +am__aclocal_m4_deps = $(top_srcdir)/m4/ares_check_user_namespace.m4 \ + $(top_srcdir)/m4/ares_check_uts_namespace.m4 \ + $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_ac_print_to_file.m4 \ $(top_srcdir)/m4/ax_add_am_macro_static.m4 \ $(top_srcdir)/m4/ax_am_macros_static.m4 \ @@ -101,8 +103,6 @@ am__aclocal_m4_deps = $(top_srcdir)/m4/ax_ac_append_to_file.m4 \ $(top_srcdir)/m4/ax_check_compile_flag.m4 \ $(top_srcdir)/m4/ax_check_gnu_make.m4 \ $(top_srcdir)/m4/ax_check_link_flag.m4 \ - $(top_srcdir)/m4/ax_check_user_namespace.m4 \ - $(top_srcdir)/m4/ax_check_uts_namespace.m4 \ $(top_srcdir)/m4/ax_code_coverage.m4 \ $(top_srcdir)/m4/ax_compiler_vendor.m4 \ $(top_srcdir)/m4/ax_cxx_compile_stdcxx.m4 \ diff --git a/deps/ngtcp2/nghttp3/lib/includes/nghttp3/nghttp3.h b/deps/ngtcp2/nghttp3/lib/includes/nghttp3/nghttp3.h index 77eb1fbf263815..6cf947487bf899 100644 --- a/deps/ngtcp2/nghttp3/lib/includes/nghttp3/nghttp3.h +++ b/deps/ngtcp2/nghttp3/lib/includes/nghttp3/nghttp3.h @@ -31,11 +31,11 @@ libcurl) */ #if (defined(_WIN32) || defined(__WIN32__)) && !defined(WIN32) # define WIN32 -#endif +#endif /* (defined(_WIN32) || defined(__WIN32__)) && !defined(WIN32) */ #ifdef __cplusplus extern "C" { -#endif +#endif /* defined(__cplusplus) */ #include #if defined(_MSC_VER) && (_MSC_VER < 1800) @@ -43,9 +43,9 @@ extern "C" { compliant. See compiler macros and version number in https://sourceforge.net/p/predef/wiki/Compilers/ */ # include -#else /* !defined(_MSC_VER) || (_MSC_VER >= 1800) */ +#else /* !(defined(_MSC_VER) && (_MSC_VER < 1800)) */ # include -#endif /* !defined(_MSC_VER) || (_MSC_VER >= 1800) */ +#endif /* !(defined(_MSC_VER) && (_MSC_VER < 1800)) */ #include #include #include @@ -57,22 +57,22 @@ extern "C" { #elif defined(WIN32) # ifdef BUILDING_NGHTTP3 # define NGHTTP3_EXTERN __declspec(dllexport) -# else /* !BUILDING_NGHTTP3 */ +# else /* !defined(BUILDING_NGHTTP3) */ # define NGHTTP3_EXTERN __declspec(dllimport) -# endif /* !BUILDING_NGHTTP3 */ -#else /* !defined(WIN32) */ +# endif /* !defined(BUILDING_NGHTTP3) */ +#else /* !(defined(NGHTTP3_STATICLIB) || defined(WIN32)) */ # ifdef BUILDING_NGHTTP3 # define NGHTTP3_EXTERN __attribute__((visibility("default"))) -# else /* !BUILDING_NGHTTP3 */ +# else /* !defined(BUILDING_NGHTTP3) */ # define NGHTTP3_EXTERN -# endif /* !BUILDING_NGHTTP3 */ -#endif /* !defined(WIN32) */ +# endif /* !defined(BUILDING_NGHTTP3) */ +#endif /* !(defined(NGHTTP3_STATICLIB) || defined(WIN32)) */ #ifdef _MSC_VER # define NGHTTP3_ALIGN(N) __declspec(align(N)) -#else /* !_MSC_VER */ +#else /* !defined(_MSC_VER) */ # define NGHTTP3_ALIGN(N) __attribute__((aligned(N))) -#endif /* !_MSC_VER */ +#endif /* !defined(_MSC_VER) */ /** * @typedef @@ -624,7 +624,7 @@ typedef struct nghttp3_buf { */ uint8_t *end; /** - * :member:`pos` pointers to the start of data. Typically, this + * :member:`pos` points to the start of data. 
Typically, this * points to the address that next data should be read. Initially, * it points to :member:`begin`. */ @@ -1161,8 +1161,8 @@ NGHTTP3_EXTERN void nghttp3_qpack_encoder_del(nghttp3_qpack_encoder *encoder); * anymore. */ NGHTTP3_EXTERN int nghttp3_qpack_encoder_encode( - nghttp3_qpack_encoder *encoder, nghttp3_buf *pbuf, nghttp3_buf *rbuf, - nghttp3_buf *ebuf, int64_t stream_id, const nghttp3_nv *nva, size_t nvlen); + nghttp3_qpack_encoder *encoder, nghttp3_buf *pbuf, nghttp3_buf *rbuf, + nghttp3_buf *ebuf, int64_t stream_id, const nghttp3_nv *nva, size_t nvlen); /** * @function @@ -1182,7 +1182,7 @@ NGHTTP3_EXTERN int nghttp3_qpack_encoder_encode( * |encoder| is unable to process input because it is malformed. */ NGHTTP3_EXTERN nghttp3_ssize nghttp3_qpack_encoder_read_decoder( - nghttp3_qpack_encoder *encoder, const uint8_t *src, size_t srclen); + nghttp3_qpack_encoder *encoder, const uint8_t *src, size_t srclen); /** * @function @@ -1343,7 +1343,7 @@ NGHTTP3_EXTERN void nghttp3_qpack_decoder_del(nghttp3_qpack_decoder *decoder); * Could not interpret encoder stream instruction. */ NGHTTP3_EXTERN nghttp3_ssize nghttp3_qpack_decoder_read_encoder( - nghttp3_qpack_decoder *decoder, const uint8_t *src, size_t srclen); + nghttp3_qpack_decoder *decoder, const uint8_t *src, size_t srclen); /** * @function @@ -1436,9 +1436,9 @@ nghttp3_qpack_decoder_get_icnt(const nghttp3_qpack_decoder *decoder); * HTTP field is too large. */ NGHTTP3_EXTERN nghttp3_ssize nghttp3_qpack_decoder_read_request( - nghttp3_qpack_decoder *decoder, nghttp3_qpack_stream_context *sctx, - nghttp3_qpack_nv *nv, uint8_t *pflags, const uint8_t *src, size_t srclen, - int fin); + nghttp3_qpack_decoder *decoder, nghttp3_qpack_stream_context *sctx, + nghttp3_qpack_nv *nv, uint8_t *pflags, const uint8_t *src, size_t srclen, + int fin); /** * @function @@ -1568,7 +1568,7 @@ typedef void (*nghttp3_debug_vprintf_callback)(const char *format, * times because this is important. */ NGHTTP3_EXTERN void nghttp3_set_debug_vprintf_callback( - nghttp3_debug_vprintf_callback debug_vprintf_callback); + nghttp3_debug_vprintf_callback debug_vprintf_callback); /** * @macrosection @@ -2118,9 +2118,10 @@ NGHTTP3_EXTERN int nghttp3_conn_bind_qpack_streams(nghttp3_conn *conn, * :macro:`NGHTTP3_ERR_CALLBACK_FAILURE` * User callback failed. * - * It may return the other error codes. In general, the negative - * error code means that |conn| encountered a connection error, and - * the connection should be closed. + * It may return the other error codes. The negative error code means + * that |conn| encountered a connection error, and the connection must + * be closed. Calling nghttp3 API other than `nghttp3_conn_del` + * causes undefined behavior. */ NGHTTP3_EXTERN nghttp3_ssize nghttp3_conn_read_stream(nghttp3_conn *conn, int64_t stream_id, @@ -2152,9 +2153,10 @@ NGHTTP3_EXTERN nghttp3_ssize nghttp3_conn_read_stream(nghttp3_conn *conn, * :macro:`NGHTTP3_ERR_CALLBACK_FAILURE` * User callback failed. * - * It may return the other error codes. In general, the negative - * error code means that |conn| encountered a connection error, and - * the connection should be closed. + * It may return the other error codes. The negative error code means + * that |conn| encountered a connection error, and the connection must + * be closed. Calling nghttp3 API other than `nghttp3_conn_del` + * causes undefined behavior. 
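
A minimal sketch of how a caller might honor the contract stated just above for nghttp3_conn_writev_stream(); quic_conn and close_quic_connection() are hypothetical application-side stand-ins for the underlying QUIC connection and its teardown path:

    #include <nghttp3/nghttp3.h>

    /* Hypothetical application helper: tears down the QUIC connection. */
    extern void close_quic_connection(void *quic_conn, int nghttp3_liberr);

    static void write_step(nghttp3_conn *conn, void *quic_conn) {
      nghttp3_vec vec[16];
      int64_t stream_id = -1;
      int fin = 0;
      nghttp3_ssize n = nghttp3_conn_writev_stream(
          conn, &stream_id, &fin, vec, sizeof(vec) / sizeof(vec[0]));

      if (n < 0) {
        /* Connection error: close QUIC, then free the nghttp3 connection.
         * As documented above, no other nghttp3 call is valid afterwards. */
        close_quic_connection(quic_conn, (int)n);
        nghttp3_conn_del(conn);
        return;
      }
      /* Otherwise the first n entries of vec are ready to be handed to the
       * QUIC stack, followed by nghttp3_conn_add_write_offset() for the
       * bytes actually written. */
    }
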
*/ NGHTTP3_EXTERN nghttp3_ssize nghttp3_conn_writev_stream(nghttp3_conn *conn, int64_t *pstream_id, @@ -2198,6 +2200,9 @@ NGHTTP3_EXTERN int nghttp3_conn_add_write_offset(nghttp3_conn *conn, * If a stream denoted by |stream_id| is not found, this function * returns 0. * + * Alternatively, `nghttp3_conn_update_ack_offset` can be used to + * accomplish the same thing. + * * This function returns 0 if it succeeds, or one of the following * negative error codes: * @@ -2207,6 +2212,31 @@ NGHTTP3_EXTERN int nghttp3_conn_add_write_offset(nghttp3_conn *conn, NGHTTP3_EXTERN int nghttp3_conn_add_ack_offset(nghttp3_conn *conn, int64_t stream_id, uint64_t n); +/** + * @function + * + * `nghttp3_conn_update_ack_offset` tells |conn| that QUIC stack has + * acknowledged the stream data up to |offset| for a stream denoted by + * |stream_id|. + * + * If a stream denoted by |stream_id| is not found, this function + * returns 0. + * + * Alternatively, `nghttp3_conn_add_ack_offset` can be used to + * accomplish the same thing. + * + * This function returns 0 if it succeeds, or one of the following + * negative error codes: + * + * :macro:`NGHTTP3_ERR_INVALID_ARGUMENT` + * |offset| is less than the number of bytes acknowledged so far. + * :macro:`NGHTTP3_ERR_CALLBACK_FAILURE` + * User callback failed. + */ +NGHTTP3_EXTERN int nghttp3_conn_update_ack_offset(nghttp3_conn *conn, + int64_t stream_id, + uint64_t offset); + /** * @function * @@ -2314,9 +2344,9 @@ NGHTTP3_EXTERN int nghttp3_conn_resume_stream(nghttp3_conn *conn, /** * @function * - * `nghttp3_conn_close_stream` closes stream identified by - * |stream_id|. QUIC application error code |app_error_code| is the - * reason of the closure. + * `nghttp3_conn_close_stream` tells the library that a stream + * identified by |stream_id| has been closed. QUIC application error + * code |app_error_code| is the reason of the closure. * * This function returns 0 if it succeeds, or one of the following * negative error codes: @@ -2419,8 +2449,8 @@ nghttp3_conn_set_max_concurrent_streams(nghttp3_conn *conn, * stream. */ typedef nghttp3_ssize (*nghttp3_read_data_callback)( - nghttp3_conn *conn, int64_t stream_id, nghttp3_vec *vec, size_t veccnt, - uint32_t *pflags, void *conn_user_data, void *stream_user_data); + nghttp3_conn *conn, int64_t stream_id, nghttp3_vec *vec, size_t veccnt, + uint32_t *pflags, void *conn_user_data, void *stream_user_data); /** * @struct @@ -2460,8 +2490,8 @@ typedef struct nghttp3_data_reader { * Out of memory. */ NGHTTP3_EXTERN int nghttp3_conn_submit_request( - nghttp3_conn *conn, int64_t stream_id, const nghttp3_nv *nva, size_t nvlen, - const nghttp3_data_reader *dr, void *stream_user_data); + nghttp3_conn *conn, int64_t stream_id, const nghttp3_nv *nva, size_t nvlen, + const nghttp3_data_reader *dr, void *stream_user_data); /** * @function @@ -2667,7 +2697,7 @@ typedef struct NGHTTP3_ALIGN(8) nghttp3_pri { * Stream not found. */ NGHTTP3_EXTERN int nghttp3_conn_get_stream_priority_versioned( - nghttp3_conn *conn, int pri_version, nghttp3_pri *dest, int64_t stream_id); + nghttp3_conn *conn, int pri_version, nghttp3_pri *dest, int64_t stream_id); /** * @function @@ -2720,8 +2750,8 @@ NGHTTP3_EXTERN int nghttp3_conn_set_client_stream_priority(nghttp3_conn *conn, * Out of memory. 
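
Returning to the acknowledgement APIs declared a little earlier in this header, a small sketch contrasting the new absolute-offset form with the existing delta form; prev_acked and acked are assumed to be the cumulative acknowledged byte counts reported by the application's QUIC stack:

    #include <nghttp3/nghttp3.h>

    static int report_ack(nghttp3_conn *conn, int64_t stream_id,
                          uint64_t prev_acked, uint64_t acked, int use_delta) {
      if (use_delta) {
        /* Delta form: feed only the newly acknowledged bytes. */
        return nghttp3_conn_add_ack_offset(conn, stream_id, acked - prev_acked);
      }
      /* Absolute form: pass the new total; returns
       * NGHTTP3_ERR_INVALID_ARGUMENT if acked goes backwards. */
      return nghttp3_conn_update_ack_offset(conn, stream_id, acked);
    }
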
*/ NGHTTP3_EXTERN int nghttp3_conn_set_server_stream_priority_versioned( - nghttp3_conn *conn, int64_t stream_id, int pri_version, - const nghttp3_pri *pri); + nghttp3_conn *conn, int64_t stream_id, int pri_version, + const nghttp3_pri *pri); /** * @function @@ -2884,7 +2914,7 @@ NGHTTP3_EXTERN int nghttp3_err_is_fatal(int liberr); */ #define nghttp3_conn_set_server_stream_priority(CONN, STREAM_ID, PRI) \ nghttp3_conn_set_server_stream_priority_versioned( \ - (CONN), (STREAM_ID), NGHTTP3_PRI_VERSION, (PRI)) + (CONN), (STREAM_ID), NGHTTP3_PRI_VERSION, (PRI)) /* * `nghttp3_conn_get_stream_priority` is a wrapper around @@ -2906,6 +2936,6 @@ NGHTTP3_EXTERN int nghttp3_err_is_fatal(int liberr); #ifdef __cplusplus } -#endif +#endif /* defined(__cplusplus) */ -#endif /* NGHTTP3_H */ +#endif /* !defined(NGHTTP3_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/includes/nghttp3/version.h b/deps/ngtcp2/nghttp3/lib/includes/nghttp3/version.h index bc57eb2cfcf2d6..7f6cb8acffe672 100644 --- a/deps/ngtcp2/nghttp3/lib/includes/nghttp3/version.h +++ b/deps/ngtcp2/nghttp3/lib/includes/nghttp3/version.h @@ -31,7 +31,7 @@ * * Version number of the nghttp3 library release. */ -#define NGHTTP3_VERSION "0.7.0" +#define NGHTTP3_VERSION "1.6.0" /** * @macro @@ -41,6 +41,6 @@ * number, 8 bits for minor and 8 bits for patch. Version 1.2.3 * becomes 0x010203. */ -#define NGHTTP3_VERSION_NUM 0x000700 +#define NGHTTP3_VERSION_NUM 0x010600 -#endif /* NGHTTP3_VERSION_H */ +#endif /* !defined(NGHTTP3_VERSION_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_balloc.c b/deps/ngtcp2/nghttp3/lib/nghttp3_balloc.c index e134d0f4dceb75..544e4fb1306a5f 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_balloc.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_balloc.c @@ -66,8 +66,8 @@ int nghttp3_balloc_get(nghttp3_balloc *balloc, void **pbuf, size_t n) { assert(n <= balloc->blklen); if (nghttp3_buf_left(&balloc->buf) < n) { - p = nghttp3_mem_malloc(balloc->mem, sizeof(nghttp3_memblock_hd) + 0x10u + - balloc->blklen); + p = nghttp3_mem_malloc(balloc->mem, + sizeof(nghttp3_memblock_hd) + 0x8u + balloc->blklen); if (p == NULL) { return NGHTTP3_ERR_NOMEM; } @@ -76,10 +76,10 @@ int nghttp3_balloc_get(nghttp3_balloc *balloc, void **pbuf, size_t n) { hd->next = balloc->head; balloc->head = hd; nghttp3_buf_wrap_init( - &balloc->buf, - (uint8_t *)(((uintptr_t)p + sizeof(nghttp3_memblock_hd) + 0xfu) & - ~(uintptr_t)0xfu), - balloc->blklen); + &balloc->buf, + (uint8_t *)(((uintptr_t)p + sizeof(nghttp3_memblock_hd) + 0xfu) & + ~(uintptr_t)0xfu), + balloc->blklen); } assert(((uintptr_t)balloc->buf.last & 0xfu) == 0); diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_balloc.h b/deps/ngtcp2/nghttp3/lib/nghttp3_balloc.h index e02f61d16b5763..c95f0426a924bf 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_balloc.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_balloc.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -40,7 +40,10 @@ typedef struct nghttp3_memblock_hd nghttp3_memblock_hd; * nghttp3_memblock_hd is the header of memory block. 
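
A short aside on the arithmetic behind the nghttp3_balloc.c hunk above, assuming the union added to the struct just below pads the header to 8 bytes and that the allocator returns at least 8-byte-aligned memory:

    /* Sketch: with sizeof(nghttp3_memblock_hd) == 8,
     *
     *   aligned = ((uintptr_t)p + sizeof(nghttp3_memblock_hd) + 0xfu)
     *             & ~(uintptr_t)0xfu
     *
     *   p % 16 == 0  ->  aligned == p + 16  ->  16 + blklen bytes consumed
     *   p % 16 == 8  ->  aligned == p +  8  ->   8 + blklen bytes consumed
     *
     * so the worst case is sizeof(nghttp3_memblock_hd) + 0x8u + blklen bytes,
     * which is exactly what nghttp3_balloc_get() now requests. */
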
*/ struct nghttp3_memblock_hd { - nghttp3_memblock_hd *next; + union { + nghttp3_memblock_hd *next; + uint64_t pad; + }; }; /* @@ -89,4 +92,4 @@ int nghttp3_balloc_get(nghttp3_balloc *balloc, void **pbuf, size_t n); */ void nghttp3_balloc_clear(nghttp3_balloc *balloc); -#endif /* NGHTTP3_BALLOC_H */ +#endif /* !defined(NGHTTP3_BALLOC_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_buf.h b/deps/ngtcp2/nghttp3/lib/nghttp3_buf.h index 472a4b7b14a80e..9fa067de91b949 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_buf.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_buf.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -71,4 +71,4 @@ void nghttp3_typed_buf_init(nghttp3_typed_buf *tbuf, const nghttp3_buf *buf, void nghttp3_typed_buf_free(nghttp3_typed_buf *tbuf); -#endif /* NGHTTP3_BUF_H */ +#endif /* !defined(NGHTTP3_BUF_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_conn.c b/deps/ngtcp2/nghttp3/lib/nghttp3_conn.c index 25aaf685734cb1..f70b4f5472de64 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_conn.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_conn.c @@ -210,9 +210,9 @@ static int conn_call_recv_settings(nghttp3_conn *conn) { static int ricnt_less(const nghttp3_pq_entry *lhsx, const nghttp3_pq_entry *rhsx) { nghttp3_stream *lhs = - nghttp3_struct_of(lhsx, nghttp3_stream, qpack_blocked_pe); + nghttp3_struct_of(lhsx, nghttp3_stream, qpack_blocked_pe); nghttp3_stream *rhs = - nghttp3_struct_of(rhsx, nghttp3_stream, qpack_blocked_pe); + nghttp3_struct_of(rhsx, nghttp3_stream, qpack_blocked_pe); return lhs->qpack_sctx.ricnt < rhs->qpack_sctx.ricnt; } @@ -250,19 +250,19 @@ static int conn_new(nghttp3_conn **pconn, int server, int callbacks_version, nghttp3_objalloc_init(&conn->out_chunk_objalloc, NGHTTP3_STREAM_MIN_CHUNK_SIZE * 16, mem); - nghttp3_objalloc_stream_init(&conn->stream_objalloc, 64, mem); + nghttp3_objalloc_stream_init(&conn->stream_objalloc, 8, mem); nghttp3_map_init(&conn->streams, mem); - rv = nghttp3_qpack_decoder_init(&conn->qdec, - settings->qpack_max_dtable_capacity, - settings->qpack_blocked_streams, mem); + rv = + nghttp3_qpack_decoder_init(&conn->qdec, settings->qpack_max_dtable_capacity, + settings->qpack_blocked_streams, mem); if (rv != 0) { goto qdec_init_fail; } rv = nghttp3_qpack_encoder_init( - &conn->qenc, settings->qpack_encoder_max_dtable_capacity, mem); + &conn->qenc, settings->qpack_encoder_max_dtable_capacity, mem); if (rv != 0) { goto qenc_init_fail; } @@ -273,7 +273,7 @@ static int conn_new(nghttp3_conn **pconn, int server, int callbacks_version, nghttp3_pq_init(&conn->sched[i].spq, cycle_less, mem); } - nghttp3_idtr_init(&conn->remote.bidi.idtr, server, mem); + nghttp3_idtr_init(&conn->remote.bidi.idtr, mem); conn->callbacks = *callbacks; conn->local.settings = *settings; @@ -368,7 +368,7 @@ void nghttp3_conn_del(nghttp3_conn *conn) { nghttp3_qpack_encoder_free(&conn->qenc); nghttp3_qpack_decoder_free(&conn->qdec); - nghttp3_map_each_free(&conn->streams, free_stream, NULL); + nghttp3_map_each(&conn->streams, free_stream, NULL); nghttp3_map_free(&conn->streams); nghttp3_objalloc_free(&conn->stream_objalloc); @@ -419,7 +419,7 @@ nghttp3_ssize nghttp3_conn_read_stream(nghttp3_conn *conn, int64_t stream_id, } conn->rx.max_stream_id_bidi = - nghttp3_max(conn->rx.max_stream_id_bidi, stream_id); + nghttp3_max_int64(conn->rx.max_stream_id_bidi, stream_id); rv = nghttp3_conn_create_stream(conn, &stream, stream_id); if (rv != 0) { return rv; @@ -498,7 +498,7 @@ static nghttp3_ssize 
conn_read_type(nghttp3_conn *conn, nghttp3_stream *stream, assert(srclen); - nread = nghttp3_read_varint(rvint, src, srclen, fin); + nread = nghttp3_read_varint(rvint, src, src + srclen, fin); if (nread < 0) { return NGHTTP3_ERR_H3_GENERAL_PROTOCOL_ERROR; } @@ -650,7 +650,7 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, switch (rstate->state) { case NGHTTP3_CTRL_STREAM_STATE_FRAME_TYPE: assert(end - p > 0); - nread = nghttp3_read_varint(rvint, p, (size_t)(end - p), /* fin = */ 0); + nread = nghttp3_read_varint(rvint, p, end, /* fin = */ 0); if (nread < 0) { return NGHTTP3_ERR_H3_GENERAL_PROTOCOL_ERROR; } @@ -670,7 +670,7 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, /* Fall through */ case NGHTTP3_CTRL_STREAM_STATE_FRAME_LENGTH: assert(end - p > 0); - nread = nghttp3_read_varint(rvint, p, (size_t)(end - p), /* fin = */ 0); + nread = nghttp3_read_varint(rvint, p, end, /* fin = */ 0); if (nread < 0) { return NGHTTP3_ERR_H3_FRAME_ERROR; } @@ -767,9 +767,9 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, } /* Read Identifier */ - len = (size_t)nghttp3_min(rstate->left, (int64_t)(end - p)); + len = (size_t)nghttp3_min_int64(rstate->left, (int64_t)(end - p)); assert(len > 0); - nread = nghttp3_read_varint(rvint, p, len, frame_fin(rstate, len)); + nread = nghttp3_read_varint(rvint, p, p + len, frame_fin(rstate, len)); if (nread < 0) { return NGHTTP3_ERR_H3_FRAME_ERROR; } @@ -795,7 +795,7 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, break; } - nread = nghttp3_read_varint(rvint, p, len, frame_fin(rstate, len)); + nread = nghttp3_read_varint(rvint, p, p + len, frame_fin(rstate, len)); if (nread < 0) { return NGHTTP3_ERR_H3_FRAME_ERROR; } @@ -811,16 +811,16 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, nghttp3_varint_read_state_reset(rvint); rv = - nghttp3_conn_on_settings_entry_received(conn, &rstate->fr.settings); + nghttp3_conn_on_settings_entry_received(conn, &rstate->fr.settings); if (rv != 0) { return rv; } } break; case NGHTTP3_CTRL_STREAM_STATE_SETTINGS_ID: - len = (size_t)nghttp3_min(rstate->left, (int64_t)(end - p)); + len = (size_t)nghttp3_min_int64(rstate->left, (int64_t)(end - p)); assert(len > 0); - nread = nghttp3_read_varint(rvint, p, len, frame_fin(rstate, len)); + nread = nghttp3_read_varint(rvint, p, p + len, frame_fin(rstate, len)); if (nread < 0) { return NGHTTP3_ERR_H3_FRAME_ERROR; } @@ -845,9 +845,9 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, } /* Fall through */ case NGHTTP3_CTRL_STREAM_STATE_SETTINGS_VALUE: - len = (size_t)nghttp3_min(rstate->left, (int64_t)(end - p)); + len = (size_t)nghttp3_min_int64(rstate->left, (int64_t)(end - p)); assert(len > 0); - nread = nghttp3_read_varint(rvint, p, len, frame_fin(rstate, len)); + nread = nghttp3_read_varint(rvint, p, p + len, frame_fin(rstate, len)); if (nread < 0) { return NGHTTP3_ERR_H3_FRAME_ERROR; } @@ -879,9 +879,9 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, nghttp3_stream_read_state_reset(rstate); break; case NGHTTP3_CTRL_STREAM_STATE_GOAWAY: - len = (size_t)nghttp3_min(rstate->left, (int64_t)(end - p)); + len = (size_t)nghttp3_min_int64(rstate->left, (int64_t)(end - p)); assert(len > 0); - nread = nghttp3_read_varint(rvint, p, len, frame_fin(rstate, len)); + nread = nghttp3_read_varint(rvint, p, p + len, frame_fin(rstate, len)); if (nread < 0) { return NGHTTP3_ERR_H3_FRAME_ERROR; } @@ -906,7 +906,7 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, if (conn->callbacks.shutdown) { rv = - 
conn->callbacks.shutdown(conn, conn->rx.goaway_id, conn->user_data); + conn->callbacks.shutdown(conn, conn->rx.goaway_id, conn->user_data); if (rv != 0) { return NGHTTP3_ERR_CALLBACK_FAILURE; } @@ -916,9 +916,9 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, break; case NGHTTP3_CTRL_STREAM_STATE_MAX_PUSH_ID: /* server side only */ - len = (size_t)nghttp3_min(rstate->left, (int64_t)(end - p)); + len = (size_t)nghttp3_min_int64(rstate->left, (int64_t)(end - p)); assert(len > 0); - nread = nghttp3_read_varint(rvint, p, len, frame_fin(rstate, len)); + nread = nghttp3_read_varint(rvint, p, p + len, frame_fin(rstate, len)); if (nread < 0) { return NGHTTP3_ERR_H3_FRAME_ERROR; } @@ -941,9 +941,9 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, break; case NGHTTP3_CTRL_STREAM_STATE_PRIORITY_UPDATE_PRI_ELEM_ID: /* server side only */ - len = (size_t)nghttp3_min(rstate->left, (int64_t)(end - p)); + len = (size_t)nghttp3_min_int64(rstate->left, (int64_t)(end - p)); assert(len > 0); - nread = nghttp3_read_varint(rvint, p, len, frame_fin(rstate, len)); + nread = nghttp3_read_varint(rvint, p, p + len, frame_fin(rstate, len)); if (nread < 0) { return NGHTTP3_ERR_H3_FRAME_ERROR; } @@ -977,7 +977,7 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, case NGHTTP3_CTRL_STREAM_STATE_PRIORITY_UPDATE: /* We need to buffer Priority Field Value because it might be fragmented. */ - len = (size_t)nghttp3_min(rstate->left, (int64_t)(end - p)); + len = (size_t)nghttp3_min_int64(rstate->left, (int64_t)(end - p)); assert(len > 0); if (conn->rx.pri_fieldbuflen == 0 && rstate->left == (int64_t)len) { /* Everything is in the input buffer. Apply same length @@ -1032,7 +1032,7 @@ nghttp3_ssize nghttp3_conn_read_control(nghttp3_conn *conn, nghttp3_stream_read_state_reset(rstate); break; case NGHTTP3_CTRL_STREAM_STATE_IGN_FRAME: - len = (size_t)nghttp3_min(rstate->left, (int64_t)(end - p)); + len = (size_t)nghttp3_min_int64(rstate->left, (int64_t)(end - p)); p += len; nconsumed += len; rstate->left -= (int64_t)len; @@ -1076,7 +1076,7 @@ static int conn_delete_stream(nghttp3_conn *conn, nghttp3_stream *stream) { } rv = - nghttp3_map_remove(&conn->streams, (nghttp3_map_key_type)stream->node.id); + nghttp3_map_remove(&conn->streams, (nghttp3_map_key_type)stream->node.id); assert(0 == rv); @@ -1104,8 +1104,8 @@ static int conn_process_blocked_stream_data(nghttp3_conn *conn, buf = nghttp3_ringbuf_get(&stream->inq, 0); nconsumed = nghttp3_conn_read_bidi( - conn, &nproc, stream, buf->pos, nghttp3_buf_len(buf), - len == 1 && (stream->flags & NGHTTP3_STREAM_FLAG_READ_EOF)); + conn, &nproc, stream, buf->pos, nghttp3_buf_len(buf), + len == 1 && (stream->flags & NGHTTP3_STREAM_FLAG_READ_EOF)); if (nconsumed < 0) { return (int)nconsumed; } @@ -1144,7 +1144,7 @@ nghttp3_ssize nghttp3_conn_read_qpack_encoder(nghttp3_conn *conn, const uint8_t *src, size_t srclen) { nghttp3_ssize nconsumed = - nghttp3_qpack_decoder_read_encoder(&conn->qdec, src, srclen); + nghttp3_qpack_decoder_read_encoder(&conn->qdec, src, srclen); nghttp3_stream *stream; int rv; @@ -1240,7 +1240,7 @@ nghttp3_ssize nghttp3_conn_read_bidi(nghttp3_conn *conn, size_t *pnproc, switch (rstate->state) { case NGHTTP3_REQ_STREAM_STATE_FRAME_TYPE: assert(end - p > 0); - nread = nghttp3_read_varint(rvint, p, (size_t)(end - p), fin); + nread = nghttp3_read_varint(rvint, p, end, fin); if (nread < 0) { return NGHTTP3_ERR_H3_GENERAL_PROTOCOL_ERROR; } @@ -1260,7 +1260,7 @@ nghttp3_ssize nghttp3_conn_read_bidi(nghttp3_conn *conn, size_t *pnproc, /* 
Fall through */ case NGHTTP3_REQ_STREAM_STATE_FRAME_LENGTH: assert(end - p > 0); - nread = nghttp3_read_varint(rvint, p, (size_t)(end - p), fin); + nread = nghttp3_read_varint(rvint, p, end, fin); if (nread < 0) { return NGHTTP3_ERR_H3_FRAME_ERROR; } @@ -1277,14 +1277,14 @@ nghttp3_ssize nghttp3_conn_read_bidi(nghttp3_conn *conn, size_t *pnproc, switch (rstate->fr.hd.type) { case NGHTTP3_FRAME_DATA: rv = nghttp3_stream_transit_rx_http_state( - stream, NGHTTP3_HTTP_EVENT_DATA_BEGIN); + stream, NGHTTP3_HTTP_EVENT_DATA_BEGIN); if (rv != 0) { return rv; } /* DATA frame might be empty. */ if (rstate->left == 0) { rv = nghttp3_stream_transit_rx_http_state( - stream, NGHTTP3_HTTP_EVENT_DATA_END); + stream, NGHTTP3_HTTP_EVENT_DATA_END); assert(0 == rv); nghttp3_stream_read_state_reset(rstate); @@ -1294,7 +1294,7 @@ nghttp3_ssize nghttp3_conn_read_bidi(nghttp3_conn *conn, size_t *pnproc, break; case NGHTTP3_FRAME_HEADERS: rv = nghttp3_stream_transit_rx_http_state( - stream, NGHTTP3_HTTP_EVENT_HEADERS_BEGIN); + stream, NGHTTP3_HTTP_EVENT_HEADERS_BEGIN); if (rv != 0) { return rv; } @@ -1305,7 +1305,7 @@ nghttp3_ssize nghttp3_conn_read_bidi(nghttp3_conn *conn, size_t *pnproc, } rv = nghttp3_stream_transit_rx_http_state( - stream, NGHTTP3_HTTP_EVENT_HEADERS_END); + stream, NGHTTP3_HTTP_EVENT_HEADERS_END); assert(0 == rv); nghttp3_stream_read_state_reset(rstate); @@ -1351,7 +1351,7 @@ nghttp3_ssize nghttp3_conn_read_bidi(nghttp3_conn *conn, size_t *pnproc, } break; case NGHTTP3_REQ_STREAM_STATE_DATA: - len = (size_t)nghttp3_min(rstate->left, (int64_t)(end - p)); + len = (size_t)nghttp3_min_int64(rstate->left, (int64_t)(end - p)); rv = nghttp3_conn_on_data(conn, stream, p, len); if (rv != 0) { return rv; @@ -1370,7 +1370,7 @@ nghttp3_ssize nghttp3_conn_read_bidi(nghttp3_conn *conn, size_t *pnproc, nghttp3_stream_read_state_reset(rstate); break; case NGHTTP3_REQ_STREAM_STATE_HEADERS: - len = (size_t)nghttp3_min(rstate->left, (int64_t)(end - p)); + len = (size_t)nghttp3_min_int64(rstate->left, (int64_t)(end - p)); nread = nghttp3_conn_on_headers(conn, stream, p, len, (int64_t)len == rstate->left); if (nread < 0) { @@ -1478,7 +1478,7 @@ nghttp3_ssize nghttp3_conn_read_bidi(nghttp3_conn *conn, size_t *pnproc, break; case NGHTTP3_REQ_STREAM_STATE_IGN_FRAME: - len = (size_t)nghttp3_min(rstate->left, (int64_t)(end - p)); + len = (size_t)nghttp3_min_int64(rstate->left, (int64_t)(end - p)); p += len; nconsumed += len; rstate->left -= (int64_t)len; @@ -1590,9 +1590,9 @@ static nghttp3_ssize conn_decode_headers(nghttp3_conn *conn, buf.last = buf.end; for (;;) { - nread = nghttp3_qpack_decoder_read_request(qdec, &stream->qpack_sctx, &nv, - &flags, buf.pos, - nghttp3_buf_len(&buf), fin); + nread = + nghttp3_qpack_decoder_read_request(qdec, &stream->qpack_sctx, &nv, &flags, + buf.pos, nghttp3_buf_len(&buf), fin); if (nread < 0) { return (int)nread; @@ -1625,8 +1625,8 @@ static nghttp3_ssize conn_decode_headers(nghttp3_conn *conn, if (flags & NGHTTP3_QPACK_DECODE_FLAG_EMIT) { rv = nghttp3_http_on_header( - http, &nv, request, trailers, - conn->server && conn->local.settings.enable_connect_protocol); + http, &nv, request, trailers, + conn->server && conn->local.settings.enable_connect_protocol); switch (rv) { case NGHTTP3_ERR_MALFORMED_HTTP_HEADER: break; @@ -1705,7 +1705,7 @@ int nghttp3_conn_on_settings_entry_received(nghttp3_conn *conn, dest->qpack_blocked_streams = (size_t)ent->value; nghttp3_qpack_encoder_set_max_blocked_streams( - &conn->qenc, (size_t)nghttp3_min(100, ent->value)); + &conn->qenc, 
(size_t)nghttp3_min_uint64(100, ent->value)); break; case NGHTTP3_SETTINGS_ID_ENABLE_CONNECT_PROTOCOL: if (!conn->server) { @@ -1784,7 +1784,7 @@ conn_on_priority_update_stream(nghttp3_conn *conn, } conn->rx.max_stream_id_bidi = - nghttp3_max(conn->rx.max_stream_id_bidi, stream_id); + nghttp3_max_int64(conn->rx.max_stream_id_bidi, stream_id); rv = nghttp3_conn_create_stream(conn, &stream, stream_id); if (rv != 0) { return rv; @@ -1836,7 +1836,7 @@ int nghttp3_conn_create_stream(nghttp3_conn *conn, nghttp3_stream **pstream, nghttp3_stream *stream; int rv; nghttp3_stream_callbacks callbacks = { - conn_stream_acked_data, + conn_stream_acked_data, }; rv = nghttp3_stream_new(&stream, stream_id, &callbacks, @@ -1995,7 +1995,7 @@ nghttp3_ssize nghttp3_conn_writev_stream(nghttp3_conn *conn, if (conn->tx.ctrl && !nghttp3_stream_is_blocked(conn->tx.ctrl)) { ncnt = - conn_writev_stream(conn, pstream_id, pfin, vec, veccnt, conn->tx.ctrl); + conn_writev_stream(conn, pstream_id, pfin, vec, veccnt, conn->tx.ctrl); if (ncnt) { return ncnt; } @@ -2008,7 +2008,7 @@ nghttp3_ssize nghttp3_conn_writev_stream(nghttp3_conn *conn, } ncnt = - conn_writev_stream(conn, pstream_id, pfin, vec, veccnt, conn->tx.qdec); + conn_writev_stream(conn, pstream_id, pfin, vec, veccnt, conn->tx.qdec); if (ncnt) { return ncnt; } @@ -2016,7 +2016,7 @@ nghttp3_ssize nghttp3_conn_writev_stream(nghttp3_conn *conn, if (conn->tx.qenc && !nghttp3_stream_is_blocked(conn->tx.qenc)) { ncnt = - conn_writev_stream(conn, pstream_id, pfin, vec, veccnt, conn->tx.qenc); + conn_writev_stream(conn, pstream_id, pfin, vec, veccnt, conn->tx.qenc); if (ncnt) { return ncnt; } @@ -2095,7 +2095,22 @@ int nghttp3_conn_add_ack_offset(nghttp3_conn *conn, int64_t stream_id, return 0; } - return nghttp3_stream_add_ack_offset(stream, n); + return nghttp3_stream_update_ack_offset(stream, stream->ack_offset + n); +} + +int nghttp3_conn_update_ack_offset(nghttp3_conn *conn, int64_t stream_id, + uint64_t offset) { + nghttp3_stream *stream = nghttp3_conn_find_stream(conn, stream_id); + + if (stream == NULL) { + return 0; + } + + if (stream->ack_offset > offset) { + return NGHTTP3_ERR_INVALID_ARGUMENT; + } + + return nghttp3_stream_update_ack_offset(stream, offset); } static int conn_submit_headers_data(nghttp3_conn *conn, nghttp3_stream *stream, @@ -2304,7 +2319,7 @@ int nghttp3_conn_shutdown(nghttp3_conn *conn) { frent.fr.hd.type = NGHTTP3_FRAME_GOAWAY; if (conn->server) { frent.fr.goaway.id = - nghttp3_min((1ll << 62) - 4, conn->rx.max_stream_id_bidi + 4); + nghttp3_min_int64((1ll << 62) - 4, conn->rx.max_stream_id_bidi + 4); } else { frent.fr.goaway.id = 0; } @@ -2318,7 +2333,7 @@ int nghttp3_conn_shutdown(nghttp3_conn *conn) { conn->tx.goaway_id = frent.fr.goaway.id; conn->flags |= - NGHTTP3_CONN_FLAG_GOAWAY_QUEUED | NGHTTP3_CONN_FLAG_SHUTDOWN_COMMENCED; + NGHTTP3_CONN_FLAG_GOAWAY_QUEUED | NGHTTP3_CONN_FLAG_SHUTDOWN_COMMENCED; return 0; } @@ -2619,5 +2634,5 @@ void nghttp3_settings_default_versioned(int settings_version, memset(settings, 0, sizeof(nghttp3_settings)); settings->max_field_section_size = NGHTTP3_VARINT_MAX; settings->qpack_encoder_max_dtable_capacity = - NGHTTP3_QPACK_ENCODER_MAX_DTABLE_CAPACITY; + NGHTTP3_QPACK_ENCODER_MAX_DTABLE_CAPACITY; } diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_conn.h b/deps/ngtcp2/nghttp3/lib/nghttp3_conn.h index 74f47583ce825c..1218ba508ba46a 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_conn.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_conn.h @@ -27,7 +27,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif 
/* defined(HAVE_CONFIG_H) */ #include @@ -204,4 +204,4 @@ int nghttp3_conn_reject_stream(nghttp3_conn *conn, nghttp3_stream *stream); */ nghttp3_stream *nghttp3_conn_get_next_tx_stream(nghttp3_conn *conn); -#endif /* NGHTTP3_CONN_H */ +#endif /* !defined(NGHTTP3_CONN_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_conv.c b/deps/ngtcp2/nghttp3/lib/nghttp3_conv.c index edd0adc4d0ff0a..6439a6d782960c 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_conv.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_conv.c @@ -31,34 +31,39 @@ #include "nghttp3_str.h" #include "nghttp3_unreachable.h" -int64_t nghttp3_get_varint(size_t *plen, const uint8_t *p) { +const uint8_t *nghttp3_get_varint(int64_t *dest, const uint8_t *p) { union { - char b[8]; + uint8_t n8; uint16_t n16; uint32_t n32; uint64_t n64; } n; - *plen = (size_t)(1u << (*p >> 6)); - - switch (*plen) { + switch (*p >> 6) { + case 0: + *dest = *p++; + return p; case 1: - return (int64_t)*p; - case 2: memcpy(&n, p, 2); - n.b[0] &= 0x3f; - return (int64_t)ntohs(n.n16); - case 4: + n.n8 &= 0x3f; + *dest = ntohs(n.n16); + + return p + 2; + case 2: memcpy(&n, p, 4); - n.b[0] &= 0x3f; - return (int64_t)ntohl(n.n32); - case 8: + n.n8 &= 0x3f; + *dest = ntohl(n.n32); + + return p + 4; + case 3: memcpy(&n, p, 8); - n.b[0] &= 0x3f; - return (int64_t)nghttp3_ntohl64(n.n64); - } + n.n8 &= 0x3f; + *dest = (int64_t)nghttp3_ntohl64(n.n64); - nghttp3_unreachable(); + return p + 8; + default: + nghttp3_unreachable(); + } } int64_t nghttp3_get_varint_fb(const uint8_t *p) { return *p & 0x3f; } diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_conv.h b/deps/ngtcp2/nghttp3/lib/nghttp3_conv.h index 5522bc735bfd37..40f5d4de782883 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_conv.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_conv.h @@ -28,69 +28,67 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #ifdef HAVE_ARPA_INET_H # include -#endif /* HAVE_ARPA_INET_H */ +#endif /* defined(HAVE_ARPA_INET_H) */ #ifdef HAVE_NETINET_IN_H # include -#endif /* HAVE_NETINET_IN_H */ +#endif /* defined(HAVE_NETINET_IN_H) */ #ifdef HAVE_BYTESWAP_H # include -#endif /* HAVE_BYTESWAP_H */ +#endif /* defined(HAVE_BYTESWAP_H) */ #ifdef HAVE_ENDIAN_H # include -#endif /* HAVE_ENDIAN_H */ +#endif /* defined(HAVE_ENDIAN_H) */ #ifdef HAVE_SYS_ENDIAN_H # include -#endif /* HAVE_SYS_ENDIAN_H */ +#endif /* defined(HAVE_SYS_ENDIAN_H) */ -#if defined(__APPLE__) +#ifdef __APPLE__ # include -#endif // __APPLE__ +#endif /* defined(__APPLE__) */ #include -#if defined(HAVE_BE64TOH) || \ - (defined(HAVE_DECL_BE64TOH) && HAVE_DECL_BE64TOH > 0) +#if HAVE_DECL_BE64TOH # define nghttp3_ntohl64(N) be64toh(N) # define nghttp3_htonl64(N) htobe64(N) -#else /* !HAVE_BE64TOH */ -# if defined(WORDS_BIGENDIAN) +#else /* !HAVE_DECL_BE64TOH */ +# ifdef WORDS_BIGENDIAN # define nghttp3_ntohl64(N) (N) # define nghttp3_htonl64(N) (N) -# else /* !WORDS_BIGENDIAN */ -# if defined(HAVE_BSWAP_64) || \ - (defined(HAVE_DECL_BSWAP_64) && HAVE_DECL_BSWAP_64 > 0) +# else /* !defined(WORDS_BIGENDIAN) */ +# if HAVE_DECL_BSWAP_64 # define nghttp3_bswap64 bswap_64 # elif defined(WIN32) # define nghttp3_bswap64 _byteswap_uint64 # elif defined(__APPLE__) # define nghttp3_bswap64 OSSwapInt64 -# else /* !HAVE_BSWAP_64 && !WIN32 && !__APPLE__ */ +# else /* !(HAVE_DECL_BSWAP_64 || defined(WIN32) || defined(__APPLE__)) */ # define nghttp3_bswap64(N) \ ((uint64_t)(ntohl((uint32_t)(N))) << 32 | ntohl((uint32_t)((N) >> 32))) -# endif /* !HAVE_BSWAP_64 && !WIN32 && !__APPLE__ */ +# endif /* !(HAVE_DECL_BSWAP_64 || 
defined(WIN32) || defined(__APPLE__)) */ # define nghttp3_ntohl64(N) nghttp3_bswap64(N) # define nghttp3_htonl64(N) nghttp3_bswap64(N) -# endif /* !WORDS_BIGENDIAN */ -#endif /* !HAVE_BE64TOH */ +# endif /* !defined(WORDS_BIGENDIAN) */ +#endif /* !HAVE_DECL_BE64TOH */ -#if defined(WIN32) +#ifdef WIN32 /* Windows requires ws2_32 library for ntonl family of functions. We define inline functions for those functions so that we don't have dependency on that lib. */ # ifdef _MSC_VER # define STIN static __inline -# else +# else /* !defined(_MSC_VER) */ # define STIN static inline -# endif +# endif /* !defined(_MSC_VER) */ STIN uint32_t htonl(uint32_t hostlong) { uint32_t res; @@ -128,14 +126,14 @@ STIN uint16_t ntohs(uint16_t netshort) { return res; } -#endif /* WIN32 */ +#endif /* defined(WIN32) */ /* - * nghttp3_get_varint reads variable-length integer from |p|, and - * returns it in host byte order. The number of bytes read is stored - * in |*plen|. + * nghttp3_get_varint reads variable-length unsigned integer from |p|, + * and stores it in the buffer pointed by |dest| in host byte order. + * It returns |p| plus the number of bytes read from |p|. */ -int64_t nghttp3_get_varint(size_t *plen, const uint8_t *p); +const uint8_t *nghttp3_get_varint(int64_t *dest, const uint8_t *p); /* * nghttp3_get_varint_fb reads first byte of encoded variable-length @@ -193,4 +191,4 @@ uint64_t nghttp3_ord_stream_id(int64_t stream_id); */ #define NGHTTP3_PRI_INC_MASK (1 << 7) -#endif /* NGHTTP3_CONV_H */ +#endif /* !defined(NGHTTP3_CONV_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_debug.c b/deps/ngtcp2/nghttp3/lib/nghttp3_debug.c index 4021b0dc469b66..0235217e9627ce 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_debug.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_debug.c @@ -35,7 +35,7 @@ static void nghttp3_default_debug_vfprintf_callback(const char *fmt, } static nghttp3_debug_vprintf_callback static_debug_vprintf_callback = - nghttp3_default_debug_vfprintf_callback; + nghttp3_default_debug_vfprintf_callback; void nghttp3_debug_vprintf(const char *format, ...) { if (static_debug_vprintf_callback) { @@ -47,15 +47,15 @@ void nghttp3_debug_vprintf(const char *format, ...) { } void nghttp3_set_debug_vprintf_callback( - nghttp3_debug_vprintf_callback debug_vprintf_callback) { + nghttp3_debug_vprintf_callback debug_vprintf_callback) { static_debug_vprintf_callback = debug_vprintf_callback; } -#else /* !DEBUGBUILD */ +#else /* !defined(DEBUGBUILD) */ void nghttp3_set_debug_vprintf_callback( - nghttp3_debug_vprintf_callback debug_vprintf_callback) { + nghttp3_debug_vprintf_callback debug_vprintf_callback) { (void)debug_vprintf_callback; } -#endif /* !DEBUGBUILD */ +#endif /* !defined(DEBUGBUILD) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_debug.h b/deps/ngtcp2/nghttp3/lib/nghttp3_debug.h index 01ed918414cfe5..d73bf8ecf312cd 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_debug.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_debug.h @@ -28,17 +28,17 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include #ifdef DEBUGBUILD # define DEBUGF(...) nghttp3_debug_vprintf(__VA_ARGS__) void nghttp3_debug_vprintf(const char *format, ...); -#else +#else /* !defined(DEBUGBUILD) */ # define DEBUGF(...) 
\ do { \ } while (0) -#endif +#endif /* !defined(DEBUGBUILD) */ -#endif /* NGHTTP3_DEBUG_H */ +#endif /* !defined(NGHTTP3_DEBUG_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_err.h b/deps/ngtcp2/nghttp3/lib/nghttp3_err.h index 2fa914f86b189e..6f8205cc17ce7c 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_err.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_err.h @@ -27,8 +27,8 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include -#endif /* NGHTTP3_ERR_H */ +#endif /* !defined(NGHTTP3_ERR_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_frame.c b/deps/ngtcp2/nghttp3/lib/nghttp3_frame.c index 923a78f90f826f..1d87e448d887cf 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_frame.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_frame.c @@ -102,7 +102,7 @@ nghttp3_frame_write_priority_update(uint8_t *p, } size_t nghttp3_frame_write_priority_update_len( - int64_t *ppayloadlen, const nghttp3_frame_priority_update *fr) { + int64_t *ppayloadlen, const nghttp3_frame_priority_update *fr) { size_t payloadlen = nghttp3_put_varintlen(fr->pri_elem_id) + fr->datalen; *ppayloadlen = (int64_t)payloadlen; diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_frame.h b/deps/ngtcp2/nghttp3/lib/nghttp3_frame.h index 1079673d150ce3..e216967d740b86 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_frame.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_frame.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -188,7 +188,7 @@ nghttp3_frame_write_priority_update(uint8_t *dest, * stores payload length in |*ppayloadlen|. */ size_t nghttp3_frame_write_priority_update_len( - int64_t *ppayloadlen, const nghttp3_frame_priority_update *fr); + int64_t *ppayloadlen, const nghttp3_frame_priority_update *fr); /* * nghttp3_nva_copy copies name/value pairs from |nva|, which contains @@ -227,4 +227,4 @@ void nghttp3_frame_headers_free(nghttp3_frame_headers *fr, void nghttp3_frame_priority_update_free(nghttp3_frame_priority_update *fr, const nghttp3_mem *mem); -#endif /* NGHTTP3_FRAME_H */ +#endif /* !defined(NGHTTP3_FRAME_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_gaptr.c b/deps/ngtcp2/nghttp3/lib/nghttp3_gaptr.c index 88cb49a02f892f..20eed5faa2bcba 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_gaptr.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_gaptr.c @@ -37,14 +37,8 @@ void nghttp3_gaptr_init(nghttp3_gaptr *gaptr, const nghttp3_mem *mem) { static int gaptr_gap_init(nghttp3_gaptr *gaptr) { nghttp3_range range = {0, UINT64_MAX}; - int rv; - - rv = nghttp3_ksl_insert(&gaptr->gap, NULL, &range, NULL); - if (rv != 0) { - return rv; - } - return 0; + return nghttp3_ksl_insert(&gaptr->gap, NULL, &range, NULL); } void nghttp3_gaptr_free(nghttp3_gaptr *gaptr) { @@ -82,7 +76,9 @@ int nghttp3_gaptr_push(nghttp3_gaptr *gaptr, uint64_t offset, nghttp3_ksl_remove_hint(&gaptr->gap, &it, &it, &k); continue; } + nghttp3_range_cut(&l, &r, &k, &m); + if (nghttp3_range_len(&l)) { nghttp3_ksl_update_key(&gaptr->gap, &k, &l); @@ -95,23 +91,23 @@ int nghttp3_gaptr_push(nghttp3_gaptr *gaptr, uint64_t offset, } else if (nghttp3_range_len(&r)) { nghttp3_ksl_update_key(&gaptr->gap, &k, &r); } + nghttp3_ksl_it_next(&it); } + return 0; } uint64_t nghttp3_gaptr_first_gap_offset(nghttp3_gaptr *gaptr) { nghttp3_ksl_it it; - nghttp3_range r; if (nghttp3_ksl_len(&gaptr->gap) == 0) { return 0; } it = nghttp3_ksl_begin(&gaptr->gap); - r = *(nghttp3_range *)nghttp3_ksl_it_key(&it); - return r.begin; + return ((nghttp3_range *)nghttp3_ksl_it_key(&it))->begin; } nghttp3_range 
nghttp3_gaptr_get_first_gap_after(nghttp3_gaptr *gaptr, @@ -136,7 +132,6 @@ int nghttp3_gaptr_is_pushed(nghttp3_gaptr *gaptr, uint64_t offset, uint64_t datalen) { nghttp3_range q = {offset, offset + datalen}; nghttp3_ksl_it it; - nghttp3_range k; nghttp3_range m; if (nghttp3_ksl_len(&gaptr->gap) == 0) { @@ -145,8 +140,7 @@ int nghttp3_gaptr_is_pushed(nghttp3_gaptr *gaptr, uint64_t offset, it = nghttp3_ksl_lower_bound_compar(&gaptr->gap, &q, nghttp3_ksl_range_exclusive_compar); - k = *(nghttp3_range *)nghttp3_ksl_it_key(&it); - m = nghttp3_range_intersect(&q, &k); + m = nghttp3_range_intersect(&q, (nghttp3_range *)nghttp3_ksl_it_key(&it)); return nghttp3_range_len(&m) == 0; } diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_gaptr.h b/deps/ngtcp2/nghttp3/lib/nghttp3_gaptr.h index 7c83c847c9fe29..7578fdc14f6010 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_gaptr.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_gaptr.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -40,8 +40,9 @@ * nghttp3_gaptr maintains the gap in the range [0, UINT64_MAX). */ typedef struct nghttp3_gaptr { - /* gap maintains the range of offset which is not received - yet. Initially, its range is [0, UINT64_MAX). */ + /* gap maintains the range of offset which is not pushed + yet. Initially, its range is [0, UINT64_MAX). "gap" is the range + that is not pushed yet. */ nghttp3_ksl gap; /* mem is custom memory allocator */ const nghttp3_mem *mem; @@ -58,8 +59,7 @@ void nghttp3_gaptr_init(nghttp3_gaptr *gaptr, const nghttp3_mem *mem); void nghttp3_gaptr_free(nghttp3_gaptr *gaptr); /* - * nghttp3_gaptr_push adds new data of length |datalen| at the stream - * offset |offset|. + * nghttp3_gaptr_push pushes the range [offset, offset + datalen). * * This function returns 0 if it succeeds, or one of the following * negative error codes: @@ -77,7 +77,7 @@ uint64_t nghttp3_gaptr_first_gap_offset(nghttp3_gaptr *gaptr); /* * nghttp3_gaptr_get_first_gap_after returns the first gap which - * overlaps or comes after |offset|. + * includes or comes after |offset|. */ nghttp3_range nghttp3_gaptr_get_first_gap_after(nghttp3_gaptr *gaptr, uint64_t offset); @@ -96,4 +96,4 @@ int nghttp3_gaptr_is_pushed(nghttp3_gaptr *gaptr, uint64_t offset, */ void nghttp3_gaptr_drop_first_gap(nghttp3_gaptr *gaptr); -#endif /* NGHTTP3_GAPTR_H */ +#endif /* !defined(NGHTTP3_GAPTR_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_http.c b/deps/ngtcp2/nghttp3/lib/nghttp3_http.c index 963134f13df946..38092cfb7c322c 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_http.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_http.c @@ -28,11 +28,15 @@ #include #include +#ifdef __AVX2__ +# include +#endif /* __AVX2__ */ + #include "nghttp3_stream.h" #include "nghttp3_macro.h" #include "nghttp3_conv.h" #include "nghttp3_unreachable.h" -#include "sfparse.h" +#include "sfparse/sfparse.h" static uint8_t downcase(uint8_t c) { return 'A' <= c && c <= 'Z' ? 
(uint8_t)(c - 'A' + 'a') : c; @@ -175,26 +179,282 @@ int nghttp3_pri_parse_priority_versioned(int pri_version, nghttp3_pri *dest, return nghttp3_http_parse_priority(dest, value, valuelen); } +/* Generated by genauthroitychartbl.py */ +static char VALID_AUTHORITY_CHARS[] = { + 0 /* NUL */, 0 /* SOH */, 0 /* STX */, 0 /* ETX */, + 0 /* EOT */, 0 /* ENQ */, 0 /* ACK */, 0 /* BEL */, + 0 /* BS */, 0 /* HT */, 0 /* LF */, 0 /* VT */, + 0 /* FF */, 0 /* CR */, 0 /* SO */, 0 /* SI */, + 0 /* DLE */, 0 /* DC1 */, 0 /* DC2 */, 0 /* DC3 */, + 0 /* DC4 */, 0 /* NAK */, 0 /* SYN */, 0 /* ETB */, + 0 /* CAN */, 0 /* EM */, 0 /* SUB */, 0 /* ESC */, + 0 /* FS */, 0 /* GS */, 0 /* RS */, 0 /* US */, + 0 /* SPC */, 1 /* ! */, 0 /* " */, 0 /* # */, + 1 /* $ */, 1 /* % */, 1 /* & */, 1 /* ' */, + 1 /* ( */, 1 /* ) */, 1 /* * */, 1 /* + */, + 1 /* , */, 1 /* - */, 1 /* . */, 0 /* / */, + 1 /* 0 */, 1 /* 1 */, 1 /* 2 */, 1 /* 3 */, + 1 /* 4 */, 1 /* 5 */, 1 /* 6 */, 1 /* 7 */, + 1 /* 8 */, 1 /* 9 */, 1 /* : */, 1 /* ; */, + 0 /* < */, 1 /* = */, 0 /* > */, 0 /* ? */, + 1 /* @ */, 1 /* A */, 1 /* B */, 1 /* C */, + 1 /* D */, 1 /* E */, 1 /* F */, 1 /* G */, + 1 /* H */, 1 /* I */, 1 /* J */, 1 /* K */, + 1 /* L */, 1 /* M */, 1 /* N */, 1 /* O */, + 1 /* P */, 1 /* Q */, 1 /* R */, 1 /* S */, + 1 /* T */, 1 /* U */, 1 /* V */, 1 /* W */, + 1 /* X */, 1 /* Y */, 1 /* Z */, 1 /* [ */, + 0 /* \ */, 1 /* ] */, 0 /* ^ */, 1 /* _ */, + 0 /* ` */, 1 /* a */, 1 /* b */, 1 /* c */, + 1 /* d */, 1 /* e */, 1 /* f */, 1 /* g */, + 1 /* h */, 1 /* i */, 1 /* j */, 1 /* k */, + 1 /* l */, 1 /* m */, 1 /* n */, 1 /* o */, + 1 /* p */, 1 /* q */, 1 /* r */, 1 /* s */, + 1 /* t */, 1 /* u */, 1 /* v */, 1 /* w */, + 1 /* x */, 1 /* y */, 1 /* z */, 0 /* { */, + 0 /* | */, 0 /* } */, 1 /* ~ */, 0 /* DEL */, + 0 /* 0x80 */, 0 /* 0x81 */, 0 /* 0x82 */, 0 /* 0x83 */, + 0 /* 0x84 */, 0 /* 0x85 */, 0 /* 0x86 */, 0 /* 0x87 */, + 0 /* 0x88 */, 0 /* 0x89 */, 0 /* 0x8a */, 0 /* 0x8b */, + 0 /* 0x8c */, 0 /* 0x8d */, 0 /* 0x8e */, 0 /* 0x8f */, + 0 /* 0x90 */, 0 /* 0x91 */, 0 /* 0x92 */, 0 /* 0x93 */, + 0 /* 0x94 */, 0 /* 0x95 */, 0 /* 0x96 */, 0 /* 0x97 */, + 0 /* 0x98 */, 0 /* 0x99 */, 0 /* 0x9a */, 0 /* 0x9b */, + 0 /* 0x9c */, 0 /* 0x9d */, 0 /* 0x9e */, 0 /* 0x9f */, + 0 /* 0xa0 */, 0 /* 0xa1 */, 0 /* 0xa2 */, 0 /* 0xa3 */, + 0 /* 0xa4 */, 0 /* 0xa5 */, 0 /* 0xa6 */, 0 /* 0xa7 */, + 0 /* 0xa8 */, 0 /* 0xa9 */, 0 /* 0xaa */, 0 /* 0xab */, + 0 /* 0xac */, 0 /* 0xad */, 0 /* 0xae */, 0 /* 0xaf */, + 0 /* 0xb0 */, 0 /* 0xb1 */, 0 /* 0xb2 */, 0 /* 0xb3 */, + 0 /* 0xb4 */, 0 /* 0xb5 */, 0 /* 0xb6 */, 0 /* 0xb7 */, + 0 /* 0xb8 */, 0 /* 0xb9 */, 0 /* 0xba */, 0 /* 0xbb */, + 0 /* 0xbc */, 0 /* 0xbd */, 0 /* 0xbe */, 0 /* 0xbf */, + 0 /* 0xc0 */, 0 /* 0xc1 */, 0 /* 0xc2 */, 0 /* 0xc3 */, + 0 /* 0xc4 */, 0 /* 0xc5 */, 0 /* 0xc6 */, 0 /* 0xc7 */, + 0 /* 0xc8 */, 0 /* 0xc9 */, 0 /* 0xca */, 0 /* 0xcb */, + 0 /* 0xcc */, 0 /* 0xcd */, 0 /* 0xce */, 0 /* 0xcf */, + 0 /* 0xd0 */, 0 /* 0xd1 */, 0 /* 0xd2 */, 0 /* 0xd3 */, + 0 /* 0xd4 */, 0 /* 0xd5 */, 0 /* 0xd6 */, 0 /* 0xd7 */, + 0 /* 0xd8 */, 0 /* 0xd9 */, 0 /* 0xda */, 0 /* 0xdb */, + 0 /* 0xdc */, 0 /* 0xdd */, 0 /* 0xde */, 0 /* 0xdf */, + 0 /* 0xe0 */, 0 /* 0xe1 */, 0 /* 0xe2 */, 0 /* 0xe3 */, + 0 /* 0xe4 */, 0 /* 0xe5 */, 0 /* 0xe6 */, 0 /* 0xe7 */, + 0 /* 0xe8 */, 0 /* 0xe9 */, 0 /* 0xea */, 0 /* 0xeb */, + 0 /* 0xec */, 0 /* 0xed */, 0 /* 0xee */, 0 /* 0xef */, + 0 /* 0xf0 */, 0 /* 0xf1 */, 0 /* 0xf2 */, 0 /* 0xf3 */, + 0 /* 0xf4 */, 0 /* 0xf5 */, 0 /* 0xf6 */, 0 /* 0xf7 */, + 0 /* 0xf8 
*/, 0 /* 0xf9 */, 0 /* 0xfa */, 0 /* 0xfb */, + 0 /* 0xfc */, 0 /* 0xfd */, 0 /* 0xfe */, 0 /* 0xff */ +}; + +static int check_authority(const uint8_t *value, size_t len) { + const uint8_t *last; + for (last = value + len; value != last; ++value) { + if (!VALID_AUTHORITY_CHARS[*value]) { + return 0; + } + } + return 1; +} + +static int check_scheme(const uint8_t *value, size_t len) { + const uint8_t *last; + if (len == 0) { + return 0; + } + + if (!(('A' <= *value && *value <= 'Z') || ('a' <= *value && *value <= 'z'))) { + return 0; + } + + last = value + len; + ++value; + + for (; value != last; ++value) { + if (!(('A' <= *value && *value <= 'Z') || + ('a' <= *value && *value <= 'z') || + ('0' <= *value && *value <= '9') || *value == '+' || *value == '-' || + *value == '.')) { + return 0; + } + } + return 1; +} + +/* Generated by genmethodchartbl.py */ +static char VALID_METHOD_CHARS[] = { + 0 /* NUL */, 0 /* SOH */, 0 /* STX */, 0 /* ETX */, + 0 /* EOT */, 0 /* ENQ */, 0 /* ACK */, 0 /* BEL */, + 0 /* BS */, 0 /* HT */, 0 /* LF */, 0 /* VT */, + 0 /* FF */, 0 /* CR */, 0 /* SO */, 0 /* SI */, + 0 /* DLE */, 0 /* DC1 */, 0 /* DC2 */, 0 /* DC3 */, + 0 /* DC4 */, 0 /* NAK */, 0 /* SYN */, 0 /* ETB */, + 0 /* CAN */, 0 /* EM */, 0 /* SUB */, 0 /* ESC */, + 0 /* FS */, 0 /* GS */, 0 /* RS */, 0 /* US */, + 0 /* SPC */, 1 /* ! */, 0 /* " */, 1 /* # */, + 1 /* $ */, 1 /* % */, 1 /* & */, 1 /* ' */, + 0 /* ( */, 0 /* ) */, 1 /* * */, 1 /* + */, + 0 /* , */, 1 /* - */, 1 /* . */, 0 /* / */, + 1 /* 0 */, 1 /* 1 */, 1 /* 2 */, 1 /* 3 */, + 1 /* 4 */, 1 /* 5 */, 1 /* 6 */, 1 /* 7 */, + 1 /* 8 */, 1 /* 9 */, 0 /* : */, 0 /* ; */, + 0 /* < */, 0 /* = */, 0 /* > */, 0 /* ? */, + 0 /* @ */, 1 /* A */, 1 /* B */, 1 /* C */, + 1 /* D */, 1 /* E */, 1 /* F */, 1 /* G */, + 1 /* H */, 1 /* I */, 1 /* J */, 1 /* K */, + 1 /* L */, 1 /* M */, 1 /* N */, 1 /* O */, + 1 /* P */, 1 /* Q */, 1 /* R */, 1 /* S */, + 1 /* T */, 1 /* U */, 1 /* V */, 1 /* W */, + 1 /* X */, 1 /* Y */, 1 /* Z */, 0 /* [ */, + 0 /* \ */, 0 /* ] */, 1 /* ^ */, 1 /* _ */, + 1 /* ` */, 1 /* a */, 1 /* b */, 1 /* c */, + 1 /* d */, 1 /* e */, 1 /* f */, 1 /* g */, + 1 /* h */, 1 /* i */, 1 /* j */, 1 /* k */, + 1 /* l */, 1 /* m */, 1 /* n */, 1 /* o */, + 1 /* p */, 1 /* q */, 1 /* r */, 1 /* s */, + 1 /* t */, 1 /* u */, 1 /* v */, 1 /* w */, + 1 /* x */, 1 /* y */, 1 /* z */, 0 /* { */, + 1 /* | */, 0 /* } */, 1 /* ~ */, 0 /* DEL */, + 0 /* 0x80 */, 0 /* 0x81 */, 0 /* 0x82 */, 0 /* 0x83 */, + 0 /* 0x84 */, 0 /* 0x85 */, 0 /* 0x86 */, 0 /* 0x87 */, + 0 /* 0x88 */, 0 /* 0x89 */, 0 /* 0x8a */, 0 /* 0x8b */, + 0 /* 0x8c */, 0 /* 0x8d */, 0 /* 0x8e */, 0 /* 0x8f */, + 0 /* 0x90 */, 0 /* 0x91 */, 0 /* 0x92 */, 0 /* 0x93 */, + 0 /* 0x94 */, 0 /* 0x95 */, 0 /* 0x96 */, 0 /* 0x97 */, + 0 /* 0x98 */, 0 /* 0x99 */, 0 /* 0x9a */, 0 /* 0x9b */, + 0 /* 0x9c */, 0 /* 0x9d */, 0 /* 0x9e */, 0 /* 0x9f */, + 0 /* 0xa0 */, 0 /* 0xa1 */, 0 /* 0xa2 */, 0 /* 0xa3 */, + 0 /* 0xa4 */, 0 /* 0xa5 */, 0 /* 0xa6 */, 0 /* 0xa7 */, + 0 /* 0xa8 */, 0 /* 0xa9 */, 0 /* 0xaa */, 0 /* 0xab */, + 0 /* 0xac */, 0 /* 0xad */, 0 /* 0xae */, 0 /* 0xaf */, + 0 /* 0xb0 */, 0 /* 0xb1 */, 0 /* 0xb2 */, 0 /* 0xb3 */, + 0 /* 0xb4 */, 0 /* 0xb5 */, 0 /* 0xb6 */, 0 /* 0xb7 */, + 0 /* 0xb8 */, 0 /* 0xb9 */, 0 /* 0xba */, 0 /* 0xbb */, + 0 /* 0xbc */, 0 /* 0xbd */, 0 /* 0xbe */, 0 /* 0xbf */, + 0 /* 0xc0 */, 0 /* 0xc1 */, 0 /* 0xc2 */, 0 /* 0xc3 */, + 0 /* 0xc4 */, 0 /* 0xc5 */, 0 /* 0xc6 */, 0 /* 0xc7 */, + 0 /* 0xc8 */, 0 /* 0xc9 */, 0 /* 0xca */, 0 /* 0xcb */, + 0 /* 0xcc */, 0 /* 
0xcd */, 0 /* 0xce */, 0 /* 0xcf */, + 0 /* 0xd0 */, 0 /* 0xd1 */, 0 /* 0xd2 */, 0 /* 0xd3 */, + 0 /* 0xd4 */, 0 /* 0xd5 */, 0 /* 0xd6 */, 0 /* 0xd7 */, + 0 /* 0xd8 */, 0 /* 0xd9 */, 0 /* 0xda */, 0 /* 0xdb */, + 0 /* 0xdc */, 0 /* 0xdd */, 0 /* 0xde */, 0 /* 0xdf */, + 0 /* 0xe0 */, 0 /* 0xe1 */, 0 /* 0xe2 */, 0 /* 0xe3 */, + 0 /* 0xe4 */, 0 /* 0xe5 */, 0 /* 0xe6 */, 0 /* 0xe7 */, + 0 /* 0xe8 */, 0 /* 0xe9 */, 0 /* 0xea */, 0 /* 0xeb */, + 0 /* 0xec */, 0 /* 0xed */, 0 /* 0xee */, 0 /* 0xef */, + 0 /* 0xf0 */, 0 /* 0xf1 */, 0 /* 0xf2 */, 0 /* 0xf3 */, + 0 /* 0xf4 */, 0 /* 0xf5 */, 0 /* 0xf6 */, 0 /* 0xf7 */, + 0 /* 0xf8 */, 0 /* 0xf9 */, 0 /* 0xfa */, 0 /* 0xfb */, + 0 /* 0xfc */, 0 /* 0xfd */, 0 /* 0xfe */, 0 /* 0xff */ +}; + +static int check_method(const uint8_t *value, size_t len) { + const uint8_t *last; + if (len == 0) { + return 0; + } + for (last = value + len; value != last; ++value) { + if (!VALID_METHOD_CHARS[*value]) { + return 0; + } + } + return 1; +} + +/* Generated by genpathchartbl.py */ +static char VALID_PATH_CHARS[] = { + 0 /* NUL */, 0 /* SOH */, 0 /* STX */, 0 /* ETX */, + 0 /* EOT */, 0 /* ENQ */, 0 /* ACK */, 0 /* BEL */, + 0 /* BS */, 0 /* HT */, 0 /* LF */, 0 /* VT */, + 0 /* FF */, 0 /* CR */, 0 /* SO */, 0 /* SI */, + 0 /* DLE */, 0 /* DC1 */, 0 /* DC2 */, 0 /* DC3 */, + 0 /* DC4 */, 0 /* NAK */, 0 /* SYN */, 0 /* ETB */, + 0 /* CAN */, 0 /* EM */, 0 /* SUB */, 0 /* ESC */, + 0 /* FS */, 0 /* GS */, 0 /* RS */, 0 /* US */, + 0 /* SPC */, 1 /* ! */, 1 /* " */, 1 /* # */, + 1 /* $ */, 1 /* % */, 1 /* & */, 1 /* ' */, + 1 /* ( */, 1 /* ) */, 1 /* * */, 1 /* + */, + 1 /* , */, 1 /* - */, 1 /* . */, 1 /* / */, + 1 /* 0 */, 1 /* 1 */, 1 /* 2 */, 1 /* 3 */, + 1 /* 4 */, 1 /* 5 */, 1 /* 6 */, 1 /* 7 */, + 1 /* 8 */, 1 /* 9 */, 1 /* : */, 1 /* ; */, + 1 /* < */, 1 /* = */, 1 /* > */, 1 /* ? 
*/, + 1 /* @ */, 1 /* A */, 1 /* B */, 1 /* C */, + 1 /* D */, 1 /* E */, 1 /* F */, 1 /* G */, + 1 /* H */, 1 /* I */, 1 /* J */, 1 /* K */, + 1 /* L */, 1 /* M */, 1 /* N */, 1 /* O */, + 1 /* P */, 1 /* Q */, 1 /* R */, 1 /* S */, + 1 /* T */, 1 /* U */, 1 /* V */, 1 /* W */, + 1 /* X */, 1 /* Y */, 1 /* Z */, 1 /* [ */, + 1 /* \ */, 1 /* ] */, 1 /* ^ */, 1 /* _ */, + 1 /* ` */, 1 /* a */, 1 /* b */, 1 /* c */, + 1 /* d */, 1 /* e */, 1 /* f */, 1 /* g */, + 1 /* h */, 1 /* i */, 1 /* j */, 1 /* k */, + 1 /* l */, 1 /* m */, 1 /* n */, 1 /* o */, + 1 /* p */, 1 /* q */, 1 /* r */, 1 /* s */, + 1 /* t */, 1 /* u */, 1 /* v */, 1 /* w */, + 1 /* x */, 1 /* y */, 1 /* z */, 1 /* { */, + 1 /* | */, 1 /* } */, 1 /* ~ */, 0 /* DEL */, + 1 /* 0x80 */, 1 /* 0x81 */, 1 /* 0x82 */, 1 /* 0x83 */, + 1 /* 0x84 */, 1 /* 0x85 */, 1 /* 0x86 */, 1 /* 0x87 */, + 1 /* 0x88 */, 1 /* 0x89 */, 1 /* 0x8a */, 1 /* 0x8b */, + 1 /* 0x8c */, 1 /* 0x8d */, 1 /* 0x8e */, 1 /* 0x8f */, + 1 /* 0x90 */, 1 /* 0x91 */, 1 /* 0x92 */, 1 /* 0x93 */, + 1 /* 0x94 */, 1 /* 0x95 */, 1 /* 0x96 */, 1 /* 0x97 */, + 1 /* 0x98 */, 1 /* 0x99 */, 1 /* 0x9a */, 1 /* 0x9b */, + 1 /* 0x9c */, 1 /* 0x9d */, 1 /* 0x9e */, 1 /* 0x9f */, + 1 /* 0xa0 */, 1 /* 0xa1 */, 1 /* 0xa2 */, 1 /* 0xa3 */, + 1 /* 0xa4 */, 1 /* 0xa5 */, 1 /* 0xa6 */, 1 /* 0xa7 */, + 1 /* 0xa8 */, 1 /* 0xa9 */, 1 /* 0xaa */, 1 /* 0xab */, + 1 /* 0xac */, 1 /* 0xad */, 1 /* 0xae */, 1 /* 0xaf */, + 1 /* 0xb0 */, 1 /* 0xb1 */, 1 /* 0xb2 */, 1 /* 0xb3 */, + 1 /* 0xb4 */, 1 /* 0xb5 */, 1 /* 0xb6 */, 1 /* 0xb7 */, + 1 /* 0xb8 */, 1 /* 0xb9 */, 1 /* 0xba */, 1 /* 0xbb */, + 1 /* 0xbc */, 1 /* 0xbd */, 1 /* 0xbe */, 1 /* 0xbf */, + 1 /* 0xc0 */, 1 /* 0xc1 */, 1 /* 0xc2 */, 1 /* 0xc3 */, + 1 /* 0xc4 */, 1 /* 0xc5 */, 1 /* 0xc6 */, 1 /* 0xc7 */, + 1 /* 0xc8 */, 1 /* 0xc9 */, 1 /* 0xca */, 1 /* 0xcb */, + 1 /* 0xcc */, 1 /* 0xcd */, 1 /* 0xce */, 1 /* 0xcf */, + 1 /* 0xd0 */, 1 /* 0xd1 */, 1 /* 0xd2 */, 1 /* 0xd3 */, + 1 /* 0xd4 */, 1 /* 0xd5 */, 1 /* 0xd6 */, 1 /* 0xd7 */, + 1 /* 0xd8 */, 1 /* 0xd9 */, 1 /* 0xda */, 1 /* 0xdb */, + 1 /* 0xdc */, 1 /* 0xdd */, 1 /* 0xde */, 1 /* 0xdf */, + 1 /* 0xe0 */, 1 /* 0xe1 */, 1 /* 0xe2 */, 1 /* 0xe3 */, + 1 /* 0xe4 */, 1 /* 0xe5 */, 1 /* 0xe6 */, 1 /* 0xe7 */, + 1 /* 0xe8 */, 1 /* 0xe9 */, 1 /* 0xea */, 1 /* 0xeb */, + 1 /* 0xec */, 1 /* 0xed */, 1 /* 0xee */, 1 /* 0xef */, + 1 /* 0xf0 */, 1 /* 0xf1 */, 1 /* 0xf2 */, 1 /* 0xf3 */, + 1 /* 0xf4 */, 1 /* 0xf5 */, 1 /* 0xf6 */, 1 /* 0xf7 */, + 1 /* 0xf8 */, 1 /* 0xf9 */, 1 /* 0xfa */, 1 /* 0xfb */, + 1 /* 0xfc */, 1 /* 0xfd */, 1 /* 0xfe */, 1 /* 0xff */ +}; + +static int check_path(const uint8_t *value, size_t len) { + const uint8_t *last; + for (last = value + len; value != last; ++value) { + if (!VALID_PATH_CHARS[*value]) { + return 0; + } + } + return 1; +} + static int http_request_on_header(nghttp3_http_state *http, nghttp3_qpack_nv *nv, int trailers, int connect_protocol) { nghttp3_pri pri; - if (nv->name->base[0] == ':') { - if (trailers || - (http->flags & NGHTTP3_HTTP_FLAG_PSEUDO_HEADER_DISALLOWED)) { - return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; - } - } - switch (nv->token) { case NGHTTP3_QPACK_TOKEN__AUTHORITY: - if (!check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__AUTHORITY)) { + if (!check_authority(nv->value->base, nv->value->len) || + !check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__AUTHORITY)) { return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } break; case NGHTTP3_QPACK_TOKEN__METHOD: - if (!check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__METHOD)) { + if 
(!check_method(nv->value->base, nv->value->len) || + !check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__METHOD)) { return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } switch (nv->value->len) { @@ -220,7 +480,8 @@ static int http_request_on_header(nghttp3_http_state *http, } break; case NGHTTP3_QPACK_TOKEN__PATH: - if (!check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__PATH)) { + if (!check_path(nv->value->base, nv->value->len) || + !check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__PATH)) { return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } if (nv->value->base[0] == '/') { @@ -230,7 +491,8 @@ static int http_request_on_header(nghttp3_http_state *http, } break; case NGHTTP3_QPACK_TOKEN__SCHEME: - if (!check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__SCHEME)) { + if (!check_scheme(nv->value->base, nv->value->len) || + !check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__SCHEME)) { return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } /* scheme is case-insensitive: @@ -241,15 +503,16 @@ static int http_request_on_header(nghttp3_http_state *http, } break; case NGHTTP3_QPACK_TOKEN__PROTOCOL: - if (!connect_protocol) { - return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; - } - - if (!check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__PROTOCOL)) { + if (!connect_protocol || + !nghttp3_check_header_value(nv->value->base, nv->value->len) || + !check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__PROTOCOL)) { return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } break; case NGHTTP3_QPACK_TOKEN_HOST: + if (!check_authority(nv->value->base, nv->value->len)) { + return NGHTTP3_ERR_REMOVE_HTTP_HEADER; + } if (!check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG_HOST)) { return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } @@ -284,22 +547,35 @@ static int http_request_on_header(nghttp3_http_state *http, } break; case NGHTTP3_QPACK_TOKEN_PRIORITY: - if (!trailers && !(http->flags & NGHTTP3_HTTP_FLAG_BAD_PRIORITY)) { - pri = http->pri; - if (nghttp3_http_parse_priority(&pri, nv->value->base, nv->value->len) == - 0) { - http->pri = pri; - http->flags |= NGHTTP3_HTTP_FLAG_PRIORITY; - } else { - http->flags &= ~NGHTTP3_HTTP_FLAG_PRIORITY; - http->flags |= NGHTTP3_HTTP_FLAG_BAD_PRIORITY; - } + if (!nghttp3_check_header_value(nv->value->base, nv->value->len)) { + return NGHTTP3_ERR_REMOVE_HTTP_HEADER; } + + if (trailers || (http->flags & NGHTTP3_HTTP_FLAG_BAD_PRIORITY)) { + break; + } + + pri = http->pri; + + if (nghttp3_http_parse_priority(&pri, nv->value->base, nv->value->len) == + 0) { + http->pri = pri; + http->flags |= NGHTTP3_HTTP_FLAG_PRIORITY; + break; + } + + http->flags &= ~NGHTTP3_HTTP_FLAG_PRIORITY; + http->flags |= NGHTTP3_HTTP_FLAG_BAD_PRIORITY; + break; default: if (nv->name->base[0] == ':') { return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } + + if (!nghttp3_check_header_value(nv->value->base, nv->value->len)) { + return NGHTTP3_ERR_REMOVE_HTTP_HEADER; + } } return 0; @@ -307,19 +583,10 @@ static int http_request_on_header(nghttp3_http_state *http, static int http_response_on_header(nghttp3_http_state *http, nghttp3_qpack_nv *nv, int trailers) { - if (nv->name->base[0] == ':') { - if (trailers || - (http->flags & NGHTTP3_HTTP_FLAG_PSEUDO_HEADER_DISALLOWED)) { - return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; - } - } - switch (nv->token) { case NGHTTP3_QPACK_TOKEN__STATUS: { - if (!check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__STATUS)) { - return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; - } - if (nv->value->len != 3) { + if (!check_pseudo_header(http, nv, NGHTTP3_HTTP_FLAG__STATUS) || + nv->value->len != 3) { return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } http->status_code = 
(int16_t)parse_uint(nv->value->base, nv->value->len); @@ -340,22 +607,18 @@ static int http_response_on_header(nghttp3_http_state *http, /* content-length header field in 204 response is prohibited by RFC 7230. But some widely used servers send content-length: 0. Until they get fixed, we ignore it. */ - if (http->content_length != -1) { - /* Found multiple content-length field */ - return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; - } - if (!lstrieq("0", nv->value->base, nv->value->len)) { + if (/* Found multiple content-length field */ + http->content_length != -1 || + !lstrieq("0", nv->value->base, nv->value->len)) { return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } http->content_length = 0; return NGHTTP3_ERR_REMOVE_HTTP_HEADER; } - if (http->status_code / 100 == 1) { - return NGHTTP3_ERR_REMOVE_HTTP_HEADER; - } - /* https://tools.ietf.org/html/rfc7230#section-3.3.3 */ - if (http->status_code / 100 == 2 && - (http->flags & NGHTTP3_HTTP_FLAG_METH_CONNECT)) { + if (http->status_code / 100 == 1 || + /* https://tools.ietf.org/html/rfc7230#section-3.3.3 */ + (http->status_code / 100 == 2 && + (http->flags & NGHTTP3_HTTP_FLAG_METH_CONNECT))) { return NGHTTP3_ERR_REMOVE_HTTP_HEADER; } if (http->content_length != -1) { @@ -383,349 +646,50 @@ static int http_response_on_header(nghttp3_http_state *http, if (nv->name->base[0] == ':') { return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } - } - - return 0; -} - -/* Generated by genauthroitychartbl.py */ -static char VALID_AUTHORITY_CHARS[] = { - 0 /* NUL */, 0 /* SOH */, 0 /* STX */, 0 /* ETX */, - 0 /* EOT */, 0 /* ENQ */, 0 /* ACK */, 0 /* BEL */, - 0 /* BS */, 0 /* HT */, 0 /* LF */, 0 /* VT */, - 0 /* FF */, 0 /* CR */, 0 /* SO */, 0 /* SI */, - 0 /* DLE */, 0 /* DC1 */, 0 /* DC2 */, 0 /* DC3 */, - 0 /* DC4 */, 0 /* NAK */, 0 /* SYN */, 0 /* ETB */, - 0 /* CAN */, 0 /* EM */, 0 /* SUB */, 0 /* ESC */, - 0 /* FS */, 0 /* GS */, 0 /* RS */, 0 /* US */, - 0 /* SPC */, 1 /* ! */, 0 /* " */, 0 /* # */, - 1 /* $ */, 1 /* % */, 1 /* & */, 1 /* ' */, - 1 /* ( */, 1 /* ) */, 1 /* * */, 1 /* + */, - 1 /* , */, 1 /* - */, 1 /* . */, 0 /* / */, - 1 /* 0 */, 1 /* 1 */, 1 /* 2 */, 1 /* 3 */, - 1 /* 4 */, 1 /* 5 */, 1 /* 6 */, 1 /* 7 */, - 1 /* 8 */, 1 /* 9 */, 1 /* : */, 1 /* ; */, - 0 /* < */, 1 /* = */, 0 /* > */, 0 /* ? 
*/, - 1 /* @ */, 1 /* A */, 1 /* B */, 1 /* C */, - 1 /* D */, 1 /* E */, 1 /* F */, 1 /* G */, - 1 /* H */, 1 /* I */, 1 /* J */, 1 /* K */, - 1 /* L */, 1 /* M */, 1 /* N */, 1 /* O */, - 1 /* P */, 1 /* Q */, 1 /* R */, 1 /* S */, - 1 /* T */, 1 /* U */, 1 /* V */, 1 /* W */, - 1 /* X */, 1 /* Y */, 1 /* Z */, 1 /* [ */, - 0 /* \ */, 1 /* ] */, 0 /* ^ */, 1 /* _ */, - 0 /* ` */, 1 /* a */, 1 /* b */, 1 /* c */, - 1 /* d */, 1 /* e */, 1 /* f */, 1 /* g */, - 1 /* h */, 1 /* i */, 1 /* j */, 1 /* k */, - 1 /* l */, 1 /* m */, 1 /* n */, 1 /* o */, - 1 /* p */, 1 /* q */, 1 /* r */, 1 /* s */, - 1 /* t */, 1 /* u */, 1 /* v */, 1 /* w */, - 1 /* x */, 1 /* y */, 1 /* z */, 0 /* { */, - 0 /* | */, 0 /* } */, 1 /* ~ */, 0 /* DEL */, - 0 /* 0x80 */, 0 /* 0x81 */, 0 /* 0x82 */, 0 /* 0x83 */, - 0 /* 0x84 */, 0 /* 0x85 */, 0 /* 0x86 */, 0 /* 0x87 */, - 0 /* 0x88 */, 0 /* 0x89 */, 0 /* 0x8a */, 0 /* 0x8b */, - 0 /* 0x8c */, 0 /* 0x8d */, 0 /* 0x8e */, 0 /* 0x8f */, - 0 /* 0x90 */, 0 /* 0x91 */, 0 /* 0x92 */, 0 /* 0x93 */, - 0 /* 0x94 */, 0 /* 0x95 */, 0 /* 0x96 */, 0 /* 0x97 */, - 0 /* 0x98 */, 0 /* 0x99 */, 0 /* 0x9a */, 0 /* 0x9b */, - 0 /* 0x9c */, 0 /* 0x9d */, 0 /* 0x9e */, 0 /* 0x9f */, - 0 /* 0xa0 */, 0 /* 0xa1 */, 0 /* 0xa2 */, 0 /* 0xa3 */, - 0 /* 0xa4 */, 0 /* 0xa5 */, 0 /* 0xa6 */, 0 /* 0xa7 */, - 0 /* 0xa8 */, 0 /* 0xa9 */, 0 /* 0xaa */, 0 /* 0xab */, - 0 /* 0xac */, 0 /* 0xad */, 0 /* 0xae */, 0 /* 0xaf */, - 0 /* 0xb0 */, 0 /* 0xb1 */, 0 /* 0xb2 */, 0 /* 0xb3 */, - 0 /* 0xb4 */, 0 /* 0xb5 */, 0 /* 0xb6 */, 0 /* 0xb7 */, - 0 /* 0xb8 */, 0 /* 0xb9 */, 0 /* 0xba */, 0 /* 0xbb */, - 0 /* 0xbc */, 0 /* 0xbd */, 0 /* 0xbe */, 0 /* 0xbf */, - 0 /* 0xc0 */, 0 /* 0xc1 */, 0 /* 0xc2 */, 0 /* 0xc3 */, - 0 /* 0xc4 */, 0 /* 0xc5 */, 0 /* 0xc6 */, 0 /* 0xc7 */, - 0 /* 0xc8 */, 0 /* 0xc9 */, 0 /* 0xca */, 0 /* 0xcb */, - 0 /* 0xcc */, 0 /* 0xcd */, 0 /* 0xce */, 0 /* 0xcf */, - 0 /* 0xd0 */, 0 /* 0xd1 */, 0 /* 0xd2 */, 0 /* 0xd3 */, - 0 /* 0xd4 */, 0 /* 0xd5 */, 0 /* 0xd6 */, 0 /* 0xd7 */, - 0 /* 0xd8 */, 0 /* 0xd9 */, 0 /* 0xda */, 0 /* 0xdb */, - 0 /* 0xdc */, 0 /* 0xdd */, 0 /* 0xde */, 0 /* 0xdf */, - 0 /* 0xe0 */, 0 /* 0xe1 */, 0 /* 0xe2 */, 0 /* 0xe3 */, - 0 /* 0xe4 */, 0 /* 0xe5 */, 0 /* 0xe6 */, 0 /* 0xe7 */, - 0 /* 0xe8 */, 0 /* 0xe9 */, 0 /* 0xea */, 0 /* 0xeb */, - 0 /* 0xec */, 0 /* 0xed */, 0 /* 0xee */, 0 /* 0xef */, - 0 /* 0xf0 */, 0 /* 0xf1 */, 0 /* 0xf2 */, 0 /* 0xf3 */, - 0 /* 0xf4 */, 0 /* 0xf5 */, 0 /* 0xf6 */, 0 /* 0xf7 */, - 0 /* 0xf8 */, 0 /* 0xf9 */, 0 /* 0xfa */, 0 /* 0xfb */, - 0 /* 0xfc */, 0 /* 0xfd */, 0 /* 0xfe */, 0 /* 0xff */ -}; - -static int check_authority(const uint8_t *value, size_t len) { - const uint8_t *last; - for (last = value + len; value != last; ++value) { - if (!VALID_AUTHORITY_CHARS[*value]) { - return 0; - } - } - return 1; -} - -static int check_scheme(const uint8_t *value, size_t len) { - const uint8_t *last; - if (len == 0) { - return 0; - } - - if (!(('A' <= *value && *value <= 'Z') || ('a' <= *value && *value <= 'z'))) { - return 0; - } - last = value + len; - ++value; - - for (; value != last; ++value) { - if (!(('A' <= *value && *value <= 'Z') || - ('a' <= *value && *value <= 'z') || - ('0' <= *value && *value <= '9') || *value == '+' || *value == '-' || - *value == '.')) { - return 0; + if (!nghttp3_check_header_value(nv->value->base, nv->value->len)) { + return NGHTTP3_ERR_REMOVE_HTTP_HEADER; } } - return 1; -} -/* Generated by genmethodchartbl.py */ -static char VALID_METHOD_CHARS[] = { - 0 /* NUL */, 0 /* SOH */, 0 /* STX */, 0 /* ETX */, 
- 0 /* EOT */, 0 /* ENQ */, 0 /* ACK */, 0 /* BEL */, - 0 /* BS */, 0 /* HT */, 0 /* LF */, 0 /* VT */, - 0 /* FF */, 0 /* CR */, 0 /* SO */, 0 /* SI */, - 0 /* DLE */, 0 /* DC1 */, 0 /* DC2 */, 0 /* DC3 */, - 0 /* DC4 */, 0 /* NAK */, 0 /* SYN */, 0 /* ETB */, - 0 /* CAN */, 0 /* EM */, 0 /* SUB */, 0 /* ESC */, - 0 /* FS */, 0 /* GS */, 0 /* RS */, 0 /* US */, - 0 /* SPC */, 1 /* ! */, 0 /* " */, 1 /* # */, - 1 /* $ */, 1 /* % */, 1 /* & */, 1 /* ' */, - 0 /* ( */, 0 /* ) */, 1 /* * */, 1 /* + */, - 0 /* , */, 1 /* - */, 1 /* . */, 0 /* / */, - 1 /* 0 */, 1 /* 1 */, 1 /* 2 */, 1 /* 3 */, - 1 /* 4 */, 1 /* 5 */, 1 /* 6 */, 1 /* 7 */, - 1 /* 8 */, 1 /* 9 */, 0 /* : */, 0 /* ; */, - 0 /* < */, 0 /* = */, 0 /* > */, 0 /* ? */, - 0 /* @ */, 1 /* A */, 1 /* B */, 1 /* C */, - 1 /* D */, 1 /* E */, 1 /* F */, 1 /* G */, - 1 /* H */, 1 /* I */, 1 /* J */, 1 /* K */, - 1 /* L */, 1 /* M */, 1 /* N */, 1 /* O */, - 1 /* P */, 1 /* Q */, 1 /* R */, 1 /* S */, - 1 /* T */, 1 /* U */, 1 /* V */, 1 /* W */, - 1 /* X */, 1 /* Y */, 1 /* Z */, 0 /* [ */, - 0 /* \ */, 0 /* ] */, 1 /* ^ */, 1 /* _ */, - 1 /* ` */, 1 /* a */, 1 /* b */, 1 /* c */, - 1 /* d */, 1 /* e */, 1 /* f */, 1 /* g */, - 1 /* h */, 1 /* i */, 1 /* j */, 1 /* k */, - 1 /* l */, 1 /* m */, 1 /* n */, 1 /* o */, - 1 /* p */, 1 /* q */, 1 /* r */, 1 /* s */, - 1 /* t */, 1 /* u */, 1 /* v */, 1 /* w */, - 1 /* x */, 1 /* y */, 1 /* z */, 0 /* { */, - 1 /* | */, 0 /* } */, 1 /* ~ */, 0 /* DEL */, - 0 /* 0x80 */, 0 /* 0x81 */, 0 /* 0x82 */, 0 /* 0x83 */, - 0 /* 0x84 */, 0 /* 0x85 */, 0 /* 0x86 */, 0 /* 0x87 */, - 0 /* 0x88 */, 0 /* 0x89 */, 0 /* 0x8a */, 0 /* 0x8b */, - 0 /* 0x8c */, 0 /* 0x8d */, 0 /* 0x8e */, 0 /* 0x8f */, - 0 /* 0x90 */, 0 /* 0x91 */, 0 /* 0x92 */, 0 /* 0x93 */, - 0 /* 0x94 */, 0 /* 0x95 */, 0 /* 0x96 */, 0 /* 0x97 */, - 0 /* 0x98 */, 0 /* 0x99 */, 0 /* 0x9a */, 0 /* 0x9b */, - 0 /* 0x9c */, 0 /* 0x9d */, 0 /* 0x9e */, 0 /* 0x9f */, - 0 /* 0xa0 */, 0 /* 0xa1 */, 0 /* 0xa2 */, 0 /* 0xa3 */, - 0 /* 0xa4 */, 0 /* 0xa5 */, 0 /* 0xa6 */, 0 /* 0xa7 */, - 0 /* 0xa8 */, 0 /* 0xa9 */, 0 /* 0xaa */, 0 /* 0xab */, - 0 /* 0xac */, 0 /* 0xad */, 0 /* 0xae */, 0 /* 0xaf */, - 0 /* 0xb0 */, 0 /* 0xb1 */, 0 /* 0xb2 */, 0 /* 0xb3 */, - 0 /* 0xb4 */, 0 /* 0xb5 */, 0 /* 0xb6 */, 0 /* 0xb7 */, - 0 /* 0xb8 */, 0 /* 0xb9 */, 0 /* 0xba */, 0 /* 0xbb */, - 0 /* 0xbc */, 0 /* 0xbd */, 0 /* 0xbe */, 0 /* 0xbf */, - 0 /* 0xc0 */, 0 /* 0xc1 */, 0 /* 0xc2 */, 0 /* 0xc3 */, - 0 /* 0xc4 */, 0 /* 0xc5 */, 0 /* 0xc6 */, 0 /* 0xc7 */, - 0 /* 0xc8 */, 0 /* 0xc9 */, 0 /* 0xca */, 0 /* 0xcb */, - 0 /* 0xcc */, 0 /* 0xcd */, 0 /* 0xce */, 0 /* 0xcf */, - 0 /* 0xd0 */, 0 /* 0xd1 */, 0 /* 0xd2 */, 0 /* 0xd3 */, - 0 /* 0xd4 */, 0 /* 0xd5 */, 0 /* 0xd6 */, 0 /* 0xd7 */, - 0 /* 0xd8 */, 0 /* 0xd9 */, 0 /* 0xda */, 0 /* 0xdb */, - 0 /* 0xdc */, 0 /* 0xdd */, 0 /* 0xde */, 0 /* 0xdf */, - 0 /* 0xe0 */, 0 /* 0xe1 */, 0 /* 0xe2 */, 0 /* 0xe3 */, - 0 /* 0xe4 */, 0 /* 0xe5 */, 0 /* 0xe6 */, 0 /* 0xe7 */, - 0 /* 0xe8 */, 0 /* 0xe9 */, 0 /* 0xea */, 0 /* 0xeb */, - 0 /* 0xec */, 0 /* 0xed */, 0 /* 0xee */, 0 /* 0xef */, - 0 /* 0xf0 */, 0 /* 0xf1 */, 0 /* 0xf2 */, 0 /* 0xf3 */, - 0 /* 0xf4 */, 0 /* 0xf5 */, 0 /* 0xf6 */, 0 /* 0xf7 */, - 0 /* 0xf8 */, 0 /* 0xf9 */, 0 /* 0xfa */, 0 /* 0xfb */, - 0 /* 0xfc */, 0 /* 0xfd */, 0 /* 0xfe */, 0 /* 0xff */ -}; - -static int check_method(const uint8_t *value, size_t len) { - const uint8_t *last; - if (len == 0) { - return 0; - } - for (last = value + len; value != last; ++value) { - if (!VALID_METHOD_CHARS[*value]) { - return 0; 
- } - } - return 1; + return 0; } -/* Generated by genpathchartbl.py */ -static char VALID_PATH_CHARS[] = { - 0 /* NUL */, 0 /* SOH */, 0 /* STX */, 0 /* ETX */, - 0 /* EOT */, 0 /* ENQ */, 0 /* ACK */, 0 /* BEL */, - 0 /* BS */, 0 /* HT */, 0 /* LF */, 0 /* VT */, - 0 /* FF */, 0 /* CR */, 0 /* SO */, 0 /* SI */, - 0 /* DLE */, 0 /* DC1 */, 0 /* DC2 */, 0 /* DC3 */, - 0 /* DC4 */, 0 /* NAK */, 0 /* SYN */, 0 /* ETB */, - 0 /* CAN */, 0 /* EM */, 0 /* SUB */, 0 /* ESC */, - 0 /* FS */, 0 /* GS */, 0 /* RS */, 0 /* US */, - 0 /* SPC */, 1 /* ! */, 1 /* " */, 1 /* # */, - 1 /* $ */, 1 /* % */, 1 /* & */, 1 /* ' */, - 1 /* ( */, 1 /* ) */, 1 /* * */, 1 /* + */, - 1 /* , */, 1 /* - */, 1 /* . */, 1 /* / */, - 1 /* 0 */, 1 /* 1 */, 1 /* 2 */, 1 /* 3 */, - 1 /* 4 */, 1 /* 5 */, 1 /* 6 */, 1 /* 7 */, - 1 /* 8 */, 1 /* 9 */, 1 /* : */, 1 /* ; */, - 1 /* < */, 1 /* = */, 1 /* > */, 1 /* ? */, - 1 /* @ */, 1 /* A */, 1 /* B */, 1 /* C */, - 1 /* D */, 1 /* E */, 1 /* F */, 1 /* G */, - 1 /* H */, 1 /* I */, 1 /* J */, 1 /* K */, - 1 /* L */, 1 /* M */, 1 /* N */, 1 /* O */, - 1 /* P */, 1 /* Q */, 1 /* R */, 1 /* S */, - 1 /* T */, 1 /* U */, 1 /* V */, 1 /* W */, - 1 /* X */, 1 /* Y */, 1 /* Z */, 1 /* [ */, - 1 /* \ */, 1 /* ] */, 1 /* ^ */, 1 /* _ */, - 1 /* ` */, 1 /* a */, 1 /* b */, 1 /* c */, - 1 /* d */, 1 /* e */, 1 /* f */, 1 /* g */, - 1 /* h */, 1 /* i */, 1 /* j */, 1 /* k */, - 1 /* l */, 1 /* m */, 1 /* n */, 1 /* o */, - 1 /* p */, 1 /* q */, 1 /* r */, 1 /* s */, - 1 /* t */, 1 /* u */, 1 /* v */, 1 /* w */, - 1 /* x */, 1 /* y */, 1 /* z */, 1 /* { */, - 1 /* | */, 1 /* } */, 1 /* ~ */, 0 /* DEL */, - 1 /* 0x80 */, 1 /* 0x81 */, 1 /* 0x82 */, 1 /* 0x83 */, - 1 /* 0x84 */, 1 /* 0x85 */, 1 /* 0x86 */, 1 /* 0x87 */, - 1 /* 0x88 */, 1 /* 0x89 */, 1 /* 0x8a */, 1 /* 0x8b */, - 1 /* 0x8c */, 1 /* 0x8d */, 1 /* 0x8e */, 1 /* 0x8f */, - 1 /* 0x90 */, 1 /* 0x91 */, 1 /* 0x92 */, 1 /* 0x93 */, - 1 /* 0x94 */, 1 /* 0x95 */, 1 /* 0x96 */, 1 /* 0x97 */, - 1 /* 0x98 */, 1 /* 0x99 */, 1 /* 0x9a */, 1 /* 0x9b */, - 1 /* 0x9c */, 1 /* 0x9d */, 1 /* 0x9e */, 1 /* 0x9f */, - 1 /* 0xa0 */, 1 /* 0xa1 */, 1 /* 0xa2 */, 1 /* 0xa3 */, - 1 /* 0xa4 */, 1 /* 0xa5 */, 1 /* 0xa6 */, 1 /* 0xa7 */, - 1 /* 0xa8 */, 1 /* 0xa9 */, 1 /* 0xaa */, 1 /* 0xab */, - 1 /* 0xac */, 1 /* 0xad */, 1 /* 0xae */, 1 /* 0xaf */, - 1 /* 0xb0 */, 1 /* 0xb1 */, 1 /* 0xb2 */, 1 /* 0xb3 */, - 1 /* 0xb4 */, 1 /* 0xb5 */, 1 /* 0xb6 */, 1 /* 0xb7 */, - 1 /* 0xb8 */, 1 /* 0xb9 */, 1 /* 0xba */, 1 /* 0xbb */, - 1 /* 0xbc */, 1 /* 0xbd */, 1 /* 0xbe */, 1 /* 0xbf */, - 1 /* 0xc0 */, 1 /* 0xc1 */, 1 /* 0xc2 */, 1 /* 0xc3 */, - 1 /* 0xc4 */, 1 /* 0xc5 */, 1 /* 0xc6 */, 1 /* 0xc7 */, - 1 /* 0xc8 */, 1 /* 0xc9 */, 1 /* 0xca */, 1 /* 0xcb */, - 1 /* 0xcc */, 1 /* 0xcd */, 1 /* 0xce */, 1 /* 0xcf */, - 1 /* 0xd0 */, 1 /* 0xd1 */, 1 /* 0xd2 */, 1 /* 0xd3 */, - 1 /* 0xd4 */, 1 /* 0xd5 */, 1 /* 0xd6 */, 1 /* 0xd7 */, - 1 /* 0xd8 */, 1 /* 0xd9 */, 1 /* 0xda */, 1 /* 0xdb */, - 1 /* 0xdc */, 1 /* 0xdd */, 1 /* 0xde */, 1 /* 0xdf */, - 1 /* 0xe0 */, 1 /* 0xe1 */, 1 /* 0xe2 */, 1 /* 0xe3 */, - 1 /* 0xe4 */, 1 /* 0xe5 */, 1 /* 0xe6 */, 1 /* 0xe7 */, - 1 /* 0xe8 */, 1 /* 0xe9 */, 1 /* 0xea */, 1 /* 0xeb */, - 1 /* 0xec */, 1 /* 0xed */, 1 /* 0xee */, 1 /* 0xef */, - 1 /* 0xf0 */, 1 /* 0xf1 */, 1 /* 0xf2 */, 1 /* 0xf3 */, - 1 /* 0xf4 */, 1 /* 0xf5 */, 1 /* 0xf6 */, 1 /* 0xf7 */, - 1 /* 0xf8 */, 1 /* 0xf9 */, 1 /* 0xfa */, 1 /* 0xfb */, - 1 /* 0xfc */, 1 /* 0xfd */, 1 /* 0xfe */, 1 /* 0xff */ -}; - -static int check_path(const uint8_t *value, size_t len) { 
- const uint8_t *last; - for (last = value + len; value != last; ++value) { - if (!VALID_PATH_CHARS[*value]) { - return 0; - } - } - return 1; -} +static int http_check_nonempty_header_name(const uint8_t *name, size_t len); int nghttp3_http_on_header(nghttp3_http_state *http, nghttp3_qpack_nv *nv, int request, int trailers, int connect_protocol) { - int rv; - size_t i; - uint8_t c; - - if (!nghttp3_check_header_name(nv->name->base, nv->name->len)) { - if (nv->name->len > 0 && nv->name->base[0] == ':') { - return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; - } - /* header field name must be lower-cased without exception */ - for (i = 0; i < nv->name->len; ++i) { - c = nv->name->base[i]; - if ('A' <= c && c <= 'Z') { - return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; - } - } - /* When ignoring regular header fields, we set this flag so that - we still enforce header field ordering rule for pseudo header - fields. */ + if (nv->name->len == 0) { http->flags |= NGHTTP3_HTTP_FLAG_PSEUDO_HEADER_DISALLOWED; + return NGHTTP3_ERR_REMOVE_HTTP_HEADER; } - assert(nv->name->len > 0); - - switch (nv->token) { - case NGHTTP3_QPACK_TOKEN__METHOD: - rv = check_method(nv->value->base, nv->value->len); - break; - case NGHTTP3_QPACK_TOKEN__SCHEME: - rv = check_scheme(nv->value->base, nv->value->len); - break; - case NGHTTP3_QPACK_TOKEN__AUTHORITY: - case NGHTTP3_QPACK_TOKEN_HOST: - if (request) { - rv = check_authority(nv->value->base, nv->value->len); - } else { - /* The use of host field in response field section is - undefined. */ - rv = nghttp3_check_header_value(nv->value->base, nv->value->len); + if (nv->name->base[0] == ':') { + /* pseudo header must have a valid token. */ + if (nv->token == -1 || trailers || + (http->flags & NGHTTP3_HTTP_FLAG_PSEUDO_HEADER_DISALLOWED)) { + return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } - break; - case NGHTTP3_QPACK_TOKEN__PATH: - rv = check_path(nv->value->base, nv->value->len); - break; - default: - rv = nghttp3_check_header_value(nv->value->base, nv->value->len); - } + } else { + http->flags |= NGHTTP3_HTTP_FLAG_PSEUDO_HEADER_DISALLOWED; - if (rv == 0) { - if (nv->name->base[0] == ':') { + switch (http_check_nonempty_header_name(nv->name->base, nv->name->len)) { + case 0: + return NGHTTP3_ERR_REMOVE_HTTP_HEADER; + case -1: + /* header field name must be lower-cased without exception */ return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; } - /* When ignoring regular header fields, we set this flag so that - we still enforce header field ordering rule for pseudo header - fields. 
*/ - http->flags |= NGHTTP3_HTTP_FLAG_PSEUDO_HEADER_DISALLOWED; - return NGHTTP3_ERR_REMOVE_HTTP_HEADER; } - if (request) { - rv = http_request_on_header(http, nv, trailers, connect_protocol); - } else { - rv = http_response_on_header(http, nv, trailers); - } + assert(nv->name->len > 0); - if (nv->name->base[0] != ':') { - switch (rv) { - case 0: - case NGHTTP3_ERR_REMOVE_HTTP_HEADER: - http->flags |= NGHTTP3_HTTP_FLAG_PSEUDO_HEADER_DISALLOWED; - break; - } + if (request) { + return http_request_on_header(http, nv, trailers, connect_protocol); } - return rv; + return http_response_on_header(http, nv, trailers); } int nghttp3_http_on_request_headers(nghttp3_http_state *http) { @@ -738,7 +702,7 @@ int nghttp3_http_on_request_headers(nghttp3_http_state *http) { http->content_length = -1; } else { if ((http->flags & NGHTTP3_HTTP_FLAG_REQ_HEADERS) != - NGHTTP3_HTTP_FLAG_REQ_HEADERS || + NGHTTP3_HTTP_FLAG_REQ_HEADERS || (http->flags & (NGHTTP3_HTTP_FLAG__AUTHORITY | NGHTTP3_HTTP_FLAG_HOST)) == 0) { return NGHTTP3_ERR_MALFORMED_HTTP_HEADER; @@ -829,70 +793,58 @@ void nghttp3_http_record_request_method(nghttp3_stream *stream, /* Generated by gennmchartbl.py */ static const int VALID_HD_NAME_CHARS[] = { - 0 /* NUL */, 0 /* SOH */, 0 /* STX */, 0 /* ETX */, - 0 /* EOT */, 0 /* ENQ */, 0 /* ACK */, 0 /* BEL */, - 0 /* BS */, 0 /* HT */, 0 /* LF */, 0 /* VT */, - 0 /* FF */, 0 /* CR */, 0 /* SO */, 0 /* SI */, - 0 /* DLE */, 0 /* DC1 */, 0 /* DC2 */, 0 /* DC3 */, - 0 /* DC4 */, 0 /* NAK */, 0 /* SYN */, 0 /* ETB */, - 0 /* CAN */, 0 /* EM */, 0 /* SUB */, 0 /* ESC */, - 0 /* FS */, 0 /* GS */, 0 /* RS */, 0 /* US */, - 0 /* SPC */, 1 /* ! */, 0 /* " */, 1 /* # */, - 1 /* $ */, 1 /* % */, 1 /* & */, 1 /* ' */, - 0 /* ( */, 0 /* ) */, 1 /* * */, 1 /* + */, - 0 /* , */, 1 /* - */, 1 /* . */, 0 /* / */, - 1 /* 0 */, 1 /* 1 */, 1 /* 2 */, 1 /* 3 */, - 1 /* 4 */, 1 /* 5 */, 1 /* 6 */, 1 /* 7 */, - 1 /* 8 */, 1 /* 9 */, 0 /* : */, 0 /* ; */, - 0 /* < */, 0 /* = */, 0 /* > */, 0 /* ? 
*/, - 0 /* @ */, 0 /* A */, 0 /* B */, 0 /* C */, - 0 /* D */, 0 /* E */, 0 /* F */, 0 /* G */, - 0 /* H */, 0 /* I */, 0 /* J */, 0 /* K */, - 0 /* L */, 0 /* M */, 0 /* N */, 0 /* O */, - 0 /* P */, 0 /* Q */, 0 /* R */, 0 /* S */, - 0 /* T */, 0 /* U */, 0 /* V */, 0 /* W */, - 0 /* X */, 0 /* Y */, 0 /* Z */, 0 /* [ */, - 0 /* \ */, 0 /* ] */, 1 /* ^ */, 1 /* _ */, - 1 /* ` */, 1 /* a */, 1 /* b */, 1 /* c */, - 1 /* d */, 1 /* e */, 1 /* f */, 1 /* g */, - 1 /* h */, 1 /* i */, 1 /* j */, 1 /* k */, - 1 /* l */, 1 /* m */, 1 /* n */, 1 /* o */, - 1 /* p */, 1 /* q */, 1 /* r */, 1 /* s */, - 1 /* t */, 1 /* u */, 1 /* v */, 1 /* w */, - 1 /* x */, 1 /* y */, 1 /* z */, 0 /* { */, - 1 /* | */, 0 /* } */, 1 /* ~ */, 0 /* DEL */, - 0 /* 0x80 */, 0 /* 0x81 */, 0 /* 0x82 */, 0 /* 0x83 */, - 0 /* 0x84 */, 0 /* 0x85 */, 0 /* 0x86 */, 0 /* 0x87 */, - 0 /* 0x88 */, 0 /* 0x89 */, 0 /* 0x8a */, 0 /* 0x8b */, - 0 /* 0x8c */, 0 /* 0x8d */, 0 /* 0x8e */, 0 /* 0x8f */, - 0 /* 0x90 */, 0 /* 0x91 */, 0 /* 0x92 */, 0 /* 0x93 */, - 0 /* 0x94 */, 0 /* 0x95 */, 0 /* 0x96 */, 0 /* 0x97 */, - 0 /* 0x98 */, 0 /* 0x99 */, 0 /* 0x9a */, 0 /* 0x9b */, - 0 /* 0x9c */, 0 /* 0x9d */, 0 /* 0x9e */, 0 /* 0x9f */, - 0 /* 0xa0 */, 0 /* 0xa1 */, 0 /* 0xa2 */, 0 /* 0xa3 */, - 0 /* 0xa4 */, 0 /* 0xa5 */, 0 /* 0xa6 */, 0 /* 0xa7 */, - 0 /* 0xa8 */, 0 /* 0xa9 */, 0 /* 0xaa */, 0 /* 0xab */, - 0 /* 0xac */, 0 /* 0xad */, 0 /* 0xae */, 0 /* 0xaf */, - 0 /* 0xb0 */, 0 /* 0xb1 */, 0 /* 0xb2 */, 0 /* 0xb3 */, - 0 /* 0xb4 */, 0 /* 0xb5 */, 0 /* 0xb6 */, 0 /* 0xb7 */, - 0 /* 0xb8 */, 0 /* 0xb9 */, 0 /* 0xba */, 0 /* 0xbb */, - 0 /* 0xbc */, 0 /* 0xbd */, 0 /* 0xbe */, 0 /* 0xbf */, - 0 /* 0xc0 */, 0 /* 0xc1 */, 0 /* 0xc2 */, 0 /* 0xc3 */, - 0 /* 0xc4 */, 0 /* 0xc5 */, 0 /* 0xc6 */, 0 /* 0xc7 */, - 0 /* 0xc8 */, 0 /* 0xc9 */, 0 /* 0xca */, 0 /* 0xcb */, - 0 /* 0xcc */, 0 /* 0xcd */, 0 /* 0xce */, 0 /* 0xcf */, - 0 /* 0xd0 */, 0 /* 0xd1 */, 0 /* 0xd2 */, 0 /* 0xd3 */, - 0 /* 0xd4 */, 0 /* 0xd5 */, 0 /* 0xd6 */, 0 /* 0xd7 */, - 0 /* 0xd8 */, 0 /* 0xd9 */, 0 /* 0xda */, 0 /* 0xdb */, - 0 /* 0xdc */, 0 /* 0xdd */, 0 /* 0xde */, 0 /* 0xdf */, - 0 /* 0xe0 */, 0 /* 0xe1 */, 0 /* 0xe2 */, 0 /* 0xe3 */, - 0 /* 0xe4 */, 0 /* 0xe5 */, 0 /* 0xe6 */, 0 /* 0xe7 */, - 0 /* 0xe8 */, 0 /* 0xe9 */, 0 /* 0xea */, 0 /* 0xeb */, - 0 /* 0xec */, 0 /* 0xed */, 0 /* 0xee */, 0 /* 0xef */, - 0 /* 0xf0 */, 0 /* 0xf1 */, 0 /* 0xf2 */, 0 /* 0xf3 */, - 0 /* 0xf4 */, 0 /* 0xf5 */, 0 /* 0xf6 */, 0 /* 0xf7 */, - 0 /* 0xf8 */, 0 /* 0xf9 */, 0 /* 0xfa */, 0 /* 0xfb */, - 0 /* 0xfc */, 0 /* 0xfd */, 0 /* 0xfe */, 0 /* 0xff */ + 0 /* NUL */, 0 /* SOH */, 0 /* STX */, 0 /* ETX */, 0 /* EOT */, + 0 /* ENQ */, 0 /* ACK */, 0 /* BEL */, 0 /* BS */, 0 /* HT */, + 0 /* LF */, 0 /* VT */, 0 /* FF */, 0 /* CR */, 0 /* SO */, + 0 /* SI */, 0 /* DLE */, 0 /* DC1 */, 0 /* DC2 */, 0 /* DC3 */, + 0 /* DC4 */, 0 /* NAK */, 0 /* SYN */, 0 /* ETB */, 0 /* CAN */, + 0 /* EM */, 0 /* SUB */, 0 /* ESC */, 0 /* FS */, 0 /* GS */, + 0 /* RS */, 0 /* US */, 0 /* SPC */, 1 /* ! */, 0 /* " */, + 1 /* # */, 1 /* $ */, 1 /* % */, 1 /* & */, 1 /* ' */, + 0 /* ( */, 0 /* ) */, 1 /* * */, 1 /* + */, 0 /* , */, + 1 /* - */, 1 /* . */, 0 /* / */, 1 /* 0 */, 1 /* 1 */, + 1 /* 2 */, 1 /* 3 */, 1 /* 4 */, 1 /* 5 */, 1 /* 6 */, + 1 /* 7 */, 1 /* 8 */, 1 /* 9 */, 0 /* : */, 0 /* ; */, + 0 /* < */, 0 /* = */, 0 /* > */, 0 /* ? 
*/, 0 /* @ */, + -1 /* A */, -1 /* B */, -1 /* C */, -1 /* D */, -1 /* E */, + -1 /* F */, -1 /* G */, -1 /* H */, -1 /* I */, -1 /* J */, + -1 /* K */, -1 /* L */, -1 /* M */, -1 /* N */, -1 /* O */, + -1 /* P */, -1 /* Q */, -1 /* R */, -1 /* S */, -1 /* T */, + -1 /* U */, -1 /* V */, -1 /* W */, -1 /* X */, -1 /* Y */, + -1 /* Z */, 0 /* [ */, 0 /* \ */, 0 /* ] */, 1 /* ^ */, + 1 /* _ */, 1 /* ` */, 1 /* a */, 1 /* b */, 1 /* c */, + 1 /* d */, 1 /* e */, 1 /* f */, 1 /* g */, 1 /* h */, + 1 /* i */, 1 /* j */, 1 /* k */, 1 /* l */, 1 /* m */, + 1 /* n */, 1 /* o */, 1 /* p */, 1 /* q */, 1 /* r */, + 1 /* s */, 1 /* t */, 1 /* u */, 1 /* v */, 1 /* w */, + 1 /* x */, 1 /* y */, 1 /* z */, 0 /* { */, 1 /* | */, + 0 /* } */, 1 /* ~ */, 0 /* DEL */, 0 /* 0x80 */, 0 /* 0x81 */, + 0 /* 0x82 */, 0 /* 0x83 */, 0 /* 0x84 */, 0 /* 0x85 */, 0 /* 0x86 */, + 0 /* 0x87 */, 0 /* 0x88 */, 0 /* 0x89 */, 0 /* 0x8a */, 0 /* 0x8b */, + 0 /* 0x8c */, 0 /* 0x8d */, 0 /* 0x8e */, 0 /* 0x8f */, 0 /* 0x90 */, + 0 /* 0x91 */, 0 /* 0x92 */, 0 /* 0x93 */, 0 /* 0x94 */, 0 /* 0x95 */, + 0 /* 0x96 */, 0 /* 0x97 */, 0 /* 0x98 */, 0 /* 0x99 */, 0 /* 0x9a */, + 0 /* 0x9b */, 0 /* 0x9c */, 0 /* 0x9d */, 0 /* 0x9e */, 0 /* 0x9f */, + 0 /* 0xa0 */, 0 /* 0xa1 */, 0 /* 0xa2 */, 0 /* 0xa3 */, 0 /* 0xa4 */, + 0 /* 0xa5 */, 0 /* 0xa6 */, 0 /* 0xa7 */, 0 /* 0xa8 */, 0 /* 0xa9 */, + 0 /* 0xaa */, 0 /* 0xab */, 0 /* 0xac */, 0 /* 0xad */, 0 /* 0xae */, + 0 /* 0xaf */, 0 /* 0xb0 */, 0 /* 0xb1 */, 0 /* 0xb2 */, 0 /* 0xb3 */, + 0 /* 0xb4 */, 0 /* 0xb5 */, 0 /* 0xb6 */, 0 /* 0xb7 */, 0 /* 0xb8 */, + 0 /* 0xb9 */, 0 /* 0xba */, 0 /* 0xbb */, 0 /* 0xbc */, 0 /* 0xbd */, + 0 /* 0xbe */, 0 /* 0xbf */, 0 /* 0xc0 */, 0 /* 0xc1 */, 0 /* 0xc2 */, + 0 /* 0xc3 */, 0 /* 0xc4 */, 0 /* 0xc5 */, 0 /* 0xc6 */, 0 /* 0xc7 */, + 0 /* 0xc8 */, 0 /* 0xc9 */, 0 /* 0xca */, 0 /* 0xcb */, 0 /* 0xcc */, + 0 /* 0xcd */, 0 /* 0xce */, 0 /* 0xcf */, 0 /* 0xd0 */, 0 /* 0xd1 */, + 0 /* 0xd2 */, 0 /* 0xd3 */, 0 /* 0xd4 */, 0 /* 0xd5 */, 0 /* 0xd6 */, + 0 /* 0xd7 */, 0 /* 0xd8 */, 0 /* 0xd9 */, 0 /* 0xda */, 0 /* 0xdb */, + 0 /* 0xdc */, 0 /* 0xdd */, 0 /* 0xde */, 0 /* 0xdf */, 0 /* 0xe0 */, + 0 /* 0xe1 */, 0 /* 0xe2 */, 0 /* 0xe3 */, 0 /* 0xe4 */, 0 /* 0xe5 */, + 0 /* 0xe6 */, 0 /* 0xe7 */, 0 /* 0xe8 */, 0 /* 0xe9 */, 0 /* 0xea */, + 0 /* 0xeb */, 0 /* 0xec */, 0 /* 0xed */, 0 /* 0xee */, 0 /* 0xef */, + 0 /* 0xf0 */, 0 /* 0xf1 */, 0 /* 0xf2 */, 0 /* 0xf3 */, 0 /* 0xf4 */, + 0 /* 0xf5 */, 0 /* 0xf6 */, 0 /* 0xf7 */, 0 /* 0xf8 */, 0 /* 0xf9 */, + 0 /* 0xfa */, 0 /* 0xfb */, 0 /* 0xfc */, 0 /* 0xfd */, 0 /* 0xfe */, + 0 /* 0xff */, }; int nghttp3_check_header_name(const uint8_t *name, size_t len) { @@ -915,76 +867,125 @@ int nghttp3_check_header_name(const uint8_t *name, size_t len) { return 1; } +/* http_check_nonempty_header_name validates regular header name + pointed by |name| of length |len|. |len| must be greater than + zero. This function returns 1 if it succeeds, or -1 if the name + contains a character in [A-Z], otherwise 0. 
*/ +static int http_check_nonempty_header_name(const uint8_t *name, size_t len) { + const uint8_t *last; + int rv; + + for (last = name + len; name != last; ++name) { + rv = VALID_HD_NAME_CHARS[*name]; + if (rv != 1) { + return rv; + } + } + + return 1; +} + /* Generated by genvchartbl.py */ static const int VALID_HD_VALUE_CHARS[] = { - 0 /* NUL */, 0 /* SOH */, 0 /* STX */, 0 /* ETX */, - 0 /* EOT */, 0 /* ENQ */, 0 /* ACK */, 0 /* BEL */, - 0 /* BS */, 1 /* HT */, 0 /* LF */, 0 /* VT */, - 0 /* FF */, 0 /* CR */, 0 /* SO */, 0 /* SI */, - 0 /* DLE */, 0 /* DC1 */, 0 /* DC2 */, 0 /* DC3 */, - 0 /* DC4 */, 0 /* NAK */, 0 /* SYN */, 0 /* ETB */, - 0 /* CAN */, 0 /* EM */, 0 /* SUB */, 0 /* ESC */, - 0 /* FS */, 0 /* GS */, 0 /* RS */, 0 /* US */, - 1 /* SPC */, 1 /* ! */, 1 /* " */, 1 /* # */, - 1 /* $ */, 1 /* % */, 1 /* & */, 1 /* ' */, - 1 /* ( */, 1 /* ) */, 1 /* * */, 1 /* + */, - 1 /* , */, 1 /* - */, 1 /* . */, 1 /* / */, - 1 /* 0 */, 1 /* 1 */, 1 /* 2 */, 1 /* 3 */, - 1 /* 4 */, 1 /* 5 */, 1 /* 6 */, 1 /* 7 */, - 1 /* 8 */, 1 /* 9 */, 1 /* : */, 1 /* ; */, - 1 /* < */, 1 /* = */, 1 /* > */, 1 /* ? */, - 1 /* @ */, 1 /* A */, 1 /* B */, 1 /* C */, - 1 /* D */, 1 /* E */, 1 /* F */, 1 /* G */, - 1 /* H */, 1 /* I */, 1 /* J */, 1 /* K */, - 1 /* L */, 1 /* M */, 1 /* N */, 1 /* O */, - 1 /* P */, 1 /* Q */, 1 /* R */, 1 /* S */, - 1 /* T */, 1 /* U */, 1 /* V */, 1 /* W */, - 1 /* X */, 1 /* Y */, 1 /* Z */, 1 /* [ */, - 1 /* \ */, 1 /* ] */, 1 /* ^ */, 1 /* _ */, - 1 /* ` */, 1 /* a */, 1 /* b */, 1 /* c */, - 1 /* d */, 1 /* e */, 1 /* f */, 1 /* g */, - 1 /* h */, 1 /* i */, 1 /* j */, 1 /* k */, - 1 /* l */, 1 /* m */, 1 /* n */, 1 /* o */, - 1 /* p */, 1 /* q */, 1 /* r */, 1 /* s */, - 1 /* t */, 1 /* u */, 1 /* v */, 1 /* w */, - 1 /* x */, 1 /* y */, 1 /* z */, 1 /* { */, - 1 /* | */, 1 /* } */, 1 /* ~ */, 0 /* DEL */, - 1 /* 0x80 */, 1 /* 0x81 */, 1 /* 0x82 */, 1 /* 0x83 */, - 1 /* 0x84 */, 1 /* 0x85 */, 1 /* 0x86 */, 1 /* 0x87 */, - 1 /* 0x88 */, 1 /* 0x89 */, 1 /* 0x8a */, 1 /* 0x8b */, - 1 /* 0x8c */, 1 /* 0x8d */, 1 /* 0x8e */, 1 /* 0x8f */, - 1 /* 0x90 */, 1 /* 0x91 */, 1 /* 0x92 */, 1 /* 0x93 */, - 1 /* 0x94 */, 1 /* 0x95 */, 1 /* 0x96 */, 1 /* 0x97 */, - 1 /* 0x98 */, 1 /* 0x99 */, 1 /* 0x9a */, 1 /* 0x9b */, - 1 /* 0x9c */, 1 /* 0x9d */, 1 /* 0x9e */, 1 /* 0x9f */, - 1 /* 0xa0 */, 1 /* 0xa1 */, 1 /* 0xa2 */, 1 /* 0xa3 */, - 1 /* 0xa4 */, 1 /* 0xa5 */, 1 /* 0xa6 */, 1 /* 0xa7 */, - 1 /* 0xa8 */, 1 /* 0xa9 */, 1 /* 0xaa */, 1 /* 0xab */, - 1 /* 0xac */, 1 /* 0xad */, 1 /* 0xae */, 1 /* 0xaf */, - 1 /* 0xb0 */, 1 /* 0xb1 */, 1 /* 0xb2 */, 1 /* 0xb3 */, - 1 /* 0xb4 */, 1 /* 0xb5 */, 1 /* 0xb6 */, 1 /* 0xb7 */, - 1 /* 0xb8 */, 1 /* 0xb9 */, 1 /* 0xba */, 1 /* 0xbb */, - 1 /* 0xbc */, 1 /* 0xbd */, 1 /* 0xbe */, 1 /* 0xbf */, - 1 /* 0xc0 */, 1 /* 0xc1 */, 1 /* 0xc2 */, 1 /* 0xc3 */, - 1 /* 0xc4 */, 1 /* 0xc5 */, 1 /* 0xc6 */, 1 /* 0xc7 */, - 1 /* 0xc8 */, 1 /* 0xc9 */, 1 /* 0xca */, 1 /* 0xcb */, - 1 /* 0xcc */, 1 /* 0xcd */, 1 /* 0xce */, 1 /* 0xcf */, - 1 /* 0xd0 */, 1 /* 0xd1 */, 1 /* 0xd2 */, 1 /* 0xd3 */, - 1 /* 0xd4 */, 1 /* 0xd5 */, 1 /* 0xd6 */, 1 /* 0xd7 */, - 1 /* 0xd8 */, 1 /* 0xd9 */, 1 /* 0xda */, 1 /* 0xdb */, - 1 /* 0xdc */, 1 /* 0xdd */, 1 /* 0xde */, 1 /* 0xdf */, - 1 /* 0xe0 */, 1 /* 0xe1 */, 1 /* 0xe2 */, 1 /* 0xe3 */, - 1 /* 0xe4 */, 1 /* 0xe5 */, 1 /* 0xe6 */, 1 /* 0xe7 */, - 1 /* 0xe8 */, 1 /* 0xe9 */, 1 /* 0xea */, 1 /* 0xeb */, - 1 /* 0xec */, 1 /* 0xed */, 1 /* 0xee */, 1 /* 0xef */, - 1 /* 0xf0 */, 1 /* 0xf1 */, 1 /* 0xf2 */, 1 /* 0xf3 */, - 1 /* 
0xf4 */, 1 /* 0xf5 */, 1 /* 0xf6 */, 1 /* 0xf7 */, - 1 /* 0xf8 */, 1 /* 0xf9 */, 1 /* 0xfa */, 1 /* 0xfb */, - 1 /* 0xfc */, 1 /* 0xfd */, 1 /* 0xfe */, 1 /* 0xff */ + 0 /* NUL */, 0 /* SOH */, 0 /* STX */, 0 /* ETX */, + 0 /* EOT */, 0 /* ENQ */, 0 /* ACK */, 0 /* BEL */, + 0 /* BS */, 1 /* HT */, 0 /* LF */, 0 /* VT */, + 0 /* FF */, 0 /* CR */, 0 /* SO */, 0 /* SI */, + 0 /* DLE */, 0 /* DC1 */, 0 /* DC2 */, 0 /* DC3 */, + 0 /* DC4 */, 0 /* NAK */, 0 /* SYN */, 0 /* ETB */, + 0 /* CAN */, 0 /* EM */, 0 /* SUB */, 0 /* ESC */, + 0 /* FS */, 0 /* GS */, 0 /* RS */, 0 /* US */, + 1 /* SPC */, 1 /* ! */, 1 /* " */, 1 /* # */, + 1 /* $ */, 1 /* % */, 1 /* & */, 1 /* ' */, + 1 /* ( */, 1 /* ) */, 1 /* * */, 1 /* + */, + 1 /* , */, 1 /* - */, 1 /* . */, 1 /* / */, + 1 /* 0 */, 1 /* 1 */, 1 /* 2 */, 1 /* 3 */, + 1 /* 4 */, 1 /* 5 */, 1 /* 6 */, 1 /* 7 */, + 1 /* 8 */, 1 /* 9 */, 1 /* : */, 1 /* ; */, + 1 /* < */, 1 /* = */, 1 /* > */, 1 /* ? */, + 1 /* @ */, 1 /* A */, 1 /* B */, 1 /* C */, + 1 /* D */, 1 /* E */, 1 /* F */, 1 /* G */, + 1 /* H */, 1 /* I */, 1 /* J */, 1 /* K */, + 1 /* L */, 1 /* M */, 1 /* N */, 1 /* O */, + 1 /* P */, 1 /* Q */, 1 /* R */, 1 /* S */, + 1 /* T */, 1 /* U */, 1 /* V */, 1 /* W */, + 1 /* X */, 1 /* Y */, 1 /* Z */, 1 /* [ */, + 1 /* \ */, 1 /* ] */, 1 /* ^ */, 1 /* _ */, + 1 /* ` */, 1 /* a */, 1 /* b */, 1 /* c */, + 1 /* d */, 1 /* e */, 1 /* f */, 1 /* g */, + 1 /* h */, 1 /* i */, 1 /* j */, 1 /* k */, + 1 /* l */, 1 /* m */, 1 /* n */, 1 /* o */, + 1 /* p */, 1 /* q */, 1 /* r */, 1 /* s */, + 1 /* t */, 1 /* u */, 1 /* v */, 1 /* w */, + 1 /* x */, 1 /* y */, 1 /* z */, 1 /* { */, + 1 /* | */, 1 /* } */, 1 /* ~ */, 0 /* DEL */, + 1 /* 0x80 */, 1 /* 0x81 */, 1 /* 0x82 */, 1 /* 0x83 */, + 1 /* 0x84 */, 1 /* 0x85 */, 1 /* 0x86 */, 1 /* 0x87 */, + 1 /* 0x88 */, 1 /* 0x89 */, 1 /* 0x8a */, 1 /* 0x8b */, + 1 /* 0x8c */, 1 /* 0x8d */, 1 /* 0x8e */, 1 /* 0x8f */, + 1 /* 0x90 */, 1 /* 0x91 */, 1 /* 0x92 */, 1 /* 0x93 */, + 1 /* 0x94 */, 1 /* 0x95 */, 1 /* 0x96 */, 1 /* 0x97 */, + 1 /* 0x98 */, 1 /* 0x99 */, 1 /* 0x9a */, 1 /* 0x9b */, + 1 /* 0x9c */, 1 /* 0x9d */, 1 /* 0x9e */, 1 /* 0x9f */, + 1 /* 0xa0 */, 1 /* 0xa1 */, 1 /* 0xa2 */, 1 /* 0xa3 */, + 1 /* 0xa4 */, 1 /* 0xa5 */, 1 /* 0xa6 */, 1 /* 0xa7 */, + 1 /* 0xa8 */, 1 /* 0xa9 */, 1 /* 0xaa */, 1 /* 0xab */, + 1 /* 0xac */, 1 /* 0xad */, 1 /* 0xae */, 1 /* 0xaf */, + 1 /* 0xb0 */, 1 /* 0xb1 */, 1 /* 0xb2 */, 1 /* 0xb3 */, + 1 /* 0xb4 */, 1 /* 0xb5 */, 1 /* 0xb6 */, 1 /* 0xb7 */, + 1 /* 0xb8 */, 1 /* 0xb9 */, 1 /* 0xba */, 1 /* 0xbb */, + 1 /* 0xbc */, 1 /* 0xbd */, 1 /* 0xbe */, 1 /* 0xbf */, + 1 /* 0xc0 */, 1 /* 0xc1 */, 1 /* 0xc2 */, 1 /* 0xc3 */, + 1 /* 0xc4 */, 1 /* 0xc5 */, 1 /* 0xc6 */, 1 /* 0xc7 */, + 1 /* 0xc8 */, 1 /* 0xc9 */, 1 /* 0xca */, 1 /* 0xcb */, + 1 /* 0xcc */, 1 /* 0xcd */, 1 /* 0xce */, 1 /* 0xcf */, + 1 /* 0xd0 */, 1 /* 0xd1 */, 1 /* 0xd2 */, 1 /* 0xd3 */, + 1 /* 0xd4 */, 1 /* 0xd5 */, 1 /* 0xd6 */, 1 /* 0xd7 */, + 1 /* 0xd8 */, 1 /* 0xd9 */, 1 /* 0xda */, 1 /* 0xdb */, + 1 /* 0xdc */, 1 /* 0xdd */, 1 /* 0xde */, 1 /* 0xdf */, + 1 /* 0xe0 */, 1 /* 0xe1 */, 1 /* 0xe2 */, 1 /* 0xe3 */, + 1 /* 0xe4 */, 1 /* 0xe5 */, 1 /* 0xe6 */, 1 /* 0xe7 */, + 1 /* 0xe8 */, 1 /* 0xe9 */, 1 /* 0xea */, 1 /* 0xeb */, + 1 /* 0xec */, 1 /* 0xed */, 1 /* 0xee */, 1 /* 0xef */, + 1 /* 0xf0 */, 1 /* 0xf1 */, 1 /* 0xf2 */, 1 /* 0xf3 */, + 1 /* 0xf4 */, 1 /* 0xf5 */, 1 /* 0xf6 */, 1 /* 0xf7 */, + 1 /* 0xf8 */, 1 /* 0xf9 */, 1 /* 0xfa */, 1 /* 0xfb */, + 1 /* 0xfc */, 1 /* 0xfd */, 1 /* 0xfe */, 1 /* 0xff */ }; 
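/* Illustrative sketch, not part of the nghttp3 sources or of this patch: the
   AVX2 helper that follows is a vectorized fast path over the
   VALID_HD_VALUE_CHARS table above.  It flags, 32 bytes at a time, exactly
   the bytes the table maps to 0 (the control range 0x00-0x1f except HT, plus
   DEL 0x7f); bytes 0x80-0xff compare as negative in the signed 8-bit lanes
   and are therefore left untouched, which also matches the table.  A scalar
   sketch of the same predicate, using hypothetical sketch_* names, might look
   like this: */

#include <stddef.h>
#include <stdint.h>

/* Nonzero if |c| may not appear in a header field value. */
static int sketch_is_bad_value_char(uint8_t c) {
  return (c < 0x20 && c != '\t') || c == 0x7f;
}

/* Returns 1 if any byte in [value, value + len) is forbidden, 0 otherwise;
   this mirrors what the AVX2 helper reports per 32-byte chunk through its
   movemask. */
static int sketch_contains_bad_value_char(const uint8_t *value, size_t len) {
  const uint8_t *last = value + len;

  for (; value != last; ++value) {
    if (sketch_is_bad_value_char(*value)) {
      return 1;
    }
  }

  return 0;
}

/* As the patched nghttp3_check_header_value further down shows, the SIMD
   path is only taken for inputs of at least 32 bytes; the remaining tail is
   finished with the ordinary table lookup loop. */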
+#ifdef __AVX2__ +static int contains_bad_header_value_char_avx2(const uint8_t *first, + const uint8_t *last) { + const __m256i ctll = _mm256_set1_epi8(0x00 - 1); + const __m256i ctlr = _mm256_set1_epi8(0x1f + 1); + const __m256i ht = _mm256_set1_epi8('\t'); + const __m256i del = _mm256_set1_epi8(0x7f); + __m256i s, x; + uint32_t m; + + for (; first != last; first += 32) { + s = _mm256_loadu_si256((void *)first); + + x = _mm256_andnot_si256( + _mm256_cmpeq_epi8(s, ht), + _mm256_and_si256(_mm256_cmpgt_epi8(s, ctll), _mm256_cmpgt_epi8(ctlr, s))); + x = _mm256_or_si256(_mm256_cmpeq_epi8(s, del), x); + + m = (uint32_t)_mm256_movemask_epi8(x); + if (m) { + return 1; + } + } + + return 0; +} +#endif /* __AVX2__ */ + int nghttp3_check_header_value(const uint8_t *value, size_t len) { const uint8_t *last; +#ifdef __AVX2__ + const uint8_t *last32; +#endif /* __AVX2__ */ switch (len) { case 0: @@ -997,7 +998,20 @@ int nghttp3_check_header_value(const uint8_t *value, size_t len) { } } - for (last = value + len; value != last; ++value) { + last = value + len; + +#ifdef __AVX2__ + if (len >= 32) { + last32 = value + (len & ~0x1fu); + if (contains_bad_header_value_char_avx2(value, last32)) { + return 0; + } + + value = last32; + } +#endif /* __AVX2__ */ + + for (; value != last; ++value) { if (!VALID_HD_VALUE_CHARS[*value]) { return 0; } diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_http.h b/deps/ngtcp2/nghttp3/lib/nghttp3_http.h index 575d9c267e1b68..f0bfc69fbade75 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_http.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_http.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -170,4 +170,4 @@ int nghttp3_http_parse_priority(nghttp3_pri *dest, const uint8_t *value, int nghttp3_pri_eq(const nghttp3_pri *a, const nghttp3_pri *b); -#endif /* NGHTTP3_HTTP_H */ +#endif /* !defined(NGHTTP3_HTTP_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_idtr.c b/deps/ngtcp2/nghttp3/lib/nghttp3_idtr.c index dc34841fe0f8ef..ffed3064d2b791 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_idtr.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_idtr.c @@ -27,10 +27,8 @@ #include -void nghttp3_idtr_init(nghttp3_idtr *idtr, int server, const nghttp3_mem *mem) { +void nghttp3_idtr_init(nghttp3_idtr *idtr, const nghttp3_mem *mem) { nghttp3_gaptr_init(&idtr->gap, mem); - - idtr->server = server; } void nghttp3_idtr_free(nghttp3_idtr *idtr) { @@ -42,8 +40,7 @@ void nghttp3_idtr_free(nghttp3_idtr *idtr) { } /* - * id_from_stream_id translates |stream_id| to id space used by - * nghttp3_idtr. + * id_from_stream_id translates |stream_id| to an internal ID. 
*/ static uint64_t id_from_stream_id(int64_t stream_id) { return (uint64_t)(stream_id >> 2); @@ -52,9 +49,6 @@ static uint64_t id_from_stream_id(int64_t stream_id) { int nghttp3_idtr_open(nghttp3_idtr *idtr, int64_t stream_id) { uint64_t q; - assert((idtr->server && (stream_id % 2)) || - (!idtr->server && (stream_id % 2)) == 0); - q = id_from_stream_id(stream_id); if (nghttp3_gaptr_is_pushed(&idtr->gap, q, 1)) { @@ -67,14 +61,7 @@ int nghttp3_idtr_open(nghttp3_idtr *idtr, int64_t stream_id) { int nghttp3_idtr_is_open(nghttp3_idtr *idtr, int64_t stream_id) { uint64_t q; - assert((idtr->server && (stream_id % 2)) || - (!idtr->server && (stream_id % 2)) == 0); - q = id_from_stream_id(stream_id); return nghttp3_gaptr_is_pushed(&idtr->gap, q, 1); } - -uint64_t nghttp3_idtr_first_gap(nghttp3_idtr *idtr) { - return nghttp3_gaptr_first_gap_offset(&idtr->gap); -} diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_idtr.h b/deps/ngtcp2/nghttp3/lib/nghttp3_idtr.h index ea3346c9a964c4..8ba15fc810cdcb 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_idtr.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_idtr.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -39,21 +39,17 @@ * nghttp3_idtr tracks the usage of stream ID. */ typedef struct nghttp3_idtr { - /* gap maintains the range of ID which is not used yet. Initially, - its range is [0, UINT64_MAX). */ + /* gap maintains the range of an internal ID which is not used yet. + Initially, its range is [0, UINT64_MAX). The internal ID and + stream ID are in the different number spaces. See + id_from_stream_id to convert a stream ID to an internal ID. */ nghttp3_gaptr gap; - /* server is nonzero if this object records server initiated stream - ID. */ - int server; } nghttp3_idtr; /* * nghttp3_idtr_init initializes |idtr|. - * - * If this object records server initiated ID (even number), set - * |server| to nonzero. */ -void nghttp3_idtr_init(nghttp3_idtr *idtr, int server, const nghttp3_mem *mem); +void nghttp3_idtr_init(nghttp3_idtr *idtr, const nghttp3_mem *mem); /* * nghttp3_idtr_free frees resources allocated for |idtr|. @@ -61,30 +57,21 @@ void nghttp3_idtr_init(nghttp3_idtr *idtr, int server, const nghttp3_mem *mem); void nghttp3_idtr_free(nghttp3_idtr *idtr); /* - * nghttp3_idtr_open claims that |stream_id| is in used. + * nghttp3_idtr_open claims that |stream_id| is in use. * * It returns 0 if it succeeds, or one of the following negative error * codes: * * NGHTTP3_ERR_STREAM_IN_USE - * ID has already been used. + * |stream_id| has already been used. * NGHTTP3_ERR_NOMEM * Out of memory. */ int nghttp3_idtr_open(nghttp3_idtr *idtr, int64_t stream_id); /* - * nghttp3_idtr_open tells whether ID |stream_id| is in used or not. - * - * It returns nonzero if |stream_id| is used. + * nghttp3_idtr_open returns nonzero if |stream_id| is in use. */ int nghttp3_idtr_is_open(nghttp3_idtr *idtr, int64_t stream_id); -/* - * nghttp3_idtr_first_gap returns the first id of first gap. If there - * is no gap, it returns UINT64_MAX. The returned id is an id space - * used in this object internally, and not stream ID. 
- */ -uint64_t nghttp3_idtr_first_gap(nghttp3_idtr *idtr); - -#endif /* NGHTTP3_IDTR_H */ +#endif /* !defined(NGHTTP3_IDTR_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_ksl.c b/deps/ngtcp2/nghttp3/lib/nghttp3_ksl.c index d7420a5d8a1e5d..a3b5fbcb05f4f3 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_ksl.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_ksl.c @@ -39,8 +39,10 @@ static nghttp3_ksl_blk null_blk = {{{NULL, NULL, 0, 0, {0}}}}; nghttp3_objalloc_def(ksl_blk, nghttp3_ksl_blk, oplent); static size_t ksl_nodelen(size_t keylen) { - return (sizeof(nghttp3_ksl_node) + keylen - sizeof(uint64_t) + 0xfu) & - ~(uintptr_t)0xfu; + assert(keylen >= sizeof(uint64_t)); + + return (sizeof(nghttp3_ksl_node) + keylen - sizeof(uint64_t) + 0x7u) & + ~(uintptr_t)0x7u; } static size_t ksl_blklen(size_t nodelen) { @@ -61,15 +63,14 @@ void nghttp3_ksl_init(nghttp3_ksl *ksl, nghttp3_ksl_compar compar, size_t nodelen = ksl_nodelen(keylen); nghttp3_objalloc_init(&ksl->blkalloc, - ((ksl_blklen(nodelen) + 0xfu) & ~(uintptr_t)0xfu) * 8, - mem); + (ksl_blklen(nodelen) + 0xfu) & ~(uintptr_t)0xfu, mem); ksl->head = NULL; ksl->front = ksl->back = NULL; ksl->compar = compar; + ksl->n = 0; ksl->keylen = keylen; ksl->nodelen = nodelen; - ksl->n = 0; } static nghttp3_ksl_blk *ksl_blk_objalloc_new(nghttp3_ksl *ksl) { @@ -83,6 +84,7 @@ static void ksl_blk_objalloc_del(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk) { static int ksl_head_init(nghttp3_ksl *ksl) { nghttp3_ksl_blk *head = ksl_blk_objalloc_new(ksl); + if (!head) { return NGHTTP3_ERR_NOMEM; } @@ -112,7 +114,7 @@ static void ksl_free_blk(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk) { ksl_blk_objalloc_del(ksl, blk); } -#endif /* NOMEMPOOL */ +#endif /* defined(NOMEMPOOL) */ void nghttp3_ksl_free(nghttp3_ksl *ksl) { if (!ksl || !ksl->head) { @@ -121,7 +123,7 @@ void nghttp3_ksl_free(nghttp3_ksl *ksl) { #ifdef NOMEMPOOL ksl_free_blk(ksl, ksl->head); -#endif /* NOMEMPOOL */ +#endif /* defined(NOMEMPOOL) */ nghttp3_objalloc_free(&ksl->blkalloc); } @@ -144,21 +146,22 @@ static nghttp3_ksl_blk *ksl_split_blk(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk) { rblk->next = blk->next; blk->next = rblk; + if (rblk->next) { rblk->next->prev = rblk; } else if (ksl->back == blk) { ksl->back = rblk; } + rblk->prev = blk; rblk->leaf = blk->leaf; rblk->n = blk->n / 2; + blk->n -= rblk->n; - memcpy(rblk->nodes, blk->nodes + ksl->nodelen * (blk->n - rblk->n), + memcpy(rblk->nodes, blk->nodes + ksl->nodelen * blk->n, ksl->nodelen * rblk->n); - blk->n -= rblk->n; - assert(blk->n >= NGHTTP3_KSL_MIN_NBLK); assert(rblk->n >= NGHTTP3_KSL_MIN_NBLK); @@ -174,7 +177,7 @@ static nghttp3_ksl_blk *ksl_split_blk(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk) { * codes: * * NGHTTP3_ERR_NOMEM - * Out of memory. + * Out of memory. */ static int ksl_split_node(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t i) { nghttp3_ksl_node *node; @@ -210,7 +213,7 @@ static int ksl_split_node(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t i) { * codes: * * NGHTTP3_ERR_NOMEM - * Out of memory. + * Out of memory. 
*/ static int ksl_split_head(nghttp3_ksl *ksl) { nghttp3_ksl_blk *rblk = NULL, *lblk, *nhead = NULL; @@ -224,10 +227,12 @@ static int ksl_split_head(nghttp3_ksl *ksl) { lblk = ksl->head; nhead = ksl_blk_objalloc_new(ksl); + if (nhead == NULL) { ksl_blk_objalloc_del(ksl, rblk); return NGHTTP3_ERR_NOMEM; } + nhead->next = nhead->prev = NULL; nhead->n = 2; nhead->leaf = 0; @@ -248,9 +253,9 @@ static int ksl_split_head(nghttp3_ksl *ksl) { } /* - * insert_node inserts a node whose key is |key| with the associated - * |data| at the index of |i|. This function assumes that the number - * of nodes contained by |blk| is strictly less than + * ksl_insert_node inserts a node whose key is |key| with the + * associated |data| at the index of |i|. This function assumes that + * the number of nodes contained by |blk| is strictly less than * NGHTTP3_KSL_MAX_NBLK. */ static void ksl_insert_node(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t i, @@ -269,9 +274,9 @@ static void ksl_insert_node(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t i, ++blk->n; } -static size_t ksl_bsearch(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, - const nghttp3_ksl_key *key, - nghttp3_ksl_compar compar) { +static size_t ksl_search(const nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, + const nghttp3_ksl_key *key, + nghttp3_ksl_compar compar) { size_t i; nghttp3_ksl_node *node; @@ -297,18 +302,17 @@ int nghttp3_ksl_insert(nghttp3_ksl *ksl, nghttp3_ksl_it *it, } } - blk = ksl->head; - - if (blk->n == NGHTTP3_KSL_MAX_NBLK) { + if (ksl->head->n == NGHTTP3_KSL_MAX_NBLK) { rv = ksl_split_head(ksl); if (rv != 0) { return rv; } - blk = ksl->head; } + blk = ksl->head; + for (;;) { - i = ksl_bsearch(ksl, blk, key, ksl->compar); + i = ksl_search(ksl, blk, key, ksl->compar); if (blk->leaf) { if (i < blk->n && @@ -316,13 +320,17 @@ int nghttp3_ksl_insert(nghttp3_ksl *ksl, nghttp3_ksl_it *it, if (it) { *it = nghttp3_ksl_end(ksl); } + return NGHTTP3_ERR_INVALID_ARGUMENT; } + ksl_insert_node(ksl, blk, i, key, data); ++ksl->n; + if (it) { nghttp3_ksl_it_init(it, ksl, blk, i); } + return 0; } @@ -335,16 +343,21 @@ int nghttp3_ksl_insert(nghttp3_ksl *ksl, nghttp3_ksl_it *it, if (rv != 0) { return rv; } + node = nghttp3_ksl_nth_node(ksl, blk, blk->n - 1); } + ksl_node_set_key(ksl, node, key); blk = node->blk; } + ksl_insert_node(ksl, blk, blk->n, key, data); ++ksl->n; + if (it) { nghttp3_ksl_it_init(it, ksl, blk, blk->n - 1); } + return 0; } @@ -355,8 +368,10 @@ int nghttp3_ksl_insert(nghttp3_ksl *ksl, nghttp3_ksl_it *it, if (rv != 0) { return rv; } + if (ksl->compar((nghttp3_ksl_key *)node->key, key)) { node = nghttp3_ksl_nth_node(ksl, blk, i + 1); + if (ksl->compar((nghttp3_ksl_key *)node->key, key)) { ksl_node_set_key(ksl, node, key); } @@ -382,19 +397,22 @@ static void ksl_remove_node(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t i) { * ksl_merge_node merges 2 nodes which are the nodes at the index of * |i| and |i + 1|. * - * If |blk| is the direct descendant of head (root) block and the head - * block contains just 2 nodes, the merged block becomes head block, - * which decreases the height of |ksl| by 1. + * If |blk| is the head (root) block and it contains just 2 nodes + * before merging nodes, the merged block becomes head block, which + * decreases the height of |ksl| by 1. * * This function returns the pointer to the merged block. 
*/ static nghttp3_ksl_blk *ksl_merge_node(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t i) { + nghttp3_ksl_node *lnode; nghttp3_ksl_blk *lblk, *rblk; assert(i + 1 < blk->n); - lblk = nghttp3_ksl_nth_node(ksl, blk, i)->blk; + lnode = nghttp3_ksl_nth_node(ksl, blk, i); + + lblk = lnode->blk; rblk = nghttp3_ksl_nth_node(ksl, blk, i + 1)->blk; assert(lblk->n + rblk->n < NGHTTP3_KSL_MAX_NBLK); @@ -404,6 +422,7 @@ static nghttp3_ksl_blk *ksl_merge_node(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, lblk->n += rblk->n; lblk->next = rblk->next; + if (lblk->next) { lblk->next->prev = lblk; } else if (ksl->back == rblk) { @@ -417,7 +436,7 @@ static nghttp3_ksl_blk *ksl_merge_node(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, ksl->head = lblk; } else { ksl_remove_node(ksl, blk, i + 1); - ksl_node_set_key(ksl, nghttp3_ksl_nth_node(ksl, blk, i), + ksl_node_set_key(ksl, lnode, nghttp3_ksl_nth_node(ksl, lblk, lblk->n - 1)->key); } @@ -431,6 +450,7 @@ static nghttp3_ksl_blk *ksl_merge_node(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, */ static void ksl_shift_left(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t i) { nghttp3_ksl_node *lnode, *rnode; + nghttp3_ksl_blk *lblk, *rblk; size_t n; assert(i > 0); @@ -438,36 +458,37 @@ static void ksl_shift_left(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t i) { lnode = nghttp3_ksl_nth_node(ksl, blk, i - 1); rnode = nghttp3_ksl_nth_node(ksl, blk, i); - assert(lnode->blk->n < NGHTTP3_KSL_MAX_NBLK); - assert(rnode->blk->n > NGHTTP3_KSL_MIN_NBLK); + lblk = lnode->blk; + rblk = rnode->blk; + + assert(lblk->n < NGHTTP3_KSL_MAX_NBLK); + assert(rblk->n > NGHTTP3_KSL_MIN_NBLK); - n = (lnode->blk->n + rnode->blk->n + 1) / 2 - lnode->blk->n; + n = (lblk->n + rblk->n + 1) / 2 - lblk->n; assert(n > 0); - assert(lnode->blk->n <= NGHTTP3_KSL_MAX_NBLK - n); - assert(rnode->blk->n >= NGHTTP3_KSL_MIN_NBLK + n); + assert(lblk->n <= NGHTTP3_KSL_MAX_NBLK - n); + assert(rblk->n >= NGHTTP3_KSL_MIN_NBLK + n); - memcpy(lnode->blk->nodes + ksl->nodelen * lnode->blk->n, rnode->blk->nodes, - ksl->nodelen * n); + memcpy(lblk->nodes + ksl->nodelen * lblk->n, rblk->nodes, ksl->nodelen * n); - lnode->blk->n += (uint32_t)n; - rnode->blk->n -= (uint32_t)n; + lblk->n += (uint32_t)n; + rblk->n -= (uint32_t)n; - ksl_node_set_key( - ksl, lnode, - nghttp3_ksl_nth_node(ksl, lnode->blk, lnode->blk->n - 1)->key); + ksl_node_set_key(ksl, lnode, + nghttp3_ksl_nth_node(ksl, lblk, lblk->n - 1)->key); - memmove(rnode->blk->nodes, rnode->blk->nodes + ksl->nodelen * n, - ksl->nodelen * rnode->blk->n); + memmove(rblk->nodes, rblk->nodes + ksl->nodelen * n, ksl->nodelen * rblk->n); } /* * ksl_shift_right moves the last nodes in blk->nodes[i]->blk->nodes * to blk->nodes[i + 1]->blk->nodes in a manner that they have the - * same amount of nodes as much as possible.. + * same amount of nodes as much as possible. 
*/ static void ksl_shift_right(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t i) { nghttp3_ksl_node *lnode, *rnode; + nghttp3_ksl_blk *lblk, *rblk; size_t n; assert(i < blk->n - 1); @@ -475,27 +496,27 @@ static void ksl_shift_right(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t i) { lnode = nghttp3_ksl_nth_node(ksl, blk, i); rnode = nghttp3_ksl_nth_node(ksl, blk, i + 1); - assert(lnode->blk->n > NGHTTP3_KSL_MIN_NBLK); - assert(rnode->blk->n < NGHTTP3_KSL_MAX_NBLK); + lblk = lnode->blk; + rblk = rnode->blk; - n = (lnode->blk->n + rnode->blk->n + 1) / 2 - rnode->blk->n; + assert(lblk->n > NGHTTP3_KSL_MIN_NBLK); + assert(rblk->n < NGHTTP3_KSL_MAX_NBLK); + + n = (lblk->n + rblk->n + 1) / 2 - rblk->n; assert(n > 0); - assert(lnode->blk->n >= NGHTTP3_KSL_MIN_NBLK + n); - assert(rnode->blk->n <= NGHTTP3_KSL_MAX_NBLK - n); + assert(lblk->n >= NGHTTP3_KSL_MIN_NBLK + n); + assert(rblk->n <= NGHTTP3_KSL_MAX_NBLK - n); - memmove(rnode->blk->nodes + ksl->nodelen * n, rnode->blk->nodes, - ksl->nodelen * rnode->blk->n); + memmove(rblk->nodes + ksl->nodelen * n, rblk->nodes, ksl->nodelen * rblk->n); - rnode->blk->n += (uint32_t)n; - lnode->blk->n -= (uint32_t)n; + rblk->n += (uint32_t)n; + lblk->n -= (uint32_t)n; - memcpy(rnode->blk->nodes, lnode->blk->nodes + ksl->nodelen * lnode->blk->n, - ksl->nodelen * n); + memcpy(rblk->nodes, lblk->nodes + ksl->nodelen * lblk->n, ksl->nodelen * n); - ksl_node_set_key( - ksl, lnode, - nghttp3_ksl_nth_node(ksl, lnode->blk, lnode->blk->n - 1)->key); + ksl_node_set_key(ksl, lnode, + nghttp3_ksl_nth_node(ksl, lblk, lblk->n - 1)->key); } /* @@ -539,23 +560,24 @@ int nghttp3_ksl_remove(nghttp3_ksl *ksl, nghttp3_ksl_it *it, nghttp3_ksl_node *node; size_t i; - if (!ksl->head) { + if (!blk) { return NGHTTP3_ERR_INVALID_ARGUMENT; } if (!blk->leaf && blk->n == 2 && nghttp3_ksl_nth_node(ksl, blk, 0)->blk->n == NGHTTP3_KSL_MIN_NBLK && nghttp3_ksl_nth_node(ksl, blk, 1)->blk->n == NGHTTP3_KSL_MIN_NBLK) { - blk = ksl_merge_node(ksl, ksl->head, 0); + blk = ksl_merge_node(ksl, blk, 0); } for (;;) { - i = ksl_bsearch(ksl, blk, key, ksl->compar); + i = ksl_search(ksl, blk, key, ksl->compar); if (i == blk->n) { if (it) { *it = nghttp3_ksl_end(ksl); } + return NGHTTP3_ERR_INVALID_ARGUMENT; } @@ -564,10 +586,13 @@ int nghttp3_ksl_remove(nghttp3_ksl *ksl, nghttp3_ksl_it *it, if (it) { *it = nghttp3_ksl_end(ksl); } + return NGHTTP3_ERR_INVALID_ARGUMENT; } + ksl_remove_node(ksl, blk, i); --ksl->n; + if (it) { if (blk->n == i && blk->next) { nghttp3_ksl_it_init(it, ksl, blk->next, 0); @@ -575,6 +600,7 @@ int nghttp3_ksl_remove(nghttp3_ksl *ksl, nghttp3_ksl_it *it, nghttp3_ksl_it_init(it, ksl, blk, i); } } + return 0; } @@ -591,6 +617,7 @@ int nghttp3_ksl_remove(nghttp3_ksl *ksl, nghttp3_ksl_it *it, nghttp3_ksl_nth_node(ksl, blk, i + 1)->blk->n > NGHTTP3_KSL_MIN_NBLK) { ksl_shift_left(ksl, blk, i + 1); blk = node->blk; + continue; } @@ -598,6 +625,7 @@ int nghttp3_ksl_remove(nghttp3_ksl *ksl, nghttp3_ksl_it *it, nghttp3_ksl_nth_node(ksl, blk, i - 1)->blk->n > NGHTTP3_KSL_MIN_NBLK) { ksl_shift_right(ksl, blk, i - 1); blk = node->blk; + continue; } @@ -612,48 +640,12 @@ int nghttp3_ksl_remove(nghttp3_ksl *ksl, nghttp3_ksl_it *it, } } -nghttp3_ksl_it nghttp3_ksl_lower_bound(nghttp3_ksl *ksl, +nghttp3_ksl_it nghttp3_ksl_lower_bound(const nghttp3_ksl *ksl, const nghttp3_ksl_key *key) { - nghttp3_ksl_blk *blk = ksl->head; - nghttp3_ksl_it it; - size_t i; - - if (!blk) { - nghttp3_ksl_it_init(&it, ksl, &null_blk, 0); - return it; - } - - for (;;) { - i = ksl_bsearch(ksl, blk, key, ksl->compar); - - if 
(blk->leaf) { - if (i == blk->n && blk->next) { - blk = blk->next; - i = 0; - } - nghttp3_ksl_it_init(&it, ksl, blk, i); - return it; - } - - if (i == blk->n) { - /* This happens if descendant has smaller key. Fast forward to - find last node in this subtree. */ - for (; !blk->leaf; blk = nghttp3_ksl_nth_node(ksl, blk, blk->n - 1)->blk) - ; - if (blk->next) { - blk = blk->next; - i = 0; - } else { - i = blk->n; - } - nghttp3_ksl_it_init(&it, ksl, blk, i); - return it; - } - blk = nghttp3_ksl_nth_node(ksl, blk, i)->blk; - } + return nghttp3_ksl_lower_bound_compar(ksl, key, ksl->compar); } -nghttp3_ksl_it nghttp3_ksl_lower_bound_compar(nghttp3_ksl *ksl, +nghttp3_ksl_it nghttp3_ksl_lower_bound_compar(const nghttp3_ksl *ksl, const nghttp3_ksl_key *key, nghttp3_ksl_compar compar) { nghttp3_ksl_blk *blk = ksl->head; @@ -666,14 +658,16 @@ nghttp3_ksl_it nghttp3_ksl_lower_bound_compar(nghttp3_ksl *ksl, } for (;;) { - i = ksl_bsearch(ksl, blk, key, compar); + i = ksl_search(ksl, blk, key, compar); if (blk->leaf) { if (i == blk->n && blk->next) { blk = blk->next; i = 0; } + nghttp3_ksl_it_init(&it, ksl, blk, i); + return it; } @@ -682,15 +676,19 @@ nghttp3_ksl_it nghttp3_ksl_lower_bound_compar(nghttp3_ksl *ksl, find last node in this subtree. */ for (; !blk->leaf; blk = nghttp3_ksl_nth_node(ksl, blk, blk->n - 1)->blk) ; + if (blk->next) { blk = blk->next; i = 0; } else { i = blk->n; } + nghttp3_ksl_it_init(&it, ksl, blk, i); + return it; } + blk = nghttp3_ksl_nth_node(ksl, blk, i)->blk; } } @@ -704,7 +702,7 @@ void nghttp3_ksl_update_key(nghttp3_ksl *ksl, const nghttp3_ksl_key *old_key, assert(ksl->head); for (;;) { - i = ksl_bsearch(ksl, blk, old_key, ksl->compar); + i = ksl_search(ksl, blk, old_key, ksl->compar); assert(i < blk->n); node = nghttp3_ksl_nth_node(ksl, blk, i); @@ -712,6 +710,7 @@ void nghttp3_ksl_update_key(nghttp3_ksl *ksl, const nghttp3_ksl_key *old_key, if (blk->leaf) { assert(key_equal(ksl->compar, (nghttp3_ksl_key *)node->key, old_key)); ksl_node_set_key(ksl, node, new_key); + return; } @@ -724,7 +723,7 @@ void nghttp3_ksl_update_key(nghttp3_ksl *ksl, const nghttp3_ksl_key *old_key, } } -size_t nghttp3_ksl_len(nghttp3_ksl *ksl) { return ksl->n; } +size_t nghttp3_ksl_len(const nghttp3_ksl *ksl) { return ksl->n; } void nghttp3_ksl_clear(nghttp3_ksl *ksl) { if (!ksl->head) { @@ -733,7 +732,7 @@ void nghttp3_ksl_clear(nghttp3_ksl *ksl) { #ifdef NOMEMPOOL ksl_free_blk(ksl, ksl->head); -#endif /* NOMEMPOOL */ +#endif /* defined(NOMEMPOOL) */ ksl->front = ksl->back = ksl->head = NULL; ksl->n = 0; @@ -742,7 +741,8 @@ void nghttp3_ksl_clear(nghttp3_ksl *ksl) { } #ifndef WIN32 -static void ksl_print(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t level) { +static void ksl_print(const nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, + size_t level) { size_t i; nghttp3_ksl_node *node; @@ -753,7 +753,9 @@ static void ksl_print(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t level) { node = nghttp3_ksl_nth_node(ksl, blk, i); fprintf(stderr, " %" PRId64, *(int64_t *)(void *)node->key); } + fprintf(stderr, "\n"); + return; } @@ -762,14 +764,14 @@ static void ksl_print(nghttp3_ksl *ksl, nghttp3_ksl_blk *blk, size_t level) { } } -void nghttp3_ksl_print(nghttp3_ksl *ksl) { +void nghttp3_ksl_print(const nghttp3_ksl *ksl) { if (!ksl->head) { return; } ksl_print(ksl, ksl->head, 0); } -#endif /* !WIN32 */ +#endif /* !defined(WIN32) */ nghttp3_ksl_it nghttp3_ksl_begin(const nghttp3_ksl *ksl) { nghttp3_ksl_it it; @@ -826,6 +828,6 @@ int nghttp3_ksl_range_compar(const nghttp3_ksl_key *lhs, int 
nghttp3_ksl_range_exclusive_compar(const nghttp3_ksl_key *lhs, const nghttp3_ksl_key *rhs) { const nghttp3_range *a = lhs, *b = rhs; - return a->begin < b->begin && - !(nghttp3_max(a->begin, b->begin) < nghttp3_min(a->end, b->end)); + return a->begin < b->begin && !(nghttp3_max_uint64(a->begin, b->begin) < + nghttp3_min_uint64(a->end, b->end)); } diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_ksl.h b/deps/ngtcp2/nghttp3/lib/nghttp3_ksl.h index d513bdd672c750..e15e227ce50fb0 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_ksl.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_ksl.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -36,16 +36,12 @@ #include "nghttp3_objalloc.h" -/* - * Skip List using single key instead of range. - */ - #define NGHTTP3_KSL_DEGR 16 /* NGHTTP3_KSL_MAX_NBLK is the maximum number of nodes which a single block can contain. */ #define NGHTTP3_KSL_MAX_NBLK (2 * NGHTTP3_KSL_DEGR - 1) /* NGHTTP3_KSL_MIN_NBLK is the minimum number of nodes which a single - block other than root must contains. */ + block other than root must contain. */ #define NGHTTP3_KSL_MIN_NBLK (NGHTTP3_KSL_DEGR - 1) /* @@ -122,7 +118,7 @@ typedef struct nghttp3_ksl nghttp3_ksl; typedef struct nghttp3_ksl_it nghttp3_ksl_it; /* - * nghttp3_ksl_it is a forward iterator to iterate nodes. + * nghttp3_ksl_it is a bidirectional iterator to iterate nodes. */ struct nghttp3_ksl_it { const nghttp3_ksl *ksl; @@ -142,6 +138,7 @@ struct nghttp3_ksl { /* back points to the last leaf block. */ nghttp3_ksl_blk *back; nghttp3_ksl_compar compar; + /* n is the number of elements stored. */ size_t n; /* keylen is the size of key */ size_t keylen; @@ -152,7 +149,8 @@ struct nghttp3_ksl { /* * nghttp3_ksl_init initializes |ksl|. |compar| specifies compare - * function. |keylen| is the length of key. + * function. |keylen| is the length of key and must be at least + * sizeof(uint64_t). */ void nghttp3_ksl_init(nghttp3_ksl *ksl, nghttp3_ksl_compar compar, size_t keylen, const nghttp3_mem *mem); @@ -167,15 +165,15 @@ void nghttp3_ksl_free(nghttp3_ksl *ksl); /* * nghttp3_ksl_insert inserts |key| with its associated |data|. On * successful insertion, the iterator points to the inserted node is - * stored in |*it|. + * stored in |*it| if |it| is not NULL. * * This function returns 0 if it succeeds, or one of the following * negative error codes: * * NGHTTP3_ERR_NOMEM - * Out of memory. + * Out of memory. * NGHTTP3_ERR_INVALID_ARGUMENT - * |key| already exists. + * |key| already exists. */ int nghttp3_ksl_insert(nghttp3_ksl *ksl, nghttp3_ksl_it *it, const nghttp3_ksl_key *key, void *data); @@ -186,13 +184,14 @@ int nghttp3_ksl_insert(nghttp3_ksl *ksl, nghttp3_ksl_it *it, * This function assigns the iterator to |*it|, which points to the * node which is located at the right next of the removed node if |it| * is not NULL. If |key| is not found, no deletion takes place and - * the return value of nghttp3_ksl_end(ksl) is assigned to |*it|. + * the return value of nghttp3_ksl_end(ksl) is assigned to |*it| if + * |it| is not NULL. * * This function returns 0 if it succeeds, or one of the following * negative error codes: * * NGHTTP3_ERR_INVALID_ARGUMENT - * |key| does not exist. + * |key| does not exist. 
*/ int nghttp3_ksl_remove(nghttp3_ksl *ksl, nghttp3_ksl_it *it, const nghttp3_ksl_key *key); @@ -212,24 +211,24 @@ int nghttp3_ksl_remove_hint(nghttp3_ksl *ksl, nghttp3_ksl_it *it, * nghttp3_ksl_lower_bound returns the iterator which points to the * first node which has the key which is equal to |key| or the last * node which satisfies !compar(&node->key, key). If there is no such - * node, it returns the iterator which satisfies nghttp3_ksl_it_end(it) - * != 0. + * node, it returns the iterator which satisfies + * nghttp3_ksl_it_end(it) != 0. */ -nghttp3_ksl_it nghttp3_ksl_lower_bound(nghttp3_ksl *ksl, +nghttp3_ksl_it nghttp3_ksl_lower_bound(const nghttp3_ksl *ksl, const nghttp3_ksl_key *key); /* * nghttp3_ksl_lower_bound_compar works like nghttp3_ksl_lower_bound, * but it takes custom function |compar| to do lower bound search. */ -nghttp3_ksl_it nghttp3_ksl_lower_bound_compar(nghttp3_ksl *ksl, +nghttp3_ksl_it nghttp3_ksl_lower_bound_compar(const nghttp3_ksl *ksl, const nghttp3_ksl_key *key, nghttp3_ksl_compar compar); /* - * nghttp3_ksl_update_key replaces the key of nodes which has |old_key| - * with |new_key|. |new_key| must be strictly greater than the - * previous node and strictly smaller than the next node. + * nghttp3_ksl_update_key replaces the key of nodes which has + * |old_key| with |new_key|. |new_key| must be strictly greater than + * the previous node and strictly smaller than the next node. */ void nghttp3_ksl_update_key(nghttp3_ksl *ksl, const nghttp3_ksl_key *old_key, const nghttp3_ksl_key *new_key); @@ -237,7 +236,8 @@ void nghttp3_ksl_update_key(nghttp3_ksl *ksl, const nghttp3_ksl_key *old_key, /* * nghttp3_ksl_begin returns the iterator which points to the first * node. If there is no node in |ksl|, it returns the iterator which - * satisfies nghttp3_ksl_it_end(it) != 0. + * satisfies both nghttp3_ksl_it_begin(it) != 0 and + * nghttp3_ksl_it_end(it) != 0. */ nghttp3_ksl_it nghttp3_ksl_begin(const nghttp3_ksl *ksl); @@ -245,14 +245,15 @@ nghttp3_ksl_it nghttp3_ksl_begin(const nghttp3_ksl *ksl); * nghttp3_ksl_end returns the iterator which points to the node * following the last node. The returned object satisfies * nghttp3_ksl_it_end(). If there is no node in |ksl|, it returns the - * iterator which satisfies nghttp3_ksl_it_begin(it) != 0. + * iterator which satisfies nghttp3_ksl_it_begin(it) != 0 and + * nghttp3_ksl_it_end(it) != 0. */ nghttp3_ksl_it nghttp3_ksl_end(const nghttp3_ksl *ksl); /* * nghttp3_ksl_len returns the number of elements stored in |ksl|. */ -size_t nghttp3_ksl_len(nghttp3_ksl *ksl); +size_t nghttp3_ksl_len(const nghttp3_ksl *ksl); /* * nghttp3_ksl_clear removes all elements stored in |ksl|. @@ -271,8 +272,8 @@ void nghttp3_ksl_clear(nghttp3_ksl *ksl); * that the key is of type int64_t. This function should be used for * the debugging purpose only. */ -void nghttp3_ksl_print(nghttp3_ksl *ksl); -#endif /* !WIN32 */ +void nghttp3_ksl_print(const nghttp3_ksl *ksl); +#endif /* !defined(WIN32) */ /* * nghttp3_ksl_it_init initializes |it|. @@ -295,8 +296,8 @@ void nghttp3_ksl_it_init(nghttp3_ksl_it *it, const nghttp3_ksl *ksl, */ #define nghttp3_ksl_it_next(IT) \ (++(IT)->i == (IT)->blk->n && (IT)->blk->next \ - ? ((IT)->blk = (IT)->blk->next, (IT)->i = 0) \ - : 0) + ? ((IT)->blk = (IT)->blk->next, (IT)->i = 0) \ + : 0) /* * nghttp3_ksl_it_prev moves backward the iterator by one. 
It is @@ -306,16 +307,16 @@ void nghttp3_ksl_it_init(nghttp3_ksl_it *it, const nghttp3_ksl *ksl, void nghttp3_ksl_it_prev(nghttp3_ksl_it *it); /* - * nghttp3_ksl_it_end returns nonzero if |it| points to the beyond the - * last node. + * nghttp3_ksl_it_end returns nonzero if |it| points to the one beyond + * the last node. */ #define nghttp3_ksl_it_end(IT) \ ((IT)->blk->n == (IT)->i && (IT)->blk->next == NULL) /* * nghttp3_ksl_it_begin returns nonzero if |it| points to the first - * node. |it| might satisfy both nghttp3_ksl_it_begin(&it) and - * nghttp3_ksl_it_end(&it) if the skip list has no node. + * node. |it| might satisfy both nghttp3_ksl_it_begin(it) != 0 and + * nghttp3_ksl_it_end(it) != 0 if the skip list has no node. */ int nghttp3_ksl_it_begin(const nghttp3_ksl_it *it); @@ -347,4 +348,4 @@ int nghttp3_ksl_range_compar(const nghttp3_ksl_key *lhs, int nghttp3_ksl_range_exclusive_compar(const nghttp3_ksl_key *lhs, const nghttp3_ksl_key *rhs); -#endif /* NGHTTP3_KSL_H */ +#endif /* !defined(NGHTTP3_KSL_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_macro.h b/deps/ngtcp2/nghttp3/lib/nghttp3_macro.h index a44e907661abbf..a4e1dfea3cda00 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_macro.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_macro.h @@ -28,17 +28,14 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include #include -#define nghttp3_min(A, B) ((A) < (B) ? (A) : (B)) -#define nghttp3_max(A, B) ((A) > (B) ? (A) : (B)) - #define nghttp3_struct_of(ptr, type, member) \ - ((type *)(void *)((char *)(ptr)-offsetof(type, member))) + ((type *)(void *)((char *)(ptr) - offsetof(type, member))) #define nghttp3_arraylen(A) (sizeof(A) / sizeof(*(A))) @@ -48,4 +45,30 @@ variable-length integer encoding. */ #define NGHTTP3_MAX_VARINT ((1ULL << 62) - 1) -#endif /* NGHTTP3_MACRO_H */ +#define nghttp3_max_def(SUFFIX, T) \ + static inline T nghttp3_max_##SUFFIX(T a, T b) { return a < b ? b : a; } + +nghttp3_max_def(int8, int8_t); +nghttp3_max_def(int16, int16_t); +nghttp3_max_def(int32, int32_t); +nghttp3_max_def(int64, int64_t); +nghttp3_max_def(uint8, uint8_t); +nghttp3_max_def(uint16, uint16_t); +nghttp3_max_def(uint32, uint32_t); +nghttp3_max_def(uint64, uint64_t); +nghttp3_max_def(size, size_t); + +#define nghttp3_min_def(SUFFIX, T) \ + static inline T nghttp3_min_##SUFFIX(T a, T b) { return a < b ? 
a : b; } + +nghttp3_min_def(int8, int8_t); +nghttp3_min_def(int16, int16_t); +nghttp3_min_def(int32, int32_t); +nghttp3_min_def(int64, int64_t); +nghttp3_min_def(uint8, uint8_t); +nghttp3_min_def(uint16, uint16_t); +nghttp3_min_def(uint32, uint32_t); +nghttp3_min_def(uint64, uint64_t); +nghttp3_min_def(size, size_t); + +#endif /* !defined(NGHTTP3_MACRO_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_map.c b/deps/ngtcp2/nghttp3/lib/nghttp3_map.c index b93fdfd3d488f5..cc5e42a2caf63f 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_map.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_map.c @@ -36,8 +36,7 @@ void nghttp3_map_init(nghttp3_map *map, const nghttp3_mem *mem) { map->mem = mem; - map->tablelen = 0; - map->tablelenbits = 0; + map->hashbits = 0; map->table = NULL; map->size = 0; } @@ -50,33 +49,20 @@ void nghttp3_map_free(nghttp3_map *map) { nghttp3_mem_free(map->mem, map->table); } -void nghttp3_map_each_free(nghttp3_map *map, int (*func)(void *data, void *ptr), - void *ptr) { - uint32_t i; - nghttp3_map_bucket *bkt; - - for (i = 0; i < map->tablelen; ++i) { - bkt = &map->table[i]; - - if (bkt->data == NULL) { - continue; - } - - func(bkt->data, ptr); - } -} - -int nghttp3_map_each(nghttp3_map *map, int (*func)(void *data, void *ptr), +int nghttp3_map_each(const nghttp3_map *map, int (*func)(void *data, void *ptr), void *ptr) { int rv; - uint32_t i; + size_t i; nghttp3_map_bucket *bkt; + size_t tablelen; if (map->size == 0) { return 0; } - for (i = 0; i < map->tablelen; ++i) { + tablelen = 1u << map->hashbits; + + for (i = 0; i < tablelen; ++i) { bkt = &map->table[i]; if (bkt->data == NULL) { @@ -92,82 +78,61 @@ int nghttp3_map_each(nghttp3_map *map, int (*func)(void *data, void *ptr), return 0; } -static uint32_t hash(nghttp3_map_key_type key) { - return (uint32_t)((key * 11400714819323198485llu) >> 32); -} - -static size_t h2idx(uint32_t hash, uint32_t bits) { - return hash >> (32 - bits); -} - -static size_t distance(uint32_t tablelen, uint32_t tablelenbits, - nghttp3_map_bucket *bkt, size_t idx) { - return (idx - h2idx(bkt->hash, tablelenbits)) & (tablelen - 1); +static size_t hash(nghttp3_map_key_type key, size_t bits) { + return (size_t)((key * 11400714819323198485llu) >> (64 - bits)); } -static void map_bucket_swap(nghttp3_map_bucket *bkt, uint32_t *phash, - nghttp3_map_key_type *pkey, void **pdata) { - uint32_t h = bkt->hash; - nghttp3_map_key_type key = bkt->key; - void *data = bkt->data; - - bkt->hash = *phash; - bkt->key = *pkey; - bkt->data = *pdata; +static void map_bucket_swap(nghttp3_map_bucket *a, nghttp3_map_bucket *b) { + nghttp3_map_bucket c = *a; - *phash = h; - *pkey = key; - *pdata = data; -} - -static void map_bucket_set_data(nghttp3_map_bucket *bkt, uint32_t hash, - nghttp3_map_key_type key, void *data) { - bkt->hash = hash; - bkt->key = key; - bkt->data = data; + *a = *b; + *b = c; } #ifndef WIN32 -void nghttp3_map_print_distance(nghttp3_map *map) { - uint32_t i; +void nghttp3_map_print_distance(const nghttp3_map *map) { + size_t i; size_t idx; nghttp3_map_bucket *bkt; + size_t tablelen; - for (i = 0; i < map->tablelen; ++i) { + if (map->size == 0) { + return; + } + + tablelen = 1u << map->hashbits; + + for (i = 0; i < tablelen; ++i) { bkt = &map->table[i]; if (bkt->data == NULL) { - fprintf(stderr, "@%u \n", i); + fprintf(stderr, "@%zu \n", i); continue; } - idx = h2idx(bkt->hash, map->tablelenbits); - fprintf(stderr, "@%u hash=%08x key=%" PRIu64 " base=%zu distance=%zu\n", i, - bkt->hash, bkt->key, idx, - distance(map->tablelen, map->tablelenbits, bkt, idx)); + idx = 
hash(bkt->key, map->hashbits); + fprintf(stderr, "@%zu hash=%zu key=%" PRIu64 " base=%zu distance=%u\n", i, + hash(bkt->key, map->hashbits), bkt->key, idx, bkt->psl); } } -#endif /* !WIN32 */ +#endif /* !defined(WIN32) */ -static int insert(nghttp3_map_bucket *table, uint32_t tablelen, - uint32_t tablelenbits, uint32_t hash, +static int insert(nghttp3_map_bucket *table, size_t hashbits, nghttp3_map_key_type key, void *data) { - size_t idx = h2idx(hash, tablelenbits); - size_t d = 0, dd; - nghttp3_map_bucket *bkt; + size_t idx = hash(key, hashbits); + nghttp3_map_bucket b = {0, key, data}, *bkt; + size_t mask = (1u << hashbits) - 1; for (;;) { bkt = &table[idx]; if (bkt->data == NULL) { - map_bucket_set_data(bkt, hash, key, data); + *bkt = b; return 0; } - dd = distance(tablelen, tablelenbits, bkt, idx); - if (d > dd) { - map_bucket_swap(bkt, &hash, &key, &data); - d = dd; + if (b.psl > bkt->psl) { + map_bucket_swap(bkt, &b); } else if (bkt->key == key) { /* TODO This check is just a waste after first swap or if this function is called from map_resize. That said, there is no @@ -176,41 +141,42 @@ static int insert(nghttp3_map_bucket *table, uint32_t tablelen, return NGHTTP3_ERR_INVALID_ARGUMENT; } - ++d; - idx = (idx + 1) & (tablelen - 1); + ++b.psl; + idx = (idx + 1) & mask; } } -/* new_tablelen must be power of 2 and new_tablelen == (1 << - new_tablelenbits) must hold. */ -static int map_resize(nghttp3_map *map, uint32_t new_tablelen, - uint32_t new_tablelenbits) { - uint32_t i; +static int map_resize(nghttp3_map *map, size_t new_hashbits) { + size_t i; nghttp3_map_bucket *new_table; nghttp3_map_bucket *bkt; + size_t tablelen; int rv; (void)rv; - new_table = - nghttp3_mem_calloc(map->mem, new_tablelen, sizeof(nghttp3_map_bucket)); + new_table = nghttp3_mem_calloc(map->mem, 1u << new_hashbits, + sizeof(nghttp3_map_bucket)); if (new_table == NULL) { return NGHTTP3_ERR_NOMEM; } - for (i = 0; i < map->tablelen; ++i) { - bkt = &map->table[i]; - if (bkt->data == NULL) { - continue; - } - rv = insert(new_table, new_tablelen, new_tablelenbits, bkt->hash, bkt->key, - bkt->data); + if (map->size) { + tablelen = 1u << map->hashbits; - assert(0 == rv); + for (i = 0; i < tablelen; ++i) { + bkt = &map->table[i]; + if (bkt->data == NULL) { + continue; + } + + rv = insert(new_table, new_hashbits, bkt->key, bkt->data); + + assert(0 == rv); + } } nghttp3_mem_free(map->mem, map->table); - map->tablelen = new_tablelen; - map->tablelenbits = new_tablelenbits; + map->hashbits = new_hashbits; map->table = new_table; return 0; @@ -222,48 +188,49 @@ int nghttp3_map_insert(nghttp3_map *map, nghttp3_map_key_type key, void *data) { assert(data); /* Load factor is 0.75 */ - if ((map->size + 1) * 4 > map->tablelen * 3) { - if (map->tablelen) { - rv = map_resize(map, map->tablelen * 2, map->tablelenbits + 1); + /* Under the very initial condition, that is map->size == 0 and + map->hashbits == 0, 4 > 3 still holds nicely. 
*/ + if ((map->size + 1) * 4 > (1u << map->hashbits) * 3) { + if (map->hashbits) { + rv = map_resize(map, map->hashbits + 1); if (rv != 0) { return rv; } } else { - rv = map_resize(map, 1 << NGHTTP3_INITIAL_TABLE_LENBITS, - NGHTTP3_INITIAL_TABLE_LENBITS); + rv = map_resize(map, NGHTTP3_INITIAL_TABLE_LENBITS); if (rv != 0) { return rv; } } } - rv = insert(map->table, map->tablelen, map->tablelenbits, hash(key), key, - data); + rv = insert(map->table, map->hashbits, key, data); if (rv != 0) { return rv; } + ++map->size; + return 0; } -void *nghttp3_map_find(nghttp3_map *map, nghttp3_map_key_type key) { - uint32_t h; +void *nghttp3_map_find(const nghttp3_map *map, nghttp3_map_key_type key) { size_t idx; nghttp3_map_bucket *bkt; - size_t d = 0; + size_t psl = 0; + size_t mask; if (map->size == 0) { return NULL; } - h = hash(key); - idx = h2idx(h, map->tablelenbits); + idx = hash(key, map->hashbits); + mask = (1u << map->hashbits) - 1; for (;;) { bkt = &map->table[idx]; - if (bkt->data == NULL || - d > distance(map->tablelen, map->tablelenbits, bkt, idx)) { + if (bkt->data == NULL || psl > bkt->psl) { return NULL; } @@ -271,50 +238,47 @@ void *nghttp3_map_find(nghttp3_map *map, nghttp3_map_key_type key) { return bkt->data; } - ++d; - idx = (idx + 1) & (map->tablelen - 1); + ++psl; + idx = (idx + 1) & mask; } } int nghttp3_map_remove(nghttp3_map *map, nghttp3_map_key_type key) { - uint32_t h; - size_t idx, didx; - nghttp3_map_bucket *bkt; - size_t d = 0; + size_t idx; + nghttp3_map_bucket *b, *bkt; + size_t psl = 0; + size_t mask; if (map->size == 0) { return NGHTTP3_ERR_INVALID_ARGUMENT; } - h = hash(key); - idx = h2idx(h, map->tablelenbits); + idx = hash(key, map->hashbits); + mask = (1u << map->hashbits) - 1; for (;;) { bkt = &map->table[idx]; - if (bkt->data == NULL || - d > distance(map->tablelen, map->tablelenbits, bkt, idx)) { + if (bkt->data == NULL || psl > bkt->psl) { return NGHTTP3_ERR_INVALID_ARGUMENT; } if (bkt->key == key) { - map_bucket_set_data(bkt, 0, 0, NULL); - - didx = idx; - idx = (idx + 1) & (map->tablelen - 1); + b = bkt; + idx = (idx + 1) & mask; for (;;) { bkt = &map->table[idx]; - if (bkt->data == NULL || - distance(map->tablelen, map->tablelenbits, bkt, idx) == 0) { + if (bkt->data == NULL || bkt->psl == 0) { + b->data = NULL; break; } - map->table[didx] = *bkt; - map_bucket_set_data(bkt, 0, 0, NULL); - didx = idx; + --bkt->psl; + *b = *bkt; + b = bkt; - idx = (idx + 1) & (map->tablelen - 1); + idx = (idx + 1) & mask; } --map->size; @@ -322,18 +286,18 @@ int nghttp3_map_remove(nghttp3_map *map, nghttp3_map_key_type key) { return 0; } - ++d; - idx = (idx + 1) & (map->tablelen - 1); + ++psl; + idx = (idx + 1) & mask; } } void nghttp3_map_clear(nghttp3_map *map) { - if (map->tablelen == 0) { + if (map->size == 0) { return; } - memset(map->table, 0, sizeof(*map->table) * map->tablelen); + memset(map->table, 0, sizeof(*map->table) * (1u << map->hashbits)); map->size = 0; } -size_t nghttp3_map_size(nghttp3_map *map) { return map->size; } +size_t nghttp3_map_size(const nghttp3_map *map) { return map->size; } diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_map.h b/deps/ngtcp2/nghttp3/lib/nghttp3_map.h index 7683cfeef3f33e..2b1a6ecab5cf4c 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_map.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_map.h @@ -29,7 +29,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -40,7 +40,7 @@ typedef uint64_t nghttp3_map_key_type; typedef struct nghttp3_map_bucket { - uint32_t hash; + uint32_t psl; 
nghttp3_map_key_type key; void *data; } nghttp3_map_bucket; @@ -49,33 +49,24 @@ typedef struct nghttp3_map { nghttp3_map_bucket *table; const nghttp3_mem *mem; size_t size; - uint32_t tablelen; - uint32_t tablelenbits; + size_t hashbits; } nghttp3_map; /* - * Initializes the map |map|. + * nghttp3_map_init initializes the map |map|. */ void nghttp3_map_init(nghttp3_map *map, const nghttp3_mem *mem); /* - * Deallocates any resources allocated for |map|. The stored entries - * are not freed by this function. Use nghttp3_map_each_free() to free - * each entries. + * nghttp3_map_free deallocates any resources allocated for |map|. + * The stored entries are not freed by this function. Use + * nghttp3_map_each() to free each entry. */ void nghttp3_map_free(nghttp3_map *map); /* - * Deallocates each entries using |func| function and any resources - * allocated for |map|. The |func| function is responsible for freeing - * given the |data| object. The |ptr| will be passed to the |func| as - * send argument. The return value of the |func| will be ignored. - */ -void nghttp3_map_each_free(nghttp3_map *map, int (*func)(void *data, void *ptr), - void *ptr); - -/* - * Inserts the new |data| with the |key| to the map |map|. + * nghttp3_map_insert inserts the new |data| with the |key| to the map + * |map|. * * This function returns 0 if it succeeds, or one of the following * negative error codes: @@ -83,57 +74,56 @@ void nghttp3_map_each_free(nghttp3_map *map, int (*func)(void *data, void *ptr), * NGHTTP3_ERR_INVALID_ARGUMENT * The item associated by |key| already exists. * NGHTTP3_ERR_NOMEM - * Out of memory + * Out of memory */ int nghttp3_map_insert(nghttp3_map *map, nghttp3_map_key_type key, void *data); /* - * Returns the data associated by the key |key|. If there is no such - * data, this function returns NULL. + * nghttp3_map_find returns the entry associated by the key |key|. If + * there is no such entry, this function returns NULL. */ -void *nghttp3_map_find(nghttp3_map *map, nghttp3_map_key_type key); +void *nghttp3_map_find(const nghttp3_map *map, nghttp3_map_key_type key); /* - * Removes the data associated by the key |key| from the |map|. The - * removed data is not freed by this function. + * nghttp3_map_remove removes the entry associated by the key |key| + * from the |map|. The removed entry is not freed by this function. * * This function returns 0 if it succeeds, or one of the following * negative error codes: * * NGHTTP3_ERR_INVALID_ARGUMENT - * The data associated by |key| does not exist. + * The entry associated by |key| does not exist. */ int nghttp3_map_remove(nghttp3_map *map, nghttp3_map_key_type key); /* - * Removes all entries from |map|. + * nghttp3_map_clear removes all entries from |map|. The removed + * entry is not freed by this function. */ void nghttp3_map_clear(nghttp3_map *map); /* - * Returns the number of items stored in the map |map|. + * nghttp3_map_size returns the number of items stored in the map + * |map|. */ -size_t nghttp3_map_size(nghttp3_map *map); +size_t nghttp3_map_size(const nghttp3_map *map); /* - * Applies the function |func| to each data in the |map| with the - * optional user supplied pointer |ptr|. + * nghttp3_map_each applies the function |func| to each entry in the + * |map| with the optional user supplied pointer |ptr|. * * If the |func| returns 0, this function calls the |func| with the - * next data. If the |func| returns nonzero, it will not call the + * next entry. 
If the |func| returns nonzero, it will not call the * |func| for further entries and return the return value of the * |func| immediately. Thus, this function returns 0 if all the * invocations of the |func| return 0, or nonzero value which the last * invocation of |func| returns. - * - * Don't use this function to free each data. Use - * nghttp3_map_each_free() instead. */ -int nghttp3_map_each(nghttp3_map *map, int (*func)(void *data, void *ptr), +int nghttp3_map_each(const nghttp3_map *map, int (*func)(void *data, void *ptr), void *ptr); #ifndef WIN32 -void nghttp3_map_print_distance(nghttp3_map *map); -#endif /* !WIN32 */ +void nghttp3_map_print_distance(const nghttp3_map *map); +#endif /* !defined(WIN32) */ -#endif /* NGHTTP3_MAP_H */ +#endif /* !defined(NGHTTP3_MAP_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_mem.c b/deps/ngtcp2/nghttp3/lib/nghttp3_mem.c index 0379e99b59cb74..687872f9020e78 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_mem.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_mem.c @@ -73,7 +73,7 @@ void *nghttp3_mem_calloc(const nghttp3_mem *mem, size_t nmemb, size_t size) { void *nghttp3_mem_realloc(const nghttp3_mem *mem, void *ptr, size_t size) { return mem->realloc(ptr, size, mem->user_data); } -#else /* MEMDEBUG */ +#else /* defined(MEMDEBUG) */ void *nghttp3_mem_malloc_debug(const nghttp3_mem *mem, size_t size, const char *func, const char *file, size_t line) { @@ -121,4 +121,4 @@ void *nghttp3_mem_realloc_debug(const nghttp3_mem *mem, void *ptr, size_t size, return nptr; } -#endif /* MEMDEBUG */ +#endif /* defined(MEMDEBUG) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_mem.h b/deps/ngtcp2/nghttp3/lib/nghttp3_mem.h index d6c3ada6f7e894..1ae53c91575f9f 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_mem.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_mem.h @@ -29,7 +29,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -40,7 +40,7 @@ void *nghttp3_mem_malloc(const nghttp3_mem *mem, size_t size); void nghttp3_mem_free(const nghttp3_mem *mem, void *ptr); void *nghttp3_mem_calloc(const nghttp3_mem *mem, size_t nmemb, size_t size); void *nghttp3_mem_realloc(const nghttp3_mem *mem, void *ptr, size_t size); -#else /* MEMDEBUG */ +#else /* defined(MEMDEBUG) */ void *nghttp3_mem_malloc_debug(const nghttp3_mem *mem, size_t size, const char *func, const char *file, size_t line); @@ -75,6 +75,6 @@ void *nghttp3_mem_realloc_debug(const nghttp3_mem *mem, void *ptr, size_t size, # define nghttp3_mem_realloc(MEM, PTR, SIZE) \ nghttp3_mem_realloc_debug((MEM), (PTR), (SIZE), __func__, __FILE__, \ __LINE__) -#endif /* MEMDEBUG */ +#endif /* defined(MEMDEBUG) */ -#endif /* NGHTTP3_MEM_H */ +#endif /* !defined(NGHTTP3_MEM_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_objalloc.h b/deps/ngtcp2/nghttp3/lib/nghttp3_objalloc.h index 02dff285f24060..4f8ffa093751a7 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_objalloc.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_objalloc.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -68,9 +68,9 @@ void nghttp3_objalloc_clear(nghttp3_objalloc *objalloc); #ifndef NOMEMPOOL # define nghttp3_objalloc_decl(NAME, TYPE, OPLENTFIELD) \ inline static void nghttp3_objalloc_##NAME##_init( \ - nghttp3_objalloc *objalloc, size_t nmemb, const nghttp3_mem *mem) { \ + nghttp3_objalloc *objalloc, size_t nmemb, const nghttp3_mem *mem) { \ nghttp3_objalloc_init( \ - objalloc, ((sizeof(TYPE) + 0xfu) & ~(uintptr_t)0xfu) * nmemb, mem); \ + objalloc, 
((sizeof(TYPE) + 0xfu) & ~(uintptr_t)0xfu) * nmemb, mem); \ } \ \ TYPE *nghttp3_objalloc_##NAME##_get(nghttp3_objalloc *objalloc); \ @@ -79,7 +79,7 @@ void nghttp3_objalloc_clear(nghttp3_objalloc *objalloc); size_t len); \ \ inline static void nghttp3_objalloc_##NAME##_release( \ - nghttp3_objalloc *objalloc, TYPE *obj) { \ + nghttp3_objalloc *objalloc, TYPE *obj) { \ nghttp3_opl_push(&objalloc->opl, &obj->OPLENTFIELD); \ } @@ -90,8 +90,8 @@ void nghttp3_objalloc_clear(nghttp3_objalloc *objalloc); int rv; \ \ if (!oplent) { \ - rv = nghttp3_balloc_get(&objalloc->balloc, (void **)&obj, \ - sizeof(TYPE)); \ + rv = \ + nghttp3_balloc_get(&objalloc->balloc, (void **)&obj, sizeof(TYPE)); \ if (rv != 0) { \ return NULL; \ } \ @@ -119,30 +119,30 @@ void nghttp3_objalloc_clear(nghttp3_objalloc *objalloc); \ return nghttp3_struct_of(oplent, TYPE, OPLENTFIELD); \ } -#else /* NOMEMPOOL */ +#else /* defined(NOMEMPOOL) */ # define nghttp3_objalloc_decl(NAME, TYPE, OPLENTFIELD) \ inline static void nghttp3_objalloc_##NAME##_init( \ - nghttp3_objalloc *objalloc, size_t nmemb, const nghttp3_mem *mem) { \ + nghttp3_objalloc *objalloc, size_t nmemb, const nghttp3_mem *mem) { \ nghttp3_objalloc_init( \ - objalloc, ((sizeof(TYPE) + 0xfu) & ~(uintptr_t)0xfu) * nmemb, mem); \ + objalloc, ((sizeof(TYPE) + 0xfu) & ~(uintptr_t)0xfu) * nmemb, mem); \ } \ \ inline static TYPE *nghttp3_objalloc_##NAME##_get( \ - nghttp3_objalloc *objalloc) { \ + nghttp3_objalloc *objalloc) { \ return nghttp3_mem_malloc(objalloc->balloc.mem, sizeof(TYPE)); \ } \ \ inline static TYPE *nghttp3_objalloc_##NAME##_len_get( \ - nghttp3_objalloc *objalloc, size_t len) { \ + nghttp3_objalloc *objalloc, size_t len) { \ return nghttp3_mem_malloc(objalloc->balloc.mem, len); \ } \ \ inline static void nghttp3_objalloc_##NAME##_release( \ - nghttp3_objalloc *objalloc, TYPE *obj) { \ + nghttp3_objalloc *objalloc, TYPE *obj) { \ nghttp3_mem_free(objalloc->balloc.mem, obj); \ } # define nghttp3_objalloc_def(NAME, TYPE, OPLENTFIELD) -#endif /* NOMEMPOOL */ +#endif /* defined(NOMEMPOOL) */ -#endif /* NGHTTP3_OBJALLOC_H */ +#endif /* !defined(NGHTTP3_OBJALLOC_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_opl.h b/deps/ngtcp2/nghttp3/lib/nghttp3_opl.h index 8c8a4f2051b25a..6609371dbfb89f 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_opl.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_opl.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -63,4 +63,4 @@ nghttp3_opl_entry *nghttp3_opl_pop(nghttp3_opl *opl); void nghttp3_opl_clear(nghttp3_opl *opl); -#endif /* NGHTTP3_OPL_H */ +#endif /* !defined(NGHTTP3_OPL_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_pq.c b/deps/ngtcp2/nghttp3/lib/nghttp3_pq.c index 5d09050ae63798..feefcd6fc717c3 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_pq.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_pq.c @@ -30,18 +30,21 @@ #include "nghttp3_macro.h" -void nghttp3_pq_init(nghttp3_pq *pq, nghttp3_less less, +void nghttp3_pq_init(nghttp3_pq *pq, nghttp3_pq_less less, const nghttp3_mem *mem) { - pq->mem = mem; - pq->capacity = 0; pq->q = NULL; + pq->mem = mem; pq->length = 0; + pq->capacity = 0; pq->less = less; } void nghttp3_pq_free(nghttp3_pq *pq) { + if (!pq) { + return; + } + nghttp3_mem_free(pq->mem, pq->q); - pq->q = NULL; } static void swap(nghttp3_pq *pq, size_t i, size_t j) { @@ -56,11 +59,13 @@ static void swap(nghttp3_pq *pq, size_t i, size_t j) { static void bubble_up(nghttp3_pq *pq, size_t index) { size_t parent; - while (index != 0) { + + while 
(index) { parent = (index - 1) / 2; if (!pq->less(pq->q[index], pq->q[parent])) { return; } + swap(pq, parent, index); index = parent; } @@ -71,20 +76,23 @@ int nghttp3_pq_push(nghttp3_pq *pq, nghttp3_pq_entry *item) { void *nq; size_t ncapacity; - ncapacity = nghttp3_max(4, (pq->capacity * 2)); + ncapacity = nghttp3_max_size(4, pq->capacity * 2); nq = nghttp3_mem_realloc(pq->mem, pq->q, ncapacity * sizeof(nghttp3_pq_entry *)); if (nq == NULL) { return NGHTTP3_ERR_NOMEM; } + pq->capacity = ncapacity; pq->q = nq; } + pq->q[pq->length] = item; item->index = pq->length; ++pq->length; - bubble_up(pq, pq->length - 1); + bubble_up(pq, item->index); + return 0; } @@ -95,32 +103,37 @@ nghttp3_pq_entry *nghttp3_pq_top(const nghttp3_pq *pq) { static void bubble_down(nghttp3_pq *pq, size_t index) { size_t i, j, minindex; + for (;;) { j = index * 2 + 1; minindex = index; + for (i = 0; i < 2; ++i, ++j) { if (j >= pq->length) { break; } + if (pq->less(pq->q[j], pq->q[minindex])) { minindex = j; } } + if (minindex == index) { return; } + swap(pq, index, minindex); index = minindex; } } void nghttp3_pq_pop(nghttp3_pq *pq) { - if (pq->length > 0) { - pq->q[0] = pq->q[pq->length - 1]; - pq->q[0]->index = 0; - --pq->length; - bubble_down(pq, 0); - } + assert(pq->length); + + pq->q[0] = pq->q[pq->length - 1]; + pq->q[0]->index = 0; + --pq->length; + bubble_down(pq, 0); } void nghttp3_pq_remove(nghttp3_pq *pq, nghttp3_pq_entry *item) { @@ -157,11 +170,13 @@ int nghttp3_pq_each(const nghttp3_pq *pq, nghttp3_pq_item_cb fun, void *arg) { if (pq->length == 0) { return 0; } + for (i = 0; i < pq->length; ++i) { if ((*fun)(pq->q[i], arg)) { return 1; } } + return 0; } diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_pq.h b/deps/ngtcp2/nghttp3/lib/nghttp3_pq.h index c1a54f505bbd03..3813b529473075 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_pq.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_pq.h @@ -29,7 +29,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -39,44 +39,48 @@ /* NGHTTP3_PQ_BAD_INDEX is the priority queue index which indicates that an entry is not queued. Assigning this value to - nghttp3_pq_entry.index can check that the entry is queued or not. */ + nghttp3_pq_entry.index can check that the entry is queued or + not. */ #define NGHTTP3_PQ_BAD_INDEX SIZE_MAX typedef struct nghttp3_pq_entry { size_t index; } nghttp3_pq_entry; -/* "less" function, return nonzero if |lhs| is less than |rhs|. */ -typedef int (*nghttp3_less)(const nghttp3_pq_entry *lhs, - const nghttp3_pq_entry *rhs); +/* nghttp3_pq_less is a "less" function, that returns nonzero if |lhs| + is considered to be less than |rhs|. */ +typedef int (*nghttp3_pq_less)(const nghttp3_pq_entry *lhs, + const nghttp3_pq_entry *rhs); typedef struct nghttp3_pq { - /* The pointer to the pointer to the item stored */ + /* q is a pointer to an array that stores the items. */ nghttp3_pq_entry **q; - /* Memory allocator */ + /* mem is a memory allocator. */ const nghttp3_mem *mem; - /* The number of items stored */ + /* length is the number of items stored. */ size_t length; - /* The maximum number of items this pq can store. This is - automatically extended when length is reached to this value. */ + /* capacity is the maximum number of items this queue can store. + This is automatically extended when length is reached to this + limit. */ size_t capacity; - /* The less function between items */ - nghttp3_less less; + /* less is the less function to compare items. 
*/ + nghttp3_pq_less less; } nghttp3_pq; /* - * Initializes priority queue |pq| with compare function |cmp|. + * nghttp3_pq_init initializes |pq| with compare function |cmp|. */ -void nghttp3_pq_init(nghttp3_pq *pq, nghttp3_less less, const nghttp3_mem *mem); +void nghttp3_pq_init(nghttp3_pq *pq, nghttp3_pq_less less, + const nghttp3_mem *mem); /* - * Deallocates any resources allocated for |pq|. The stored items are - * not freed by this function. + * nghttp3_pq_free deallocates any resources allocated for |pq|. The + * stored items are not freed by this function. */ void nghttp3_pq_free(nghttp3_pq *pq); /* - * Adds |item| to the priority queue |pq|. + * nghttp3_pq_push adds |item| to |pq|. * * This function returns 0 if it succeeds, or one of the following * negative error codes: @@ -87,43 +91,47 @@ void nghttp3_pq_free(nghttp3_pq *pq); int nghttp3_pq_push(nghttp3_pq *pq, nghttp3_pq_entry *item); /* - * Returns item at the top of the queue |pq|. It is undefined if the - * queue is empty. + * nghttp3_pq_top returns item at the top of |pq|. It is undefined if + * |pq| is empty. */ nghttp3_pq_entry *nghttp3_pq_top(const nghttp3_pq *pq); /* - * Pops item at the top of the queue |pq|. The popped item is not - * freed by this function. + * nghttp3_pq_pop pops item at the top of |pq|. The popped item is + * not freed by this function. It is undefined if |pq| is empty. */ void nghttp3_pq_pop(nghttp3_pq *pq); /* - * Returns nonzero if the queue |pq| is empty. + * nghttp3_pq_empty returns nonzero if |pq| is empty. */ int nghttp3_pq_empty(const nghttp3_pq *pq); /* - * Returns the number of items in the queue |pq|. + * nghttp3_pq_size returns the number of items |pq| contains. */ size_t nghttp3_pq_size(const nghttp3_pq *pq); typedef int (*nghttp3_pq_item_cb)(nghttp3_pq_entry *item, void *arg); /* - * Applies |fun| to each item in |pq|. The |arg| is passed as arg - * parameter to callback function. This function must not change the - * ordering key. If the return value from callback is nonzero, this - * function returns 1 immediately without iterating remaining items. - * Otherwise this function returns 0. + * nghttp3_pq_each applies |fun| to each item in |pq|. The |arg| is + * passed as arg parameter to callback function. This function must + * not change the ordering key. If the return value from callback is + * nonzero, this function returns 1 immediately without iterating + * remaining items. Otherwise this function returns 0. */ int nghttp3_pq_each(const nghttp3_pq *pq, nghttp3_pq_item_cb fun, void *arg); /* - * Removes |item| from priority queue. + * nghttp3_pq_remove removes |item| from |pq|. |pq| must contain + * |item| otherwise the behavior is undefined. */ void nghttp3_pq_remove(nghttp3_pq *pq, nghttp3_pq_entry *item); +/* + * nghttp3_pq_clear removes all items from |pq|. 
+ */ void nghttp3_pq_clear(nghttp3_pq *pq); -#endif /* NGHTTP3_PQ_H */ +#endif /* !defined(NGHTTP3_PQ_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_qpack.c b/deps/ngtcp2/nghttp3/lib/nghttp3_qpack.c index 428c06a82c6bfb..a1c7e7487c1a34 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_qpack.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_qpack.c @@ -41,287 +41,282 @@ #define NGHTTP3_QPACK_MAX_QPACK_STREAMS 2000 /* Make scalar initialization form of nghttp3_qpack_static_entry */ -#define MAKE_STATIC_ENT(I, T, H) \ - { I, T, H } +#define MAKE_STATIC_ENT(I, T, H) {I, T, H} /* Generated by mkstatichdtbl.py */ static nghttp3_qpack_static_entry token_stable[] = { - MAKE_STATIC_ENT(0, NGHTTP3_QPACK_TOKEN__AUTHORITY, 3153725150u), - MAKE_STATIC_ENT(15, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), - MAKE_STATIC_ENT(16, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), - MAKE_STATIC_ENT(17, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), - MAKE_STATIC_ENT(18, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), - MAKE_STATIC_ENT(19, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), - MAKE_STATIC_ENT(20, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), - MAKE_STATIC_ENT(21, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), - MAKE_STATIC_ENT(1, NGHTTP3_QPACK_TOKEN__PATH, 3292848686u), - MAKE_STATIC_ENT(22, NGHTTP3_QPACK_TOKEN__SCHEME, 2510477674u), - MAKE_STATIC_ENT(23, NGHTTP3_QPACK_TOKEN__SCHEME, 2510477674u), - MAKE_STATIC_ENT(24, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(25, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(26, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(27, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(28, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(63, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(64, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(65, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(66, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(67, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(68, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(69, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(70, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(71, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), - MAKE_STATIC_ENT(29, NGHTTP3_QPACK_TOKEN_ACCEPT, 136609321u), - MAKE_STATIC_ENT(30, NGHTTP3_QPACK_TOKEN_ACCEPT, 136609321u), - MAKE_STATIC_ENT(31, NGHTTP3_QPACK_TOKEN_ACCEPT_ENCODING, 3379649177u), - MAKE_STATIC_ENT(72, NGHTTP3_QPACK_TOKEN_ACCEPT_LANGUAGE, 1979086614u), - MAKE_STATIC_ENT(32, NGHTTP3_QPACK_TOKEN_ACCEPT_RANGES, 1713753958u), - MAKE_STATIC_ENT(73, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_CREDENTIALS, - 901040780u), - MAKE_STATIC_ENT(74, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_CREDENTIALS, - 901040780u), - MAKE_STATIC_ENT(33, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS, - 1524311232u), - MAKE_STATIC_ENT(34, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS, - 1524311232u), - MAKE_STATIC_ENT(75, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS, - 1524311232u), - MAKE_STATIC_ENT(76, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS, - 2175229868u), - MAKE_STATIC_ENT(77, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS, - 2175229868u), - MAKE_STATIC_ENT(78, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS, - 2175229868u), - MAKE_STATIC_ENT(35, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_ORIGIN, - 2710797292u), - MAKE_STATIC_ENT(79, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_EXPOSE_HEADERS, - 2449824425u), - MAKE_STATIC_ENT(80, 
NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_HEADERS, - 3599549072u), - MAKE_STATIC_ENT(81, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_METHOD, - 2417078055u), - MAKE_STATIC_ENT(82, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_METHOD, - 2417078055u), - MAKE_STATIC_ENT(2, NGHTTP3_QPACK_TOKEN_AGE, 742476188u), - MAKE_STATIC_ENT(83, NGHTTP3_QPACK_TOKEN_ALT_SVC, 2148877059u), - MAKE_STATIC_ENT(84, NGHTTP3_QPACK_TOKEN_AUTHORIZATION, 2436257726u), - MAKE_STATIC_ENT(36, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), - MAKE_STATIC_ENT(37, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), - MAKE_STATIC_ENT(38, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), - MAKE_STATIC_ENT(39, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), - MAKE_STATIC_ENT(40, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), - MAKE_STATIC_ENT(41, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), - MAKE_STATIC_ENT(3, NGHTTP3_QPACK_TOKEN_CONTENT_DISPOSITION, 3889184348u), - MAKE_STATIC_ENT(42, NGHTTP3_QPACK_TOKEN_CONTENT_ENCODING, 65203592u), - MAKE_STATIC_ENT(43, NGHTTP3_QPACK_TOKEN_CONTENT_ENCODING, 65203592u), - MAKE_STATIC_ENT(4, NGHTTP3_QPACK_TOKEN_CONTENT_LENGTH, 1308181789u), - MAKE_STATIC_ENT(85, NGHTTP3_QPACK_TOKEN_CONTENT_SECURITY_POLICY, - 1569039836u), - MAKE_STATIC_ENT(44, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), - MAKE_STATIC_ENT(45, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), - MAKE_STATIC_ENT(46, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), - MAKE_STATIC_ENT(47, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), - MAKE_STATIC_ENT(48, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), - MAKE_STATIC_ENT(49, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), - MAKE_STATIC_ENT(50, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), - MAKE_STATIC_ENT(51, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), - MAKE_STATIC_ENT(52, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), - MAKE_STATIC_ENT(53, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), - MAKE_STATIC_ENT(54, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), - MAKE_STATIC_ENT(5, NGHTTP3_QPACK_TOKEN_COOKIE, 2007449791u), - MAKE_STATIC_ENT(6, NGHTTP3_QPACK_TOKEN_DATE, 3564297305u), - MAKE_STATIC_ENT(86, NGHTTP3_QPACK_TOKEN_EARLY_DATA, 4080895051u), - MAKE_STATIC_ENT(7, NGHTTP3_QPACK_TOKEN_ETAG, 113792960u), - MAKE_STATIC_ENT(87, NGHTTP3_QPACK_TOKEN_EXPECT_CT, 1183214960u), - MAKE_STATIC_ENT(88, NGHTTP3_QPACK_TOKEN_FORWARDED, 1485178027u), - MAKE_STATIC_ENT(8, NGHTTP3_QPACK_TOKEN_IF_MODIFIED_SINCE, 2213050793u), - MAKE_STATIC_ENT(9, NGHTTP3_QPACK_TOKEN_IF_NONE_MATCH, 2536202615u), - MAKE_STATIC_ENT(89, NGHTTP3_QPACK_TOKEN_IF_RANGE, 2340978238u), - MAKE_STATIC_ENT(10, NGHTTP3_QPACK_TOKEN_LAST_MODIFIED, 3226950251u), - MAKE_STATIC_ENT(11, NGHTTP3_QPACK_TOKEN_LINK, 232457833u), - MAKE_STATIC_ENT(12, NGHTTP3_QPACK_TOKEN_LOCATION, 200649126u), - MAKE_STATIC_ENT(90, NGHTTP3_QPACK_TOKEN_ORIGIN, 3649018447u), - MAKE_STATIC_ENT(91, NGHTTP3_QPACK_TOKEN_PURPOSE, 4212263681u), - MAKE_STATIC_ENT(55, NGHTTP3_QPACK_TOKEN_RANGE, 4208725202u), - MAKE_STATIC_ENT(13, NGHTTP3_QPACK_TOKEN_REFERER, 3969579366u), - MAKE_STATIC_ENT(92, NGHTTP3_QPACK_TOKEN_SERVER, 1085029842u), - MAKE_STATIC_ENT(14, NGHTTP3_QPACK_TOKEN_SET_COOKIE, 1848371000u), - MAKE_STATIC_ENT(56, NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY, - 4138147361u), - MAKE_STATIC_ENT(57, NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY, - 4138147361u), - MAKE_STATIC_ENT(58, NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY, - 4138147361u), - MAKE_STATIC_ENT(93, NGHTTP3_QPACK_TOKEN_TIMING_ALLOW_ORIGIN, 2432297564u), - MAKE_STATIC_ENT(94, 
NGHTTP3_QPACK_TOKEN_UPGRADE_INSECURE_REQUESTS, - 2479169413u), - MAKE_STATIC_ENT(95, NGHTTP3_QPACK_TOKEN_USER_AGENT, 606444526u), - MAKE_STATIC_ENT(59, NGHTTP3_QPACK_TOKEN_VARY, 1085005381u), - MAKE_STATIC_ENT(60, NGHTTP3_QPACK_TOKEN_VARY, 1085005381u), - MAKE_STATIC_ENT(61, NGHTTP3_QPACK_TOKEN_X_CONTENT_TYPE_OPTIONS, - 3644557769u), - MAKE_STATIC_ENT(96, NGHTTP3_QPACK_TOKEN_X_FORWARDED_FOR, 2914187656u), - MAKE_STATIC_ENT(97, NGHTTP3_QPACK_TOKEN_X_FRAME_OPTIONS, 3993834824u), - MAKE_STATIC_ENT(98, NGHTTP3_QPACK_TOKEN_X_FRAME_OPTIONS, 3993834824u), - MAKE_STATIC_ENT(62, NGHTTP3_QPACK_TOKEN_X_XSS_PROTECTION, 2501058888u), + MAKE_STATIC_ENT(0, NGHTTP3_QPACK_TOKEN__AUTHORITY, 3153725150u), + MAKE_STATIC_ENT(15, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), + MAKE_STATIC_ENT(16, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), + MAKE_STATIC_ENT(17, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), + MAKE_STATIC_ENT(18, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), + MAKE_STATIC_ENT(19, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), + MAKE_STATIC_ENT(20, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), + MAKE_STATIC_ENT(21, NGHTTP3_QPACK_TOKEN__METHOD, 695666056u), + MAKE_STATIC_ENT(1, NGHTTP3_QPACK_TOKEN__PATH, 3292848686u), + MAKE_STATIC_ENT(22, NGHTTP3_QPACK_TOKEN__SCHEME, 2510477674u), + MAKE_STATIC_ENT(23, NGHTTP3_QPACK_TOKEN__SCHEME, 2510477674u), + MAKE_STATIC_ENT(24, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(25, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(26, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(27, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(28, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(63, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(64, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(65, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(66, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(67, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(68, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(69, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(70, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(71, NGHTTP3_QPACK_TOKEN__STATUS, 4000288983u), + MAKE_STATIC_ENT(29, NGHTTP3_QPACK_TOKEN_ACCEPT, 136609321u), + MAKE_STATIC_ENT(30, NGHTTP3_QPACK_TOKEN_ACCEPT, 136609321u), + MAKE_STATIC_ENT(31, NGHTTP3_QPACK_TOKEN_ACCEPT_ENCODING, 3379649177u), + MAKE_STATIC_ENT(72, NGHTTP3_QPACK_TOKEN_ACCEPT_LANGUAGE, 1979086614u), + MAKE_STATIC_ENT(32, NGHTTP3_QPACK_TOKEN_ACCEPT_RANGES, 1713753958u), + MAKE_STATIC_ENT(73, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_CREDENTIALS, + 901040780u), + MAKE_STATIC_ENT(74, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_CREDENTIALS, + 901040780u), + MAKE_STATIC_ENT(33, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS, + 1524311232u), + MAKE_STATIC_ENT(34, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS, + 1524311232u), + MAKE_STATIC_ENT(75, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS, + 1524311232u), + MAKE_STATIC_ENT(76, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS, + 2175229868u), + MAKE_STATIC_ENT(77, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS, + 2175229868u), + MAKE_STATIC_ENT(78, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS, + 2175229868u), + MAKE_STATIC_ENT(35, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_ORIGIN, + 2710797292u), + MAKE_STATIC_ENT(79, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_EXPOSE_HEADERS, + 2449824425u), + MAKE_STATIC_ENT(80, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_HEADERS, + 
3599549072u), + MAKE_STATIC_ENT(81, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_METHOD, + 2417078055u), + MAKE_STATIC_ENT(82, NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_METHOD, + 2417078055u), + MAKE_STATIC_ENT(2, NGHTTP3_QPACK_TOKEN_AGE, 742476188u), + MAKE_STATIC_ENT(83, NGHTTP3_QPACK_TOKEN_ALT_SVC, 2148877059u), + MAKE_STATIC_ENT(84, NGHTTP3_QPACK_TOKEN_AUTHORIZATION, 2436257726u), + MAKE_STATIC_ENT(36, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), + MAKE_STATIC_ENT(37, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), + MAKE_STATIC_ENT(38, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), + MAKE_STATIC_ENT(39, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), + MAKE_STATIC_ENT(40, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), + MAKE_STATIC_ENT(41, NGHTTP3_QPACK_TOKEN_CACHE_CONTROL, 1355326669u), + MAKE_STATIC_ENT(3, NGHTTP3_QPACK_TOKEN_CONTENT_DISPOSITION, 3889184348u), + MAKE_STATIC_ENT(42, NGHTTP3_QPACK_TOKEN_CONTENT_ENCODING, 65203592u), + MAKE_STATIC_ENT(43, NGHTTP3_QPACK_TOKEN_CONTENT_ENCODING, 65203592u), + MAKE_STATIC_ENT(4, NGHTTP3_QPACK_TOKEN_CONTENT_LENGTH, 1308181789u), + MAKE_STATIC_ENT(85, NGHTTP3_QPACK_TOKEN_CONTENT_SECURITY_POLICY, 1569039836u), + MAKE_STATIC_ENT(44, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), + MAKE_STATIC_ENT(45, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), + MAKE_STATIC_ENT(46, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), + MAKE_STATIC_ENT(47, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), + MAKE_STATIC_ENT(48, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), + MAKE_STATIC_ENT(49, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), + MAKE_STATIC_ENT(50, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), + MAKE_STATIC_ENT(51, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), + MAKE_STATIC_ENT(52, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), + MAKE_STATIC_ENT(53, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), + MAKE_STATIC_ENT(54, NGHTTP3_QPACK_TOKEN_CONTENT_TYPE, 4244048277u), + MAKE_STATIC_ENT(5, NGHTTP3_QPACK_TOKEN_COOKIE, 2007449791u), + MAKE_STATIC_ENT(6, NGHTTP3_QPACK_TOKEN_DATE, 3564297305u), + MAKE_STATIC_ENT(86, NGHTTP3_QPACK_TOKEN_EARLY_DATA, 4080895051u), + MAKE_STATIC_ENT(7, NGHTTP3_QPACK_TOKEN_ETAG, 113792960u), + MAKE_STATIC_ENT(87, NGHTTP3_QPACK_TOKEN_EXPECT_CT, 1183214960u), + MAKE_STATIC_ENT(88, NGHTTP3_QPACK_TOKEN_FORWARDED, 1485178027u), + MAKE_STATIC_ENT(8, NGHTTP3_QPACK_TOKEN_IF_MODIFIED_SINCE, 2213050793u), + MAKE_STATIC_ENT(9, NGHTTP3_QPACK_TOKEN_IF_NONE_MATCH, 2536202615u), + MAKE_STATIC_ENT(89, NGHTTP3_QPACK_TOKEN_IF_RANGE, 2340978238u), + MAKE_STATIC_ENT(10, NGHTTP3_QPACK_TOKEN_LAST_MODIFIED, 3226950251u), + MAKE_STATIC_ENT(11, NGHTTP3_QPACK_TOKEN_LINK, 232457833u), + MAKE_STATIC_ENT(12, NGHTTP3_QPACK_TOKEN_LOCATION, 200649126u), + MAKE_STATIC_ENT(90, NGHTTP3_QPACK_TOKEN_ORIGIN, 3649018447u), + MAKE_STATIC_ENT(91, NGHTTP3_QPACK_TOKEN_PURPOSE, 4212263681u), + MAKE_STATIC_ENT(55, NGHTTP3_QPACK_TOKEN_RANGE, 4208725202u), + MAKE_STATIC_ENT(13, NGHTTP3_QPACK_TOKEN_REFERER, 3969579366u), + MAKE_STATIC_ENT(92, NGHTTP3_QPACK_TOKEN_SERVER, 1085029842u), + MAKE_STATIC_ENT(14, NGHTTP3_QPACK_TOKEN_SET_COOKIE, 1848371000u), + MAKE_STATIC_ENT(56, NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY, + 4138147361u), + MAKE_STATIC_ENT(57, NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY, + 4138147361u), + MAKE_STATIC_ENT(58, NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY, + 4138147361u), + MAKE_STATIC_ENT(93, NGHTTP3_QPACK_TOKEN_TIMING_ALLOW_ORIGIN, 2432297564u), + MAKE_STATIC_ENT(94, NGHTTP3_QPACK_TOKEN_UPGRADE_INSECURE_REQUESTS, + 2479169413u), 
+ MAKE_STATIC_ENT(95, NGHTTP3_QPACK_TOKEN_USER_AGENT, 606444526u), + MAKE_STATIC_ENT(59, NGHTTP3_QPACK_TOKEN_VARY, 1085005381u), + MAKE_STATIC_ENT(60, NGHTTP3_QPACK_TOKEN_VARY, 1085005381u), + MAKE_STATIC_ENT(61, NGHTTP3_QPACK_TOKEN_X_CONTENT_TYPE_OPTIONS, 3644557769u), + MAKE_STATIC_ENT(96, NGHTTP3_QPACK_TOKEN_X_FORWARDED_FOR, 2914187656u), + MAKE_STATIC_ENT(97, NGHTTP3_QPACK_TOKEN_X_FRAME_OPTIONS, 3993834824u), + MAKE_STATIC_ENT(98, NGHTTP3_QPACK_TOKEN_X_FRAME_OPTIONS, 3993834824u), + MAKE_STATIC_ENT(62, NGHTTP3_QPACK_TOKEN_X_XSS_PROTECTION, 2501058888u), }; /* Make scalar initialization form of nghttp3_qpack_static_entry */ #define MAKE_STATIC_HD(N, V, T) \ { \ {NULL, (uint8_t *)(N), sizeof((N)) - 1, -1}, \ - {NULL, (uint8_t *)(V), sizeof((V)) - 1, -1}, T \ + {NULL, (uint8_t *)(V), sizeof((V)) - 1, -1}, \ + T, \ } static nghttp3_qpack_static_header stable[] = { - MAKE_STATIC_HD(":authority", "", NGHTTP3_QPACK_TOKEN__AUTHORITY), - MAKE_STATIC_HD(":path", "/", NGHTTP3_QPACK_TOKEN__PATH), - MAKE_STATIC_HD("age", "0", NGHTTP3_QPACK_TOKEN_AGE), - MAKE_STATIC_HD("content-disposition", "", - NGHTTP3_QPACK_TOKEN_CONTENT_DISPOSITION), - MAKE_STATIC_HD("content-length", "0", NGHTTP3_QPACK_TOKEN_CONTENT_LENGTH), - MAKE_STATIC_HD("cookie", "", NGHTTP3_QPACK_TOKEN_COOKIE), - MAKE_STATIC_HD("date", "", NGHTTP3_QPACK_TOKEN_DATE), - MAKE_STATIC_HD("etag", "", NGHTTP3_QPACK_TOKEN_ETAG), - MAKE_STATIC_HD("if-modified-since", "", - NGHTTP3_QPACK_TOKEN_IF_MODIFIED_SINCE), - MAKE_STATIC_HD("if-none-match", "", NGHTTP3_QPACK_TOKEN_IF_NONE_MATCH), - MAKE_STATIC_HD("last-modified", "", NGHTTP3_QPACK_TOKEN_LAST_MODIFIED), - MAKE_STATIC_HD("link", "", NGHTTP3_QPACK_TOKEN_LINK), - MAKE_STATIC_HD("location", "", NGHTTP3_QPACK_TOKEN_LOCATION), - MAKE_STATIC_HD("referer", "", NGHTTP3_QPACK_TOKEN_REFERER), - MAKE_STATIC_HD("set-cookie", "", NGHTTP3_QPACK_TOKEN_SET_COOKIE), - MAKE_STATIC_HD(":method", "CONNECT", NGHTTP3_QPACK_TOKEN__METHOD), - MAKE_STATIC_HD(":method", "DELETE", NGHTTP3_QPACK_TOKEN__METHOD), - MAKE_STATIC_HD(":method", "GET", NGHTTP3_QPACK_TOKEN__METHOD), - MAKE_STATIC_HD(":method", "HEAD", NGHTTP3_QPACK_TOKEN__METHOD), - MAKE_STATIC_HD(":method", "OPTIONS", NGHTTP3_QPACK_TOKEN__METHOD), - MAKE_STATIC_HD(":method", "POST", NGHTTP3_QPACK_TOKEN__METHOD), - MAKE_STATIC_HD(":method", "PUT", NGHTTP3_QPACK_TOKEN__METHOD), - MAKE_STATIC_HD(":scheme", "http", NGHTTP3_QPACK_TOKEN__SCHEME), - MAKE_STATIC_HD(":scheme", "https", NGHTTP3_QPACK_TOKEN__SCHEME), - MAKE_STATIC_HD(":status", "103", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "200", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "304", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "404", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "503", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD("accept", "*/*", NGHTTP3_QPACK_TOKEN_ACCEPT), - MAKE_STATIC_HD("accept", "application/dns-message", - NGHTTP3_QPACK_TOKEN_ACCEPT), - MAKE_STATIC_HD("accept-encoding", "gzip, deflate, br", - NGHTTP3_QPACK_TOKEN_ACCEPT_ENCODING), - MAKE_STATIC_HD("accept-ranges", "bytes", NGHTTP3_QPACK_TOKEN_ACCEPT_RANGES), - MAKE_STATIC_HD("access-control-allow-headers", "cache-control", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS), - MAKE_STATIC_HD("access-control-allow-headers", "content-type", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS), - MAKE_STATIC_HD("access-control-allow-origin", "*", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_ORIGIN), - MAKE_STATIC_HD("cache-control", "max-age=0", - NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), 
- MAKE_STATIC_HD("cache-control", "max-age=2592000", - NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), - MAKE_STATIC_HD("cache-control", "max-age=604800", - NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), - MAKE_STATIC_HD("cache-control", "no-cache", - NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), - MAKE_STATIC_HD("cache-control", "no-store", - NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), - MAKE_STATIC_HD("cache-control", "public, max-age=31536000", - NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), - MAKE_STATIC_HD("content-encoding", "br", - NGHTTP3_QPACK_TOKEN_CONTENT_ENCODING), - MAKE_STATIC_HD("content-encoding", "gzip", - NGHTTP3_QPACK_TOKEN_CONTENT_ENCODING), - MAKE_STATIC_HD("content-type", "application/dns-message", - NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), - MAKE_STATIC_HD("content-type", "application/javascript", - NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), - MAKE_STATIC_HD("content-type", "application/json", - NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), - MAKE_STATIC_HD("content-type", "application/x-www-form-urlencoded", - NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), - MAKE_STATIC_HD("content-type", "image/gif", - NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), - MAKE_STATIC_HD("content-type", "image/jpeg", - NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), - MAKE_STATIC_HD("content-type", "image/png", - NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), - MAKE_STATIC_HD("content-type", "text/css", - NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), - MAKE_STATIC_HD("content-type", "text/html; charset=utf-8", - NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), - MAKE_STATIC_HD("content-type", "text/plain", - NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), - MAKE_STATIC_HD("content-type", "text/plain;charset=utf-8", - NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), - MAKE_STATIC_HD("range", "bytes=0-", NGHTTP3_QPACK_TOKEN_RANGE), - MAKE_STATIC_HD("strict-transport-security", "max-age=31536000", - NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY), - MAKE_STATIC_HD("strict-transport-security", - "max-age=31536000; includesubdomains", - NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY), - MAKE_STATIC_HD("strict-transport-security", - "max-age=31536000; includesubdomains; preload", - NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY), - MAKE_STATIC_HD("vary", "accept-encoding", NGHTTP3_QPACK_TOKEN_VARY), - MAKE_STATIC_HD("vary", "origin", NGHTTP3_QPACK_TOKEN_VARY), - MAKE_STATIC_HD("x-content-type-options", "nosniff", - NGHTTP3_QPACK_TOKEN_X_CONTENT_TYPE_OPTIONS), - MAKE_STATIC_HD("x-xss-protection", "1; mode=block", - NGHTTP3_QPACK_TOKEN_X_XSS_PROTECTION), - MAKE_STATIC_HD(":status", "100", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "204", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "206", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "302", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "400", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "403", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "421", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "425", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD(":status", "500", NGHTTP3_QPACK_TOKEN__STATUS), - MAKE_STATIC_HD("accept-language", "", NGHTTP3_QPACK_TOKEN_ACCEPT_LANGUAGE), - MAKE_STATIC_HD("access-control-allow-credentials", "FALSE", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_CREDENTIALS), - MAKE_STATIC_HD("access-control-allow-credentials", "TRUE", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_CREDENTIALS), - MAKE_STATIC_HD("access-control-allow-headers", "*", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS), - MAKE_STATIC_HD("access-control-allow-methods", "get", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS), - 
MAKE_STATIC_HD("access-control-allow-methods", "get, post, options", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS), - MAKE_STATIC_HD("access-control-allow-methods", "options", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS), - MAKE_STATIC_HD("access-control-expose-headers", "content-length", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_EXPOSE_HEADERS), - MAKE_STATIC_HD("access-control-request-headers", "content-type", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_HEADERS), - MAKE_STATIC_HD("access-control-request-method", "get", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_METHOD), - MAKE_STATIC_HD("access-control-request-method", "post", - NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_METHOD), - MAKE_STATIC_HD("alt-svc", "clear", NGHTTP3_QPACK_TOKEN_ALT_SVC), - MAKE_STATIC_HD("authorization", "", NGHTTP3_QPACK_TOKEN_AUTHORIZATION), - MAKE_STATIC_HD("content-security-policy", - "script-src 'none'; object-src 'none'; base-uri 'none'", - NGHTTP3_QPACK_TOKEN_CONTENT_SECURITY_POLICY), - MAKE_STATIC_HD("early-data", "1", NGHTTP3_QPACK_TOKEN_EARLY_DATA), - MAKE_STATIC_HD("expect-ct", "", NGHTTP3_QPACK_TOKEN_EXPECT_CT), - MAKE_STATIC_HD("forwarded", "", NGHTTP3_QPACK_TOKEN_FORWARDED), - MAKE_STATIC_HD("if-range", "", NGHTTP3_QPACK_TOKEN_IF_RANGE), - MAKE_STATIC_HD("origin", "", NGHTTP3_QPACK_TOKEN_ORIGIN), - MAKE_STATIC_HD("purpose", "prefetch", NGHTTP3_QPACK_TOKEN_PURPOSE), - MAKE_STATIC_HD("server", "", NGHTTP3_QPACK_TOKEN_SERVER), - MAKE_STATIC_HD("timing-allow-origin", "*", - NGHTTP3_QPACK_TOKEN_TIMING_ALLOW_ORIGIN), - MAKE_STATIC_HD("upgrade-insecure-requests", "1", - NGHTTP3_QPACK_TOKEN_UPGRADE_INSECURE_REQUESTS), - MAKE_STATIC_HD("user-agent", "", NGHTTP3_QPACK_TOKEN_USER_AGENT), - MAKE_STATIC_HD("x-forwarded-for", "", NGHTTP3_QPACK_TOKEN_X_FORWARDED_FOR), - MAKE_STATIC_HD("x-frame-options", "deny", - NGHTTP3_QPACK_TOKEN_X_FRAME_OPTIONS), - MAKE_STATIC_HD("x-frame-options", "sameorigin", - NGHTTP3_QPACK_TOKEN_X_FRAME_OPTIONS), + MAKE_STATIC_HD(":authority", "", NGHTTP3_QPACK_TOKEN__AUTHORITY), + MAKE_STATIC_HD(":path", "/", NGHTTP3_QPACK_TOKEN__PATH), + MAKE_STATIC_HD("age", "0", NGHTTP3_QPACK_TOKEN_AGE), + MAKE_STATIC_HD("content-disposition", "", + NGHTTP3_QPACK_TOKEN_CONTENT_DISPOSITION), + MAKE_STATIC_HD("content-length", "0", NGHTTP3_QPACK_TOKEN_CONTENT_LENGTH), + MAKE_STATIC_HD("cookie", "", NGHTTP3_QPACK_TOKEN_COOKIE), + MAKE_STATIC_HD("date", "", NGHTTP3_QPACK_TOKEN_DATE), + MAKE_STATIC_HD("etag", "", NGHTTP3_QPACK_TOKEN_ETAG), + MAKE_STATIC_HD("if-modified-since", "", + NGHTTP3_QPACK_TOKEN_IF_MODIFIED_SINCE), + MAKE_STATIC_HD("if-none-match", "", NGHTTP3_QPACK_TOKEN_IF_NONE_MATCH), + MAKE_STATIC_HD("last-modified", "", NGHTTP3_QPACK_TOKEN_LAST_MODIFIED), + MAKE_STATIC_HD("link", "", NGHTTP3_QPACK_TOKEN_LINK), + MAKE_STATIC_HD("location", "", NGHTTP3_QPACK_TOKEN_LOCATION), + MAKE_STATIC_HD("referer", "", NGHTTP3_QPACK_TOKEN_REFERER), + MAKE_STATIC_HD("set-cookie", "", NGHTTP3_QPACK_TOKEN_SET_COOKIE), + MAKE_STATIC_HD(":method", "CONNECT", NGHTTP3_QPACK_TOKEN__METHOD), + MAKE_STATIC_HD(":method", "DELETE", NGHTTP3_QPACK_TOKEN__METHOD), + MAKE_STATIC_HD(":method", "GET", NGHTTP3_QPACK_TOKEN__METHOD), + MAKE_STATIC_HD(":method", "HEAD", NGHTTP3_QPACK_TOKEN__METHOD), + MAKE_STATIC_HD(":method", "OPTIONS", NGHTTP3_QPACK_TOKEN__METHOD), + MAKE_STATIC_HD(":method", "POST", NGHTTP3_QPACK_TOKEN__METHOD), + MAKE_STATIC_HD(":method", "PUT", NGHTTP3_QPACK_TOKEN__METHOD), + MAKE_STATIC_HD(":scheme", "http", NGHTTP3_QPACK_TOKEN__SCHEME), + MAKE_STATIC_HD(":scheme", "https", 
NGHTTP3_QPACK_TOKEN__SCHEME), + MAKE_STATIC_HD(":status", "103", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "200", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "304", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "404", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "503", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD("accept", "*/*", NGHTTP3_QPACK_TOKEN_ACCEPT), + MAKE_STATIC_HD("accept", "application/dns-message", + NGHTTP3_QPACK_TOKEN_ACCEPT), + MAKE_STATIC_HD("accept-encoding", "gzip, deflate, br", + NGHTTP3_QPACK_TOKEN_ACCEPT_ENCODING), + MAKE_STATIC_HD("accept-ranges", "bytes", NGHTTP3_QPACK_TOKEN_ACCEPT_RANGES), + MAKE_STATIC_HD("access-control-allow-headers", "cache-control", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS), + MAKE_STATIC_HD("access-control-allow-headers", "content-type", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS), + MAKE_STATIC_HD("access-control-allow-origin", "*", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_ORIGIN), + MAKE_STATIC_HD("cache-control", "max-age=0", + NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), + MAKE_STATIC_HD("cache-control", "max-age=2592000", + NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), + MAKE_STATIC_HD("cache-control", "max-age=604800", + NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), + MAKE_STATIC_HD("cache-control", "no-cache", + NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), + MAKE_STATIC_HD("cache-control", "no-store", + NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), + MAKE_STATIC_HD("cache-control", "public, max-age=31536000", + NGHTTP3_QPACK_TOKEN_CACHE_CONTROL), + MAKE_STATIC_HD("content-encoding", "br", + NGHTTP3_QPACK_TOKEN_CONTENT_ENCODING), + MAKE_STATIC_HD("content-encoding", "gzip", + NGHTTP3_QPACK_TOKEN_CONTENT_ENCODING), + MAKE_STATIC_HD("content-type", "application/dns-message", + NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), + MAKE_STATIC_HD("content-type", "application/javascript", + NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), + MAKE_STATIC_HD("content-type", "application/json", + NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), + MAKE_STATIC_HD("content-type", "application/x-www-form-urlencoded", + NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), + MAKE_STATIC_HD("content-type", "image/gif", NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), + MAKE_STATIC_HD("content-type", "image/jpeg", + NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), + MAKE_STATIC_HD("content-type", "image/png", NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), + MAKE_STATIC_HD("content-type", "text/css", NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), + MAKE_STATIC_HD("content-type", "text/html; charset=utf-8", + NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), + MAKE_STATIC_HD("content-type", "text/plain", + NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), + MAKE_STATIC_HD("content-type", "text/plain;charset=utf-8", + NGHTTP3_QPACK_TOKEN_CONTENT_TYPE), + MAKE_STATIC_HD("range", "bytes=0-", NGHTTP3_QPACK_TOKEN_RANGE), + MAKE_STATIC_HD("strict-transport-security", "max-age=31536000", + NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY), + MAKE_STATIC_HD("strict-transport-security", + "max-age=31536000; includesubdomains", + NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY), + MAKE_STATIC_HD("strict-transport-security", + "max-age=31536000; includesubdomains; preload", + NGHTTP3_QPACK_TOKEN_STRICT_TRANSPORT_SECURITY), + MAKE_STATIC_HD("vary", "accept-encoding", NGHTTP3_QPACK_TOKEN_VARY), + MAKE_STATIC_HD("vary", "origin", NGHTTP3_QPACK_TOKEN_VARY), + MAKE_STATIC_HD("x-content-type-options", "nosniff", + NGHTTP3_QPACK_TOKEN_X_CONTENT_TYPE_OPTIONS), + MAKE_STATIC_HD("x-xss-protection", "1; mode=block", + NGHTTP3_QPACK_TOKEN_X_XSS_PROTECTION), + MAKE_STATIC_HD(":status", "100", 
NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "204", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "206", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "302", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "400", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "403", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "421", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "425", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD(":status", "500", NGHTTP3_QPACK_TOKEN__STATUS), + MAKE_STATIC_HD("accept-language", "", NGHTTP3_QPACK_TOKEN_ACCEPT_LANGUAGE), + MAKE_STATIC_HD("access-control-allow-credentials", "FALSE", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_CREDENTIALS), + MAKE_STATIC_HD("access-control-allow-credentials", "TRUE", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_CREDENTIALS), + MAKE_STATIC_HD("access-control-allow-headers", "*", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_HEADERS), + MAKE_STATIC_HD("access-control-allow-methods", "get", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS), + MAKE_STATIC_HD("access-control-allow-methods", "get, post, options", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS), + MAKE_STATIC_HD("access-control-allow-methods", "options", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_ALLOW_METHODS), + MAKE_STATIC_HD("access-control-expose-headers", "content-length", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_EXPOSE_HEADERS), + MAKE_STATIC_HD("access-control-request-headers", "content-type", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_HEADERS), + MAKE_STATIC_HD("access-control-request-method", "get", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_METHOD), + MAKE_STATIC_HD("access-control-request-method", "post", + NGHTTP3_QPACK_TOKEN_ACCESS_CONTROL_REQUEST_METHOD), + MAKE_STATIC_HD("alt-svc", "clear", NGHTTP3_QPACK_TOKEN_ALT_SVC), + MAKE_STATIC_HD("authorization", "", NGHTTP3_QPACK_TOKEN_AUTHORIZATION), + MAKE_STATIC_HD("content-security-policy", + "script-src 'none'; object-src 'none'; base-uri 'none'", + NGHTTP3_QPACK_TOKEN_CONTENT_SECURITY_POLICY), + MAKE_STATIC_HD("early-data", "1", NGHTTP3_QPACK_TOKEN_EARLY_DATA), + MAKE_STATIC_HD("expect-ct", "", NGHTTP3_QPACK_TOKEN_EXPECT_CT), + MAKE_STATIC_HD("forwarded", "", NGHTTP3_QPACK_TOKEN_FORWARDED), + MAKE_STATIC_HD("if-range", "", NGHTTP3_QPACK_TOKEN_IF_RANGE), + MAKE_STATIC_HD("origin", "", NGHTTP3_QPACK_TOKEN_ORIGIN), + MAKE_STATIC_HD("purpose", "prefetch", NGHTTP3_QPACK_TOKEN_PURPOSE), + MAKE_STATIC_HD("server", "", NGHTTP3_QPACK_TOKEN_SERVER), + MAKE_STATIC_HD("timing-allow-origin", "*", + NGHTTP3_QPACK_TOKEN_TIMING_ALLOW_ORIGIN), + MAKE_STATIC_HD("upgrade-insecure-requests", "1", + NGHTTP3_QPACK_TOKEN_UPGRADE_INSECURE_REQUESTS), + MAKE_STATIC_HD("user-agent", "", NGHTTP3_QPACK_TOKEN_USER_AGENT), + MAKE_STATIC_HD("x-forwarded-for", "", NGHTTP3_QPACK_TOKEN_X_FORWARDED_FOR), + MAKE_STATIC_HD("x-frame-options", "deny", + NGHTTP3_QPACK_TOKEN_X_FRAME_OPTIONS), + MAKE_STATIC_HD("x-frame-options", "sameorigin", + NGHTTP3_QPACK_TOKEN_X_FRAME_OPTIONS), }; static int memeq(const void *s1, const void *s2, size_t n) { @@ -884,9 +879,9 @@ static void qpack_context_free(nghttp3_qpack_context *ctx) { static int ref_min_cnt_less(const nghttp3_pq_entry *lhsx, const nghttp3_pq_entry *rhsx) { nghttp3_qpack_header_block_ref *lhs = - nghttp3_struct_of(lhsx, nghttp3_qpack_header_block_ref, min_cnts_pe); + nghttp3_struct_of(lhsx, nghttp3_qpack_header_block_ref, min_cnts_pe); nghttp3_qpack_header_block_ref *rhs = - nghttp3_struct_of(rhsx, nghttp3_qpack_header_block_ref, 
min_cnts_pe); + nghttp3_struct_of(rhsx, nghttp3_qpack_header_block_ref, min_cnts_pe); return lhs->min_cnt < rhs->min_cnt; } @@ -926,6 +921,7 @@ int nghttp3_qpack_encoder_init(nghttp3_qpack_encoder *encoder, encoder->opcode = 0; encoder->min_dtable_update = SIZE_MAX; encoder->last_max_dtable_update = 0; + encoder->uninterrupted_decoderlen = 0; encoder->flags = NGHTTP3_QPACK_ENCODER_FLAG_NONE; nghttp3_qpack_read_state_reset(&encoder->rstate); @@ -943,16 +939,16 @@ static int map_stream_free(void *data, void *ptr) { void nghttp3_qpack_encoder_free(nghttp3_qpack_encoder *encoder) { nghttp3_pq_free(&encoder->min_cnts); nghttp3_ksl_free(&encoder->blocked_streams); - nghttp3_map_each_free(&encoder->streams, map_stream_free, - (void *)encoder->ctx.mem); + nghttp3_map_each(&encoder->streams, map_stream_free, + (void *)encoder->ctx.mem); nghttp3_map_free(&encoder->streams); qpack_context_free(&encoder->ctx); } void nghttp3_qpack_encoder_set_max_dtable_capacity( - nghttp3_qpack_encoder *encoder, size_t max_dtable_capacity) { - max_dtable_capacity = - nghttp3_min(max_dtable_capacity, encoder->ctx.hard_max_dtable_capacity); + nghttp3_qpack_encoder *encoder, size_t max_dtable_capacity) { + max_dtable_capacity = nghttp3_min_size(max_dtable_capacity, + encoder->ctx.hard_max_dtable_capacity); if (encoder->ctx.max_dtable_capacity == max_dtable_capacity) { return; @@ -968,7 +964,7 @@ void nghttp3_qpack_encoder_set_max_dtable_capacity( } void nghttp3_qpack_encoder_set_max_blocked_streams( - nghttp3_qpack_encoder *encoder, size_t max_blocked_streams) { + nghttp3_qpack_encoder *encoder, size_t max_blocked_streams) { encoder->ctx.max_blocked_streams = max_blocked_streams; } @@ -977,7 +973,7 @@ uint64_t nghttp3_qpack_encoder_get_min_cnt(nghttp3_qpack_encoder *encoder) { return nghttp3_struct_of(nghttp3_pq_top(&encoder->min_cnts), nghttp3_qpack_header_block_ref, min_cnts_pe) - ->min_cnt; + ->min_cnt; } void nghttp3_qpack_encoder_shrink_dtable(nghttp3_qpack_encoder *encoder) { @@ -1003,7 +999,7 @@ void nghttp3_qpack_encoder_shrink_dtable(nghttp3_qpack_encoder *encoder) { } encoder->ctx.dtable_size -= - table_space(ent->nv.name->len, ent->nv.value->len); + table_space(ent->nv.name->len, ent->nv.value->len); nghttp3_ringbuf_pop_back(dtable); qpack_map_remove(&encoder->dtable_map, ent); @@ -1088,8 +1084,8 @@ static void qpack_encoder_remove_stream(nghttp3_qpack_encoder *encoder, len = nghttp3_ringbuf_len(&stream->refs); for (i = 0; i < len; ++i) { - ref = *(nghttp3_qpack_header_block_ref **)nghttp3_ringbuf_get(&stream->refs, - i); + ref = + *(nghttp3_qpack_header_block_ref **)nghttp3_ringbuf_get(&stream->refs, i); assert(ref->min_cnts_pe.index != NGHTTP3_PQ_BAD_INDEX); @@ -1098,11 +1094,9 @@ static void qpack_encoder_remove_stream(nghttp3_qpack_encoder *encoder, } /* - * reserve_buf_internal ensures that |buf| contains at least - * |extra_size| of free space. In other words, if this function - * succeeds, nghttp3_buf_left(buf) >= extra_size holds. |min_size| is - * the minimum size of buffer. The allocated buffer has at least - * |min_size| bytes. + * reserve_buf ensures that |buf| contains at least |extra_size| of + * free space. In other words, if this function succeeds, + * nghttp3_buf_left(buf) >= extra_size holds. * * This function returns 0 if it succeeds, or one of the following * negative error codes: @@ -1110,31 +1104,37 @@ static void qpack_encoder_remove_stream(nghttp3_qpack_encoder *encoder, * NGHTTP3_ERR_NOMEM * Out of memory. 
*/ -static int reserve_buf_internal(nghttp3_buf *buf, size_t extra_size, - size_t min_size, const nghttp3_mem *mem) { +static int reserve_buf(nghttp3_buf *buf, size_t extra_size, + const nghttp3_mem *mem) { size_t left = nghttp3_buf_left(buf); - size_t n = min_size, need; + size_t n = 32; if (left >= extra_size) { return 0; } - need = nghttp3_buf_cap(buf) + extra_size - left; - - for (; n < need; n *= 2) - ; + n = nghttp3_max_size(n, nghttp3_buf_cap(buf) + extra_size - left); - return nghttp3_buf_reserve(buf, n, mem); -} + /* Check whether we are requesting too much memory */ + if (n > (1u << 31)) { + return NGHTTP3_ERR_NOMEM; + } -static int reserve_buf_small(nghttp3_buf *buf, size_t extra_size, - const nghttp3_mem *mem) { - return reserve_buf_internal(buf, extra_size, 32, mem); -} +#ifndef WIN32 + n = 1u << (32 - __builtin_clz((uint32_t)n - 1)); +#else /* defined(WIN32) */ + /* Round up to the next highest power of 2 from Bit Twiddling + Hacks */ + --n; + n |= n >> 1; + n |= n >> 2; + n |= n >> 4; + n |= n >> 8; + n |= n >> 16; + ++n; +#endif /* defined(WIN32) */ -static int reserve_buf(nghttp3_buf *buf, size_t extra_size, - const nghttp3_mem *mem) { - return reserve_buf_internal(buf, extra_size, 32, mem); + return nghttp3_buf_reserve(buf, n, mem); } int nghttp3_qpack_encoder_encode(nghttp3_qpack_encoder *encoder, @@ -1162,10 +1162,10 @@ int nghttp3_qpack_encoder_encode(nghttp3_qpack_encoder *encoder, stream = nghttp3_qpack_encoder_find_stream(encoder, stream_id); blocked_stream = - stream && nghttp3_qpack_encoder_stream_is_blocked(encoder, stream); + stream && nghttp3_qpack_encoder_stream_is_blocked(encoder, stream); allow_blocking = - blocked_stream || encoder->ctx.max_blocked_streams > - nghttp3_ksl_len(&encoder->blocked_streams); + blocked_stream || encoder->ctx.max_blocked_streams > + nghttp3_ksl_len(&encoder->blocked_streams); DEBUGF("qpack::encode: stream %ld blocked=%d allow_blocking=%d\n", stream_id, blocked_stream, allow_blocking); @@ -1181,13 +1181,15 @@ int nghttp3_qpack_encoder_encode(nghttp3_qpack_encoder *encoder, nghttp3_qpack_encoder_write_field_section_prefix(encoder, pbuf, max_cnt, base); + encoder->uninterrupted_decoderlen = 0; + /* TODO If max_cnt == 0, no reference is made to dtable. 
*/ if (!max_cnt) { return 0; } - rv = qpack_encoder_add_stream_ref(encoder, stream_id, stream, max_cnt, - min_cnt); + rv = + qpack_encoder_add_stream_ref(encoder, stream_id, stream, max_cnt, min_cnt); if (rv != 0) { goto fail; } @@ -1253,7 +1255,7 @@ int nghttp3_qpack_encoder_process_dtable_update(nghttp3_qpack_encoder *encoder, } rv = nghttp3_qpack_encoder_write_set_dtable_cap( - encoder, ebuf, encoder->last_max_dtable_update); + encoder, ebuf, encoder->last_max_dtable_update); if (rv != 0) { return rv; } @@ -1374,7 +1376,7 @@ static int qpack_encoder_can_index(nghttp3_qpack_encoder *encoder, size_t need, if (!nghttp3_pq_empty(&encoder->min_cnts)) { gmin_cnt = nghttp3_qpack_encoder_get_min_cnt(encoder); - min_cnt = nghttp3_min(min_cnt, gmin_cnt); + min_cnt = nghttp3_min_uint64(min_cnt, gmin_cnt); } if (min_cnt == UINT64_MAX) { @@ -1402,7 +1404,7 @@ static int qpack_encoder_can_index(nghttp3_qpack_encoder *encoder, size_t need, static int qpack_encoder_can_index_nv(nghttp3_qpack_encoder *encoder, const nghttp3_nv *nv, uint64_t min_cnt) { return qpack_encoder_can_index( - encoder, table_space(nv->namelen, nv->valuelen), min_cnt); + encoder, table_space(nv->namelen, nv->valuelen), min_cnt); } /* @@ -1415,10 +1417,10 @@ static int qpack_encoder_can_index_duplicate(nghttp3_qpack_encoder *encoder, uint64_t absidx, uint64_t min_cnt) { nghttp3_qpack_entry *ent = - nghttp3_qpack_context_dtable_get(&encoder->ctx, absidx); + nghttp3_qpack_context_dtable_get(&encoder->ctx, absidx); return qpack_encoder_can_index( - encoder, table_space(ent->nv.name->len, ent->nv.value->len), min_cnt); + encoder, table_space(ent->nv.name->len, ent->nv.value->len), min_cnt); } /* @@ -1428,7 +1430,7 @@ static int qpack_encoder_can_index_duplicate(nghttp3_qpack_encoder *encoder, static int qpack_context_check_draining(nghttp3_qpack_context *ctx, uint64_t absidx) { const size_t safe = ctx->max_dtable_capacity - - nghttp3_min(512, ctx->max_dtable_capacity * 1 / 8); + nghttp3_min_size(512, ctx->max_dtable_capacity * 1 / 8); nghttp3_qpack_entry *ent = nghttp3_qpack_context_dtable_get(ctx, absidx); return ctx->dtable_sum - ent->sum > safe; @@ -1483,11 +1485,10 @@ int nghttp3_qpack_encoder_encode_nv(nghttp3_qpack_encoder *encoder, } if (nghttp3_map_size(&encoder->streams) < NGHTTP3_QPACK_MAX_QPACK_STREAMS) { - dres = nghttp3_qpack_encoder_lookup_dtable(encoder, nv, token, hash, - indexing_mode, encoder->krcnt, - allow_blocking); - just_index = indexing_mode == NGHTTP3_QPACK_INDEXING_MODE_STORE && - dres.pb_index == -1; + dres = nghttp3_qpack_encoder_lookup_dtable( + encoder, nv, token, hash, indexing_mode, encoder->krcnt, allow_blocking); + just_index = + indexing_mode == NGHTTP3_QPACK_INDEXING_MODE_STORE && dres.pb_index == -1; } if (dres.index != -1 && dres.name_value_match) { @@ -1500,8 +1501,8 @@ int nghttp3_qpack_encoder_encode_nv(nghttp3_qpack_encoder *encoder, if (rv != 0) { return rv; } - rv = nghttp3_qpack_encoder_dtable_duplicate_add(encoder, - (size_t)dres.index); + rv = + nghttp3_qpack_encoder_dtable_duplicate_add(encoder, (size_t)dres.index); if (rv != 0) { return rv; } @@ -1509,11 +1510,11 @@ int nghttp3_qpack_encoder_encode_nv(nghttp3_qpack_encoder *encoder, new_ent = nghttp3_qpack_context_dtable_top(&encoder->ctx); dres.index = (nghttp3_ssize)new_ent->absidx; } - *pmax_cnt = nghttp3_max(*pmax_cnt, (size_t)(dres.index + 1)); - *pmin_cnt = nghttp3_min(*pmin_cnt, (size_t)(dres.index + 1)); + *pmax_cnt = nghttp3_max_uint64(*pmax_cnt, (uint64_t)(dres.index + 1)); + *pmin_cnt = nghttp3_min_uint64(*pmin_cnt, 
(uint64_t)(dres.index + 1)); return nghttp3_qpack_encoder_write_dynamic_indexed( - encoder, rbuf, (size_t)dres.index, base); + encoder, rbuf, (size_t)dres.index, base); } if (sres.index != -1) { @@ -1530,24 +1531,25 @@ int nghttp3_qpack_encoder_encode_nv(nghttp3_qpack_encoder *encoder, } if (allow_blocking) { new_ent = nghttp3_qpack_context_dtable_top(&encoder->ctx); - *pmax_cnt = nghttp3_max(*pmax_cnt, new_ent->absidx + 1); - *pmin_cnt = nghttp3_min(*pmin_cnt, new_ent->absidx + 1); + *pmax_cnt = nghttp3_max_uint64(*pmax_cnt, new_ent->absidx + 1); + *pmin_cnt = nghttp3_min_uint64(*pmin_cnt, new_ent->absidx + 1); return nghttp3_qpack_encoder_write_dynamic_indexed( - encoder, rbuf, new_ent->absidx, base); + encoder, rbuf, new_ent->absidx, base); } } return nghttp3_qpack_encoder_write_static_indexed_name( - encoder, rbuf, (size_t)sres.index, nv); + encoder, rbuf, (size_t)sres.index, nv); } if (dres.index != -1) { if (just_index && qpack_encoder_can_index_nv( - encoder, nv, - allow_blocking ? *pmin_cnt - : nghttp3_min((size_t)dres.index + 1, *pmin_cnt))) { + encoder, nv, + allow_blocking + ? *pmin_cnt + : nghttp3_min_uint64((uint64_t)dres.index + 1, *pmin_cnt))) { rv = nghttp3_qpack_encoder_write_dynamic_insert(encoder, ebuf, (size_t)dres.index, nv); if (rv != 0) { @@ -1555,7 +1557,7 @@ int nghttp3_qpack_encoder_encode_nv(nghttp3_qpack_encoder *encoder, } if (!allow_blocking) { - *pmin_cnt = nghttp3_min(*pmin_cnt, (size_t)dres.index + 1); + *pmin_cnt = nghttp3_min_uint64(*pmin_cnt, (uint64_t)dres.index + 1); } rv = nghttp3_qpack_encoder_dtable_dynamic_add(encoder, (size_t)dres.index, @@ -1566,19 +1568,19 @@ int nghttp3_qpack_encoder_encode_nv(nghttp3_qpack_encoder *encoder, if (allow_blocking) { new_ent = nghttp3_qpack_context_dtable_top(&encoder->ctx); - *pmax_cnt = nghttp3_max(*pmax_cnt, new_ent->absidx + 1); - *pmin_cnt = nghttp3_min(*pmin_cnt, new_ent->absidx + 1); + *pmax_cnt = nghttp3_max_uint64(*pmax_cnt, new_ent->absidx + 1); + *pmin_cnt = nghttp3_min_uint64(*pmin_cnt, new_ent->absidx + 1); return nghttp3_qpack_encoder_write_dynamic_indexed( - encoder, rbuf, new_ent->absidx, base); + encoder, rbuf, new_ent->absidx, base); } } - *pmax_cnt = nghttp3_max(*pmax_cnt, (size_t)(dres.index + 1)); - *pmin_cnt = nghttp3_min(*pmin_cnt, (size_t)(dres.index + 1)); + *pmax_cnt = nghttp3_max_uint64(*pmax_cnt, (uint64_t)(dres.index + 1)); + *pmin_cnt = nghttp3_min_uint64(*pmin_cnt, (uint64_t)(dres.index + 1)); return nghttp3_qpack_encoder_write_dynamic_indexed_name( - encoder, rbuf, (size_t)dres.index, base, nv); + encoder, rbuf, (size_t)dres.index, base, nv); } if (just_index && qpack_encoder_can_index_nv(encoder, nv, *pmin_cnt)) { @@ -1592,8 +1594,8 @@ int nghttp3_qpack_encoder_encode_nv(nghttp3_qpack_encoder *encoder, } if (allow_blocking) { new_ent = nghttp3_qpack_context_dtable_top(&encoder->ctx); - *pmax_cnt = nghttp3_max(*pmax_cnt, new_ent->absidx + 1); - *pmin_cnt = nghttp3_min(*pmin_cnt, new_ent->absidx + 1); + *pmax_cnt = nghttp3_max_uint64(*pmax_cnt, new_ent->absidx + 1); + *pmin_cnt = nghttp3_min_uint64(*pmin_cnt, new_ent->absidx + 1); return nghttp3_qpack_encoder_write_dynamic_indexed(encoder, rbuf, new_ent->absidx, base); @@ -1634,9 +1636,9 @@ nghttp3_qpack_lookup_stable(const nghttp3_nv *nv, int32_t token, } nghttp3_qpack_lookup_result nghttp3_qpack_encoder_lookup_dtable( - nghttp3_qpack_encoder *encoder, const nghttp3_nv *nv, int32_t token, - uint32_t hash, nghttp3_qpack_indexing_mode indexing_mode, uint64_t krcnt, - int allow_blocking) { + nghttp3_qpack_encoder *encoder, const 
nghttp3_nv *nv, int32_t token, + uint32_t hash, nghttp3_qpack_indexing_mode indexing_mode, uint64_t krcnt, + int allow_blocking) { nghttp3_qpack_lookup_result res = {-1, 0, -1}; int exact_match = 0; nghttp3_qpack_entry *match, *pb_match; @@ -1659,7 +1661,7 @@ int nghttp3_qpack_header_block_ref_new(nghttp3_qpack_header_block_ref **pref, uint64_t max_cnt, uint64_t min_cnt, const nghttp3_mem *mem) { nghttp3_qpack_header_block_ref *ref = - nghttp3_mem_malloc(mem, sizeof(nghttp3_qpack_header_block_ref)); + nghttp3_mem_malloc(mem, sizeof(nghttp3_qpack_header_block_ref)); if (ref == NULL) { return NGHTTP3_ERR_NOMEM; @@ -1683,9 +1685,9 @@ void nghttp3_qpack_header_block_ref_del(nghttp3_qpack_header_block_ref *ref, static int ref_max_cnt_greater(const nghttp3_pq_entry *lhsx, const nghttp3_pq_entry *rhsx) { const nghttp3_qpack_header_block_ref *lhs = - nghttp3_struct_of(lhsx, nghttp3_qpack_header_block_ref, max_cnts_pe); + nghttp3_struct_of(lhsx, nghttp3_qpack_header_block_ref, max_cnts_pe); const nghttp3_qpack_header_block_ref *rhs = - nghttp3_struct_of(rhsx, nghttp3_qpack_header_block_ref, max_cnts_pe); + nghttp3_struct_of(rhsx, nghttp3_qpack_header_block_ref, max_cnts_pe); return lhs->max_cnt > rhs->max_cnt; } @@ -1729,8 +1731,8 @@ void nghttp3_qpack_stream_del(nghttp3_qpack_stream *stream, len = nghttp3_ringbuf_len(&stream->refs); for (i = 0; i < len; ++i) { - ref = *(nghttp3_qpack_header_block_ref **)nghttp3_ringbuf_get(&stream->refs, - i); + ref = + *(nghttp3_qpack_header_block_ref **)nghttp3_ringbuf_get(&stream->refs, i); nghttp3_qpack_header_block_ref_del(ref, mem); } @@ -1776,7 +1778,7 @@ void nghttp3_qpack_stream_pop_ref(nghttp3_qpack_stream *stream) { assert(nghttp3_ringbuf_len(&stream->refs)); ref = - *(nghttp3_qpack_header_block_ref **)nghttp3_ringbuf_get(&stream->refs, 0); + *(nghttp3_qpack_header_block_ref **)nghttp3_ringbuf_get(&stream->refs, 0); assert(ref->max_cnts_pe.index != NGHTTP3_PQ_BAD_INDEX); @@ -1832,7 +1834,7 @@ static int qpack_encoder_write_indexed_name(nghttp3_qpack_encoder *encoder, int h = 0; hlen = nghttp3_qpack_huffman_encode_count(nv->value, nv->valuelen); - if (hlen < nv->valuelen) { + if (hlen * 4 < nv->valuelen * 3) { h = 1; len += nghttp3_qpack_put_varint_len(hlen, 7) + hlen; } else { @@ -1869,10 +1871,10 @@ static int qpack_encoder_write_indexed_name(nghttp3_qpack_encoder *encoder, } int nghttp3_qpack_encoder_write_static_indexed_name( - nghttp3_qpack_encoder *encoder, nghttp3_buf *rbuf, uint64_t absidx, - const nghttp3_nv *nv) { + nghttp3_qpack_encoder *encoder, nghttp3_buf *rbuf, uint64_t absidx, + const nghttp3_nv *nv) { uint8_t fb = - (uint8_t)(0x50 | ((nv->flags & NGHTTP3_NV_FLAG_NEVER_INDEX) ? 0x20 : 0)); + (uint8_t)(0x50 | ((nv->flags & NGHTTP3_NV_FLAG_NEVER_INDEX) ? 0x20 : 0)); DEBUGF("qpack::encode: Literal Field Line With Name Reference (static) " "absidx=%" PRIu64 " never=%d\n", @@ -1881,8 +1883,8 @@ int nghttp3_qpack_encoder_write_static_indexed_name( } int nghttp3_qpack_encoder_write_dynamic_indexed_name( - nghttp3_qpack_encoder *encoder, nghttp3_buf *rbuf, uint64_t absidx, - uint64_t base, const nghttp3_nv *nv) { + nghttp3_qpack_encoder *encoder, nghttp3_buf *rbuf, uint64_t absidx, + uint64_t base, const nghttp3_nv *nv) { uint8_t fb; DEBUGF("qpack::encode: Literal Field Line With Name Reference (dynamic) " @@ -1890,8 +1892,8 @@ int nghttp3_qpack_encoder_write_dynamic_indexed_name( absidx, base, (nv->flags & NGHTTP3_NV_FLAG_NEVER_INDEX) != 0); if (absidx < base) { - fb = (uint8_t)(0x40 | - ((nv->flags & NGHTTP3_NV_FLAG_NEVER_INDEX) ? 
0x20 : 0)); + fb = + (uint8_t)(0x40 | ((nv->flags & NGHTTP3_NV_FLAG_NEVER_INDEX) ? 0x20 : 0)); return qpack_encoder_write_indexed_name(encoder, rbuf, fb, base - absidx - 1, 4, nv); } @@ -1923,7 +1925,7 @@ static int qpack_encoder_write_literal(nghttp3_qpack_encoder *encoder, int nh = 0, vh = 0; nhlen = nghttp3_qpack_huffman_encode_count(nv->name, nv->namelen); - if (nhlen < nv->namelen) { + if (nhlen * 4 < nv->namelen * 3) { nh = 1; len = nghttp3_qpack_put_varint_len(nhlen, prefix) + nhlen; } else { @@ -1931,7 +1933,7 @@ static int qpack_encoder_write_literal(nghttp3_qpack_encoder *encoder, } vhlen = nghttp3_qpack_huffman_encode_count(nv->value, nv->valuelen); - if (vhlen < nv->valuelen) { + if (vhlen * 4 < nv->valuelen * 3) { vh = 1; len += nghttp3_qpack_put_varint_len(vhlen, 7) + vhlen; } else { @@ -1981,7 +1983,7 @@ int nghttp3_qpack_encoder_write_literal(nghttp3_qpack_encoder *encoder, nghttp3_buf *rbuf, const nghttp3_nv *nv) { uint8_t fb = - (uint8_t)(0x20 | ((nv->flags & NGHTTP3_NV_FLAG_NEVER_INDEX) ? 0x10 : 0)); + (uint8_t)(0x20 | ((nv->flags & NGHTTP3_NV_FLAG_NEVER_INDEX) ? 0x10 : 0)); DEBUGF("qpack::encode: Literal Field Line With Literal Name\n"); return qpack_encoder_write_literal(encoder, rbuf, fb, 3, nv); @@ -2005,7 +2007,7 @@ int nghttp3_qpack_encoder_write_dynamic_insert(nghttp3_qpack_encoder *encoder, "\n", absidx); return qpack_encoder_write_indexed_name( - encoder, ebuf, 0x80, encoder->ctx.next_absidx - absidx - 1, 6, nv); + encoder, ebuf, 0x80, encoder->ctx.next_absidx - absidx - 1, 6, nv); } int nghttp3_qpack_encoder_write_duplicate_insert(nghttp3_qpack_encoder *encoder, @@ -2257,10 +2259,10 @@ void nghttp3_qpack_entry_free(nghttp3_qpack_entry *ent) { int nghttp3_qpack_encoder_block_stream(nghttp3_qpack_encoder *encoder, nghttp3_qpack_stream *stream) { nghttp3_blocked_streams_key bsk = { - nghttp3_struct_of(nghttp3_pq_top(&stream->max_cnts), - nghttp3_qpack_header_block_ref, max_cnts_pe) - ->max_cnt, - (uint64_t)stream->stream_id}; + nghttp3_struct_of(nghttp3_pq_top(&stream->max_cnts), + nghttp3_qpack_header_block_ref, max_cnts_pe) + ->max_cnt, + (uint64_t)stream->stream_id}; return nghttp3_ksl_insert(&encoder->blocked_streams, NULL, &bsk, stream); } @@ -2268,10 +2270,10 @@ int nghttp3_qpack_encoder_block_stream(nghttp3_qpack_encoder *encoder, void nghttp3_qpack_encoder_unblock_stream(nghttp3_qpack_encoder *encoder, nghttp3_qpack_stream *stream) { nghttp3_blocked_streams_key bsk = { - nghttp3_struct_of(nghttp3_pq_top(&stream->max_cnts), - nghttp3_qpack_header_block_ref, max_cnts_pe) - ->max_cnt, - (uint64_t)stream->stream_id}; + nghttp3_struct_of(nghttp3_pq_top(&stream->max_cnts), + nghttp3_qpack_header_block_ref, max_cnts_pe) + ->max_cnt, + (uint64_t)stream->stream_id}; nghttp3_ksl_it it; /* This is purely debugging purpose only */ @@ -2299,7 +2301,7 @@ void nghttp3_qpack_encoder_unblock(nghttp3_qpack_encoder *encoder, int nghttp3_qpack_encoder_ack_header(nghttp3_qpack_encoder *encoder, int64_t stream_id) { nghttp3_qpack_stream *stream = - nghttp3_qpack_encoder_find_stream(encoder, stream_id); + nghttp3_qpack_encoder_find_stream(encoder, stream_id); const nghttp3_mem *mem = encoder->ctx.mem; nghttp3_qpack_header_block_ref *ref; @@ -2310,7 +2312,7 @@ int nghttp3_qpack_encoder_ack_header(nghttp3_qpack_encoder *encoder, assert(nghttp3_ringbuf_len(&stream->refs)); ref = - *(nghttp3_qpack_header_block_ref **)nghttp3_ringbuf_get(&stream->refs, 0); + *(nghttp3_qpack_header_block_ref **)nghttp3_ringbuf_get(&stream->refs, 0); DEBUGF("qpack::encoder: Header acknowledgement 
stream=%ld ricnt=%" PRIu64 " krcnt=%" PRIu64 "\n", @@ -2357,15 +2359,15 @@ void nghttp3_qpack_encoder_ack_everything(nghttp3_qpack_encoder *encoder) { nghttp3_ksl_clear(&encoder->blocked_streams); nghttp3_pq_clear(&encoder->min_cnts); - nghttp3_map_each_free(&encoder->streams, map_stream_free, - (void *)encoder->ctx.mem); + nghttp3_map_each(&encoder->streams, map_stream_free, + (void *)encoder->ctx.mem); nghttp3_map_clear(&encoder->streams); } void nghttp3_qpack_encoder_cancel_stream(nghttp3_qpack_encoder *encoder, int64_t stream_id) { nghttp3_qpack_stream *stream = - nghttp3_qpack_encoder_find_stream(encoder, stream_id); + nghttp3_qpack_encoder_find_stream(encoder, stream_id); const nghttp3_mem *mem = encoder->ctx.mem; if (stream == NULL) { @@ -2387,10 +2389,10 @@ nghttp3_qpack_encoder_get_num_blocked_streams(nghttp3_qpack_encoder *encoder) { } int nghttp3_qpack_encoder_write_field_section_prefix( - nghttp3_qpack_encoder *encoder, nghttp3_buf *pbuf, uint64_t ricnt, - uint64_t base) { + nghttp3_qpack_encoder *encoder, nghttp3_buf *pbuf, uint64_t ricnt, + uint64_t base) { size_t max_ents = - encoder->ctx.hard_max_dtable_capacity / NGHTTP3_QPACK_ENTRY_OVERHEAD; + encoder->ctx.hard_max_dtable_capacity / NGHTTP3_QPACK_ENTRY_OVERHEAD; uint64_t encricnt = ricnt == 0 ? 0 : (ricnt % (2 * max_ents)) + 1; int sign = base < ricnt; uint64_t delta_base = sign ? ricnt - base - 1 : base - ricnt; @@ -2523,20 +2525,31 @@ nghttp3_ssize nghttp3_qpack_encoder_read_decoder(nghttp3_qpack_encoder *encoder, return 0; } + encoder->uninterrupted_decoderlen += srclen; + if (encoder->uninterrupted_decoderlen > NGHTTP3_QPACK_MAX_DECODERLEN) { + return NGHTTP3_ERR_QPACK_DECODER_STREAM_ERROR; + } + end = src + srclen; for (; p != end;) { switch (encoder->state) { case NGHTTP3_QPACK_DS_STATE_OPCODE: - if ((*p) & 0x80) { + switch ((*p) & 0xc0) { + case 0x80: + case 0xc0: DEBUGF("qpack::encode: OPCODE_SECTION_ACK\n"); encoder->opcode = NGHTTP3_QPACK_DS_OPCODE_SECTION_ACK; encoder->rstate.prefix = 7; - } else if ((*p) & 0x40) { + + break; + case 0x40: DEBUGF("qpack::encode: OPCODE_STREAM_CANCEL\n"); encoder->opcode = NGHTTP3_QPACK_DS_OPCODE_STREAM_CANCEL; encoder->rstate.prefix = 6; - } else { + + break; + default: DEBUGF("qpack::encode: OPCODE_ICNT_INCREMENT\n"); encoder->opcode = NGHTTP3_QPACK_DS_OPCODE_ICNT_INCREMENT; encoder->rstate.prefix = 6; @@ -2670,6 +2683,7 @@ int nghttp3_qpack_decoder_init(nghttp3_qpack_decoder *decoder, decoder->opcode = 0; decoder->written_icnt = 0; decoder->max_concurrent_streams = 0; + decoder->uninterrupted_encoderlen = 0; nghttp3_qpack_read_state_reset(&decoder->rstate); nghttp3_buf_init(&decoder->dbuf); @@ -2726,7 +2740,7 @@ static nghttp3_ssize qpack_read_string(nghttp3_qpack_read_state *rstate, nghttp3_buf *dest, const uint8_t *begin, const uint8_t *end) { size_t len = (size_t)(end - begin); - size_t n = (size_t)nghttp3_min((uint64_t)len, rstate->left); + size_t n = (size_t)nghttp3_min_uint64((uint64_t)len, rstate->left); dest->last = nghttp3_cpymem(dest->last, begin, n); @@ -2747,10 +2761,10 @@ static int qpack_decoder_validate_index(nghttp3_qpack_decoder *decoder, nghttp3_qpack_read_state *rstate) { if (rstate->dynamic) { return rstate->absidx < decoder->ctx.next_absidx && - decoder->ctx.next_absidx - rstate->absidx - 1 < - nghttp3_ringbuf_len(&decoder->ctx.dtable) - ? 0 - : NGHTTP3_ERR_QPACK_FATAL; + decoder->ctx.next_absidx - rstate->absidx - 1 < + nghttp3_ringbuf_len(&decoder->ctx.dtable) + ? 0 + : NGHTTP3_ERR_QPACK_FATAL; } return rstate->absidx < nghttp3_arraylen(stable) ? 
0 : NGHTTP3_ERR_QPACK_FATAL; @@ -2789,39 +2803,51 @@ nghttp3_ssize nghttp3_qpack_decoder_read_encoder(nghttp3_qpack_decoder *decoder, return 0; } + decoder->uninterrupted_encoderlen += srclen; + if (decoder->uninterrupted_encoderlen > NGHTTP3_QPACK_MAX_ENCODERLEN) { + return NGHTTP3_ERR_QPACK_ENCODER_STREAM_ERROR; + } + end = src + srclen; for (; p != end || busy;) { busy = 0; switch (decoder->state) { case NGHTTP3_QPACK_ES_STATE_OPCODE: - if ((*p) & 0x80) { + switch ((*p) & 0xe0) { + case 0x80: + case 0xa0: + case 0xc0: + case 0xe0: DEBUGF("qpack::decode: OPCODE_INSERT_INDEXED\n"); decoder->opcode = NGHTTP3_QPACK_ES_OPCODE_INSERT_INDEXED; decoder->rstate.dynamic = !((*p) & 0x40); decoder->rstate.prefix = 6; decoder->state = NGHTTP3_QPACK_ES_STATE_READ_INDEX; - } else if ((*p) & 0x40) { + + break; + case 0x40: + case 0x60: DEBUGF("qpack::decode: OPCODE_INSERT\n"); decoder->opcode = NGHTTP3_QPACK_ES_OPCODE_INSERT; decoder->rstate.dynamic = 0; decoder->rstate.prefix = 5; decoder->state = NGHTTP3_QPACK_ES_STATE_CHECK_NAME_HUFFMAN; - } else if ((*p) & 0x20) { + + break; + case 0x20: DEBUGF("qpack::decode: OPCODE_SET_DTABLE_TABLE_CAP\n"); decoder->opcode = NGHTTP3_QPACK_ES_OPCODE_SET_DTABLE_CAP; decoder->rstate.prefix = 5; decoder->state = NGHTTP3_QPACK_ES_STATE_READ_INDEX; - } else if (!((*p) & 0x20)) { + + break; + default: DEBUGF("qpack::decode: OPCODE_DUPLICATE\n"); decoder->opcode = NGHTTP3_QPACK_ES_OPCODE_DUPLICATE; decoder->rstate.dynamic = 1; decoder->rstate.prefix = 5; decoder->state = NGHTTP3_QPACK_ES_STATE_READ_INDEX; - } else { - DEBUGF("qpack::decode: unknown opcode %02x\n", *p); - rv = NGHTTP3_ERR_QPACK_ENCODER_STREAM_ERROR; - goto fail; } break; case NGHTTP3_QPACK_ES_STATE_READ_INDEX: @@ -2842,7 +2868,7 @@ nghttp3_ssize nghttp3_qpack_decoder_read_encoder(nghttp3_qpack_decoder *decoder, DEBUGF("qpack::decode: Set dtable capacity to %" PRIu64 "\n", decoder->rstate.left); rv = nghttp3_qpack_decoder_set_max_dtable_capacity( - decoder, (size_t)decoder->rstate.left); + decoder, (size_t)decoder->rstate.left); if (rv != 0) { rv = NGHTTP3_ERR_QPACK_ENCODER_STREAM_ERROR; goto fail; @@ -2944,7 +2970,7 @@ nghttp3_ssize nghttp3_qpack_decoder_read_encoder(nghttp3_qpack_decoder *decoder, break; case NGHTTP3_QPACK_ES_STATE_READ_NAME: nread = - qpack_read_string(&decoder->rstate, &decoder->rstate.namebuf, p, end); + qpack_read_string(&decoder->rstate, &decoder->rstate.namebuf, p, end); if (nread < 0) { rv = (int)nread; goto fail; @@ -3042,8 +3068,8 @@ nghttp3_ssize nghttp3_qpack_decoder_read_encoder(nghttp3_qpack_decoder *decoder, nghttp3_qpack_read_state_reset(&decoder->rstate); break; case NGHTTP3_QPACK_ES_STATE_READ_VALUE: - nread = qpack_read_string(&decoder->rstate, &decoder->rstate.valuebuf, p, - end); + nread = + qpack_read_string(&decoder->rstate, &decoder->rstate.valuebuf, p, end); if (nread < 0) { rv = (int)nread; goto fail; @@ -3085,7 +3111,7 @@ nghttp3_ssize nghttp3_qpack_decoder_read_encoder(nghttp3_qpack_decoder *decoder, } int nghttp3_qpack_decoder_set_max_dtable_capacity( - nghttp3_qpack_decoder *decoder, size_t max_dtable_capacity) { + nghttp3_qpack_decoder *decoder, size_t max_dtable_capacity) { nghttp3_qpack_entry *ent; size_t i; nghttp3_qpack_context *ctx = &decoder->ctx; @@ -3230,9 +3256,9 @@ int nghttp3_qpack_decoder_dtable_literal_add(nghttp3_qpack_decoder *decoder) { } void nghttp3_qpack_decoder_set_max_concurrent_streams( - nghttp3_qpack_decoder *decoder, size_t max_concurrent_streams) { + nghttp3_qpack_decoder *decoder, size_t max_concurrent_streams) { 
decoder->max_concurrent_streams = - nghttp3_max(decoder->max_concurrent_streams, max_concurrent_streams); + nghttp3_max_size(decoder->max_concurrent_streams, max_concurrent_streams); } void nghttp3_qpack_stream_context_init(nghttp3_qpack_stream_context *sctx, @@ -3357,33 +3383,53 @@ nghttp3_qpack_decoder_read_request(nghttp3_qpack_decoder *decoder, case NGHTTP3_QPACK_RS_STATE_OPCODE: assert(sctx->rstate.left == 0); assert(sctx->rstate.shift == 0); - if ((*p) & 0x80) { + switch ((*p) & 0xf0) { + case 0x80: + case 0x90: + case 0xa0: + case 0xb0: + case 0xc0: + case 0xd0: + case 0xe0: + case 0xf0: DEBUGF("qpack::decode: OPCODE_INDEXED\n"); sctx->opcode = NGHTTP3_QPACK_RS_OPCODE_INDEXED; sctx->rstate.dynamic = !((*p) & 0x40); sctx->rstate.prefix = 6; sctx->state = NGHTTP3_QPACK_RS_STATE_READ_INDEX; - } else if ((*p) & 0x40) { + + break; + case 0x40: + case 0x50: + case 0x60: + case 0x70: DEBUGF("qpack::decode: OPCODE_INDEXED_NAME\n"); sctx->opcode = NGHTTP3_QPACK_RS_OPCODE_INDEXED_NAME; sctx->rstate.never = (*p) & 0x20; sctx->rstate.dynamic = !((*p) & 0x10); sctx->rstate.prefix = 4; sctx->state = NGHTTP3_QPACK_RS_STATE_READ_INDEX; - } else if ((*p) & 0x20) { + + break; + case 0x20: + case 0x30: DEBUGF("qpack::decode: OPCODE_LITERAL\n"); sctx->opcode = NGHTTP3_QPACK_RS_OPCODE_LITERAL; sctx->rstate.never = (*p) & 0x10; sctx->rstate.dynamic = 0; sctx->rstate.prefix = 3; sctx->state = NGHTTP3_QPACK_RS_STATE_CHECK_NAME_HUFFMAN; - } else if ((*p) & 0x10) { + + break; + case 0x10: DEBUGF("qpack::decode: OPCODE_INDEXED_PB\n"); sctx->opcode = NGHTTP3_QPACK_RS_OPCODE_INDEXED_PB; sctx->rstate.dynamic = 1; sctx->rstate.prefix = 4; sctx->state = NGHTTP3_QPACK_RS_STATE_READ_INDEX; - } else { + + break; + default: DEBUGF("qpack::decode: OPCODE_INDEXED_NAME_PB\n"); sctx->opcode = NGHTTP3_QPACK_RS_OPCODE_INDEXED_NAME_PB; sctx->rstate.never = (*p) & 0x08; @@ -3494,8 +3540,8 @@ nghttp3_qpack_decoder_read_request(nghttp3_qpack_decoder *decoder, sctx->rstate.name->len); break; case NGHTTP3_QPACK_RS_STATE_READ_NAME_HUFFMAN: - nread = qpack_read_huffman_string(&sctx->rstate, &sctx->rstate.namebuf, p, - end); + nread = + qpack_read_huffman_string(&sctx->rstate, &sctx->rstate.namebuf, p, end); if (nread < 0) { assert(NGHTTP3_ERR_QPACK_FATAL == nread); rv = NGHTTP3_ERR_QPACK_DECOMPRESSION_FAILED; @@ -3679,6 +3725,8 @@ nghttp3_qpack_decoder_read_request(nghttp3_qpack_decoder *decoder, goto fail; } } + + decoder->uninterrupted_encoderlen = 0; } return p - src; @@ -3689,13 +3737,13 @@ nghttp3_qpack_decoder_read_request(nghttp3_qpack_decoder *decoder, } static int qpack_decoder_dbuf_overflow(nghttp3_qpack_decoder *decoder) { - size_t limit = nghttp3_max(decoder->max_concurrent_streams, 100); + size_t limit = nghttp3_max_size(decoder->max_concurrent_streams, 100); /* 10 = nghttp3_qpack_put_varint_len((1ULL << 62) - 1, 2)) */ return nghttp3_buf_len(&decoder->dbuf) > limit * 2 * 10; } int nghttp3_qpack_decoder_write_section_ack( - nghttp3_qpack_decoder *decoder, const nghttp3_qpack_stream_context *sctx) { + nghttp3_qpack_decoder *decoder, const nghttp3_qpack_stream_context *sctx) { nghttp3_buf *dbuf = &decoder->dbuf; uint8_t *p; int rv; @@ -3704,9 +3752,9 @@ int nghttp3_qpack_decoder_write_section_ack( return NGHTTP3_ERR_QPACK_FATAL; } - rv = reserve_buf_small( - dbuf, nghttp3_qpack_put_varint_len((uint64_t)sctx->stream_id, 7), - decoder->ctx.mem); + rv = reserve_buf(dbuf, + nghttp3_qpack_put_varint_len((uint64_t)sctx->stream_id, 7), + decoder->ctx.mem); if (rv != 0) { return rv; } @@ -3798,7 +3846,7 @@ int 
nghttp3_qpack_decoder_reconstruct_ricnt(nghttp3_qpack_decoder *decoder, } max_ents = - decoder->ctx.hard_max_dtable_capacity / NGHTTP3_QPACK_ENTRY_OVERHEAD; + decoder->ctx.hard_max_dtable_capacity / NGHTTP3_QPACK_ENTRY_OVERHEAD; full = 2 * max_ents; if (encricnt > full) { @@ -3910,7 +3958,7 @@ qpack_decoder_emit_dynamic_indexed(nghttp3_qpack_decoder *decoder, nghttp3_qpack_stream_context *sctx, nghttp3_qpack_nv *nv) { nghttp3_qpack_entry *ent = - nghttp3_qpack_context_dtable_get(&decoder->ctx, sctx->rstate.absidx); + nghttp3_qpack_context_dtable_get(&decoder->ctx, sctx->rstate.absidx); *nv = ent->nv; @@ -3942,7 +3990,7 @@ qpack_decoder_emit_static_indexed_name(nghttp3_qpack_decoder *decoder, nv->value = sctx->rstate.value; nv->token = shd->token; nv->flags = - sctx->rstate.never ? NGHTTP3_NV_FLAG_NEVER_INDEX : NGHTTP3_NV_FLAG_NONE; + sctx->rstate.never ? NGHTTP3_NV_FLAG_NEVER_INDEX : NGHTTP3_NV_FLAG_NONE; sctx->rstate.value = NULL; } @@ -3965,7 +4013,7 @@ qpack_decoder_emit_dynamic_indexed_name(nghttp3_qpack_decoder *decoder, nv->value = sctx->rstate.value; nv->token = ent->nv.token; nv->flags = - sctx->rstate.never ? NGHTTP3_NV_FLAG_NEVER_INDEX : NGHTTP3_NV_FLAG_NONE; + sctx->rstate.never ? NGHTTP3_NV_FLAG_NEVER_INDEX : NGHTTP3_NV_FLAG_NONE; nghttp3_rcbuf_incref(nv->name); @@ -4005,7 +4053,7 @@ void nghttp3_qpack_decoder_emit_literal(nghttp3_qpack_decoder *decoder, nv->value = sctx->rstate.value; nv->token = qpack_lookup_token(nv->name->base, nv->name->len); nv->flags = - sctx->rstate.never ? NGHTTP3_NV_FLAG_NEVER_INDEX : NGHTTP3_NV_FLAG_NONE; + sctx->rstate.never ? NGHTTP3_NV_FLAG_NEVER_INDEX : NGHTTP3_NV_FLAG_NONE; sctx->rstate.name = NULL; sctx->rstate.value = NULL; diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_qpack.h b/deps/ngtcp2/nghttp3/lib/nghttp3_qpack.h index 804969e14d6091..d2bb8a3581135b 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_qpack.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_qpack.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -48,6 +48,14 @@ /* NGHTTP3_QPACK_MAX_VALUELEN is the maximum (compressed) length of header value this library can decode. */ #define NGHTTP3_QPACK_MAX_VALUELEN 65536 +/* NGHTTP3_QPACK_MAX_ENCODERLEN is the maximum encoder stream length + that a decoder accepts without completely processing a single field + section. */ +#define NGHTTP3_QPACK_MAX_ENCODERLEN (128 * 1024) +/* NGHTTP3_QPACK_MAX_DECODERLEN is the maximum decoder stream length + that an encoder accepts without completely encoding a single field + section. */ +#define NGHTTP3_QPACK_MAX_DECODERLEN (4 * 1024) /* nghttp3_qpack_indexing_mode is a indexing strategy. */ typedef enum nghttp3_qpack_indexing_mode { @@ -250,6 +258,9 @@ struct nghttp3_qpack_encoder { /* last_max_dtable_update is the dynamic table size last requested. */ size_t last_max_dtable_update; + /* uninterrupted_decoderlen is the number of bytes read from decoder + stream without encoding a single field section. */ + size_t uninterrupted_decoderlen; /* flags is bitwise OR of zero or more of NGHTTP3_QPACK_ENCODER_FLAG_*. */ uint8_t flags; @@ -325,9 +336,9 @@ nghttp3_qpack_lookup_stable(const nghttp3_nv *nv, int32_t token, * blocked (or it has been blocked already). 
*/ nghttp3_qpack_lookup_result nghttp3_qpack_encoder_lookup_dtable( - nghttp3_qpack_encoder *encoder, const nghttp3_nv *nv, int32_t token, - uint32_t hash, nghttp3_qpack_indexing_mode indexing_mode, uint64_t krcnt, - int allow_blocking); + nghttp3_qpack_encoder *encoder, const nghttp3_nv *nv, int32_t token, + uint32_t hash, nghttp3_qpack_indexing_mode indexing_mode, uint64_t krcnt, + int allow_blocking); /* * nghttp3_qpack_encoder_write_field_section_prefix writes Encoded @@ -341,8 +352,8 @@ nghttp3_qpack_lookup_result nghttp3_qpack_encoder_lookup_dtable( * Out of memory. */ int nghttp3_qpack_encoder_write_field_section_prefix( - nghttp3_qpack_encoder *encoder, nghttp3_buf *pbuf, uint64_t ricnt, - uint64_t base); + nghttp3_qpack_encoder *encoder, nghttp3_buf *pbuf, uint64_t ricnt, + uint64_t base); /* * nghttp3_qpack_encoder_write_static_indexed writes Indexed Header @@ -386,8 +397,8 @@ int nghttp3_qpack_encoder_write_dynamic_indexed(nghttp3_qpack_encoder *encoder, * Out of memory. */ int nghttp3_qpack_encoder_write_static_indexed_name( - nghttp3_qpack_encoder *encoder, nghttp3_buf *rbuf, uint64_t absidx, - const nghttp3_nv *nv); + nghttp3_qpack_encoder *encoder, nghttp3_buf *rbuf, uint64_t absidx, + const nghttp3_nv *nv); /* * nghttp3_qpack_encoder_write_dynamic_indexed writes Literal Header @@ -402,8 +413,8 @@ int nghttp3_qpack_encoder_write_static_indexed_name( * Out of memory. */ int nghttp3_qpack_encoder_write_dynamic_indexed_name( - nghttp3_qpack_encoder *encoder, nghttp3_buf *rbuf, uint64_t absidx, - uint64_t base, const nghttp3_nv *nv); + nghttp3_qpack_encoder *encoder, nghttp3_buf *rbuf, uint64_t absidx, + uint64_t base, const nghttp3_nv *nv); /* * nghttp3_qpack_encoder_write_literal writes Literal Header Field @@ -779,6 +790,9 @@ struct nghttp3_qpack_decoder { unidirectional streams which potentially receives QPACK encoded HEADER frame. */ size_t max_concurrent_streams; + /* uninterrupted_encoderlen is the number of bytes read from encoder + stream without completing a single field section. */ + size_t uninterrupted_encoderlen; }; /* @@ -880,19 +894,19 @@ int nghttp3_qpack_decoder_dtable_duplicate_add(nghttp3_qpack_decoder *decoder); int nghttp3_qpack_decoder_dtable_literal_add(nghttp3_qpack_decoder *decoder); struct nghttp3_qpack_stream_context { - /* state is a current state of reading request stream. */ - nghttp3_qpack_request_stream_state state; /* rstate is a set of intermediate state which are used to process request stream. */ nghttp3_qpack_read_state rstate; const nghttp3_mem *mem; - /* opcode is a request stream opcode being processed. */ - nghttp3_qpack_request_stream_opcode opcode; int64_t stream_id; /* ricnt is Required Insert Count to decode this header block. */ uint64_t ricnt; /* base is Base in Header Block Prefix. */ uint64_t base; + /* state is a current state of reading request stream. */ + nghttp3_qpack_request_stream_state state; + /* opcode is a request stream opcode being processed. */ + nghttp3_qpack_request_stream_opcode opcode; /* dbase_sign is the delta base sign in Header Block Prefix. */ int dbase_sign; }; @@ -991,6 +1005,6 @@ void nghttp3_qpack_decoder_emit_literal(nghttp3_qpack_decoder *decoder, * Decoder stream overflow. 
*/ int nghttp3_qpack_decoder_write_section_ack( - nghttp3_qpack_decoder *decoder, const nghttp3_qpack_stream_context *sctx); + nghttp3_qpack_decoder *decoder, const nghttp3_qpack_stream_context *sctx); -#endif /* NGHTTP3_QPACK_H */ +#endif /* !defined(NGHTTP3_QPACK_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman.c b/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman.c index c36a68ededd1af..3398f3f5080e60 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman.c @@ -78,7 +78,7 @@ uint8_t *nghttp3_qpack_huffman_encode(uint8_t *dest, const uint8_t *src, } void nghttp3_qpack_huffman_decode_context_init( - nghttp3_qpack_huffman_decode_context *ctx) { + nghttp3_qpack_huffman_decode_context *ctx) { ctx->fstate = NGHTTP3_QPACK_HUFFMAN_ACCEPTED; } @@ -93,7 +93,9 @@ nghttp3_qpack_huffman_decode(nghttp3_qpack_huffman_decode_context *ctx, uint8_t c; /* We use the decoding algorithm described in - http://graphics.ics.uci.edu/pub/Prefix.pdf */ + - http://graphics.ics.uci.edu/pub/Prefix.pdf [!!! NO LONGER VALID !!!] + - https://ics.uci.edu/~dan/pubs/Prefix.pdf + - https://github.com/nghttp2/nghttp2/files/15141264/Prefix.pdf */ for (; src != end;) { c = *src++; t = &qpack_huffman_decode_table[t->fstate & 0x1ff][c >> 4]; @@ -117,6 +119,6 @@ nghttp3_qpack_huffman_decode(nghttp3_qpack_huffman_decode_context *ctx, } int nghttp3_qpack_huffman_decode_failure_state( - nghttp3_qpack_huffman_decode_context *ctx) { + nghttp3_qpack_huffman_decode_context *ctx) { return ctx->fstate == 0x100; } diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman.h b/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman.h index fc3bc7b264a900..ab6d82a16a9134 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -75,7 +75,7 @@ typedef struct nghttp3_qpack_huffman_decode_context { extern const nghttp3_qpack_huffman_decode_node qpack_huffman_decode_table[][16]; void nghttp3_qpack_huffman_decode_context_init( - nghttp3_qpack_huffman_decode_context *ctx); + nghttp3_qpack_huffman_decode_context *ctx); /* * nghttp3_qpack_huffman_decode decodes huffman encoded byte string @@ -103,6 +103,6 @@ nghttp3_qpack_huffman_decode(nghttp3_qpack_huffman_decode_context *ctx, * indicates that huffman decoding context is in failure state. 
*/ int nghttp3_qpack_huffman_decode_failure_state( - nghttp3_qpack_huffman_decode_context *ctx); + nghttp3_qpack_huffman_decode_context *ctx); -#endif /* NGHTTP3_QPACK_HUFFMAN_H */ +#endif /* !defined(NGHTTP3_QPACK_HUFFMAN_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman_data.c b/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman_data.c index 0c104dbc0a0bd8..7c3c230f041211 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman_data.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_qpack_huffman_data.c @@ -28,4954 +28,4954 @@ /* Generated by mkhufftbl.py */ const nghttp3_qpack_huffman_sym huffman_sym_table[] = { - {13, 0xffc00000u}, {23, 0xffffb000u}, {28, 0xfffffe20u}, {28, 0xfffffe30u}, - {28, 0xfffffe40u}, {28, 0xfffffe50u}, {28, 0xfffffe60u}, {28, 0xfffffe70u}, - {28, 0xfffffe80u}, {24, 0xffffea00u}, {30, 0xfffffff0u}, {28, 0xfffffe90u}, - {28, 0xfffffea0u}, {30, 0xfffffff4u}, {28, 0xfffffeb0u}, {28, 0xfffffec0u}, - {28, 0xfffffed0u}, {28, 0xfffffee0u}, {28, 0xfffffef0u}, {28, 0xffffff00u}, - {28, 0xffffff10u}, {28, 0xffffff20u}, {30, 0xfffffff8u}, {28, 0xffffff30u}, - {28, 0xffffff40u}, {28, 0xffffff50u}, {28, 0xffffff60u}, {28, 0xffffff70u}, - {28, 0xffffff80u}, {28, 0xffffff90u}, {28, 0xffffffa0u}, {28, 0xffffffb0u}, - {6, 0x50000000u}, {10, 0xfe000000u}, {10, 0xfe400000u}, {12, 0xffa00000u}, - {13, 0xffc80000u}, {6, 0x54000000u}, {8, 0xf8000000u}, {11, 0xff400000u}, - {10, 0xfe800000u}, {10, 0xfec00000u}, {8, 0xf9000000u}, {11, 0xff600000u}, - {8, 0xfa000000u}, {6, 0x58000000u}, {6, 0x5c000000u}, {6, 0x60000000u}, - {5, 0x0u}, {5, 0x8000000u}, {5, 0x10000000u}, {6, 0x64000000u}, - {6, 0x68000000u}, {6, 0x6c000000u}, {6, 0x70000000u}, {6, 0x74000000u}, - {6, 0x78000000u}, {6, 0x7c000000u}, {7, 0xb8000000u}, {8, 0xfb000000u}, - {15, 0xfff80000u}, {6, 0x80000000u}, {12, 0xffb00000u}, {10, 0xff000000u}, - {13, 0xffd00000u}, {6, 0x84000000u}, {7, 0xba000000u}, {7, 0xbc000000u}, - {7, 0xbe000000u}, {7, 0xc0000000u}, {7, 0xc2000000u}, {7, 0xc4000000u}, - {7, 0xc6000000u}, {7, 0xc8000000u}, {7, 0xca000000u}, {7, 0xcc000000u}, - {7, 0xce000000u}, {7, 0xd0000000u}, {7, 0xd2000000u}, {7, 0xd4000000u}, - {7, 0xd6000000u}, {7, 0xd8000000u}, {7, 0xda000000u}, {7, 0xdc000000u}, - {7, 0xde000000u}, {7, 0xe0000000u}, {7, 0xe2000000u}, {7, 0xe4000000u}, - {8, 0xfc000000u}, {7, 0xe6000000u}, {8, 0xfd000000u}, {13, 0xffd80000u}, - {19, 0xfffe0000u}, {13, 0xffe00000u}, {14, 0xfff00000u}, {6, 0x88000000u}, - {15, 0xfffa0000u}, {5, 0x18000000u}, {6, 0x8c000000u}, {5, 0x20000000u}, - {6, 0x90000000u}, {5, 0x28000000u}, {6, 0x94000000u}, {6, 0x98000000u}, - {6, 0x9c000000u}, {5, 0x30000000u}, {7, 0xe8000000u}, {7, 0xea000000u}, - {6, 0xa0000000u}, {6, 0xa4000000u}, {6, 0xa8000000u}, {5, 0x38000000u}, - {6, 0xac000000u}, {7, 0xec000000u}, {6, 0xb0000000u}, {5, 0x40000000u}, - {5, 0x48000000u}, {6, 0xb4000000u}, {7, 0xee000000u}, {7, 0xf0000000u}, - {7, 0xf2000000u}, {7, 0xf4000000u}, {7, 0xf6000000u}, {15, 0xfffc0000u}, - {11, 0xff800000u}, {14, 0xfff40000u}, {13, 0xffe80000u}, {28, 0xffffffc0u}, - {20, 0xfffe6000u}, {22, 0xffff4800u}, {20, 0xfffe7000u}, {20, 0xfffe8000u}, - {22, 0xffff4c00u}, {22, 0xffff5000u}, {22, 0xffff5400u}, {23, 0xffffb200u}, - {22, 0xffff5800u}, {23, 0xffffb400u}, {23, 0xffffb600u}, {23, 0xffffb800u}, - {23, 0xffffba00u}, {23, 0xffffbc00u}, {24, 0xffffeb00u}, {23, 0xffffbe00u}, - {24, 0xffffec00u}, {24, 0xffffed00u}, {22, 0xffff5c00u}, {23, 0xffffc000u}, - {24, 0xffffee00u}, {23, 0xffffc200u}, {23, 0xffffc400u}, {23, 0xffffc600u}, - {23, 0xffffc800u}, {21, 0xfffee000u}, {22, 
0xffff6000u}, {23, 0xffffca00u}, - {22, 0xffff6400u}, {23, 0xffffcc00u}, {23, 0xffffce00u}, {24, 0xffffef00u}, - {22, 0xffff6800u}, {21, 0xfffee800u}, {20, 0xfffe9000u}, {22, 0xffff6c00u}, - {22, 0xffff7000u}, {23, 0xffffd000u}, {23, 0xffffd200u}, {21, 0xfffef000u}, - {23, 0xffffd400u}, {22, 0xffff7400u}, {22, 0xffff7800u}, {24, 0xfffff000u}, - {21, 0xfffef800u}, {22, 0xffff7c00u}, {23, 0xffffd600u}, {23, 0xffffd800u}, - {21, 0xffff0000u}, {21, 0xffff0800u}, {22, 0xffff8000u}, {21, 0xffff1000u}, - {23, 0xffffda00u}, {22, 0xffff8400u}, {23, 0xffffdc00u}, {23, 0xffffde00u}, - {20, 0xfffea000u}, {22, 0xffff8800u}, {22, 0xffff8c00u}, {22, 0xffff9000u}, - {23, 0xffffe000u}, {22, 0xffff9400u}, {22, 0xffff9800u}, {23, 0xffffe200u}, - {26, 0xfffff800u}, {26, 0xfffff840u}, {20, 0xfffeb000u}, {19, 0xfffe2000u}, - {22, 0xffff9c00u}, {23, 0xffffe400u}, {22, 0xffffa000u}, {25, 0xfffff600u}, - {26, 0xfffff880u}, {26, 0xfffff8c0u}, {26, 0xfffff900u}, {27, 0xfffffbc0u}, - {27, 0xfffffbe0u}, {26, 0xfffff940u}, {24, 0xfffff100u}, {25, 0xfffff680u}, - {19, 0xfffe4000u}, {21, 0xffff1800u}, {26, 0xfffff980u}, {27, 0xfffffc00u}, - {27, 0xfffffc20u}, {26, 0xfffff9c0u}, {27, 0xfffffc40u}, {24, 0xfffff200u}, - {21, 0xffff2000u}, {21, 0xffff2800u}, {26, 0xfffffa00u}, {26, 0xfffffa40u}, - {28, 0xffffffd0u}, {27, 0xfffffc60u}, {27, 0xfffffc80u}, {27, 0xfffffca0u}, - {20, 0xfffec000u}, {24, 0xfffff300u}, {20, 0xfffed000u}, {21, 0xffff3000u}, - {22, 0xffffa400u}, {21, 0xffff3800u}, {21, 0xffff4000u}, {23, 0xffffe600u}, - {22, 0xffffa800u}, {22, 0xffffac00u}, {25, 0xfffff700u}, {25, 0xfffff780u}, - {24, 0xfffff400u}, {24, 0xfffff500u}, {26, 0xfffffa80u}, {23, 0xffffe800u}, - {26, 0xfffffac0u}, {27, 0xfffffcc0u}, {26, 0xfffffb00u}, {26, 0xfffffb40u}, - {27, 0xfffffce0u}, {27, 0xfffffd00u}, {27, 0xfffffd20u}, {27, 0xfffffd40u}, - {27, 0xfffffd60u}, {28, 0xffffffe0u}, {27, 0xfffffd80u}, {27, 0xfffffda0u}, - {27, 0xfffffdc0u}, {27, 0xfffffde0u}, {27, 0xfffffe00u}, {26, 0xfffffb80u}, - {30, 0xfffffffcu}}; + {13, 0xffc00000u}, {23, 0xffffb000u}, {28, 0xfffffe20u}, {28, 0xfffffe30u}, + {28, 0xfffffe40u}, {28, 0xfffffe50u}, {28, 0xfffffe60u}, {28, 0xfffffe70u}, + {28, 0xfffffe80u}, {24, 0xffffea00u}, {30, 0xfffffff0u}, {28, 0xfffffe90u}, + {28, 0xfffffea0u}, {30, 0xfffffff4u}, {28, 0xfffffeb0u}, {28, 0xfffffec0u}, + {28, 0xfffffed0u}, {28, 0xfffffee0u}, {28, 0xfffffef0u}, {28, 0xffffff00u}, + {28, 0xffffff10u}, {28, 0xffffff20u}, {30, 0xfffffff8u}, {28, 0xffffff30u}, + {28, 0xffffff40u}, {28, 0xffffff50u}, {28, 0xffffff60u}, {28, 0xffffff70u}, + {28, 0xffffff80u}, {28, 0xffffff90u}, {28, 0xffffffa0u}, {28, 0xffffffb0u}, + {6, 0x50000000u}, {10, 0xfe000000u}, {10, 0xfe400000u}, {12, 0xffa00000u}, + {13, 0xffc80000u}, {6, 0x54000000u}, {8, 0xf8000000u}, {11, 0xff400000u}, + {10, 0xfe800000u}, {10, 0xfec00000u}, {8, 0xf9000000u}, {11, 0xff600000u}, + {8, 0xfa000000u}, {6, 0x58000000u}, {6, 0x5c000000u}, {6, 0x60000000u}, + {5, 0x0u}, {5, 0x8000000u}, {5, 0x10000000u}, {6, 0x64000000u}, + {6, 0x68000000u}, {6, 0x6c000000u}, {6, 0x70000000u}, {6, 0x74000000u}, + {6, 0x78000000u}, {6, 0x7c000000u}, {7, 0xb8000000u}, {8, 0xfb000000u}, + {15, 0xfff80000u}, {6, 0x80000000u}, {12, 0xffb00000u}, {10, 0xff000000u}, + {13, 0xffd00000u}, {6, 0x84000000u}, {7, 0xba000000u}, {7, 0xbc000000u}, + {7, 0xbe000000u}, {7, 0xc0000000u}, {7, 0xc2000000u}, {7, 0xc4000000u}, + {7, 0xc6000000u}, {7, 0xc8000000u}, {7, 0xca000000u}, {7, 0xcc000000u}, + {7, 0xce000000u}, {7, 0xd0000000u}, {7, 0xd2000000u}, {7, 0xd4000000u}, + {7, 0xd6000000u}, {7, 
0xd8000000u}, {7, 0xda000000u}, {7, 0xdc000000u}, + {7, 0xde000000u}, {7, 0xe0000000u}, {7, 0xe2000000u}, {7, 0xe4000000u}, + {8, 0xfc000000u}, {7, 0xe6000000u}, {8, 0xfd000000u}, {13, 0xffd80000u}, + {19, 0xfffe0000u}, {13, 0xffe00000u}, {14, 0xfff00000u}, {6, 0x88000000u}, + {15, 0xfffa0000u}, {5, 0x18000000u}, {6, 0x8c000000u}, {5, 0x20000000u}, + {6, 0x90000000u}, {5, 0x28000000u}, {6, 0x94000000u}, {6, 0x98000000u}, + {6, 0x9c000000u}, {5, 0x30000000u}, {7, 0xe8000000u}, {7, 0xea000000u}, + {6, 0xa0000000u}, {6, 0xa4000000u}, {6, 0xa8000000u}, {5, 0x38000000u}, + {6, 0xac000000u}, {7, 0xec000000u}, {6, 0xb0000000u}, {5, 0x40000000u}, + {5, 0x48000000u}, {6, 0xb4000000u}, {7, 0xee000000u}, {7, 0xf0000000u}, + {7, 0xf2000000u}, {7, 0xf4000000u}, {7, 0xf6000000u}, {15, 0xfffc0000u}, + {11, 0xff800000u}, {14, 0xfff40000u}, {13, 0xffe80000u}, {28, 0xffffffc0u}, + {20, 0xfffe6000u}, {22, 0xffff4800u}, {20, 0xfffe7000u}, {20, 0xfffe8000u}, + {22, 0xffff4c00u}, {22, 0xffff5000u}, {22, 0xffff5400u}, {23, 0xffffb200u}, + {22, 0xffff5800u}, {23, 0xffffb400u}, {23, 0xffffb600u}, {23, 0xffffb800u}, + {23, 0xffffba00u}, {23, 0xffffbc00u}, {24, 0xffffeb00u}, {23, 0xffffbe00u}, + {24, 0xffffec00u}, {24, 0xffffed00u}, {22, 0xffff5c00u}, {23, 0xffffc000u}, + {24, 0xffffee00u}, {23, 0xffffc200u}, {23, 0xffffc400u}, {23, 0xffffc600u}, + {23, 0xffffc800u}, {21, 0xfffee000u}, {22, 0xffff6000u}, {23, 0xffffca00u}, + {22, 0xffff6400u}, {23, 0xffffcc00u}, {23, 0xffffce00u}, {24, 0xffffef00u}, + {22, 0xffff6800u}, {21, 0xfffee800u}, {20, 0xfffe9000u}, {22, 0xffff6c00u}, + {22, 0xffff7000u}, {23, 0xffffd000u}, {23, 0xffffd200u}, {21, 0xfffef000u}, + {23, 0xffffd400u}, {22, 0xffff7400u}, {22, 0xffff7800u}, {24, 0xfffff000u}, + {21, 0xfffef800u}, {22, 0xffff7c00u}, {23, 0xffffd600u}, {23, 0xffffd800u}, + {21, 0xffff0000u}, {21, 0xffff0800u}, {22, 0xffff8000u}, {21, 0xffff1000u}, + {23, 0xffffda00u}, {22, 0xffff8400u}, {23, 0xffffdc00u}, {23, 0xffffde00u}, + {20, 0xfffea000u}, {22, 0xffff8800u}, {22, 0xffff8c00u}, {22, 0xffff9000u}, + {23, 0xffffe000u}, {22, 0xffff9400u}, {22, 0xffff9800u}, {23, 0xffffe200u}, + {26, 0xfffff800u}, {26, 0xfffff840u}, {20, 0xfffeb000u}, {19, 0xfffe2000u}, + {22, 0xffff9c00u}, {23, 0xffffe400u}, {22, 0xffffa000u}, {25, 0xfffff600u}, + {26, 0xfffff880u}, {26, 0xfffff8c0u}, {26, 0xfffff900u}, {27, 0xfffffbc0u}, + {27, 0xfffffbe0u}, {26, 0xfffff940u}, {24, 0xfffff100u}, {25, 0xfffff680u}, + {19, 0xfffe4000u}, {21, 0xffff1800u}, {26, 0xfffff980u}, {27, 0xfffffc00u}, + {27, 0xfffffc20u}, {26, 0xfffff9c0u}, {27, 0xfffffc40u}, {24, 0xfffff200u}, + {21, 0xffff2000u}, {21, 0xffff2800u}, {26, 0xfffffa00u}, {26, 0xfffffa40u}, + {28, 0xffffffd0u}, {27, 0xfffffc60u}, {27, 0xfffffc80u}, {27, 0xfffffca0u}, + {20, 0xfffec000u}, {24, 0xfffff300u}, {20, 0xfffed000u}, {21, 0xffff3000u}, + {22, 0xffffa400u}, {21, 0xffff3800u}, {21, 0xffff4000u}, {23, 0xffffe600u}, + {22, 0xffffa800u}, {22, 0xffffac00u}, {25, 0xfffff700u}, {25, 0xfffff780u}, + {24, 0xfffff400u}, {24, 0xfffff500u}, {26, 0xfffffa80u}, {23, 0xffffe800u}, + {26, 0xfffffac0u}, {27, 0xfffffcc0u}, {26, 0xfffffb00u}, {26, 0xfffffb40u}, + {27, 0xfffffce0u}, {27, 0xfffffd00u}, {27, 0xfffffd20u}, {27, 0xfffffd40u}, + {27, 0xfffffd60u}, {28, 0xffffffe0u}, {27, 0xfffffd80u}, {27, 0xfffffda0u}, + {27, 0xfffffdc0u}, {27, 0xfffffde0u}, {27, 0xfffffe00u}, {26, 0xfffffb80u}, + {30, 0xfffffffcu}}; const nghttp3_qpack_huffman_decode_node qpack_huffman_decode_table[][16] = { - /* 0 */ - { - {0x04, 0}, - {0x05, 0}, - {0x07, 0}, - {0x08, 0}, - {0x0b, 0}, - 
{0x0c, 0}, - {0x10, 0}, - {0x13, 0}, - {0x19, 0}, - {0x1c, 0}, - {0x20, 0}, - {0x23, 0}, - {0x2a, 0}, - {0x31, 0}, - {0x39, 0}, - {0x4040, 0}, - }, - /* 1 */ - { - {0xc000, 48}, - {0xc000, 49}, - {0xc000, 50}, - {0xc000, 97}, - {0xc000, 99}, - {0xc000, 101}, - {0xc000, 105}, - {0xc000, 111}, - {0xc000, 115}, - {0xc000, 116}, - {0x0d, 0}, - {0x0e, 0}, - {0x11, 0}, - {0x12, 0}, - {0x14, 0}, - {0x15, 0}, - }, - /* 2 */ - { - {0x8001, 48}, - {0xc016, 48}, - {0x8001, 49}, - {0xc016, 49}, - {0x8001, 50}, - {0xc016, 50}, - {0x8001, 97}, - {0xc016, 97}, - {0x8001, 99}, - {0xc016, 99}, - {0x8001, 101}, - {0xc016, 101}, - {0x8001, 105}, - {0xc016, 105}, - {0x8001, 111}, - {0xc016, 111}, - }, - /* 3 */ - { - {0x8002, 48}, - {0x8009, 48}, - {0x8017, 48}, - {0xc028, 48}, - {0x8002, 49}, - {0x8009, 49}, - {0x8017, 49}, - {0xc028, 49}, - {0x8002, 50}, - {0x8009, 50}, - {0x8017, 50}, - {0xc028, 50}, - {0x8002, 97}, - {0x8009, 97}, - {0x8017, 97}, - {0xc028, 97}, - }, - /* 4 */ - { - {0x8003, 48}, - {0x8006, 48}, - {0x800a, 48}, - {0x800f, 48}, - {0x8018, 48}, - {0x801f, 48}, - {0x8029, 48}, - {0xc038, 48}, - {0x8003, 49}, - {0x8006, 49}, - {0x800a, 49}, - {0x800f, 49}, - {0x8018, 49}, - {0x801f, 49}, - {0x8029, 49}, - {0xc038, 49}, - }, - /* 5 */ - { - {0x8003, 50}, - {0x8006, 50}, - {0x800a, 50}, - {0x800f, 50}, - {0x8018, 50}, - {0x801f, 50}, - {0x8029, 50}, - {0xc038, 50}, - {0x8003, 97}, - {0x8006, 97}, - {0x800a, 97}, - {0x800f, 97}, - {0x8018, 97}, - {0x801f, 97}, - {0x8029, 97}, - {0xc038, 97}, - }, - /* 6 */ - { - {0x8002, 99}, - {0x8009, 99}, - {0x8017, 99}, - {0xc028, 99}, - {0x8002, 101}, - {0x8009, 101}, - {0x8017, 101}, - {0xc028, 101}, - {0x8002, 105}, - {0x8009, 105}, - {0x8017, 105}, - {0xc028, 105}, - {0x8002, 111}, - {0x8009, 111}, - {0x8017, 111}, - {0xc028, 111}, - }, - /* 7 */ - { - {0x8003, 99}, - {0x8006, 99}, - {0x800a, 99}, - {0x800f, 99}, - {0x8018, 99}, - {0x801f, 99}, - {0x8029, 99}, - {0xc038, 99}, - {0x8003, 101}, - {0x8006, 101}, - {0x800a, 101}, - {0x800f, 101}, - {0x8018, 101}, - {0x801f, 101}, - {0x8029, 101}, - {0xc038, 101}, - }, - /* 8 */ - { - {0x8003, 105}, - {0x8006, 105}, - {0x800a, 105}, - {0x800f, 105}, - {0x8018, 105}, - {0x801f, 105}, - {0x8029, 105}, - {0xc038, 105}, - {0x8003, 111}, - {0x8006, 111}, - {0x800a, 111}, - {0x800f, 111}, - {0x8018, 111}, - {0x801f, 111}, - {0x8029, 111}, - {0xc038, 111}, - }, - /* 9 */ - { - {0x8001, 115}, - {0xc016, 115}, - {0x8001, 116}, - {0xc016, 116}, - {0xc000, 32}, - {0xc000, 37}, - {0xc000, 45}, - {0xc000, 46}, - {0xc000, 47}, - {0xc000, 51}, - {0xc000, 52}, - {0xc000, 53}, - {0xc000, 54}, - {0xc000, 55}, - {0xc000, 56}, - {0xc000, 57}, - }, - /* 10 */ - { - {0x8002, 115}, - {0x8009, 115}, - {0x8017, 115}, - {0xc028, 115}, - {0x8002, 116}, - {0x8009, 116}, - {0x8017, 116}, - {0xc028, 116}, - {0x8001, 32}, - {0xc016, 32}, - {0x8001, 37}, - {0xc016, 37}, - {0x8001, 45}, - {0xc016, 45}, - {0x8001, 46}, - {0xc016, 46}, - }, - /* 11 */ - { - {0x8003, 115}, - {0x8006, 115}, - {0x800a, 115}, - {0x800f, 115}, - {0x8018, 115}, - {0x801f, 115}, - {0x8029, 115}, - {0xc038, 115}, - {0x8003, 116}, - {0x8006, 116}, - {0x800a, 116}, - {0x800f, 116}, - {0x8018, 116}, - {0x801f, 116}, - {0x8029, 116}, - {0xc038, 116}, - }, - /* 12 */ - { - {0x8002, 32}, - {0x8009, 32}, - {0x8017, 32}, - {0xc028, 32}, - {0x8002, 37}, - {0x8009, 37}, - {0x8017, 37}, - {0xc028, 37}, - {0x8002, 45}, - {0x8009, 45}, - {0x8017, 45}, - {0xc028, 45}, - {0x8002, 46}, - {0x8009, 46}, - {0x8017, 46}, - {0xc028, 46}, - }, - /* 13 */ - { - {0x8003, 32}, - {0x8006, 32}, 
- {0x800a, 32}, - {0x800f, 32}, - {0x8018, 32}, - {0x801f, 32}, - {0x8029, 32}, - {0xc038, 32}, - {0x8003, 37}, - {0x8006, 37}, - {0x800a, 37}, - {0x800f, 37}, - {0x8018, 37}, - {0x801f, 37}, - {0x8029, 37}, - {0xc038, 37}, - }, - /* 14 */ - { - {0x8003, 45}, - {0x8006, 45}, - {0x800a, 45}, - {0x800f, 45}, - {0x8018, 45}, - {0x801f, 45}, - {0x8029, 45}, - {0xc038, 45}, - {0x8003, 46}, - {0x8006, 46}, - {0x800a, 46}, - {0x800f, 46}, - {0x8018, 46}, - {0x801f, 46}, - {0x8029, 46}, - {0xc038, 46}, - }, - /* 15 */ - { - {0x8001, 47}, - {0xc016, 47}, - {0x8001, 51}, - {0xc016, 51}, - {0x8001, 52}, - {0xc016, 52}, - {0x8001, 53}, - {0xc016, 53}, - {0x8001, 54}, - {0xc016, 54}, - {0x8001, 55}, - {0xc016, 55}, - {0x8001, 56}, - {0xc016, 56}, - {0x8001, 57}, - {0xc016, 57}, - }, - /* 16 */ - { - {0x8002, 47}, - {0x8009, 47}, - {0x8017, 47}, - {0xc028, 47}, - {0x8002, 51}, - {0x8009, 51}, - {0x8017, 51}, - {0xc028, 51}, - {0x8002, 52}, - {0x8009, 52}, - {0x8017, 52}, - {0xc028, 52}, - {0x8002, 53}, - {0x8009, 53}, - {0x8017, 53}, - {0xc028, 53}, - }, - /* 17 */ - { - {0x8003, 47}, - {0x8006, 47}, - {0x800a, 47}, - {0x800f, 47}, - {0x8018, 47}, - {0x801f, 47}, - {0x8029, 47}, - {0xc038, 47}, - {0x8003, 51}, - {0x8006, 51}, - {0x800a, 51}, - {0x800f, 51}, - {0x8018, 51}, - {0x801f, 51}, - {0x8029, 51}, - {0xc038, 51}, - }, - /* 18 */ - { - {0x8003, 52}, - {0x8006, 52}, - {0x800a, 52}, - {0x800f, 52}, - {0x8018, 52}, - {0x801f, 52}, - {0x8029, 52}, - {0xc038, 52}, - {0x8003, 53}, - {0x8006, 53}, - {0x800a, 53}, - {0x800f, 53}, - {0x8018, 53}, - {0x801f, 53}, - {0x8029, 53}, - {0xc038, 53}, - }, - /* 19 */ - { - {0x8002, 54}, - {0x8009, 54}, - {0x8017, 54}, - {0xc028, 54}, - {0x8002, 55}, - {0x8009, 55}, - {0x8017, 55}, - {0xc028, 55}, - {0x8002, 56}, - {0x8009, 56}, - {0x8017, 56}, - {0xc028, 56}, - {0x8002, 57}, - {0x8009, 57}, - {0x8017, 57}, - {0xc028, 57}, - }, - /* 20 */ - { - {0x8003, 54}, - {0x8006, 54}, - {0x800a, 54}, - {0x800f, 54}, - {0x8018, 54}, - {0x801f, 54}, - {0x8029, 54}, - {0xc038, 54}, - {0x8003, 55}, - {0x8006, 55}, - {0x800a, 55}, - {0x800f, 55}, - {0x8018, 55}, - {0x801f, 55}, - {0x8029, 55}, - {0xc038, 55}, - }, - /* 21 */ - { - {0x8003, 56}, - {0x8006, 56}, - {0x800a, 56}, - {0x800f, 56}, - {0x8018, 56}, - {0x801f, 56}, - {0x8029, 56}, - {0xc038, 56}, - {0x8003, 57}, - {0x8006, 57}, - {0x800a, 57}, - {0x800f, 57}, - {0x8018, 57}, - {0x801f, 57}, - {0x8029, 57}, - {0xc038, 57}, - }, - /* 22 */ - { - {0x1a, 0}, - {0x1b, 0}, - {0x1d, 0}, - {0x1e, 0}, - {0x21, 0}, - {0x22, 0}, - {0x24, 0}, - {0x25, 0}, - {0x2b, 0}, - {0x2e, 0}, - {0x32, 0}, - {0x35, 0}, - {0x3a, 0}, - {0x3d, 0}, - {0x41, 0}, - {0x4044, 0}, - }, - /* 23 */ - { - {0xc000, 61}, - {0xc000, 65}, - {0xc000, 95}, - {0xc000, 98}, - {0xc000, 100}, - {0xc000, 102}, - {0xc000, 103}, - {0xc000, 104}, - {0xc000, 108}, - {0xc000, 109}, - {0xc000, 110}, - {0xc000, 112}, - {0xc000, 114}, - {0xc000, 117}, - {0x26, 0}, - {0x27, 0}, - }, - /* 24 */ - { - {0x8001, 61}, - {0xc016, 61}, - {0x8001, 65}, - {0xc016, 65}, - {0x8001, 95}, - {0xc016, 95}, - {0x8001, 98}, - {0xc016, 98}, - {0x8001, 100}, - {0xc016, 100}, - {0x8001, 102}, - {0xc016, 102}, - {0x8001, 103}, - {0xc016, 103}, - {0x8001, 104}, - {0xc016, 104}, - }, - /* 25 */ - { - {0x8002, 61}, - {0x8009, 61}, - {0x8017, 61}, - {0xc028, 61}, - {0x8002, 65}, - {0x8009, 65}, - {0x8017, 65}, - {0xc028, 65}, - {0x8002, 95}, - {0x8009, 95}, - {0x8017, 95}, - {0xc028, 95}, - {0x8002, 98}, - {0x8009, 98}, - {0x8017, 98}, - {0xc028, 98}, - }, - /* 26 */ - { - {0x8003, 61}, - {0x8006, 61}, 
- {0x800a, 61}, - {0x800f, 61}, - {0x8018, 61}, - {0x801f, 61}, - {0x8029, 61}, - {0xc038, 61}, - {0x8003, 65}, - {0x8006, 65}, - {0x800a, 65}, - {0x800f, 65}, - {0x8018, 65}, - {0x801f, 65}, - {0x8029, 65}, - {0xc038, 65}, - }, - /* 27 */ - { - {0x8003, 95}, - {0x8006, 95}, - {0x800a, 95}, - {0x800f, 95}, - {0x8018, 95}, - {0x801f, 95}, - {0x8029, 95}, - {0xc038, 95}, - {0x8003, 98}, - {0x8006, 98}, - {0x800a, 98}, - {0x800f, 98}, - {0x8018, 98}, - {0x801f, 98}, - {0x8029, 98}, - {0xc038, 98}, - }, - /* 28 */ - { - {0x8002, 100}, - {0x8009, 100}, - {0x8017, 100}, - {0xc028, 100}, - {0x8002, 102}, - {0x8009, 102}, - {0x8017, 102}, - {0xc028, 102}, - {0x8002, 103}, - {0x8009, 103}, - {0x8017, 103}, - {0xc028, 103}, - {0x8002, 104}, - {0x8009, 104}, - {0x8017, 104}, - {0xc028, 104}, - }, - /* 29 */ - { - {0x8003, 100}, - {0x8006, 100}, - {0x800a, 100}, - {0x800f, 100}, - {0x8018, 100}, - {0x801f, 100}, - {0x8029, 100}, - {0xc038, 100}, - {0x8003, 102}, - {0x8006, 102}, - {0x800a, 102}, - {0x800f, 102}, - {0x8018, 102}, - {0x801f, 102}, - {0x8029, 102}, - {0xc038, 102}, - }, - /* 30 */ - { - {0x8003, 103}, - {0x8006, 103}, - {0x800a, 103}, - {0x800f, 103}, - {0x8018, 103}, - {0x801f, 103}, - {0x8029, 103}, - {0xc038, 103}, - {0x8003, 104}, - {0x8006, 104}, - {0x800a, 104}, - {0x800f, 104}, - {0x8018, 104}, - {0x801f, 104}, - {0x8029, 104}, - {0xc038, 104}, - }, - /* 31 */ - { - {0x8001, 108}, - {0xc016, 108}, - {0x8001, 109}, - {0xc016, 109}, - {0x8001, 110}, - {0xc016, 110}, - {0x8001, 112}, - {0xc016, 112}, - {0x8001, 114}, - {0xc016, 114}, - {0x8001, 117}, - {0xc016, 117}, - {0xc000, 58}, - {0xc000, 66}, - {0xc000, 67}, - {0xc000, 68}, - }, - /* 32 */ - { - {0x8002, 108}, - {0x8009, 108}, - {0x8017, 108}, - {0xc028, 108}, - {0x8002, 109}, - {0x8009, 109}, - {0x8017, 109}, - {0xc028, 109}, - {0x8002, 110}, - {0x8009, 110}, - {0x8017, 110}, - {0xc028, 110}, - {0x8002, 112}, - {0x8009, 112}, - {0x8017, 112}, - {0xc028, 112}, - }, - /* 33 */ - { - {0x8003, 108}, - {0x8006, 108}, - {0x800a, 108}, - {0x800f, 108}, - {0x8018, 108}, - {0x801f, 108}, - {0x8029, 108}, - {0xc038, 108}, - {0x8003, 109}, - {0x8006, 109}, - {0x800a, 109}, - {0x800f, 109}, - {0x8018, 109}, - {0x801f, 109}, - {0x8029, 109}, - {0xc038, 109}, - }, - /* 34 */ - { - {0x8003, 110}, - {0x8006, 110}, - {0x800a, 110}, - {0x800f, 110}, - {0x8018, 110}, - {0x801f, 110}, - {0x8029, 110}, - {0xc038, 110}, - {0x8003, 112}, - {0x8006, 112}, - {0x800a, 112}, - {0x800f, 112}, - {0x8018, 112}, - {0x801f, 112}, - {0x8029, 112}, - {0xc038, 112}, - }, - /* 35 */ - { - {0x8002, 114}, - {0x8009, 114}, - {0x8017, 114}, - {0xc028, 114}, - {0x8002, 117}, - {0x8009, 117}, - {0x8017, 117}, - {0xc028, 117}, - {0x8001, 58}, - {0xc016, 58}, - {0x8001, 66}, - {0xc016, 66}, - {0x8001, 67}, - {0xc016, 67}, - {0x8001, 68}, - {0xc016, 68}, - }, - /* 36 */ - { - {0x8003, 114}, - {0x8006, 114}, - {0x800a, 114}, - {0x800f, 114}, - {0x8018, 114}, - {0x801f, 114}, - {0x8029, 114}, - {0xc038, 114}, - {0x8003, 117}, - {0x8006, 117}, - {0x800a, 117}, - {0x800f, 117}, - {0x8018, 117}, - {0x801f, 117}, - {0x8029, 117}, - {0xc038, 117}, - }, - /* 37 */ - { - {0x8002, 58}, - {0x8009, 58}, - {0x8017, 58}, - {0xc028, 58}, - {0x8002, 66}, - {0x8009, 66}, - {0x8017, 66}, - {0xc028, 66}, - {0x8002, 67}, - {0x8009, 67}, - {0x8017, 67}, - {0xc028, 67}, - {0x8002, 68}, - {0x8009, 68}, - {0x8017, 68}, - {0xc028, 68}, - }, - /* 38 */ - { - {0x8003, 58}, - {0x8006, 58}, - {0x800a, 58}, - {0x800f, 58}, - {0x8018, 58}, - {0x801f, 58}, - {0x8029, 58}, - {0xc038, 58}, - {0x8003, 
66}, - {0x8006, 66}, - {0x800a, 66}, - {0x800f, 66}, - {0x8018, 66}, - {0x801f, 66}, - {0x8029, 66}, - {0xc038, 66}, - }, - /* 39 */ - { - {0x8003, 67}, - {0x8006, 67}, - {0x800a, 67}, - {0x800f, 67}, - {0x8018, 67}, - {0x801f, 67}, - {0x8029, 67}, - {0xc038, 67}, - {0x8003, 68}, - {0x8006, 68}, - {0x800a, 68}, - {0x800f, 68}, - {0x8018, 68}, - {0x801f, 68}, - {0x8029, 68}, - {0xc038, 68}, - }, - /* 40 */ - { - {0x2c, 0}, - {0x2d, 0}, - {0x2f, 0}, - {0x30, 0}, - {0x33, 0}, - {0x34, 0}, - {0x36, 0}, - {0x37, 0}, - {0x3b, 0}, - {0x3c, 0}, - {0x3e, 0}, - {0x3f, 0}, - {0x42, 0}, - {0x43, 0}, - {0x45, 0}, - {0x4048, 0}, - }, - /* 41 */ - { - {0xc000, 69}, - {0xc000, 70}, - {0xc000, 71}, - {0xc000, 72}, - {0xc000, 73}, - {0xc000, 74}, - {0xc000, 75}, - {0xc000, 76}, - {0xc000, 77}, - {0xc000, 78}, - {0xc000, 79}, - {0xc000, 80}, - {0xc000, 81}, - {0xc000, 82}, - {0xc000, 83}, - {0xc000, 84}, - }, - /* 42 */ - { - {0x8001, 69}, - {0xc016, 69}, - {0x8001, 70}, - {0xc016, 70}, - {0x8001, 71}, - {0xc016, 71}, - {0x8001, 72}, - {0xc016, 72}, - {0x8001, 73}, - {0xc016, 73}, - {0x8001, 74}, - {0xc016, 74}, - {0x8001, 75}, - {0xc016, 75}, - {0x8001, 76}, - {0xc016, 76}, - }, - /* 43 */ - { - {0x8002, 69}, - {0x8009, 69}, - {0x8017, 69}, - {0xc028, 69}, - {0x8002, 70}, - {0x8009, 70}, - {0x8017, 70}, - {0xc028, 70}, - {0x8002, 71}, - {0x8009, 71}, - {0x8017, 71}, - {0xc028, 71}, - {0x8002, 72}, - {0x8009, 72}, - {0x8017, 72}, - {0xc028, 72}, - }, - /* 44 */ - { - {0x8003, 69}, - {0x8006, 69}, - {0x800a, 69}, - {0x800f, 69}, - {0x8018, 69}, - {0x801f, 69}, - {0x8029, 69}, - {0xc038, 69}, - {0x8003, 70}, - {0x8006, 70}, - {0x800a, 70}, - {0x800f, 70}, - {0x8018, 70}, - {0x801f, 70}, - {0x8029, 70}, - {0xc038, 70}, - }, - /* 45 */ - { - {0x8003, 71}, - {0x8006, 71}, - {0x800a, 71}, - {0x800f, 71}, - {0x8018, 71}, - {0x801f, 71}, - {0x8029, 71}, - {0xc038, 71}, - {0x8003, 72}, - {0x8006, 72}, - {0x800a, 72}, - {0x800f, 72}, - {0x8018, 72}, - {0x801f, 72}, - {0x8029, 72}, - {0xc038, 72}, - }, - /* 46 */ - { - {0x8002, 73}, - {0x8009, 73}, - {0x8017, 73}, - {0xc028, 73}, - {0x8002, 74}, - {0x8009, 74}, - {0x8017, 74}, - {0xc028, 74}, - {0x8002, 75}, - {0x8009, 75}, - {0x8017, 75}, - {0xc028, 75}, - {0x8002, 76}, - {0x8009, 76}, - {0x8017, 76}, - {0xc028, 76}, - }, - /* 47 */ - { - {0x8003, 73}, - {0x8006, 73}, - {0x800a, 73}, - {0x800f, 73}, - {0x8018, 73}, - {0x801f, 73}, - {0x8029, 73}, - {0xc038, 73}, - {0x8003, 74}, - {0x8006, 74}, - {0x800a, 74}, - {0x800f, 74}, - {0x8018, 74}, - {0x801f, 74}, - {0x8029, 74}, - {0xc038, 74}, - }, - /* 48 */ - { - {0x8003, 75}, - {0x8006, 75}, - {0x800a, 75}, - {0x800f, 75}, - {0x8018, 75}, - {0x801f, 75}, - {0x8029, 75}, - {0xc038, 75}, - {0x8003, 76}, - {0x8006, 76}, - {0x800a, 76}, - {0x800f, 76}, - {0x8018, 76}, - {0x801f, 76}, - {0x8029, 76}, - {0xc038, 76}, - }, - /* 49 */ - { - {0x8001, 77}, - {0xc016, 77}, - {0x8001, 78}, - {0xc016, 78}, - {0x8001, 79}, - {0xc016, 79}, - {0x8001, 80}, - {0xc016, 80}, - {0x8001, 81}, - {0xc016, 81}, - {0x8001, 82}, - {0xc016, 82}, - {0x8001, 83}, - {0xc016, 83}, - {0x8001, 84}, - {0xc016, 84}, - }, - /* 50 */ - { - {0x8002, 77}, - {0x8009, 77}, - {0x8017, 77}, - {0xc028, 77}, - {0x8002, 78}, - {0x8009, 78}, - {0x8017, 78}, - {0xc028, 78}, - {0x8002, 79}, - {0x8009, 79}, - {0x8017, 79}, - {0xc028, 79}, - {0x8002, 80}, - {0x8009, 80}, - {0x8017, 80}, - {0xc028, 80}, - }, - /* 51 */ - { - {0x8003, 77}, - {0x8006, 77}, - {0x800a, 77}, - {0x800f, 77}, - {0x8018, 77}, - {0x801f, 77}, - {0x8029, 77}, - {0xc038, 77}, - {0x8003, 78}, - 
{0x8006, 78}, - {0x800a, 78}, - {0x800f, 78}, - {0x8018, 78}, - {0x801f, 78}, - {0x8029, 78}, - {0xc038, 78}, - }, - /* 52 */ - { - {0x8003, 79}, - {0x8006, 79}, - {0x800a, 79}, - {0x800f, 79}, - {0x8018, 79}, - {0x801f, 79}, - {0x8029, 79}, - {0xc038, 79}, - {0x8003, 80}, - {0x8006, 80}, - {0x800a, 80}, - {0x800f, 80}, - {0x8018, 80}, - {0x801f, 80}, - {0x8029, 80}, - {0xc038, 80}, - }, - /* 53 */ - { - {0x8002, 81}, - {0x8009, 81}, - {0x8017, 81}, - {0xc028, 81}, - {0x8002, 82}, - {0x8009, 82}, - {0x8017, 82}, - {0xc028, 82}, - {0x8002, 83}, - {0x8009, 83}, - {0x8017, 83}, - {0xc028, 83}, - {0x8002, 84}, - {0x8009, 84}, - {0x8017, 84}, - {0xc028, 84}, - }, - /* 54 */ - { - {0x8003, 81}, - {0x8006, 81}, - {0x800a, 81}, - {0x800f, 81}, - {0x8018, 81}, - {0x801f, 81}, - {0x8029, 81}, - {0xc038, 81}, - {0x8003, 82}, - {0x8006, 82}, - {0x800a, 82}, - {0x800f, 82}, - {0x8018, 82}, - {0x801f, 82}, - {0x8029, 82}, - {0xc038, 82}, - }, - /* 55 */ - { - {0x8003, 83}, - {0x8006, 83}, - {0x800a, 83}, - {0x800f, 83}, - {0x8018, 83}, - {0x801f, 83}, - {0x8029, 83}, - {0xc038, 83}, - {0x8003, 84}, - {0x8006, 84}, - {0x800a, 84}, - {0x800f, 84}, - {0x8018, 84}, - {0x801f, 84}, - {0x8029, 84}, - {0xc038, 84}, - }, - /* 56 */ - { - {0xc000, 85}, - {0xc000, 86}, - {0xc000, 87}, - {0xc000, 89}, - {0xc000, 106}, - {0xc000, 107}, - {0xc000, 113}, - {0xc000, 118}, - {0xc000, 119}, - {0xc000, 120}, - {0xc000, 121}, - {0xc000, 122}, - {0x46, 0}, - {0x47, 0}, - {0x49, 0}, - {0x404a, 0}, - }, - /* 57 */ - { - {0x8001, 85}, - {0xc016, 85}, - {0x8001, 86}, - {0xc016, 86}, - {0x8001, 87}, - {0xc016, 87}, - {0x8001, 89}, - {0xc016, 89}, - {0x8001, 106}, - {0xc016, 106}, - {0x8001, 107}, - {0xc016, 107}, - {0x8001, 113}, - {0xc016, 113}, - {0x8001, 118}, - {0xc016, 118}, - }, - /* 58 */ - { - {0x8002, 85}, - {0x8009, 85}, - {0x8017, 85}, - {0xc028, 85}, - {0x8002, 86}, - {0x8009, 86}, - {0x8017, 86}, - {0xc028, 86}, - {0x8002, 87}, - {0x8009, 87}, - {0x8017, 87}, - {0xc028, 87}, - {0x8002, 89}, - {0x8009, 89}, - {0x8017, 89}, - {0xc028, 89}, - }, - /* 59 */ - { - {0x8003, 85}, - {0x8006, 85}, - {0x800a, 85}, - {0x800f, 85}, - {0x8018, 85}, - {0x801f, 85}, - {0x8029, 85}, - {0xc038, 85}, - {0x8003, 86}, - {0x8006, 86}, - {0x800a, 86}, - {0x800f, 86}, - {0x8018, 86}, - {0x801f, 86}, - {0x8029, 86}, - {0xc038, 86}, - }, - /* 60 */ - { - {0x8003, 87}, - {0x8006, 87}, - {0x800a, 87}, - {0x800f, 87}, - {0x8018, 87}, - {0x801f, 87}, - {0x8029, 87}, - {0xc038, 87}, - {0x8003, 89}, - {0x8006, 89}, - {0x800a, 89}, - {0x800f, 89}, - {0x8018, 89}, - {0x801f, 89}, - {0x8029, 89}, - {0xc038, 89}, - }, - /* 61 */ - { - {0x8002, 106}, - {0x8009, 106}, - {0x8017, 106}, - {0xc028, 106}, - {0x8002, 107}, - {0x8009, 107}, - {0x8017, 107}, - {0xc028, 107}, - {0x8002, 113}, - {0x8009, 113}, - {0x8017, 113}, - {0xc028, 113}, - {0x8002, 118}, - {0x8009, 118}, - {0x8017, 118}, - {0xc028, 118}, - }, - /* 62 */ - { - {0x8003, 106}, - {0x8006, 106}, - {0x800a, 106}, - {0x800f, 106}, - {0x8018, 106}, - {0x801f, 106}, - {0x8029, 106}, - {0xc038, 106}, - {0x8003, 107}, - {0x8006, 107}, - {0x800a, 107}, - {0x800f, 107}, - {0x8018, 107}, - {0x801f, 107}, - {0x8029, 107}, - {0xc038, 107}, - }, - /* 63 */ - { - {0x8003, 113}, - {0x8006, 113}, - {0x800a, 113}, - {0x800f, 113}, - {0x8018, 113}, - {0x801f, 113}, - {0x8029, 113}, - {0xc038, 113}, - {0x8003, 118}, - {0x8006, 118}, - {0x800a, 118}, - {0x800f, 118}, - {0x8018, 118}, - {0x801f, 118}, - {0x8029, 118}, - {0xc038, 118}, - }, - /* 64 */ - { - {0x8001, 119}, - {0xc016, 119}, - {0x8001, 120}, - 
{0xc016, 120}, - {0x8001, 121}, - {0xc016, 121}, - {0x8001, 122}, - {0xc016, 122}, - {0xc000, 38}, - {0xc000, 42}, - {0xc000, 44}, - {0xc000, 59}, - {0xc000, 88}, - {0xc000, 90}, - {0x4b, 0}, - {0x4e, 0}, - }, - /* 65 */ - { - {0x8002, 119}, - {0x8009, 119}, - {0x8017, 119}, - {0xc028, 119}, - {0x8002, 120}, - {0x8009, 120}, - {0x8017, 120}, - {0xc028, 120}, - {0x8002, 121}, - {0x8009, 121}, - {0x8017, 121}, - {0xc028, 121}, - {0x8002, 122}, - {0x8009, 122}, - {0x8017, 122}, - {0xc028, 122}, - }, - /* 66 */ - { - {0x8003, 119}, - {0x8006, 119}, - {0x800a, 119}, - {0x800f, 119}, - {0x8018, 119}, - {0x801f, 119}, - {0x8029, 119}, - {0xc038, 119}, - {0x8003, 120}, - {0x8006, 120}, - {0x800a, 120}, - {0x800f, 120}, - {0x8018, 120}, - {0x801f, 120}, - {0x8029, 120}, - {0xc038, 120}, - }, - /* 67 */ - { - {0x8003, 121}, - {0x8006, 121}, - {0x800a, 121}, - {0x800f, 121}, - {0x8018, 121}, - {0x801f, 121}, - {0x8029, 121}, - {0xc038, 121}, - {0x8003, 122}, - {0x8006, 122}, - {0x800a, 122}, - {0x800f, 122}, - {0x8018, 122}, - {0x801f, 122}, - {0x8029, 122}, - {0xc038, 122}, - }, - /* 68 */ - { - {0x8001, 38}, - {0xc016, 38}, - {0x8001, 42}, - {0xc016, 42}, - {0x8001, 44}, - {0xc016, 44}, - {0x8001, 59}, - {0xc016, 59}, - {0x8001, 88}, - {0xc016, 88}, - {0x8001, 90}, - {0xc016, 90}, - {0x4c, 0}, - {0x4d, 0}, - {0x4f, 0}, - {0x51, 0}, - }, - /* 69 */ - { - {0x8002, 38}, - {0x8009, 38}, - {0x8017, 38}, - {0xc028, 38}, - {0x8002, 42}, - {0x8009, 42}, - {0x8017, 42}, - {0xc028, 42}, - {0x8002, 44}, - {0x8009, 44}, - {0x8017, 44}, - {0xc028, 44}, - {0x8002, 59}, - {0x8009, 59}, - {0x8017, 59}, - {0xc028, 59}, - }, - /* 70 */ - { - {0x8003, 38}, - {0x8006, 38}, - {0x800a, 38}, - {0x800f, 38}, - {0x8018, 38}, - {0x801f, 38}, - {0x8029, 38}, - {0xc038, 38}, - {0x8003, 42}, - {0x8006, 42}, - {0x800a, 42}, - {0x800f, 42}, - {0x8018, 42}, - {0x801f, 42}, - {0x8029, 42}, - {0xc038, 42}, - }, - /* 71 */ - { - {0x8003, 44}, - {0x8006, 44}, - {0x800a, 44}, - {0x800f, 44}, - {0x8018, 44}, - {0x801f, 44}, - {0x8029, 44}, - {0xc038, 44}, - {0x8003, 59}, - {0x8006, 59}, - {0x800a, 59}, - {0x800f, 59}, - {0x8018, 59}, - {0x801f, 59}, - {0x8029, 59}, - {0xc038, 59}, - }, - /* 72 */ - { - {0x8002, 88}, - {0x8009, 88}, - {0x8017, 88}, - {0xc028, 88}, - {0x8002, 90}, - {0x8009, 90}, - {0x8017, 90}, - {0xc028, 90}, - {0xc000, 33}, - {0xc000, 34}, - {0xc000, 40}, - {0xc000, 41}, - {0xc000, 63}, - {0x50, 0}, - {0x52, 0}, - {0x54, 0}, - }, - /* 73 */ - { - {0x8003, 88}, - {0x8006, 88}, - {0x800a, 88}, - {0x800f, 88}, - {0x8018, 88}, - {0x801f, 88}, - {0x8029, 88}, - {0xc038, 88}, - {0x8003, 90}, - {0x8006, 90}, - {0x800a, 90}, - {0x800f, 90}, - {0x8018, 90}, - {0x801f, 90}, - {0x8029, 90}, - {0xc038, 90}, - }, - /* 74 */ - { - {0x8001, 33}, - {0xc016, 33}, - {0x8001, 34}, - {0xc016, 34}, - {0x8001, 40}, - {0xc016, 40}, - {0x8001, 41}, - {0xc016, 41}, - {0x8001, 63}, - {0xc016, 63}, - {0xc000, 39}, - {0xc000, 43}, - {0xc000, 124}, - {0x53, 0}, - {0x55, 0}, - {0x58, 0}, - }, - /* 75 */ - { - {0x8002, 33}, - {0x8009, 33}, - {0x8017, 33}, - {0xc028, 33}, - {0x8002, 34}, - {0x8009, 34}, - {0x8017, 34}, - {0xc028, 34}, - {0x8002, 40}, - {0x8009, 40}, - {0x8017, 40}, - {0xc028, 40}, - {0x8002, 41}, - {0x8009, 41}, - {0x8017, 41}, - {0xc028, 41}, - }, - /* 76 */ - { - {0x8003, 33}, - {0x8006, 33}, - {0x800a, 33}, - {0x800f, 33}, - {0x8018, 33}, - {0x801f, 33}, - {0x8029, 33}, - {0xc038, 33}, - {0x8003, 34}, - {0x8006, 34}, - {0x800a, 34}, - {0x800f, 34}, - {0x8018, 34}, - {0x801f, 34}, - {0x8029, 34}, - {0xc038, 34}, - }, - /* 77 */ - 
{ - {0x8003, 40}, - {0x8006, 40}, - {0x800a, 40}, - {0x800f, 40}, - {0x8018, 40}, - {0x801f, 40}, - {0x8029, 40}, - {0xc038, 40}, - {0x8003, 41}, - {0x8006, 41}, - {0x800a, 41}, - {0x800f, 41}, - {0x8018, 41}, - {0x801f, 41}, - {0x8029, 41}, - {0xc038, 41}, - }, - /* 78 */ - { - {0x8002, 63}, - {0x8009, 63}, - {0x8017, 63}, - {0xc028, 63}, - {0x8001, 39}, - {0xc016, 39}, - {0x8001, 43}, - {0xc016, 43}, - {0x8001, 124}, - {0xc016, 124}, - {0xc000, 35}, - {0xc000, 62}, - {0x56, 0}, - {0x57, 0}, - {0x59, 0}, - {0x5a, 0}, - }, - /* 79 */ - { - {0x8003, 63}, - {0x8006, 63}, - {0x800a, 63}, - {0x800f, 63}, - {0x8018, 63}, - {0x801f, 63}, - {0x8029, 63}, - {0xc038, 63}, - {0x8002, 39}, - {0x8009, 39}, - {0x8017, 39}, - {0xc028, 39}, - {0x8002, 43}, - {0x8009, 43}, - {0x8017, 43}, - {0xc028, 43}, - }, - /* 80 */ - { - {0x8003, 39}, - {0x8006, 39}, - {0x800a, 39}, - {0x800f, 39}, - {0x8018, 39}, - {0x801f, 39}, - {0x8029, 39}, - {0xc038, 39}, - {0x8003, 43}, - {0x8006, 43}, - {0x800a, 43}, - {0x800f, 43}, - {0x8018, 43}, - {0x801f, 43}, - {0x8029, 43}, - {0xc038, 43}, - }, - /* 81 */ - { - {0x8002, 124}, - {0x8009, 124}, - {0x8017, 124}, - {0xc028, 124}, - {0x8001, 35}, - {0xc016, 35}, - {0x8001, 62}, - {0xc016, 62}, - {0xc000, 0}, - {0xc000, 36}, - {0xc000, 64}, - {0xc000, 91}, - {0xc000, 93}, - {0xc000, 126}, - {0x5b, 0}, - {0x5c, 0}, - }, - /* 82 */ - { - {0x8003, 124}, - {0x8006, 124}, - {0x800a, 124}, - {0x800f, 124}, - {0x8018, 124}, - {0x801f, 124}, - {0x8029, 124}, - {0xc038, 124}, - {0x8002, 35}, - {0x8009, 35}, - {0x8017, 35}, - {0xc028, 35}, - {0x8002, 62}, - {0x8009, 62}, - {0x8017, 62}, - {0xc028, 62}, - }, - /* 83 */ - { - {0x8003, 35}, - {0x8006, 35}, - {0x800a, 35}, - {0x800f, 35}, - {0x8018, 35}, - {0x801f, 35}, - {0x8029, 35}, - {0xc038, 35}, - {0x8003, 62}, - {0x8006, 62}, - {0x800a, 62}, - {0x800f, 62}, - {0x8018, 62}, - {0x801f, 62}, - {0x8029, 62}, - {0xc038, 62}, - }, - /* 84 */ - { - {0x8001, 0}, - {0xc016, 0}, - {0x8001, 36}, - {0xc016, 36}, - {0x8001, 64}, - {0xc016, 64}, - {0x8001, 91}, - {0xc016, 91}, - {0x8001, 93}, - {0xc016, 93}, - {0x8001, 126}, - {0xc016, 126}, - {0xc000, 94}, - {0xc000, 125}, - {0x5d, 0}, - {0x5e, 0}, - }, - /* 85 */ - { - {0x8002, 0}, - {0x8009, 0}, - {0x8017, 0}, - {0xc028, 0}, - {0x8002, 36}, - {0x8009, 36}, - {0x8017, 36}, - {0xc028, 36}, - {0x8002, 64}, - {0x8009, 64}, - {0x8017, 64}, - {0xc028, 64}, - {0x8002, 91}, - {0x8009, 91}, - {0x8017, 91}, - {0xc028, 91}, - }, - /* 86 */ - { - {0x8003, 0}, - {0x8006, 0}, - {0x800a, 0}, - {0x800f, 0}, - {0x8018, 0}, - {0x801f, 0}, - {0x8029, 0}, - {0xc038, 0}, - {0x8003, 36}, - {0x8006, 36}, - {0x800a, 36}, - {0x800f, 36}, - {0x8018, 36}, - {0x801f, 36}, - {0x8029, 36}, - {0xc038, 36}, - }, - /* 87 */ - { - {0x8003, 64}, - {0x8006, 64}, - {0x800a, 64}, - {0x800f, 64}, - {0x8018, 64}, - {0x801f, 64}, - {0x8029, 64}, - {0xc038, 64}, - {0x8003, 91}, - {0x8006, 91}, - {0x800a, 91}, - {0x800f, 91}, - {0x8018, 91}, - {0x801f, 91}, - {0x8029, 91}, - {0xc038, 91}, - }, - /* 88 */ - { - {0x8002, 93}, - {0x8009, 93}, - {0x8017, 93}, - {0xc028, 93}, - {0x8002, 126}, - {0x8009, 126}, - {0x8017, 126}, - {0xc028, 126}, - {0x8001, 94}, - {0xc016, 94}, - {0x8001, 125}, - {0xc016, 125}, - {0xc000, 60}, - {0xc000, 96}, - {0xc000, 123}, - {0x5f, 0}, - }, - /* 89 */ - { - {0x8003, 93}, - {0x8006, 93}, - {0x800a, 93}, - {0x800f, 93}, - {0x8018, 93}, - {0x801f, 93}, - {0x8029, 93}, - {0xc038, 93}, - {0x8003, 126}, - {0x8006, 126}, - {0x800a, 126}, - {0x800f, 126}, - {0x8018, 126}, - {0x801f, 126}, - {0x8029, 126}, - {0xc038, 
126}, - }, - /* 90 */ - { - {0x8002, 94}, - {0x8009, 94}, - {0x8017, 94}, - {0xc028, 94}, - {0x8002, 125}, - {0x8009, 125}, - {0x8017, 125}, - {0xc028, 125}, - {0x8001, 60}, - {0xc016, 60}, - {0x8001, 96}, - {0xc016, 96}, - {0x8001, 123}, - {0xc016, 123}, - {0x60, 0}, - {0x6e, 0}, - }, - /* 91 */ - { - {0x8003, 94}, - {0x8006, 94}, - {0x800a, 94}, - {0x800f, 94}, - {0x8018, 94}, - {0x801f, 94}, - {0x8029, 94}, - {0xc038, 94}, - {0x8003, 125}, - {0x8006, 125}, - {0x800a, 125}, - {0x800f, 125}, - {0x8018, 125}, - {0x801f, 125}, - {0x8029, 125}, - {0xc038, 125}, - }, - /* 92 */ - { - {0x8002, 60}, - {0x8009, 60}, - {0x8017, 60}, - {0xc028, 60}, - {0x8002, 96}, - {0x8009, 96}, - {0x8017, 96}, - {0xc028, 96}, - {0x8002, 123}, - {0x8009, 123}, - {0x8017, 123}, - {0xc028, 123}, - {0x61, 0}, - {0x65, 0}, - {0x6f, 0}, - {0x85, 0}, - }, - /* 93 */ - { - {0x8003, 60}, - {0x8006, 60}, - {0x800a, 60}, - {0x800f, 60}, - {0x8018, 60}, - {0x801f, 60}, - {0x8029, 60}, - {0xc038, 60}, - {0x8003, 96}, - {0x8006, 96}, - {0x800a, 96}, - {0x800f, 96}, - {0x8018, 96}, - {0x801f, 96}, - {0x8029, 96}, - {0xc038, 96}, - }, - /* 94 */ - { - {0x8003, 123}, - {0x8006, 123}, - {0x800a, 123}, - {0x800f, 123}, - {0x8018, 123}, - {0x801f, 123}, - {0x8029, 123}, - {0xc038, 123}, - {0x62, 0}, - {0x63, 0}, - {0x66, 0}, - {0x69, 0}, - {0x70, 0}, - {0x77, 0}, - {0x86, 0}, - {0x99, 0}, - }, - /* 95 */ - { - {0xc000, 92}, - {0xc000, 195}, - {0xc000, 208}, - {0x64, 0}, - {0x67, 0}, - {0x68, 0}, - {0x6a, 0}, - {0x6b, 0}, - {0x71, 0}, - {0x74, 0}, - {0x78, 0}, - {0x7e, 0}, - {0x87, 0}, - {0x8e, 0}, - {0x9a, 0}, - {0xa9, 0}, - }, - /* 96 */ - { - {0x8001, 92}, - {0xc016, 92}, - {0x8001, 195}, - {0xc016, 195}, - {0x8001, 208}, - {0xc016, 208}, - {0xc000, 128}, - {0xc000, 130}, - {0xc000, 131}, - {0xc000, 162}, - {0xc000, 184}, - {0xc000, 194}, - {0xc000, 224}, - {0xc000, 226}, - {0x6c, 0}, - {0x6d, 0}, - }, - /* 97 */ - { - {0x8002, 92}, - {0x8009, 92}, - {0x8017, 92}, - {0xc028, 92}, - {0x8002, 195}, - {0x8009, 195}, - {0x8017, 195}, - {0xc028, 195}, - {0x8002, 208}, - {0x8009, 208}, - {0x8017, 208}, - {0xc028, 208}, - {0x8001, 128}, - {0xc016, 128}, - {0x8001, 130}, - {0xc016, 130}, - }, - /* 98 */ - { - {0x8003, 92}, - {0x8006, 92}, - {0x800a, 92}, - {0x800f, 92}, - {0x8018, 92}, - {0x801f, 92}, - {0x8029, 92}, - {0xc038, 92}, - {0x8003, 195}, - {0x8006, 195}, - {0x800a, 195}, - {0x800f, 195}, - {0x8018, 195}, - {0x801f, 195}, - {0x8029, 195}, - {0xc038, 195}, - }, - /* 99 */ - { - {0x8003, 208}, - {0x8006, 208}, - {0x800a, 208}, - {0x800f, 208}, - {0x8018, 208}, - {0x801f, 208}, - {0x8029, 208}, - {0xc038, 208}, - {0x8002, 128}, - {0x8009, 128}, - {0x8017, 128}, - {0xc028, 128}, - {0x8002, 130}, - {0x8009, 130}, - {0x8017, 130}, - {0xc028, 130}, - }, - /* 100 */ - { - {0x8003, 128}, - {0x8006, 128}, - {0x800a, 128}, - {0x800f, 128}, - {0x8018, 128}, - {0x801f, 128}, - {0x8029, 128}, - {0xc038, 128}, - {0x8003, 130}, - {0x8006, 130}, - {0x800a, 130}, - {0x800f, 130}, - {0x8018, 130}, - {0x801f, 130}, - {0x8029, 130}, - {0xc038, 130}, - }, - /* 101 */ - { - {0x8001, 131}, - {0xc016, 131}, - {0x8001, 162}, - {0xc016, 162}, - {0x8001, 184}, - {0xc016, 184}, - {0x8001, 194}, - {0xc016, 194}, - {0x8001, 224}, - {0xc016, 224}, - {0x8001, 226}, - {0xc016, 226}, - {0xc000, 153}, - {0xc000, 161}, - {0xc000, 167}, - {0xc000, 172}, - }, - /* 102 */ - { - {0x8002, 131}, - {0x8009, 131}, - {0x8017, 131}, - {0xc028, 131}, - {0x8002, 162}, - {0x8009, 162}, - {0x8017, 162}, - {0xc028, 162}, - {0x8002, 184}, - {0x8009, 184}, - {0x8017, 184}, - 
{0xc028, 184}, - {0x8002, 194}, - {0x8009, 194}, - {0x8017, 194}, - {0xc028, 194}, - }, - /* 103 */ - { - {0x8003, 131}, - {0x8006, 131}, - {0x800a, 131}, - {0x800f, 131}, - {0x8018, 131}, - {0x801f, 131}, - {0x8029, 131}, - {0xc038, 131}, - {0x8003, 162}, - {0x8006, 162}, - {0x800a, 162}, - {0x800f, 162}, - {0x8018, 162}, - {0x801f, 162}, - {0x8029, 162}, - {0xc038, 162}, - }, - /* 104 */ - { - {0x8003, 184}, - {0x8006, 184}, - {0x800a, 184}, - {0x800f, 184}, - {0x8018, 184}, - {0x801f, 184}, - {0x8029, 184}, - {0xc038, 184}, - {0x8003, 194}, - {0x8006, 194}, - {0x800a, 194}, - {0x800f, 194}, - {0x8018, 194}, - {0x801f, 194}, - {0x8029, 194}, - {0xc038, 194}, - }, - /* 105 */ - { - {0x8002, 224}, - {0x8009, 224}, - {0x8017, 224}, - {0xc028, 224}, - {0x8002, 226}, - {0x8009, 226}, - {0x8017, 226}, - {0xc028, 226}, - {0x8001, 153}, - {0xc016, 153}, - {0x8001, 161}, - {0xc016, 161}, - {0x8001, 167}, - {0xc016, 167}, - {0x8001, 172}, - {0xc016, 172}, - }, - /* 106 */ - { - {0x8003, 224}, - {0x8006, 224}, - {0x800a, 224}, - {0x800f, 224}, - {0x8018, 224}, - {0x801f, 224}, - {0x8029, 224}, - {0xc038, 224}, - {0x8003, 226}, - {0x8006, 226}, - {0x800a, 226}, - {0x800f, 226}, - {0x8018, 226}, - {0x801f, 226}, - {0x8029, 226}, - {0xc038, 226}, - }, - /* 107 */ - { - {0x8002, 153}, - {0x8009, 153}, - {0x8017, 153}, - {0xc028, 153}, - {0x8002, 161}, - {0x8009, 161}, - {0x8017, 161}, - {0xc028, 161}, - {0x8002, 167}, - {0x8009, 167}, - {0x8017, 167}, - {0xc028, 167}, - {0x8002, 172}, - {0x8009, 172}, - {0x8017, 172}, - {0xc028, 172}, - }, - /* 108 */ - { - {0x8003, 153}, - {0x8006, 153}, - {0x800a, 153}, - {0x800f, 153}, - {0x8018, 153}, - {0x801f, 153}, - {0x8029, 153}, - {0xc038, 153}, - {0x8003, 161}, - {0x8006, 161}, - {0x800a, 161}, - {0x800f, 161}, - {0x8018, 161}, - {0x801f, 161}, - {0x8029, 161}, - {0xc038, 161}, - }, - /* 109 */ - { - {0x8003, 167}, - {0x8006, 167}, - {0x800a, 167}, - {0x800f, 167}, - {0x8018, 167}, - {0x801f, 167}, - {0x8029, 167}, - {0xc038, 167}, - {0x8003, 172}, - {0x8006, 172}, - {0x800a, 172}, - {0x800f, 172}, - {0x8018, 172}, - {0x801f, 172}, - {0x8029, 172}, - {0xc038, 172}, - }, - /* 110 */ - { - {0x72, 0}, - {0x73, 0}, - {0x75, 0}, - {0x76, 0}, - {0x79, 0}, - {0x7b, 0}, - {0x7f, 0}, - {0x82, 0}, - {0x88, 0}, - {0x8b, 0}, - {0x8f, 0}, - {0x92, 0}, - {0x9b, 0}, - {0xa2, 0}, - {0xaa, 0}, - {0xb4, 0}, - }, - /* 111 */ - { - {0xc000, 176}, - {0xc000, 177}, - {0xc000, 179}, - {0xc000, 209}, - {0xc000, 216}, - {0xc000, 217}, - {0xc000, 227}, - {0xc000, 229}, - {0xc000, 230}, - {0x7a, 0}, - {0x7c, 0}, - {0x7d, 0}, - {0x80, 0}, - {0x81, 0}, - {0x83, 0}, - {0x84, 0}, - }, - /* 112 */ - { - {0x8001, 176}, - {0xc016, 176}, - {0x8001, 177}, - {0xc016, 177}, - {0x8001, 179}, - {0xc016, 179}, - {0x8001, 209}, - {0xc016, 209}, - {0x8001, 216}, - {0xc016, 216}, - {0x8001, 217}, - {0xc016, 217}, - {0x8001, 227}, - {0xc016, 227}, - {0x8001, 229}, - {0xc016, 229}, - }, - /* 113 */ - { - {0x8002, 176}, - {0x8009, 176}, - {0x8017, 176}, - {0xc028, 176}, - {0x8002, 177}, - {0x8009, 177}, - {0x8017, 177}, - {0xc028, 177}, - {0x8002, 179}, - {0x8009, 179}, - {0x8017, 179}, - {0xc028, 179}, - {0x8002, 209}, - {0x8009, 209}, - {0x8017, 209}, - {0xc028, 209}, - }, - /* 114 */ - { - {0x8003, 176}, - {0x8006, 176}, - {0x800a, 176}, - {0x800f, 176}, - {0x8018, 176}, - {0x801f, 176}, - {0x8029, 176}, - {0xc038, 176}, - {0x8003, 177}, - {0x8006, 177}, - {0x800a, 177}, - {0x800f, 177}, - {0x8018, 177}, - {0x801f, 177}, - {0x8029, 177}, - {0xc038, 177}, - }, - /* 115 */ - { - {0x8003, 179}, - 
{0x8006, 179}, - {0x800a, 179}, - {0x800f, 179}, - {0x8018, 179}, - {0x801f, 179}, - {0x8029, 179}, - {0xc038, 179}, - {0x8003, 209}, - {0x8006, 209}, - {0x800a, 209}, - {0x800f, 209}, - {0x8018, 209}, - {0x801f, 209}, - {0x8029, 209}, - {0xc038, 209}, - }, - /* 116 */ - { - {0x8002, 216}, - {0x8009, 216}, - {0x8017, 216}, - {0xc028, 216}, - {0x8002, 217}, - {0x8009, 217}, - {0x8017, 217}, - {0xc028, 217}, - {0x8002, 227}, - {0x8009, 227}, - {0x8017, 227}, - {0xc028, 227}, - {0x8002, 229}, - {0x8009, 229}, - {0x8017, 229}, - {0xc028, 229}, - }, - /* 117 */ - { - {0x8003, 216}, - {0x8006, 216}, - {0x800a, 216}, - {0x800f, 216}, - {0x8018, 216}, - {0x801f, 216}, - {0x8029, 216}, - {0xc038, 216}, - {0x8003, 217}, - {0x8006, 217}, - {0x800a, 217}, - {0x800f, 217}, - {0x8018, 217}, - {0x801f, 217}, - {0x8029, 217}, - {0xc038, 217}, - }, - /* 118 */ - { - {0x8003, 227}, - {0x8006, 227}, - {0x800a, 227}, - {0x800f, 227}, - {0x8018, 227}, - {0x801f, 227}, - {0x8029, 227}, - {0xc038, 227}, - {0x8003, 229}, - {0x8006, 229}, - {0x800a, 229}, - {0x800f, 229}, - {0x8018, 229}, - {0x801f, 229}, - {0x8029, 229}, - {0xc038, 229}, - }, - /* 119 */ - { - {0x8001, 230}, - {0xc016, 230}, - {0xc000, 129}, - {0xc000, 132}, - {0xc000, 133}, - {0xc000, 134}, - {0xc000, 136}, - {0xc000, 146}, - {0xc000, 154}, - {0xc000, 156}, - {0xc000, 160}, - {0xc000, 163}, - {0xc000, 164}, - {0xc000, 169}, - {0xc000, 170}, - {0xc000, 173}, - }, - /* 120 */ - { - {0x8002, 230}, - {0x8009, 230}, - {0x8017, 230}, - {0xc028, 230}, - {0x8001, 129}, - {0xc016, 129}, - {0x8001, 132}, - {0xc016, 132}, - {0x8001, 133}, - {0xc016, 133}, - {0x8001, 134}, - {0xc016, 134}, - {0x8001, 136}, - {0xc016, 136}, - {0x8001, 146}, - {0xc016, 146}, - }, - /* 121 */ - { - {0x8003, 230}, - {0x8006, 230}, - {0x800a, 230}, - {0x800f, 230}, - {0x8018, 230}, - {0x801f, 230}, - {0x8029, 230}, - {0xc038, 230}, - {0x8002, 129}, - {0x8009, 129}, - {0x8017, 129}, - {0xc028, 129}, - {0x8002, 132}, - {0x8009, 132}, - {0x8017, 132}, - {0xc028, 132}, - }, - /* 122 */ - { - {0x8003, 129}, - {0x8006, 129}, - {0x800a, 129}, - {0x800f, 129}, - {0x8018, 129}, - {0x801f, 129}, - {0x8029, 129}, - {0xc038, 129}, - {0x8003, 132}, - {0x8006, 132}, - {0x800a, 132}, - {0x800f, 132}, - {0x8018, 132}, - {0x801f, 132}, - {0x8029, 132}, - {0xc038, 132}, - }, - /* 123 */ - { - {0x8002, 133}, - {0x8009, 133}, - {0x8017, 133}, - {0xc028, 133}, - {0x8002, 134}, - {0x8009, 134}, - {0x8017, 134}, - {0xc028, 134}, - {0x8002, 136}, - {0x8009, 136}, - {0x8017, 136}, - {0xc028, 136}, - {0x8002, 146}, - {0x8009, 146}, - {0x8017, 146}, - {0xc028, 146}, - }, - /* 124 */ - { - {0x8003, 133}, - {0x8006, 133}, - {0x800a, 133}, - {0x800f, 133}, - {0x8018, 133}, - {0x801f, 133}, - {0x8029, 133}, - {0xc038, 133}, - {0x8003, 134}, - {0x8006, 134}, - {0x800a, 134}, - {0x800f, 134}, - {0x8018, 134}, - {0x801f, 134}, - {0x8029, 134}, - {0xc038, 134}, - }, - /* 125 */ - { - {0x8003, 136}, - {0x8006, 136}, - {0x800a, 136}, - {0x800f, 136}, - {0x8018, 136}, - {0x801f, 136}, - {0x8029, 136}, - {0xc038, 136}, - {0x8003, 146}, - {0x8006, 146}, - {0x800a, 146}, - {0x800f, 146}, - {0x8018, 146}, - {0x801f, 146}, - {0x8029, 146}, - {0xc038, 146}, - }, - /* 126 */ - { - {0x8001, 154}, - {0xc016, 154}, - {0x8001, 156}, - {0xc016, 156}, - {0x8001, 160}, - {0xc016, 160}, - {0x8001, 163}, - {0xc016, 163}, - {0x8001, 164}, - {0xc016, 164}, - {0x8001, 169}, - {0xc016, 169}, - {0x8001, 170}, - {0xc016, 170}, - {0x8001, 173}, - {0xc016, 173}, - }, - /* 127 */ - { - {0x8002, 154}, - {0x8009, 154}, - {0x8017, 154}, - 
{0xc028, 154}, - {0x8002, 156}, - {0x8009, 156}, - {0x8017, 156}, - {0xc028, 156}, - {0x8002, 160}, - {0x8009, 160}, - {0x8017, 160}, - {0xc028, 160}, - {0x8002, 163}, - {0x8009, 163}, - {0x8017, 163}, - {0xc028, 163}, - }, - /* 128 */ - { - {0x8003, 154}, - {0x8006, 154}, - {0x800a, 154}, - {0x800f, 154}, - {0x8018, 154}, - {0x801f, 154}, - {0x8029, 154}, - {0xc038, 154}, - {0x8003, 156}, - {0x8006, 156}, - {0x800a, 156}, - {0x800f, 156}, - {0x8018, 156}, - {0x801f, 156}, - {0x8029, 156}, - {0xc038, 156}, - }, - /* 129 */ - { - {0x8003, 160}, - {0x8006, 160}, - {0x800a, 160}, - {0x800f, 160}, - {0x8018, 160}, - {0x801f, 160}, - {0x8029, 160}, - {0xc038, 160}, - {0x8003, 163}, - {0x8006, 163}, - {0x800a, 163}, - {0x800f, 163}, - {0x8018, 163}, - {0x801f, 163}, - {0x8029, 163}, - {0xc038, 163}, - }, - /* 130 */ - { - {0x8002, 164}, - {0x8009, 164}, - {0x8017, 164}, - {0xc028, 164}, - {0x8002, 169}, - {0x8009, 169}, - {0x8017, 169}, - {0xc028, 169}, - {0x8002, 170}, - {0x8009, 170}, - {0x8017, 170}, - {0xc028, 170}, - {0x8002, 173}, - {0x8009, 173}, - {0x8017, 173}, - {0xc028, 173}, - }, - /* 131 */ - { - {0x8003, 164}, - {0x8006, 164}, - {0x800a, 164}, - {0x800f, 164}, - {0x8018, 164}, - {0x801f, 164}, - {0x8029, 164}, - {0xc038, 164}, - {0x8003, 169}, - {0x8006, 169}, - {0x800a, 169}, - {0x800f, 169}, - {0x8018, 169}, - {0x801f, 169}, - {0x8029, 169}, - {0xc038, 169}, - }, - /* 132 */ - { - {0x8003, 170}, - {0x8006, 170}, - {0x800a, 170}, - {0x800f, 170}, - {0x8018, 170}, - {0x801f, 170}, - {0x8029, 170}, - {0xc038, 170}, - {0x8003, 173}, - {0x8006, 173}, - {0x800a, 173}, - {0x800f, 173}, - {0x8018, 173}, - {0x801f, 173}, - {0x8029, 173}, - {0xc038, 173}, - }, - /* 133 */ - { - {0x89, 0}, - {0x8a, 0}, - {0x8c, 0}, - {0x8d, 0}, - {0x90, 0}, - {0x91, 0}, - {0x93, 0}, - {0x96, 0}, - {0x9c, 0}, - {0x9f, 0}, - {0xa3, 0}, - {0xa6, 0}, - {0xab, 0}, - {0xae, 0}, - {0xb5, 0}, - {0xbe, 0}, - }, - /* 134 */ - { - {0xc000, 178}, - {0xc000, 181}, - {0xc000, 185}, - {0xc000, 186}, - {0xc000, 187}, - {0xc000, 189}, - {0xc000, 190}, - {0xc000, 196}, - {0xc000, 198}, - {0xc000, 228}, - {0xc000, 232}, - {0xc000, 233}, - {0x94, 0}, - {0x95, 0}, - {0x97, 0}, - {0x98, 0}, - }, - /* 135 */ - { - {0x8001, 178}, - {0xc016, 178}, - {0x8001, 181}, - {0xc016, 181}, - {0x8001, 185}, - {0xc016, 185}, - {0x8001, 186}, - {0xc016, 186}, - {0x8001, 187}, - {0xc016, 187}, - {0x8001, 189}, - {0xc016, 189}, - {0x8001, 190}, - {0xc016, 190}, - {0x8001, 196}, - {0xc016, 196}, - }, - /* 136 */ - { - {0x8002, 178}, - {0x8009, 178}, - {0x8017, 178}, - {0xc028, 178}, - {0x8002, 181}, - {0x8009, 181}, - {0x8017, 181}, - {0xc028, 181}, - {0x8002, 185}, - {0x8009, 185}, - {0x8017, 185}, - {0xc028, 185}, - {0x8002, 186}, - {0x8009, 186}, - {0x8017, 186}, - {0xc028, 186}, - }, - /* 137 */ - { - {0x8003, 178}, - {0x8006, 178}, - {0x800a, 178}, - {0x800f, 178}, - {0x8018, 178}, - {0x801f, 178}, - {0x8029, 178}, - {0xc038, 178}, - {0x8003, 181}, - {0x8006, 181}, - {0x800a, 181}, - {0x800f, 181}, - {0x8018, 181}, - {0x801f, 181}, - {0x8029, 181}, - {0xc038, 181}, - }, - /* 138 */ - { - {0x8003, 185}, - {0x8006, 185}, - {0x800a, 185}, - {0x800f, 185}, - {0x8018, 185}, - {0x801f, 185}, - {0x8029, 185}, - {0xc038, 185}, - {0x8003, 186}, - {0x8006, 186}, - {0x800a, 186}, - {0x800f, 186}, - {0x8018, 186}, - {0x801f, 186}, - {0x8029, 186}, - {0xc038, 186}, - }, - /* 139 */ - { - {0x8002, 187}, - {0x8009, 187}, - {0x8017, 187}, - {0xc028, 187}, - {0x8002, 189}, - {0x8009, 189}, - {0x8017, 189}, - {0xc028, 189}, - {0x8002, 190}, - {0x8009, 190}, - 
{0x8017, 190}, - {0xc028, 190}, - {0x8002, 196}, - {0x8009, 196}, - {0x8017, 196}, - {0xc028, 196}, - }, - /* 140 */ - { - {0x8003, 187}, - {0x8006, 187}, - {0x800a, 187}, - {0x800f, 187}, - {0x8018, 187}, - {0x801f, 187}, - {0x8029, 187}, - {0xc038, 187}, - {0x8003, 189}, - {0x8006, 189}, - {0x800a, 189}, - {0x800f, 189}, - {0x8018, 189}, - {0x801f, 189}, - {0x8029, 189}, - {0xc038, 189}, - }, - /* 141 */ - { - {0x8003, 190}, - {0x8006, 190}, - {0x800a, 190}, - {0x800f, 190}, - {0x8018, 190}, - {0x801f, 190}, - {0x8029, 190}, - {0xc038, 190}, - {0x8003, 196}, - {0x8006, 196}, - {0x800a, 196}, - {0x800f, 196}, - {0x8018, 196}, - {0x801f, 196}, - {0x8029, 196}, - {0xc038, 196}, - }, - /* 142 */ - { - {0x8001, 198}, - {0xc016, 198}, - {0x8001, 228}, - {0xc016, 228}, - {0x8001, 232}, - {0xc016, 232}, - {0x8001, 233}, - {0xc016, 233}, - {0xc000, 1}, - {0xc000, 135}, - {0xc000, 137}, - {0xc000, 138}, - {0xc000, 139}, - {0xc000, 140}, - {0xc000, 141}, - {0xc000, 143}, - }, - /* 143 */ - { - {0x8002, 198}, - {0x8009, 198}, - {0x8017, 198}, - {0xc028, 198}, - {0x8002, 228}, - {0x8009, 228}, - {0x8017, 228}, - {0xc028, 228}, - {0x8002, 232}, - {0x8009, 232}, - {0x8017, 232}, - {0xc028, 232}, - {0x8002, 233}, - {0x8009, 233}, - {0x8017, 233}, - {0xc028, 233}, - }, - /* 144 */ - { - {0x8003, 198}, - {0x8006, 198}, - {0x800a, 198}, - {0x800f, 198}, - {0x8018, 198}, - {0x801f, 198}, - {0x8029, 198}, - {0xc038, 198}, - {0x8003, 228}, - {0x8006, 228}, - {0x800a, 228}, - {0x800f, 228}, - {0x8018, 228}, - {0x801f, 228}, - {0x8029, 228}, - {0xc038, 228}, - }, - /* 145 */ - { - {0x8003, 232}, - {0x8006, 232}, - {0x800a, 232}, - {0x800f, 232}, - {0x8018, 232}, - {0x801f, 232}, - {0x8029, 232}, - {0xc038, 232}, - {0x8003, 233}, - {0x8006, 233}, - {0x800a, 233}, - {0x800f, 233}, - {0x8018, 233}, - {0x801f, 233}, - {0x8029, 233}, - {0xc038, 233}, - }, - /* 146 */ - { - {0x8001, 1}, - {0xc016, 1}, - {0x8001, 135}, - {0xc016, 135}, - {0x8001, 137}, - {0xc016, 137}, - {0x8001, 138}, - {0xc016, 138}, - {0x8001, 139}, - {0xc016, 139}, - {0x8001, 140}, - {0xc016, 140}, - {0x8001, 141}, - {0xc016, 141}, - {0x8001, 143}, - {0xc016, 143}, - }, - /* 147 */ - { - {0x8002, 1}, - {0x8009, 1}, - {0x8017, 1}, - {0xc028, 1}, - {0x8002, 135}, - {0x8009, 135}, - {0x8017, 135}, - {0xc028, 135}, - {0x8002, 137}, - {0x8009, 137}, - {0x8017, 137}, - {0xc028, 137}, - {0x8002, 138}, - {0x8009, 138}, - {0x8017, 138}, - {0xc028, 138}, - }, - /* 148 */ - { - {0x8003, 1}, - {0x8006, 1}, - {0x800a, 1}, - {0x800f, 1}, - {0x8018, 1}, - {0x801f, 1}, - {0x8029, 1}, - {0xc038, 1}, - {0x8003, 135}, - {0x8006, 135}, - {0x800a, 135}, - {0x800f, 135}, - {0x8018, 135}, - {0x801f, 135}, - {0x8029, 135}, - {0xc038, 135}, - }, - /* 149 */ - { - {0x8003, 137}, - {0x8006, 137}, - {0x800a, 137}, - {0x800f, 137}, - {0x8018, 137}, - {0x801f, 137}, - {0x8029, 137}, - {0xc038, 137}, - {0x8003, 138}, - {0x8006, 138}, - {0x800a, 138}, - {0x800f, 138}, - {0x8018, 138}, - {0x801f, 138}, - {0x8029, 138}, - {0xc038, 138}, - }, - /* 150 */ - { - {0x8002, 139}, - {0x8009, 139}, - {0x8017, 139}, - {0xc028, 139}, - {0x8002, 140}, - {0x8009, 140}, - {0x8017, 140}, - {0xc028, 140}, - {0x8002, 141}, - {0x8009, 141}, - {0x8017, 141}, - {0xc028, 141}, - {0x8002, 143}, - {0x8009, 143}, - {0x8017, 143}, - {0xc028, 143}, - }, - /* 151 */ - { - {0x8003, 139}, - {0x8006, 139}, - {0x800a, 139}, - {0x800f, 139}, - {0x8018, 139}, - {0x801f, 139}, - {0x8029, 139}, - {0xc038, 139}, - {0x8003, 140}, - {0x8006, 140}, - {0x800a, 140}, - {0x800f, 140}, - {0x8018, 140}, - {0x801f, 140}, - 
{0x8029, 140}, - {0xc038, 140}, - }, - /* 152 */ - { - {0x8003, 141}, - {0x8006, 141}, - {0x800a, 141}, - {0x800f, 141}, - {0x8018, 141}, - {0x801f, 141}, - {0x8029, 141}, - {0xc038, 141}, - {0x8003, 143}, - {0x8006, 143}, - {0x800a, 143}, - {0x800f, 143}, - {0x8018, 143}, - {0x801f, 143}, - {0x8029, 143}, - {0xc038, 143}, - }, - /* 153 */ - { - {0x9d, 0}, - {0x9e, 0}, - {0xa0, 0}, - {0xa1, 0}, - {0xa4, 0}, - {0xa5, 0}, - {0xa7, 0}, - {0xa8, 0}, - {0xac, 0}, - {0xad, 0}, - {0xaf, 0}, - {0xb1, 0}, - {0xb6, 0}, - {0xb9, 0}, - {0xbf, 0}, - {0xcf, 0}, - }, - /* 154 */ - { - {0xc000, 147}, - {0xc000, 149}, - {0xc000, 150}, - {0xc000, 151}, - {0xc000, 152}, - {0xc000, 155}, - {0xc000, 157}, - {0xc000, 158}, - {0xc000, 165}, - {0xc000, 166}, - {0xc000, 168}, - {0xc000, 174}, - {0xc000, 175}, - {0xc000, 180}, - {0xc000, 182}, - {0xc000, 183}, - }, - /* 155 */ - { - {0x8001, 147}, - {0xc016, 147}, - {0x8001, 149}, - {0xc016, 149}, - {0x8001, 150}, - {0xc016, 150}, - {0x8001, 151}, - {0xc016, 151}, - {0x8001, 152}, - {0xc016, 152}, - {0x8001, 155}, - {0xc016, 155}, - {0x8001, 157}, - {0xc016, 157}, - {0x8001, 158}, - {0xc016, 158}, - }, - /* 156 */ - { - {0x8002, 147}, - {0x8009, 147}, - {0x8017, 147}, - {0xc028, 147}, - {0x8002, 149}, - {0x8009, 149}, - {0x8017, 149}, - {0xc028, 149}, - {0x8002, 150}, - {0x8009, 150}, - {0x8017, 150}, - {0xc028, 150}, - {0x8002, 151}, - {0x8009, 151}, - {0x8017, 151}, - {0xc028, 151}, - }, - /* 157 */ - { - {0x8003, 147}, - {0x8006, 147}, - {0x800a, 147}, - {0x800f, 147}, - {0x8018, 147}, - {0x801f, 147}, - {0x8029, 147}, - {0xc038, 147}, - {0x8003, 149}, - {0x8006, 149}, - {0x800a, 149}, - {0x800f, 149}, - {0x8018, 149}, - {0x801f, 149}, - {0x8029, 149}, - {0xc038, 149}, - }, - /* 158 */ - { - {0x8003, 150}, - {0x8006, 150}, - {0x800a, 150}, - {0x800f, 150}, - {0x8018, 150}, - {0x801f, 150}, - {0x8029, 150}, - {0xc038, 150}, - {0x8003, 151}, - {0x8006, 151}, - {0x800a, 151}, - {0x800f, 151}, - {0x8018, 151}, - {0x801f, 151}, - {0x8029, 151}, - {0xc038, 151}, - }, - /* 159 */ - { - {0x8002, 152}, - {0x8009, 152}, - {0x8017, 152}, - {0xc028, 152}, - {0x8002, 155}, - {0x8009, 155}, - {0x8017, 155}, - {0xc028, 155}, - {0x8002, 157}, - {0x8009, 157}, - {0x8017, 157}, - {0xc028, 157}, - {0x8002, 158}, - {0x8009, 158}, - {0x8017, 158}, - {0xc028, 158}, - }, - /* 160 */ - { - {0x8003, 152}, - {0x8006, 152}, - {0x800a, 152}, - {0x800f, 152}, - {0x8018, 152}, - {0x801f, 152}, - {0x8029, 152}, - {0xc038, 152}, - {0x8003, 155}, - {0x8006, 155}, - {0x800a, 155}, - {0x800f, 155}, - {0x8018, 155}, - {0x801f, 155}, - {0x8029, 155}, - {0xc038, 155}, - }, - /* 161 */ - { - {0x8003, 157}, - {0x8006, 157}, - {0x800a, 157}, - {0x800f, 157}, - {0x8018, 157}, - {0x801f, 157}, - {0x8029, 157}, - {0xc038, 157}, - {0x8003, 158}, - {0x8006, 158}, - {0x800a, 158}, - {0x800f, 158}, - {0x8018, 158}, - {0x801f, 158}, - {0x8029, 158}, - {0xc038, 158}, - }, - /* 162 */ - { - {0x8001, 165}, - {0xc016, 165}, - {0x8001, 166}, - {0xc016, 166}, - {0x8001, 168}, - {0xc016, 168}, - {0x8001, 174}, - {0xc016, 174}, - {0x8001, 175}, - {0xc016, 175}, - {0x8001, 180}, - {0xc016, 180}, - {0x8001, 182}, - {0xc016, 182}, - {0x8001, 183}, - {0xc016, 183}, - }, - /* 163 */ - { - {0x8002, 165}, - {0x8009, 165}, - {0x8017, 165}, - {0xc028, 165}, - {0x8002, 166}, - {0x8009, 166}, - {0x8017, 166}, - {0xc028, 166}, - {0x8002, 168}, - {0x8009, 168}, - {0x8017, 168}, - {0xc028, 168}, - {0x8002, 174}, - {0x8009, 174}, - {0x8017, 174}, - {0xc028, 174}, - }, - /* 164 */ - { - {0x8003, 165}, - {0x8006, 165}, - {0x800a, 
165}, - {0x800f, 165}, - {0x8018, 165}, - {0x801f, 165}, - {0x8029, 165}, - {0xc038, 165}, - {0x8003, 166}, - {0x8006, 166}, - {0x800a, 166}, - {0x800f, 166}, - {0x8018, 166}, - {0x801f, 166}, - {0x8029, 166}, - {0xc038, 166}, - }, - /* 165 */ - { - {0x8003, 168}, - {0x8006, 168}, - {0x800a, 168}, - {0x800f, 168}, - {0x8018, 168}, - {0x801f, 168}, - {0x8029, 168}, - {0xc038, 168}, - {0x8003, 174}, - {0x8006, 174}, - {0x800a, 174}, - {0x800f, 174}, - {0x8018, 174}, - {0x801f, 174}, - {0x8029, 174}, - {0xc038, 174}, - }, - /* 166 */ - { - {0x8002, 175}, - {0x8009, 175}, - {0x8017, 175}, - {0xc028, 175}, - {0x8002, 180}, - {0x8009, 180}, - {0x8017, 180}, - {0xc028, 180}, - {0x8002, 182}, - {0x8009, 182}, - {0x8017, 182}, - {0xc028, 182}, - {0x8002, 183}, - {0x8009, 183}, - {0x8017, 183}, - {0xc028, 183}, - }, - /* 167 */ - { - {0x8003, 175}, - {0x8006, 175}, - {0x800a, 175}, - {0x800f, 175}, - {0x8018, 175}, - {0x801f, 175}, - {0x8029, 175}, - {0xc038, 175}, - {0x8003, 180}, - {0x8006, 180}, - {0x800a, 180}, - {0x800f, 180}, - {0x8018, 180}, - {0x801f, 180}, - {0x8029, 180}, - {0xc038, 180}, - }, - /* 168 */ - { - {0x8003, 182}, - {0x8006, 182}, - {0x800a, 182}, - {0x800f, 182}, - {0x8018, 182}, - {0x801f, 182}, - {0x8029, 182}, - {0xc038, 182}, - {0x8003, 183}, - {0x8006, 183}, - {0x800a, 183}, - {0x800f, 183}, - {0x8018, 183}, - {0x801f, 183}, - {0x8029, 183}, - {0xc038, 183}, - }, - /* 169 */ - { - {0xc000, 188}, - {0xc000, 191}, - {0xc000, 197}, - {0xc000, 231}, - {0xc000, 239}, - {0xb0, 0}, - {0xb2, 0}, - {0xb3, 0}, - {0xb7, 0}, - {0xb8, 0}, - {0xba, 0}, - {0xbb, 0}, - {0xc0, 0}, - {0xc7, 0}, - {0xd0, 0}, - {0xdf, 0}, - }, - /* 170 */ - { - {0x8001, 188}, - {0xc016, 188}, - {0x8001, 191}, - {0xc016, 191}, - {0x8001, 197}, - {0xc016, 197}, - {0x8001, 231}, - {0xc016, 231}, - {0x8001, 239}, - {0xc016, 239}, - {0xc000, 9}, - {0xc000, 142}, - {0xc000, 144}, - {0xc000, 145}, - {0xc000, 148}, - {0xc000, 159}, - }, - /* 171 */ - { - {0x8002, 188}, - {0x8009, 188}, - {0x8017, 188}, - {0xc028, 188}, - {0x8002, 191}, - {0x8009, 191}, - {0x8017, 191}, - {0xc028, 191}, - {0x8002, 197}, - {0x8009, 197}, - {0x8017, 197}, - {0xc028, 197}, - {0x8002, 231}, - {0x8009, 231}, - {0x8017, 231}, - {0xc028, 231}, - }, - /* 172 */ - { - {0x8003, 188}, - {0x8006, 188}, - {0x800a, 188}, - {0x800f, 188}, - {0x8018, 188}, - {0x801f, 188}, - {0x8029, 188}, - {0xc038, 188}, - {0x8003, 191}, - {0x8006, 191}, - {0x800a, 191}, - {0x800f, 191}, - {0x8018, 191}, - {0x801f, 191}, - {0x8029, 191}, - {0xc038, 191}, - }, - /* 173 */ - { - {0x8003, 197}, - {0x8006, 197}, - {0x800a, 197}, - {0x800f, 197}, - {0x8018, 197}, - {0x801f, 197}, - {0x8029, 197}, - {0xc038, 197}, - {0x8003, 231}, - {0x8006, 231}, - {0x800a, 231}, - {0x800f, 231}, - {0x8018, 231}, - {0x801f, 231}, - {0x8029, 231}, - {0xc038, 231}, - }, - /* 174 */ - { - {0x8002, 239}, - {0x8009, 239}, - {0x8017, 239}, - {0xc028, 239}, - {0x8001, 9}, - {0xc016, 9}, - {0x8001, 142}, - {0xc016, 142}, - {0x8001, 144}, - {0xc016, 144}, - {0x8001, 145}, - {0xc016, 145}, - {0x8001, 148}, - {0xc016, 148}, - {0x8001, 159}, - {0xc016, 159}, - }, - /* 175 */ - { - {0x8003, 239}, - {0x8006, 239}, - {0x800a, 239}, - {0x800f, 239}, - {0x8018, 239}, - {0x801f, 239}, - {0x8029, 239}, - {0xc038, 239}, - {0x8002, 9}, - {0x8009, 9}, - {0x8017, 9}, - {0xc028, 9}, - {0x8002, 142}, - {0x8009, 142}, - {0x8017, 142}, - {0xc028, 142}, - }, - /* 176 */ - { - {0x8003, 9}, - {0x8006, 9}, - {0x800a, 9}, - {0x800f, 9}, - {0x8018, 9}, - {0x801f, 9}, - {0x8029, 9}, - {0xc038, 9}, - {0x8003, 142}, - 
{0x8006, 142}, - {0x800a, 142}, - {0x800f, 142}, - {0x8018, 142}, - {0x801f, 142}, - {0x8029, 142}, - {0xc038, 142}, - }, - /* 177 */ - { - {0x8002, 144}, - {0x8009, 144}, - {0x8017, 144}, - {0xc028, 144}, - {0x8002, 145}, - {0x8009, 145}, - {0x8017, 145}, - {0xc028, 145}, - {0x8002, 148}, - {0x8009, 148}, - {0x8017, 148}, - {0xc028, 148}, - {0x8002, 159}, - {0x8009, 159}, - {0x8017, 159}, - {0xc028, 159}, - }, - /* 178 */ - { - {0x8003, 144}, - {0x8006, 144}, - {0x800a, 144}, - {0x800f, 144}, - {0x8018, 144}, - {0x801f, 144}, - {0x8029, 144}, - {0xc038, 144}, - {0x8003, 145}, - {0x8006, 145}, - {0x800a, 145}, - {0x800f, 145}, - {0x8018, 145}, - {0x801f, 145}, - {0x8029, 145}, - {0xc038, 145}, - }, - /* 179 */ - { - {0x8003, 148}, - {0x8006, 148}, - {0x800a, 148}, - {0x800f, 148}, - {0x8018, 148}, - {0x801f, 148}, - {0x8029, 148}, - {0xc038, 148}, - {0x8003, 159}, - {0x8006, 159}, - {0x800a, 159}, - {0x800f, 159}, - {0x8018, 159}, - {0x801f, 159}, - {0x8029, 159}, - {0xc038, 159}, - }, - /* 180 */ - { - {0xc000, 171}, - {0xc000, 206}, - {0xc000, 215}, - {0xc000, 225}, - {0xc000, 236}, - {0xc000, 237}, - {0xbc, 0}, - {0xbd, 0}, - {0xc1, 0}, - {0xc4, 0}, - {0xc8, 0}, - {0xcb, 0}, - {0xd1, 0}, - {0xd8, 0}, - {0xe0, 0}, - {0xee, 0}, - }, - /* 181 */ - { - {0x8001, 171}, - {0xc016, 171}, - {0x8001, 206}, - {0xc016, 206}, - {0x8001, 215}, - {0xc016, 215}, - {0x8001, 225}, - {0xc016, 225}, - {0x8001, 236}, - {0xc016, 236}, - {0x8001, 237}, - {0xc016, 237}, - {0xc000, 199}, - {0xc000, 207}, - {0xc000, 234}, - {0xc000, 235}, - }, - /* 182 */ - { - {0x8002, 171}, - {0x8009, 171}, - {0x8017, 171}, - {0xc028, 171}, - {0x8002, 206}, - {0x8009, 206}, - {0x8017, 206}, - {0xc028, 206}, - {0x8002, 215}, - {0x8009, 215}, - {0x8017, 215}, - {0xc028, 215}, - {0x8002, 225}, - {0x8009, 225}, - {0x8017, 225}, - {0xc028, 225}, - }, - /* 183 */ - { - {0x8003, 171}, - {0x8006, 171}, - {0x800a, 171}, - {0x800f, 171}, - {0x8018, 171}, - {0x801f, 171}, - {0x8029, 171}, - {0xc038, 171}, - {0x8003, 206}, - {0x8006, 206}, - {0x800a, 206}, - {0x800f, 206}, - {0x8018, 206}, - {0x801f, 206}, - {0x8029, 206}, - {0xc038, 206}, - }, - /* 184 */ - { - {0x8003, 215}, - {0x8006, 215}, - {0x800a, 215}, - {0x800f, 215}, - {0x8018, 215}, - {0x801f, 215}, - {0x8029, 215}, - {0xc038, 215}, - {0x8003, 225}, - {0x8006, 225}, - {0x800a, 225}, - {0x800f, 225}, - {0x8018, 225}, - {0x801f, 225}, - {0x8029, 225}, - {0xc038, 225}, - }, - /* 185 */ - { - {0x8002, 236}, - {0x8009, 236}, - {0x8017, 236}, - {0xc028, 236}, - {0x8002, 237}, - {0x8009, 237}, - {0x8017, 237}, - {0xc028, 237}, - {0x8001, 199}, - {0xc016, 199}, - {0x8001, 207}, - {0xc016, 207}, - {0x8001, 234}, - {0xc016, 234}, - {0x8001, 235}, - {0xc016, 235}, - }, - /* 186 */ - { - {0x8003, 236}, - {0x8006, 236}, - {0x800a, 236}, - {0x800f, 236}, - {0x8018, 236}, - {0x801f, 236}, - {0x8029, 236}, - {0xc038, 236}, - {0x8003, 237}, - {0x8006, 237}, - {0x800a, 237}, - {0x800f, 237}, - {0x8018, 237}, - {0x801f, 237}, - {0x8029, 237}, - {0xc038, 237}, - }, - /* 187 */ - { - {0x8002, 199}, - {0x8009, 199}, - {0x8017, 199}, - {0xc028, 199}, - {0x8002, 207}, - {0x8009, 207}, - {0x8017, 207}, - {0xc028, 207}, - {0x8002, 234}, - {0x8009, 234}, - {0x8017, 234}, - {0xc028, 234}, - {0x8002, 235}, - {0x8009, 235}, - {0x8017, 235}, - {0xc028, 235}, - }, - /* 188 */ - { - {0x8003, 199}, - {0x8006, 199}, - {0x800a, 199}, - {0x800f, 199}, - {0x8018, 199}, - {0x801f, 199}, - {0x8029, 199}, - {0xc038, 199}, - {0x8003, 207}, - {0x8006, 207}, - {0x800a, 207}, - {0x800f, 207}, - {0x8018, 207}, - {0x801f, 
207}, - {0x8029, 207}, - {0xc038, 207}, - }, - /* 189 */ - { - {0x8003, 234}, - {0x8006, 234}, - {0x800a, 234}, - {0x800f, 234}, - {0x8018, 234}, - {0x801f, 234}, - {0x8029, 234}, - {0xc038, 234}, - {0x8003, 235}, - {0x8006, 235}, - {0x800a, 235}, - {0x800f, 235}, - {0x8018, 235}, - {0x801f, 235}, - {0x8029, 235}, - {0xc038, 235}, - }, - /* 190 */ - { - {0xc2, 0}, - {0xc3, 0}, - {0xc5, 0}, - {0xc6, 0}, - {0xc9, 0}, - {0xca, 0}, - {0xcc, 0}, - {0xcd, 0}, - {0xd2, 0}, - {0xd5, 0}, - {0xd9, 0}, - {0xdc, 0}, - {0xe1, 0}, - {0xe7, 0}, - {0xef, 0}, - {0xf6, 0}, - }, - /* 191 */ - { - {0xc000, 192}, - {0xc000, 193}, - {0xc000, 200}, - {0xc000, 201}, - {0xc000, 202}, - {0xc000, 205}, - {0xc000, 210}, - {0xc000, 213}, - {0xc000, 218}, - {0xc000, 219}, - {0xc000, 238}, - {0xc000, 240}, - {0xc000, 242}, - {0xc000, 243}, - {0xc000, 255}, - {0xce, 0}, - }, - /* 192 */ - { - {0x8001, 192}, - {0xc016, 192}, - {0x8001, 193}, - {0xc016, 193}, - {0x8001, 200}, - {0xc016, 200}, - {0x8001, 201}, - {0xc016, 201}, - {0x8001, 202}, - {0xc016, 202}, - {0x8001, 205}, - {0xc016, 205}, - {0x8001, 210}, - {0xc016, 210}, - {0x8001, 213}, - {0xc016, 213}, - }, - /* 193 */ - { - {0x8002, 192}, - {0x8009, 192}, - {0x8017, 192}, - {0xc028, 192}, - {0x8002, 193}, - {0x8009, 193}, - {0x8017, 193}, - {0xc028, 193}, - {0x8002, 200}, - {0x8009, 200}, - {0x8017, 200}, - {0xc028, 200}, - {0x8002, 201}, - {0x8009, 201}, - {0x8017, 201}, - {0xc028, 201}, - }, - /* 194 */ - { - {0x8003, 192}, - {0x8006, 192}, - {0x800a, 192}, - {0x800f, 192}, - {0x8018, 192}, - {0x801f, 192}, - {0x8029, 192}, - {0xc038, 192}, - {0x8003, 193}, - {0x8006, 193}, - {0x800a, 193}, - {0x800f, 193}, - {0x8018, 193}, - {0x801f, 193}, - {0x8029, 193}, - {0xc038, 193}, - }, - /* 195 */ - { - {0x8003, 200}, - {0x8006, 200}, - {0x800a, 200}, - {0x800f, 200}, - {0x8018, 200}, - {0x801f, 200}, - {0x8029, 200}, - {0xc038, 200}, - {0x8003, 201}, - {0x8006, 201}, - {0x800a, 201}, - {0x800f, 201}, - {0x8018, 201}, - {0x801f, 201}, - {0x8029, 201}, - {0xc038, 201}, - }, - /* 196 */ - { - {0x8002, 202}, - {0x8009, 202}, - {0x8017, 202}, - {0xc028, 202}, - {0x8002, 205}, - {0x8009, 205}, - {0x8017, 205}, - {0xc028, 205}, - {0x8002, 210}, - {0x8009, 210}, - {0x8017, 210}, - {0xc028, 210}, - {0x8002, 213}, - {0x8009, 213}, - {0x8017, 213}, - {0xc028, 213}, - }, - /* 197 */ - { - {0x8003, 202}, - {0x8006, 202}, - {0x800a, 202}, - {0x800f, 202}, - {0x8018, 202}, - {0x801f, 202}, - {0x8029, 202}, - {0xc038, 202}, - {0x8003, 205}, - {0x8006, 205}, - {0x800a, 205}, - {0x800f, 205}, - {0x8018, 205}, - {0x801f, 205}, - {0x8029, 205}, - {0xc038, 205}, - }, - /* 198 */ - { - {0x8003, 210}, - {0x8006, 210}, - {0x800a, 210}, - {0x800f, 210}, - {0x8018, 210}, - {0x801f, 210}, - {0x8029, 210}, - {0xc038, 210}, - {0x8003, 213}, - {0x8006, 213}, - {0x800a, 213}, - {0x800f, 213}, - {0x8018, 213}, - {0x801f, 213}, - {0x8029, 213}, - {0xc038, 213}, - }, - /* 199 */ - { - {0x8001, 218}, - {0xc016, 218}, - {0x8001, 219}, - {0xc016, 219}, - {0x8001, 238}, - {0xc016, 238}, - {0x8001, 240}, - {0xc016, 240}, - {0x8001, 242}, - {0xc016, 242}, - {0x8001, 243}, - {0xc016, 243}, - {0x8001, 255}, - {0xc016, 255}, - {0xc000, 203}, - {0xc000, 204}, - }, - /* 200 */ - { - {0x8002, 218}, - {0x8009, 218}, - {0x8017, 218}, - {0xc028, 218}, - {0x8002, 219}, - {0x8009, 219}, - {0x8017, 219}, - {0xc028, 219}, - {0x8002, 238}, - {0x8009, 238}, - {0x8017, 238}, - {0xc028, 238}, - {0x8002, 240}, - {0x8009, 240}, - {0x8017, 240}, - {0xc028, 240}, - }, - /* 201 */ - { - {0x8003, 218}, - {0x8006, 218}, - {0x800a, 
218}, - {0x800f, 218}, - {0x8018, 218}, - {0x801f, 218}, - {0x8029, 218}, - {0xc038, 218}, - {0x8003, 219}, - {0x8006, 219}, - {0x800a, 219}, - {0x800f, 219}, - {0x8018, 219}, - {0x801f, 219}, - {0x8029, 219}, - {0xc038, 219}, - }, - /* 202 */ - { - {0x8003, 238}, - {0x8006, 238}, - {0x800a, 238}, - {0x800f, 238}, - {0x8018, 238}, - {0x801f, 238}, - {0x8029, 238}, - {0xc038, 238}, - {0x8003, 240}, - {0x8006, 240}, - {0x800a, 240}, - {0x800f, 240}, - {0x8018, 240}, - {0x801f, 240}, - {0x8029, 240}, - {0xc038, 240}, - }, - /* 203 */ - { - {0x8002, 242}, - {0x8009, 242}, - {0x8017, 242}, - {0xc028, 242}, - {0x8002, 243}, - {0x8009, 243}, - {0x8017, 243}, - {0xc028, 243}, - {0x8002, 255}, - {0x8009, 255}, - {0x8017, 255}, - {0xc028, 255}, - {0x8001, 203}, - {0xc016, 203}, - {0x8001, 204}, - {0xc016, 204}, - }, - /* 204 */ - { - {0x8003, 242}, - {0x8006, 242}, - {0x800a, 242}, - {0x800f, 242}, - {0x8018, 242}, - {0x801f, 242}, - {0x8029, 242}, - {0xc038, 242}, - {0x8003, 243}, - {0x8006, 243}, - {0x800a, 243}, - {0x800f, 243}, - {0x8018, 243}, - {0x801f, 243}, - {0x8029, 243}, - {0xc038, 243}, - }, - /* 205 */ - { - {0x8003, 255}, - {0x8006, 255}, - {0x800a, 255}, - {0x800f, 255}, - {0x8018, 255}, - {0x801f, 255}, - {0x8029, 255}, - {0xc038, 255}, - {0x8002, 203}, - {0x8009, 203}, - {0x8017, 203}, - {0xc028, 203}, - {0x8002, 204}, - {0x8009, 204}, - {0x8017, 204}, - {0xc028, 204}, - }, - /* 206 */ - { - {0x8003, 203}, - {0x8006, 203}, - {0x800a, 203}, - {0x800f, 203}, - {0x8018, 203}, - {0x801f, 203}, - {0x8029, 203}, - {0xc038, 203}, - {0x8003, 204}, - {0x8006, 204}, - {0x800a, 204}, - {0x800f, 204}, - {0x8018, 204}, - {0x801f, 204}, - {0x8029, 204}, - {0xc038, 204}, - }, - /* 207 */ - { - {0xd3, 0}, - {0xd4, 0}, - {0xd6, 0}, - {0xd7, 0}, - {0xda, 0}, - {0xdb, 0}, - {0xdd, 0}, - {0xde, 0}, - {0xe2, 0}, - {0xe4, 0}, - {0xe8, 0}, - {0xeb, 0}, - {0xf0, 0}, - {0xf3, 0}, - {0xf7, 0}, - {0xfa, 0}, - }, - /* 208 */ - { - {0xc000, 211}, - {0xc000, 212}, - {0xc000, 214}, - {0xc000, 221}, - {0xc000, 222}, - {0xc000, 223}, - {0xc000, 241}, - {0xc000, 244}, - {0xc000, 245}, - {0xc000, 246}, - {0xc000, 247}, - {0xc000, 248}, - {0xc000, 250}, - {0xc000, 251}, - {0xc000, 252}, - {0xc000, 253}, - }, - /* 209 */ - { - {0x8001, 211}, - {0xc016, 211}, - {0x8001, 212}, - {0xc016, 212}, - {0x8001, 214}, - {0xc016, 214}, - {0x8001, 221}, - {0xc016, 221}, - {0x8001, 222}, - {0xc016, 222}, - {0x8001, 223}, - {0xc016, 223}, - {0x8001, 241}, - {0xc016, 241}, - {0x8001, 244}, - {0xc016, 244}, - }, - /* 210 */ - { - {0x8002, 211}, - {0x8009, 211}, - {0x8017, 211}, - {0xc028, 211}, - {0x8002, 212}, - {0x8009, 212}, - {0x8017, 212}, - {0xc028, 212}, - {0x8002, 214}, - {0x8009, 214}, - {0x8017, 214}, - {0xc028, 214}, - {0x8002, 221}, - {0x8009, 221}, - {0x8017, 221}, - {0xc028, 221}, - }, - /* 211 */ - { - {0x8003, 211}, - {0x8006, 211}, - {0x800a, 211}, - {0x800f, 211}, - {0x8018, 211}, - {0x801f, 211}, - {0x8029, 211}, - {0xc038, 211}, - {0x8003, 212}, - {0x8006, 212}, - {0x800a, 212}, - {0x800f, 212}, - {0x8018, 212}, - {0x801f, 212}, - {0x8029, 212}, - {0xc038, 212}, - }, - /* 212 */ - { - {0x8003, 214}, - {0x8006, 214}, - {0x800a, 214}, - {0x800f, 214}, - {0x8018, 214}, - {0x801f, 214}, - {0x8029, 214}, - {0xc038, 214}, - {0x8003, 221}, - {0x8006, 221}, - {0x800a, 221}, - {0x800f, 221}, - {0x8018, 221}, - {0x801f, 221}, - {0x8029, 221}, - {0xc038, 221}, - }, - /* 213 */ - { - {0x8002, 222}, - {0x8009, 222}, - {0x8017, 222}, - {0xc028, 222}, - {0x8002, 223}, - {0x8009, 223}, - {0x8017, 223}, - {0xc028, 223}, - {0x8002, 
241}, - {0x8009, 241}, - {0x8017, 241}, - {0xc028, 241}, - {0x8002, 244}, - {0x8009, 244}, - {0x8017, 244}, - {0xc028, 244}, - }, - /* 214 */ - { - {0x8003, 222}, - {0x8006, 222}, - {0x800a, 222}, - {0x800f, 222}, - {0x8018, 222}, - {0x801f, 222}, - {0x8029, 222}, - {0xc038, 222}, - {0x8003, 223}, - {0x8006, 223}, - {0x800a, 223}, - {0x800f, 223}, - {0x8018, 223}, - {0x801f, 223}, - {0x8029, 223}, - {0xc038, 223}, - }, - /* 215 */ - { - {0x8003, 241}, - {0x8006, 241}, - {0x800a, 241}, - {0x800f, 241}, - {0x8018, 241}, - {0x801f, 241}, - {0x8029, 241}, - {0xc038, 241}, - {0x8003, 244}, - {0x8006, 244}, - {0x800a, 244}, - {0x800f, 244}, - {0x8018, 244}, - {0x801f, 244}, - {0x8029, 244}, - {0xc038, 244}, - }, - /* 216 */ - { - {0x8001, 245}, - {0xc016, 245}, - {0x8001, 246}, - {0xc016, 246}, - {0x8001, 247}, - {0xc016, 247}, - {0x8001, 248}, - {0xc016, 248}, - {0x8001, 250}, - {0xc016, 250}, - {0x8001, 251}, - {0xc016, 251}, - {0x8001, 252}, - {0xc016, 252}, - {0x8001, 253}, - {0xc016, 253}, - }, - /* 217 */ - { - {0x8002, 245}, - {0x8009, 245}, - {0x8017, 245}, - {0xc028, 245}, - {0x8002, 246}, - {0x8009, 246}, - {0x8017, 246}, - {0xc028, 246}, - {0x8002, 247}, - {0x8009, 247}, - {0x8017, 247}, - {0xc028, 247}, - {0x8002, 248}, - {0x8009, 248}, - {0x8017, 248}, - {0xc028, 248}, - }, - /* 218 */ - { - {0x8003, 245}, - {0x8006, 245}, - {0x800a, 245}, - {0x800f, 245}, - {0x8018, 245}, - {0x801f, 245}, - {0x8029, 245}, - {0xc038, 245}, - {0x8003, 246}, - {0x8006, 246}, - {0x800a, 246}, - {0x800f, 246}, - {0x8018, 246}, - {0x801f, 246}, - {0x8029, 246}, - {0xc038, 246}, - }, - /* 219 */ - { - {0x8003, 247}, - {0x8006, 247}, - {0x800a, 247}, - {0x800f, 247}, - {0x8018, 247}, - {0x801f, 247}, - {0x8029, 247}, - {0xc038, 247}, - {0x8003, 248}, - {0x8006, 248}, - {0x800a, 248}, - {0x800f, 248}, - {0x8018, 248}, - {0x801f, 248}, - {0x8029, 248}, - {0xc038, 248}, - }, - /* 220 */ - { - {0x8002, 250}, - {0x8009, 250}, - {0x8017, 250}, - {0xc028, 250}, - {0x8002, 251}, - {0x8009, 251}, - {0x8017, 251}, - {0xc028, 251}, - {0x8002, 252}, - {0x8009, 252}, - {0x8017, 252}, - {0xc028, 252}, - {0x8002, 253}, - {0x8009, 253}, - {0x8017, 253}, - {0xc028, 253}, - }, - /* 221 */ - { - {0x8003, 250}, - {0x8006, 250}, - {0x800a, 250}, - {0x800f, 250}, - {0x8018, 250}, - {0x801f, 250}, - {0x8029, 250}, - {0xc038, 250}, - {0x8003, 251}, - {0x8006, 251}, - {0x800a, 251}, - {0x800f, 251}, - {0x8018, 251}, - {0x801f, 251}, - {0x8029, 251}, - {0xc038, 251}, - }, - /* 222 */ - { - {0x8003, 252}, - {0x8006, 252}, - {0x800a, 252}, - {0x800f, 252}, - {0x8018, 252}, - {0x801f, 252}, - {0x8029, 252}, - {0xc038, 252}, - {0x8003, 253}, - {0x8006, 253}, - {0x800a, 253}, - {0x800f, 253}, - {0x8018, 253}, - {0x801f, 253}, - {0x8029, 253}, - {0xc038, 253}, - }, - /* 223 */ - { - {0xc000, 254}, - {0xe3, 0}, - {0xe5, 0}, - {0xe6, 0}, - {0xe9, 0}, - {0xea, 0}, - {0xec, 0}, - {0xed, 0}, - {0xf1, 0}, - {0xf2, 0}, - {0xf4, 0}, - {0xf5, 0}, - {0xf8, 0}, - {0xf9, 0}, - {0xfb, 0}, - {0xfc, 0}, - }, - /* 224 */ - { - {0x8001, 254}, - {0xc016, 254}, - {0xc000, 2}, - {0xc000, 3}, - {0xc000, 4}, - {0xc000, 5}, - {0xc000, 6}, - {0xc000, 7}, - {0xc000, 8}, - {0xc000, 11}, - {0xc000, 12}, - {0xc000, 14}, - {0xc000, 15}, - {0xc000, 16}, - {0xc000, 17}, - {0xc000, 18}, - }, - /* 225 */ - { - {0x8002, 254}, - {0x8009, 254}, - {0x8017, 254}, - {0xc028, 254}, - {0x8001, 2}, - {0xc016, 2}, - {0x8001, 3}, - {0xc016, 3}, - {0x8001, 4}, - {0xc016, 4}, - {0x8001, 5}, - {0xc016, 5}, - {0x8001, 6}, - {0xc016, 6}, - {0x8001, 7}, - {0xc016, 7}, - }, - /* 226 */ - 
{ - {0x8003, 254}, - {0x8006, 254}, - {0x800a, 254}, - {0x800f, 254}, - {0x8018, 254}, - {0x801f, 254}, - {0x8029, 254}, - {0xc038, 254}, - {0x8002, 2}, - {0x8009, 2}, - {0x8017, 2}, - {0xc028, 2}, - {0x8002, 3}, - {0x8009, 3}, - {0x8017, 3}, - {0xc028, 3}, - }, - /* 227 */ - { - {0x8003, 2}, - {0x8006, 2}, - {0x800a, 2}, - {0x800f, 2}, - {0x8018, 2}, - {0x801f, 2}, - {0x8029, 2}, - {0xc038, 2}, - {0x8003, 3}, - {0x8006, 3}, - {0x800a, 3}, - {0x800f, 3}, - {0x8018, 3}, - {0x801f, 3}, - {0x8029, 3}, - {0xc038, 3}, - }, - /* 228 */ - { - {0x8002, 4}, - {0x8009, 4}, - {0x8017, 4}, - {0xc028, 4}, - {0x8002, 5}, - {0x8009, 5}, - {0x8017, 5}, - {0xc028, 5}, - {0x8002, 6}, - {0x8009, 6}, - {0x8017, 6}, - {0xc028, 6}, - {0x8002, 7}, - {0x8009, 7}, - {0x8017, 7}, - {0xc028, 7}, - }, - /* 229 */ - { - {0x8003, 4}, - {0x8006, 4}, - {0x800a, 4}, - {0x800f, 4}, - {0x8018, 4}, - {0x801f, 4}, - {0x8029, 4}, - {0xc038, 4}, - {0x8003, 5}, - {0x8006, 5}, - {0x800a, 5}, - {0x800f, 5}, - {0x8018, 5}, - {0x801f, 5}, - {0x8029, 5}, - {0xc038, 5}, - }, - /* 230 */ - { - {0x8003, 6}, - {0x8006, 6}, - {0x800a, 6}, - {0x800f, 6}, - {0x8018, 6}, - {0x801f, 6}, - {0x8029, 6}, - {0xc038, 6}, - {0x8003, 7}, - {0x8006, 7}, - {0x800a, 7}, - {0x800f, 7}, - {0x8018, 7}, - {0x801f, 7}, - {0x8029, 7}, - {0xc038, 7}, - }, - /* 231 */ - { - {0x8001, 8}, - {0xc016, 8}, - {0x8001, 11}, - {0xc016, 11}, - {0x8001, 12}, - {0xc016, 12}, - {0x8001, 14}, - {0xc016, 14}, - {0x8001, 15}, - {0xc016, 15}, - {0x8001, 16}, - {0xc016, 16}, - {0x8001, 17}, - {0xc016, 17}, - {0x8001, 18}, - {0xc016, 18}, - }, - /* 232 */ - { - {0x8002, 8}, - {0x8009, 8}, - {0x8017, 8}, - {0xc028, 8}, - {0x8002, 11}, - {0x8009, 11}, - {0x8017, 11}, - {0xc028, 11}, - {0x8002, 12}, - {0x8009, 12}, - {0x8017, 12}, - {0xc028, 12}, - {0x8002, 14}, - {0x8009, 14}, - {0x8017, 14}, - {0xc028, 14}, - }, - /* 233 */ - { - {0x8003, 8}, - {0x8006, 8}, - {0x800a, 8}, - {0x800f, 8}, - {0x8018, 8}, - {0x801f, 8}, - {0x8029, 8}, - {0xc038, 8}, - {0x8003, 11}, - {0x8006, 11}, - {0x800a, 11}, - {0x800f, 11}, - {0x8018, 11}, - {0x801f, 11}, - {0x8029, 11}, - {0xc038, 11}, - }, - /* 234 */ - { - {0x8003, 12}, - {0x8006, 12}, - {0x800a, 12}, - {0x800f, 12}, - {0x8018, 12}, - {0x801f, 12}, - {0x8029, 12}, - {0xc038, 12}, - {0x8003, 14}, - {0x8006, 14}, - {0x800a, 14}, - {0x800f, 14}, - {0x8018, 14}, - {0x801f, 14}, - {0x8029, 14}, - {0xc038, 14}, - }, - /* 235 */ - { - {0x8002, 15}, - {0x8009, 15}, - {0x8017, 15}, - {0xc028, 15}, - {0x8002, 16}, - {0x8009, 16}, - {0x8017, 16}, - {0xc028, 16}, - {0x8002, 17}, - {0x8009, 17}, - {0x8017, 17}, - {0xc028, 17}, - {0x8002, 18}, - {0x8009, 18}, - {0x8017, 18}, - {0xc028, 18}, - }, - /* 236 */ - { - {0x8003, 15}, - {0x8006, 15}, - {0x800a, 15}, - {0x800f, 15}, - {0x8018, 15}, - {0x801f, 15}, - {0x8029, 15}, - {0xc038, 15}, - {0x8003, 16}, - {0x8006, 16}, - {0x800a, 16}, - {0x800f, 16}, - {0x8018, 16}, - {0x801f, 16}, - {0x8029, 16}, - {0xc038, 16}, - }, - /* 237 */ - { - {0x8003, 17}, - {0x8006, 17}, - {0x800a, 17}, - {0x800f, 17}, - {0x8018, 17}, - {0x801f, 17}, - {0x8029, 17}, - {0xc038, 17}, - {0x8003, 18}, - {0x8006, 18}, - {0x800a, 18}, - {0x800f, 18}, - {0x8018, 18}, - {0x801f, 18}, - {0x8029, 18}, - {0xc038, 18}, - }, - /* 238 */ - { - {0xc000, 19}, - {0xc000, 20}, - {0xc000, 21}, - {0xc000, 23}, - {0xc000, 24}, - {0xc000, 25}, - {0xc000, 26}, - {0xc000, 27}, - {0xc000, 28}, - {0xc000, 29}, - {0xc000, 30}, - {0xc000, 31}, - {0xc000, 127}, - {0xc000, 220}, - {0xc000, 249}, - {0xfd, 0}, - }, - /* 239 */ - { - {0x8001, 19}, - {0xc016, 
19}, - {0x8001, 20}, - {0xc016, 20}, - {0x8001, 21}, - {0xc016, 21}, - {0x8001, 23}, - {0xc016, 23}, - {0x8001, 24}, - {0xc016, 24}, - {0x8001, 25}, - {0xc016, 25}, - {0x8001, 26}, - {0xc016, 26}, - {0x8001, 27}, - {0xc016, 27}, - }, - /* 240 */ - { - {0x8002, 19}, - {0x8009, 19}, - {0x8017, 19}, - {0xc028, 19}, - {0x8002, 20}, - {0x8009, 20}, - {0x8017, 20}, - {0xc028, 20}, - {0x8002, 21}, - {0x8009, 21}, - {0x8017, 21}, - {0xc028, 21}, - {0x8002, 23}, - {0x8009, 23}, - {0x8017, 23}, - {0xc028, 23}, - }, - /* 241 */ - { - {0x8003, 19}, - {0x8006, 19}, - {0x800a, 19}, - {0x800f, 19}, - {0x8018, 19}, - {0x801f, 19}, - {0x8029, 19}, - {0xc038, 19}, - {0x8003, 20}, - {0x8006, 20}, - {0x800a, 20}, - {0x800f, 20}, - {0x8018, 20}, - {0x801f, 20}, - {0x8029, 20}, - {0xc038, 20}, - }, - /* 242 */ - { - {0x8003, 21}, - {0x8006, 21}, - {0x800a, 21}, - {0x800f, 21}, - {0x8018, 21}, - {0x801f, 21}, - {0x8029, 21}, - {0xc038, 21}, - {0x8003, 23}, - {0x8006, 23}, - {0x800a, 23}, - {0x800f, 23}, - {0x8018, 23}, - {0x801f, 23}, - {0x8029, 23}, - {0xc038, 23}, - }, - /* 243 */ - { - {0x8002, 24}, - {0x8009, 24}, - {0x8017, 24}, - {0xc028, 24}, - {0x8002, 25}, - {0x8009, 25}, - {0x8017, 25}, - {0xc028, 25}, - {0x8002, 26}, - {0x8009, 26}, - {0x8017, 26}, - {0xc028, 26}, - {0x8002, 27}, - {0x8009, 27}, - {0x8017, 27}, - {0xc028, 27}, - }, - /* 244 */ - { - {0x8003, 24}, - {0x8006, 24}, - {0x800a, 24}, - {0x800f, 24}, - {0x8018, 24}, - {0x801f, 24}, - {0x8029, 24}, - {0xc038, 24}, - {0x8003, 25}, - {0x8006, 25}, - {0x800a, 25}, - {0x800f, 25}, - {0x8018, 25}, - {0x801f, 25}, - {0x8029, 25}, - {0xc038, 25}, - }, - /* 245 */ - { - {0x8003, 26}, - {0x8006, 26}, - {0x800a, 26}, - {0x800f, 26}, - {0x8018, 26}, - {0x801f, 26}, - {0x8029, 26}, - {0xc038, 26}, - {0x8003, 27}, - {0x8006, 27}, - {0x800a, 27}, - {0x800f, 27}, - {0x8018, 27}, - {0x801f, 27}, - {0x8029, 27}, - {0xc038, 27}, - }, - /* 246 */ - { - {0x8001, 28}, - {0xc016, 28}, - {0x8001, 29}, - {0xc016, 29}, - {0x8001, 30}, - {0xc016, 30}, - {0x8001, 31}, - {0xc016, 31}, - {0x8001, 127}, - {0xc016, 127}, - {0x8001, 220}, - {0xc016, 220}, - {0x8001, 249}, - {0xc016, 249}, - {0xfe, 0}, - {0xff, 0}, - }, - /* 247 */ - { - {0x8002, 28}, - {0x8009, 28}, - {0x8017, 28}, - {0xc028, 28}, - {0x8002, 29}, - {0x8009, 29}, - {0x8017, 29}, - {0xc028, 29}, - {0x8002, 30}, - {0x8009, 30}, - {0x8017, 30}, - {0xc028, 30}, - {0x8002, 31}, - {0x8009, 31}, - {0x8017, 31}, - {0xc028, 31}, - }, - /* 248 */ - { - {0x8003, 28}, - {0x8006, 28}, - {0x800a, 28}, - {0x800f, 28}, - {0x8018, 28}, - {0x801f, 28}, - {0x8029, 28}, - {0xc038, 28}, - {0x8003, 29}, - {0x8006, 29}, - {0x800a, 29}, - {0x800f, 29}, - {0x8018, 29}, - {0x801f, 29}, - {0x8029, 29}, - {0xc038, 29}, - }, - /* 249 */ - { - {0x8003, 30}, - {0x8006, 30}, - {0x800a, 30}, - {0x800f, 30}, - {0x8018, 30}, - {0x801f, 30}, - {0x8029, 30}, - {0xc038, 30}, - {0x8003, 31}, - {0x8006, 31}, - {0x800a, 31}, - {0x800f, 31}, - {0x8018, 31}, - {0x801f, 31}, - {0x8029, 31}, - {0xc038, 31}, - }, - /* 250 */ - { - {0x8002, 127}, - {0x8009, 127}, - {0x8017, 127}, - {0xc028, 127}, - {0x8002, 220}, - {0x8009, 220}, - {0x8017, 220}, - {0xc028, 220}, - {0x8002, 249}, - {0x8009, 249}, - {0x8017, 249}, - {0xc028, 249}, - {0xc000, 10}, - {0xc000, 13}, - {0xc000, 22}, - {0x100, 0}, - }, - /* 251 */ - { - {0x8003, 127}, - {0x8006, 127}, - {0x800a, 127}, - {0x800f, 127}, - {0x8018, 127}, - {0x801f, 127}, - {0x8029, 127}, - {0xc038, 127}, - {0x8003, 220}, - {0x8006, 220}, - {0x800a, 220}, - {0x800f, 220}, - {0x8018, 220}, - {0x801f, 220}, - 
{0x8029, 220}, - {0xc038, 220}, - }, - /* 252 */ - { - {0x8003, 249}, - {0x8006, 249}, - {0x800a, 249}, - {0x800f, 249}, - {0x8018, 249}, - {0x801f, 249}, - {0x8029, 249}, - {0xc038, 249}, - {0x8001, 10}, - {0xc016, 10}, - {0x8001, 13}, - {0xc016, 13}, - {0x8001, 22}, - {0xc016, 22}, - {0x100, 0}, - {0x100, 0}, - }, - /* 253 */ - { - {0x8002, 10}, - {0x8009, 10}, - {0x8017, 10}, - {0xc028, 10}, - {0x8002, 13}, - {0x8009, 13}, - {0x8017, 13}, - {0xc028, 13}, - {0x8002, 22}, - {0x8009, 22}, - {0x8017, 22}, - {0xc028, 22}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - }, - /* 254 */ - { - {0x8003, 10}, - {0x8006, 10}, - {0x800a, 10}, - {0x800f, 10}, - {0x8018, 10}, - {0x801f, 10}, - {0x8029, 10}, - {0xc038, 10}, - {0x8003, 13}, - {0x8006, 13}, - {0x800a, 13}, - {0x800f, 13}, - {0x8018, 13}, - {0x801f, 13}, - {0x8029, 13}, - {0xc038, 13}, - }, - /* 255 */ - { - {0x8003, 22}, - {0x8006, 22}, - {0x800a, 22}, - {0x800f, 22}, - {0x8018, 22}, - {0x801f, 22}, - {0x8029, 22}, - {0xc038, 22}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - }, - /* 256 */ - { - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - {0x100, 0}, - }, + /* 0 */ + { + {0x04, 0}, + {0x05, 0}, + {0x07, 0}, + {0x08, 0}, + {0x0b, 0}, + {0x0c, 0}, + {0x10, 0}, + {0x13, 0}, + {0x19, 0}, + {0x1c, 0}, + {0x20, 0}, + {0x23, 0}, + {0x2a, 0}, + {0x31, 0}, + {0x39, 0}, + {0x4040, 0}, + }, + /* 1 */ + { + {0xc000, 48}, + {0xc000, 49}, + {0xc000, 50}, + {0xc000, 97}, + {0xc000, 99}, + {0xc000, 101}, + {0xc000, 105}, + {0xc000, 111}, + {0xc000, 115}, + {0xc000, 116}, + {0x0d, 0}, + {0x0e, 0}, + {0x11, 0}, + {0x12, 0}, + {0x14, 0}, + {0x15, 0}, + }, + /* 2 */ + { + {0x8001, 48}, + {0xc016, 48}, + {0x8001, 49}, + {0xc016, 49}, + {0x8001, 50}, + {0xc016, 50}, + {0x8001, 97}, + {0xc016, 97}, + {0x8001, 99}, + {0xc016, 99}, + {0x8001, 101}, + {0xc016, 101}, + {0x8001, 105}, + {0xc016, 105}, + {0x8001, 111}, + {0xc016, 111}, + }, + /* 3 */ + { + {0x8002, 48}, + {0x8009, 48}, + {0x8017, 48}, + {0xc028, 48}, + {0x8002, 49}, + {0x8009, 49}, + {0x8017, 49}, + {0xc028, 49}, + {0x8002, 50}, + {0x8009, 50}, + {0x8017, 50}, + {0xc028, 50}, + {0x8002, 97}, + {0x8009, 97}, + {0x8017, 97}, + {0xc028, 97}, + }, + /* 4 */ + { + {0x8003, 48}, + {0x8006, 48}, + {0x800a, 48}, + {0x800f, 48}, + {0x8018, 48}, + {0x801f, 48}, + {0x8029, 48}, + {0xc038, 48}, + {0x8003, 49}, + {0x8006, 49}, + {0x800a, 49}, + {0x800f, 49}, + {0x8018, 49}, + {0x801f, 49}, + {0x8029, 49}, + {0xc038, 49}, + }, + /* 5 */ + { + {0x8003, 50}, + {0x8006, 50}, + {0x800a, 50}, + {0x800f, 50}, + {0x8018, 50}, + {0x801f, 50}, + {0x8029, 50}, + {0xc038, 50}, + {0x8003, 97}, + {0x8006, 97}, + {0x800a, 97}, + {0x800f, 97}, + {0x8018, 97}, + {0x801f, 97}, + {0x8029, 97}, + {0xc038, 97}, + }, + /* 6 */ + { + {0x8002, 99}, + {0x8009, 99}, + {0x8017, 99}, + {0xc028, 99}, + {0x8002, 101}, + {0x8009, 101}, + {0x8017, 101}, + {0xc028, 101}, + {0x8002, 105}, + {0x8009, 105}, + {0x8017, 105}, + {0xc028, 105}, + {0x8002, 111}, + {0x8009, 111}, + {0x8017, 111}, + {0xc028, 111}, + }, + /* 7 */ + { + {0x8003, 99}, + {0x8006, 99}, + {0x800a, 99}, + {0x800f, 99}, + {0x8018, 99}, + {0x801f, 99}, + {0x8029, 99}, + {0xc038, 99}, + {0x8003, 101}, + {0x8006, 101}, + {0x800a, 101}, + {0x800f, 101}, + {0x8018, 101}, + {0x801f, 101}, + {0x8029, 101}, + {0xc038, 101}, + }, + /* 8 */ + { + 
{0x8003, 105}, + {0x8006, 105}, + {0x800a, 105}, + {0x800f, 105}, + {0x8018, 105}, + {0x801f, 105}, + {0x8029, 105}, + {0xc038, 105}, + {0x8003, 111}, + {0x8006, 111}, + {0x800a, 111}, + {0x800f, 111}, + {0x8018, 111}, + {0x801f, 111}, + {0x8029, 111}, + {0xc038, 111}, + }, + /* 9 */ + { + {0x8001, 115}, + {0xc016, 115}, + {0x8001, 116}, + {0xc016, 116}, + {0xc000, 32}, + {0xc000, 37}, + {0xc000, 45}, + {0xc000, 46}, + {0xc000, 47}, + {0xc000, 51}, + {0xc000, 52}, + {0xc000, 53}, + {0xc000, 54}, + {0xc000, 55}, + {0xc000, 56}, + {0xc000, 57}, + }, + /* 10 */ + { + {0x8002, 115}, + {0x8009, 115}, + {0x8017, 115}, + {0xc028, 115}, + {0x8002, 116}, + {0x8009, 116}, + {0x8017, 116}, + {0xc028, 116}, + {0x8001, 32}, + {0xc016, 32}, + {0x8001, 37}, + {0xc016, 37}, + {0x8001, 45}, + {0xc016, 45}, + {0x8001, 46}, + {0xc016, 46}, + }, + /* 11 */ + { + {0x8003, 115}, + {0x8006, 115}, + {0x800a, 115}, + {0x800f, 115}, + {0x8018, 115}, + {0x801f, 115}, + {0x8029, 115}, + {0xc038, 115}, + {0x8003, 116}, + {0x8006, 116}, + {0x800a, 116}, + {0x800f, 116}, + {0x8018, 116}, + {0x801f, 116}, + {0x8029, 116}, + {0xc038, 116}, + }, + /* 12 */ + { + {0x8002, 32}, + {0x8009, 32}, + {0x8017, 32}, + {0xc028, 32}, + {0x8002, 37}, + {0x8009, 37}, + {0x8017, 37}, + {0xc028, 37}, + {0x8002, 45}, + {0x8009, 45}, + {0x8017, 45}, + {0xc028, 45}, + {0x8002, 46}, + {0x8009, 46}, + {0x8017, 46}, + {0xc028, 46}, + }, + /* 13 */ + { + {0x8003, 32}, + {0x8006, 32}, + {0x800a, 32}, + {0x800f, 32}, + {0x8018, 32}, + {0x801f, 32}, + {0x8029, 32}, + {0xc038, 32}, + {0x8003, 37}, + {0x8006, 37}, + {0x800a, 37}, + {0x800f, 37}, + {0x8018, 37}, + {0x801f, 37}, + {0x8029, 37}, + {0xc038, 37}, + }, + /* 14 */ + { + {0x8003, 45}, + {0x8006, 45}, + {0x800a, 45}, + {0x800f, 45}, + {0x8018, 45}, + {0x801f, 45}, + {0x8029, 45}, + {0xc038, 45}, + {0x8003, 46}, + {0x8006, 46}, + {0x800a, 46}, + {0x800f, 46}, + {0x8018, 46}, + {0x801f, 46}, + {0x8029, 46}, + {0xc038, 46}, + }, + /* 15 */ + { + {0x8001, 47}, + {0xc016, 47}, + {0x8001, 51}, + {0xc016, 51}, + {0x8001, 52}, + {0xc016, 52}, + {0x8001, 53}, + {0xc016, 53}, + {0x8001, 54}, + {0xc016, 54}, + {0x8001, 55}, + {0xc016, 55}, + {0x8001, 56}, + {0xc016, 56}, + {0x8001, 57}, + {0xc016, 57}, + }, + /* 16 */ + { + {0x8002, 47}, + {0x8009, 47}, + {0x8017, 47}, + {0xc028, 47}, + {0x8002, 51}, + {0x8009, 51}, + {0x8017, 51}, + {0xc028, 51}, + {0x8002, 52}, + {0x8009, 52}, + {0x8017, 52}, + {0xc028, 52}, + {0x8002, 53}, + {0x8009, 53}, + {0x8017, 53}, + {0xc028, 53}, + }, + /* 17 */ + { + {0x8003, 47}, + {0x8006, 47}, + {0x800a, 47}, + {0x800f, 47}, + {0x8018, 47}, + {0x801f, 47}, + {0x8029, 47}, + {0xc038, 47}, + {0x8003, 51}, + {0x8006, 51}, + {0x800a, 51}, + {0x800f, 51}, + {0x8018, 51}, + {0x801f, 51}, + {0x8029, 51}, + {0xc038, 51}, + }, + /* 18 */ + { + {0x8003, 52}, + {0x8006, 52}, + {0x800a, 52}, + {0x800f, 52}, + {0x8018, 52}, + {0x801f, 52}, + {0x8029, 52}, + {0xc038, 52}, + {0x8003, 53}, + {0x8006, 53}, + {0x800a, 53}, + {0x800f, 53}, + {0x8018, 53}, + {0x801f, 53}, + {0x8029, 53}, + {0xc038, 53}, + }, + /* 19 */ + { + {0x8002, 54}, + {0x8009, 54}, + {0x8017, 54}, + {0xc028, 54}, + {0x8002, 55}, + {0x8009, 55}, + {0x8017, 55}, + {0xc028, 55}, + {0x8002, 56}, + {0x8009, 56}, + {0x8017, 56}, + {0xc028, 56}, + {0x8002, 57}, + {0x8009, 57}, + {0x8017, 57}, + {0xc028, 57}, + }, + /* 20 */ + { + {0x8003, 54}, + {0x8006, 54}, + {0x800a, 54}, + {0x800f, 54}, + {0x8018, 54}, + {0x801f, 54}, + {0x8029, 54}, + {0xc038, 54}, + {0x8003, 55}, + {0x8006, 55}, + {0x800a, 55}, + {0x800f, 55}, + 
{0x8018, 55}, + {0x801f, 55}, + {0x8029, 55}, + {0xc038, 55}, + }, + /* 21 */ + { + {0x8003, 56}, + {0x8006, 56}, + {0x800a, 56}, + {0x800f, 56}, + {0x8018, 56}, + {0x801f, 56}, + {0x8029, 56}, + {0xc038, 56}, + {0x8003, 57}, + {0x8006, 57}, + {0x800a, 57}, + {0x800f, 57}, + {0x8018, 57}, + {0x801f, 57}, + {0x8029, 57}, + {0xc038, 57}, + }, + /* 22 */ + { + {0x1a, 0}, + {0x1b, 0}, + {0x1d, 0}, + {0x1e, 0}, + {0x21, 0}, + {0x22, 0}, + {0x24, 0}, + {0x25, 0}, + {0x2b, 0}, + {0x2e, 0}, + {0x32, 0}, + {0x35, 0}, + {0x3a, 0}, + {0x3d, 0}, + {0x41, 0}, + {0x4044, 0}, + }, + /* 23 */ + { + {0xc000, 61}, + {0xc000, 65}, + {0xc000, 95}, + {0xc000, 98}, + {0xc000, 100}, + {0xc000, 102}, + {0xc000, 103}, + {0xc000, 104}, + {0xc000, 108}, + {0xc000, 109}, + {0xc000, 110}, + {0xc000, 112}, + {0xc000, 114}, + {0xc000, 117}, + {0x26, 0}, + {0x27, 0}, + }, + /* 24 */ + { + {0x8001, 61}, + {0xc016, 61}, + {0x8001, 65}, + {0xc016, 65}, + {0x8001, 95}, + {0xc016, 95}, + {0x8001, 98}, + {0xc016, 98}, + {0x8001, 100}, + {0xc016, 100}, + {0x8001, 102}, + {0xc016, 102}, + {0x8001, 103}, + {0xc016, 103}, + {0x8001, 104}, + {0xc016, 104}, + }, + /* 25 */ + { + {0x8002, 61}, + {0x8009, 61}, + {0x8017, 61}, + {0xc028, 61}, + {0x8002, 65}, + {0x8009, 65}, + {0x8017, 65}, + {0xc028, 65}, + {0x8002, 95}, + {0x8009, 95}, + {0x8017, 95}, + {0xc028, 95}, + {0x8002, 98}, + {0x8009, 98}, + {0x8017, 98}, + {0xc028, 98}, + }, + /* 26 */ + { + {0x8003, 61}, + {0x8006, 61}, + {0x800a, 61}, + {0x800f, 61}, + {0x8018, 61}, + {0x801f, 61}, + {0x8029, 61}, + {0xc038, 61}, + {0x8003, 65}, + {0x8006, 65}, + {0x800a, 65}, + {0x800f, 65}, + {0x8018, 65}, + {0x801f, 65}, + {0x8029, 65}, + {0xc038, 65}, + }, + /* 27 */ + { + {0x8003, 95}, + {0x8006, 95}, + {0x800a, 95}, + {0x800f, 95}, + {0x8018, 95}, + {0x801f, 95}, + {0x8029, 95}, + {0xc038, 95}, + {0x8003, 98}, + {0x8006, 98}, + {0x800a, 98}, + {0x800f, 98}, + {0x8018, 98}, + {0x801f, 98}, + {0x8029, 98}, + {0xc038, 98}, + }, + /* 28 */ + { + {0x8002, 100}, + {0x8009, 100}, + {0x8017, 100}, + {0xc028, 100}, + {0x8002, 102}, + {0x8009, 102}, + {0x8017, 102}, + {0xc028, 102}, + {0x8002, 103}, + {0x8009, 103}, + {0x8017, 103}, + {0xc028, 103}, + {0x8002, 104}, + {0x8009, 104}, + {0x8017, 104}, + {0xc028, 104}, + }, + /* 29 */ + { + {0x8003, 100}, + {0x8006, 100}, + {0x800a, 100}, + {0x800f, 100}, + {0x8018, 100}, + {0x801f, 100}, + {0x8029, 100}, + {0xc038, 100}, + {0x8003, 102}, + {0x8006, 102}, + {0x800a, 102}, + {0x800f, 102}, + {0x8018, 102}, + {0x801f, 102}, + {0x8029, 102}, + {0xc038, 102}, + }, + /* 30 */ + { + {0x8003, 103}, + {0x8006, 103}, + {0x800a, 103}, + {0x800f, 103}, + {0x8018, 103}, + {0x801f, 103}, + {0x8029, 103}, + {0xc038, 103}, + {0x8003, 104}, + {0x8006, 104}, + {0x800a, 104}, + {0x800f, 104}, + {0x8018, 104}, + {0x801f, 104}, + {0x8029, 104}, + {0xc038, 104}, + }, + /* 31 */ + { + {0x8001, 108}, + {0xc016, 108}, + {0x8001, 109}, + {0xc016, 109}, + {0x8001, 110}, + {0xc016, 110}, + {0x8001, 112}, + {0xc016, 112}, + {0x8001, 114}, + {0xc016, 114}, + {0x8001, 117}, + {0xc016, 117}, + {0xc000, 58}, + {0xc000, 66}, + {0xc000, 67}, + {0xc000, 68}, + }, + /* 32 */ + { + {0x8002, 108}, + {0x8009, 108}, + {0x8017, 108}, + {0xc028, 108}, + {0x8002, 109}, + {0x8009, 109}, + {0x8017, 109}, + {0xc028, 109}, + {0x8002, 110}, + {0x8009, 110}, + {0x8017, 110}, + {0xc028, 110}, + {0x8002, 112}, + {0x8009, 112}, + {0x8017, 112}, + {0xc028, 112}, + }, + /* 33 */ + { + {0x8003, 108}, + {0x8006, 108}, + {0x800a, 108}, + {0x800f, 108}, + {0x8018, 108}, + {0x801f, 108}, + {0x8029, 108}, 
+ {0xc038, 108}, + {0x8003, 109}, + {0x8006, 109}, + {0x800a, 109}, + {0x800f, 109}, + {0x8018, 109}, + {0x801f, 109}, + {0x8029, 109}, + {0xc038, 109}, + }, + /* 34 */ + { + {0x8003, 110}, + {0x8006, 110}, + {0x800a, 110}, + {0x800f, 110}, + {0x8018, 110}, + {0x801f, 110}, + {0x8029, 110}, + {0xc038, 110}, + {0x8003, 112}, + {0x8006, 112}, + {0x800a, 112}, + {0x800f, 112}, + {0x8018, 112}, + {0x801f, 112}, + {0x8029, 112}, + {0xc038, 112}, + }, + /* 35 */ + { + {0x8002, 114}, + {0x8009, 114}, + {0x8017, 114}, + {0xc028, 114}, + {0x8002, 117}, + {0x8009, 117}, + {0x8017, 117}, + {0xc028, 117}, + {0x8001, 58}, + {0xc016, 58}, + {0x8001, 66}, + {0xc016, 66}, + {0x8001, 67}, + {0xc016, 67}, + {0x8001, 68}, + {0xc016, 68}, + }, + /* 36 */ + { + {0x8003, 114}, + {0x8006, 114}, + {0x800a, 114}, + {0x800f, 114}, + {0x8018, 114}, + {0x801f, 114}, + {0x8029, 114}, + {0xc038, 114}, + {0x8003, 117}, + {0x8006, 117}, + {0x800a, 117}, + {0x800f, 117}, + {0x8018, 117}, + {0x801f, 117}, + {0x8029, 117}, + {0xc038, 117}, + }, + /* 37 */ + { + {0x8002, 58}, + {0x8009, 58}, + {0x8017, 58}, + {0xc028, 58}, + {0x8002, 66}, + {0x8009, 66}, + {0x8017, 66}, + {0xc028, 66}, + {0x8002, 67}, + {0x8009, 67}, + {0x8017, 67}, + {0xc028, 67}, + {0x8002, 68}, + {0x8009, 68}, + {0x8017, 68}, + {0xc028, 68}, + }, + /* 38 */ + { + {0x8003, 58}, + {0x8006, 58}, + {0x800a, 58}, + {0x800f, 58}, + {0x8018, 58}, + {0x801f, 58}, + {0x8029, 58}, + {0xc038, 58}, + {0x8003, 66}, + {0x8006, 66}, + {0x800a, 66}, + {0x800f, 66}, + {0x8018, 66}, + {0x801f, 66}, + {0x8029, 66}, + {0xc038, 66}, + }, + /* 39 */ + { + {0x8003, 67}, + {0x8006, 67}, + {0x800a, 67}, + {0x800f, 67}, + {0x8018, 67}, + {0x801f, 67}, + {0x8029, 67}, + {0xc038, 67}, + {0x8003, 68}, + {0x8006, 68}, + {0x800a, 68}, + {0x800f, 68}, + {0x8018, 68}, + {0x801f, 68}, + {0x8029, 68}, + {0xc038, 68}, + }, + /* 40 */ + { + {0x2c, 0}, + {0x2d, 0}, + {0x2f, 0}, + {0x30, 0}, + {0x33, 0}, + {0x34, 0}, + {0x36, 0}, + {0x37, 0}, + {0x3b, 0}, + {0x3c, 0}, + {0x3e, 0}, + {0x3f, 0}, + {0x42, 0}, + {0x43, 0}, + {0x45, 0}, + {0x4048, 0}, + }, + /* 41 */ + { + {0xc000, 69}, + {0xc000, 70}, + {0xc000, 71}, + {0xc000, 72}, + {0xc000, 73}, + {0xc000, 74}, + {0xc000, 75}, + {0xc000, 76}, + {0xc000, 77}, + {0xc000, 78}, + {0xc000, 79}, + {0xc000, 80}, + {0xc000, 81}, + {0xc000, 82}, + {0xc000, 83}, + {0xc000, 84}, + }, + /* 42 */ + { + {0x8001, 69}, + {0xc016, 69}, + {0x8001, 70}, + {0xc016, 70}, + {0x8001, 71}, + {0xc016, 71}, + {0x8001, 72}, + {0xc016, 72}, + {0x8001, 73}, + {0xc016, 73}, + {0x8001, 74}, + {0xc016, 74}, + {0x8001, 75}, + {0xc016, 75}, + {0x8001, 76}, + {0xc016, 76}, + }, + /* 43 */ + { + {0x8002, 69}, + {0x8009, 69}, + {0x8017, 69}, + {0xc028, 69}, + {0x8002, 70}, + {0x8009, 70}, + {0x8017, 70}, + {0xc028, 70}, + {0x8002, 71}, + {0x8009, 71}, + {0x8017, 71}, + {0xc028, 71}, + {0x8002, 72}, + {0x8009, 72}, + {0x8017, 72}, + {0xc028, 72}, + }, + /* 44 */ + { + {0x8003, 69}, + {0x8006, 69}, + {0x800a, 69}, + {0x800f, 69}, + {0x8018, 69}, + {0x801f, 69}, + {0x8029, 69}, + {0xc038, 69}, + {0x8003, 70}, + {0x8006, 70}, + {0x800a, 70}, + {0x800f, 70}, + {0x8018, 70}, + {0x801f, 70}, + {0x8029, 70}, + {0xc038, 70}, + }, + /* 45 */ + { + {0x8003, 71}, + {0x8006, 71}, + {0x800a, 71}, + {0x800f, 71}, + {0x8018, 71}, + {0x801f, 71}, + {0x8029, 71}, + {0xc038, 71}, + {0x8003, 72}, + {0x8006, 72}, + {0x800a, 72}, + {0x800f, 72}, + {0x8018, 72}, + {0x801f, 72}, + {0x8029, 72}, + {0xc038, 72}, + }, + /* 46 */ + { + {0x8002, 73}, + {0x8009, 73}, + {0x8017, 73}, + {0xc028, 73}, + {0x8002, 
74}, + {0x8009, 74}, + {0x8017, 74}, + {0xc028, 74}, + {0x8002, 75}, + {0x8009, 75}, + {0x8017, 75}, + {0xc028, 75}, + {0x8002, 76}, + {0x8009, 76}, + {0x8017, 76}, + {0xc028, 76}, + }, + /* 47 */ + { + {0x8003, 73}, + {0x8006, 73}, + {0x800a, 73}, + {0x800f, 73}, + {0x8018, 73}, + {0x801f, 73}, + {0x8029, 73}, + {0xc038, 73}, + {0x8003, 74}, + {0x8006, 74}, + {0x800a, 74}, + {0x800f, 74}, + {0x8018, 74}, + {0x801f, 74}, + {0x8029, 74}, + {0xc038, 74}, + }, + /* 48 */ + { + {0x8003, 75}, + {0x8006, 75}, + {0x800a, 75}, + {0x800f, 75}, + {0x8018, 75}, + {0x801f, 75}, + {0x8029, 75}, + {0xc038, 75}, + {0x8003, 76}, + {0x8006, 76}, + {0x800a, 76}, + {0x800f, 76}, + {0x8018, 76}, + {0x801f, 76}, + {0x8029, 76}, + {0xc038, 76}, + }, + /* 49 */ + { + {0x8001, 77}, + {0xc016, 77}, + {0x8001, 78}, + {0xc016, 78}, + {0x8001, 79}, + {0xc016, 79}, + {0x8001, 80}, + {0xc016, 80}, + {0x8001, 81}, + {0xc016, 81}, + {0x8001, 82}, + {0xc016, 82}, + {0x8001, 83}, + {0xc016, 83}, + {0x8001, 84}, + {0xc016, 84}, + }, + /* 50 */ + { + {0x8002, 77}, + {0x8009, 77}, + {0x8017, 77}, + {0xc028, 77}, + {0x8002, 78}, + {0x8009, 78}, + {0x8017, 78}, + {0xc028, 78}, + {0x8002, 79}, + {0x8009, 79}, + {0x8017, 79}, + {0xc028, 79}, + {0x8002, 80}, + {0x8009, 80}, + {0x8017, 80}, + {0xc028, 80}, + }, + /* 51 */ + { + {0x8003, 77}, + {0x8006, 77}, + {0x800a, 77}, + {0x800f, 77}, + {0x8018, 77}, + {0x801f, 77}, + {0x8029, 77}, + {0xc038, 77}, + {0x8003, 78}, + {0x8006, 78}, + {0x800a, 78}, + {0x800f, 78}, + {0x8018, 78}, + {0x801f, 78}, + {0x8029, 78}, + {0xc038, 78}, + }, + /* 52 */ + { + {0x8003, 79}, + {0x8006, 79}, + {0x800a, 79}, + {0x800f, 79}, + {0x8018, 79}, + {0x801f, 79}, + {0x8029, 79}, + {0xc038, 79}, + {0x8003, 80}, + {0x8006, 80}, + {0x800a, 80}, + {0x800f, 80}, + {0x8018, 80}, + {0x801f, 80}, + {0x8029, 80}, + {0xc038, 80}, + }, + /* 53 */ + { + {0x8002, 81}, + {0x8009, 81}, + {0x8017, 81}, + {0xc028, 81}, + {0x8002, 82}, + {0x8009, 82}, + {0x8017, 82}, + {0xc028, 82}, + {0x8002, 83}, + {0x8009, 83}, + {0x8017, 83}, + {0xc028, 83}, + {0x8002, 84}, + {0x8009, 84}, + {0x8017, 84}, + {0xc028, 84}, + }, + /* 54 */ + { + {0x8003, 81}, + {0x8006, 81}, + {0x800a, 81}, + {0x800f, 81}, + {0x8018, 81}, + {0x801f, 81}, + {0x8029, 81}, + {0xc038, 81}, + {0x8003, 82}, + {0x8006, 82}, + {0x800a, 82}, + {0x800f, 82}, + {0x8018, 82}, + {0x801f, 82}, + {0x8029, 82}, + {0xc038, 82}, + }, + /* 55 */ + { + {0x8003, 83}, + {0x8006, 83}, + {0x800a, 83}, + {0x800f, 83}, + {0x8018, 83}, + {0x801f, 83}, + {0x8029, 83}, + {0xc038, 83}, + {0x8003, 84}, + {0x8006, 84}, + {0x800a, 84}, + {0x800f, 84}, + {0x8018, 84}, + {0x801f, 84}, + {0x8029, 84}, + {0xc038, 84}, + }, + /* 56 */ + { + {0xc000, 85}, + {0xc000, 86}, + {0xc000, 87}, + {0xc000, 89}, + {0xc000, 106}, + {0xc000, 107}, + {0xc000, 113}, + {0xc000, 118}, + {0xc000, 119}, + {0xc000, 120}, + {0xc000, 121}, + {0xc000, 122}, + {0x46, 0}, + {0x47, 0}, + {0x49, 0}, + {0x404a, 0}, + }, + /* 57 */ + { + {0x8001, 85}, + {0xc016, 85}, + {0x8001, 86}, + {0xc016, 86}, + {0x8001, 87}, + {0xc016, 87}, + {0x8001, 89}, + {0xc016, 89}, + {0x8001, 106}, + {0xc016, 106}, + {0x8001, 107}, + {0xc016, 107}, + {0x8001, 113}, + {0xc016, 113}, + {0x8001, 118}, + {0xc016, 118}, + }, + /* 58 */ + { + {0x8002, 85}, + {0x8009, 85}, + {0x8017, 85}, + {0xc028, 85}, + {0x8002, 86}, + {0x8009, 86}, + {0x8017, 86}, + {0xc028, 86}, + {0x8002, 87}, + {0x8009, 87}, + {0x8017, 87}, + {0xc028, 87}, + {0x8002, 89}, + {0x8009, 89}, + {0x8017, 89}, + {0xc028, 89}, + }, + /* 59 */ + { + {0x8003, 85}, + {0x8006, 85}, + 
{0x800a, 85}, + {0x800f, 85}, + {0x8018, 85}, + {0x801f, 85}, + {0x8029, 85}, + {0xc038, 85}, + {0x8003, 86}, + {0x8006, 86}, + {0x800a, 86}, + {0x800f, 86}, + {0x8018, 86}, + {0x801f, 86}, + {0x8029, 86}, + {0xc038, 86}, + }, + /* 60 */ + { + {0x8003, 87}, + {0x8006, 87}, + {0x800a, 87}, + {0x800f, 87}, + {0x8018, 87}, + {0x801f, 87}, + {0x8029, 87}, + {0xc038, 87}, + {0x8003, 89}, + {0x8006, 89}, + {0x800a, 89}, + {0x800f, 89}, + {0x8018, 89}, + {0x801f, 89}, + {0x8029, 89}, + {0xc038, 89}, + }, + /* 61 */ + { + {0x8002, 106}, + {0x8009, 106}, + {0x8017, 106}, + {0xc028, 106}, + {0x8002, 107}, + {0x8009, 107}, + {0x8017, 107}, + {0xc028, 107}, + {0x8002, 113}, + {0x8009, 113}, + {0x8017, 113}, + {0xc028, 113}, + {0x8002, 118}, + {0x8009, 118}, + {0x8017, 118}, + {0xc028, 118}, + }, + /* 62 */ + { + {0x8003, 106}, + {0x8006, 106}, + {0x800a, 106}, + {0x800f, 106}, + {0x8018, 106}, + {0x801f, 106}, + {0x8029, 106}, + {0xc038, 106}, + {0x8003, 107}, + {0x8006, 107}, + {0x800a, 107}, + {0x800f, 107}, + {0x8018, 107}, + {0x801f, 107}, + {0x8029, 107}, + {0xc038, 107}, + }, + /* 63 */ + { + {0x8003, 113}, + {0x8006, 113}, + {0x800a, 113}, + {0x800f, 113}, + {0x8018, 113}, + {0x801f, 113}, + {0x8029, 113}, + {0xc038, 113}, + {0x8003, 118}, + {0x8006, 118}, + {0x800a, 118}, + {0x800f, 118}, + {0x8018, 118}, + {0x801f, 118}, + {0x8029, 118}, + {0xc038, 118}, + }, + /* 64 */ + { + {0x8001, 119}, + {0xc016, 119}, + {0x8001, 120}, + {0xc016, 120}, + {0x8001, 121}, + {0xc016, 121}, + {0x8001, 122}, + {0xc016, 122}, + {0xc000, 38}, + {0xc000, 42}, + {0xc000, 44}, + {0xc000, 59}, + {0xc000, 88}, + {0xc000, 90}, + {0x4b, 0}, + {0x4e, 0}, + }, + /* 65 */ + { + {0x8002, 119}, + {0x8009, 119}, + {0x8017, 119}, + {0xc028, 119}, + {0x8002, 120}, + {0x8009, 120}, + {0x8017, 120}, + {0xc028, 120}, + {0x8002, 121}, + {0x8009, 121}, + {0x8017, 121}, + {0xc028, 121}, + {0x8002, 122}, + {0x8009, 122}, + {0x8017, 122}, + {0xc028, 122}, + }, + /* 66 */ + { + {0x8003, 119}, + {0x8006, 119}, + {0x800a, 119}, + {0x800f, 119}, + {0x8018, 119}, + {0x801f, 119}, + {0x8029, 119}, + {0xc038, 119}, + {0x8003, 120}, + {0x8006, 120}, + {0x800a, 120}, + {0x800f, 120}, + {0x8018, 120}, + {0x801f, 120}, + {0x8029, 120}, + {0xc038, 120}, + }, + /* 67 */ + { + {0x8003, 121}, + {0x8006, 121}, + {0x800a, 121}, + {0x800f, 121}, + {0x8018, 121}, + {0x801f, 121}, + {0x8029, 121}, + {0xc038, 121}, + {0x8003, 122}, + {0x8006, 122}, + {0x800a, 122}, + {0x800f, 122}, + {0x8018, 122}, + {0x801f, 122}, + {0x8029, 122}, + {0xc038, 122}, + }, + /* 68 */ + { + {0x8001, 38}, + {0xc016, 38}, + {0x8001, 42}, + {0xc016, 42}, + {0x8001, 44}, + {0xc016, 44}, + {0x8001, 59}, + {0xc016, 59}, + {0x8001, 88}, + {0xc016, 88}, + {0x8001, 90}, + {0xc016, 90}, + {0x4c, 0}, + {0x4d, 0}, + {0x4f, 0}, + {0x51, 0}, + }, + /* 69 */ + { + {0x8002, 38}, + {0x8009, 38}, + {0x8017, 38}, + {0xc028, 38}, + {0x8002, 42}, + {0x8009, 42}, + {0x8017, 42}, + {0xc028, 42}, + {0x8002, 44}, + {0x8009, 44}, + {0x8017, 44}, + {0xc028, 44}, + {0x8002, 59}, + {0x8009, 59}, + {0x8017, 59}, + {0xc028, 59}, + }, + /* 70 */ + { + {0x8003, 38}, + {0x8006, 38}, + {0x800a, 38}, + {0x800f, 38}, + {0x8018, 38}, + {0x801f, 38}, + {0x8029, 38}, + {0xc038, 38}, + {0x8003, 42}, + {0x8006, 42}, + {0x800a, 42}, + {0x800f, 42}, + {0x8018, 42}, + {0x801f, 42}, + {0x8029, 42}, + {0xc038, 42}, + }, + /* 71 */ + { + {0x8003, 44}, + {0x8006, 44}, + {0x800a, 44}, + {0x800f, 44}, + {0x8018, 44}, + {0x801f, 44}, + {0x8029, 44}, + {0xc038, 44}, + {0x8003, 59}, + {0x8006, 59}, + {0x800a, 59}, + {0x800f, 
59}, + {0x8018, 59}, + {0x801f, 59}, + {0x8029, 59}, + {0xc038, 59}, + }, + /* 72 */ + { + {0x8002, 88}, + {0x8009, 88}, + {0x8017, 88}, + {0xc028, 88}, + {0x8002, 90}, + {0x8009, 90}, + {0x8017, 90}, + {0xc028, 90}, + {0xc000, 33}, + {0xc000, 34}, + {0xc000, 40}, + {0xc000, 41}, + {0xc000, 63}, + {0x50, 0}, + {0x52, 0}, + {0x54, 0}, + }, + /* 73 */ + { + {0x8003, 88}, + {0x8006, 88}, + {0x800a, 88}, + {0x800f, 88}, + {0x8018, 88}, + {0x801f, 88}, + {0x8029, 88}, + {0xc038, 88}, + {0x8003, 90}, + {0x8006, 90}, + {0x800a, 90}, + {0x800f, 90}, + {0x8018, 90}, + {0x801f, 90}, + {0x8029, 90}, + {0xc038, 90}, + }, + /* 74 */ + { + {0x8001, 33}, + {0xc016, 33}, + {0x8001, 34}, + {0xc016, 34}, + {0x8001, 40}, + {0xc016, 40}, + {0x8001, 41}, + {0xc016, 41}, + {0x8001, 63}, + {0xc016, 63}, + {0xc000, 39}, + {0xc000, 43}, + {0xc000, 124}, + {0x53, 0}, + {0x55, 0}, + {0x58, 0}, + }, + /* 75 */ + { + {0x8002, 33}, + {0x8009, 33}, + {0x8017, 33}, + {0xc028, 33}, + {0x8002, 34}, + {0x8009, 34}, + {0x8017, 34}, + {0xc028, 34}, + {0x8002, 40}, + {0x8009, 40}, + {0x8017, 40}, + {0xc028, 40}, + {0x8002, 41}, + {0x8009, 41}, + {0x8017, 41}, + {0xc028, 41}, + }, + /* 76 */ + { + {0x8003, 33}, + {0x8006, 33}, + {0x800a, 33}, + {0x800f, 33}, + {0x8018, 33}, + {0x801f, 33}, + {0x8029, 33}, + {0xc038, 33}, + {0x8003, 34}, + {0x8006, 34}, + {0x800a, 34}, + {0x800f, 34}, + {0x8018, 34}, + {0x801f, 34}, + {0x8029, 34}, + {0xc038, 34}, + }, + /* 77 */ + { + {0x8003, 40}, + {0x8006, 40}, + {0x800a, 40}, + {0x800f, 40}, + {0x8018, 40}, + {0x801f, 40}, + {0x8029, 40}, + {0xc038, 40}, + {0x8003, 41}, + {0x8006, 41}, + {0x800a, 41}, + {0x800f, 41}, + {0x8018, 41}, + {0x801f, 41}, + {0x8029, 41}, + {0xc038, 41}, + }, + /* 78 */ + { + {0x8002, 63}, + {0x8009, 63}, + {0x8017, 63}, + {0xc028, 63}, + {0x8001, 39}, + {0xc016, 39}, + {0x8001, 43}, + {0xc016, 43}, + {0x8001, 124}, + {0xc016, 124}, + {0xc000, 35}, + {0xc000, 62}, + {0x56, 0}, + {0x57, 0}, + {0x59, 0}, + {0x5a, 0}, + }, + /* 79 */ + { + {0x8003, 63}, + {0x8006, 63}, + {0x800a, 63}, + {0x800f, 63}, + {0x8018, 63}, + {0x801f, 63}, + {0x8029, 63}, + {0xc038, 63}, + {0x8002, 39}, + {0x8009, 39}, + {0x8017, 39}, + {0xc028, 39}, + {0x8002, 43}, + {0x8009, 43}, + {0x8017, 43}, + {0xc028, 43}, + }, + /* 80 */ + { + {0x8003, 39}, + {0x8006, 39}, + {0x800a, 39}, + {0x800f, 39}, + {0x8018, 39}, + {0x801f, 39}, + {0x8029, 39}, + {0xc038, 39}, + {0x8003, 43}, + {0x8006, 43}, + {0x800a, 43}, + {0x800f, 43}, + {0x8018, 43}, + {0x801f, 43}, + {0x8029, 43}, + {0xc038, 43}, + }, + /* 81 */ + { + {0x8002, 124}, + {0x8009, 124}, + {0x8017, 124}, + {0xc028, 124}, + {0x8001, 35}, + {0xc016, 35}, + {0x8001, 62}, + {0xc016, 62}, + {0xc000, 0}, + {0xc000, 36}, + {0xc000, 64}, + {0xc000, 91}, + {0xc000, 93}, + {0xc000, 126}, + {0x5b, 0}, + {0x5c, 0}, + }, + /* 82 */ + { + {0x8003, 124}, + {0x8006, 124}, + {0x800a, 124}, + {0x800f, 124}, + {0x8018, 124}, + {0x801f, 124}, + {0x8029, 124}, + {0xc038, 124}, + {0x8002, 35}, + {0x8009, 35}, + {0x8017, 35}, + {0xc028, 35}, + {0x8002, 62}, + {0x8009, 62}, + {0x8017, 62}, + {0xc028, 62}, + }, + /* 83 */ + { + {0x8003, 35}, + {0x8006, 35}, + {0x800a, 35}, + {0x800f, 35}, + {0x8018, 35}, + {0x801f, 35}, + {0x8029, 35}, + {0xc038, 35}, + {0x8003, 62}, + {0x8006, 62}, + {0x800a, 62}, + {0x800f, 62}, + {0x8018, 62}, + {0x801f, 62}, + {0x8029, 62}, + {0xc038, 62}, + }, + /* 84 */ + { + {0x8001, 0}, + {0xc016, 0}, + {0x8001, 36}, + {0xc016, 36}, + {0x8001, 64}, + {0xc016, 64}, + {0x8001, 91}, + {0xc016, 91}, + {0x8001, 93}, + {0xc016, 93}, + {0x8001, 126}, 
+ {0xc016, 126}, + {0xc000, 94}, + {0xc000, 125}, + {0x5d, 0}, + {0x5e, 0}, + }, + /* 85 */ + { + {0x8002, 0}, + {0x8009, 0}, + {0x8017, 0}, + {0xc028, 0}, + {0x8002, 36}, + {0x8009, 36}, + {0x8017, 36}, + {0xc028, 36}, + {0x8002, 64}, + {0x8009, 64}, + {0x8017, 64}, + {0xc028, 64}, + {0x8002, 91}, + {0x8009, 91}, + {0x8017, 91}, + {0xc028, 91}, + }, + /* 86 */ + { + {0x8003, 0}, + {0x8006, 0}, + {0x800a, 0}, + {0x800f, 0}, + {0x8018, 0}, + {0x801f, 0}, + {0x8029, 0}, + {0xc038, 0}, + {0x8003, 36}, + {0x8006, 36}, + {0x800a, 36}, + {0x800f, 36}, + {0x8018, 36}, + {0x801f, 36}, + {0x8029, 36}, + {0xc038, 36}, + }, + /* 87 */ + { + {0x8003, 64}, + {0x8006, 64}, + {0x800a, 64}, + {0x800f, 64}, + {0x8018, 64}, + {0x801f, 64}, + {0x8029, 64}, + {0xc038, 64}, + {0x8003, 91}, + {0x8006, 91}, + {0x800a, 91}, + {0x800f, 91}, + {0x8018, 91}, + {0x801f, 91}, + {0x8029, 91}, + {0xc038, 91}, + }, + /* 88 */ + { + {0x8002, 93}, + {0x8009, 93}, + {0x8017, 93}, + {0xc028, 93}, + {0x8002, 126}, + {0x8009, 126}, + {0x8017, 126}, + {0xc028, 126}, + {0x8001, 94}, + {0xc016, 94}, + {0x8001, 125}, + {0xc016, 125}, + {0xc000, 60}, + {0xc000, 96}, + {0xc000, 123}, + {0x5f, 0}, + }, + /* 89 */ + { + {0x8003, 93}, + {0x8006, 93}, + {0x800a, 93}, + {0x800f, 93}, + {0x8018, 93}, + {0x801f, 93}, + {0x8029, 93}, + {0xc038, 93}, + {0x8003, 126}, + {0x8006, 126}, + {0x800a, 126}, + {0x800f, 126}, + {0x8018, 126}, + {0x801f, 126}, + {0x8029, 126}, + {0xc038, 126}, + }, + /* 90 */ + { + {0x8002, 94}, + {0x8009, 94}, + {0x8017, 94}, + {0xc028, 94}, + {0x8002, 125}, + {0x8009, 125}, + {0x8017, 125}, + {0xc028, 125}, + {0x8001, 60}, + {0xc016, 60}, + {0x8001, 96}, + {0xc016, 96}, + {0x8001, 123}, + {0xc016, 123}, + {0x60, 0}, + {0x6e, 0}, + }, + /* 91 */ + { + {0x8003, 94}, + {0x8006, 94}, + {0x800a, 94}, + {0x800f, 94}, + {0x8018, 94}, + {0x801f, 94}, + {0x8029, 94}, + {0xc038, 94}, + {0x8003, 125}, + {0x8006, 125}, + {0x800a, 125}, + {0x800f, 125}, + {0x8018, 125}, + {0x801f, 125}, + {0x8029, 125}, + {0xc038, 125}, + }, + /* 92 */ + { + {0x8002, 60}, + {0x8009, 60}, + {0x8017, 60}, + {0xc028, 60}, + {0x8002, 96}, + {0x8009, 96}, + {0x8017, 96}, + {0xc028, 96}, + {0x8002, 123}, + {0x8009, 123}, + {0x8017, 123}, + {0xc028, 123}, + {0x61, 0}, + {0x65, 0}, + {0x6f, 0}, + {0x85, 0}, + }, + /* 93 */ + { + {0x8003, 60}, + {0x8006, 60}, + {0x800a, 60}, + {0x800f, 60}, + {0x8018, 60}, + {0x801f, 60}, + {0x8029, 60}, + {0xc038, 60}, + {0x8003, 96}, + {0x8006, 96}, + {0x800a, 96}, + {0x800f, 96}, + {0x8018, 96}, + {0x801f, 96}, + {0x8029, 96}, + {0xc038, 96}, + }, + /* 94 */ + { + {0x8003, 123}, + {0x8006, 123}, + {0x800a, 123}, + {0x800f, 123}, + {0x8018, 123}, + {0x801f, 123}, + {0x8029, 123}, + {0xc038, 123}, + {0x62, 0}, + {0x63, 0}, + {0x66, 0}, + {0x69, 0}, + {0x70, 0}, + {0x77, 0}, + {0x86, 0}, + {0x99, 0}, + }, + /* 95 */ + { + {0xc000, 92}, + {0xc000, 195}, + {0xc000, 208}, + {0x64, 0}, + {0x67, 0}, + {0x68, 0}, + {0x6a, 0}, + {0x6b, 0}, + {0x71, 0}, + {0x74, 0}, + {0x78, 0}, + {0x7e, 0}, + {0x87, 0}, + {0x8e, 0}, + {0x9a, 0}, + {0xa9, 0}, + }, + /* 96 */ + { + {0x8001, 92}, + {0xc016, 92}, + {0x8001, 195}, + {0xc016, 195}, + {0x8001, 208}, + {0xc016, 208}, + {0xc000, 128}, + {0xc000, 130}, + {0xc000, 131}, + {0xc000, 162}, + {0xc000, 184}, + {0xc000, 194}, + {0xc000, 224}, + {0xc000, 226}, + {0x6c, 0}, + {0x6d, 0}, + }, + /* 97 */ + { + {0x8002, 92}, + {0x8009, 92}, + {0x8017, 92}, + {0xc028, 92}, + {0x8002, 195}, + {0x8009, 195}, + {0x8017, 195}, + {0xc028, 195}, + {0x8002, 208}, + {0x8009, 208}, + {0x8017, 208}, + {0xc028, 
208}, + {0x8001, 128}, + {0xc016, 128}, + {0x8001, 130}, + {0xc016, 130}, + }, + /* 98 */ + { + {0x8003, 92}, + {0x8006, 92}, + {0x800a, 92}, + {0x800f, 92}, + {0x8018, 92}, + {0x801f, 92}, + {0x8029, 92}, + {0xc038, 92}, + {0x8003, 195}, + {0x8006, 195}, + {0x800a, 195}, + {0x800f, 195}, + {0x8018, 195}, + {0x801f, 195}, + {0x8029, 195}, + {0xc038, 195}, + }, + /* 99 */ + { + {0x8003, 208}, + {0x8006, 208}, + {0x800a, 208}, + {0x800f, 208}, + {0x8018, 208}, + {0x801f, 208}, + {0x8029, 208}, + {0xc038, 208}, + {0x8002, 128}, + {0x8009, 128}, + {0x8017, 128}, + {0xc028, 128}, + {0x8002, 130}, + {0x8009, 130}, + {0x8017, 130}, + {0xc028, 130}, + }, + /* 100 */ + { + {0x8003, 128}, + {0x8006, 128}, + {0x800a, 128}, + {0x800f, 128}, + {0x8018, 128}, + {0x801f, 128}, + {0x8029, 128}, + {0xc038, 128}, + {0x8003, 130}, + {0x8006, 130}, + {0x800a, 130}, + {0x800f, 130}, + {0x8018, 130}, + {0x801f, 130}, + {0x8029, 130}, + {0xc038, 130}, + }, + /* 101 */ + { + {0x8001, 131}, + {0xc016, 131}, + {0x8001, 162}, + {0xc016, 162}, + {0x8001, 184}, + {0xc016, 184}, + {0x8001, 194}, + {0xc016, 194}, + {0x8001, 224}, + {0xc016, 224}, + {0x8001, 226}, + {0xc016, 226}, + {0xc000, 153}, + {0xc000, 161}, + {0xc000, 167}, + {0xc000, 172}, + }, + /* 102 */ + { + {0x8002, 131}, + {0x8009, 131}, + {0x8017, 131}, + {0xc028, 131}, + {0x8002, 162}, + {0x8009, 162}, + {0x8017, 162}, + {0xc028, 162}, + {0x8002, 184}, + {0x8009, 184}, + {0x8017, 184}, + {0xc028, 184}, + {0x8002, 194}, + {0x8009, 194}, + {0x8017, 194}, + {0xc028, 194}, + }, + /* 103 */ + { + {0x8003, 131}, + {0x8006, 131}, + {0x800a, 131}, + {0x800f, 131}, + {0x8018, 131}, + {0x801f, 131}, + {0x8029, 131}, + {0xc038, 131}, + {0x8003, 162}, + {0x8006, 162}, + {0x800a, 162}, + {0x800f, 162}, + {0x8018, 162}, + {0x801f, 162}, + {0x8029, 162}, + {0xc038, 162}, + }, + /* 104 */ + { + {0x8003, 184}, + {0x8006, 184}, + {0x800a, 184}, + {0x800f, 184}, + {0x8018, 184}, + {0x801f, 184}, + {0x8029, 184}, + {0xc038, 184}, + {0x8003, 194}, + {0x8006, 194}, + {0x800a, 194}, + {0x800f, 194}, + {0x8018, 194}, + {0x801f, 194}, + {0x8029, 194}, + {0xc038, 194}, + }, + /* 105 */ + { + {0x8002, 224}, + {0x8009, 224}, + {0x8017, 224}, + {0xc028, 224}, + {0x8002, 226}, + {0x8009, 226}, + {0x8017, 226}, + {0xc028, 226}, + {0x8001, 153}, + {0xc016, 153}, + {0x8001, 161}, + {0xc016, 161}, + {0x8001, 167}, + {0xc016, 167}, + {0x8001, 172}, + {0xc016, 172}, + }, + /* 106 */ + { + {0x8003, 224}, + {0x8006, 224}, + {0x800a, 224}, + {0x800f, 224}, + {0x8018, 224}, + {0x801f, 224}, + {0x8029, 224}, + {0xc038, 224}, + {0x8003, 226}, + {0x8006, 226}, + {0x800a, 226}, + {0x800f, 226}, + {0x8018, 226}, + {0x801f, 226}, + {0x8029, 226}, + {0xc038, 226}, + }, + /* 107 */ + { + {0x8002, 153}, + {0x8009, 153}, + {0x8017, 153}, + {0xc028, 153}, + {0x8002, 161}, + {0x8009, 161}, + {0x8017, 161}, + {0xc028, 161}, + {0x8002, 167}, + {0x8009, 167}, + {0x8017, 167}, + {0xc028, 167}, + {0x8002, 172}, + {0x8009, 172}, + {0x8017, 172}, + {0xc028, 172}, + }, + /* 108 */ + { + {0x8003, 153}, + {0x8006, 153}, + {0x800a, 153}, + {0x800f, 153}, + {0x8018, 153}, + {0x801f, 153}, + {0x8029, 153}, + {0xc038, 153}, + {0x8003, 161}, + {0x8006, 161}, + {0x800a, 161}, + {0x800f, 161}, + {0x8018, 161}, + {0x801f, 161}, + {0x8029, 161}, + {0xc038, 161}, + }, + /* 109 */ + { + {0x8003, 167}, + {0x8006, 167}, + {0x800a, 167}, + {0x800f, 167}, + {0x8018, 167}, + {0x801f, 167}, + {0x8029, 167}, + {0xc038, 167}, + {0x8003, 172}, + {0x8006, 172}, + {0x800a, 172}, + {0x800f, 172}, + {0x8018, 172}, + {0x801f, 172}, + 
{0x8029, 172}, + {0xc038, 172}, + }, + /* 110 */ + { + {0x72, 0}, + {0x73, 0}, + {0x75, 0}, + {0x76, 0}, + {0x79, 0}, + {0x7b, 0}, + {0x7f, 0}, + {0x82, 0}, + {0x88, 0}, + {0x8b, 0}, + {0x8f, 0}, + {0x92, 0}, + {0x9b, 0}, + {0xa2, 0}, + {0xaa, 0}, + {0xb4, 0}, + }, + /* 111 */ + { + {0xc000, 176}, + {0xc000, 177}, + {0xc000, 179}, + {0xc000, 209}, + {0xc000, 216}, + {0xc000, 217}, + {0xc000, 227}, + {0xc000, 229}, + {0xc000, 230}, + {0x7a, 0}, + {0x7c, 0}, + {0x7d, 0}, + {0x80, 0}, + {0x81, 0}, + {0x83, 0}, + {0x84, 0}, + }, + /* 112 */ + { + {0x8001, 176}, + {0xc016, 176}, + {0x8001, 177}, + {0xc016, 177}, + {0x8001, 179}, + {0xc016, 179}, + {0x8001, 209}, + {0xc016, 209}, + {0x8001, 216}, + {0xc016, 216}, + {0x8001, 217}, + {0xc016, 217}, + {0x8001, 227}, + {0xc016, 227}, + {0x8001, 229}, + {0xc016, 229}, + }, + /* 113 */ + { + {0x8002, 176}, + {0x8009, 176}, + {0x8017, 176}, + {0xc028, 176}, + {0x8002, 177}, + {0x8009, 177}, + {0x8017, 177}, + {0xc028, 177}, + {0x8002, 179}, + {0x8009, 179}, + {0x8017, 179}, + {0xc028, 179}, + {0x8002, 209}, + {0x8009, 209}, + {0x8017, 209}, + {0xc028, 209}, + }, + /* 114 */ + { + {0x8003, 176}, + {0x8006, 176}, + {0x800a, 176}, + {0x800f, 176}, + {0x8018, 176}, + {0x801f, 176}, + {0x8029, 176}, + {0xc038, 176}, + {0x8003, 177}, + {0x8006, 177}, + {0x800a, 177}, + {0x800f, 177}, + {0x8018, 177}, + {0x801f, 177}, + {0x8029, 177}, + {0xc038, 177}, + }, + /* 115 */ + { + {0x8003, 179}, + {0x8006, 179}, + {0x800a, 179}, + {0x800f, 179}, + {0x8018, 179}, + {0x801f, 179}, + {0x8029, 179}, + {0xc038, 179}, + {0x8003, 209}, + {0x8006, 209}, + {0x800a, 209}, + {0x800f, 209}, + {0x8018, 209}, + {0x801f, 209}, + {0x8029, 209}, + {0xc038, 209}, + }, + /* 116 */ + { + {0x8002, 216}, + {0x8009, 216}, + {0x8017, 216}, + {0xc028, 216}, + {0x8002, 217}, + {0x8009, 217}, + {0x8017, 217}, + {0xc028, 217}, + {0x8002, 227}, + {0x8009, 227}, + {0x8017, 227}, + {0xc028, 227}, + {0x8002, 229}, + {0x8009, 229}, + {0x8017, 229}, + {0xc028, 229}, + }, + /* 117 */ + { + {0x8003, 216}, + {0x8006, 216}, + {0x800a, 216}, + {0x800f, 216}, + {0x8018, 216}, + {0x801f, 216}, + {0x8029, 216}, + {0xc038, 216}, + {0x8003, 217}, + {0x8006, 217}, + {0x800a, 217}, + {0x800f, 217}, + {0x8018, 217}, + {0x801f, 217}, + {0x8029, 217}, + {0xc038, 217}, + }, + /* 118 */ + { + {0x8003, 227}, + {0x8006, 227}, + {0x800a, 227}, + {0x800f, 227}, + {0x8018, 227}, + {0x801f, 227}, + {0x8029, 227}, + {0xc038, 227}, + {0x8003, 229}, + {0x8006, 229}, + {0x800a, 229}, + {0x800f, 229}, + {0x8018, 229}, + {0x801f, 229}, + {0x8029, 229}, + {0xc038, 229}, + }, + /* 119 */ + { + {0x8001, 230}, + {0xc016, 230}, + {0xc000, 129}, + {0xc000, 132}, + {0xc000, 133}, + {0xc000, 134}, + {0xc000, 136}, + {0xc000, 146}, + {0xc000, 154}, + {0xc000, 156}, + {0xc000, 160}, + {0xc000, 163}, + {0xc000, 164}, + {0xc000, 169}, + {0xc000, 170}, + {0xc000, 173}, + }, + /* 120 */ + { + {0x8002, 230}, + {0x8009, 230}, + {0x8017, 230}, + {0xc028, 230}, + {0x8001, 129}, + {0xc016, 129}, + {0x8001, 132}, + {0xc016, 132}, + {0x8001, 133}, + {0xc016, 133}, + {0x8001, 134}, + {0xc016, 134}, + {0x8001, 136}, + {0xc016, 136}, + {0x8001, 146}, + {0xc016, 146}, + }, + /* 121 */ + { + {0x8003, 230}, + {0x8006, 230}, + {0x800a, 230}, + {0x800f, 230}, + {0x8018, 230}, + {0x801f, 230}, + {0x8029, 230}, + {0xc038, 230}, + {0x8002, 129}, + {0x8009, 129}, + {0x8017, 129}, + {0xc028, 129}, + {0x8002, 132}, + {0x8009, 132}, + {0x8017, 132}, + {0xc028, 132}, + }, + /* 122 */ + { + {0x8003, 129}, + {0x8006, 129}, + {0x800a, 129}, + {0x800f, 129}, + 
{0x8018, 129}, + {0x801f, 129}, + {0x8029, 129}, + {0xc038, 129}, + {0x8003, 132}, + {0x8006, 132}, + {0x800a, 132}, + {0x800f, 132}, + {0x8018, 132}, + {0x801f, 132}, + {0x8029, 132}, + {0xc038, 132}, + }, + /* 123 */ + { + {0x8002, 133}, + {0x8009, 133}, + {0x8017, 133}, + {0xc028, 133}, + {0x8002, 134}, + {0x8009, 134}, + {0x8017, 134}, + {0xc028, 134}, + {0x8002, 136}, + {0x8009, 136}, + {0x8017, 136}, + {0xc028, 136}, + {0x8002, 146}, + {0x8009, 146}, + {0x8017, 146}, + {0xc028, 146}, + }, + /* 124 */ + { + {0x8003, 133}, + {0x8006, 133}, + {0x800a, 133}, + {0x800f, 133}, + {0x8018, 133}, + {0x801f, 133}, + {0x8029, 133}, + {0xc038, 133}, + {0x8003, 134}, + {0x8006, 134}, + {0x800a, 134}, + {0x800f, 134}, + {0x8018, 134}, + {0x801f, 134}, + {0x8029, 134}, + {0xc038, 134}, + }, + /* 125 */ + { + {0x8003, 136}, + {0x8006, 136}, + {0x800a, 136}, + {0x800f, 136}, + {0x8018, 136}, + {0x801f, 136}, + {0x8029, 136}, + {0xc038, 136}, + {0x8003, 146}, + {0x8006, 146}, + {0x800a, 146}, + {0x800f, 146}, + {0x8018, 146}, + {0x801f, 146}, + {0x8029, 146}, + {0xc038, 146}, + }, + /* 126 */ + { + {0x8001, 154}, + {0xc016, 154}, + {0x8001, 156}, + {0xc016, 156}, + {0x8001, 160}, + {0xc016, 160}, + {0x8001, 163}, + {0xc016, 163}, + {0x8001, 164}, + {0xc016, 164}, + {0x8001, 169}, + {0xc016, 169}, + {0x8001, 170}, + {0xc016, 170}, + {0x8001, 173}, + {0xc016, 173}, + }, + /* 127 */ + { + {0x8002, 154}, + {0x8009, 154}, + {0x8017, 154}, + {0xc028, 154}, + {0x8002, 156}, + {0x8009, 156}, + {0x8017, 156}, + {0xc028, 156}, + {0x8002, 160}, + {0x8009, 160}, + {0x8017, 160}, + {0xc028, 160}, + {0x8002, 163}, + {0x8009, 163}, + {0x8017, 163}, + {0xc028, 163}, + }, + /* 128 */ + { + {0x8003, 154}, + {0x8006, 154}, + {0x800a, 154}, + {0x800f, 154}, + {0x8018, 154}, + {0x801f, 154}, + {0x8029, 154}, + {0xc038, 154}, + {0x8003, 156}, + {0x8006, 156}, + {0x800a, 156}, + {0x800f, 156}, + {0x8018, 156}, + {0x801f, 156}, + {0x8029, 156}, + {0xc038, 156}, + }, + /* 129 */ + { + {0x8003, 160}, + {0x8006, 160}, + {0x800a, 160}, + {0x800f, 160}, + {0x8018, 160}, + {0x801f, 160}, + {0x8029, 160}, + {0xc038, 160}, + {0x8003, 163}, + {0x8006, 163}, + {0x800a, 163}, + {0x800f, 163}, + {0x8018, 163}, + {0x801f, 163}, + {0x8029, 163}, + {0xc038, 163}, + }, + /* 130 */ + { + {0x8002, 164}, + {0x8009, 164}, + {0x8017, 164}, + {0xc028, 164}, + {0x8002, 169}, + {0x8009, 169}, + {0x8017, 169}, + {0xc028, 169}, + {0x8002, 170}, + {0x8009, 170}, + {0x8017, 170}, + {0xc028, 170}, + {0x8002, 173}, + {0x8009, 173}, + {0x8017, 173}, + {0xc028, 173}, + }, + /* 131 */ + { + {0x8003, 164}, + {0x8006, 164}, + {0x800a, 164}, + {0x800f, 164}, + {0x8018, 164}, + {0x801f, 164}, + {0x8029, 164}, + {0xc038, 164}, + {0x8003, 169}, + {0x8006, 169}, + {0x800a, 169}, + {0x800f, 169}, + {0x8018, 169}, + {0x801f, 169}, + {0x8029, 169}, + {0xc038, 169}, + }, + /* 132 */ + { + {0x8003, 170}, + {0x8006, 170}, + {0x800a, 170}, + {0x800f, 170}, + {0x8018, 170}, + {0x801f, 170}, + {0x8029, 170}, + {0xc038, 170}, + {0x8003, 173}, + {0x8006, 173}, + {0x800a, 173}, + {0x800f, 173}, + {0x8018, 173}, + {0x801f, 173}, + {0x8029, 173}, + {0xc038, 173}, + }, + /* 133 */ + { + {0x89, 0}, + {0x8a, 0}, + {0x8c, 0}, + {0x8d, 0}, + {0x90, 0}, + {0x91, 0}, + {0x93, 0}, + {0x96, 0}, + {0x9c, 0}, + {0x9f, 0}, + {0xa3, 0}, + {0xa6, 0}, + {0xab, 0}, + {0xae, 0}, + {0xb5, 0}, + {0xbe, 0}, + }, + /* 134 */ + { + {0xc000, 178}, + {0xc000, 181}, + {0xc000, 185}, + {0xc000, 186}, + {0xc000, 187}, + {0xc000, 189}, + {0xc000, 190}, + {0xc000, 196}, + {0xc000, 198}, + {0xc000, 228}, + 
{0xc000, 232}, + {0xc000, 233}, + {0x94, 0}, + {0x95, 0}, + {0x97, 0}, + {0x98, 0}, + }, + /* 135 */ + { + {0x8001, 178}, + {0xc016, 178}, + {0x8001, 181}, + {0xc016, 181}, + {0x8001, 185}, + {0xc016, 185}, + {0x8001, 186}, + {0xc016, 186}, + {0x8001, 187}, + {0xc016, 187}, + {0x8001, 189}, + {0xc016, 189}, + {0x8001, 190}, + {0xc016, 190}, + {0x8001, 196}, + {0xc016, 196}, + }, + /* 136 */ + { + {0x8002, 178}, + {0x8009, 178}, + {0x8017, 178}, + {0xc028, 178}, + {0x8002, 181}, + {0x8009, 181}, + {0x8017, 181}, + {0xc028, 181}, + {0x8002, 185}, + {0x8009, 185}, + {0x8017, 185}, + {0xc028, 185}, + {0x8002, 186}, + {0x8009, 186}, + {0x8017, 186}, + {0xc028, 186}, + }, + /* 137 */ + { + {0x8003, 178}, + {0x8006, 178}, + {0x800a, 178}, + {0x800f, 178}, + {0x8018, 178}, + {0x801f, 178}, + {0x8029, 178}, + {0xc038, 178}, + {0x8003, 181}, + {0x8006, 181}, + {0x800a, 181}, + {0x800f, 181}, + {0x8018, 181}, + {0x801f, 181}, + {0x8029, 181}, + {0xc038, 181}, + }, + /* 138 */ + { + {0x8003, 185}, + {0x8006, 185}, + {0x800a, 185}, + {0x800f, 185}, + {0x8018, 185}, + {0x801f, 185}, + {0x8029, 185}, + {0xc038, 185}, + {0x8003, 186}, + {0x8006, 186}, + {0x800a, 186}, + {0x800f, 186}, + {0x8018, 186}, + {0x801f, 186}, + {0x8029, 186}, + {0xc038, 186}, + }, + /* 139 */ + { + {0x8002, 187}, + {0x8009, 187}, + {0x8017, 187}, + {0xc028, 187}, + {0x8002, 189}, + {0x8009, 189}, + {0x8017, 189}, + {0xc028, 189}, + {0x8002, 190}, + {0x8009, 190}, + {0x8017, 190}, + {0xc028, 190}, + {0x8002, 196}, + {0x8009, 196}, + {0x8017, 196}, + {0xc028, 196}, + }, + /* 140 */ + { + {0x8003, 187}, + {0x8006, 187}, + {0x800a, 187}, + {0x800f, 187}, + {0x8018, 187}, + {0x801f, 187}, + {0x8029, 187}, + {0xc038, 187}, + {0x8003, 189}, + {0x8006, 189}, + {0x800a, 189}, + {0x800f, 189}, + {0x8018, 189}, + {0x801f, 189}, + {0x8029, 189}, + {0xc038, 189}, + }, + /* 141 */ + { + {0x8003, 190}, + {0x8006, 190}, + {0x800a, 190}, + {0x800f, 190}, + {0x8018, 190}, + {0x801f, 190}, + {0x8029, 190}, + {0xc038, 190}, + {0x8003, 196}, + {0x8006, 196}, + {0x800a, 196}, + {0x800f, 196}, + {0x8018, 196}, + {0x801f, 196}, + {0x8029, 196}, + {0xc038, 196}, + }, + /* 142 */ + { + {0x8001, 198}, + {0xc016, 198}, + {0x8001, 228}, + {0xc016, 228}, + {0x8001, 232}, + {0xc016, 232}, + {0x8001, 233}, + {0xc016, 233}, + {0xc000, 1}, + {0xc000, 135}, + {0xc000, 137}, + {0xc000, 138}, + {0xc000, 139}, + {0xc000, 140}, + {0xc000, 141}, + {0xc000, 143}, + }, + /* 143 */ + { + {0x8002, 198}, + {0x8009, 198}, + {0x8017, 198}, + {0xc028, 198}, + {0x8002, 228}, + {0x8009, 228}, + {0x8017, 228}, + {0xc028, 228}, + {0x8002, 232}, + {0x8009, 232}, + {0x8017, 232}, + {0xc028, 232}, + {0x8002, 233}, + {0x8009, 233}, + {0x8017, 233}, + {0xc028, 233}, + }, + /* 144 */ + { + {0x8003, 198}, + {0x8006, 198}, + {0x800a, 198}, + {0x800f, 198}, + {0x8018, 198}, + {0x801f, 198}, + {0x8029, 198}, + {0xc038, 198}, + {0x8003, 228}, + {0x8006, 228}, + {0x800a, 228}, + {0x800f, 228}, + {0x8018, 228}, + {0x801f, 228}, + {0x8029, 228}, + {0xc038, 228}, + }, + /* 145 */ + { + {0x8003, 232}, + {0x8006, 232}, + {0x800a, 232}, + {0x800f, 232}, + {0x8018, 232}, + {0x801f, 232}, + {0x8029, 232}, + {0xc038, 232}, + {0x8003, 233}, + {0x8006, 233}, + {0x800a, 233}, + {0x800f, 233}, + {0x8018, 233}, + {0x801f, 233}, + {0x8029, 233}, + {0xc038, 233}, + }, + /* 146 */ + { + {0x8001, 1}, + {0xc016, 1}, + {0x8001, 135}, + {0xc016, 135}, + {0x8001, 137}, + {0xc016, 137}, + {0x8001, 138}, + {0xc016, 138}, + {0x8001, 139}, + {0xc016, 139}, + {0x8001, 140}, + {0xc016, 140}, + {0x8001, 141}, + {0xc016, 
141}, + {0x8001, 143}, + {0xc016, 143}, + }, + /* 147 */ + { + {0x8002, 1}, + {0x8009, 1}, + {0x8017, 1}, + {0xc028, 1}, + {0x8002, 135}, + {0x8009, 135}, + {0x8017, 135}, + {0xc028, 135}, + {0x8002, 137}, + {0x8009, 137}, + {0x8017, 137}, + {0xc028, 137}, + {0x8002, 138}, + {0x8009, 138}, + {0x8017, 138}, + {0xc028, 138}, + }, + /* 148 */ + { + {0x8003, 1}, + {0x8006, 1}, + {0x800a, 1}, + {0x800f, 1}, + {0x8018, 1}, + {0x801f, 1}, + {0x8029, 1}, + {0xc038, 1}, + {0x8003, 135}, + {0x8006, 135}, + {0x800a, 135}, + {0x800f, 135}, + {0x8018, 135}, + {0x801f, 135}, + {0x8029, 135}, + {0xc038, 135}, + }, + /* 149 */ + { + {0x8003, 137}, + {0x8006, 137}, + {0x800a, 137}, + {0x800f, 137}, + {0x8018, 137}, + {0x801f, 137}, + {0x8029, 137}, + {0xc038, 137}, + {0x8003, 138}, + {0x8006, 138}, + {0x800a, 138}, + {0x800f, 138}, + {0x8018, 138}, + {0x801f, 138}, + {0x8029, 138}, + {0xc038, 138}, + }, + /* 150 */ + { + {0x8002, 139}, + {0x8009, 139}, + {0x8017, 139}, + {0xc028, 139}, + {0x8002, 140}, + {0x8009, 140}, + {0x8017, 140}, + {0xc028, 140}, + {0x8002, 141}, + {0x8009, 141}, + {0x8017, 141}, + {0xc028, 141}, + {0x8002, 143}, + {0x8009, 143}, + {0x8017, 143}, + {0xc028, 143}, + }, + /* 151 */ + { + {0x8003, 139}, + {0x8006, 139}, + {0x800a, 139}, + {0x800f, 139}, + {0x8018, 139}, + {0x801f, 139}, + {0x8029, 139}, + {0xc038, 139}, + {0x8003, 140}, + {0x8006, 140}, + {0x800a, 140}, + {0x800f, 140}, + {0x8018, 140}, + {0x801f, 140}, + {0x8029, 140}, + {0xc038, 140}, + }, + /* 152 */ + { + {0x8003, 141}, + {0x8006, 141}, + {0x800a, 141}, + {0x800f, 141}, + {0x8018, 141}, + {0x801f, 141}, + {0x8029, 141}, + {0xc038, 141}, + {0x8003, 143}, + {0x8006, 143}, + {0x800a, 143}, + {0x800f, 143}, + {0x8018, 143}, + {0x801f, 143}, + {0x8029, 143}, + {0xc038, 143}, + }, + /* 153 */ + { + {0x9d, 0}, + {0x9e, 0}, + {0xa0, 0}, + {0xa1, 0}, + {0xa4, 0}, + {0xa5, 0}, + {0xa7, 0}, + {0xa8, 0}, + {0xac, 0}, + {0xad, 0}, + {0xaf, 0}, + {0xb1, 0}, + {0xb6, 0}, + {0xb9, 0}, + {0xbf, 0}, + {0xcf, 0}, + }, + /* 154 */ + { + {0xc000, 147}, + {0xc000, 149}, + {0xc000, 150}, + {0xc000, 151}, + {0xc000, 152}, + {0xc000, 155}, + {0xc000, 157}, + {0xc000, 158}, + {0xc000, 165}, + {0xc000, 166}, + {0xc000, 168}, + {0xc000, 174}, + {0xc000, 175}, + {0xc000, 180}, + {0xc000, 182}, + {0xc000, 183}, + }, + /* 155 */ + { + {0x8001, 147}, + {0xc016, 147}, + {0x8001, 149}, + {0xc016, 149}, + {0x8001, 150}, + {0xc016, 150}, + {0x8001, 151}, + {0xc016, 151}, + {0x8001, 152}, + {0xc016, 152}, + {0x8001, 155}, + {0xc016, 155}, + {0x8001, 157}, + {0xc016, 157}, + {0x8001, 158}, + {0xc016, 158}, + }, + /* 156 */ + { + {0x8002, 147}, + {0x8009, 147}, + {0x8017, 147}, + {0xc028, 147}, + {0x8002, 149}, + {0x8009, 149}, + {0x8017, 149}, + {0xc028, 149}, + {0x8002, 150}, + {0x8009, 150}, + {0x8017, 150}, + {0xc028, 150}, + {0x8002, 151}, + {0x8009, 151}, + {0x8017, 151}, + {0xc028, 151}, + }, + /* 157 */ + { + {0x8003, 147}, + {0x8006, 147}, + {0x800a, 147}, + {0x800f, 147}, + {0x8018, 147}, + {0x801f, 147}, + {0x8029, 147}, + {0xc038, 147}, + {0x8003, 149}, + {0x8006, 149}, + {0x800a, 149}, + {0x800f, 149}, + {0x8018, 149}, + {0x801f, 149}, + {0x8029, 149}, + {0xc038, 149}, + }, + /* 158 */ + { + {0x8003, 150}, + {0x8006, 150}, + {0x800a, 150}, + {0x800f, 150}, + {0x8018, 150}, + {0x801f, 150}, + {0x8029, 150}, + {0xc038, 150}, + {0x8003, 151}, + {0x8006, 151}, + {0x800a, 151}, + {0x800f, 151}, + {0x8018, 151}, + {0x801f, 151}, + {0x8029, 151}, + {0xc038, 151}, + }, + /* 159 */ + { + {0x8002, 152}, + {0x8009, 152}, + {0x8017, 152}, + {0xc028, 
152}, + {0x8002, 155}, + {0x8009, 155}, + {0x8017, 155}, + {0xc028, 155}, + {0x8002, 157}, + {0x8009, 157}, + {0x8017, 157}, + {0xc028, 157}, + {0x8002, 158}, + {0x8009, 158}, + {0x8017, 158}, + {0xc028, 158}, + }, + /* 160 */ + { + {0x8003, 152}, + {0x8006, 152}, + {0x800a, 152}, + {0x800f, 152}, + {0x8018, 152}, + {0x801f, 152}, + {0x8029, 152}, + {0xc038, 152}, + {0x8003, 155}, + {0x8006, 155}, + {0x800a, 155}, + {0x800f, 155}, + {0x8018, 155}, + {0x801f, 155}, + {0x8029, 155}, + {0xc038, 155}, + }, + /* 161 */ + { + {0x8003, 157}, + {0x8006, 157}, + {0x800a, 157}, + {0x800f, 157}, + {0x8018, 157}, + {0x801f, 157}, + {0x8029, 157}, + {0xc038, 157}, + {0x8003, 158}, + {0x8006, 158}, + {0x800a, 158}, + {0x800f, 158}, + {0x8018, 158}, + {0x801f, 158}, + {0x8029, 158}, + {0xc038, 158}, + }, + /* 162 */ + { + {0x8001, 165}, + {0xc016, 165}, + {0x8001, 166}, + {0xc016, 166}, + {0x8001, 168}, + {0xc016, 168}, + {0x8001, 174}, + {0xc016, 174}, + {0x8001, 175}, + {0xc016, 175}, + {0x8001, 180}, + {0xc016, 180}, + {0x8001, 182}, + {0xc016, 182}, + {0x8001, 183}, + {0xc016, 183}, + }, + /* 163 */ + { + {0x8002, 165}, + {0x8009, 165}, + {0x8017, 165}, + {0xc028, 165}, + {0x8002, 166}, + {0x8009, 166}, + {0x8017, 166}, + {0xc028, 166}, + {0x8002, 168}, + {0x8009, 168}, + {0x8017, 168}, + {0xc028, 168}, + {0x8002, 174}, + {0x8009, 174}, + {0x8017, 174}, + {0xc028, 174}, + }, + /* 164 */ + { + {0x8003, 165}, + {0x8006, 165}, + {0x800a, 165}, + {0x800f, 165}, + {0x8018, 165}, + {0x801f, 165}, + {0x8029, 165}, + {0xc038, 165}, + {0x8003, 166}, + {0x8006, 166}, + {0x800a, 166}, + {0x800f, 166}, + {0x8018, 166}, + {0x801f, 166}, + {0x8029, 166}, + {0xc038, 166}, + }, + /* 165 */ + { + {0x8003, 168}, + {0x8006, 168}, + {0x800a, 168}, + {0x800f, 168}, + {0x8018, 168}, + {0x801f, 168}, + {0x8029, 168}, + {0xc038, 168}, + {0x8003, 174}, + {0x8006, 174}, + {0x800a, 174}, + {0x800f, 174}, + {0x8018, 174}, + {0x801f, 174}, + {0x8029, 174}, + {0xc038, 174}, + }, + /* 166 */ + { + {0x8002, 175}, + {0x8009, 175}, + {0x8017, 175}, + {0xc028, 175}, + {0x8002, 180}, + {0x8009, 180}, + {0x8017, 180}, + {0xc028, 180}, + {0x8002, 182}, + {0x8009, 182}, + {0x8017, 182}, + {0xc028, 182}, + {0x8002, 183}, + {0x8009, 183}, + {0x8017, 183}, + {0xc028, 183}, + }, + /* 167 */ + { + {0x8003, 175}, + {0x8006, 175}, + {0x800a, 175}, + {0x800f, 175}, + {0x8018, 175}, + {0x801f, 175}, + {0x8029, 175}, + {0xc038, 175}, + {0x8003, 180}, + {0x8006, 180}, + {0x800a, 180}, + {0x800f, 180}, + {0x8018, 180}, + {0x801f, 180}, + {0x8029, 180}, + {0xc038, 180}, + }, + /* 168 */ + { + {0x8003, 182}, + {0x8006, 182}, + {0x800a, 182}, + {0x800f, 182}, + {0x8018, 182}, + {0x801f, 182}, + {0x8029, 182}, + {0xc038, 182}, + {0x8003, 183}, + {0x8006, 183}, + {0x800a, 183}, + {0x800f, 183}, + {0x8018, 183}, + {0x801f, 183}, + {0x8029, 183}, + {0xc038, 183}, + }, + /* 169 */ + { + {0xc000, 188}, + {0xc000, 191}, + {0xc000, 197}, + {0xc000, 231}, + {0xc000, 239}, + {0xb0, 0}, + {0xb2, 0}, + {0xb3, 0}, + {0xb7, 0}, + {0xb8, 0}, + {0xba, 0}, + {0xbb, 0}, + {0xc0, 0}, + {0xc7, 0}, + {0xd0, 0}, + {0xdf, 0}, + }, + /* 170 */ + { + {0x8001, 188}, + {0xc016, 188}, + {0x8001, 191}, + {0xc016, 191}, + {0x8001, 197}, + {0xc016, 197}, + {0x8001, 231}, + {0xc016, 231}, + {0x8001, 239}, + {0xc016, 239}, + {0xc000, 9}, + {0xc000, 142}, + {0xc000, 144}, + {0xc000, 145}, + {0xc000, 148}, + {0xc000, 159}, + }, + /* 171 */ + { + {0x8002, 188}, + {0x8009, 188}, + {0x8017, 188}, + {0xc028, 188}, + {0x8002, 191}, + {0x8009, 191}, + {0x8017, 191}, + {0xc028, 191}, + {0x8002, 
197}, + {0x8009, 197}, + {0x8017, 197}, + {0xc028, 197}, + {0x8002, 231}, + {0x8009, 231}, + {0x8017, 231}, + {0xc028, 231}, + }, + /* 172 */ + { + {0x8003, 188}, + {0x8006, 188}, + {0x800a, 188}, + {0x800f, 188}, + {0x8018, 188}, + {0x801f, 188}, + {0x8029, 188}, + {0xc038, 188}, + {0x8003, 191}, + {0x8006, 191}, + {0x800a, 191}, + {0x800f, 191}, + {0x8018, 191}, + {0x801f, 191}, + {0x8029, 191}, + {0xc038, 191}, + }, + /* 173 */ + { + {0x8003, 197}, + {0x8006, 197}, + {0x800a, 197}, + {0x800f, 197}, + {0x8018, 197}, + {0x801f, 197}, + {0x8029, 197}, + {0xc038, 197}, + {0x8003, 231}, + {0x8006, 231}, + {0x800a, 231}, + {0x800f, 231}, + {0x8018, 231}, + {0x801f, 231}, + {0x8029, 231}, + {0xc038, 231}, + }, + /* 174 */ + { + {0x8002, 239}, + {0x8009, 239}, + {0x8017, 239}, + {0xc028, 239}, + {0x8001, 9}, + {0xc016, 9}, + {0x8001, 142}, + {0xc016, 142}, + {0x8001, 144}, + {0xc016, 144}, + {0x8001, 145}, + {0xc016, 145}, + {0x8001, 148}, + {0xc016, 148}, + {0x8001, 159}, + {0xc016, 159}, + }, + /* 175 */ + { + {0x8003, 239}, + {0x8006, 239}, + {0x800a, 239}, + {0x800f, 239}, + {0x8018, 239}, + {0x801f, 239}, + {0x8029, 239}, + {0xc038, 239}, + {0x8002, 9}, + {0x8009, 9}, + {0x8017, 9}, + {0xc028, 9}, + {0x8002, 142}, + {0x8009, 142}, + {0x8017, 142}, + {0xc028, 142}, + }, + /* 176 */ + { + {0x8003, 9}, + {0x8006, 9}, + {0x800a, 9}, + {0x800f, 9}, + {0x8018, 9}, + {0x801f, 9}, + {0x8029, 9}, + {0xc038, 9}, + {0x8003, 142}, + {0x8006, 142}, + {0x800a, 142}, + {0x800f, 142}, + {0x8018, 142}, + {0x801f, 142}, + {0x8029, 142}, + {0xc038, 142}, + }, + /* 177 */ + { + {0x8002, 144}, + {0x8009, 144}, + {0x8017, 144}, + {0xc028, 144}, + {0x8002, 145}, + {0x8009, 145}, + {0x8017, 145}, + {0xc028, 145}, + {0x8002, 148}, + {0x8009, 148}, + {0x8017, 148}, + {0xc028, 148}, + {0x8002, 159}, + {0x8009, 159}, + {0x8017, 159}, + {0xc028, 159}, + }, + /* 178 */ + { + {0x8003, 144}, + {0x8006, 144}, + {0x800a, 144}, + {0x800f, 144}, + {0x8018, 144}, + {0x801f, 144}, + {0x8029, 144}, + {0xc038, 144}, + {0x8003, 145}, + {0x8006, 145}, + {0x800a, 145}, + {0x800f, 145}, + {0x8018, 145}, + {0x801f, 145}, + {0x8029, 145}, + {0xc038, 145}, + }, + /* 179 */ + { + {0x8003, 148}, + {0x8006, 148}, + {0x800a, 148}, + {0x800f, 148}, + {0x8018, 148}, + {0x801f, 148}, + {0x8029, 148}, + {0xc038, 148}, + {0x8003, 159}, + {0x8006, 159}, + {0x800a, 159}, + {0x800f, 159}, + {0x8018, 159}, + {0x801f, 159}, + {0x8029, 159}, + {0xc038, 159}, + }, + /* 180 */ + { + {0xc000, 171}, + {0xc000, 206}, + {0xc000, 215}, + {0xc000, 225}, + {0xc000, 236}, + {0xc000, 237}, + {0xbc, 0}, + {0xbd, 0}, + {0xc1, 0}, + {0xc4, 0}, + {0xc8, 0}, + {0xcb, 0}, + {0xd1, 0}, + {0xd8, 0}, + {0xe0, 0}, + {0xee, 0}, + }, + /* 181 */ + { + {0x8001, 171}, + {0xc016, 171}, + {0x8001, 206}, + {0xc016, 206}, + {0x8001, 215}, + {0xc016, 215}, + {0x8001, 225}, + {0xc016, 225}, + {0x8001, 236}, + {0xc016, 236}, + {0x8001, 237}, + {0xc016, 237}, + {0xc000, 199}, + {0xc000, 207}, + {0xc000, 234}, + {0xc000, 235}, + }, + /* 182 */ + { + {0x8002, 171}, + {0x8009, 171}, + {0x8017, 171}, + {0xc028, 171}, + {0x8002, 206}, + {0x8009, 206}, + {0x8017, 206}, + {0xc028, 206}, + {0x8002, 215}, + {0x8009, 215}, + {0x8017, 215}, + {0xc028, 215}, + {0x8002, 225}, + {0x8009, 225}, + {0x8017, 225}, + {0xc028, 225}, + }, + /* 183 */ + { + {0x8003, 171}, + {0x8006, 171}, + {0x800a, 171}, + {0x800f, 171}, + {0x8018, 171}, + {0x801f, 171}, + {0x8029, 171}, + {0xc038, 171}, + {0x8003, 206}, + {0x8006, 206}, + {0x800a, 206}, + {0x800f, 206}, + {0x8018, 206}, + {0x801f, 206}, + {0x8029, 
206}, + {0xc038, 206}, + }, + /* 184 */ + { + {0x8003, 215}, + {0x8006, 215}, + {0x800a, 215}, + {0x800f, 215}, + {0x8018, 215}, + {0x801f, 215}, + {0x8029, 215}, + {0xc038, 215}, + {0x8003, 225}, + {0x8006, 225}, + {0x800a, 225}, + {0x800f, 225}, + {0x8018, 225}, + {0x801f, 225}, + {0x8029, 225}, + {0xc038, 225}, + }, + /* 185 */ + { + {0x8002, 236}, + {0x8009, 236}, + {0x8017, 236}, + {0xc028, 236}, + {0x8002, 237}, + {0x8009, 237}, + {0x8017, 237}, + {0xc028, 237}, + {0x8001, 199}, + {0xc016, 199}, + {0x8001, 207}, + {0xc016, 207}, + {0x8001, 234}, + {0xc016, 234}, + {0x8001, 235}, + {0xc016, 235}, + }, + /* 186 */ + { + {0x8003, 236}, + {0x8006, 236}, + {0x800a, 236}, + {0x800f, 236}, + {0x8018, 236}, + {0x801f, 236}, + {0x8029, 236}, + {0xc038, 236}, + {0x8003, 237}, + {0x8006, 237}, + {0x800a, 237}, + {0x800f, 237}, + {0x8018, 237}, + {0x801f, 237}, + {0x8029, 237}, + {0xc038, 237}, + }, + /* 187 */ + { + {0x8002, 199}, + {0x8009, 199}, + {0x8017, 199}, + {0xc028, 199}, + {0x8002, 207}, + {0x8009, 207}, + {0x8017, 207}, + {0xc028, 207}, + {0x8002, 234}, + {0x8009, 234}, + {0x8017, 234}, + {0xc028, 234}, + {0x8002, 235}, + {0x8009, 235}, + {0x8017, 235}, + {0xc028, 235}, + }, + /* 188 */ + { + {0x8003, 199}, + {0x8006, 199}, + {0x800a, 199}, + {0x800f, 199}, + {0x8018, 199}, + {0x801f, 199}, + {0x8029, 199}, + {0xc038, 199}, + {0x8003, 207}, + {0x8006, 207}, + {0x800a, 207}, + {0x800f, 207}, + {0x8018, 207}, + {0x801f, 207}, + {0x8029, 207}, + {0xc038, 207}, + }, + /* 189 */ + { + {0x8003, 234}, + {0x8006, 234}, + {0x800a, 234}, + {0x800f, 234}, + {0x8018, 234}, + {0x801f, 234}, + {0x8029, 234}, + {0xc038, 234}, + {0x8003, 235}, + {0x8006, 235}, + {0x800a, 235}, + {0x800f, 235}, + {0x8018, 235}, + {0x801f, 235}, + {0x8029, 235}, + {0xc038, 235}, + }, + /* 190 */ + { + {0xc2, 0}, + {0xc3, 0}, + {0xc5, 0}, + {0xc6, 0}, + {0xc9, 0}, + {0xca, 0}, + {0xcc, 0}, + {0xcd, 0}, + {0xd2, 0}, + {0xd5, 0}, + {0xd9, 0}, + {0xdc, 0}, + {0xe1, 0}, + {0xe7, 0}, + {0xef, 0}, + {0xf6, 0}, + }, + /* 191 */ + { + {0xc000, 192}, + {0xc000, 193}, + {0xc000, 200}, + {0xc000, 201}, + {0xc000, 202}, + {0xc000, 205}, + {0xc000, 210}, + {0xc000, 213}, + {0xc000, 218}, + {0xc000, 219}, + {0xc000, 238}, + {0xc000, 240}, + {0xc000, 242}, + {0xc000, 243}, + {0xc000, 255}, + {0xce, 0}, + }, + /* 192 */ + { + {0x8001, 192}, + {0xc016, 192}, + {0x8001, 193}, + {0xc016, 193}, + {0x8001, 200}, + {0xc016, 200}, + {0x8001, 201}, + {0xc016, 201}, + {0x8001, 202}, + {0xc016, 202}, + {0x8001, 205}, + {0xc016, 205}, + {0x8001, 210}, + {0xc016, 210}, + {0x8001, 213}, + {0xc016, 213}, + }, + /* 193 */ + { + {0x8002, 192}, + {0x8009, 192}, + {0x8017, 192}, + {0xc028, 192}, + {0x8002, 193}, + {0x8009, 193}, + {0x8017, 193}, + {0xc028, 193}, + {0x8002, 200}, + {0x8009, 200}, + {0x8017, 200}, + {0xc028, 200}, + {0x8002, 201}, + {0x8009, 201}, + {0x8017, 201}, + {0xc028, 201}, + }, + /* 194 */ + { + {0x8003, 192}, + {0x8006, 192}, + {0x800a, 192}, + {0x800f, 192}, + {0x8018, 192}, + {0x801f, 192}, + {0x8029, 192}, + {0xc038, 192}, + {0x8003, 193}, + {0x8006, 193}, + {0x800a, 193}, + {0x800f, 193}, + {0x8018, 193}, + {0x801f, 193}, + {0x8029, 193}, + {0xc038, 193}, + }, + /* 195 */ + { + {0x8003, 200}, + {0x8006, 200}, + {0x800a, 200}, + {0x800f, 200}, + {0x8018, 200}, + {0x801f, 200}, + {0x8029, 200}, + {0xc038, 200}, + {0x8003, 201}, + {0x8006, 201}, + {0x800a, 201}, + {0x800f, 201}, + {0x8018, 201}, + {0x801f, 201}, + {0x8029, 201}, + {0xc038, 201}, + }, + /* 196 */ + { + {0x8002, 202}, + {0x8009, 202}, + {0x8017, 202}, + {0xc028, 
202}, + {0x8002, 205}, + {0x8009, 205}, + {0x8017, 205}, + {0xc028, 205}, + {0x8002, 210}, + {0x8009, 210}, + {0x8017, 210}, + {0xc028, 210}, + {0x8002, 213}, + {0x8009, 213}, + {0x8017, 213}, + {0xc028, 213}, + }, + /* 197 */ + { + {0x8003, 202}, + {0x8006, 202}, + {0x800a, 202}, + {0x800f, 202}, + {0x8018, 202}, + {0x801f, 202}, + {0x8029, 202}, + {0xc038, 202}, + {0x8003, 205}, + {0x8006, 205}, + {0x800a, 205}, + {0x800f, 205}, + {0x8018, 205}, + {0x801f, 205}, + {0x8029, 205}, + {0xc038, 205}, + }, + /* 198 */ + { + {0x8003, 210}, + {0x8006, 210}, + {0x800a, 210}, + {0x800f, 210}, + {0x8018, 210}, + {0x801f, 210}, + {0x8029, 210}, + {0xc038, 210}, + {0x8003, 213}, + {0x8006, 213}, + {0x800a, 213}, + {0x800f, 213}, + {0x8018, 213}, + {0x801f, 213}, + {0x8029, 213}, + {0xc038, 213}, + }, + /* 199 */ + { + {0x8001, 218}, + {0xc016, 218}, + {0x8001, 219}, + {0xc016, 219}, + {0x8001, 238}, + {0xc016, 238}, + {0x8001, 240}, + {0xc016, 240}, + {0x8001, 242}, + {0xc016, 242}, + {0x8001, 243}, + {0xc016, 243}, + {0x8001, 255}, + {0xc016, 255}, + {0xc000, 203}, + {0xc000, 204}, + }, + /* 200 */ + { + {0x8002, 218}, + {0x8009, 218}, + {0x8017, 218}, + {0xc028, 218}, + {0x8002, 219}, + {0x8009, 219}, + {0x8017, 219}, + {0xc028, 219}, + {0x8002, 238}, + {0x8009, 238}, + {0x8017, 238}, + {0xc028, 238}, + {0x8002, 240}, + {0x8009, 240}, + {0x8017, 240}, + {0xc028, 240}, + }, + /* 201 */ + { + {0x8003, 218}, + {0x8006, 218}, + {0x800a, 218}, + {0x800f, 218}, + {0x8018, 218}, + {0x801f, 218}, + {0x8029, 218}, + {0xc038, 218}, + {0x8003, 219}, + {0x8006, 219}, + {0x800a, 219}, + {0x800f, 219}, + {0x8018, 219}, + {0x801f, 219}, + {0x8029, 219}, + {0xc038, 219}, + }, + /* 202 */ + { + {0x8003, 238}, + {0x8006, 238}, + {0x800a, 238}, + {0x800f, 238}, + {0x8018, 238}, + {0x801f, 238}, + {0x8029, 238}, + {0xc038, 238}, + {0x8003, 240}, + {0x8006, 240}, + {0x800a, 240}, + {0x800f, 240}, + {0x8018, 240}, + {0x801f, 240}, + {0x8029, 240}, + {0xc038, 240}, + }, + /* 203 */ + { + {0x8002, 242}, + {0x8009, 242}, + {0x8017, 242}, + {0xc028, 242}, + {0x8002, 243}, + {0x8009, 243}, + {0x8017, 243}, + {0xc028, 243}, + {0x8002, 255}, + {0x8009, 255}, + {0x8017, 255}, + {0xc028, 255}, + {0x8001, 203}, + {0xc016, 203}, + {0x8001, 204}, + {0xc016, 204}, + }, + /* 204 */ + { + {0x8003, 242}, + {0x8006, 242}, + {0x800a, 242}, + {0x800f, 242}, + {0x8018, 242}, + {0x801f, 242}, + {0x8029, 242}, + {0xc038, 242}, + {0x8003, 243}, + {0x8006, 243}, + {0x800a, 243}, + {0x800f, 243}, + {0x8018, 243}, + {0x801f, 243}, + {0x8029, 243}, + {0xc038, 243}, + }, + /* 205 */ + { + {0x8003, 255}, + {0x8006, 255}, + {0x800a, 255}, + {0x800f, 255}, + {0x8018, 255}, + {0x801f, 255}, + {0x8029, 255}, + {0xc038, 255}, + {0x8002, 203}, + {0x8009, 203}, + {0x8017, 203}, + {0xc028, 203}, + {0x8002, 204}, + {0x8009, 204}, + {0x8017, 204}, + {0xc028, 204}, + }, + /* 206 */ + { + {0x8003, 203}, + {0x8006, 203}, + {0x800a, 203}, + {0x800f, 203}, + {0x8018, 203}, + {0x801f, 203}, + {0x8029, 203}, + {0xc038, 203}, + {0x8003, 204}, + {0x8006, 204}, + {0x800a, 204}, + {0x800f, 204}, + {0x8018, 204}, + {0x801f, 204}, + {0x8029, 204}, + {0xc038, 204}, + }, + /* 207 */ + { + {0xd3, 0}, + {0xd4, 0}, + {0xd6, 0}, + {0xd7, 0}, + {0xda, 0}, + {0xdb, 0}, + {0xdd, 0}, + {0xde, 0}, + {0xe2, 0}, + {0xe4, 0}, + {0xe8, 0}, + {0xeb, 0}, + {0xf0, 0}, + {0xf3, 0}, + {0xf7, 0}, + {0xfa, 0}, + }, + /* 208 */ + { + {0xc000, 211}, + {0xc000, 212}, + {0xc000, 214}, + {0xc000, 221}, + {0xc000, 222}, + {0xc000, 223}, + {0xc000, 241}, + {0xc000, 244}, + {0xc000, 245}, + {0xc000, 
246}, + {0xc000, 247}, + {0xc000, 248}, + {0xc000, 250}, + {0xc000, 251}, + {0xc000, 252}, + {0xc000, 253}, + }, + /* 209 */ + { + {0x8001, 211}, + {0xc016, 211}, + {0x8001, 212}, + {0xc016, 212}, + {0x8001, 214}, + {0xc016, 214}, + {0x8001, 221}, + {0xc016, 221}, + {0x8001, 222}, + {0xc016, 222}, + {0x8001, 223}, + {0xc016, 223}, + {0x8001, 241}, + {0xc016, 241}, + {0x8001, 244}, + {0xc016, 244}, + }, + /* 210 */ + { + {0x8002, 211}, + {0x8009, 211}, + {0x8017, 211}, + {0xc028, 211}, + {0x8002, 212}, + {0x8009, 212}, + {0x8017, 212}, + {0xc028, 212}, + {0x8002, 214}, + {0x8009, 214}, + {0x8017, 214}, + {0xc028, 214}, + {0x8002, 221}, + {0x8009, 221}, + {0x8017, 221}, + {0xc028, 221}, + }, + /* 211 */ + { + {0x8003, 211}, + {0x8006, 211}, + {0x800a, 211}, + {0x800f, 211}, + {0x8018, 211}, + {0x801f, 211}, + {0x8029, 211}, + {0xc038, 211}, + {0x8003, 212}, + {0x8006, 212}, + {0x800a, 212}, + {0x800f, 212}, + {0x8018, 212}, + {0x801f, 212}, + {0x8029, 212}, + {0xc038, 212}, + }, + /* 212 */ + { + {0x8003, 214}, + {0x8006, 214}, + {0x800a, 214}, + {0x800f, 214}, + {0x8018, 214}, + {0x801f, 214}, + {0x8029, 214}, + {0xc038, 214}, + {0x8003, 221}, + {0x8006, 221}, + {0x800a, 221}, + {0x800f, 221}, + {0x8018, 221}, + {0x801f, 221}, + {0x8029, 221}, + {0xc038, 221}, + }, + /* 213 */ + { + {0x8002, 222}, + {0x8009, 222}, + {0x8017, 222}, + {0xc028, 222}, + {0x8002, 223}, + {0x8009, 223}, + {0x8017, 223}, + {0xc028, 223}, + {0x8002, 241}, + {0x8009, 241}, + {0x8017, 241}, + {0xc028, 241}, + {0x8002, 244}, + {0x8009, 244}, + {0x8017, 244}, + {0xc028, 244}, + }, + /* 214 */ + { + {0x8003, 222}, + {0x8006, 222}, + {0x800a, 222}, + {0x800f, 222}, + {0x8018, 222}, + {0x801f, 222}, + {0x8029, 222}, + {0xc038, 222}, + {0x8003, 223}, + {0x8006, 223}, + {0x800a, 223}, + {0x800f, 223}, + {0x8018, 223}, + {0x801f, 223}, + {0x8029, 223}, + {0xc038, 223}, + }, + /* 215 */ + { + {0x8003, 241}, + {0x8006, 241}, + {0x800a, 241}, + {0x800f, 241}, + {0x8018, 241}, + {0x801f, 241}, + {0x8029, 241}, + {0xc038, 241}, + {0x8003, 244}, + {0x8006, 244}, + {0x800a, 244}, + {0x800f, 244}, + {0x8018, 244}, + {0x801f, 244}, + {0x8029, 244}, + {0xc038, 244}, + }, + /* 216 */ + { + {0x8001, 245}, + {0xc016, 245}, + {0x8001, 246}, + {0xc016, 246}, + {0x8001, 247}, + {0xc016, 247}, + {0x8001, 248}, + {0xc016, 248}, + {0x8001, 250}, + {0xc016, 250}, + {0x8001, 251}, + {0xc016, 251}, + {0x8001, 252}, + {0xc016, 252}, + {0x8001, 253}, + {0xc016, 253}, + }, + /* 217 */ + { + {0x8002, 245}, + {0x8009, 245}, + {0x8017, 245}, + {0xc028, 245}, + {0x8002, 246}, + {0x8009, 246}, + {0x8017, 246}, + {0xc028, 246}, + {0x8002, 247}, + {0x8009, 247}, + {0x8017, 247}, + {0xc028, 247}, + {0x8002, 248}, + {0x8009, 248}, + {0x8017, 248}, + {0xc028, 248}, + }, + /* 218 */ + { + {0x8003, 245}, + {0x8006, 245}, + {0x800a, 245}, + {0x800f, 245}, + {0x8018, 245}, + {0x801f, 245}, + {0x8029, 245}, + {0xc038, 245}, + {0x8003, 246}, + {0x8006, 246}, + {0x800a, 246}, + {0x800f, 246}, + {0x8018, 246}, + {0x801f, 246}, + {0x8029, 246}, + {0xc038, 246}, + }, + /* 219 */ + { + {0x8003, 247}, + {0x8006, 247}, + {0x800a, 247}, + {0x800f, 247}, + {0x8018, 247}, + {0x801f, 247}, + {0x8029, 247}, + {0xc038, 247}, + {0x8003, 248}, + {0x8006, 248}, + {0x800a, 248}, + {0x800f, 248}, + {0x8018, 248}, + {0x801f, 248}, + {0x8029, 248}, + {0xc038, 248}, + }, + /* 220 */ + { + {0x8002, 250}, + {0x8009, 250}, + {0x8017, 250}, + {0xc028, 250}, + {0x8002, 251}, + {0x8009, 251}, + {0x8017, 251}, + {0xc028, 251}, + {0x8002, 252}, + {0x8009, 252}, + {0x8017, 252}, + {0xc028, 
252}, + {0x8002, 253}, + {0x8009, 253}, + {0x8017, 253}, + {0xc028, 253}, + }, + /* 221 */ + { + {0x8003, 250}, + {0x8006, 250}, + {0x800a, 250}, + {0x800f, 250}, + {0x8018, 250}, + {0x801f, 250}, + {0x8029, 250}, + {0xc038, 250}, + {0x8003, 251}, + {0x8006, 251}, + {0x800a, 251}, + {0x800f, 251}, + {0x8018, 251}, + {0x801f, 251}, + {0x8029, 251}, + {0xc038, 251}, + }, + /* 222 */ + { + {0x8003, 252}, + {0x8006, 252}, + {0x800a, 252}, + {0x800f, 252}, + {0x8018, 252}, + {0x801f, 252}, + {0x8029, 252}, + {0xc038, 252}, + {0x8003, 253}, + {0x8006, 253}, + {0x800a, 253}, + {0x800f, 253}, + {0x8018, 253}, + {0x801f, 253}, + {0x8029, 253}, + {0xc038, 253}, + }, + /* 223 */ + { + {0xc000, 254}, + {0xe3, 0}, + {0xe5, 0}, + {0xe6, 0}, + {0xe9, 0}, + {0xea, 0}, + {0xec, 0}, + {0xed, 0}, + {0xf1, 0}, + {0xf2, 0}, + {0xf4, 0}, + {0xf5, 0}, + {0xf8, 0}, + {0xf9, 0}, + {0xfb, 0}, + {0xfc, 0}, + }, + /* 224 */ + { + {0x8001, 254}, + {0xc016, 254}, + {0xc000, 2}, + {0xc000, 3}, + {0xc000, 4}, + {0xc000, 5}, + {0xc000, 6}, + {0xc000, 7}, + {0xc000, 8}, + {0xc000, 11}, + {0xc000, 12}, + {0xc000, 14}, + {0xc000, 15}, + {0xc000, 16}, + {0xc000, 17}, + {0xc000, 18}, + }, + /* 225 */ + { + {0x8002, 254}, + {0x8009, 254}, + {0x8017, 254}, + {0xc028, 254}, + {0x8001, 2}, + {0xc016, 2}, + {0x8001, 3}, + {0xc016, 3}, + {0x8001, 4}, + {0xc016, 4}, + {0x8001, 5}, + {0xc016, 5}, + {0x8001, 6}, + {0xc016, 6}, + {0x8001, 7}, + {0xc016, 7}, + }, + /* 226 */ + { + {0x8003, 254}, + {0x8006, 254}, + {0x800a, 254}, + {0x800f, 254}, + {0x8018, 254}, + {0x801f, 254}, + {0x8029, 254}, + {0xc038, 254}, + {0x8002, 2}, + {0x8009, 2}, + {0x8017, 2}, + {0xc028, 2}, + {0x8002, 3}, + {0x8009, 3}, + {0x8017, 3}, + {0xc028, 3}, + }, + /* 227 */ + { + {0x8003, 2}, + {0x8006, 2}, + {0x800a, 2}, + {0x800f, 2}, + {0x8018, 2}, + {0x801f, 2}, + {0x8029, 2}, + {0xc038, 2}, + {0x8003, 3}, + {0x8006, 3}, + {0x800a, 3}, + {0x800f, 3}, + {0x8018, 3}, + {0x801f, 3}, + {0x8029, 3}, + {0xc038, 3}, + }, + /* 228 */ + { + {0x8002, 4}, + {0x8009, 4}, + {0x8017, 4}, + {0xc028, 4}, + {0x8002, 5}, + {0x8009, 5}, + {0x8017, 5}, + {0xc028, 5}, + {0x8002, 6}, + {0x8009, 6}, + {0x8017, 6}, + {0xc028, 6}, + {0x8002, 7}, + {0x8009, 7}, + {0x8017, 7}, + {0xc028, 7}, + }, + /* 229 */ + { + {0x8003, 4}, + {0x8006, 4}, + {0x800a, 4}, + {0x800f, 4}, + {0x8018, 4}, + {0x801f, 4}, + {0x8029, 4}, + {0xc038, 4}, + {0x8003, 5}, + {0x8006, 5}, + {0x800a, 5}, + {0x800f, 5}, + {0x8018, 5}, + {0x801f, 5}, + {0x8029, 5}, + {0xc038, 5}, + }, + /* 230 */ + { + {0x8003, 6}, + {0x8006, 6}, + {0x800a, 6}, + {0x800f, 6}, + {0x8018, 6}, + {0x801f, 6}, + {0x8029, 6}, + {0xc038, 6}, + {0x8003, 7}, + {0x8006, 7}, + {0x800a, 7}, + {0x800f, 7}, + {0x8018, 7}, + {0x801f, 7}, + {0x8029, 7}, + {0xc038, 7}, + }, + /* 231 */ + { + {0x8001, 8}, + {0xc016, 8}, + {0x8001, 11}, + {0xc016, 11}, + {0x8001, 12}, + {0xc016, 12}, + {0x8001, 14}, + {0xc016, 14}, + {0x8001, 15}, + {0xc016, 15}, + {0x8001, 16}, + {0xc016, 16}, + {0x8001, 17}, + {0xc016, 17}, + {0x8001, 18}, + {0xc016, 18}, + }, + /* 232 */ + { + {0x8002, 8}, + {0x8009, 8}, + {0x8017, 8}, + {0xc028, 8}, + {0x8002, 11}, + {0x8009, 11}, + {0x8017, 11}, + {0xc028, 11}, + {0x8002, 12}, + {0x8009, 12}, + {0x8017, 12}, + {0xc028, 12}, + {0x8002, 14}, + {0x8009, 14}, + {0x8017, 14}, + {0xc028, 14}, + }, + /* 233 */ + { + {0x8003, 8}, + {0x8006, 8}, + {0x800a, 8}, + {0x800f, 8}, + {0x8018, 8}, + {0x801f, 8}, + {0x8029, 8}, + {0xc038, 8}, + {0x8003, 11}, + {0x8006, 11}, + {0x800a, 11}, + {0x800f, 11}, + {0x8018, 11}, + {0x801f, 11}, + {0x8029, 
11}, + {0xc038, 11}, + }, + /* 234 */ + { + {0x8003, 12}, + {0x8006, 12}, + {0x800a, 12}, + {0x800f, 12}, + {0x8018, 12}, + {0x801f, 12}, + {0x8029, 12}, + {0xc038, 12}, + {0x8003, 14}, + {0x8006, 14}, + {0x800a, 14}, + {0x800f, 14}, + {0x8018, 14}, + {0x801f, 14}, + {0x8029, 14}, + {0xc038, 14}, + }, + /* 235 */ + { + {0x8002, 15}, + {0x8009, 15}, + {0x8017, 15}, + {0xc028, 15}, + {0x8002, 16}, + {0x8009, 16}, + {0x8017, 16}, + {0xc028, 16}, + {0x8002, 17}, + {0x8009, 17}, + {0x8017, 17}, + {0xc028, 17}, + {0x8002, 18}, + {0x8009, 18}, + {0x8017, 18}, + {0xc028, 18}, + }, + /* 236 */ + { + {0x8003, 15}, + {0x8006, 15}, + {0x800a, 15}, + {0x800f, 15}, + {0x8018, 15}, + {0x801f, 15}, + {0x8029, 15}, + {0xc038, 15}, + {0x8003, 16}, + {0x8006, 16}, + {0x800a, 16}, + {0x800f, 16}, + {0x8018, 16}, + {0x801f, 16}, + {0x8029, 16}, + {0xc038, 16}, + }, + /* 237 */ + { + {0x8003, 17}, + {0x8006, 17}, + {0x800a, 17}, + {0x800f, 17}, + {0x8018, 17}, + {0x801f, 17}, + {0x8029, 17}, + {0xc038, 17}, + {0x8003, 18}, + {0x8006, 18}, + {0x800a, 18}, + {0x800f, 18}, + {0x8018, 18}, + {0x801f, 18}, + {0x8029, 18}, + {0xc038, 18}, + }, + /* 238 */ + { + {0xc000, 19}, + {0xc000, 20}, + {0xc000, 21}, + {0xc000, 23}, + {0xc000, 24}, + {0xc000, 25}, + {0xc000, 26}, + {0xc000, 27}, + {0xc000, 28}, + {0xc000, 29}, + {0xc000, 30}, + {0xc000, 31}, + {0xc000, 127}, + {0xc000, 220}, + {0xc000, 249}, + {0xfd, 0}, + }, + /* 239 */ + { + {0x8001, 19}, + {0xc016, 19}, + {0x8001, 20}, + {0xc016, 20}, + {0x8001, 21}, + {0xc016, 21}, + {0x8001, 23}, + {0xc016, 23}, + {0x8001, 24}, + {0xc016, 24}, + {0x8001, 25}, + {0xc016, 25}, + {0x8001, 26}, + {0xc016, 26}, + {0x8001, 27}, + {0xc016, 27}, + }, + /* 240 */ + { + {0x8002, 19}, + {0x8009, 19}, + {0x8017, 19}, + {0xc028, 19}, + {0x8002, 20}, + {0x8009, 20}, + {0x8017, 20}, + {0xc028, 20}, + {0x8002, 21}, + {0x8009, 21}, + {0x8017, 21}, + {0xc028, 21}, + {0x8002, 23}, + {0x8009, 23}, + {0x8017, 23}, + {0xc028, 23}, + }, + /* 241 */ + { + {0x8003, 19}, + {0x8006, 19}, + {0x800a, 19}, + {0x800f, 19}, + {0x8018, 19}, + {0x801f, 19}, + {0x8029, 19}, + {0xc038, 19}, + {0x8003, 20}, + {0x8006, 20}, + {0x800a, 20}, + {0x800f, 20}, + {0x8018, 20}, + {0x801f, 20}, + {0x8029, 20}, + {0xc038, 20}, + }, + /* 242 */ + { + {0x8003, 21}, + {0x8006, 21}, + {0x800a, 21}, + {0x800f, 21}, + {0x8018, 21}, + {0x801f, 21}, + {0x8029, 21}, + {0xc038, 21}, + {0x8003, 23}, + {0x8006, 23}, + {0x800a, 23}, + {0x800f, 23}, + {0x8018, 23}, + {0x801f, 23}, + {0x8029, 23}, + {0xc038, 23}, + }, + /* 243 */ + { + {0x8002, 24}, + {0x8009, 24}, + {0x8017, 24}, + {0xc028, 24}, + {0x8002, 25}, + {0x8009, 25}, + {0x8017, 25}, + {0xc028, 25}, + {0x8002, 26}, + {0x8009, 26}, + {0x8017, 26}, + {0xc028, 26}, + {0x8002, 27}, + {0x8009, 27}, + {0x8017, 27}, + {0xc028, 27}, + }, + /* 244 */ + { + {0x8003, 24}, + {0x8006, 24}, + {0x800a, 24}, + {0x800f, 24}, + {0x8018, 24}, + {0x801f, 24}, + {0x8029, 24}, + {0xc038, 24}, + {0x8003, 25}, + {0x8006, 25}, + {0x800a, 25}, + {0x800f, 25}, + {0x8018, 25}, + {0x801f, 25}, + {0x8029, 25}, + {0xc038, 25}, + }, + /* 245 */ + { + {0x8003, 26}, + {0x8006, 26}, + {0x800a, 26}, + {0x800f, 26}, + {0x8018, 26}, + {0x801f, 26}, + {0x8029, 26}, + {0xc038, 26}, + {0x8003, 27}, + {0x8006, 27}, + {0x800a, 27}, + {0x800f, 27}, + {0x8018, 27}, + {0x801f, 27}, + {0x8029, 27}, + {0xc038, 27}, + }, + /* 246 */ + { + {0x8001, 28}, + {0xc016, 28}, + {0x8001, 29}, + {0xc016, 29}, + {0x8001, 30}, + {0xc016, 30}, + {0x8001, 31}, + {0xc016, 31}, + {0x8001, 127}, + {0xc016, 127}, + {0x8001, 220}, + 
{0xc016, 220}, + {0x8001, 249}, + {0xc016, 249}, + {0xfe, 0}, + {0xff, 0}, + }, + /* 247 */ + { + {0x8002, 28}, + {0x8009, 28}, + {0x8017, 28}, + {0xc028, 28}, + {0x8002, 29}, + {0x8009, 29}, + {0x8017, 29}, + {0xc028, 29}, + {0x8002, 30}, + {0x8009, 30}, + {0x8017, 30}, + {0xc028, 30}, + {0x8002, 31}, + {0x8009, 31}, + {0x8017, 31}, + {0xc028, 31}, + }, + /* 248 */ + { + {0x8003, 28}, + {0x8006, 28}, + {0x800a, 28}, + {0x800f, 28}, + {0x8018, 28}, + {0x801f, 28}, + {0x8029, 28}, + {0xc038, 28}, + {0x8003, 29}, + {0x8006, 29}, + {0x800a, 29}, + {0x800f, 29}, + {0x8018, 29}, + {0x801f, 29}, + {0x8029, 29}, + {0xc038, 29}, + }, + /* 249 */ + { + {0x8003, 30}, + {0x8006, 30}, + {0x800a, 30}, + {0x800f, 30}, + {0x8018, 30}, + {0x801f, 30}, + {0x8029, 30}, + {0xc038, 30}, + {0x8003, 31}, + {0x8006, 31}, + {0x800a, 31}, + {0x800f, 31}, + {0x8018, 31}, + {0x801f, 31}, + {0x8029, 31}, + {0xc038, 31}, + }, + /* 250 */ + { + {0x8002, 127}, + {0x8009, 127}, + {0x8017, 127}, + {0xc028, 127}, + {0x8002, 220}, + {0x8009, 220}, + {0x8017, 220}, + {0xc028, 220}, + {0x8002, 249}, + {0x8009, 249}, + {0x8017, 249}, + {0xc028, 249}, + {0xc000, 10}, + {0xc000, 13}, + {0xc000, 22}, + {0x100, 0}, + }, + /* 251 */ + { + {0x8003, 127}, + {0x8006, 127}, + {0x800a, 127}, + {0x800f, 127}, + {0x8018, 127}, + {0x801f, 127}, + {0x8029, 127}, + {0xc038, 127}, + {0x8003, 220}, + {0x8006, 220}, + {0x800a, 220}, + {0x800f, 220}, + {0x8018, 220}, + {0x801f, 220}, + {0x8029, 220}, + {0xc038, 220}, + }, + /* 252 */ + { + {0x8003, 249}, + {0x8006, 249}, + {0x800a, 249}, + {0x800f, 249}, + {0x8018, 249}, + {0x801f, 249}, + {0x8029, 249}, + {0xc038, 249}, + {0x8001, 10}, + {0xc016, 10}, + {0x8001, 13}, + {0xc016, 13}, + {0x8001, 22}, + {0xc016, 22}, + {0x100, 0}, + {0x100, 0}, + }, + /* 253 */ + { + {0x8002, 10}, + {0x8009, 10}, + {0x8017, 10}, + {0xc028, 10}, + {0x8002, 13}, + {0x8009, 13}, + {0x8017, 13}, + {0xc028, 13}, + {0x8002, 22}, + {0x8009, 22}, + {0x8017, 22}, + {0xc028, 22}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + }, + /* 254 */ + { + {0x8003, 10}, + {0x8006, 10}, + {0x800a, 10}, + {0x800f, 10}, + {0x8018, 10}, + {0x801f, 10}, + {0x8029, 10}, + {0xc038, 10}, + {0x8003, 13}, + {0x8006, 13}, + {0x800a, 13}, + {0x800f, 13}, + {0x8018, 13}, + {0x801f, 13}, + {0x8029, 13}, + {0xc038, 13}, + }, + /* 255 */ + { + {0x8003, 22}, + {0x8006, 22}, + {0x800a, 22}, + {0x800f, 22}, + {0x8018, 22}, + {0x801f, 22}, + {0x8029, 22}, + {0xc038, 22}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + }, + /* 256 */ + { + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + {0x100, 0}, + }, }; diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_range.c b/deps/ngtcp2/nghttp3/lib/nghttp3_range.c index 0ce71480d72fec..af810a2c5929db 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_range.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_range.c @@ -34,11 +34,13 @@ void nghttp3_range_init(nghttp3_range *r, uint64_t begin, uint64_t end) { nghttp3_range nghttp3_range_intersect(const nghttp3_range *a, const nghttp3_range *b) { nghttp3_range r = {0, 0}; - uint64_t begin = nghttp3_max(a->begin, b->begin); - uint64_t end = nghttp3_min(a->end, b->end); + uint64_t begin = nghttp3_max_uint64(a->begin, b->begin); + uint64_t end = nghttp3_min_uint64(a->end, b->end); + if (begin < end) { nghttp3_range_init(&r, begin, end); } + return r; } diff 
--git a/deps/ngtcp2/nghttp3/lib/nghttp3_range.h b/deps/ngtcp2/nghttp3/lib/nghttp3_range.h index 20dab69aa62db5..e52e1966b870ec 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_range.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_range.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -59,7 +59,7 @@ uint64_t nghttp3_range_len(const nghttp3_range *r); /* * nghttp3_range_eq returns nonzero if |a| equals |b|, such that - * a->begin == b->begin, and a->end == b->end hold. + * a->begin == b->begin and a->end == b->end hold. */ int nghttp3_range_eq(const nghttp3_range *a, const nghttp3_range *b); @@ -78,4 +78,4 @@ void nghttp3_range_cut(nghttp3_range *left, nghttp3_range *right, */ int nghttp3_range_not_after(const nghttp3_range *a, const nghttp3_range *b); -#endif /* NGHTTP3_RANGE_H */ +#endif /* !defined(NGHTTP3_RANGE_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_rcbuf.h b/deps/ngtcp2/nghttp3/lib/nghttp3_rcbuf.h index f589c377bf6ea7..97f83234ab5a8e 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_rcbuf.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_rcbuf.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -78,4 +78,4 @@ int nghttp3_rcbuf_new2(nghttp3_rcbuf **rcbuf_ptr, const uint8_t *src, */ void nghttp3_rcbuf_del(nghttp3_rcbuf *rcbuf); -#endif /* NGHTTP3_RCBUF_H */ +#endif /* !defined(NGHTTP3_RCBUF_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_ringbuf.c b/deps/ngtcp2/nghttp3/lib/nghttp3_ringbuf.c index 38b5460837190b..7d3ab39bf82a7f 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_ringbuf.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_ringbuf.c @@ -29,29 +29,27 @@ #include #ifdef WIN32 # include -#endif +#endif /* defined(WIN32) */ #include "nghttp3_macro.h" -#if defined(_MSC_VER) && _MSC_VER < 1941 && !defined(__clang__) && \ - (defined(_M_ARM) || defined(_M_ARM64)) -unsigned int __popcnt(unsigned int x) { - unsigned int c = 0; - for (; x; ++c) { - x &= x - 1; - } - return c; +static int ispow2(size_t n) { +#if defined(_MSC_VER) && !defined(__clang__) && \ + (defined(_M_ARM) || (defined(_M_ARM64) && _MSC_VER < 1941)) + return n && !(n & (n - 1)); +#elif defined(WIN32) + return 1 == __popcnt((unsigned int)n); +#else /* !((defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || \ + (defined(_M_ARM64) && _MSC_VER < 1941))) || defined(WIN32)) */ + return 1 == __builtin_popcount((unsigned int)n); +#endif /* !((defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || \ + (defined(_M_ARM64) && _MSC_VER < 1941))) || defined(WIN32)) */ } -#endif int nghttp3_ringbuf_init(nghttp3_ringbuf *rb, size_t nmemb, size_t size, const nghttp3_mem *mem) { if (nmemb) { -#ifdef WIN32 - assert(1 == __popcnt((unsigned int)nmemb)); -#else - assert(1 == __builtin_popcount((unsigned int)nmemb)); -#endif + assert(ispow2(nmemb)); rb->buf = nghttp3_mem_malloc(mem, nmemb * size); if (rb->buf == NULL) { @@ -80,7 +78,7 @@ void nghttp3_ringbuf_free(nghttp3_ringbuf *rb) { void *nghttp3_ringbuf_push_front(nghttp3_ringbuf *rb) { rb->first = (rb->first - 1) & (rb->nmemb - 1); - rb->len = nghttp3_min(rb->nmemb, rb->len + 1); + rb->len = nghttp3_min_size(rb->nmemb, rb->len + 1); return (void *)&rb->buf[rb->first * rb->size]; } @@ -127,11 +125,7 @@ int nghttp3_ringbuf_reserve(nghttp3_ringbuf *rb, size_t nmemb) { return 0; } -#ifdef WIN32 - assert(1 == __popcnt((unsigned int)nmemb)); -#else - assert(1 == __builtin_popcount((unsigned int)nmemb)); -#endif + assert(ispow2(nmemb)); buf = 
nghttp3_mem_malloc(rb->mem, nmemb * rb->size); if (buf == NULL) { diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_ringbuf.h b/deps/ngtcp2/nghttp3/lib/nghttp3_ringbuf.h index 8e05ec55b24724..b154290a51d5a5 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_ringbuf.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_ringbuf.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -110,4 +110,4 @@ int nghttp3_ringbuf_full(nghttp3_ringbuf *rb); int nghttp3_ringbuf_reserve(nghttp3_ringbuf *rb, size_t nmemb); -#endif /* NGHTTP3_RINGBUF_H */ +#endif /* !defined(NGHTTP3_RINGBUF_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_str.c b/deps/ngtcp2/nghttp3/lib/nghttp3_str.c index 3782aa72cd6e81..fc131404d13754 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_str.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_str.c @@ -36,70 +36,70 @@ uint8_t *nghttp3_cpymem(uint8_t *dest, const uint8_t *src, size_t n) { /* Generated by gendowncasetbl.py */ static const uint8_t DOWNCASE_TBL[] = { - 0 /* NUL */, 1 /* SOH */, 2 /* STX */, 3 /* ETX */, - 4 /* EOT */, 5 /* ENQ */, 6 /* ACK */, 7 /* BEL */, - 8 /* BS */, 9 /* HT */, 10 /* LF */, 11 /* VT */, - 12 /* FF */, 13 /* CR */, 14 /* SO */, 15 /* SI */, - 16 /* DLE */, 17 /* DC1 */, 18 /* DC2 */, 19 /* DC3 */, - 20 /* DC4 */, 21 /* NAK */, 22 /* SYN */, 23 /* ETB */, - 24 /* CAN */, 25 /* EM */, 26 /* SUB */, 27 /* ESC */, - 28 /* FS */, 29 /* GS */, 30 /* RS */, 31 /* US */, - 32 /* SPC */, 33 /* ! */, 34 /* " */, 35 /* # */, - 36 /* $ */, 37 /* % */, 38 /* & */, 39 /* ' */, - 40 /* ( */, 41 /* ) */, 42 /* * */, 43 /* + */, - 44 /* , */, 45 /* - */, 46 /* . */, 47 /* / */, - 48 /* 0 */, 49 /* 1 */, 50 /* 2 */, 51 /* 3 */, - 52 /* 4 */, 53 /* 5 */, 54 /* 6 */, 55 /* 7 */, - 56 /* 8 */, 57 /* 9 */, 58 /* : */, 59 /* ; */, - 60 /* < */, 61 /* = */, 62 /* > */, 63 /* ? 
*/, - 64 /* @ */, 97 /* A */, 98 /* B */, 99 /* C */, - 100 /* D */, 101 /* E */, 102 /* F */, 103 /* G */, - 104 /* H */, 105 /* I */, 106 /* J */, 107 /* K */, - 108 /* L */, 109 /* M */, 110 /* N */, 111 /* O */, - 112 /* P */, 113 /* Q */, 114 /* R */, 115 /* S */, - 116 /* T */, 117 /* U */, 118 /* V */, 119 /* W */, - 120 /* X */, 121 /* Y */, 122 /* Z */, 91 /* [ */, - 92 /* \ */, 93 /* ] */, 94 /* ^ */, 95 /* _ */, - 96 /* ` */, 97 /* a */, 98 /* b */, 99 /* c */, - 100 /* d */, 101 /* e */, 102 /* f */, 103 /* g */, - 104 /* h */, 105 /* i */, 106 /* j */, 107 /* k */, - 108 /* l */, 109 /* m */, 110 /* n */, 111 /* o */, - 112 /* p */, 113 /* q */, 114 /* r */, 115 /* s */, - 116 /* t */, 117 /* u */, 118 /* v */, 119 /* w */, - 120 /* x */, 121 /* y */, 122 /* z */, 123 /* { */, - 124 /* | */, 125 /* } */, 126 /* ~ */, 127 /* DEL */, - 128 /* 0x80 */, 129 /* 0x81 */, 130 /* 0x82 */, 131 /* 0x83 */, - 132 /* 0x84 */, 133 /* 0x85 */, 134 /* 0x86 */, 135 /* 0x87 */, - 136 /* 0x88 */, 137 /* 0x89 */, 138 /* 0x8a */, 139 /* 0x8b */, - 140 /* 0x8c */, 141 /* 0x8d */, 142 /* 0x8e */, 143 /* 0x8f */, - 144 /* 0x90 */, 145 /* 0x91 */, 146 /* 0x92 */, 147 /* 0x93 */, - 148 /* 0x94 */, 149 /* 0x95 */, 150 /* 0x96 */, 151 /* 0x97 */, - 152 /* 0x98 */, 153 /* 0x99 */, 154 /* 0x9a */, 155 /* 0x9b */, - 156 /* 0x9c */, 157 /* 0x9d */, 158 /* 0x9e */, 159 /* 0x9f */, - 160 /* 0xa0 */, 161 /* 0xa1 */, 162 /* 0xa2 */, 163 /* 0xa3 */, - 164 /* 0xa4 */, 165 /* 0xa5 */, 166 /* 0xa6 */, 167 /* 0xa7 */, - 168 /* 0xa8 */, 169 /* 0xa9 */, 170 /* 0xaa */, 171 /* 0xab */, - 172 /* 0xac */, 173 /* 0xad */, 174 /* 0xae */, 175 /* 0xaf */, - 176 /* 0xb0 */, 177 /* 0xb1 */, 178 /* 0xb2 */, 179 /* 0xb3 */, - 180 /* 0xb4 */, 181 /* 0xb5 */, 182 /* 0xb6 */, 183 /* 0xb7 */, - 184 /* 0xb8 */, 185 /* 0xb9 */, 186 /* 0xba */, 187 /* 0xbb */, - 188 /* 0xbc */, 189 /* 0xbd */, 190 /* 0xbe */, 191 /* 0xbf */, - 192 /* 0xc0 */, 193 /* 0xc1 */, 194 /* 0xc2 */, 195 /* 0xc3 */, - 196 /* 0xc4 */, 197 /* 0xc5 */, 198 /* 0xc6 */, 199 /* 0xc7 */, - 200 /* 0xc8 */, 201 /* 0xc9 */, 202 /* 0xca */, 203 /* 0xcb */, - 204 /* 0xcc */, 205 /* 0xcd */, 206 /* 0xce */, 207 /* 0xcf */, - 208 /* 0xd0 */, 209 /* 0xd1 */, 210 /* 0xd2 */, 211 /* 0xd3 */, - 212 /* 0xd4 */, 213 /* 0xd5 */, 214 /* 0xd6 */, 215 /* 0xd7 */, - 216 /* 0xd8 */, 217 /* 0xd9 */, 218 /* 0xda */, 219 /* 0xdb */, - 220 /* 0xdc */, 221 /* 0xdd */, 222 /* 0xde */, 223 /* 0xdf */, - 224 /* 0xe0 */, 225 /* 0xe1 */, 226 /* 0xe2 */, 227 /* 0xe3 */, - 228 /* 0xe4 */, 229 /* 0xe5 */, 230 /* 0xe6 */, 231 /* 0xe7 */, - 232 /* 0xe8 */, 233 /* 0xe9 */, 234 /* 0xea */, 235 /* 0xeb */, - 236 /* 0xec */, 237 /* 0xed */, 238 /* 0xee */, 239 /* 0xef */, - 240 /* 0xf0 */, 241 /* 0xf1 */, 242 /* 0xf2 */, 243 /* 0xf3 */, - 244 /* 0xf4 */, 245 /* 0xf5 */, 246 /* 0xf6 */, 247 /* 0xf7 */, - 248 /* 0xf8 */, 249 /* 0xf9 */, 250 /* 0xfa */, 251 /* 0xfb */, - 252 /* 0xfc */, 253 /* 0xfd */, 254 /* 0xfe */, 255 /* 0xff */, + 0 /* NUL */, 1 /* SOH */, 2 /* STX */, 3 /* ETX */, + 4 /* EOT */, 5 /* ENQ */, 6 /* ACK */, 7 /* BEL */, + 8 /* BS */, 9 /* HT */, 10 /* LF */, 11 /* VT */, + 12 /* FF */, 13 /* CR */, 14 /* SO */, 15 /* SI */, + 16 /* DLE */, 17 /* DC1 */, 18 /* DC2 */, 19 /* DC3 */, + 20 /* DC4 */, 21 /* NAK */, 22 /* SYN */, 23 /* ETB */, + 24 /* CAN */, 25 /* EM */, 26 /* SUB */, 27 /* ESC */, + 28 /* FS */, 29 /* GS */, 30 /* RS */, 31 /* US */, + 32 /* SPC */, 33 /* ! 
*/, 34 /* " */, 35 /* # */, + 36 /* $ */, 37 /* % */, 38 /* & */, 39 /* ' */, + 40 /* ( */, 41 /* ) */, 42 /* * */, 43 /* + */, + 44 /* , */, 45 /* - */, 46 /* . */, 47 /* / */, + 48 /* 0 */, 49 /* 1 */, 50 /* 2 */, 51 /* 3 */, + 52 /* 4 */, 53 /* 5 */, 54 /* 6 */, 55 /* 7 */, + 56 /* 8 */, 57 /* 9 */, 58 /* : */, 59 /* ; */, + 60 /* < */, 61 /* = */, 62 /* > */, 63 /* ? */, + 64 /* @ */, 97 /* A */, 98 /* B */, 99 /* C */, + 100 /* D */, 101 /* E */, 102 /* F */, 103 /* G */, + 104 /* H */, 105 /* I */, 106 /* J */, 107 /* K */, + 108 /* L */, 109 /* M */, 110 /* N */, 111 /* O */, + 112 /* P */, 113 /* Q */, 114 /* R */, 115 /* S */, + 116 /* T */, 117 /* U */, 118 /* V */, 119 /* W */, + 120 /* X */, 121 /* Y */, 122 /* Z */, 91 /* [ */, + 92 /* \ */, 93 /* ] */, 94 /* ^ */, 95 /* _ */, + 96 /* ` */, 97 /* a */, 98 /* b */, 99 /* c */, + 100 /* d */, 101 /* e */, 102 /* f */, 103 /* g */, + 104 /* h */, 105 /* i */, 106 /* j */, 107 /* k */, + 108 /* l */, 109 /* m */, 110 /* n */, 111 /* o */, + 112 /* p */, 113 /* q */, 114 /* r */, 115 /* s */, + 116 /* t */, 117 /* u */, 118 /* v */, 119 /* w */, + 120 /* x */, 121 /* y */, 122 /* z */, 123 /* { */, + 124 /* | */, 125 /* } */, 126 /* ~ */, 127 /* DEL */, + 128 /* 0x80 */, 129 /* 0x81 */, 130 /* 0x82 */, 131 /* 0x83 */, + 132 /* 0x84 */, 133 /* 0x85 */, 134 /* 0x86 */, 135 /* 0x87 */, + 136 /* 0x88 */, 137 /* 0x89 */, 138 /* 0x8a */, 139 /* 0x8b */, + 140 /* 0x8c */, 141 /* 0x8d */, 142 /* 0x8e */, 143 /* 0x8f */, + 144 /* 0x90 */, 145 /* 0x91 */, 146 /* 0x92 */, 147 /* 0x93 */, + 148 /* 0x94 */, 149 /* 0x95 */, 150 /* 0x96 */, 151 /* 0x97 */, + 152 /* 0x98 */, 153 /* 0x99 */, 154 /* 0x9a */, 155 /* 0x9b */, + 156 /* 0x9c */, 157 /* 0x9d */, 158 /* 0x9e */, 159 /* 0x9f */, + 160 /* 0xa0 */, 161 /* 0xa1 */, 162 /* 0xa2 */, 163 /* 0xa3 */, + 164 /* 0xa4 */, 165 /* 0xa5 */, 166 /* 0xa6 */, 167 /* 0xa7 */, + 168 /* 0xa8 */, 169 /* 0xa9 */, 170 /* 0xaa */, 171 /* 0xab */, + 172 /* 0xac */, 173 /* 0xad */, 174 /* 0xae */, 175 /* 0xaf */, + 176 /* 0xb0 */, 177 /* 0xb1 */, 178 /* 0xb2 */, 179 /* 0xb3 */, + 180 /* 0xb4 */, 181 /* 0xb5 */, 182 /* 0xb6 */, 183 /* 0xb7 */, + 184 /* 0xb8 */, 185 /* 0xb9 */, 186 /* 0xba */, 187 /* 0xbb */, + 188 /* 0xbc */, 189 /* 0xbd */, 190 /* 0xbe */, 191 /* 0xbf */, + 192 /* 0xc0 */, 193 /* 0xc1 */, 194 /* 0xc2 */, 195 /* 0xc3 */, + 196 /* 0xc4 */, 197 /* 0xc5 */, 198 /* 0xc6 */, 199 /* 0xc7 */, + 200 /* 0xc8 */, 201 /* 0xc9 */, 202 /* 0xca */, 203 /* 0xcb */, + 204 /* 0xcc */, 205 /* 0xcd */, 206 /* 0xce */, 207 /* 0xcf */, + 208 /* 0xd0 */, 209 /* 0xd1 */, 210 /* 0xd2 */, 211 /* 0xd3 */, + 212 /* 0xd4 */, 213 /* 0xd5 */, 214 /* 0xd6 */, 215 /* 0xd7 */, + 216 /* 0xd8 */, 217 /* 0xd9 */, 218 /* 0xda */, 219 /* 0xdb */, + 220 /* 0xdc */, 221 /* 0xdd */, 222 /* 0xde */, 223 /* 0xdf */, + 224 /* 0xe0 */, 225 /* 0xe1 */, 226 /* 0xe2 */, 227 /* 0xe3 */, + 228 /* 0xe4 */, 229 /* 0xe5 */, 230 /* 0xe6 */, 231 /* 0xe7 */, + 232 /* 0xe8 */, 233 /* 0xe9 */, 234 /* 0xea */, 235 /* 0xeb */, + 236 /* 0xec */, 237 /* 0xed */, 238 /* 0xee */, 239 /* 0xef */, + 240 /* 0xf0 */, 241 /* 0xf1 */, 242 /* 0xf2 */, 243 /* 0xf3 */, + 244 /* 0xf4 */, 245 /* 0xf5 */, 246 /* 0xf6 */, 247 /* 0xf7 */, + 248 /* 0xf8 */, 249 /* 0xf9 */, 250 /* 0xfa */, 251 /* 0xfb */, + 252 /* 0xfc */, 253 /* 0xfd */, 254 /* 0xfe */, 255 /* 0xff */, }; void nghttp3_downcase(uint8_t *s, size_t len) { diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_str.h b/deps/ngtcp2/nghttp3/lib/nghttp3_str.h index 19c1d2c71b559b..280749a3a9a3d9 100644 --- 
a/deps/ngtcp2/nghttp3/lib/nghttp3_str.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_str.h @@ -29,7 +29,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -37,4 +37,4 @@ uint8_t *nghttp3_cpymem(uint8_t *dest, const uint8_t *src, size_t n); void nghttp3_downcase(uint8_t *s, size_t len); -#endif /* NGHTTP3_STR_H */ +#endif /* !defined(NGHTTP3_STR_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_stream.c b/deps/ngtcp2/nghttp3/lib/nghttp3_stream.c index 6188a141dd123b..328cddd488fd6f 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_stream.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_stream.c @@ -73,7 +73,6 @@ int nghttp3_stream_new(nghttp3_stream **pstream, int64_t stream_id, stream->qpack_blocked_pe.index = NGHTTP3_PQ_BAD_INDEX; stream->mem = mem; - stream->tx.offset = 0; stream->rx.http.status_code = -1; stream->rx.http.content_length = -1; stream->rx.http.pri.urgency = NGHTTP3_DEFAULT_URGENCY; @@ -180,48 +179,44 @@ void nghttp3_stream_read_state_reset(nghttp3_stream_read_state *rstate) { } nghttp3_ssize nghttp3_read_varint(nghttp3_varint_read_state *rvint, - const uint8_t *src, size_t srclen, int fin) { - size_t nread = 0; - size_t n; - size_t i; + const uint8_t *begin, const uint8_t *end, + int fin) { + const uint8_t *orig_begin = begin; + size_t len; - assert(srclen > 0); + assert(begin != end); if (rvint->left == 0) { assert(rvint->acc == 0); - rvint->left = nghttp3_get_varintlen(src); - if (rvint->left <= srclen) { - rvint->acc = nghttp3_get_varint(&nread, src); - rvint->left = 0; - return (nghttp3_ssize)nread; + len = nghttp3_get_varintlen(begin); + if (len <= (size_t)(end - begin)) { + nghttp3_get_varint(&rvint->acc, begin); + return (nghttp3_ssize)len; } if (fin) { return NGHTTP3_ERR_INVALID_ARGUMENT; } - rvint->acc = nghttp3_get_varint_fb(src); - nread = 1; - ++src; - --srclen; - --rvint->left; + rvint->acc = nghttp3_get_varint_fb(begin++); + rvint->left = len - 1; } - n = nghttp3_min(rvint->left, srclen); + len = nghttp3_min_size(rvint->left, (size_t)(end - begin)); + end = begin + len; - for (i = 0; i < n; ++i) { - rvint->acc = (rvint->acc << 8) + src[i]; + for (; begin != end;) { + rvint->acc = (rvint->acc << 8) + *begin++; } - rvint->left -= n; - nread += n; + rvint->left -= len; if (fin && rvint->left) { return NGHTTP3_ERR_INVALID_ARGUMENT; } - return (nghttp3_ssize)nread; + return (nghttp3_ssize)(begin - orig_begin); } int nghttp3_stream_frq_add(nghttp3_stream *stream, @@ -231,7 +226,8 @@ int nghttp3_stream_frq_add(nghttp3_stream *stream, int rv; if (nghttp3_ringbuf_full(frq)) { - size_t nlen = nghttp3_max(NGHTTP3_MIN_RBLEN, nghttp3_ringbuf_len(frq) * 2); + size_t nlen = + nghttp3_max_size(NGHTTP3_MIN_RBLEN, nghttp3_ringbuf_len(frq) * 2); rv = nghttp3_ringbuf_reserve(frq, nlen); if (rv != 0) { return rv; @@ -444,8 +440,8 @@ int nghttp3_stream_write_headers(nghttp3_stream *stream, assert(conn); return nghttp3_stream_write_header_block( - stream, &conn->qenc, conn->tx.qenc, &conn->tx.qpack.rbuf, - &conn->tx.qpack.ebuf, NGHTTP3_FRAME_HEADERS, fr->nva, fr->nvlen); + stream, &conn->qenc, conn->tx.qenc, &conn->tx.qpack.rbuf, + &conn->tx.qpack.ebuf, NGHTTP3_FRAME_HEADERS, fr->nva, fr->nvlen); } int nghttp3_stream_write_header_block(nghttp3_stream *stream, @@ -738,7 +734,7 @@ int nghttp3_stream_outq_add(nghttp3_stream *stream, } if (nghttp3_ringbuf_full(outq)) { - size_t nlen = nghttp3_max(NGHTTP3_MIN_RBLEN, len * 2); + size_t nlen = nghttp3_max_size(NGHTTP3_MIN_RBLEN, len * 2); rv = nghttp3_ringbuf_reserve(outq, nlen); if (rv != 0) { 
return rv; @@ -770,8 +766,8 @@ int nghttp3_stream_ensure_chunk(nghttp3_stream *stream, size_t need) { ; if (n == NGHTTP3_STREAM_MIN_CHUNK_SIZE) { - p = (uint8_t *)nghttp3_objalloc_chunk_len_get(stream->out_chunk_objalloc, - n); + p = + (uint8_t *)nghttp3_objalloc_chunk_len_get(stream->out_chunk_objalloc, n); } else { p = nghttp3_mem_malloc(stream->mem, n); } @@ -780,7 +776,7 @@ int nghttp3_stream_ensure_chunk(nghttp3_stream *stream, size_t need) { } if (nghttp3_ringbuf_full(chunks)) { - size_t nlen = nghttp3_max(NGHTTP3_MIN_RBLEN, len * 2); + size_t nlen = nghttp3_max_size(NGHTTP3_MIN_RBLEN, len * 2); rv = nghttp3_ringbuf_reserve(chunks, nlen); if (rv != 0) { return rv; @@ -928,9 +924,8 @@ static void stream_pop_outq_entry(nghttp3_stream *stream, nghttp3_ringbuf_pop_front(&stream->outq); } -int nghttp3_stream_add_ack_offset(nghttp3_stream *stream, uint64_t n) { +int nghttp3_stream_update_ack_offset(nghttp3_stream *stream, uint64_t offset) { nghttp3_ringbuf *outq = &stream->outq; - uint64_t offset = stream->ack_offset + n; size_t buflen; size_t npopped = 0; uint64_t nack; @@ -941,24 +936,25 @@ int nghttp3_stream_add_ack_offset(nghttp3_stream *stream, uint64_t n) { tbuf = nghttp3_ringbuf_get(outq, 0); buflen = nghttp3_buf_len(&tbuf->buf); - if (tbuf->type == NGHTTP3_BUF_TYPE_ALIEN) { - nack = nghttp3_min(offset, (uint64_t)buflen) - stream->ack_done; - if (stream->callbacks.acked_data) { - rv = stream->callbacks.acked_data(stream, stream->node.id, nack, - stream->user_data); - if (rv != 0) { - return NGHTTP3_ERR_CALLBACK_FAILURE; - } + /* For NGHTTP3_BUF_TYPE_ALIEN, we never add 0 length buffer. */ + if (tbuf->type == NGHTTP3_BUF_TYPE_ALIEN && stream->ack_offset < offset && + stream->callbacks.acked_data) { + nack = nghttp3_min_uint64(offset, stream->ack_base + buflen) - + stream->ack_offset; + + rv = stream->callbacks.acked_data(stream, stream->node.id, nack, + stream->user_data); + if (rv != 0) { + return NGHTTP3_ERR_CALLBACK_FAILURE; } - stream->ack_done += nack; } - if (offset >= buflen) { + if (offset >= stream->ack_base + buflen) { stream_pop_outq_entry(stream, tbuf); - offset -= buflen; + stream->ack_base += buflen; + stream->ack_offset = stream->ack_base; ++npopped; - stream->ack_done = 0; if (stream->outq_idx + 1 == npopped) { stream->outq_offset = 0; @@ -996,7 +992,7 @@ int nghttp3_stream_buffer_data(nghttp3_stream *stream, const uint8_t *data, if (len) { buf = nghttp3_ringbuf_get(inq, len - 1); bufleft = nghttp3_buf_left(buf); - nwrite = nghttp3_min(datalen, bufleft); + nwrite = nghttp3_min_size(datalen, bufleft); buf->last = nghttp3_cpymem(buf->last, data, nwrite); data += nwrite; datalen -= nwrite; @@ -1005,7 +1001,7 @@ int nghttp3_stream_buffer_data(nghttp3_stream *stream, const uint8_t *data, for (; datalen;) { if (nghttp3_ringbuf_full(inq)) { size_t nlen = - nghttp3_max(NGHTTP3_MIN_RBLEN, nghttp3_ringbuf_len(inq) * 2); + nghttp3_max_size(NGHTTP3_MIN_RBLEN, nghttp3_ringbuf_len(inq) * 2); rv = nghttp3_ringbuf_reserve(inq, nlen); if (rv != 0) { return rv; @@ -1020,7 +1016,7 @@ int nghttp3_stream_buffer_data(nghttp3_stream *stream, const uint8_t *data, buf = nghttp3_ringbuf_push_back(inq); nghttp3_buf_wrap_init(buf, rawbuf, 16384); bufleft = nghttp3_buf_left(buf); - nwrite = nghttp3_min(datalen, bufleft); + nwrite = nghttp3_min_size(datalen, bufleft); buf->last = nghttp3_cpymem(buf->last, data, nwrite); data += nwrite; datalen -= nwrite; diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_stream.h b/deps/ngtcp2/nghttp3/lib/nghttp3_stream.h index 03a57697b232b3..7d296febf9135f 100644 --- 
a/deps/ngtcp2/nghttp3/lib/nghttp3_stream.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_stream.h @@ -27,7 +27,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -88,8 +88,8 @@ typedef struct nghttp3_varint_read_state { typedef struct nghttp3_stream_read_state { nghttp3_varint_read_state rvint; nghttp3_frame fr; - int state; int64_t left; + int state; } nghttp3_stream_read_state; /* NGHTTP3_STREAM_FLAG_NONE indicates that no flag is set. */ @@ -186,9 +186,6 @@ typedef struct nghttp3_stream_callbacks { } nghttp3_stream_callbacks; typedef struct nghttp3_http_state { - /* status_code is HTTP status code received. This field is used - if connection is initialized as client. */ - int32_t status_code; /* content_length is the value of received content-length header field. */ int64_t content_length; @@ -196,6 +193,9 @@ typedef struct nghttp3_http_state { far. */ int64_t recv_content_length; nghttp3_pri pri; + /* status_code is HTTP status code received. This field is used + if connection is initialized as client. */ + int32_t status_code; uint32_t flags; } nghttp3_http_state; @@ -226,13 +226,12 @@ struct nghttp3_stream { /* outq_offset is write offset relative to the element at outq_idx in outq. */ uint64_t outq_offset; - /* ack_offset is offset acknowledged by peer relative to the first - element in outq. */ + /* ack_base is the number of bytes acknowledged by a remote + endpoint where the first element in outq is positioned at. */ + uint64_t ack_base; + /* ack_offset is the number of bytes acknowledged by a remote + endpoint so far. */ uint64_t ack_offset; - /* ack_done is the number of bytes notified to an application that - they are acknowledged inside the first outq element if it is of - type NGHTTP3_BUF_TYPE_ALIEN. */ - uint64_t ack_done; uint64_t unscheduled_nwrite; nghttp3_stream_type type; nghttp3_stream_read_state rstate; @@ -283,7 +282,8 @@ void nghttp3_varint_read_state_reset(nghttp3_varint_read_state *rvint); void nghttp3_stream_read_state_reset(nghttp3_stream_read_state *rstate); nghttp3_ssize nghttp3_read_varint(nghttp3_varint_read_state *rvint, - const uint8_t *src, size_t srclen, int fin); + const uint8_t *begin, const uint8_t *end, + int fin); int nghttp3_stream_frq_add(nghttp3_stream *stream, const nghttp3_frame_entry *frent); @@ -336,7 +336,11 @@ void nghttp3_stream_add_outq_offset(nghttp3_stream *stream, size_t n); */ int nghttp3_stream_outq_write_done(nghttp3_stream *stream); -int nghttp3_stream_add_ack_offset(nghttp3_stream *stream, uint64_t n); +/* + * nghttp2_stream_update_ack_offset updates the last acknowledged + * offset to |offset|. + */ +int nghttp3_stream_update_ack_offset(nghttp3_stream *stream, uint64_t offset); /* * nghttp3_stream_is_active returns nonzero if |stream| is active. In @@ -390,4 +394,4 @@ int nghttp3_client_stream_uni(int64_t stream_id); */ int nghttp3_server_stream_uni(int64_t stream_id); -#endif /* NGHTTP3_STREAM_H */ +#endif /* !defined(NGHTTP3_STREAM_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_tnode.c b/deps/ngtcp2/nghttp3/lib/nghttp3_tnode.c index d9c5e598699512..eae847e7a9236a 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_tnode.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_tnode.c @@ -73,8 +73,8 @@ int nghttp3_tnode_schedule(nghttp3_tnode *tnode, nghttp3_pq *pq, if (tnode->pe.index == NGHTTP3_PQ_BAD_INDEX) { tnode->cycle = - pq_get_first_cycle(pq) + - ((nwrite == 0 || !tnode->pri.inc) ? 0 : nghttp3_max(1, penalty)); + pq_get_first_cycle(pq) + + ((nwrite == 0 || !tnode->pri.inc) ? 
0 : nghttp3_max_uint64(1, penalty)); } else if (nwrite > 0) { if (!tnode->pri.inc || nghttp3_pq_size(pq) == 1) { return 0; @@ -82,7 +82,7 @@ int nghttp3_tnode_schedule(nghttp3_tnode *tnode, nghttp3_pq *pq, nghttp3_pq_remove(pq, &tnode->pe); tnode->pe.index = NGHTTP3_PQ_BAD_INDEX; - tnode->cycle += nghttp3_max(1, penalty); + tnode->cycle += nghttp3_max_uint64(1, penalty); } else { return 0; } diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_tnode.h b/deps/ngtcp2/nghttp3/lib/nghttp3_tnode.h index 1abc1e62519381..c13af52fdc6bc7 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_tnode.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_tnode.h @@ -27,7 +27,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -63,4 +63,4 @@ int nghttp3_tnode_schedule(nghttp3_tnode *tnode, nghttp3_pq *pq, */ int nghttp3_tnode_is_scheduled(nghttp3_tnode *tnode); -#endif /* NGHTTP3_TNODE_H */ +#endif /* !defined(NGHTTP3_TNODE_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_unreachable.c b/deps/ngtcp2/nghttp3/lib/nghttp3_unreachable.c index 6fea89b802b12d..8adeeb4931dc57 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_unreachable.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_unreachable.c @@ -29,11 +29,11 @@ #include #ifdef HAVE_UNISTD_H # include -#endif /* HAVE_UNISTD_H */ +#endif /* defined(HAVE_UNISTD_H) */ #include #ifdef WIN32 # include -#endif /* WIN32 */ +#endif /* defined(WIN32) */ void nghttp3_unreachable_fail(const char *file, int line, const char *func) { char *buf; @@ -62,9 +62,9 @@ void nghttp3_unreachable_fail(const char *file, int line, const char *func) { #ifndef WIN32 while (write(STDERR_FILENO, buf, (size_t)rv) == -1 && errno == EINTR) ; -#else /* WIN32 */ +#else /* defined(WIN32) */ _write(_fileno(stderr), buf, (unsigned int)rv); -#endif /* WIN32 */ +#endif /* defined(WIN32) */ free(buf); diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_unreachable.h b/deps/ngtcp2/nghttp3/lib/nghttp3_unreachable.h index 6360f52d3aa857..c609d7ed72f3cb 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_unreachable.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_unreachable.h @@ -28,26 +28,26 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include #ifdef __FILE_NAME__ # define NGHTTP3_FILE_NAME __FILE_NAME__ -#else /* !__FILE_NAME__ */ +#else /* !defined(__FILE_NAME__) */ # define NGHTTP3_FILE_NAME "(file)" -#endif /* !__FILE_NAME__ */ +#endif /* !defined(__FILE_NAME__) */ #define nghttp3_unreachable() \ nghttp3_unreachable_fail(NGHTTP3_FILE_NAME, __LINE__, __func__) #ifdef _MSC_VER __declspec(noreturn) -#endif /* _MSC_VER */ +#endif /* defined(_MSC_VER) */ void nghttp3_unreachable_fail(const char *file, int line, const char *func) #ifndef _MSC_VER __attribute__((noreturn)) -#endif /* !_MSC_VER */ +#endif /* !defined(_MSC_VER) */ ; -#endif /* NGHTTP3_UNREACHABLE_H */ +#endif /* !defined(NGHTTP3_UNREACHABLE_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_vec.h b/deps/ngtcp2/nghttp3/lib/nghttp3_vec.h index 473d1467310062..f36eabc1052621 100644 --- a/deps/ngtcp2/nghttp3/lib/nghttp3_vec.h +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_vec.h @@ -28,7 +28,7 @@ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include @@ -38,4 +38,4 @@ */ int64_t nghttp3_vec_len_varint(const nghttp3_vec *vec, size_t n); -#endif /* NGHTTP3_VEC_H */ +#endif /* !defined(NGHTTP3_VEC_H) */ diff --git a/deps/ngtcp2/nghttp3/lib/nghttp3_version.c b/deps/ngtcp2/nghttp3/lib/nghttp3_version.c index c460cc72835b1d..939821d84eac3d 100644 --- 
a/deps/ngtcp2/nghttp3/lib/nghttp3_version.c +++ b/deps/ngtcp2/nghttp3/lib/nghttp3_version.c @@ -24,7 +24,7 @@ */ #ifdef HAVE_CONFIG_H # include -#endif /* HAVE_CONFIG_H */ +#endif /* defined(HAVE_CONFIG_H) */ #include diff --git a/deps/ngtcp2/nghttp3/lib/sfparse/COPYING b/deps/ngtcp2/nghttp3/lib/sfparse/COPYING new file mode 100644 index 00000000000000..8212d82d83ab74 --- /dev/null +++ b/deps/ngtcp2/nghttp3/lib/sfparse/COPYING @@ -0,0 +1,22 @@ +The MIT License + +Copyright (c) 2023 sfparse contributors + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/deps/ngtcp2/nghttp3/lib/sfparse.c b/deps/ngtcp2/nghttp3/lib/sfparse/sfparse.c similarity index 55% rename from deps/ngtcp2/nghttp3/lib/sfparse.c rename to deps/ngtcp2/nghttp3/lib/sfparse/sfparse.c index efa2850c9d661d..d0328cf40c21ea 100644 --- a/deps/ngtcp2/nghttp3/lib/sfparse.c +++ b/deps/ngtcp2/nghttp3/lib/sfparse/sfparse.c @@ -135,6 +135,70 @@ UCALPHA_CASES: \ LCALPHA_CASES +#define TOKEN_CASES \ + case '!': \ + case '#': \ + case '$': \ + case '%': \ + case '&': \ + case '\'': \ + case '*': \ + case '+': \ + case '-': \ + case '.': \ + case '/': \ + DIGIT_CASES: \ + case ':': \ + UCALPHA_CASES: \ + case '^': \ + case '_': \ + case '`': \ + LCALPHA_CASES: \ + case '|': \ + case '~' + +#define LCHEXALPHA_CASES \ + case 'a': \ + case 'b': \ + case 'c': \ + case 'd': \ + case 'e': \ + case 'f' + +#define X00_1F_CASES \ + case 0x00: \ + case 0x01: \ + case 0x02: \ + case 0x03: \ + case 0x04: \ + case 0x05: \ + case 0x06: \ + case 0x07: \ + case 0x08: \ + case 0x09: \ + case 0x0a: \ + case 0x0b: \ + case 0x0c: \ + case 0x0d: \ + case 0x0e: \ + case 0x0f: \ + case 0x10: \ + case 0x11: \ + case 0x12: \ + case 0x13: \ + case 0x14: \ + case 0x15: \ + case 0x16: \ + case 0x17: \ + case 0x18: \ + case 0x19: \ + case 0x1a: \ + case 0x1b: \ + case 0x1c: \ + case 0x1d: \ + case 0x1e: \ + case 0x1f + #define X20_21_CASES \ case ' ': \ case '!' 
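Editor's note: the sfparse hunk above introduces character-class macros (TOKEN_CASES, LCHEXALPHA_CASES, X00_1F_CASES) that expand to runs of `case` labels, so later scanner loops can classify a byte with a single `switch` instead of a long comparison chain. Below is a minimal, self-contained sketch of that pattern, not part of the patch; the names `SKETCH_DIGIT_CASES` and `digit_run_len` are invented for illustration only.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical character class, spelled the same way as TOKEN_CASES above:
   a run of case labels with no trailing colon, so the call site appends it. */
#define SKETCH_DIGIT_CASES                                                     \
  case '0':                                                                    \
  case '1':                                                                    \
  case '2':                                                                    \
  case '3':                                                                    \
  case '4':                                                                    \
  case '5':                                                                    \
  case '6':                                                                    \
  case '7':                                                                    \
  case '8':                                                                    \
  case '9'

/* Count the leading bytes of [p, end) that fall in the class.  `continue`
   restarts the for loop while the byte matches; any other byte falls out of
   the switch and reaches the break. */
static size_t digit_run_len(const uint8_t *p, const uint8_t *end) {
  const uint8_t *start = p;

  for (; p != end; ++p) {
    switch (*p) {
    SKETCH_DIGIT_CASES:
      continue;
    }

    break;
  }

  return (size_t)(p - start);
}

int main(void) {
  const uint8_t buf[] = "123abc";

  /* Prints 3: the run of digits before 'a'. */
  printf("%zu\n", digit_run_len(buf, buf + sizeof(buf) - 1));

  return 0;
}

The same shape appears a little further down in this file, where parser_token's loop body reduces to `TOKEN_CASES: continue;` once the macro is in place.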
@@ -175,6 +239,137 @@ case '}': \ case '~' +#define X7F_FF_CASES \ + case 0x7f: \ + case 0x80: \ + case 0x81: \ + case 0x82: \ + case 0x83: \ + case 0x84: \ + case 0x85: \ + case 0x86: \ + case 0x87: \ + case 0x88: \ + case 0x89: \ + case 0x8a: \ + case 0x8b: \ + case 0x8c: \ + case 0x8d: \ + case 0x8e: \ + case 0x8f: \ + case 0x90: \ + case 0x91: \ + case 0x92: \ + case 0x93: \ + case 0x94: \ + case 0x95: \ + case 0x96: \ + case 0x97: \ + case 0x98: \ + case 0x99: \ + case 0x9a: \ + case 0x9b: \ + case 0x9c: \ + case 0x9d: \ + case 0x9e: \ + case 0x9f: \ + case 0xa0: \ + case 0xa1: \ + case 0xa2: \ + case 0xa3: \ + case 0xa4: \ + case 0xa5: \ + case 0xa6: \ + case 0xa7: \ + case 0xa8: \ + case 0xa9: \ + case 0xaa: \ + case 0xab: \ + case 0xac: \ + case 0xad: \ + case 0xae: \ + case 0xaf: \ + case 0xb0: \ + case 0xb1: \ + case 0xb2: \ + case 0xb3: \ + case 0xb4: \ + case 0xb5: \ + case 0xb6: \ + case 0xb7: \ + case 0xb8: \ + case 0xb9: \ + case 0xba: \ + case 0xbb: \ + case 0xbc: \ + case 0xbd: \ + case 0xbe: \ + case 0xbf: \ + case 0xc0: \ + case 0xc1: \ + case 0xc2: \ + case 0xc3: \ + case 0xc4: \ + case 0xc5: \ + case 0xc6: \ + case 0xc7: \ + case 0xc8: \ + case 0xc9: \ + case 0xca: \ + case 0xcb: \ + case 0xcc: \ + case 0xcd: \ + case 0xce: \ + case 0xcf: \ + case 0xd0: \ + case 0xd1: \ + case 0xd2: \ + case 0xd3: \ + case 0xd4: \ + case 0xd5: \ + case 0xd6: \ + case 0xd7: \ + case 0xd8: \ + case 0xd9: \ + case 0xda: \ + case 0xdb: \ + case 0xdc: \ + case 0xdd: \ + case 0xde: \ + case 0xdf: \ + case 0xe0: \ + case 0xe1: \ + case 0xe2: \ + case 0xe3: \ + case 0xe4: \ + case 0xe5: \ + case 0xe6: \ + case 0xe7: \ + case 0xe8: \ + case 0xe9: \ + case 0xea: \ + case 0xeb: \ + case 0xec: \ + case 0xed: \ + case 0xee: \ + case 0xef: \ + case 0xf0: \ + case 0xf1: \ + case 0xf2: \ + case 0xf3: \ + case 0xf4: \ + case 0xf5: \ + case 0xf6: \ + case 0xf7: \ + case 0xf8: \ + case 0xf9: \ + case 0xfa: \ + case 0xfb: \ + case 0xfc: \ + case 0xfd: \ + case 0xfe: \ + case 0xff + static int is_ws(uint8_t c) { switch (c) { case ' ': @@ -431,25 +626,7 @@ static int parser_token(sf_parser *sfp, sf_value *dest) { for (; !parser_eof(sfp); ++sfp->pos) { switch (*sfp->pos) { - case '!': - case '#': - case '$': - case '%': - case '&': - case '\'': - case '*': - case '+': - case '-': - case '.': - case '^': - case '_': - case '`': - case '|': - case '~': - case ':': - case '/': - DIGIT_CASES: - ALPHA_CASES: + TOKEN_CASES: continue; } @@ -487,58 +664,30 @@ static int parser_byteseq(sf_parser *sfp, sf_value *dest) { case 1: return SF_ERR_PARSE_ERROR; case 2: - switch (*(sfp->pos - 1)) { - case 'A': - case 'Q': - case 'g': - case 'w': - break; - default: - return SF_ERR_PARSE_ERROR; - } - ++sfp->pos; - if (parser_eof(sfp) || *sfp->pos != '=') { + if (parser_eof(sfp)) { return SF_ERR_PARSE_ERROR; } + if (*sfp->pos == '=') { + ++sfp->pos; + } + break; case 3: - switch (*(sfp->pos - 1)) { - case 'A': - case 'E': - case 'I': - case 'M': - case 'Q': - case 'U': - case 'Y': - case 'c': - case 'g': - case 'k': - case 'o': - case 's': - case 'w': - case '0': - case '4': - case '8': - break; - default: - return SF_ERR_PARSE_ERROR; - } + ++sfp->pos; break; } - ++sfp->pos; - if (parser_eof(sfp) || *sfp->pos != ':') { return SF_ERR_PARSE_ERROR; } goto fin; case ':': - if ((sfp->pos - base) & 0x3) { + if (((sfp->pos - base) & 0x3) == 1) { return SF_ERR_PARSE_ERROR; } @@ -599,6 +748,169 @@ static int parser_boolean(sf_parser *sfp, sf_value *dest) { return 0; } +static int pctdecode(uint8_t *pc, const uint8_t **ppos) { + uint8_t 
c, b = **ppos; + + switch (b) { + DIGIT_CASES: + c = (uint8_t)((b - '0') << 4); + + break; + LCHEXALPHA_CASES: + c = (uint8_t)((b - 'a' + 10) << 4); + + break; + default: + return -1; + } + + b = *++*ppos; + + switch (b) { + DIGIT_CASES: + c |= (uint8_t)(b - '0'); + + break; + LCHEXALPHA_CASES: + c |= (uint8_t)(b - 'a' + 10); + + break; + default: + return -1; + } + + *pc = c; + ++*ppos; + + return 0; +} + +/* Start of utf8 dfa */ +/* Copyright (c) 2008-2010 Bjoern Hoehrmann + * See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. + * + * Copyright (c) 2008-2009 Bjoern Hoehrmann + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 12 + +/* clang-format off */ +static const uint8_t utf8d[] = { + /* + * The first part of the table maps bytes to character classes that + * to reduce the size of the transition table and create bitmasks. + */ + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, + + /* + * The second part is a transition table that maps a combination + * of a state of the automaton and a character class to a state. 
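Aside (illustrative, not part of the patch): the Hoehrmann DFA defined in this hunk is driven one byte at a time through `utf8_decode`; a minimal validation sketch, assuming the `UTF8_ACCEPT`/`UTF8_REJECT` constants and the `utf8_decode` helper from above:

```cpp
// Sketch only: validate a whole buffer with the DFA from this hunk.
// A buffer is well-formed UTF-8 iff the automaton never enters
// UTF8_REJECT and finishes in UTF8_ACCEPT (no truncated sequence).
#include <cstddef>
#include <cstdint>

static bool buffer_is_utf8(const uint8_t *buf, size_t len) {
  uint32_t state = UTF8_ACCEPT;
  for (size_t i = 0; i < len; ++i) {
    utf8_decode(&state, buf[i]);
    if (state == UTF8_REJECT) {
      return false; // invalid byte sequence, fail fast
    }
  }
  return state == UTF8_ACCEPT;
}
```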
+ */ + 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12, +}; +/* clang-format on */ + +static void utf8_decode(uint32_t *state, uint8_t byte) { + *state = utf8d[256 + *state + utf8d[byte]]; +} + +/* End of utf8 dfa */ + +static int parser_dispstring(sf_parser *sfp, sf_value *dest) { + const uint8_t *base; + uint8_t c; + uint32_t utf8state = UTF8_ACCEPT; + + assert('%' == *sfp->pos); + + ++sfp->pos; + + if (parser_eof(sfp) || *sfp->pos != '"') { + return SF_ERR_PARSE_ERROR; + } + + base = ++sfp->pos; + + for (; !parser_eof(sfp);) { + switch (*sfp->pos) { + X00_1F_CASES: + X7F_FF_CASES: + return SF_ERR_PARSE_ERROR; + case '%': + ++sfp->pos; + + if (sfp->pos + 2 > sfp->end) { + return SF_ERR_PARSE_ERROR; + } + + if (pctdecode(&c, &sfp->pos) != 0) { + return SF_ERR_PARSE_ERROR; + } + + utf8_decode(&utf8state, c); + if (utf8state == UTF8_REJECT) { + return SF_ERR_PARSE_ERROR; + } + + break; + case '"': + if (utf8state != UTF8_ACCEPT) { + return SF_ERR_PARSE_ERROR; + } + + if (dest) { + dest->type = SF_TYPE_DISPSTRING; + dest->flags = SF_VALUE_FLAG_NONE; + dest->vec.len = (size_t)(sfp->pos - base); + dest->vec.base = dest->vec.len == 0 ? NULL : (uint8_t *)base; + } + + ++sfp->pos; + + return 0; + default: + if (utf8state != UTF8_ACCEPT) { + return SF_ERR_PARSE_ERROR; + } + + ++sfp->pos; + } + } + + return SF_ERR_PARSE_ERROR; +} + static int parser_bare_item(sf_parser *sfp, sf_value *dest) { switch (*sfp->pos) { case '"': @@ -615,6 +927,8 @@ static int parser_bare_item(sf_parser *sfp, sf_value *dest) { case '*': ALPHA_CASES: return parser_token(sfp, dest); + case '%': + return parser_dispstring(sfp, dest); default: return SF_ERR_PARSE_ERROR; } @@ -1037,7 +1351,7 @@ void sf_unescape(sf_vec *dest, const sf_vec *src) { size_t len, slen; if (src->len == 0) { - *dest = *src; + dest->len = 0; return; } @@ -1049,16 +1363,12 @@ void sf_unescape(sf_vec *dest, const sf_vec *src) { for (;;) { q = memchr(p, '\\', len); if (q == NULL) { - if (len == src->len) { - *dest = *src; - - return; - } - memcpy(o, p, len); o += len; - break; + dest->len = (size_t)(o - dest->base); + + return; } slen = (size_t)(q - p); @@ -1069,8 +1379,6 @@ void sf_unescape(sf_vec *dest, const sf_vec *src) { *o++ = *p++; len -= slen + 2; } - - dest->len = (size_t)(o - dest->base); } void sf_base64decode(sf_vec *dest, const sf_vec *src) { @@ -1093,20 +1401,22 @@ void sf_base64decode(sf_vec *dest, const sf_vec *src) { uint8_t *o; const uint8_t *p, *end; uint32_t n; - size_t i; + size_t i, left; int idx; - assert((src->len & 0x3) == 0); - if (src->len == 0) { - *dest = *src; + dest->len = 0; return; } o = dest->base; p = src->base; - end = src->base + src->len; + left = src->len & 0x3; + if (left == 0 && src->base[src->len - 1] == '=') { + left = 4; + } + end = src->base + src->len - left; for (; p != end;) { n = 0; @@ -1114,33 +1424,94 @@ void sf_base64decode(sf_vec *dest, const sf_vec *src) { for (i = 1; i <= 4; ++i, ++p) { idx = index_tbl[*p]; - if (idx == -1) { - assert(i > 2); + assert(idx != -1); - if (i == 3) { - assert(*p == '=' && *(p + 1) == '=' && p + 2 == end); + n += (uint32_t)(idx << (24 - i * 6)); + } - *o++ = (uint8_t)(n >> 16); + *o++ = (uint8_t)(n >> 16); + *o++ = (n >> 8) & 0xffu; + *o++ = n & 0xffu; + } - goto fin; - } + switch (left) 
{ + case 0: + goto fin; + case 1: + assert(0); + abort(); + case 3: + if (src->base[src->len - 1] == '=') { + left = 2; + } - assert(*p == '=' && p + 1 == end); + break; + case 4: + assert('=' == src->base[src->len - 1]); - *o++ = (uint8_t)(n >> 16); - *o++ = (n >> 8) & 0xffu; + if (src->base[src->len - 2] == '=') { + left = 2; + } else { + left = 3; + } - goto fin; - } + break; + } - n += (uint32_t)(idx << (24 - i * 6)); - } + switch (left) { + case 2: + *o = (uint8_t)(index_tbl[*p++] << 2); + *o++ |= (uint8_t)(index_tbl[*p++] >> 4); - *o++ = (uint8_t)(n >> 16); + break; + case 3: + n = (uint32_t)(index_tbl[*p++] << 10); + n += (uint32_t)(index_tbl[*p++] << 4); + n += (uint32_t)(index_tbl[*p++] >> 2); *o++ = (n >> 8) & 0xffu; *o++ = n & 0xffu; + + break; } fin: dest->len = (size_t)(o - dest->base); } + +void sf_pctdecode(sf_vec *dest, const sf_vec *src) { + const uint8_t *p, *q; + uint8_t *o; + size_t len, slen; + + if (src->len == 0) { + dest->len = 0; + + return; + } + + o = dest->base; + p = src->base; + len = src->len; + + for (;;) { + q = memchr(p, '%', len); + if (q == NULL) { + memcpy(o, p, len); + o += len; + + dest->len = (size_t)(o - dest->base); + + return; + } + + slen = (size_t)(q - p); + memcpy(o, p, slen); + o += slen; + + p = q + 1; + + pctdecode(o++, &p); + + len -= slen + 3; + } +} diff --git a/deps/ngtcp2/nghttp3/lib/sfparse.h b/deps/ngtcp2/nghttp3/lib/sfparse/sfparse.h similarity index 91% rename from deps/ngtcp2/nghttp3/lib/sfparse.h rename to deps/ngtcp2/nghttp3/lib/sfparse/sfparse.h index 1474db1429acea..01cc947d4d61bc 100644 --- a/deps/ngtcp2/nghttp3/lib/sfparse.h +++ b/deps/ngtcp2/nghttp3/lib/sfparse/sfparse.h @@ -85,7 +85,11 @@ typedef enum sf_type { /** * :enum:`SF_TYPE_DATE` indicates date type. */ - SF_TYPE_DATE + SF_TYPE_DATE, + /** + * :enum:`SF_TYPE_DISPSTRING` indicates display string type. + */ + SF_TYPE_DISPSTRING } sf_type; /** @@ -197,8 +201,8 @@ typedef struct sf_value { /** * :member:`vec` contains sequence of bytes if :member:`type` is * either :enum:`sf_type.SF_TYPE_STRING`, - * :enum:`sf_type.SF_TYPE_TOKEN`, or - * :enum:`sf_type.SF_TYPE_BYTESEQ`. + * :enum:`sf_type.SF_TYPE_TOKEN`, :enum:`sf_type.SF_TYPE_BYTESEQ`, + * or :enum:`sf_type.SF_TYPE_DISPSTRING`. * * For :enum:`sf_type.SF_TYPE_STRING`, this field contains one or * more escaped characters if :member:`flags` has @@ -209,6 +213,10 @@ typedef struct sf_value { * encoded string. To decode this byte string, use * `sf_base64decode`. * + * For :enum:`sf_type.SF_TYPE_DISPSTRING`, this field may contain + * percent-encoded UTF-8 byte sequences. To decode it, use + * `sf_pctdecode`. + * * If :member:`vec.len ` == 0, :member:`vec.base * ` is guaranteed to be NULL. */ @@ -372,10 +380,6 @@ int sf_parser_inner_list(sf_parser *sfp, sf_value *dest); * :member:`dest->base ` must point to the buffer that * has sufficient space to store the unescaped string. * - * If there is no escape character in |src|, |*src| is assigned to - * |*dest|. This includes the case that :member:`src->len - * ` == 0. - * * This function sets the length of unescaped string to * :member:`dest->len `. */ @@ -394,14 +398,29 @@ void sf_unescape(sf_vec *dest, const sf_vec *src); * :member:`dest->base ` must point to the buffer that * has sufficient space to store the decoded byte string. * - * If :member:`src->len ` == 0, |*src| is assigned to - * |*dest|. - * * This function sets the length of decoded byte string to * :member:`dest->len `. 
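Aside (illustrative, not part of the patch): a minimal consumer-side sketch of the new display string support; `handle_dispstring` and the fixed scratch buffer are assumptions, and the sizing relies on percent-decoding never producing more bytes than its input:

```cpp
// Sketch only: decode an SF_TYPE_DISPSTRING value produced by the parser.
// sf_pctdecode requires dest->base to point at a buffer large enough for
// the decoded bytes; vec.len is always sufficient because each %xx pair
// shrinks to a single byte.
#include <cstdint>
#include "sfparse.h" // include path within nghttp3 may differ

static void handle_dispstring(const sf_value *val) {
  uint8_t buf[256]; // hypothetical scratch buffer
  sf_vec decoded;

  if (val->type != SF_TYPE_DISPSTRING || val->vec.len > sizeof(buf)) {
    return; // not a display string, or larger than our scratch space
  }

  decoded.base = buf;
  sf_pctdecode(&decoded, &val->vec);
  // decoded.base / decoded.len now hold the decoded UTF-8 text.
}
```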
*/ void sf_base64decode(sf_vec *dest, const sf_vec *src); +/** + * @function + * + * `sf_pctdecode` decodes percent-encoded string |src| and writes the + * result into |dest|. |src| should be the pointer to + * :member:`sf_value.vec` of type :enum:`sf_type.SF_TYPE_DISPSTRING` + * produced by either `sf_parser_dict`, `sf_parser_list`, + * `sf_parser_inner_list`, `sf_parser_item`, or `sf_parser_param`, + * otherwise the behavior is undefined. + * + * :member:`dest->base ` must point to the buffer that + * has sufficient space to store the decoded byte string. + * + * This function sets the length of decoded byte string to + * :member:`dest->len `. + */ +void sf_pctdecode(sf_vec *dest, const sf_vec *src); + #ifdef __cplusplus } #endif diff --git a/deps/openssl/openssl.gyp b/deps/openssl/openssl.gyp index f6b157f8d60813..ea3a2dc09ef29b 100644 --- a/deps/openssl/openssl.gyp +++ b/deps/openssl/openssl.gyp @@ -5,19 +5,13 @@ 'nasm_version%': '0.0', 'openssl-cli': '<(PRODUCT_DIR)/<(EXECUTABLE_PREFIX)openssl-cli<(EXECUTABLE_SUFFIX)', 'conditions': [ - ['OS == "win"', { - 'obj_dir_abs': '<(PRODUCT_DIR_ABS)/obj', - }], ['GENERATOR == "ninja"', { - 'obj_dir_abs': '<(PRODUCT_DIR_ABS)/obj', - 'modules_dir': '<(PRODUCT_DIR_ABS)/obj/lib/openssl-modules', + 'modules_dir': '<(PRODUCT_DIR_ABS_CSTR)/obj/lib/openssl-modules', }, { - 'obj_dir_abs%': '<(PRODUCT_DIR_ABS)/obj.target', - 'modules_dir': '<(PRODUCT_DIR_ABS)/obj.target/deps/openssl/lib/openssl-modules', + 'modules_dir': '<(PRODUCT_DIR_ABS_CSTR)/obj.target/deps/openssl/lib/openssl-modules', }], ['OS=="mac"', { - 'obj_dir_abs%': '<(PRODUCT_DIR_ABS)/obj.target', - 'modules_dir': '<(PRODUCT_DIR_ABS)/obj.target/deps/openssl/lib/openssl-modules', + 'modules_dir': '<(PRODUCT_DIR_ABS_CSTR)/obj.target/deps/openssl/lib/openssl-modules', }], ], }, diff --git a/deps/simdutf/simdutf.cpp b/deps/simdutf/simdutf.cpp index 007fa02b165204..eb3e4598407374 100644 --- a/deps/simdutf/simdutf.cpp +++ b/deps/simdutf/simdutf.cpp @@ -1,4 +1,4 @@ -/* auto-generated on 2024-11-21 10:33:28 -0500. Do not edit! */ +/* auto-generated on 2024-12-10 14:54:53 -0500. Do not edit! */ /* begin file src/simdutf.cpp */ #include "simdutf.h" // We include base64_tables once. @@ -6410,43 +6410,42 @@ SIMDUTF_UNTARGET_REGION #endif // SIMDUTF_RVV_H /* end file src/simdutf/rvv.h */ -/* begin file src/simdutf/fallback.h */ -#ifndef SIMDUTF_FALLBACK_H -#define SIMDUTF_FALLBACK_H +/* begin file src/simdutf/lsx.h */ +#ifndef SIMDUTF_LSX_H +#define SIMDUTF_LSX_H +#ifdef SIMDUTF_FALLBACK_H + #error "lsx.h must be included before fallback.h" +#endif -// Note that fallback.h is always imported last. -// Default Fallback to on unless a builtin implementation has already been -// selected. -#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK - #if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || \ - SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE || \ - SIMDUTF_CAN_ALWAYS_RUN_PPC64 || SIMDUTF_CAN_ALWAYS_RUN_RVV - #define SIMDUTF_IMPLEMENTATION_FALLBACK 0 - #else - #define SIMDUTF_IMPLEMENTATION_FALLBACK 1 - #endif +#ifndef SIMDUTF_IMPLEMENTATION_LSX + #define SIMDUTF_IMPLEMENTATION_LSX (SIMDUTF_IS_LSX) +#endif +#if SIMDUTF_IMPLEMENTATION_LSX && SIMDUTF_IS_LSX + #define SIMDUTF_CAN_ALWAYS_RUN_LSX 1 +#else + #define SIMDUTF_CAN_ALWAYS_RUN_LSX 0 #endif #define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK) -#if SIMDUTF_IMPLEMENTATION_FALLBACK +#if SIMDUTF_IMPLEMENTATION_LSX namespace simdutf { /** - * Fallback implementation (runs on any machine). 
+ * Implementation for LoongArch SX. */ -namespace fallback {} // namespace fallback +namespace lsx {} // namespace lsx } // namespace simdutf -/* begin file src/simdutf/fallback/implementation.h */ -#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H -#define SIMDUTF_FALLBACK_IMPLEMENTATION_H +/* begin file src/simdutf/lsx/implementation.h */ +#ifndef SIMDUTF_LSX_IMPLEMENTATION_H +#define SIMDUTF_LSX_IMPLEMENTATION_H namespace simdutf { -namespace fallback { +namespace lsx { namespace { using namespace simdutf; @@ -6455,8 +6454,8 @@ using namespace simdutf; class implementation final : public simdutf::implementation { public: simdutf_really_inline implementation() - : simdutf::implementation("fallback", "Generic fallback implementation", - 0) {} + : simdutf::implementation("lsx", "LOONGARCH SX", + internal::instruction_set::LSX) {} simdutf_warn_unused int detect_encodings(const char *input, size_t length) const noexcept final; simdutf_warn_unused bool validate_utf8(const char *buf, @@ -6541,12 +6540,6 @@ class implementation final : public simdutf::implementation { const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_utf32_to_utf8( - const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused result convert_utf32_to_utf8_with_errors( - const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; - simdutf_warn_unused size_t convert_valid_utf32_to_utf8( - const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf32_to_latin1(const char32_t *buf, size_t len, char *latin1_output) const noexcept final; @@ -6556,6 +6549,12 @@ class implementation final : public simdutf::implementation { simdutf_warn_unused size_t convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; simdutf_warn_unused size_t convert_utf32_to_utf16le(const char32_t *buf, size_t len, char16_t *utf16_buffer) const noexcept final; @@ -6630,3980 +6629,6754 @@ class implementation final : public simdutf::implementation { utf8_length_from_latin1(const char *input, size_t length) const noexcept; simdutf_warn_unused size_t maximal_binary_length_from_base64( const char *input, size_t length) const noexcept; - simdutf_warn_unused result base64_to_binary( - const char *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept; - simdutf_warn_unused full_result base64_to_binary_details( - const char *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options = - last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused result + base64_to_binary(const char *input, size_t length, char *output, + base64_options options) const noexcept; simdutf_warn_unused size_t maximal_binary_length_from_base64( const char16_t *input, size_t length) const noexcept; - simdutf_warn_unused result 
base64_to_binary( - const char16_t *input, size_t length, char *output, - base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept; + simdutf_warn_unused result + base64_to_binary(const char16_t *input, size_t length, char *output, + base64_options options) const noexcept; simdutf_warn_unused size_t base64_length_from_binary( size_t length, base64_options options) const noexcept; - simdutf_warn_unused full_result base64_to_binary_details( + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept; + + simdutf_warn_unused virtual result + base64_to_binary(const char *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused virtual full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused virtual result + base64_to_binary(const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused virtual full_result base64_to_binary_details( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options = last_chunk_handling_options::loose) const noexcept; - size_t binary_to_base64(const char *input, size_t length, char *output, - base64_options options) const noexcept; }; -} // namespace fallback + +} // namespace lsx } // namespace simdutf -#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H -/* end file src/simdutf/fallback/implementation.h */ +#endif // SIMDUTF_LSX_IMPLEMENTATION_H +/* end file src/simdutf/lsx/implementation.h */ -/* begin file src/simdutf/fallback/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "fallback" -// #define SIMDUTF_IMPLEMENTATION fallback -/* end file src/simdutf/fallback/begin.h */ +/* begin file src/simdutf/lsx/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "lsx" +// #define SIMDUTF_IMPLEMENTATION lsx +/* end file src/simdutf/lsx/begin.h */ // Declarations -/* begin file src/simdutf/fallback/bitmanipulation.h */ -#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H -#define SIMDUTF_FALLBACK_BITMANIPULATION_H +/* begin file src/simdutf/lsx/intrinsics.h */ +#ifndef SIMDUTF_LSX_INTRINSICS_H +#define SIMDUTF_LSX_INTRINSICS_H + + +// This should be the correct header whether +// you use visual studio or other compilers. 
+#include + +#endif // SIMDUTF_LSX_INTRINSICS_H +/* end file src/simdutf/lsx/intrinsics.h */ +/* begin file src/simdutf/lsx/bitmanipulation.h */ +#ifndef SIMDUTF_LSX_BITMANIPULATION_H +#define SIMDUTF_LSX_BITMANIPULATION_H #include namespace simdutf { -namespace fallback { -namespace {} // unnamed namespace -} // namespace fallback -} // namespace simdutf +namespace lsx { +namespace { -#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H -/* end file src/simdutf/fallback/bitmanipulation.h */ +simdutf_really_inline int count_ones(uint64_t input_num) { + return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__lsx_vreplgr2vr_d(input_num)), 0); +} -/* begin file src/simdutf/fallback/end.h */ -/* end file src/simdutf/fallback/end.h */ +#if SIMDUTF_NEED_TRAILING_ZEROES +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { + return __builtin_ctzll(input_num); +} +#endif -#endif // SIMDUTF_IMPLEMENTATION_FALLBACK -#endif // SIMDUTF_FALLBACK_H -/* end file src/simdutf/fallback.h */ +} // unnamed namespace +} // namespace lsx +} // namespace simdutf -/* begin file src/scalar/utf8.h */ -#ifndef SIMDUTF_UTF8_H -#define SIMDUTF_UTF8_H +#endif // SIMDUTF_LSX_BITMANIPULATION_H +/* end file src/simdutf/lsx/bitmanipulation.h */ +/* begin file src/simdutf/lsx/simd.h */ +#ifndef SIMDUTF_LSX_SIMD_H +#define SIMDUTF_LSX_SIMD_H + +#include namespace simdutf { -namespace scalar { +namespace lsx { namespace { -namespace utf8 { -#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_RVV -// only used by the fallback kernel. -// credit: based on code from Google Fuchsia (Apache Licensed) -inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept { - const uint8_t *data = reinterpret_cast(buf); - uint64_t pos = 0; - uint32_t code_point = 0; - while (pos < len) { - // check of the next 16 bytes are ascii. 
- uint64_t next_pos = pos + 16; - if (next_pos <= - len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - std::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - pos = next_pos; - continue; - } - } - unsigned char byte = data[pos]; +namespace simd { - while (byte < 0b10000000) { - if (++pos == len) { - return true; - } - byte = data[pos]; - } +template struct simd8; - if ((byte & 0b11100000) == 0b11000000) { - next_pos = pos + 2; - if (next_pos > len) { - return false; - } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return false; - } - // range check - code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if ((code_point < 0x80) || (0x7ff < code_point)) { - return false; - } - } else if ((byte & 0b11110000) == 0b11100000) { - next_pos = pos + 3; - if (next_pos > len) { - return false; - } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return false; - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return false; - } - // range check - code_point = (byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if ((code_point < 0x800) || (0xffff < code_point) || - (0xd7ff < code_point && code_point < 0xe000)) { - return false; - } - } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 - next_pos = pos + 4; - if (next_pos > len) { - return false; - } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return false; - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return false; - } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { - return false; - } - // range check - code_point = - (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff || 0x10ffff < code_point) { - return false; - } - } else { - // we may have a continuation - return false; - } - pos = next_pos; - } - return true; -} -#endif +// +// Base class of simd8 and simd8, both of which use __m128i +// internally. +// +template > struct base_u8 { + __m128i value; + static const int SIZE = sizeof(value); -inline simdutf_warn_unused result validate_with_errors(const char *buf, - size_t len) noexcept { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - uint32_t code_point = 0; - while (pos < len) { - // check of the next 16 bytes are ascii. 
- size_t next_pos = pos + 16; - if (next_pos <= - len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - std::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - pos = next_pos; - continue; - } - } - unsigned char byte = data[pos]; + // Conversion from/to SIMD register + simdutf_really_inline base_u8(const __m128i _value) : value(_value) {} + simdutf_really_inline operator const __m128i &() const { return this->value; } + simdutf_really_inline operator __m128i &() { return this->value; } + simdutf_really_inline T first() const { + return __lsx_vpickve2gr_bu(this->value, 0); + } + simdutf_really_inline T last() const { + return __lsx_vpickve2gr_bu(this->value, 15); + } - while (byte < 0b10000000) { - if (++pos == len) { - return result(error_code::SUCCESS, len); - } - byte = data[pos]; - } + // Bit operations + simdutf_really_inline simd8 operator|(const simd8 other) const { + return __lsx_vor_v(this->value, other); + } + simdutf_really_inline simd8 operator&(const simd8 other) const { + return __lsx_vand_v(this->value, other); + } + simdutf_really_inline simd8 operator^(const simd8 other) const { + return __lsx_vxor_v(this->value, other); + } + simdutf_really_inline simd8 bit_andnot(const simd8 other) const { + return __lsx_vandn_v(this->value, other); + } + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } + simdutf_really_inline simd8 &operator|=(const simd8 other) { + auto this_cast = static_cast *>(this); + *this_cast = *this_cast | other; + return *this_cast; + } + simdutf_really_inline simd8 &operator&=(const simd8 other) { + auto this_cast = static_cast *>(this); + *this_cast = *this_cast & other; + return *this_cast; + } + simdutf_really_inline simd8 &operator^=(const simd8 other) { + auto this_cast = static_cast *>(this); + *this_cast = *this_cast ^ other; + return *this_cast; + } - if ((byte & 0b11100000) == 0b11000000) { - next_pos = pos + 2; - if (next_pos > len) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - // range check - code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if ((code_point < 0x80) || (0x7ff < code_point)) { - return result(error_code::OVERLONG, pos); - } - } else if ((byte & 0b11110000) == 0b11100000) { - next_pos = pos + 3; - if (next_pos > len) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - // range check - code_point = (byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if ((code_point < 0x800) || (0xffff < code_point)) { - return result(error_code::OVERLONG, pos); - } - if (0xd7ff < code_point && code_point < 0xe000) { - return result(error_code::SURROGATE, pos); - } - } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 - next_pos = pos + 4; - if (next_pos > len) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { - return 
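Aside (illustrative, not part of the patch): the relocated scalar routines above ultimately back simdutf's public validators; a short usage sketch, assuming the long-standing `simdutf::validate_utf8` / `simdutf::validate_utf8_with_errors` entry points are unchanged by this update:

```cpp
// Sketch only: exercising the public UTF-8 validation API.
#include <cstdio>
#include <string>
#include "simdutf.h"

int main() {
  const std::string good = "na\xc3\xafve"; // "naïve", valid UTF-8
  const std::string bad = "\xe2\x28\xa1";  // broken three-byte sequence

  std::printf("good valid: %d\n",
              simdutf::validate_utf8(good.data(), good.size()));

  simdutf::result r =
      simdutf::validate_utf8_with_errors(bad.data(), bad.size());
  if (r.error != simdutf::error_code::SUCCESS) {
    std::printf("bad input rejected at byte %zu\n", r.count);
  }
  return 0;
}
```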
result(error_code::TOO_SHORT, pos); - } - // range check - code_point = - (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff) { - return result(error_code::OVERLONG, pos); - } - if (0x10ffff < code_point) { - return result(error_code::TOO_LARGE, pos); - } - } else { - // we either have too many continuation bytes or an invalid leading byte - if ((byte & 0b11000000) == 0b10000000) { - return result(error_code::TOO_LONG, pos); - } else { - return result(error_code::HEADER_BITS, pos); - } - } - pos = next_pos; + friend simdutf_really_inline Mask operator==(const simd8 lhs, + const simd8 rhs) { + return __lsx_vseq_b(lhs, rhs); } - return result(error_code::SUCCESS, len); -} -// Finds the previous leading byte starting backward from buf and validates with -// errors from there Used to pinpoint the location of an error when an invalid -// chunk is detected We assume that the stream starts with a leading byte, and -// to check that it is the case, we ask that you pass a pointer to the start of -// the stream (start). -inline simdutf_warn_unused result rewind_and_validate_with_errors( - const char *start, const char *buf, size_t len) noexcept { - // First check that we start with a leading byte - if ((*start & 0b11000000) == 0b10000000) { - return result(error_code::TOO_LONG, 0); + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + return __lsx_vor_v(__lsx_vbsll_v(this->value, N), + __lsx_vbsrl_v(prev_chunk.value, 16 - N)); } - size_t extra_len{0}; - // A leading byte cannot be further than 4 bytes away - for (int i = 0; i < 5; i++) { - unsigned char byte = *buf; - if ((byte & 0b11000000) != 0b10000000) { - break; - } else { - buf--; - extra_len++; - } +}; + +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base_u8 { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + static simdutf_really_inline simd8 splat(bool _value) { + return __lsx_vreplgr2vr_b(uint8_t(-(!!_value))); } - result res = validate_with_errors(buf, len + extra_len); - res.count -= extra_len; - return res; -} + simdutf_really_inline simd8(const __m128i _value) : base_u8(_value) {} + // False constructor + simdutf_really_inline simd8() : simd8(__lsx_vldi(0)) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : simd8(splat(_value)) {} + simdutf_really_inline void store(uint8_t dst[16]) const { + return __lsx_vst(this->value, dst, 0); + } -inline size_t count_code_points(const char *buf, size_t len) { - const int8_t *p = reinterpret_cast(buf); - size_t counter{0}; - for (size_t i = 0; i < len; i++) { - // -65 is 0b10111111, anything larger in two-complement's should start a new - // code point. 
- if (p[i] > -65) { - counter++; - } + simdutf_really_inline uint32_t to_bitmask() const { + return __lsx_vpickve2gr_wu(__lsx_vmsknz_b(*this), 0); } - return counter; -} -inline size_t utf16_length_from_utf8(const char *buf, size_t len) { - const int8_t *p = reinterpret_cast(buf); - size_t counter{0}; - for (size_t i = 0; i < len; i++) { - if (p[i] > -65) { - counter++; - } - if (uint8_t(p[i]) >= 240) { - counter++; - } + simdutf_really_inline bool any() const { + return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0) != 0; } - return counter; -} + simdutf_really_inline bool none() const { + return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0) == 0; + } + simdutf_really_inline bool all() const { + return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(*this), 0) == 0xFFFF; + } +}; -simdutf_warn_unused inline size_t trim_partial_utf8(const char *input, - size_t length) { - if (length < 3) { - switch (length) { - case 2: - if (uint8_t(input[length - 1]) >= 0xc0) { - return length - 1; - } // 2-, 3- and 4-byte characters with only 1 byte left - if (uint8_t(input[length - 2]) >= 0xe0) { - return length - 2; - } // 3- and 4-byte characters with only 2 bytes left - return length; - case 1: - if (uint8_t(input[length - 1]) >= 0xc0) { - return length - 1; - } // 2-, 3- and 4-byte characters with only 1 byte left - return length; - case 0: - return length; - } +// Unsigned bytes +template <> struct simd8 : base_u8 { + static simdutf_really_inline simd8 splat(uint8_t _value) { + return __lsx_vreplgr2vr_b(_value); } - if (uint8_t(input[length - 1]) >= 0xc0) { - return length - 1; - } // 2-, 3- and 4-byte characters with only 1 byte left - if (uint8_t(input[length - 2]) >= 0xe0) { - return length - 2; - } // 3- and 4-byte characters with only 1 byte left - if (uint8_t(input[length - 3]) >= 0xf0) { - return length - 3; - } // 4-byte characters with only 3 bytes left - return length; -} + static simdutf_really_inline simd8 zero() { return __lsx_vldi(0); } + static simdutf_really_inline simd8 load(const uint8_t *values) { + return __lsx_vld(values, 0); + } + simdutf_really_inline simd8(const __m128i _value) + : base_u8(_value) {} + // Zero constructor + simdutf_really_inline simd8() : simd8(zero()) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[16]) : simd8(load(values)) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Member-by-member initialization -} // namespace utf8 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15) + : simd8((__m128i)v16u8{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15}) {} -#endif -/* end file src/scalar/utf8.h */ -/* begin file src/scalar/utf16.h */ -#ifndef SIMDUTF_UTF16_H -#define SIMDUTF_UTF16_H + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 + repeat_16(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, + uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, + uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); + } -namespace simdutf { -namespace scalar { -namespace { -namespace utf16 { + // Store to array + simdutf_really_inline void 
store(uint8_t dst[16]) const { + return __lsx_vst(this->value, dst, 0); + } -inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) { - return uint16_t((word >> 8) | (word << 8)); -} + // Saturated math + simdutf_really_inline simd8 + saturating_add(const simd8 other) const { + return __lsx_vsadd_bu(this->value, other); + } + simdutf_really_inline simd8 + saturating_sub(const simd8 other) const { + return __lsx_vssub_bu(this->value, other); + } -template -inline simdutf_warn_unused bool validate(const char16_t *buf, - size_t len) noexcept { - const uint16_t *data = reinterpret_cast(buf); - uint64_t pos = 0; - while (pos < len) { - uint16_t word = - !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos]; - if ((word & 0xF800) == 0xD800) { - if (pos + 1 >= len) { - return false; - } - uint16_t diff = uint16_t(word - 0xD800); - if (diff > 0x3FF) { - return false; - } - uint16_t next_word = - !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if (diff2 > 0x3FF) { - return false; - } - pos += 2; - } else { - pos++; - } + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 + operator+(const simd8 other) const { + return __lsx_vadd_b(this->value, other); + } + simdutf_really_inline simd8 + operator-(const simd8 other) const { + return __lsx_vsub_b(this->value, other); + } + simdutf_really_inline simd8 &operator+=(const simd8 other) { + *this = *this + other; + return *this; + } + simdutf_really_inline simd8 &operator-=(const simd8 other) { + *this = *this - other; + return *this; } - return true; -} -template -inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, - size_t len) noexcept { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - while (pos < len) { - uint16_t word = - !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos]; - if ((word & 0xF800) == 0xD800) { - if (pos + 1 >= len) { - return result(error_code::SURROGATE, pos); - } - uint16_t diff = uint16_t(word - 0xD800); - if (diff > 0x3FF) { - return result(error_code::SURROGATE, pos); - } - uint16_t next_word = - !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if (diff2 > 0x3FF) { - return result(error_code::SURROGATE, pos); - } - pos += 2; + // Order-specific operations + simdutf_really_inline simd8 + max_val(const simd8 other) const { + return __lsx_vmax_bu(*this, other); + } + simdutf_really_inline simd8 + min_val(const simd8 other) const { + return __lsx_vmin_bu(*this, other); + } + simdutf_really_inline simd8 + operator<=(const simd8 other) const { + return __lsx_vsle_bu(*this, other); + } + simdutf_really_inline simd8 + operator>=(const simd8 other) const { + return __lsx_vsle_bu(other, *this); + } + simdutf_really_inline simd8 + operator<(const simd8 other) const { + return __lsx_vslt_bu(*this, other); + } + simdutf_really_inline simd8 + operator>(const simd8 other) const { + return __lsx_vslt_bu(other, *this); + } + // Same as >, but instead of guaranteeing all 1's == true, false = 0 and true + // = nonzero. For ARM, returns all 1's. + simdutf_really_inline simd8 + gt_bits(const simd8 other) const { + return simd8(*this > other); + } + // Same as <, but instead of guaranteeing all 1's == true, false = 0 and true + // = nonzero. For ARM, returns all 1's. 
+ simdutf_really_inline simd8 + lt_bits(const simd8 other) const { + return simd8(*this < other); + } + + // Bit-specific operations + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { + return __lsx_vslt_bu(__lsx_vldi(0), __lsx_vand_v(this->value, bits)); + } + simdutf_really_inline bool is_ascii() const { + return __lsx_vpickve2gr_hu(__lsx_vmskgez_b(this->value), 0) == 0xFFFF; + } + + simdutf_really_inline bool any_bits_set_anywhere() const { + return __lsx_vpickve2gr_hu(__lsx_vmsknz_b(this->value), 0) > 0; + } + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { + return (*this & bits).any_bits_set_anywhere(); + } + template simdutf_really_inline simd8 shr() const { + return __lsx_vsrli_b(this->value, N); + } + template simdutf_really_inline simd8 shl() const { + return __lsx_vslli_b(this->value, N); + } + + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + + template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); + } + + template + simdutf_really_inline simd8 + apply_lookup_16_to(const simd8 original) const { + __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f)); + return __lsx_vshuf_b(__lsx_vldi(0), *this, simd8(original_tmp)); + } +}; + +// Signed bytes +template <> struct simd8 { + __m128i value; + + static simdutf_really_inline simd8 splat(int8_t _value) { + return __lsx_vreplgr2vr_b(_value); + } + static simdutf_really_inline simd8 zero() { return __lsx_vldi(0); } + static simdutf_really_inline simd8 load(const int8_t values[16]) { + return __lsx_vld(values, 0); + } + + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *p) const { + __m128i zero = __lsx_vldi(0); + if (match_system(big_endian)) { + __lsx_vst(__lsx_vilvl_b(zero, (__m128i)this->value), + reinterpret_cast(p), 0); + __lsx_vst(__lsx_vilvh_b(zero, (__m128i)this->value), + reinterpret_cast(p + 8), 0); } else { - pos++; + __lsx_vst(__lsx_vilvl_b((__m128i)this->value, zero), + reinterpret_cast(p), 0); + __lsx_vst(__lsx_vilvh_b((__m128i)this->value, zero), + reinterpret_cast(p + 8), 0); } } - return result(error_code::SUCCESS, pos); -} -template -inline size_t count_code_points(const char16_t *buf, size_t len) { - // We are not BOM aware. - const uint16_t *p = reinterpret_cast(buf); - size_t counter{0}; - for (size_t i = 0; i < len; i++) { - uint16_t word = !match_system(big_endian) ? 
swap_bytes(p[i]) : p[i]; - counter += ((word & 0xFC00) != 0xDC00); + simdutf_really_inline void store_ascii_as_utf32(char32_t *p) const { + __m128i zero = __lsx_vldi(0); + __m128i in16low = __lsx_vilvl_b(zero, (__m128i)this->value); + __m128i in16high = __lsx_vilvh_b(zero, (__m128i)this->value); + __m128i in32_0 = __lsx_vilvl_h(zero, in16low); + __m128i in32_1 = __lsx_vilvh_h(zero, in16low); + __m128i in32_2 = __lsx_vilvl_h(zero, in16high); + __m128i in32_3 = __lsx_vilvh_h(zero, in16high); + __lsx_vst(in32_0, reinterpret_cast(p), 0); + __lsx_vst(in32_1, reinterpret_cast(p + 4), 0); + __lsx_vst(in32_2, reinterpret_cast(p + 8), 0); + __lsx_vst(in32_3, reinterpret_cast(p + 12), 0); } - return counter; -} -template -inline size_t utf8_length_from_utf16(const char16_t *buf, size_t len) { - // We are not BOM aware. - const uint16_t *p = reinterpret_cast(buf); - size_t counter{0}; - for (size_t i = 0; i < len; i++) { - uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; - counter++; // ASCII - counter += static_cast( - word > - 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes - counter += static_cast((word > 0x7FF && word <= 0xD7FF) || - (word >= 0xE000)); // three-byte - } - return counter; -} + // In places where the table can be reused, which is most uses in simdutf, it + // is worth it to do 4 table lookups, as there is no direct zero extension + // from u8 to u32. + simdutf_really_inline void store_ascii_as_utf32_tbl(char32_t *p) const { + const simd8 tb1{0, 255, 255, 255, 1, 255, 255, 255, + 2, 255, 255, 255, 3, 255, 255, 255}; + const simd8 tb2{4, 255, 255, 255, 5, 255, 255, 255, + 6, 255, 255, 255, 7, 255, 255, 255}; + const simd8 tb3{8, 255, 255, 255, 9, 255, 255, 255, + 10, 255, 255, 255, 11, 255, 255, 255}; + const simd8 tb4{12, 255, 255, 255, 13, 255, 255, 255, + 14, 255, 255, 255, 15, 255, 255, 255}; -template -inline size_t utf32_length_from_utf16(const char16_t *buf, size_t len) { - // We are not BOM aware. - const uint16_t *p = reinterpret_cast(buf); - size_t counter{0}; - for (size_t i = 0; i < len; i++) { - uint16_t word = !match_system(big_endian) ? 
swap_bytes(p[i]) : p[i]; - counter += ((word & 0xFC00) != 0xDC00); + // encourage store pairing and interleaving + const auto shuf1 = this->apply_lookup_16_to(tb1); + const auto shuf2 = this->apply_lookup_16_to(tb2); + shuf1.store(reinterpret_cast(p)); + shuf2.store(reinterpret_cast(p + 4)); + + const auto shuf3 = this->apply_lookup_16_to(tb3); + const auto shuf4 = this->apply_lookup_16_to(tb4); + shuf3.store(reinterpret_cast(p + 8)); + shuf4.store(reinterpret_cast(p + 12)); } - return counter; -} + // Conversion from/to SIMD register + simdutf_really_inline simd8(const __m128i _value) : value(_value) {} + simdutf_really_inline operator const __m128i &() const { return this->value; } -inline size_t latin1_length_from_utf16(size_t len) { return len; } + simdutf_really_inline operator const __m128i() const { return this->value; } -simdutf_really_inline void change_endianness_utf16(const char16_t *in, - size_t size, char16_t *out) { - const uint16_t *input = reinterpret_cast(in); - uint16_t *output = reinterpret_cast(out); - for (size_t i = 0; i < size; i++) { - *output++ = uint16_t(input[i] >> 8 | input[i] << 8); + simdutf_really_inline operator __m128i &() { return this->value; } + + // Zero constructor + simdutf_really_inline simd8() : simd8(zero()) {} + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t *values) : simd8(load(values)) {} + // Member-by-member initialization + + simdutf_really_inline simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, + int8_t v4, int8_t v5, int8_t v6, int8_t v7, + int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) + : simd8((__m128i)v16i8{v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15}) {} + + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 + repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, + int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); } -} -template -simdutf_warn_unused inline size_t trim_partial_utf16(const char16_t *input, - size_t length) { - if (length <= 1) { - return length; + // Store to array + simdutf_really_inline void store(int8_t dst[16]) const { + return __lsx_vst(value, dst, 0); } - uint16_t last_word = uint16_t(input[length - 1]); - last_word = !match_system(big_endian) ? 
swap_bytes(last_word) : last_word; - length -= ((last_word & 0xFC00) == 0xD800); - return length; -} -} // namespace utf16 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + simdutf_really_inline operator simd8() const { + return ((__m128i)this->value); + } -#endif -/* end file src/scalar/utf16.h */ -/* begin file src/scalar/utf32.h */ -#ifndef SIMDUTF_UTF32_H -#define SIMDUTF_UTF32_H + simdutf_really_inline simd8 + operator|(const simd8 other) const { + return __lsx_vor_v((__m128i)value, (__m128i)other.value); + } + simdutf_really_inline simd8 + operator&(const simd8 other) const { + return __lsx_vand_v((__m128i)value, (__m128i)other.value); + } + simdutf_really_inline simd8 + operator^(const simd8 other) const { + return __lsx_vxor_v((__m128i)value, (__m128i)other.value); + } + simdutf_really_inline simd8 + bit_andnot(const simd8 other) const { + return __lsx_vandn_v((__m128i)other.value, (__m128i)value); + } -namespace simdutf { -namespace scalar { -namespace { -namespace utf32 { + // Math + simdutf_really_inline simd8 + operator+(const simd8 other) const { + return __lsx_vadd_b((__m128i)value, (__m128i)other.value); + } + simdutf_really_inline simd8 + operator-(const simd8 other) const { + return __lsx_vsub_b((__m128i)value, (__m128i)other.value); + } + simdutf_really_inline simd8 &operator+=(const simd8 other) { + *this = *this + other; + return *this; + } + simdutf_really_inline simd8 &operator-=(const simd8 other) { + *this = *this - other; + return *this; + } -inline simdutf_warn_unused bool validate(const char32_t *buf, - size_t len) noexcept { - const uint32_t *data = reinterpret_cast(buf); - uint64_t pos = 0; - for (; pos < len; pos++) { - uint32_t word = data[pos]; - if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) { - return false; - } + simdutf_really_inline bool is_ascii() const { + return (__lsx_vpickve2gr_hu(__lsx_vmskgez_b((__m128i)this->value), 0) == + 0xffff); } - return true; -} -inline simdutf_warn_unused result validate_with_errors(const char32_t *buf, - size_t len) noexcept { - const uint32_t *data = reinterpret_cast(buf); - size_t pos = 0; - for (; pos < len; pos++) { - uint32_t word = data[pos]; - if (word > 0x10FFFF) { - return result(error_code::TOO_LARGE, pos); - } - if (word >= 0xD800 && word <= 0xDFFF) { - return result(error_code::SURROGATE, pos); - } + // Order-sensitive comparisons + simdutf_really_inline simd8 max_val(const simd8 other) const { + return __lsx_vmax_b((__m128i)value, (__m128i)other.value); + } + simdutf_really_inline simd8 min_val(const simd8 other) const { + return __lsx_vmin_b((__m128i)value, (__m128i)other.value); + } + simdutf_really_inline simd8 operator>(const simd8 other) const { + return __lsx_vslt_b((__m128i)other.value, (__m128i)value); + } + simdutf_really_inline simd8 operator<(const simd8 other) const { + return __lsx_vslt_b((__m128i)value, (__m128i)other.value); + } + simdutf_really_inline simd8 + operator==(const simd8 other) const { + return __lsx_vseq_b((__m128i)value, (__m128i)other.value); } - return result(error_code::SUCCESS, pos); -} -inline size_t utf8_length_from_utf32(const char32_t *buf, size_t len) { - // We are not BOM aware. 
- const uint32_t *p = reinterpret_cast(buf); - size_t counter{0}; - for (size_t i = 0; i < len; i++) { - // credit: @ttsugriy for the vectorizable approach - counter++; // ASCII - counter += static_cast(p[i] > 0x7F); // two-byte - counter += static_cast(p[i] > 0x7FF); // three-byte - counter += static_cast(p[i] > 0xFFFF); // four-bytes + template + simdutf_really_inline simd8 + prev(const simd8 prev_chunk) const { + return __lsx_vor_v(__lsx_vbsll_v(this->value, N), + __lsx_vbsrl_v(prev_chunk.value, 16 - N)); } - return counter; -} -inline size_t utf16_length_from_utf32(const char32_t *buf, size_t len) { - // We are not BOM aware. - const uint32_t *p = reinterpret_cast(buf); - size_t counter{0}; - for (size_t i = 0; i < len; i++) { - counter++; // non-surrogate word - counter += static_cast(p[i] > 0xFFFF); // surrogate pair + // Perform a lookup assuming no value is larger than 16 + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + return lookup_table.apply_lookup_16_to(*this); + } + template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); } - return counter; -} -inline size_t latin1_length_from_utf32(size_t len) { - // We are not BOM aware. - return len; // a utf32 codepoint will always represent 1 latin1 character -} + template + simdutf_really_inline simd8 + apply_lookup_16_to(const simd8 original) const { + __m128i original_tmp = __lsx_vand_v(original, __lsx_vldi(0x1f)); + return __lsx_vshuf_b(__lsx_vldi(0), (__m128i)this->value, + simd8(original_tmp)); + } +}; -inline simdutf_warn_unused uint32_t swap_bytes(const uint32_t word) { - return ((word >> 24) & 0xff) | // move byte 3 to byte 0 - ((word << 8) & 0xff0000) | // move byte 1 to byte 2 - ((word >> 8) & 0xff00) | // move byte 2 to byte 1 - ((word << 24) & 0xff000000); // byte 0 to byte 3 -} +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert( + NUM_CHUNKS == 4, + "LoongArch kernel should use four registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; -} // namespace utf32 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed -#endif -/* end file src/scalar/utf32.h */ -/* begin file src/scalar/base64.h */ -#ifndef SIMDUTF_BASE64_H -#define SIMDUTF_BASE64_H + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1, + const simd8 chunk2, const simd8 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd8x64(const T *ptr) + : chunks{simd8::load(ptr), + simd8::load(ptr + sizeof(simd8) / sizeof(T)), + simd8::load(ptr + 2 * sizeof(simd8) / sizeof(T)), + simd8::load(ptr + 3 * sizeof(simd8) / sizeof(T))} {} -#include -#include -#include -#include + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd8) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd8) * 1 / sizeof(T)); + this->chunks[2].store(ptr + sizeof(simd8) * 2 / sizeof(T)); + 
this->chunks[3].store(ptr + sizeof(simd8) * 3 / sizeof(T)); + } -namespace simdutf { -namespace scalar { -namespace { -namespace base64 { + simdutf_really_inline simd8x64 &operator|=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + this->chunks[2] |= other.chunks[2]; + this->chunks[3] |= other.chunks[3]; + return *this; + } -// This function is not expected to be fast. Do not use in long loops. -template bool is_ascii_white_space(char_type c) { - return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f'; -} + simdutf_really_inline simd8 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); + } -template bool is_ascii_white_space_or_padding(char_type c) { - return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || - c == '='; -} + simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); } -template bool is_eight_byte(char_type c) { - if (sizeof(char_type) == 1) { - return true; + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 0); + this->chunks[1].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 1); + this->chunks[2].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 2); + this->chunks[3].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 3); } - return uint8_t(c) == c; -} -// Returns true upon success. The destination buffer must be large enough. -// This functions assumes that the padding (=) has been removed. -template -full_result -base64_tail_decode(char *dst, const char_type *src, size_t length, - size_t padded_characters, // number of padding characters - // '=', typically 0, 1, 2. - base64_options options, - last_chunk_handling_options last_chunk_options) { - // This looks like 5 branches, but we expect the compiler to resolve this to a - // single branch: - const uint8_t *to_base64 = (options & base64_url) - ? tables::base64::to_base64_url_value - : tables::base64::to_base64_value; - const uint32_t *d0 = (options & base64_url) - ? tables::base64::base64_url::d0 - : tables::base64::base64_default::d0; - const uint32_t *d1 = (options & base64_url) - ? tables::base64::base64_url::d1 - : tables::base64::base64_default::d1; - const uint32_t *d2 = (options & base64_url) - ? tables::base64::base64_url::d2 - : tables::base64::base64_default::d2; - const uint32_t *d3 = (options & base64_url) - ? 
tables::base64::base64_url::d3 - : tables::base64::base64_default::d3; + simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { + this->chunks[0].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 0); + this->chunks[1].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 1); + this->chunks[2].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 2); + this->chunks[3].store_ascii_as_utf32_tbl(ptr + sizeof(simd8) * 3); + } - const char_type *srcend = src + length; - const char_type *srcinit = src; - const char *dstinit = dst; + simdutf_really_inline uint64_t to_bitmask() const { + __m128i mask = __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[3]), 6); + mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[2]), 4)); + mask = __lsx_vor_v(mask, __lsx_vbsll_v(__lsx_vmsknz_b(this->chunks[1]), 2)); + mask = __lsx_vor_v(mask, __lsx_vmsknz_b(this->chunks[0])); + return __lsx_vpickve2gr_du(mask, 0); + } - uint32_t x; - size_t idx; - uint8_t buffer[4]; - while (true) { - while (src + 4 <= srcend && is_eight_byte(src[0]) && - is_eight_byte(src[1]) && is_eight_byte(src[2]) && - is_eight_byte(src[3]) && - (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] | - d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) { - if (match_system(endianness::BIG)) { - x = scalar::utf32::swap_bytes(x); - } - std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes - dst += 3; - src += 4; - } - idx = 0; - // we need at least four characters. - while (idx < 4 && src < srcend) { - char_type c = *src; - uint8_t code = to_base64[uint8_t(c)]; - buffer[idx] = uint8_t(code); - if (is_eight_byte(c) && code <= 63) { - idx++; - } else if (code > 64 || !scalar::base64::is_eight_byte(c)) { - return {INVALID_BASE64_CHARACTER, size_t(src - srcinit), - size_t(dst - dstinit)}; - } else { - // We have a space or a newline. We ignore it. 
- } - src++; - } - if (idx != 4) { - if (last_chunk_options == last_chunk_handling_options::strict && - (idx != 1) && ((idx + padded_characters) & 3) != 0) { - // The partial chunk was at src - idx - return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), - size_t(dst - dstinit)}; - } else if (last_chunk_options == - last_chunk_handling_options::stop_before_partial && - (idx != 1) && ((idx + padded_characters) & 3) != 0) { - // Rewind src to before partial chunk - src -= idx; - return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)}; - } else { - if (idx == 2) { - uint32_t triple = - (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6); - if ((last_chunk_options == last_chunk_handling_options::strict) && - (triple & 0xffff)) { - return {BASE64_EXTRA_BITS, size_t(src - srcinit), - size_t(dst - dstinit)}; - } - if (match_system(endianness::BIG)) { - triple <<= 8; - std::memcpy(dst, &triple, 1); - } else { - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 1); - } - dst += 1; - } else if (idx == 3) { - uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) + - (uint32_t(buffer[1]) << 2 * 6) + - (uint32_t(buffer[2]) << 1 * 6); - if ((last_chunk_options == last_chunk_handling_options::strict) && - (triple & 0xff)) { - return {BASE64_EXTRA_BITS, size_t(src - srcinit), - size_t(dst - dstinit)}; - } - if (match_system(endianness::BIG)) { - triple <<= 8; - std::memcpy(dst, &triple, 2); - } else { - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 2); - } - dst += 2; - } else if (idx == 1) { - return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), - size_t(dst - dstinit)}; - } - return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)}; - } - } - - uint32_t triple = - (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) + - (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6); - if (match_system(endianness::BIG)) { - triple <<= 8; - std::memcpy(dst, &triple, 3); - } else { - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 3); - } - dst += 3; + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] == mask, this->chunks[1] == mask, + this->chunks[2] == mask, this->chunks[3] == mask) + .to_bitmask(); } -} -// like base64_tail_decode, but it will not write past the end of the output -// buffer. The outlen paramter is modified to reflect the number of bytes -// written. This functions assumes that the padding (=) has been removed. -template -result base64_tail_decode_safe( - char *dst, size_t &outlen, const char_type *&srcr, size_t length, - size_t padded_characters, // number of padding characters '=', typically 0, - // 1, 2. - base64_options options, last_chunk_handling_options last_chunk_options) { - const char_type *src = srcr; - if (length == 0) { - outlen = 0; - return {SUCCESS, 0}; + simdutf_really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] <= mask, this->chunks[1] <= mask, + this->chunks[2] <= mask, this->chunks[3] <= mask) + .to_bitmask(); } - // This looks like 5 branches, but we expect the compiler to resolve this to a - // single branch: - const uint8_t *to_base64 = (options & base64_url) - ? tables::base64::to_base64_url_value - : tables::base64::to_base64_value; - const uint32_t *d0 = (options & base64_url) - ? 
tables::base64::base64_url::d0 - : tables::base64::base64_default::d0; - const uint32_t *d1 = (options & base64_url) - ? tables::base64::base64_url::d1 - : tables::base64::base64_default::d1; - const uint32_t *d2 = (options & base64_url) - ? tables::base64::base64_url::d2 - : tables::base64::base64_default::d2; - const uint32_t *d3 = (options & base64_url) - ? tables::base64::base64_url::d3 - : tables::base64::base64_default::d3; - const char_type *srcend = src + length; - const char_type *srcinit = src; - const char *dstinit = dst; - const char *dstend = dst + outlen; + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); - uint32_t x; - size_t idx; - uint8_t buffer[4]; - while (true) { - while (src + 4 <= srcend && is_eight_byte(src[0]) && - is_eight_byte(src[1]) && is_eight_byte(src[2]) && - is_eight_byte(src[3]) && - (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] | - d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) { - if (dstend - dst < 3) { - outlen = size_t(dst - dstinit); - srcr = src; - return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)}; - } - if (match_system(endianness::BIG)) { - x = scalar::utf32::swap_bytes(x); - } - std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes - dst += 3; - src += 4; - } - idx = 0; - const char_type *srccur = src; - // We need at least four characters. - while (idx < 4 && src < srcend) { - char_type c = *src; - uint8_t code = to_base64[uint8_t(c)]; + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)) + .to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + return simd8x64( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), + (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), + (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)) + .to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] < mask, this->chunks[1] < mask, + this->chunks[2] < mask, this->chunks[3] < mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] > mask, this->chunks[1] > mask, + this->chunks[2] > mask, this->chunks[3] > mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t gteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] >= mask, this->chunks[1] >= mask, + this->chunks[2] >= mask, this->chunks[3] >= mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(simd8(this->chunks[0].value) >= mask, + simd8(this->chunks[1].value) >= mask, + simd8(this->chunks[2].value) >= mask, + simd8(this->chunks[3].value) >= mask) + .to_bitmask(); + } +}; // struct simd8x64 +/* begin file src/simdutf/lsx/simd16-inl.h */ +template struct simd16; - buffer[idx] = uint8_t(code); - if (is_eight_byte(c) && code <= 63) { - idx++; - } else if (code > 64 || 
!scalar::base64::is_eight_byte(c)) { - outlen = size_t(dst - dstinit); - srcr = src; - return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; - } else { - // We have a space or a newline. We ignore it. - } - src++; - } - if (idx != 4) { - if (last_chunk_options == last_chunk_handling_options::strict && - ((idx + padded_characters) & 3) != 0) { - outlen = size_t(dst - dstinit); - srcr = src; - return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)}; - } else if (last_chunk_options == - last_chunk_handling_options::stop_before_partial && - ((idx + padded_characters) & 3) != 0) { - // Rewind src to before partial chunk - srcr = srccur; - outlen = size_t(dst - dstinit); - return {SUCCESS, size_t(dst - dstinit)}; - } else { // loose mode - if (idx == 0) { - // No data left; return success - outlen = size_t(dst - dstinit); - srcr = src; - return {SUCCESS, size_t(dst - dstinit)}; - } else if (idx == 1) { - // Error: Incomplete chunk of length 1 is invalid in loose mode - outlen = size_t(dst - dstinit); - srcr = src; - return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)}; - } else if (idx == 2 || idx == 3) { - // Check if there's enough space in the destination buffer - size_t required_space = (idx == 2) ? 1 : 2; - if (size_t(dstend - dst) < required_space) { - outlen = size_t(dst - dstinit); - srcr = src; - return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)}; - } - uint32_t triple = 0; - if (idx == 2) { - triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12); - if ((last_chunk_options == last_chunk_handling_options::strict) && - (triple & 0xffff)) { - srcr = src; - return {BASE64_EXTRA_BITS, size_t(src - srcinit)}; - } - // Extract the first byte - triple >>= 16; - dst[0] = static_cast(triple & 0xFF); - dst += 1; - } else if (idx == 3) { - triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12) + - (uint32_t(buffer[2]) << 6); - if ((last_chunk_options == last_chunk_handling_options::strict) && - (triple & 0xff)) { - srcr = src; - return {BASE64_EXTRA_BITS, size_t(src - srcinit)}; - } - // Extract the first two bytes - triple >>= 8; - dst[0] = static_cast((triple >> 8) & 0xFF); - dst[1] = static_cast(triple & 0xFF); - dst += 2; - } - outlen = size_t(dst - dstinit); - srcr = src; - return {SUCCESS, size_t(dst - dstinit)}; - } - } - } +template > struct base_u16 { + __m128i value; + static const int SIZE = sizeof(value); - if (dstend - dst < 3) { - outlen = size_t(dst - dstinit); - srcr = src; - return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)}; - } - uint32_t triple = (uint32_t(buffer[0]) << 18) + - (uint32_t(buffer[1]) << 12) + (uint32_t(buffer[2]) << 6) + - (uint32_t(buffer[3])); - if (match_system(endianness::BIG)) { - triple <<= 8; - std::memcpy(dst, &triple, 3); - } else { - triple = scalar::utf32::swap_bytes(triple); - triple >>= 8; - std::memcpy(dst, &triple, 3); - } - dst += 3; + // Conversion from/to SIMD register + simdutf_really_inline base_u16() = default; + simdutf_really_inline base_u16(const __m128i _value) : value(_value) {} + // Bit operations + simdutf_really_inline simd16 operator|(const simd16 other) const { + return __lsx_vor_v(this->value, other.value); } -} - -// Returns the number of bytes written. The destination buffer must be large -// enough. It will add padding (=) if needed. -size_t tail_encode_base64(char *dst, const char *src, size_t srclen, - base64_options options) { - // By default, we use padding if we are not using the URL variant. 
- // This is check with ((options & base64_url) == 0) which returns true if we - // are not using the URL variant. However, we also allow 'inversion' of the - // convention with the base64_reverse_padding option. If the - // base64_reverse_padding option is set, we use padding if we are using the - // URL variant, and we omit it if we are not using the URL variant. This is - // checked with - // ((options & base64_reverse_padding) == base64_reverse_padding). - bool use_padding = - ((options & base64_url) == 0) ^ - ((options & base64_reverse_padding) == base64_reverse_padding); - // This looks like 3 branches, but we expect the compiler to resolve this to - // a single branch: - const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0 - : tables::base64::base64_default::e0; - const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1 - : tables::base64::base64_default::e1; - const char *e2 = (options & base64_url) ? tables::base64::base64_url::e2 - : tables::base64::base64_default::e2; - char *out = dst; - size_t i = 0; - uint8_t t1, t2, t3; - for (; i + 2 < srclen; i += 3) { - t1 = uint8_t(src[i]); - t2 = uint8_t(src[i + 1]); - t3 = uint8_t(src[i + 2]); - *out++ = e0[t1]; - *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; - *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; - *out++ = e2[t3]; + simdutf_really_inline simd16 operator&(const simd16 other) const { + return __lsx_vand_v(this->value, other.value); } - switch (srclen - i) { - case 0: - break; - case 1: - t1 = uint8_t(src[i]); - *out++ = e0[t1]; - *out++ = e1[(t1 & 0x03) << 4]; - if (use_padding) { - *out++ = '='; - *out++ = '='; - } - break; - default: /* case 2 */ - t1 = uint8_t(src[i]); - t2 = uint8_t(src[i + 1]); - *out++ = e0[t1]; - *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; - *out++ = e2[(t2 & 0x0F) << 2]; - if (use_padding) { - *out++ = '='; - } + simdutf_really_inline simd16 operator^(const simd16 other) const { + return __lsx_vxor_v(this->value, other.value); + } + simdutf_really_inline simd16 bit_andnot(const simd16 other) const { + return __lsx_vandn_v(this->value, other.value); + } + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } + simdutf_really_inline simd16 &operator|=(const simd16 other) { + auto this_cast = static_cast *>(this); + *this_cast = *this_cast | other; + return *this_cast; + } + simdutf_really_inline simd16 &operator&=(const simd16 other) { + auto this_cast = static_cast *>(this); + *this_cast = *this_cast & other; + return *this_cast; + } + simdutf_really_inline simd16 &operator^=(const simd16 other) { + auto this_cast = static_cast *>(this); + *this_cast = *this_cast ^ other; + return *this_cast; } - return (size_t)(out - dst); -} -template -simdutf_warn_unused size_t maximal_binary_length_from_base64( - const char_type *input, size_t length) noexcept { - // We follow https://infra.spec.whatwg.org/#forgiving-base64-decode - size_t padding = 0; - if (length > 0) { - if (input[length - 1] == '=') { - padding++; - if (length > 1 && input[length - 2] == '=') { - padding++; - } - } + friend simdutf_really_inline Mask operator==(const simd16 lhs, + const simd16 rhs) { + return __lsx_vseq_h(lhs.value, rhs.value); } - size_t actual_length = length - padding; - if (actual_length % 4 <= 1) { - return actual_length / 4 * 3; + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return __lsx_vor_v(__lsx_vbsll_v(*this, N * 2), + __lsx_vbsrl_v(prev_chunk, 16 - N * 2)); } - // if we have a valid input, then the 
remainder must be 2 or 3 adding one or - // two extra bytes. - return actual_length / 4 * 3 + (actual_length % 4) - 1; -} +}; -simdutf_warn_unused size_t -base64_length_from_binary(size_t length, base64_options options) noexcept { - // By default, we use padding if we are not using the URL variant. - // This is check with ((options & base64_url) == 0) which returns true if we - // are not using the URL variant. However, we also allow 'inversion' of the - // convention with the base64_reverse_padding option. If the - // base64_reverse_padding option is set, we use padding if we are using the - // URL variant, and we omit it if we are not using the URL variant. This is - // checked with - // ((options & base64_reverse_padding) == base64_reverse_padding). - bool use_padding = - ((options & base64_url) == 0) ^ - ((options & base64_reverse_padding) == base64_reverse_padding); - if (!use_padding) { - return length / 3 * 4 + ((length % 3) ? (length % 3) + 1 : 0); +template > +struct base16 : base_u16 { + typedef uint16_t bitmask_t; + typedef uint32_t bitmask2_t; + + simdutf_really_inline base16() : base_u16() {} + simdutf_really_inline base16(const __m128i _value) : base_u16(_value) {} + template + simdutf_really_inline base16(const Pointer *ptr) + : base16(__lsx_vld(ptr, 0)) {} + + static const int SIZE = sizeof(base_u16::value); + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + return __lsx_vor_v(__lsx_vbsll_v(*this, N * 2), + __lsx_vbsrl_v(prev_chunk, 16 - N * 2)); } - return (length + 2) / 3 * - 4; // We use padding to make the length a multiple of 4. -} +}; -} // namespace base64 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd16 : base16 { + static simdutf_really_inline simd16 splat(bool _value) { + return __lsx_vreplgr2vr_h(uint16_t(-(!!_value))); + } -#endif -/* end file src/scalar/base64.h */ -/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ -#ifndef SIMDUTF_LATIN1_TO_UTF8_H -#define SIMDUTF_LATIN1_TO_UTF8_H + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const __m128i _value) : base16(_value) {} + // Splat constructor + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} +}; -namespace simdutf { -namespace scalar { -namespace { -namespace latin1_to_utf8 { +template struct base16_numeric : base16 { + static simdutf_really_inline simd16 splat(T _value) { + return __lsx_vreplgr2vr_h(_value); + } + static simdutf_really_inline simd16 zero() { return __lsx_vldi(0); } + static simdutf_really_inline simd16 load(const T values[8]) { + return __lsx_vld(reinterpret_cast(values), 0); + } -inline size_t convert(const char *buf, size_t len, char *utf8_output) { - const unsigned char *data = reinterpret_cast(buf); - size_t pos = 0; - size_t utf8_pos = 0; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= - len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | - v2}; // We are only interested in these bits: 1000 1000 1000 - // 1000, so it makes sense to concatenate everything - if ((v & 0x8080808080808080) == - 0) { // if NONE of these are set, e.g. 
all of them are zero, then - // everything is ASCII - size_t final_pos = pos + 16; - while (pos < final_pos) { - utf8_output[utf8_pos++] = char(buf[pos]); - pos++; - } - continue; - } - } + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const __m128i _value) + : base16(_value) {} - unsigned char byte = data[pos]; - if ((byte & 0x80) == 0) { // if ASCII - // will generate one UTF-8 bytes - utf8_output[utf8_pos++] = char(byte); - pos++; - } else { - // will generate two UTF-8 bytes - utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000); - utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000); - pos++; - } + // Store to array + simdutf_really_inline void store(T dst[8]) const { + return __lsx_vst(this->value, dst, 0); } - return utf8_pos; -} -inline size_t convert_safe(const char *buf, size_t len, char *utf8_output, - size_t utf8_len) { - const unsigned char *data = reinterpret_cast(buf); - size_t pos = 0; - size_t skip_pos = 0; - size_t utf8_pos = 0; - while (pos < len && utf8_pos < utf8_len) { - // try to convert the next block of 16 ASCII bytes - if (pos >= skip_pos && pos + 16 <= len && - utf8_pos + 16 <= utf8_len) { // if it is safe to read 16 more bytes, - // check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | - v2}; // We are only interested in these bits: 1000 1000 1000 - // 1000, so it makes sense to concatenate everything - if ((v & 0x8080808080808080) == - 0) { // if NONE of these are set, e.g. all of them are zero, then - // everything is ASCII - ::memcpy(utf8_output + utf8_pos, buf + pos, 16); - utf8_pos += 16; - pos += 16; - } else { - // At least one of the next 16 bytes are not ASCII, we will process them - // one by one - skip_pos = pos + 16; - } - } else { - const auto byte = data[pos]; - if ((byte & 0x80) == 0) { // if ASCII - // will generate one UTF-8 bytes - utf8_output[utf8_pos++] = char(byte); - pos++; - } else if (utf8_pos + 2 <= utf8_len) { - // will generate two UTF-8 bytes - utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000); - utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000); - pos++; - } else { - break; - } - } - } - return utf8_pos; -} - -} // namespace latin1_to_utf8 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf - -#endif -/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ - -namespace simdutf { -bool implementation::supported_by_runtime_system() const { - uint32_t required_instruction_sets = this->required_instruction_sets(); - uint32_t supported_instruction_sets = - internal::detect_supported_architectures(); - return ((supported_instruction_sets & required_instruction_sets) == - required_instruction_sets); -} + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFu; } -simdutf_warn_unused encoding_type implementation::autodetect_encoding( - const char *input, size_t length) const noexcept { - // If there is a BOM, then we trust it. - auto bom_encoding = simdutf::BOM::check_bom(input, length); - if (bom_encoding != encoding_type::unspecified) { - return bom_encoding; + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd16 operator+(const simd16 other) const { + return __lsx_vadd_b(*this, other); } - // UTF8 is common, it includes ASCII, and is commonly represented - // without a BOM, so if it fits, go with that. 
Note that it is still - // possible to get it wrong, we are only 'guessing'. If some has UTF-16 - // data without a BOM, it could pass as UTF-8. - // - // An interesting twist might be to check for UTF-16 ASCII first (every - // other byte is zero). - if (validate_utf8(input, length)) { - return encoding_type::UTF8; + simdutf_really_inline simd16 operator-(const simd16 other) const { + return __lsx_vsub_b(*this, other); } - // The next most common encoding that might appear without BOM is probably - // UTF-16LE, so try that next. - if ((length % 2) == 0) { - // important: we need to divide by two - if (validate_utf16le(reinterpret_cast(input), - length / 2)) { - return encoding_type::UTF16_LE; - } + simdutf_really_inline simd16 &operator+=(const simd16 other) { + *this = *this + other; + return *static_cast *>(this); } - if ((length % 4) == 0) { - if (validate_utf32(reinterpret_cast(input), length / 4)) { - return encoding_type::UTF32_LE; - } + simdutf_really_inline simd16 &operator-=(const simd16 other) { + *this = *this - other; + return *static_cast *>(this); } - return encoding_type::unspecified; -} - -namespace internal { -// When there is a single implementation, we should not pay a price -// for dispatching to the best implementation. We should just use the -// one we have. This is a compile-time check. -#define SIMDUTF_SINGLE_IMPLEMENTATION \ - (SIMDUTF_IMPLEMENTATION_ICELAKE + SIMDUTF_IMPLEMENTATION_HASWELL + \ - SIMDUTF_IMPLEMENTATION_WESTMERE + SIMDUTF_IMPLEMENTATION_ARM64 + \ - SIMDUTF_IMPLEMENTATION_PPC64 + SIMDUTF_IMPLEMENTATION_FALLBACK == \ - 1) - -// Static array of known implementations. We are hoping these get baked into the -// executable without requiring a static initializer. +}; -#if SIMDUTF_IMPLEMENTATION_ICELAKE -static const icelake::implementation *get_icelake_singleton() { - static const icelake::implementation icelake_singleton{}; - return &icelake_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_HASWELL -static const haswell::implementation *get_haswell_singleton() { - static const haswell::implementation haswell_singleton{}; - return &haswell_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_WESTMERE -static const westmere::implementation *get_westmere_singleton() { - static const westmere::implementation westmere_singleton{}; - return &westmere_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_ARM64 -static const arm64::implementation *get_arm64_singleton() { - static const arm64::implementation arm64_singleton{}; - return &arm64_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_PPC64 -static const ppc64::implementation *get_ppc64_singleton() { - static const ppc64::implementation ppc64_singleton{}; - return &ppc64_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_RVV -static const rvv::implementation *get_rvv_singleton() { - static const rvv::implementation rvv_singleton{}; - return &rvv_singleton; -} -#endif -#if SIMDUTF_IMPLEMENTATION_FALLBACK -static const fallback::implementation *get_fallback_singleton() { - static const fallback::implementation fallback_singleton{}; - return &fallback_singleton; -} -#endif +// Signed code unitstemplate<> +template <> struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m128i _value) + : base16_numeric(_value) {} + simdutf_really_inline simd16(simd16 other) + : base16_numeric(other.value) {} -#if SIMDUTF_SINGLE_IMPLEMENTATION -static const implementation *get_single_implementation() { - return - #if SIMDUTF_IMPLEMENTATION_ICELAKE - 
get_icelake_singleton(); - #endif - #if SIMDUTF_IMPLEMENTATION_HASWELL - get_haswell_singleton(); - #endif - #if SIMDUTF_IMPLEMENTATION_WESTMERE - get_westmere_singleton(); - #endif - #if SIMDUTF_IMPLEMENTATION_ARM64 - get_arm64_singleton(); - #endif - #if SIMDUTF_IMPLEMENTATION_PPC64 - get_ppc64_singleton(); - #endif - #if SIMDUTF_IMPLEMENTATION_FALLBACK - get_fallback_singleton(); - #endif -} -#endif + // Splat constructor + simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t *values) + : simd16(load(reinterpret_cast(values))) {} + simdutf_really_inline operator simd16() const; -/** - * @private Detects best supported implementation on first use, and sets it - */ -class detect_best_supported_implementation_on_first_use final - : public implementation { -public: - std::string name() const noexcept final { return set_best()->name(); } - std::string description() const noexcept final { - return set_best()->description(); + // Order-sensitive comparisons + simdutf_really_inline simd16 + max_val(const simd16 other) const { + return __lsx_vmax_h(this->value, other.value); } - uint32_t required_instruction_sets() const noexcept final { - return set_best()->required_instruction_sets(); + simdutf_really_inline simd16 + min_val(const simd16 other) const { + return __lsx_vmin_h(this->value, other.value); } - - simdutf_warn_unused int - detect_encodings(const char *input, size_t length) const noexcept override { - return set_best()->detect_encodings(input, length); + simdutf_really_inline simd16 + operator>(const simd16 other) const { + return __lsx_vsle_h(other.value, this->value); } - - simdutf_warn_unused bool - validate_utf8(const char *buf, size_t len) const noexcept final override { - return set_best()->validate_utf8(buf, len); + simdutf_really_inline simd16 + operator<(const simd16 other) const { + return __lsx_vslt_h(this->value, other.value); } +}; - simdutf_warn_unused result validate_utf8_with_errors( - const char *buf, size_t len) const noexcept final override { - return set_best()->validate_utf8_with_errors(buf, len); - } +// Unsigned code unitstemplate<> +template <> struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m128i _value) + : base16_numeric((__m128i)_value) {} + simdutf_really_inline simd16(simd16 other) + : base16_numeric(other.value) {} - simdutf_warn_unused bool - validate_ascii(const char *buf, size_t len) const noexcept final override { - return set_best()->validate_ascii(buf, len); - } + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t *values) + : simd16(load(reinterpret_cast(values))) {} - simdutf_warn_unused result validate_ascii_with_errors( - const char *buf, size_t len) const noexcept final override { - return set_best()->validate_ascii_with_errors(buf, len); + // Saturated math + simdutf_really_inline simd16 + saturating_add(const simd16 other) const { + return __lsx_vsadd_hu(this->value, other.value); } - - simdutf_warn_unused bool - validate_utf16le(const char16_t *buf, - size_t len) const noexcept final override { - return set_best()->validate_utf16le(buf, len); + simdutf_really_inline simd16 + saturating_sub(const simd16 
other) const { + return __lsx_vssub_hu(this->value, other.value); } - simdutf_warn_unused bool - validate_utf16be(const char16_t *buf, - size_t len) const noexcept final override { - return set_best()->validate_utf16be(buf, len); + // Order-specific operations + simdutf_really_inline simd16 + max_val(const simd16 other) const { + return __lsx_vmax_hu(this->value, other.value); } - - simdutf_warn_unused result validate_utf16le_with_errors( - const char16_t *buf, size_t len) const noexcept final override { - return set_best()->validate_utf16le_with_errors(buf, len); + simdutf_really_inline simd16 + min_val(const simd16 other) const { + return __lsx_vmin_hu(this->value, other.value); } - - simdutf_warn_unused result validate_utf16be_with_errors( - const char16_t *buf, size_t len) const noexcept final override { - return set_best()->validate_utf16be_with_errors(buf, len); + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 + gt_bits(const simd16 other) const { + return this->saturating_sub(other); } - - simdutf_warn_unused bool - validate_utf32(const char32_t *buf, - size_t len) const noexcept final override { - return set_best()->validate_utf32(buf, len); + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 + lt_bits(const simd16 other) const { + return other.saturating_sub(*this); } - - simdutf_warn_unused result validate_utf32_with_errors( - const char32_t *buf, size_t len) const noexcept final override { - return set_best()->validate_utf32_with_errors(buf, len); + simdutf_really_inline simd16 + operator<=(const simd16 other) const { + return __lsx_vsle_hu(this->value, other.value); } - - simdutf_warn_unused size_t - convert_latin1_to_utf8(const char *buf, size_t len, - char *utf8_output) const noexcept final override { - return set_best()->convert_latin1_to_utf8(buf, len, utf8_output); + simdutf_really_inline simd16 + operator>=(const simd16 other) const { + return __lsx_vsle_hu(other.value, this->value); } - - simdutf_warn_unused size_t convert_latin1_to_utf16le( - const char *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output); + simdutf_really_inline simd16 + operator>(const simd16 other) const { + return __lsx_vslt_hu(other.value, this->value); } - - simdutf_warn_unused size_t convert_latin1_to_utf16be( - const char *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output); + simdutf_really_inline simd16 + operator<(const simd16 other) const { + return __lsx_vslt_hu(this->value, other.value); } - simdutf_warn_unused size_t convert_latin1_to_utf32( - const char *buf, size_t len, - char32_t *latin1_output) const noexcept final override { - return set_best()->convert_latin1_to_utf32(buf, len, latin1_output); + // Bit-specific operations + simdutf_really_inline simd16 bits_not_set() const { + return *this == uint16_t(0); } - - simdutf_warn_unused size_t - convert_utf8_to_latin1(const char *buf, size_t len, - char *latin1_output) const noexcept final override { - return set_best()->convert_utf8_to_latin1(buf, len, latin1_output); + template simdutf_really_inline simd16 shr() const { + return simd16(__lsx_vsrli_h(this->value, N)); } - - simdutf_warn_unused result convert_utf8_to_latin1_with_errors( - const char *buf, size_t len, - char *latin1_output) const noexcept final override { - return 
set_best()->convert_utf8_to_latin1_with_errors(buf, len, - latin1_output); + template simdutf_really_inline simd16 shl() const { + return simd16(__lsx_vslli_h(this->value, N)); } - simdutf_warn_unused size_t convert_valid_utf8_to_latin1( - const char *buf, size_t len, - char *latin1_output) const noexcept final override { - return set_best()->convert_valid_utf8_to_latin1(buf, len, latin1_output); + // logical operations + simdutf_really_inline simd16 + operator|(const simd16 other) const { + return __lsx_vor_v(this->value, other.value); } - - simdutf_warn_unused size_t convert_utf8_to_utf16le( - const char *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output); + simdutf_really_inline simd16 + operator&(const simd16 other) const { + return __lsx_vand_v(this->value, other.value); } - - simdutf_warn_unused size_t convert_utf8_to_utf16be( - const char *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output); + simdutf_really_inline simd16 + operator^(const simd16 other) const { + return __lsx_vxor_v(this->value, other.value); } - simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( - const char *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, - utf16_output); + // Pack with the unsigned saturation of two uint16_t code units into single + // uint8_t vector + static simdutf_really_inline simd8 pack(const simd16 &v0, + const simd16 &v1) { + return __lsx_vssrlni_bu_h(v1.value, v0.value, 0); } - simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( - const char *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, - utf16_output); + // Change the endianness + simdutf_really_inline simd16 swap_bytes() const { + return __lsx_vshuf4i_b(this->value, 0b10110001); } +}; - simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( - const char *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output); - } +simdutf_really_inline simd16::operator simd16() const { + return this->value; +} - simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( - const char *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output); - } +template struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert( + NUM_CHUNKS == 4, + "LOONGARCH kernel should use four registers per 64-byte block."); + simd16 chunks[NUM_CHUNKS]; - simdutf_warn_unused size_t - convert_utf8_to_utf32(const char *buf, size_t len, - char32_t *utf32_output) const noexcept final override { - return set_best()->convert_utf8_to_utf32(buf, len, utf32_output); - } + simd16x32(const simd16x32 &o) = delete; // no copy allowed + simd16x32 & + operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed - simdutf_warn_unused result convert_utf8_to_utf32_with_errors( - const char *buf, size_t len, - char32_t *utf32_output) const noexcept final override { - return set_best()->convert_utf8_to_utf32_with_errors(buf, len, - utf32_output); - } + simdutf_really_inline + simd16x32(const simd16 chunk0, const simd16 chunk1, 
+ const simd16 chunk2, const simd16 chunk3) + : chunks{chunk0, chunk1, chunk2, chunk3} {} + simdutf_really_inline simd16x32(const T *ptr) + : chunks{simd16::load(ptr), + simd16::load(ptr + sizeof(simd16) / sizeof(T)), + simd16::load(ptr + 2 * sizeof(simd16) / sizeof(T)), + simd16::load(ptr + 3 * sizeof(simd16) / sizeof(T))} {} - simdutf_warn_unused size_t convert_valid_utf8_to_utf32( - const char *buf, size_t len, - char32_t *utf32_output) const noexcept final override { - return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output); + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd16) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd16) * 1 / sizeof(T)); + this->chunks[2].store(ptr + sizeof(simd16) * 2 / sizeof(T)); + this->chunks[3].store(ptr + sizeof(simd16) * 3 / sizeof(T)); } - simdutf_warn_unused size_t - convert_utf16le_to_latin1(const char16_t *buf, size_t len, - char *latin1_output) const noexcept final override { - return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output); + simdutf_really_inline simd16 reduce_or() const { + return (this->chunks[0] | this->chunks[1]) | + (this->chunks[2] | this->chunks[3]); } - simdutf_warn_unused size_t - convert_utf16be_to_latin1(const char16_t *buf, size_t len, - char *latin1_output) const noexcept final override { - return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output); - } + simdutf_really_inline bool is_ascii() const { return reduce_or().is_ascii(); } - simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( - const char16_t *buf, size_t len, - char *latin1_output) const noexcept final override { - return set_best()->convert_utf16le_to_latin1_with_errors(buf, len, - latin1_output); + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16) * 0); + this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16) * 1); + this->chunks[2].store_ascii_as_utf16(ptr + sizeof(simd16) * 2); + this->chunks[3].store_ascii_as_utf16(ptr + sizeof(simd16) * 3); } - simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( - const char16_t *buf, size_t len, - char *latin1_output) const noexcept final override { - return set_best()->convert_utf16be_to_latin1_with_errors(buf, len, - latin1_output); + simdutf_really_inline uint64_t to_bitmask() const { + __m128i mask = __lsx_vbsll_v(__lsx_vmsknz_b((this->chunks[3]).value), 6); + mask = __lsx_vor_v( + mask, __lsx_vbsll_v(__lsx_vmsknz_b((this->chunks[2]).value), 4)); + mask = __lsx_vor_v( + mask, __lsx_vbsll_v(__lsx_vmsknz_b((this->chunks[1]).value), 2)); + mask = __lsx_vor_v(mask, __lsx_vmsknz_b((this->chunks[0]).value)); + return __lsx_vpickve2gr_du(mask, 0); } - simdutf_warn_unused size_t convert_valid_utf16le_to_latin1( - const char16_t *buf, size_t len, - char *latin1_output) const noexcept final override { - return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output); + simdutf_really_inline void swap_bytes() { + this->chunks[0] = this->chunks[0].swap_bytes(); + this->chunks[1] = this->chunks[1].swap_bytes(); + this->chunks[2] = this->chunks[2].swap_bytes(); + this->chunks[3] = this->chunks[3].swap_bytes(); } - simdutf_warn_unused size_t convert_valid_utf16be_to_latin1( - const char16_t *buf, size_t len, - char *latin1_output) const noexcept final override { - return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output); + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = 
simd16::splat(m); + return simd16x32(this->chunks[0] == mask, this->chunks[1] == mask, + this->chunks[2] == mask, this->chunks[3] == mask) + .to_bitmask(); } - simdutf_warn_unused size_t - convert_utf16le_to_utf8(const char16_t *buf, size_t len, - char *utf8_output) const noexcept final override { - return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output); + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] <= mask, this->chunks[1] <= mask, + this->chunks[2] <= mask, this->chunks[3] <= mask) + .to_bitmask(); } - simdutf_warn_unused size_t - convert_utf16be_to_utf8(const char16_t *buf, size_t len, - char *utf8_output) const noexcept final override { - return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output); - } + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); - simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( - const char16_t *buf, size_t len, - char *utf8_output) const noexcept final override { - return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, - utf8_output); + return simd16x32( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low), + (this->chunks[2] <= mask_high) & (this->chunks[2] >= mask_low), + (this->chunks[3] <= mask_high) & (this->chunks[3] >= mask_low)) + .to_bitmask(); } - - simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( - const char16_t *buf, size_t len, - char *utf8_output) const noexcept final override { - return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, - utf8_output); + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + return simd16x32( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low), + (this->chunks[2] > mask_high) | (this->chunks[2] < mask_low), + (this->chunks[3] > mask_high) | (this->chunks[3] < mask_low)) + .to_bitmask(); } - - simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( - const char16_t *buf, size_t len, - char *utf8_output) const noexcept final override { - return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output); + simdutf_really_inline uint64_t lt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] < mask, this->chunks[1] < mask, + this->chunks[2] < mask, this->chunks[3] < mask) + .to_bitmask(); } - simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( - const char16_t *buf, size_t len, - char *utf8_output) const noexcept final override { - return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output); - } +}; // struct simd16x32 - simdutf_warn_unused size_t - convert_utf32_to_latin1(const char32_t *buf, size_t len, - char *latin1_output) const noexcept final override { - return set_best()->convert_utf32_to_latin1(buf, len, latin1_output); - } +template <> +simdutf_really_inline uint64_t simd16x32::not_in_range( + const uint16_t low, const uint16_t high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + simd16x32 x(simd16((this->chunks[0] > mask_high) | + (this->chunks[0] < mask_low)), + simd16((this->chunks[1] > mask_high) | + (this->chunks[1] < mask_low)), + 
simd16((this->chunks[2] > mask_high) | + (this->chunks[2] < mask_low)), + simd16((this->chunks[3] > mask_high) | + (this->chunks[3] < mask_low))); + return x.to_bitmask(); +} +/* end file src/simdutf/lsx/simd16-inl.h */ +} // namespace simd +} // unnamed namespace +} // namespace lsx +} // namespace simdutf - simdutf_warn_unused result convert_utf32_to_latin1_with_errors( - const char32_t *buf, size_t len, - char *latin1_output) const noexcept final override { - return set_best()->convert_utf32_to_latin1_with_errors(buf, len, - latin1_output); - } +#endif // SIMDUTF_LSX_SIMD_H +/* end file src/simdutf/lsx/simd.h */ - simdutf_warn_unused size_t convert_valid_utf32_to_latin1( - const char32_t *buf, size_t len, - char *latin1_output) const noexcept final override { - return set_best()->convert_utf32_to_latin1(buf, len, latin1_output); - } +/* begin file src/simdutf/lsx/end.h */ +/* end file src/simdutf/lsx/end.h */ - simdutf_warn_unused size_t - convert_utf32_to_utf8(const char32_t *buf, size_t len, - char *utf8_output) const noexcept final override { - return set_best()->convert_utf32_to_utf8(buf, len, utf8_output); - } +#endif // SIMDUTF_IMPLEMENTATION_LSX - simdutf_warn_unused result convert_utf32_to_utf8_with_errors( - const char32_t *buf, size_t len, - char *utf8_output) const noexcept final override { - return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output); - } +#endif // SIMDUTF_LSX_H +/* end file src/simdutf/lsx.h */ +/* begin file src/simdutf/lasx.h */ +#ifndef SIMDUTF_LASX_H +#define SIMDUTF_LASX_H - simdutf_warn_unused size_t - convert_valid_utf32_to_utf8(const char32_t *buf, size_t len, - char *utf8_output) const noexcept final override { - return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output); - } +#ifdef SIMDUTF_FALLBACK_H + #error "lasx.h must be included before fallback.h" +#endif - simdutf_warn_unused size_t convert_utf32_to_utf16le( - const char32_t *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output); - } - simdutf_warn_unused size_t convert_utf32_to_utf16be( - const char32_t *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output); - } +#ifndef SIMDUTF_IMPLEMENTATION_LASX + #define SIMDUTF_IMPLEMENTATION_LASX (SIMDUTF_IS_LASX) +#endif +#if SIMDUTF_IMPLEMENTATION_LASX && SIMDUTF_IS_LASX + #define SIMDUTF_CAN_ALWAYS_RUN_LASX 1 +#else + #define SIMDUTF_CAN_ALWAYS_RUN_LASX 0 +#endif - simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( - const char32_t *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, - utf16_output); - } +#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK) - simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( - const char32_t *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, - utf16_output); - } +#if SIMDUTF_IMPLEMENTATION_LASX - simdutf_warn_unused size_t convert_valid_utf32_to_utf16le( - const char32_t *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output); - } +namespace simdutf { +/** + * Implementation for LoongArch ASX. 
+ */ +namespace lasx {} // namespace lasx +} // namespace simdutf - simdutf_warn_unused size_t convert_valid_utf32_to_utf16be( - const char32_t *buf, size_t len, - char16_t *utf16_output) const noexcept final override { - return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output); - } +/* begin file src/simdutf/lasx/implementation.h */ +#ifndef SIMDUTF_LASX_IMPLEMENTATION_H +#define SIMDUTF_LASX_IMPLEMENTATION_H - simdutf_warn_unused size_t convert_utf16le_to_utf32( - const char16_t *buf, size_t len, - char32_t *utf32_output) const noexcept final override { - return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output); - } - simdutf_warn_unused size_t convert_utf16be_to_utf32( - const char16_t *buf, size_t len, - char32_t *utf32_output) const noexcept final override { - return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output); - } +namespace simdutf { +namespace lasx { + +namespace { +using namespace simdutf; +} +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() + : simdutf::implementation("lasx", "LOONGARCH ASX", + internal::instruction_set::LSX | + internal::instruction_set::LASX) {} + simdutf_warn_unused int detect_encodings(const char *input, + size_t length) const noexcept final; + simdutf_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_ascii(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + 
simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused result + convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t 
*utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( const char16_t *buf, size_t len, - char32_t *utf32_output) const noexcept final override { - return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, - utf32_output); - } - + char32_t *utf32_buffer) const noexcept final; simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( const char16_t *buf, size_t len, - char32_t *utf32_output) const noexcept final override { - return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, - utf32_output); - } + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + void change_endianness_utf16(const char16_t *buf, size_t length, + char16_t *output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t *buf, + size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t *buf, + size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char *buf, + size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf16_length_from_utf8(const char *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf32_length_from_utf8(const char *input, size_t length) const noexcept; + simdutf_warn_unused size_t + latin1_length_from_utf8(const char *input, size_t length) const noexcept; + simdutf_warn_unused size_t + latin1_length_from_utf16(size_t length) const noexcept; + simdutf_warn_unused size_t + latin1_length_from_utf32(size_t length) const noexcept; + simdutf_warn_unused size_t + utf32_length_from_latin1(size_t length) const noexcept; + 
simdutf_warn_unused size_t + utf16_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_latin1(const char *input, size_t length) const noexcept; + simdutf_warn_unused size_t maximal_binary_length_from_base64( + const char *input, size_t length) const noexcept; + simdutf_warn_unused result + base64_to_binary(const char *input, size_t length, char *output, + base64_options options) const noexcept; + simdutf_warn_unused size_t maximal_binary_length_from_base64( + const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused result + base64_to_binary(const char16_t *input, size_t length, char *output, + base64_options options) const noexcept; + simdutf_warn_unused size_t base64_length_from_binary( + size_t length, base64_options options) const noexcept; + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept; - simdutf_warn_unused size_t convert_valid_utf16le_to_utf32( - const char16_t *buf, size_t len, - char32_t *utf32_output) const noexcept final override { - return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output); - } + simdutf_warn_unused virtual result + base64_to_binary(const char *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused virtual full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused virtual result + base64_to_binary(const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused virtual full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; +}; - simdutf_warn_unused size_t convert_valid_utf16be_to_utf32( - const char16_t *buf, size_t len, - char32_t *utf32_output) const noexcept final override { - return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output); - } +} // namespace lasx +} // namespace simdutf - void change_endianness_utf16(const char16_t *buf, size_t len, - char16_t *output) const noexcept final override { - set_best()->change_endianness_utf16(buf, len, output); - } +#endif // SIMDUTF_LASX_IMPLEMENTATION_H +/* end file src/simdutf/lasx/implementation.h */ - simdutf_warn_unused size_t - count_utf16le(const char16_t *buf, size_t len) const noexcept final override { - return set_best()->count_utf16le(buf, len); - } +/* begin file src/simdutf/lasx/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "lasx" +// #define SIMDUTF_IMPLEMENTATION lasx +/* end file src/simdutf/lasx/begin.h */ - simdutf_warn_unused size_t - count_utf16be(const char16_t *buf, size_t len) const noexcept final override { - return set_best()->count_utf16be(buf, len); - } + // Declarations +/* begin file src/simdutf/lasx/intrinsics.h */ +#ifndef SIMDUTF_LASX_INTRINSICS_H +#define SIMDUTF_LASX_INTRINSICS_H - simdutf_warn_unused size_t - count_utf8(const char *buf, size_t len) const noexcept final override { - return set_best()->count_utf8(buf, len); - } - simdutf_warn_unused size_t - latin1_length_from_utf8(const char *buf, size_t 
len) const noexcept override { - return set_best()->latin1_length_from_utf8(buf, len); - } +// This should be the correct header whether +// you use visual studio or other compilers. +#include +#include + +#if defined(__loongarch_asx) + #ifdef __clang__ + #define VREGS_PREFIX "$vr" + #define XREGS_PREFIX "$xr" + #else // GCC + #define VREGS_PREFIX "$f" + #define XREGS_PREFIX "$f" + #endif + #define __ALL_REGS \ + "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26," \ + "27,28,29,30,31" +// Convert __m128i to __m256i +static inline __m256i ____m256i(__m128i in) { + __m256i out = __lasx_xvldi(0); + __asm__ volatile(".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " XREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " VREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x0 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + : [out] "+f"(out) + : [in] "f"(in)); + return out; +} +// Convert two __m128i to __m256i +static inline __m256i lasx_set_q(__m128i inhi, __m128i inlo) { + __m256i out; + __asm__ volatile(".irp i," __ALL_REGS "\n\t" + " .ifc %[hi], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[lo], " VREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x20 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".ifnc %[out], %[hi] \n\t" + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " XREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[hi], " VREGS_PREFIX "\\j \n\t" + " xvori.b $xr\\i, $xr\\j, 0 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".endif \n\t" + : [out] "=f"(out), [hi] "+f"(inhi) + : [lo] "f"(inlo)); + return out; +} +// Convert __m256i low part to __m128i +static inline __m128i lasx_extracti128_lo(__m256i in) { + __m128i out; + __asm__ volatile(".ifnc %[out], %[in] \n\t" + ".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " XREGS_PREFIX "\\j \n\t" + " vori.b $vr\\i, $vr\\j, 0 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + ".endif \n\t" + : [out] "=f"(out) + : [in] "f"(in)); + return out; +} +// Convert __m256i high part to __m128i +static inline __m128i lasx_extracti128_hi(__m256i in) { + __m128i out; + __asm__ volatile(".irp i," __ALL_REGS "\n\t" + " .ifc %[out], " VREGS_PREFIX "\\i \n\t" + " .irp j," __ALL_REGS "\n\t" + " .ifc %[in], " XREGS_PREFIX "\\j \n\t" + " xvpermi.q $xr\\i, $xr\\j, 0x11 \n\t" + " .endif \n\t" + " .endr \n\t" + " .endif \n\t" + ".endr \n\t" + : [out] "=f"(out) + : [in] "f"(in)); + return out; +} +#endif - simdutf_warn_unused size_t - latin1_length_from_utf16(size_t len) const noexcept override { - return set_best()->latin1_length_from_utf16(len); - } +#endif // SIMDUTF_LASX_INTRINSICS_H +/* end file src/simdutf/lasx/intrinsics.h */ +/* begin file src/simdutf/lasx/bitmanipulation.h */ +#ifndef SIMDUTF_LASX_BITMANIPULATION_H +#define SIMDUTF_LASX_BITMANIPULATION_H - simdutf_warn_unused size_t - latin1_length_from_utf32(size_t len) const noexcept override { - return set_best()->latin1_length_from_utf32(len); - } +#include - simdutf_warn_unused size_t - utf8_length_from_latin1(const char *buf, size_t len) const noexcept override { - return set_best()->utf8_length_from_latin1(buf, len); - } +namespace simdutf { +namespace lasx { +namespace { - simdutf_warn_unused size_t utf8_length_from_utf16le( - const char16_t *buf, size_t len) const noexcept override { - return set_best()->utf8_length_from_utf16le(buf, len); - } 
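The `simd8x64<T>`/`simd16x32<T>` wrappers earlier in this hunk reduce a 64-byte block to a single 64-bit mask via `to_bitmask()`, and the LASX bit-manipulation helpers introduced just below (`count_ones`, `trailing_zeroes`) are the kind of primitives the kernels use to walk such masks. The following standalone sketch is not part of the upstream patch; it only illustrates the usual consumption pattern, with portable GCC/Clang builtins standing in for the LSX/LASX intrinsics, and a made-up mask value.

```cpp
// Illustrative sketch only (not from the patch): consuming a 64-bit mask of
// the kind produced by simd8x64<T>::to_bitmask(), using portable builtins in
// place of the LSX/LASX intrinsics used by the actual kernel.
#include <cstdint>
#include <cstdio>

static inline int popcount64(uint64_t x) { return __builtin_popcountll(x); } // analogous to count_ones
static inline int ctz64(uint64_t x) { return __builtin_ctzll(x); }           // analogous to trailing_zeroes

int main() {
  // Hypothetical mask: suppose eq('=') flagged the last two bytes of a block.
  uint64_t mask = (1ULL << 62) | (1ULL << 63);
  std::printf("set bits: %d\n", popcount64(mask));
  while (mask != 0) {
    std::printf("byte index %d\n", ctz64(mask)); // position of lowest set bit
    mask &= mask - 1;                            // clear lowest set bit
  }
  return 0;
}
```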
+simdutf_really_inline int count_ones(uint64_t input_num) { + return __lsx_vpickve2gr_w(__lsx_vpcnt_d(__lsx_vreplgr2vr_d(input_num)), 0); +} - simdutf_warn_unused size_t utf8_length_from_utf16be( - const char16_t *buf, size_t len) const noexcept override { - return set_best()->utf8_length_from_utf16be(buf, len); - } +#if SIMDUTF_NEED_TRAILING_ZEROES +simdutf_really_inline int trailing_zeroes(uint64_t input_num) { + return __builtin_ctzll(input_num); +} +#endif - simdutf_warn_unused size_t - utf16_length_from_latin1(size_t len) const noexcept override { - return set_best()->utf16_length_from_latin1(len); - } +} // unnamed namespace +} // namespace lasx +} // namespace simdutf - simdutf_warn_unused size_t - utf32_length_from_latin1(size_t len) const noexcept override { - return set_best()->utf32_length_from_latin1(len); - } +#endif // SIMDUTF_LASX_BITMANIPULATION_H +/* end file src/simdutf/lasx/bitmanipulation.h */ +/* begin file src/simdutf/lasx/simd.h */ +#ifndef SIMDUTF_LASX_SIMD_H +#define SIMDUTF_LASX_SIMD_H - simdutf_warn_unused size_t utf32_length_from_utf16le( - const char16_t *buf, size_t len) const noexcept override { - return set_best()->utf32_length_from_utf16le(buf, len); - } +#include - simdutf_warn_unused size_t utf32_length_from_utf16be( - const char16_t *buf, size_t len) const noexcept override { - return set_best()->utf32_length_from_utf16be(buf, len); - } +namespace simdutf { +namespace lasx { +namespace { +namespace simd { - simdutf_warn_unused size_t - utf16_length_from_utf8(const char *buf, size_t len) const noexcept override { - return set_best()->utf16_length_from_utf8(buf, len); - } +__attribute__((aligned(32))) static const uint8_t prev_shuf_table[32][32] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}, + {0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, + 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13}, + {0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, + {0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + {0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, + 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, + {0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, + 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, + 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, + 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, + 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4, 5}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3, 4}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2, 3}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, + 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0}, + {15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 
26, 27, 28, 29, 30, + 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, + 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0}, + {7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0}, + {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0}, + {5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0}, + {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0}, + {3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0}, + {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, +}; - simdutf_warn_unused size_t utf8_length_from_utf32( - const char32_t *buf, size_t len) const noexcept override { - return set_best()->utf8_length_from_utf32(buf, len); - } +__attribute__((aligned(32))) static const uint8_t bitsel_mask_table[32][32] = { + {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 
0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0}, + {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0}}; - simdutf_warn_unused size_t utf16_length_from_utf32( - const char32_t *buf, size_t len) const noexcept override { - return set_best()->utf16_length_from_utf32(buf, len); - } +// Forward-declared so they can be used by splat and friends. +template struct base { + __m256i value; - simdutf_warn_unused size_t - utf32_length_from_utf8(const char *buf, size_t len) const noexcept override { - return set_best()->utf32_length_from_utf8(buf, len); - } + // Zero constructor + simdutf_really_inline base() : value{__m256i()} {} - simdutf_warn_unused size_t maximal_binary_length_from_base64( - const char *input, size_t length) const noexcept override { - return set_best()->maximal_binary_length_from_base64(input, length); + // Conversion from SIMD register + simdutf_really_inline base(const __m256i _value) : value(_value) {} + // Conversion to SIMD register + simdutf_really_inline operator const __m256i &() const { return this->value; } + simdutf_really_inline operator __m256i &() { return this->value; } + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + if (big_endian) { + __m256i zero = __lasx_xvldi(0); + __m256i in8 = __lasx_xvpermi_d(this->value, 0b11011000); + __m256i inlow = __lasx_xvilvl_b(in8, zero); + __m256i inhigh = __lasx_xvilvh_b(in8, zero); + __lasx_xvst(inlow, reinterpret_cast(ptr), 0); + __lasx_xvst(inhigh, reinterpret_cast(ptr), 32); + } else { + __m256i inlow = __lasx_vext2xv_hu_bu(this->value); + __m256i inhigh = __lasx_vext2xv_hu_bu( + __lasx_xvpermi_q(this->value, this->value, 0b00000001)); + __lasx_xvst(inlow, reinterpret_cast<__m256i *>(ptr), 0); + __lasx_xvst(inhigh, reinterpret_cast<__m256i *>(ptr), 32); + } } + simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { + __m256i in32_0 = __lasx_vext2xv_wu_bu(this->value); + __lasx_xvst(in32_0, reinterpret_cast(ptr), 0); - simdutf_warn_unused result base64_to_binary( - const char *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_handling_options = - last_chunk_handling_options::loose) const noexcept override { - return set_best()->base64_to_binary(input, length, output, options, - last_chunk_handling_options); - } + __m256i in8_1 = __lasx_xvpermi_d(this->value, 0b00000001); + __m256i in32_1 = __lasx_vext2xv_wu_bu(in8_1); + __lasx_xvst(in32_1, reinterpret_cast(ptr), 32); - simdutf_warn_unused full_result base64_to_binary_details( - const char *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_handling_options = - last_chunk_handling_options::loose) const noexcept override { - return 
set_best()->base64_to_binary_details(input, length, output, options, - last_chunk_handling_options); - } + __m256i in8_2 = __lasx_xvpermi_d(this->value, 0b00000010); + __m256i in32_2 = __lasx_vext2xv_wu_bu(in8_2); + __lasx_xvst(in32_2, reinterpret_cast(ptr), 64); - simdutf_warn_unused size_t maximal_binary_length_from_base64( - const char16_t *input, size_t length) const noexcept override { - return set_best()->maximal_binary_length_from_base64(input, length); + __m256i in8_3 = __lasx_xvpermi_d(this->value, 0b00000011); + __m256i in32_3 = __lasx_vext2xv_wu_bu(in8_3); + __lasx_xvst(in32_3, reinterpret_cast(ptr), 96); } - - simdutf_warn_unused result base64_to_binary( - const char16_t *input, size_t length, char *output, - base64_options options, - last_chunk_handling_options last_chunk_handling_options = - last_chunk_handling_options::loose) const noexcept override { - return set_best()->base64_to_binary(input, length, output, options, - last_chunk_handling_options); + // Bit operations + simdutf_really_inline Child operator|(const Child other) const { + return __lasx_xvor_v(this->value, other); } - - simdutf_warn_unused full_result base64_to_binary_details( - const char16_t *input, size_t length, char *output, - base64_options options, - last_chunk_handling_options last_chunk_handling_options = - last_chunk_handling_options::loose) const noexcept override { - return set_best()->base64_to_binary_details(input, length, output, options, - last_chunk_handling_options); + simdutf_really_inline Child operator&(const Child other) const { + return __lasx_xvand_v(this->value, other); } - - simdutf_warn_unused size_t base64_length_from_binary( - size_t length, base64_options options) const noexcept override { - return set_best()->base64_length_from_binary(length, options); + simdutf_really_inline Child operator^(const Child other) const { + return __lasx_xvxor_v(this->value, other); } - - size_t binary_to_base64(const char *input, size_t length, char *output, - base64_options options) const noexcept override { - return set_best()->binary_to_base64(input, length, output, options); + simdutf_really_inline Child bit_andnot(const Child other) const { + return __lasx_xvandn_v(this->value, other); + } + simdutf_really_inline Child &operator|=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast | other; + return *this_cast; + } + simdutf_really_inline Child &operator&=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast & other; + return *this_cast; + } + simdutf_really_inline Child &operator^=(const Child other) { + auto this_cast = static_cast(this); + *this_cast = *this_cast ^ other; + return *this_cast; } +}; - simdutf_really_inline - detect_best_supported_implementation_on_first_use() noexcept - : implementation("best_supported_detector", - "Detects the best supported implementation and sets it", - 0) {} +template struct simd8; -private: - const implementation *set_best() const noexcept; -}; +template > +struct base8 : base> { + typedef uint32_t bitmask_t; + typedef uint64_t bitmask2_t; -static_assert(std::is_trivially_destructible< - detect_best_supported_implementation_on_first_use>::value, - "detect_best_supported_implementation_on_first_use should be " - "trivially destructible"); + simdutf_really_inline base8() : base>() {} + simdutf_really_inline base8(const __m256i _value) : base>(_value) {} + simdutf_really_inline T first() const { + return __lasx_xvpickve2gr_wu(this->value, 0); + } + simdutf_really_inline T last() const { + 
return __lasx_xvpickve2gr_wu(this->value, 7); + } + friend simdutf_really_inline Mask operator==(const simd8 lhs, + const simd8 rhs) { + return __lasx_xvseq_b(lhs, rhs); + } -static const std::initializer_list & -get_available_implementation_pointers() { - static const std::initializer_list - available_implementation_pointers{ -#if SIMDUTF_IMPLEMENTATION_ICELAKE - get_icelake_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_HASWELL - get_haswell_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_WESTMERE - get_westmere_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_ARM64 - get_arm64_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_PPC64 - get_ppc64_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_RVV - get_rvv_singleton(), -#endif -#if SIMDUTF_IMPLEMENTATION_FALLBACK - get_fallback_singleton(), -#endif - }; // available_implementation_pointers - return available_implementation_pointers; -} + static const int SIZE = sizeof(base::value); -// So we can return UNSUPPORTED_ARCHITECTURE from the parser when there is no -// support -class unsupported_implementation final : public implementation { -public: - simdutf_warn_unused int detect_encodings(const char *, - size_t) const noexcept override { - return encoding_type::unspecified; + template + simdutf_really_inline simd8 prev(const simd8 prev_chunk) const { + if (!N) + return this->value; + + __m256i zero = __lasx_xvldi(0); + __m256i result, shuf; + if (N < 16) { + shuf = __lasx_xvld(prev_shuf_table[N], 0); + + result = __lasx_xvshuf_b( + __lasx_xvpermi_q(this->value, this->value, 0b00000001), this->value, + shuf); + __m256i srl_prev = __lasx_xvbsrl_v( + __lasx_xvpermi_q(zero, prev_chunk.value, 0b00110001), (16 - N)); + __m256i mask = __lasx_xvld(bitsel_mask_table[N], 0); + result = __lasx_xvbitsel_v(result, srl_prev, mask); + + return result; + } else if (N == 16) { + return __lasx_xvpermi_q(this->value, prev_chunk.value, 0b00100001); + } /*else { + __m256i sll_value = __lasx_xvbsll_v( + __lasx_xvpermi_q(zero, this->value, 0b00000011), (N - 16) % 32); + __m256i mask = __lasx_xvld(bitsel_mask_table[N], 0); + shuf = __lasx_xvld(prev_shuf_table[N], 0); + result = __lasx_xvshuf_b( + __lasx_xvpermi_q(prev_chunk.value, prev_chunk.value, 0b00000001), + prev_chunk.value, shuf); + result = __lasx_xvbitsel_v(sll_value, result, mask); + return result; + }*/ } +}; - simdutf_warn_unused bool validate_utf8(const char *, - size_t) const noexcept final override { - return false; // Just refuse to validate. Given that we have a fallback - // implementation - // it seems unlikely that unsupported_implementation will ever be used. If - // it is used, then it will flag all strings as invalid. The alternative is - // to return an error_code from which the user has to figure out whether the - // string is valid UTF-8... which seems like a lot of work just to handle - // the very unlikely case that we have an unsupported implementation. And, - // when it does happen (that we have an unsupported implementation), what - // are the chances that the programmer has a fallback? Given that *we* - // provide the fallback, it implies that the programmer would need a - // fallback for our fallback. 
+// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd8 : base8 { + static simdutf_really_inline simd8 splat(bool _value) { + return __lasx_xvreplgr2vr_b(uint8_t(-(!!_value))); } - simdutf_warn_unused result validate_utf8_with_errors( - const char *, size_t) const noexcept final override { - return result(error_code::OTHER, 0); - } + simdutf_really_inline simd8() : base8() {} + simdutf_really_inline simd8(const __m256i _value) : base8(_value) {} + // Splat constructor + simdutf_really_inline simd8(bool _value) : base8(splat(_value)) {} - simdutf_warn_unused bool - validate_ascii(const char *, size_t) const noexcept final override { - return false; + simdutf_really_inline uint32_t to_bitmask() const { + __m256i mask = __lasx_xvmsknz_b(this->value); + uint32_t mask0 = __lasx_xvpickve2gr_wu(mask, 0); + uint32_t mask1 = __lasx_xvpickve2gr_wu(mask, 4); + return (mask0 | (mask1 << 16)); } - - simdutf_warn_unused result validate_ascii_with_errors( - const char *, size_t) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline bool any() const { + if (__lasx_xbz_b(this->value)) + return false; + return true; } - - simdutf_warn_unused bool - validate_utf16le(const char16_t *, size_t) const noexcept final override { + simdutf_really_inline bool none() const { + if (__lasx_xbz_b(this->value)) + return true; return false; } - - simdutf_warn_unused bool - validate_utf16be(const char16_t *, size_t) const noexcept final override { + simdutf_really_inline bool all() const { + if (__lasx_xbnz_b(this->value)) + return true; return false; } + simdutf_really_inline simd8 operator~() const { return *this ^ true; } +}; - simdutf_warn_unused result validate_utf16le_with_errors( - const char16_t *, size_t) const noexcept final override { - return result(error_code::OTHER, 0); - } - - simdutf_warn_unused result validate_utf16be_with_errors( - const char16_t *, size_t) const noexcept final override { - return result(error_code::OTHER, 0); +template struct base8_numeric : base8 { + static simdutf_really_inline simd8 splat(T _value) { + return __lasx_xvreplgr2vr_b(_value); } - - simdutf_warn_unused bool - validate_utf32(const char32_t *, size_t) const noexcept final override { - return false; + static simdutf_really_inline simd8 zero() { return __lasx_xvldi(0); } + static simdutf_really_inline simd8 load(const T values[32]) { + return __lasx_xvld(reinterpret_cast(values), 0); } - - simdutf_warn_unused result validate_utf32_with_errors( - const char32_t *, size_t) const noexcept final override { - return result(error_code::OTHER, 0); + // Repeat 16 values as many times as necessary (usually for lookup tables) + static simdutf_really_inline simd8 repeat_16(T v0, T v1, T v2, T v3, T v4, + T v5, T v6, T v7, T v8, T v9, + T v10, T v11, T v12, T v13, + T v14, T v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15); } - simdutf_warn_unused size_t convert_latin1_to_utf8( - const char *, size_t, char *) const noexcept final override { - return 0; - } + simdutf_really_inline base8_numeric() : base8() {} + simdutf_really_inline base8_numeric(const __m256i _value) + : base8(_value) {} - simdutf_warn_unused size_t convert_latin1_to_utf16le( - const char *, size_t, char16_t *) const noexcept final override { - return 0; + // Store to array + simdutf_really_inline void store(T dst[32]) const { + return __lasx_xvst(this->value, reinterpret_cast<__m256i 
*>(dst), 0); } - simdutf_warn_unused size_t convert_latin1_to_utf16be( - const char *, size_t, char16_t *) const noexcept final override { - return 0; + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd8 operator+(const simd8 other) const { + return __lasx_xvadd_b(this->value, other); } - - simdutf_warn_unused size_t convert_latin1_to_utf32( - const char *, size_t, char32_t *) const noexcept final override { - return 0; + simdutf_really_inline simd8 operator-(const simd8 other) const { + return __lasx_xvsub_b(this->value, other); } - - simdutf_warn_unused size_t convert_utf8_to_latin1( - const char *, size_t, char *) const noexcept final override { - return 0; + simdutf_really_inline simd8 &operator+=(const simd8 other) { + *this = *this + other; + return *static_cast *>(this); } - - simdutf_warn_unused result convert_utf8_to_latin1_with_errors( - const char *, size_t, char *) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline simd8 &operator-=(const simd8 other) { + *this = *this - other; + return *static_cast *>(this); } - simdutf_warn_unused size_t convert_valid_utf8_to_latin1( - const char *, size_t, char *) const noexcept final override { - return 0; - } + // Override to distinguish from bool version + simdutf_really_inline simd8 operator~() const { return *this ^ 0xFFu; } - simdutf_warn_unused size_t convert_utf8_to_utf16le( - const char *, size_t, char16_t *) const noexcept final override { - return 0; + // Perform a lookup assuming the value is between 0 and 16 (undefined behavior + // for out of range values) + template + simdutf_really_inline simd8 lookup_16(simd8 lookup_table) const { + __m256i origin = __lasx_xvand_v(this->value, __lasx_xvldi(0x1f)); + return __lasx_xvshuf_b(__lasx_xvldi(0), lookup_table, origin); } - simdutf_warn_unused size_t convert_utf8_to_utf16be( - const char *, size_t, char16_t *) const noexcept final override { - return 0; + template + simdutf_really_inline simd8 + lookup_16(L replace0, L replace1, L replace2, L replace3, L replace4, + L replace5, L replace6, L replace7, L replace8, L replace9, + L replace10, L replace11, L replace12, L replace13, L replace14, + L replace15) const { + return lookup_16(simd8::repeat_16( + replace0, replace1, replace2, replace3, replace4, replace5, replace6, + replace7, replace8, replace9, replace10, replace11, replace12, + replace13, replace14, replace15)); } +}; - simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( - const char *, size_t, char16_t *) const noexcept final override { - return result(error_code::OTHER, 0); - } +// Signed bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m256i _value) + : base8_numeric(_value) {} - simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( - const char *, size_t, char16_t *) const noexcept final override { - return result(error_code::OTHER, 0); + // Splat constructor + simdutf_really_inline simd8(int8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const int8_t values[32]) : simd8(load(values)) {} + simdutf_really_inline operator simd8() const; + // Member-by-member initialization + simdutf_really_inline + simd8(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, + int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15, int8_t v16, int8_t v17, + int8_t v18, int8_t v19, 
int8_t v20, int8_t v21, int8_t v22, int8_t v23, + int8_t v24, int8_t v25, int8_t v26, int8_t v27, int8_t v28, int8_t v29, + int8_t v30, int8_t v31) + : simd8((__m256i)v32i8{v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 + repeat_16(int8_t v0, int8_t v1, int8_t v2, int8_t v3, int8_t v4, int8_t v5, + int8_t v6, int8_t v7, int8_t v8, int8_t v9, int8_t v10, int8_t v11, + int8_t v12, int8_t v13, int8_t v14, int8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15); } - - simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( - const char *, size_t, char16_t *) const noexcept final override { - return 0; + simdutf_really_inline bool is_ascii() const { + __m256i ascii_mask = __lasx_xvslti_b(this->value, 0); + if (__lasx_xbnz_v(ascii_mask)) + return false; + return true; } - - simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( - const char *, size_t, char16_t *) const noexcept final override { - return 0; + // Order-sensitive comparisons + simdutf_really_inline simd8 max_val(const simd8 other) const { + return __lasx_xvmax_b(this->value, other); } - - simdutf_warn_unused size_t convert_utf8_to_utf32( - const char *, size_t, char32_t *) const noexcept final override { - return 0; + simdutf_really_inline simd8 min_val(const simd8 other) const { + return __lasx_xvmin_b(this->value, other); } - - simdutf_warn_unused result convert_utf8_to_utf32_with_errors( - const char *, size_t, char32_t *) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline simd8 operator>(const simd8 other) const { + return __lasx_xvslt_b(other, this->value); } - - simdutf_warn_unused size_t convert_valid_utf8_to_utf32( - const char *, size_t, char32_t *) const noexcept final override { - return 0; + simdutf_really_inline simd8 operator<(const simd8 other) const { + return __lasx_xvslt_b(this->value, other); } +}; - simdutf_warn_unused size_t convert_utf16le_to_latin1( - const char16_t *, size_t, char *) const noexcept final override { - return 0; +// Unsigned bytes +template <> struct simd8 : base8_numeric { + simdutf_really_inline simd8() : base8_numeric() {} + simdutf_really_inline simd8(const __m256i _value) + : base8_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd8(uint8_t _value) : simd8(splat(_value)) {} + // Array constructor + simdutf_really_inline simd8(const uint8_t values[32]) : simd8(load(values)) {} + // Member-by-member initialization + simdutf_really_inline + simd8(uint8_t v0, uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4, uint8_t v5, + uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, uint8_t v10, + uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, uint8_t v15, + uint8_t v16, uint8_t v17, uint8_t v18, uint8_t v19, uint8_t v20, + uint8_t v21, uint8_t v22, uint8_t v23, uint8_t v24, uint8_t v25, + uint8_t v26, uint8_t v27, uint8_t v28, uint8_t v29, uint8_t v30, + uint8_t v31) + : simd8((__m256i)v32u8{v0, v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31}) {} + // Repeat 16 values as many times as necessary (usually for lookup tables) + simdutf_really_inline static simd8 + repeat_16(uint8_t v0, uint8_t v1, 
uint8_t v2, uint8_t v3, uint8_t v4, + uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8, uint8_t v9, + uint8_t v10, uint8_t v11, uint8_t v12, uint8_t v13, uint8_t v14, + uint8_t v15) { + return simd8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15); } - simdutf_warn_unused size_t convert_utf16be_to_latin1( - const char16_t *, size_t, char *) const noexcept final override { - return 0; + // Saturated math + simdutf_really_inline simd8 + saturating_add(const simd8 other) const { + return __lasx_xvsadd_bu(this->value, other); } - - simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( - const char16_t *, size_t, char *) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline simd8 + saturating_sub(const simd8 other) const { + return __lasx_xvssub_bu(this->value, other); } - simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( - const char16_t *, size_t, char *) const noexcept final override { - return result(error_code::OTHER, 0); + // Order-specific operations + simdutf_really_inline simd8 + max_val(const simd8 other) const { + return __lasx_xvmax_bu(*this, other); } - - simdutf_warn_unused size_t convert_valid_utf16le_to_latin1( - const char16_t *, size_t, char *) const noexcept final override { - return 0; + simdutf_really_inline simd8 + min_val(const simd8 other) const { + return __lasx_xvmin_bu(*this, other); } - - simdutf_warn_unused size_t convert_valid_utf16be_to_latin1( - const char16_t *, size_t, char *) const noexcept final override { - return 0; + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 + gt_bits(const simd8 other) const { + return this->saturating_sub(other); } - - simdutf_warn_unused size_t convert_utf16le_to_utf8( - const char16_t *, size_t, char *) const noexcept final override { - return 0; + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd8 + lt_bits(const simd8 other) const { + return other.saturating_sub(*this); } - - simdutf_warn_unused size_t convert_utf16be_to_utf8( - const char16_t *, size_t, char *) const noexcept final override { - return 0; + simdutf_really_inline simd8 + operator<=(const simd8 other) const { + return __lasx_xvsle_bu(*this, other); } - - simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( - const char16_t *, size_t, char *) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline simd8 + operator>=(const simd8 other) const { + return __lasx_xvsle_bu(other, *this); } - - simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( - const char16_t *, size_t, char *) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline simd8 + operator>(const simd8 other) const { + return __lasx_xvslt_bu(*this, other); } - - simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( - const char16_t *, size_t, char *) const noexcept final override { - return 0; + simdutf_really_inline simd8 + operator<(const simd8 other) const { + return __lasx_xvslt_bu(other, *this); } - simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( - const char16_t *, size_t, char *) const noexcept final override { - return 0; + // Bit-specific operations + simdutf_really_inline simd8 bits_not_set() const { + return *this == uint8_t(0); } - - simdutf_warn_unused size_t convert_utf32_to_latin1( - const char32_t *, size_t, 
char *) const noexcept final override { - return 0; + simdutf_really_inline simd8 bits_not_set(simd8 bits) const { + return (*this & bits).bits_not_set(); } - - simdutf_warn_unused result convert_utf32_to_latin1_with_errors( - const char32_t *, size_t, char *) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline simd8 any_bits_set() const { + return ~this->bits_not_set(); } - - simdutf_warn_unused size_t convert_valid_utf32_to_latin1( - const char32_t *, size_t, char *) const noexcept final override { - return 0; + simdutf_really_inline simd8 any_bits_set(simd8 bits) const { + return ~this->bits_not_set(bits); } - - simdutf_warn_unused size_t convert_utf32_to_utf8( - const char32_t *, size_t, char *) const noexcept final override { - return 0; + simdutf_really_inline bool is_ascii() const { + __m256i ascii_mask = __lasx_xvslti_b(this->value, 0); + if (__lasx_xbnz_v(ascii_mask)) + return false; + return true; } - - simdutf_warn_unused result convert_utf32_to_utf8_with_errors( - const char32_t *, size_t, char *) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline bool any_bits_set_anywhere() const { + if (__lasx_xbnz_v(this->value)) + return true; + return false; } - - simdutf_warn_unused size_t convert_valid_utf32_to_utf8( - const char32_t *, size_t, char *) const noexcept final override { - return 0; + simdutf_really_inline bool any_bits_set_anywhere(simd8 bits) const { + return (*this & bits).any_bits_set_anywhere(); } - - simdutf_warn_unused size_t convert_utf32_to_utf16le( - const char32_t *, size_t, char16_t *) const noexcept final override { - return 0; + template simdutf_really_inline simd8 shr() const { + return __lasx_xvsrli_b(this->value, N); } - - simdutf_warn_unused size_t convert_utf32_to_utf16be( - const char32_t *, size_t, char16_t *) const noexcept final override { - return 0; + template simdutf_really_inline simd8 shl() const { + return __lasx_xvslli_b(this->value, N); } +}; +simdutf_really_inline simd8::operator simd8() const { + return this->value; +} - simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( - const char32_t *, size_t, char16_t *) const noexcept final override { - return result(error_code::OTHER, 0); - } +template struct simd8x64 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd8); + static_assert(NUM_CHUNKS == 2, + "LASX kernel should use two registers per 64-byte block."); + simd8 chunks[NUM_CHUNKS]; - simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( - const char32_t *, size_t, char16_t *) const noexcept final override { - return result(error_code::OTHER, 0); - } + simd8x64(const simd8x64 &o) = delete; // no copy allowed + simd8x64 & + operator=(const simd8 other) = delete; // no assignment allowed + simd8x64() = delete; // no default constructor allowed - simdutf_warn_unused size_t convert_valid_utf32_to_utf16le( - const char32_t *, size_t, char16_t *) const noexcept final override { - return 0; - } + simdutf_really_inline simd8x64(const simd8 chunk0, const simd8 chunk1) + : chunks{chunk0, chunk1} {} + simdutf_really_inline simd8x64(const T *ptr) + : chunks{simd8::load(ptr), + simd8::load(ptr + sizeof(simd8) / sizeof(T))} {} - simdutf_warn_unused size_t convert_valid_utf32_to_utf16be( - const char32_t *, size_t, char16_t *) const noexcept final override { - return 0; + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd8) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd8) * 1 / sizeof(T)); 
} - simdutf_warn_unused size_t convert_utf16le_to_utf32( - const char16_t *, size_t, char32_t *) const noexcept final override { - return 0; + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); } - simdutf_warn_unused size_t convert_utf16be_to_utf32( - const char16_t *, size_t, char32_t *) const noexcept final override { - return 0; + simdutf_really_inline simd8x64 &operator|=(const simd8x64 &other) { + this->chunks[0] |= other.chunks[0]; + this->chunks[1] |= other.chunks[1]; + return *this; } - simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( - const char16_t *, size_t, char32_t *) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline simd8 reduce_or() const { + return this->chunks[0] | this->chunks[1]; } - simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( - const char16_t *, size_t, char32_t *) const noexcept final override { - return result(error_code::OTHER, 0); + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); } - simdutf_warn_unused size_t convert_valid_utf16le_to_utf32( - const char16_t *, size_t, char32_t *) const noexcept final override { - return 0; + template + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 0); + this->chunks[1].template store_ascii_as_utf16(ptr + + sizeof(simd8) * 1); } - simdutf_warn_unused size_t convert_valid_utf16be_to_utf32( - const char16_t *, size_t, char32_t *) const noexcept final override { - return 0; + simdutf_really_inline void store_ascii_as_utf32(char32_t *ptr) const { + this->chunks[0].store_ascii_as_utf32(ptr + sizeof(simd8) * 0); + this->chunks[1].store_ascii_as_utf32(ptr + sizeof(simd8) * 1); } - void change_endianness_utf16(const char16_t *, size_t, - char16_t *) const noexcept final override {} - - simdutf_warn_unused size_t - count_utf16le(const char16_t *, size_t) const noexcept final override { - return 0; + simdutf_really_inline simd8x64 bit_or(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] | mask, this->chunks[1] | mask); } - simdutf_warn_unused size_t - count_utf16be(const char16_t *, size_t) const noexcept final override { - return 0; + simdutf_really_inline uint64_t eq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] == mask, this->chunks[1] == mask) + .to_bitmask(); } - simdutf_warn_unused size_t count_utf8(const char *, - size_t) const noexcept final override { - return 0; + simdutf_really_inline uint64_t eq(const simd8x64 &other) const { + return simd8x64(this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1]) + .to_bitmask(); } - simdutf_warn_unused size_t - latin1_length_from_utf8(const char *, size_t) const noexcept override { - return 0; + simdutf_really_inline uint64_t lteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] <= mask, this->chunks[1] <= mask) + .to_bitmask(); } - simdutf_warn_unused size_t - latin1_length_from_utf16(size_t) const noexcept override { - return 0; + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + + return simd8x64( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= 
mask_high) & (this->chunks[1] >= mask_low)) + .to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd8 mask_low = simd8::splat(low); + const simd8 mask_high = simd8::splat(high); + return simd8x64( + (this->chunks[0] > mask_high) | (this->chunks[0] < mask_low), + (this->chunks[1] > mask_high) | (this->chunks[1] < mask_low)) + .to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] < mask, this->chunks[1] < mask) + .to_bitmask(); } - simdutf_warn_unused size_t - latin1_length_from_utf32(size_t) const noexcept override { - return 0; + simdutf_really_inline uint64_t gt(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] > mask, this->chunks[1] > mask) + .to_bitmask(); } - simdutf_warn_unused size_t - utf8_length_from_latin1(const char *, size_t) const noexcept override { - return 0; + simdutf_really_inline uint64_t gteq(const T m) const { + const simd8 mask = simd8::splat(m); + return simd8x64(this->chunks[0] >= mask, this->chunks[1] >= mask) + .to_bitmask(); + } + simdutf_really_inline uint64_t gteq_unsigned(const uint8_t m) const { + const simd8 mask = simd8::splat(m); + return simd8x64((simd8(__m256i(this->chunks[0])) >= mask), + (simd8(__m256i(this->chunks[1])) >= mask)) + .to_bitmask(); } +}; // struct simd8x64 - simdutf_warn_unused size_t - utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override { - return 0; +/* begin file src/simdutf/lasx/simd16-inl.h */ +template struct simd16; + +template > +struct base16 : base> { + using bitmask_type = uint32_t; + + simdutf_really_inline base16() : base>() {} + simdutf_really_inline base16(const __m256i _value) + : base>(_value) {} + template + simdutf_really_inline base16(const Pointer *ptr) + : base16(__lasx_xvld(reinterpret_cast(ptr), 0)) {} + friend simdutf_really_inline Mask operator==(const simd16 lhs, + const simd16 rhs) { + return __lasx_xvseq_h(lhs.value, rhs.value); } - simdutf_warn_unused size_t - utf8_length_from_utf16be(const char16_t *, size_t) const noexcept override { - return 0; + /// the size of vector in bytes + static const int SIZE = sizeof(base>::value); + + /// the number of elements of type T a vector can hold + static const int ELEMENTS = SIZE / sizeof(T); + + template + simdutf_really_inline simd16 prev(const simd16 prev_chunk) const { + if (!N) + return this->value; + + __m256i zero = __lasx_xvldi(0); + __m256i result, shuf; + if (N < 8) { + shuf = __lasx_xvld(prev_shuf_table[N * 2], 0); + + result = __lasx_xvshuf_b( + __lasx_xvpermi_q(this->value, this->value, 0b00000001), this->value, + shuf); + __m256i srl_prev = __lasx_xvbsrl_v( + __lasx_xvpermi_q(zero, prev_chunk, 0b00110001), (16 - N * 2)); + __m256i mask = __lasx_xvld(bitsel_mask_table[N], 0); + result = __lasx_xvbitsel_v(result, srl_prev, mask); + + return result; + } else if (N == 8) { + return __lasx_xvpermi_q(this->value, prev_chunk, 0b00100001); + } else { + __m256i sll_value = __lasx_xvbsll_v( + __lasx_xvpermi_q(zero, this->value, 0b00000011), (N * 2 - 16)); + __m256i mask = __lasx_xvld(bitsel_mask_table[N * 2], 0); + shuf = __lasx_xvld(prev_shuf_table[N * 2], 0); + result = + __lasx_xvshuf_b(__lasx_xvpermi_q(prev_chunk, prev_chunk, 0b00000001), + prev_chunk, shuf); + result = __lasx_xvbitsel_v(sll_value, result, mask); + return result; + } } +}; - simdutf_warn_unused size_t - utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override { - 
return 0; +// SIMD byte mask type (returned by things like eq and gt) +template <> struct simd16 : base16 { + static simdutf_really_inline simd16 splat(bool _value) { + return __lasx_xvreplgr2vr_h(uint8_t(-(!!_value))); } - simdutf_warn_unused size_t - utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override { - return 0; + simdutf_really_inline simd16() : base16() {} + simdutf_really_inline simd16(const __m256i _value) : base16(_value) {} + // Splat constructor + simdutf_really_inline simd16(bool _value) : base16(splat(_value)) {} + + simdutf_really_inline bitmask_type to_bitmask() const { + __m256i mask = __lasx_xvmsknz_b(this->value); + bitmask_type mask0 = __lasx_xvpickve2gr_wu(mask, 0); + bitmask_type mask1 = __lasx_xvpickve2gr_wu(mask, 4); + return (mask0 | (mask1 << 16)); + } + simdutf_really_inline bool any() const { + if (__lasx_xbz_v(this->value)) + return false; + return true; + } + simdutf_really_inline simd16 operator~() const { return *this ^ true; } +}; + +template struct base16_numeric : base16 { + static simdutf_really_inline simd16 splat(T _value) { + return __lasx_xvreplgr2vr_h((uint16_t)_value); + } + static simdutf_really_inline simd16 zero() { return __lasx_xvldi(0); } + static simdutf_really_inline simd16 load(const T values[8]) { + return __lasx_xvld(reinterpret_cast(values), 0); } - simdutf_warn_unused size_t - utf32_length_from_latin1(size_t) const noexcept override { - return 0; + simdutf_really_inline base16_numeric() : base16() {} + simdutf_really_inline base16_numeric(const __m256i _value) + : base16(_value) {} + + // Store to array + simdutf_really_inline void store(T dst[8]) const { + return __lasx_xvst(this->value, reinterpret_cast<__m256i *>(dst), 0); } - simdutf_warn_unused size_t - utf16_length_from_utf8(const char *, size_t) const noexcept override { - return 0; + // Override to distinguish from bool version + simdutf_really_inline simd16 operator~() const { return *this ^ 0xFFFFu; } + + // Addition/subtraction are the same for signed and unsigned + simdutf_really_inline simd16 operator+(const simd16 other) const { + return __lasx_xvadd_h(*this, other); } - simdutf_warn_unused size_t - utf16_length_from_latin1(size_t) const noexcept override { - return 0; + simdutf_really_inline simd16 operator-(const simd16 other) const { + return __lasx_xvsub_h(*this, other); } - simdutf_warn_unused size_t - utf8_length_from_utf32(const char32_t *, size_t) const noexcept override { - return 0; + simdutf_really_inline simd16 &operator+=(const simd16 other) { + *this = *this + other; + return *static_cast *>(this); + } + simdutf_really_inline simd16 &operator-=(const simd16 other) { + *this = *this - other; + return *static_cast *>(this); } +}; - simdutf_warn_unused size_t - utf16_length_from_utf32(const char32_t *, size_t) const noexcept override { - return 0; +// Signed code units +template <> struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m256i _value) + : base16_numeric(_value) {} + // Splat constructor + simdutf_really_inline simd16(int16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const int16_t *values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t *values) + : simd16(load(reinterpret_cast(values))) {} + // Order-sensitive comparisons + simdutf_really_inline simd16 + max_val(const simd16 other) const { + return __lasx_xvmax_h(*this, other); + } + simdutf_really_inline simd16 + min_val(const 
simd16 other) const { + return __lasx_xvmin_h(*this, other); + } + simdutf_really_inline simd16 + operator>(const simd16 other) const { + return __lasx_xvsle_h(other.value, this->value); + } + simdutf_really_inline simd16 + operator<(const simd16 other) const { + return __lasx_xvslt_h(this->value, other.value); } +}; - simdutf_warn_unused size_t - utf32_length_from_utf8(const char *, size_t) const noexcept override { - return 0; +// Unsigned code units +template <> struct simd16 : base16_numeric { + simdutf_really_inline simd16() : base16_numeric() {} + simdutf_really_inline simd16(const __m256i _value) + : base16_numeric(_value) {} + + // Splat constructor + simdutf_really_inline simd16(uint16_t _value) : simd16(splat(_value)) {} + // Array constructor + simdutf_really_inline simd16(const uint16_t *values) : simd16(load(values)) {} + simdutf_really_inline simd16(const char16_t *values) + : simd16(load(reinterpret_cast(values))) {} + + // Saturated math + simdutf_really_inline simd16 + saturating_add(const simd16 other) const { + return __lasx_xvsadd_hu(this->value, other.value); + } + simdutf_really_inline simd16 + saturating_sub(const simd16 other) const { + return __lasx_xvssub_hu(this->value, other.value); } - simdutf_warn_unused size_t maximal_binary_length_from_base64( - const char *, size_t) const noexcept override { - return 0; + // Order-specific operations + simdutf_really_inline simd16 + max_val(const simd16 other) const { + return __lasx_xvmax_hu(this->value, other.value); + } + simdutf_really_inline simd16 + min_val(const simd16 other) const { + return __lasx_xvmin_hu(this->value, other.value); + } + // Same as >, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 + gt_bits(const simd16 other) const { + return this->saturating_sub(other); + } + // Same as <, but only guarantees true is nonzero (< guarantees true = -1) + simdutf_really_inline simd16 + lt_bits(const simd16 other) const { + return other.saturating_sub(*this); + } + simdutf_really_inline simd16 + operator<=(const simd16 other) const { + return __lasx_xvsle_hu(this->value, other.value); + } + simdutf_really_inline simd16 + operator>=(const simd16 other) const { + return __lasx_xvsle_hu(other.value, this->value); + } + simdutf_really_inline simd16 + operator>(const simd16 other) const { + return __lasx_xvslt_hu(other.value, this->value); + } + simdutf_really_inline simd16 + operator<(const simd16 other) const { + return __lasx_xvslt_hu(this->value, other.value); } - simdutf_warn_unused result - base64_to_binary(const char *, size_t, char *, base64_options, - last_chunk_handling_options) const noexcept override { - return result(error_code::OTHER, 0); + // Bit-specific operations + simdutf_really_inline simd16 bits_not_set() const { + return *this == uint16_t(0); + } + simdutf_really_inline simd16 bits_not_set(simd16 bits) const { + return (*this & bits).bits_not_set(); + } + simdutf_really_inline simd16 any_bits_set() const { + return ~this->bits_not_set(); + } + simdutf_really_inline simd16 any_bits_set(simd16 bits) const { + return ~this->bits_not_set(bits); } - simdutf_warn_unused full_result base64_to_binary_details( - const char *, size_t, char *, base64_options, - last_chunk_handling_options) const noexcept override { - return full_result(error_code::OTHER, 0, 0); + simdutf_really_inline bool any_bits_set_anywhere() const { + if (__lasx_xbnz_v(this->value)) + return true; + return false; + } + simdutf_really_inline bool + any_bits_set_anywhere(simd16 bits) const { + 
return (*this & bits).any_bits_set_anywhere(); } - simdutf_warn_unused size_t maximal_binary_length_from_base64( - const char16_t *, size_t) const noexcept override { - return 0; + template simdutf_really_inline simd16 shr() const { + return simd16(__lasx_xvsrli_h(this->value, N)); + } + template simdutf_really_inline simd16 shl() const { + return simd16(__lasx_xvslli_h(this->value, N)); } - simdutf_warn_unused result - base64_to_binary(const char16_t *, size_t, char *, base64_options, - last_chunk_handling_options) const noexcept override { - return result(error_code::OTHER, 0); + // Change the endianness + simdutf_really_inline simd16 swap_bytes() const { + return __lasx_xvshuf4i_b(this->value, 0b10110001); } - simdutf_warn_unused full_result base64_to_binary_details( - const char16_t *, size_t, char *, base64_options, - last_chunk_handling_options) const noexcept override { - return full_result(error_code::OTHER, 0, 0); + // Pack with the unsigned saturation of two uint16_t code units into single + // uint8_t vector + static simdutf_really_inline simd8 pack(const simd16 &v0, + const simd16 &v1) { + return __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(v1.value, v0.value, 0), + 0b11011000); } +}; - simdutf_warn_unused size_t - base64_length_from_binary(size_t, base64_options) const noexcept override { - return 0; +template struct simd16x32 { + static constexpr int NUM_CHUNKS = 64 / sizeof(simd16); + static_assert(NUM_CHUNKS == 2, + "LASX kernel should use two registers per 64-byte block."); + simd16 chunks[NUM_CHUNKS]; + + simd16x32(const simd16x32 &o) = delete; // no copy allowed + simd16x32 & + operator=(const simd16 other) = delete; // no assignment allowed + simd16x32() = delete; // no default constructor allowed + + simdutf_really_inline simd16x32(const simd16 chunk0, + const simd16 chunk1) + : chunks{chunk0, chunk1} {} + simdutf_really_inline simd16x32(const T *ptr) + : chunks{simd16::load(ptr), + simd16::load(ptr + sizeof(simd16) / sizeof(T))} {} + + simdutf_really_inline void store(T *ptr) const { + this->chunks[0].store(ptr + sizeof(simd16) * 0 / sizeof(T)); + this->chunks[1].store(ptr + sizeof(simd16) * 1 / sizeof(T)); } - size_t binary_to_base64(const char *, size_t, char *, - base64_options) const noexcept override { - return 0; + simdutf_really_inline uint64_t to_bitmask() const { + uint64_t r_lo = uint32_t(this->chunks[0].to_bitmask()); + uint64_t r_hi = this->chunks[1].to_bitmask(); + return r_lo | (r_hi << 32); } - unsupported_implementation() - : implementation("unsupported", - "Unsupported CPU (no detected SIMD instructions)", 0) {} -}; + simdutf_really_inline simd16 reduce_or() const { + return this->chunks[0] | this->chunks[1]; + } -const unsupported_implementation *get_unsupported_singleton() { - static const unsupported_implementation unsupported_singleton{}; - return &unsupported_singleton; -} -static_assert(std::is_trivially_destructible::value, - "unsupported_singleton should be trivially destructible"); + simdutf_really_inline bool is_ascii() const { + return this->reduce_or().is_ascii(); + } -size_t available_implementation_list::size() const noexcept { - return internal::get_available_implementation_pointers().size(); -} -const implementation *const * -available_implementation_list::begin() const noexcept { - return internal::get_available_implementation_pointers().begin(); -} -const implementation *const * -available_implementation_list::end() const noexcept { - return internal::get_available_implementation_pointers().end(); -} -const implementation * 
-available_implementation_list::detect_best_supported() const noexcept { - // They are prelisted in priority order, so we just go down the list - uint32_t supported_instruction_sets = - internal::detect_supported_architectures(); - for (const implementation *impl : - internal::get_available_implementation_pointers()) { - uint32_t required_instruction_sets = impl->required_instruction_sets(); - if ((supported_instruction_sets & required_instruction_sets) == - required_instruction_sets) { - return impl; - } + simdutf_really_inline void store_ascii_as_utf16(char16_t *ptr) const { + this->chunks[0].store_ascii_as_utf16(ptr + sizeof(simd16) * 0); + this->chunks[1].store_ascii_as_utf16(ptr + sizeof(simd16)); } - return get_unsupported_singleton(); // this should never happen? -} -const implementation * -detect_best_supported_implementation_on_first_use::set_best() const noexcept { - SIMDUTF_PUSH_DISABLE_WARNINGS - SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: - // manually verified this is safe - char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION"); - SIMDUTF_POP_DISABLE_WARNINGS + simdutf_really_inline simd16x32 bit_or(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] | mask, this->chunks[1] | mask); + } - if (force_implementation_name) { - auto force_implementation = - get_available_implementations()[force_implementation_name]; - if (force_implementation) { - return get_active_implementation() = force_implementation; - } else { - // Note: abort() and stderr usage within the library is forbidden. - return get_active_implementation() = get_unsupported_singleton(); - } + simdutf_really_inline void swap_bytes() { + this->chunks[0] = this->chunks[0].swap_bytes(); + this->chunks[1] = this->chunks[1].swap_bytes(); } - return get_active_implementation() = - get_available_implementations().detect_best_supported(); -} -} // namespace internal + simdutf_really_inline uint64_t eq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] == mask, this->chunks[1] == mask) + .to_bitmask(); + } -/** - * The list of available implementations compiled into simdutf. 
- */ -SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list & -get_available_implementations() { - static const internal::available_implementation_list - available_implementations{}; - return available_implementations; -} + simdutf_really_inline uint64_t eq(const simd16x32 &other) const { + return simd16x32(this->chunks[0] == other.chunks[0], + this->chunks[1] == other.chunks[1]) + .to_bitmask(); + } + + simdutf_really_inline uint64_t lteq(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] <= mask, this->chunks[1] <= mask) + .to_bitmask(); + } + + simdutf_really_inline uint64_t in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(low); + const simd16 mask_high = simd16::splat(high); + + return simd16x32( + (this->chunks[0] <= mask_high) & (this->chunks[0] >= mask_low), + (this->chunks[1] <= mask_high) & (this->chunks[1] >= mask_low)) + .to_bitmask(); + } + simdutf_really_inline uint64_t not_in_range(const T low, const T high) const { + const simd16 mask_low = simd16::splat(static_cast(low - 1)); + const simd16 mask_high = simd16::splat(static_cast(high + 1)); + return simd16x32( + (this->chunks[0] >= mask_high) | (this->chunks[0] <= mask_low), + (this->chunks[1] >= mask_high) | (this->chunks[1] <= mask_low)) + .to_bitmask(); + } + simdutf_really_inline uint64_t lt(const T m) const { + const simd16 mask = simd16::splat(m); + return simd16x32(this->chunks[0] < mask, this->chunks[1] < mask) + .to_bitmask(); + } +}; // struct simd16x32 +/* end file src/simdutf/lasx/simd16-inl.h */ +} // namespace simd +} // unnamed namespace +} // namespace lasx +} // namespace simdutf + +#endif // SIMDUTF_LASX_SIMD_H +/* end file src/simdutf/lasx/simd.h */ +/* begin file src/simdutf/lasx/end.h */ +/* end file src/simdutf/lasx/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_LASX + +#endif // SIMDUTF_LASX_H +/* end file src/simdutf/lasx.h */ +/* begin file src/simdutf/fallback.h */ +#ifndef SIMDUTF_FALLBACK_H +#define SIMDUTF_FALLBACK_H + + +// Note that fallback.h is always imported last. + +// Default Fallback to on unless a builtin implementation has already been +// selected. +#ifndef SIMDUTF_IMPLEMENTATION_FALLBACK + #if SIMDUTF_CAN_ALWAYS_RUN_ARM64 || SIMDUTF_CAN_ALWAYS_RUN_ICELAKE || \ + SIMDUTF_CAN_ALWAYS_RUN_HASWELL || SIMDUTF_CAN_ALWAYS_RUN_WESTMERE || \ + SIMDUTF_CAN_ALWAYS_RUN_PPC64 || SIMDUTF_CAN_ALWAYS_RUN_RVV || \ + SIMDUTF_CAN_ALWAYS_RUN_LSX || SIMDUTF_CAN_ALWAYS_RUN_LASX + #define SIMDUTF_IMPLEMENTATION_FALLBACK 0 + #else + #define SIMDUTF_IMPLEMENTATION_FALLBACK 1 + #endif +#endif + +#define SIMDUTF_CAN_ALWAYS_RUN_FALLBACK (SIMDUTF_IMPLEMENTATION_FALLBACK) + +#if SIMDUTF_IMPLEMENTATION_FALLBACK + +namespace simdutf { /** - * The active implementation. + * Fallback implementation (runs on any machine). 
*/ -SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr & -get_active_implementation() { -#if SIMDUTF_SINGLE_IMPLEMENTATION - // skip runtime detection - static internal::atomic_ptr active_implementation{ - internal::get_single_implementation()}; - return active_implementation; -#else - static const internal::detect_best_supported_implementation_on_first_use - detect_best_supported_implementation_on_first_use_singleton; - static internal::atomic_ptr active_implementation{ - &detect_best_supported_implementation_on_first_use_singleton}; - return active_implementation; -#endif -} +namespace fallback {} // namespace fallback +} // namespace simdutf -#if SIMDUTF_SINGLE_IMPLEMENTATION -const implementation *get_default_implementation() { - return internal::get_single_implementation(); -} -#else -internal::atomic_ptr &get_default_implementation() { - return get_active_implementation(); -} -#endif -#define SIMDUTF_GET_CURRENT_IMPLEMENTION +/* begin file src/simdutf/fallback/implementation.h */ +#ifndef SIMDUTF_FALLBACK_IMPLEMENTATION_H +#define SIMDUTF_FALLBACK_IMPLEMENTATION_H -simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { - return get_default_implementation()->validate_utf8(buf, len); -} -simdutf_warn_unused result validate_utf8_with_errors(const char *buf, - size_t len) noexcept { - return get_default_implementation()->validate_utf8_with_errors(buf, len); -} -simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept { - return get_default_implementation()->validate_ascii(buf, len); -} -simdutf_warn_unused result validate_ascii_with_errors(const char *buf, - size_t len) noexcept { - return get_default_implementation()->validate_ascii_with_errors(buf, len); -} -simdutf_warn_unused size_t convert_utf8_to_utf16( - const char *input, size_t length, char16_t *utf16_output) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_utf8_to_utf16be(input, length, utf16_output); -#else - return convert_utf8_to_utf16le(input, length, utf16_output); -#endif -} -simdutf_warn_unused size_t convert_latin1_to_utf8(const char *buf, size_t len, - char *utf8_output) noexcept { - return get_default_implementation()->convert_latin1_to_utf8(buf, len, - utf8_output); -} -simdutf_warn_unused size_t convert_latin1_to_utf16le( - const char *buf, size_t len, char16_t *utf16_output) noexcept { - return get_default_implementation()->convert_latin1_to_utf16le(buf, len, - utf16_output); + +namespace simdutf { +namespace fallback { + +namespace { +using namespace simdutf; } -simdutf_warn_unused size_t convert_latin1_to_utf16be( - const char *buf, size_t len, char16_t *utf16_output) noexcept { - return get_default_implementation()->convert_latin1_to_utf16be(buf, len, - utf16_output); -} -simdutf_warn_unused size_t convert_latin1_to_utf32( - const char *buf, size_t len, char32_t *latin1_output) noexcept { - return get_default_implementation()->convert_latin1_to_utf32(buf, len, - latin1_output); + +class implementation final : public simdutf::implementation { +public: + simdutf_really_inline implementation() + : simdutf::implementation("fallback", "Generic fallback implementation", + 0) {} + simdutf_warn_unused int detect_encodings(const char *input, + size_t length) const noexcept final; + simdutf_warn_unused bool validate_utf8(const char *buf, + size_t len) const noexcept final; + simdutf_warn_unused result + validate_utf8_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_ascii(const char *buf, + size_t len) const noexcept final; + 
simdutf_warn_unused result + validate_ascii_with_errors(const char *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final; + simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) const noexcept final; + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_latin1(const char16_t *buf, 
size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused result + convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16le(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf32_to_utf16be(const char32_t *buf, size_t len, + char16_t *utf16_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16le_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + simdutf_warn_unused size_t + convert_valid_utf16be_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_buffer) const noexcept final; + 
void change_endianness_utf16(const char16_t *buf, size_t length, + char16_t *output) const noexcept final; + simdutf_warn_unused size_t count_utf16le(const char16_t *buf, + size_t length) const noexcept; + simdutf_warn_unused size_t count_utf16be(const char16_t *buf, + size_t length) const noexcept; + simdutf_warn_unused size_t count_utf8(const char *buf, + size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_utf16le(const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_utf16be(const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf16_length_from_utf8(const char *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_utf32(const char32_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf16_length_from_utf32(const char32_t *input, size_t length) const noexcept; + simdutf_warn_unused size_t + utf32_length_from_utf8(const char *input, size_t length) const noexcept; + simdutf_warn_unused size_t + latin1_length_from_utf8(const char *input, size_t length) const noexcept; + simdutf_warn_unused size_t + latin1_length_from_utf16(size_t length) const noexcept; + simdutf_warn_unused size_t + latin1_length_from_utf32(size_t length) const noexcept; + simdutf_warn_unused size_t + utf32_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t + utf16_length_from_latin1(size_t length) const noexcept; + simdutf_warn_unused size_t + utf8_length_from_latin1(const char *input, size_t length) const noexcept; + simdutf_warn_unused size_t maximal_binary_length_from_base64( + const char *input, size_t length) const noexcept; + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept; + simdutf_warn_unused full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + simdutf_warn_unused size_t maximal_binary_length_from_base64( + const char16_t *input, size_t length) const noexcept; + simdutf_warn_unused result base64_to_binary( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept; + simdutf_warn_unused size_t base64_length_from_binary( + size_t length, base64_options options) const noexcept; + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_options = + last_chunk_handling_options::loose) const noexcept; + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept; +}; +} // namespace fallback +} // namespace simdutf + +#endif // SIMDUTF_FALLBACK_IMPLEMENTATION_H +/* end file src/simdutf/fallback/implementation.h */ + +/* begin file src/simdutf/fallback/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "fallback" +// #define SIMDUTF_IMPLEMENTATION fallback +/* end file src/simdutf/fallback/begin.h */ + + // Declarations +/* begin file 
src/simdutf/fallback/bitmanipulation.h */ +#ifndef SIMDUTF_FALLBACK_BITMANIPULATION_H +#define SIMDUTF_FALLBACK_BITMANIPULATION_H + +#include + +namespace simdutf { +namespace fallback { +namespace {} // unnamed namespace +} // namespace fallback +} // namespace simdutf + +#endif // SIMDUTF_FALLBACK_BITMANIPULATION_H +/* end file src/simdutf/fallback/bitmanipulation.h */ + +/* begin file src/simdutf/fallback/end.h */ +/* end file src/simdutf/fallback/end.h */ + +#endif // SIMDUTF_IMPLEMENTATION_FALLBACK +#endif // SIMDUTF_FALLBACK_H +/* end file src/simdutf/fallback.h */ + +/* begin file src/scalar/utf8.h */ +#ifndef SIMDUTF_UTF8_H +#define SIMDUTF_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8 { +#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_RVV +// only used by the fallback kernel. +// credit: based on code from Google Fuchsia (Apache Licensed) +inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept { + const uint8_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + uint32_t code_point = 0; + while (pos < len) { + // check of the next 16 bytes are ascii. + uint64_t next_pos = pos + 16; + if (next_pos <= + len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + pos = next_pos; + continue; + } + } + unsigned char byte = data[pos]; + + while (byte < 0b10000000) { + if (++pos == len) { + return true; + } + byte = data[pos]; + } + + if ((byte & 0b11100000) == 0b11000000) { + next_pos = pos + 2; + if (next_pos > len) { + return false; + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return false; + } + // range check + code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if ((code_point < 0x80) || (0x7ff < code_point)) { + return false; + } + } else if ((byte & 0b11110000) == 0b11100000) { + next_pos = pos + 3; + if (next_pos > len) { + return false; + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return false; + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return false; + } + // range check + code_point = (byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if ((code_point < 0x800) || (0xffff < code_point) || + (0xd7ff < code_point && code_point < 0xe000)) { + return false; + } + } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 + next_pos = pos + 4; + if (next_pos > len) { + return false; + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return false; + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return false; + } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { + return false; + } + // range check + code_point = + (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff || 0x10ffff < code_point) { + return false; + } + } else { + // we may have a continuation + return false; + } + pos = next_pos; + } + return true; } -simdutf_warn_unused size_t convert_utf8_to_latin1( - const char *buf, size_t len, char *latin1_output) noexcept { - return get_default_implementation()->convert_utf8_to_latin1(buf, len, - latin1_output); +#endif + +inline simdutf_warn_unused result validate_with_errors(const char *buf, + size_t len) noexcept { + const uint8_t *data = 
reinterpret_cast(buf); + size_t pos = 0; + uint32_t code_point = 0; + while (pos < len) { + // check of the next 16 bytes are ascii. + size_t next_pos = pos + 16; + if (next_pos <= + len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + pos = next_pos; + continue; + } + } + unsigned char byte = data[pos]; + + while (byte < 0b10000000) { + if (++pos == len) { + return result(error_code::SUCCESS, len); + } + byte = data[pos]; + } + + if ((byte & 0b11100000) == 0b11000000) { + next_pos = pos + 2; + if (next_pos > len) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + // range check + code_point = (byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if ((code_point < 0x80) || (0x7ff < code_point)) { + return result(error_code::OVERLONG, pos); + } + } else if ((byte & 0b11110000) == 0b11100000) { + next_pos = pos + 3; + if (next_pos > len) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + // range check + code_point = (byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if ((code_point < 0x800) || (0xffff < code_point)) { + return result(error_code::OVERLONG, pos); + } + if (0xd7ff < code_point && code_point < 0xe000) { + return result(error_code::SURROGATE, pos); + } + } else if ((byte & 0b11111000) == 0b11110000) { // 0b11110000 + next_pos = pos + 4; + if (next_pos > len) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + // range check + code_point = + (byte & 0b00000111) << 18 | (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff) { + return result(error_code::OVERLONG, pos); + } + if (0x10ffff < code_point) { + return result(error_code::TOO_LARGE, pos); + } + } else { + // we either have too many continuation bytes or an invalid leading byte + if ((byte & 0b11000000) == 0b10000000) { + return result(error_code::TOO_LONG, pos); + } else { + return result(error_code::HEADER_BITS, pos); + } + } + pos = next_pos; + } + return result(error_code::SUCCESS, len); } -simdutf_warn_unused result convert_utf8_to_latin1_with_errors( - const char *buf, size_t len, char *latin1_output) noexcept { - return get_default_implementation()->convert_utf8_to_latin1_with_errors( - buf, len, latin1_output); + +// Finds the previous leading byte starting backward from buf and validates with +// errors from there Used to pinpoint the location of an error when an invalid +// chunk is detected We assume that the stream starts with a leading byte, and +// to check that it is the case, we ask that you pass a pointer to the start of +// the stream (start). 
+inline simdutf_warn_unused result rewind_and_validate_with_errors( + const char *start, const char *buf, size_t len) noexcept { + // First check that we start with a leading byte + if ((*start & 0b11000000) == 0b10000000) { + return result(error_code::TOO_LONG, 0); + } + size_t extra_len{0}; + // A leading byte cannot be further than 4 bytes away + for (int i = 0; i < 5; i++) { + unsigned char byte = *buf; + if ((byte & 0b11000000) != 0b10000000) { + break; + } else { + buf--; + extra_len++; + } + } + + result res = validate_with_errors(buf, len + extra_len); + res.count -= extra_len; + return res; } -simdutf_warn_unused size_t convert_valid_utf8_to_latin1( - const char *buf, size_t len, char *latin1_output) noexcept { - return get_default_implementation()->convert_valid_utf8_to_latin1( - buf, len, latin1_output); + +inline size_t count_code_points(const char *buf, size_t len) { + const int8_t *p = reinterpret_cast(buf); + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + // -65 is 0b10111111, anything larger in two-complement's should start a new + // code point. + if (p[i] > -65) { + counter++; + } + } + return counter; } -simdutf_warn_unused size_t convert_utf8_to_utf16le( - const char *input, size_t length, char16_t *utf16_output) noexcept { - return get_default_implementation()->convert_utf8_to_utf16le(input, length, - utf16_output); + +inline size_t utf16_length_from_utf8(const char *buf, size_t len) { + const int8_t *p = reinterpret_cast(buf); + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + if (p[i] > -65) { + counter++; + } + if (uint8_t(p[i]) >= 240) { + counter++; + } + } + return counter; } -simdutf_warn_unused size_t convert_utf8_to_utf16be( - const char *input, size_t length, char16_t *utf16_output) noexcept { - return get_default_implementation()->convert_utf8_to_utf16be(input, length, - utf16_output); + +simdutf_warn_unused inline size_t trim_partial_utf8(const char *input, + size_t length) { + if (length < 3) { + switch (length) { + case 2: + if (uint8_t(input[length - 1]) >= 0xc0) { + return length - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (uint8_t(input[length - 2]) >= 0xe0) { + return length - 2; + } // 3- and 4-byte characters with only 2 bytes left + return length; + case 1: + if (uint8_t(input[length - 1]) >= 0xc0) { + return length - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + return length; + case 0: + return length; + } + } + if (uint8_t(input[length - 1]) >= 0xc0) { + return length - 1; + } // 2-, 3- and 4-byte characters with only 1 byte left + if (uint8_t(input[length - 2]) >= 0xe0) { + return length - 2; + } // 3- and 4-byte characters with only 1 byte left + if (uint8_t(input[length - 3]) >= 0xf0) { + return length - 3; + } // 4-byte characters with only 3 bytes left + return length; } -simdutf_warn_unused result convert_utf8_to_utf16_with_errors( - const char *input, size_t length, char16_t *utf16_output) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_utf8_to_utf16be_with_errors(input, length, utf16_output); -#else - return convert_utf8_to_utf16le_with_errors(input, length, utf16_output); + +} // namespace utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + #endif +/* end file src/scalar/utf8.h */ +/* begin file src/scalar/utf16.h */ +#ifndef SIMDUTF_UTF16_H +#define SIMDUTF_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16 { + +inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) { + return uint16_t((word >> 8) | 
(word << 8)); } -simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( - const char *input, size_t length, char16_t *utf16_output) noexcept { - return get_default_implementation()->convert_utf8_to_utf16le_with_errors( - input, length, utf16_output); + +template +inline simdutf_warn_unused bool validate(const char16_t *buf, + size_t len) noexcept { + const uint16_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + while (pos < len) { + uint16_t word = + !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos]; + if ((word & 0xF800) == 0xD800) { + if (pos + 1 >= len) { + return false; + } + uint16_t diff = uint16_t(word - 0xD800); + if (diff > 0x3FF) { + return false; + } + uint16_t next_word = + !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return false; + } + pos += 2; + } else { + pos++; + } + } + return true; } -simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( - const char *input, size_t length, char16_t *utf16_output) noexcept { - return get_default_implementation()->convert_utf8_to_utf16be_with_errors( - input, length, utf16_output); + +template +inline simdutf_warn_unused result validate_with_errors(const char16_t *buf, + size_t len) noexcept { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + while (pos < len) { + uint16_t word = + !match_system(big_endian) ? swap_bytes(data[pos]) : data[pos]; + if ((word & 0xF800) == 0xD800) { + if (pos + 1 >= len) { + return result(error_code::SURROGATE, pos); + } + uint16_t diff = uint16_t(word - 0xD800); + if (diff > 0x3FF) { + return result(error_code::SURROGATE, pos); + } + uint16_t next_word = + !match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return result(error_code::SURROGATE, pos); + } + pos += 2; + } else { + pos++; + } + } + return result(error_code::SUCCESS, pos); } -simdutf_warn_unused size_t convert_utf8_to_utf32( - const char *input, size_t length, char32_t *utf32_output) noexcept { - return get_default_implementation()->convert_utf8_to_utf32(input, length, - utf32_output); + +template +inline size_t count_code_points(const char16_t *buf, size_t len) { + // We are not BOM aware. + const uint16_t *p = reinterpret_cast(buf); + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; + counter += ((word & 0xFC00) != 0xDC00); + } + return counter; } -simdutf_warn_unused result convert_utf8_to_utf32_with_errors( - const char *input, size_t length, char32_t *utf32_output) noexcept { - return get_default_implementation()->convert_utf8_to_utf32_with_errors( - input, length, utf32_output); + +template +inline size_t utf8_length_from_utf16(const char16_t *buf, size_t len) { + // We are not BOM aware. + const uint16_t *p = reinterpret_cast(buf); + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + uint16_t word = !match_system(big_endian) ? 
swap_bytes(p[i]) : p[i]; + counter++; // ASCII + counter += static_cast( + word > + 0x7F); // non-ASCII is at least 2 bytes, surrogates are 2*2 == 4 bytes + counter += static_cast((word > 0x7FF && word <= 0xD7FF) || + (word >= 0xE000)); // three-byte + } + return counter; } -simdutf_warn_unused bool validate_utf16(const char16_t *buf, - size_t len) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return validate_utf16be(buf, len); -#else - return validate_utf16le(buf, len); -#endif + +template +inline size_t utf32_length_from_utf16(const char16_t *buf, size_t len) { + // We are not BOM aware. + const uint16_t *p = reinterpret_cast(buf); + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i]; + counter += ((word & 0xFC00) != 0xDC00); + } + return counter; } -simdutf_warn_unused bool validate_utf16le(const char16_t *buf, - size_t len) noexcept { - return get_default_implementation()->validate_utf16le(buf, len); + +inline size_t latin1_length_from_utf16(size_t len) { return len; } + +simdutf_really_inline void change_endianness_utf16(const char16_t *in, + size_t size, char16_t *out) { + const uint16_t *input = reinterpret_cast(in); + uint16_t *output = reinterpret_cast(out); + for (size_t i = 0; i < size; i++) { + *output++ = uint16_t(input[i] >> 8 | input[i] << 8); + } } -simdutf_warn_unused bool validate_utf16be(const char16_t *buf, - size_t len) noexcept { - return get_default_implementation()->validate_utf16be(buf, len); + +template +simdutf_warn_unused inline size_t trim_partial_utf16(const char16_t *input, + size_t length) { + if (length <= 1) { + return length; + } + uint16_t last_word = uint16_t(input[length - 1]); + last_word = !match_system(big_endian) ? swap_bytes(last_word) : last_word; + length -= ((last_word & 0xFC00) == 0xD800); + return length; } -simdutf_warn_unused result validate_utf16_with_errors(const char16_t *buf, - size_t len) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return validate_utf16be_with_errors(buf, len); -#else - return validate_utf16le_with_errors(buf, len); + +} // namespace utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + #endif +/* end file src/scalar/utf16.h */ +/* begin file src/scalar/utf32.h */ +#ifndef SIMDUTF_UTF32_H +#define SIMDUTF_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32 { + +inline simdutf_warn_unused bool validate(const char32_t *buf, + size_t len) noexcept { + const uint32_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + for (; pos < len; pos++) { + uint32_t word = data[pos]; + if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) { + return false; + } + } + return true; } -simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, - size_t len) noexcept { - return get_default_implementation()->validate_utf16le_with_errors(buf, len); -} -simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, - size_t len) noexcept { - return get_default_implementation()->validate_utf16be_with_errors(buf, len); -} -simdutf_warn_unused bool validate_utf32(const char32_t *buf, - size_t len) noexcept { - return get_default_implementation()->validate_utf32(buf, len); -} -simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, - size_t len) noexcept { - return get_default_implementation()->validate_utf32_with_errors(buf, len); + +inline simdutf_warn_unused result validate_with_errors(const char32_t *buf, + size_t len) noexcept { + const uint32_t *data = 
reinterpret_cast(buf); + size_t pos = 0; + for (; pos < len; pos++) { + uint32_t word = data[pos]; + if (word > 0x10FFFF) { + return result(error_code::TOO_LARGE, pos); + } + if (word >= 0xD800 && word <= 0xDFFF) { + return result(error_code::SURROGATE, pos); + } + } + return result(error_code::SUCCESS, pos); } -simdutf_warn_unused size_t convert_valid_utf8_to_utf16( - const char *input, size_t length, char16_t *utf16_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_valid_utf8_to_utf16be(input, length, utf16_buffer); -#else - return convert_valid_utf8_to_utf16le(input, length, utf16_buffer); -#endif + +inline size_t utf8_length_from_utf32(const char32_t *buf, size_t len) { + // We are not BOM aware. + const uint32_t *p = reinterpret_cast(buf); + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + // credit: @ttsugriy for the vectorizable approach + counter++; // ASCII + counter += static_cast(p[i] > 0x7F); // two-byte + counter += static_cast(p[i] > 0x7FF); // three-byte + counter += static_cast(p[i] > 0xFFFF); // four-bytes + } + return counter; } -simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( - const char *input, size_t length, char16_t *utf16_buffer) noexcept { - return get_default_implementation()->convert_valid_utf8_to_utf16le( - input, length, utf16_buffer); + +inline size_t utf16_length_from_utf32(const char32_t *buf, size_t len) { + // We are not BOM aware. + const uint32_t *p = reinterpret_cast(buf); + size_t counter{0}; + for (size_t i = 0; i < len; i++) { + counter++; // non-surrogate word + counter += static_cast(p[i] > 0xFFFF); // surrogate pair + } + return counter; } -simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( - const char *input, size_t length, char16_t *utf16_buffer) noexcept { - return get_default_implementation()->convert_valid_utf8_to_utf16be( - input, length, utf16_buffer); + +inline size_t latin1_length_from_utf32(size_t len) { + // We are not BOM aware. + return len; // a utf32 codepoint will always represent 1 latin1 character } -simdutf_warn_unused size_t convert_valid_utf8_to_utf32( - const char *input, size_t length, char32_t *utf32_buffer) noexcept { - return get_default_implementation()->convert_valid_utf8_to_utf32( - input, length, utf32_buffer); + +inline simdutf_warn_unused uint32_t swap_bytes(const uint32_t word) { + return ((word >> 24) & 0xff) | // move byte 3 to byte 0 + ((word << 8) & 0xff0000) | // move byte 1 to byte 2 + ((word >> 8) & 0xff00) | // move byte 2 to byte 1 + ((word << 24) & 0xff000000); // byte 0 to byte 3 } -simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *buf, - size_t len, - char *utf8_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_utf8(buf, len, utf8_buffer); -#else - return convert_utf16le_to_utf8(buf, len, utf8_buffer); + +} // namespace utf32 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + #endif +/* end file src/scalar/utf32.h */ +/* begin file src/scalar/base64.h */ +#ifndef SIMDUTF_BASE64_H +#define SIMDUTF_BASE64_H + +#include +#include +#include +#include + +namespace simdutf { +namespace scalar { +namespace { +namespace base64 { + +// This function is not expected to be fast. Do not use in long loops. 
+template bool is_ascii_white_space(char_type c) { + return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f'; } -simdutf_warn_unused size_t convert_utf16_to_latin1( - const char16_t *buf, size_t len, char *latin1_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_latin1(buf, len, latin1_buffer); -#else - return convert_utf16le_to_latin1(buf, len, latin1_buffer); -#endif + +template bool is_ascii_white_space_or_padding(char_type c) { + return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || + c == '='; } -simdutf_warn_unused size_t convert_latin1_to_utf16( - const char *buf, size_t len, char16_t *utf16_output) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_latin1_to_utf16be(buf, len, utf16_output); -#else - return convert_latin1_to_utf16le(buf, len, utf16_output); -#endif + +template bool is_eight_byte(char_type c) { + if (sizeof(char_type) == 1) { + return true; + } + return uint8_t(c) == c; } -simdutf_warn_unused size_t convert_utf16be_to_latin1( - const char16_t *buf, size_t len, char *latin1_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_latin1(buf, len, - latin1_buffer); + +// Returns true upon success. The destination buffer must be large enough. +// This functions assumes that the padding (=) has been removed. +template +full_result +base64_tail_decode(char *dst, const char_type *src, size_t length, + size_t padded_characters, // number of padding characters + // '=', typically 0, 1, 2. + base64_options options, + last_chunk_handling_options last_chunk_options) { + // This looks like 5 branches, but we expect the compiler to resolve this to a + // single branch: + const uint8_t *to_base64 = (options & base64_url) + ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value; + const uint32_t *d0 = (options & base64_url) + ? tables::base64::base64_url::d0 + : tables::base64::base64_default::d0; + const uint32_t *d1 = (options & base64_url) + ? tables::base64::base64_url::d1 + : tables::base64::base64_default::d1; + const uint32_t *d2 = (options & base64_url) + ? tables::base64::base64_url::d2 + : tables::base64::base64_default::d2; + const uint32_t *d3 = (options & base64_url) + ? tables::base64::base64_url::d3 + : tables::base64::base64_default::d3; + + const char_type *srcend = src + length; + const char_type *srcinit = src; + const char *dstinit = dst; + + uint32_t x; + size_t idx; + uint8_t buffer[4]; + while (true) { + while (src + 4 <= srcend && is_eight_byte(src[0]) && + is_eight_byte(src[1]) && is_eight_byte(src[2]) && + is_eight_byte(src[3]) && + (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] | + d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) { + if (match_system(endianness::BIG)) { + x = scalar::utf32::swap_bytes(x); + } + std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes + dst += 3; + src += 4; + } + idx = 0; + // we need at least four characters. + while (idx < 4 && src < srcend) { + char_type c = *src; + uint8_t code = to_base64[uint8_t(c)]; + buffer[idx] = uint8_t(code); + if (is_eight_byte(c) && code <= 63) { + idx++; + } else if (code > 64 || !scalar::base64::is_eight_byte(c)) { + return {INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } else { + // We have a space or a newline. We ignore it. 
+ } + src++; + } + if (idx != 4) { + if (last_chunk_options == last_chunk_handling_options::strict && + (idx != 1) && ((idx + padded_characters) & 3) != 0) { + // The partial chunk was at src - idx + return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } else if (last_chunk_options == + last_chunk_handling_options::stop_before_partial && + (idx != 1) && ((idx + padded_characters) & 3) != 0) { + // Rewind src to before partial chunk + src -= idx; + return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)}; + } else { + if (idx == 2) { + uint32_t triple = + (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6); + if ((last_chunk_options == last_chunk_handling_options::strict) && + (triple & 0xffff)) { + return {BASE64_EXTRA_BITS, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + if (match_system(endianness::BIG)) { + triple <<= 8; + std::memcpy(dst, &triple, 1); + } else { + triple = scalar::utf32::swap_bytes(triple); + triple >>= 8; + std::memcpy(dst, &triple, 1); + } + dst += 1; + } else if (idx == 3) { + uint32_t triple = (uint32_t(buffer[0]) << 3 * 6) + + (uint32_t(buffer[1]) << 2 * 6) + + (uint32_t(buffer[2]) << 1 * 6); + if ((last_chunk_options == last_chunk_handling_options::strict) && + (triple & 0xff)) { + return {BASE64_EXTRA_BITS, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + if (match_system(endianness::BIG)) { + triple <<= 8; + std::memcpy(dst, &triple, 2); + } else { + triple = scalar::utf32::swap_bytes(triple); + triple >>= 8; + std::memcpy(dst, &triple, 2); + } + dst += 2; + } else if (idx == 1) { + return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)}; + } + } + + uint32_t triple = + (uint32_t(buffer[0]) << 3 * 6) + (uint32_t(buffer[1]) << 2 * 6) + + (uint32_t(buffer[2]) << 1 * 6) + (uint32_t(buffer[3]) << 0 * 6); + if (match_system(endianness::BIG)) { + triple <<= 8; + std::memcpy(dst, &triple, 3); + } else { + triple = scalar::utf32::swap_bytes(triple); + triple >>= 8; + std::memcpy(dst, &triple, 3); + } + dst += 3; + } } -simdutf_warn_unused size_t convert_utf16le_to_latin1( - const char16_t *buf, size_t len, char *latin1_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_latin1(buf, len, - latin1_buffer); -} -simdutf_warn_unused size_t convert_valid_utf16be_to_latin1( - const char16_t *buf, size_t len, char *latin1_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16be_to_latin1( - buf, len, latin1_buffer); -} -simdutf_warn_unused size_t convert_valid_utf16le_to_latin1( - const char16_t *buf, size_t len, char *latin1_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16le_to_latin1( - buf, len, latin1_buffer); -} -simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( - const char16_t *buf, size_t len, char *latin1_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_latin1_with_errors( - buf, len, latin1_buffer); + +// like base64_tail_decode, but it will not write past the end of the output +// buffer. The outlen paramter is modified to reflect the number of bytes +// written. This functions assumes that the padding (=) has been removed. +template +result base64_tail_decode_safe( + char *dst, size_t &outlen, const char_type *&srcr, size_t length, + size_t padded_characters, // number of padding characters '=', typically 0, + // 1, 2. 
+ base64_options options, last_chunk_handling_options last_chunk_options) { + const char_type *src = srcr; + if (length == 0) { + outlen = 0; + return {SUCCESS, 0}; + } + // This looks like 5 branches, but we expect the compiler to resolve this to a + // single branch: + const uint8_t *to_base64 = (options & base64_url) + ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value; + const uint32_t *d0 = (options & base64_url) + ? tables::base64::base64_url::d0 + : tables::base64::base64_default::d0; + const uint32_t *d1 = (options & base64_url) + ? tables::base64::base64_url::d1 + : tables::base64::base64_default::d1; + const uint32_t *d2 = (options & base64_url) + ? tables::base64::base64_url::d2 + : tables::base64::base64_default::d2; + const uint32_t *d3 = (options & base64_url) + ? tables::base64::base64_url::d3 + : tables::base64::base64_default::d3; + + const char_type *srcend = src + length; + const char_type *srcinit = src; + const char *dstinit = dst; + const char *dstend = dst + outlen; + + uint32_t x; + size_t idx; + uint8_t buffer[4]; + while (true) { + while (src + 4 <= srcend && is_eight_byte(src[0]) && + is_eight_byte(src[1]) && is_eight_byte(src[2]) && + is_eight_byte(src[3]) && + (x = d0[uint8_t(src[0])] | d1[uint8_t(src[1])] | + d2[uint8_t(src[2])] | d3[uint8_t(src[3])]) < 0x01FFFFFF) { + if (dstend - dst < 3) { + outlen = size_t(dst - dstinit); + srcr = src; + return {OUTPUT_BUFFER_TOO_SMALL, size_t(src - srcinit)}; + } + if (match_system(endianness::BIG)) { + x = scalar::utf32::swap_bytes(x); + } + std::memcpy(dst, &x, 3); // optimization opportunity: copy 4 bytes + dst += 3; + src += 4; + } + idx = 0; + const char_type *srccur = src; + // We need at least four characters. + while (idx < 4 && src < srcend) { + char_type c = *src; + uint8_t code = to_base64[uint8_t(c)]; + + buffer[idx] = uint8_t(code); + if (is_eight_byte(c) && code <= 63) { + idx++; + } else if (code > 64 || !scalar::base64::is_eight_byte(c)) { + outlen = size_t(dst - dstinit); + srcr = src; + return {INVALID_BASE64_CHARACTER, size_t(src - srcinit)}; + } else { + // We have a space or a newline. We ignore it. + } + src++; + } + if (idx != 4) { + if (last_chunk_options == last_chunk_handling_options::strict && + ((idx + padded_characters) & 3) != 0) { + outlen = size_t(dst - dstinit); + srcr = src; + return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)}; + } else if (last_chunk_options == + last_chunk_handling_options::stop_before_partial && + ((idx + padded_characters) & 3) != 0) { + // Rewind src to before partial chunk + srcr = srccur; + outlen = size_t(dst - dstinit); + return {SUCCESS, size_t(dst - dstinit)}; + } else { // loose mode + if (idx == 0) { + // No data left; return success + outlen = size_t(dst - dstinit); + srcr = src; + return {SUCCESS, size_t(dst - dstinit)}; + } else if (idx == 1) { + // Error: Incomplete chunk of length 1 is invalid in loose mode + outlen = size_t(dst - dstinit); + srcr = src; + return {BASE64_INPUT_REMAINDER, size_t(src - srcinit)}; + } else if (idx == 2 || idx == 3) { + // Check if there's enough space in the destination buffer + size_t required_space = (idx == 2) ? 
1 : 2; + if (size_t(dstend - dst) < required_space) { + outlen = size_t(dst - dstinit); + srcr = src; + return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)}; + } + uint32_t triple = 0; + if (idx == 2) { + triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12); + if ((last_chunk_options == last_chunk_handling_options::strict) && + (triple & 0xffff)) { + srcr = src; + return {BASE64_EXTRA_BITS, size_t(src - srcinit)}; + } + // Extract the first byte + triple >>= 16; + dst[0] = static_cast(triple & 0xFF); + dst += 1; + } else if (idx == 3) { + triple = (uint32_t(buffer[0]) << 18) + (uint32_t(buffer[1]) << 12) + + (uint32_t(buffer[2]) << 6); + if ((last_chunk_options == last_chunk_handling_options::strict) && + (triple & 0xff)) { + srcr = src; + return {BASE64_EXTRA_BITS, size_t(src - srcinit)}; + } + // Extract the first two bytes + triple >>= 8; + dst[0] = static_cast((triple >> 8) & 0xFF); + dst[1] = static_cast(triple & 0xFF); + dst += 2; + } + outlen = size_t(dst - dstinit); + srcr = src; + return {SUCCESS, size_t(dst - dstinit)}; + } + } + } + + if (dstend - dst < 3) { + outlen = size_t(dst - dstinit); + srcr = src; + return {OUTPUT_BUFFER_TOO_SMALL, size_t(srccur - srcinit)}; + } + uint32_t triple = (uint32_t(buffer[0]) << 18) + + (uint32_t(buffer[1]) << 12) + (uint32_t(buffer[2]) << 6) + + (uint32_t(buffer[3])); + if (match_system(endianness::BIG)) { + triple <<= 8; + std::memcpy(dst, &triple, 3); + } else { + triple = scalar::utf32::swap_bytes(triple); + triple >>= 8; + std::memcpy(dst, &triple, 3); + } + dst += 3; + } } -simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( - const char16_t *buf, size_t len, char *latin1_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_latin1_with_errors( - buf, len, latin1_buffer); + +// Returns the number of bytes written. The destination buffer must be large +// enough. It will add padding (=) if needed. +size_t tail_encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + // By default, we use padding if we are not using the URL variant. + // This is check with ((options & base64_url) == 0) which returns true if we + // are not using the URL variant. However, we also allow 'inversion' of the + // convention with the base64_reverse_padding option. If the + // base64_reverse_padding option is set, we use padding if we are using the + // URL variant, and we omit it if we are not using the URL variant. This is + // checked with + // ((options & base64_reverse_padding) == base64_reverse_padding). + bool use_padding = + ((options & base64_url) == 0) ^ + ((options & base64_reverse_padding) == base64_reverse_padding); + // This looks like 3 branches, but we expect the compiler to resolve this to + // a single branch: + const char *e0 = (options & base64_url) ? tables::base64::base64_url::e0 + : tables::base64::base64_default::e0; + const char *e1 = (options & base64_url) ? tables::base64::base64_url::e1 + : tables::base64::base64_default::e1; + const char *e2 = (options & base64_url) ? 
tables::base64::base64_url::e2 + : tables::base64::base64_default::e2; + char *out = dst; + size_t i = 0; + uint8_t t1, t2, t3; + for (; i + 2 < srclen; i += 3) { + t1 = uint8_t(src[i]); + t2 = uint8_t(src[i + 1]); + t3 = uint8_t(src[i + 2]); + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e1[((t2 & 0x0F) << 2) | ((t3 >> 6) & 0x03)]; + *out++ = e2[t3]; + } + switch (srclen - i) { + case 0: + break; + case 1: + t1 = uint8_t(src[i]); + *out++ = e0[t1]; + *out++ = e1[(t1 & 0x03) << 4]; + if (use_padding) { + *out++ = '='; + *out++ = '='; + } + break; + default: /* case 2 */ + t1 = uint8_t(src[i]); + t2 = uint8_t(src[i + 1]); + *out++ = e0[t1]; + *out++ = e1[((t1 & 0x03) << 4) | ((t2 >> 4) & 0x0F)]; + *out++ = e2[(t2 & 0x0F) << 2]; + if (use_padding) { + *out++ = '='; + } + } + return (size_t)(out - dst); } -simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *buf, - size_t len, - char *utf8_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_utf8(buf, len, - utf8_buffer); + +template +simdutf_warn_unused size_t maximal_binary_length_from_base64( + const char_type *input, size_t length) noexcept { + // We follow https://infra.spec.whatwg.org/#forgiving-base64-decode + size_t padding = 0; + if (length > 0) { + if (input[length - 1] == '=') { + padding++; + if (length > 1 && input[length - 2] == '=') { + padding++; + } + } + } + size_t actual_length = length - padding; + if (actual_length % 4 <= 1) { + return actual_length / 4 * 3; + } + // if we have a valid input, then the remainder must be 2 or 3 adding one or + // two extra bytes. + return actual_length / 4 * 3 + (actual_length % 4) - 1; } -simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *buf, - size_t len, - char *utf8_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_utf8(buf, len, - utf8_buffer); + +simdutf_warn_unused size_t +base64_length_from_binary(size_t length, base64_options options) noexcept { + // By default, we use padding if we are not using the URL variant. + // This is check with ((options & base64_url) == 0) which returns true if we + // are not using the URL variant. However, we also allow 'inversion' of the + // convention with the base64_reverse_padding option. If the + // base64_reverse_padding option is set, we use padding if we are using the + // URL variant, and we omit it if we are not using the URL variant. This is + // checked with + // ((options & base64_reverse_padding) == base64_reverse_padding). + bool use_padding = + ((options & base64_url) == 0) ^ + ((options & base64_reverse_padding) == base64_reverse_padding); + if (!use_padding) { + return length / 3 * 4 + ((length % 3) ? (length % 3) + 1 : 0); + } + return (length + 2) / 3 * + 4; // We use padding to make the length a multiple of 4. 
} -simdutf_warn_unused result convert_utf16_to_utf8_with_errors( - const char16_t *buf, size_t len, char *utf8_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer); -#else - return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); + +} // namespace base64 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + #endif +/* end file src/scalar/base64.h */ +/* begin file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ +#ifndef SIMDUTF_LATIN1_TO_UTF8_H +#define SIMDUTF_LATIN1_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace latin1_to_utf8 { + +inline size_t convert(const char *buf, size_t len, char *utf8_output) { + const unsigned char *data = reinterpret_cast(buf); + size_t pos = 0; + size_t utf8_pos = 0; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= + len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | + v2}; // We are only interested in these bits: 1000 1000 1000 + // 1000, so it makes sense to concatenate everything + if ((v & 0x8080808080808080) == + 0) { // if NONE of these are set, e.g. all of them are zero, then + // everything is ASCII + size_t final_pos = pos + 16; + while (pos < final_pos) { + utf8_output[utf8_pos++] = char(buf[pos]); + pos++; + } + continue; + } + } + + unsigned char byte = data[pos]; + if ((byte & 0x80) == 0) { // if ASCII + // will generate one UTF-8 bytes + utf8_output[utf8_pos++] = char(byte); + pos++; + } else { + // will generate two UTF-8 bytes + utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000); + utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000); + pos++; + } + } + return utf8_pos; } -simdutf_warn_unused result convert_utf16_to_latin1_with_errors( - const char16_t *buf, size_t len, char *latin1_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer); -#else - return convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer); + +inline size_t convert_safe(const char *buf, size_t len, char *utf8_output, + size_t utf8_len) { + const unsigned char *data = reinterpret_cast(buf); + size_t pos = 0; + size_t skip_pos = 0; + size_t utf8_pos = 0; + while (pos < len && utf8_pos < utf8_len) { + // try to convert the next block of 16 ASCII bytes + if (pos >= skip_pos && pos + 16 <= len && + utf8_pos + 16 <= utf8_len) { // if it is safe to read 16 more bytes, + // check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | + v2}; // We are only interested in these bits: 1000 1000 1000 + // 1000, so it makes sense to concatenate everything + if ((v & 0x8080808080808080) == + 0) { // if NONE of these are set, e.g. 
all of them are zero, then + // everything is ASCII + ::memcpy(utf8_output + utf8_pos, buf + pos, 16); + utf8_pos += 16; + pos += 16; + } else { + // At least one of the next 16 bytes are not ASCII, we will process them + // one by one + skip_pos = pos + 16; + } + } else { + const auto byte = data[pos]; + if ((byte & 0x80) == 0) { // if ASCII + // will generate one UTF-8 bytes + utf8_output[utf8_pos++] = char(byte); + pos++; + } else if (utf8_pos + 2 <= utf8_len) { + // will generate two UTF-8 bytes + utf8_output[utf8_pos++] = char((byte >> 6) | 0b11000000); + utf8_output[utf8_pos++] = char((byte & 0b111111) | 0b10000000); + pos++; + } else { + break; + } + } + } + return utf8_pos; +} + +} // namespace latin1_to_utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + #endif +/* end file src/scalar/latin1_to_utf8/latin1_to_utf8.h */ + +namespace simdutf { +bool implementation::supported_by_runtime_system() const { + uint32_t required_instruction_sets = this->required_instruction_sets(); + uint32_t supported_instruction_sets = + internal::detect_supported_architectures(); + return ((supported_instruction_sets & required_instruction_sets) == + required_instruction_sets); } -simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( - const char16_t *buf, size_t len, char *utf8_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_utf8_with_errors( - buf, len, utf8_buffer); + +simdutf_warn_unused encoding_type implementation::autodetect_encoding( + const char *input, size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + // UTF8 is common, it includes ASCII, and is commonly represented + // without a BOM, so if it fits, go with that. Note that it is still + // possible to get it wrong, we are only 'guessing'. If some has UTF-16 + // data without a BOM, it could pass as UTF-8. + // + // An interesting twist might be to check for UTF-16 ASCII first (every + // other byte is zero). + if (validate_utf8(input, length)) { + return encoding_type::UTF8; + } + // The next most common encoding that might appear without BOM is probably + // UTF-16LE, so try that next. + if ((length % 2) == 0) { + // important: we need to divide by two + if (validate_utf16le(reinterpret_cast(input), + length / 2)) { + return encoding_type::UTF16_LE; + } + } + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { + return encoding_type::UTF32_LE; + } + } + return encoding_type::unspecified; } -simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( - const char16_t *buf, size_t len, char *utf8_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_utf8_with_errors( - buf, len, utf8_buffer); + +namespace internal { +// When there is a single implementation, we should not pay a price +// for dispatching to the best implementation. We should just use the +// one we have. This is a compile-time check. +#define SIMDUTF_SINGLE_IMPLEMENTATION \ + (SIMDUTF_IMPLEMENTATION_ICELAKE + SIMDUTF_IMPLEMENTATION_HASWELL + \ + SIMDUTF_IMPLEMENTATION_WESTMERE + SIMDUTF_IMPLEMENTATION_ARM64 + \ + SIMDUTF_IMPLEMENTATION_PPC64 + SIMDUTF_IMPLEMENTATION_LSX + \ + SIMDUTF_IMPLEMENTATION_LASX + SIMDUTF_IMPLEMENTATION_FALLBACK == \ + 1) + +// Static array of known implementations. We are hoping these get baked into the +// executable without requiring a static initializer. 
+ +#if SIMDUTF_IMPLEMENTATION_ICELAKE +static const icelake::implementation *get_icelake_singleton() { + static const icelake::implementation icelake_singleton{}; + return &icelake_singleton; } -simdutf_warn_unused size_t convert_valid_utf16_to_utf8( - const char16_t *buf, size_t len, char *utf8_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer); -#else - return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); #endif +#if SIMDUTF_IMPLEMENTATION_HASWELL +static const haswell::implementation *get_haswell_singleton() { + static const haswell::implementation haswell_singleton{}; + return &haswell_singleton; } -simdutf_warn_unused size_t convert_valid_utf16_to_latin1( - const char16_t *buf, size_t len, char *latin1_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_valid_utf16be_to_latin1(buf, len, latin1_buffer); -#else - return convert_valid_utf16le_to_latin1(buf, len, latin1_buffer); #endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE +static const westmere::implementation *get_westmere_singleton() { + static const westmere::implementation westmere_singleton{}; + return &westmere_singleton; } -simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( - const char16_t *buf, size_t len, char *utf8_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16le_to_utf8( - buf, len, utf8_buffer); +#endif +#if SIMDUTF_IMPLEMENTATION_ARM64 +static const arm64::implementation *get_arm64_singleton() { + static const arm64::implementation arm64_singleton{}; + return &arm64_singleton; } -simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( - const char16_t *buf, size_t len, char *utf8_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16be_to_utf8( - buf, len, utf8_buffer); -} -simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *buf, - size_t len, - char *utf8_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf8(buf, len, - utf8_buffer); -} -simdutf_warn_unused result convert_utf32_to_utf8_with_errors( - const char32_t *buf, size_t len, char *utf8_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf8_with_errors( - buf, len, utf8_buffer); -} -simdutf_warn_unused size_t convert_valid_utf32_to_utf8( - const char32_t *buf, size_t len, char *utf8_buffer) noexcept { - return get_default_implementation()->convert_valid_utf32_to_utf8(buf, len, - utf8_buffer); -} -simdutf_warn_unused size_t convert_utf32_to_utf16( - const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_utf32_to_utf16be(buf, len, utf16_buffer); -#else - return convert_utf32_to_utf16le(buf, len, utf16_buffer); -#endif -} -simdutf_warn_unused size_t convert_utf32_to_latin1( - const char32_t *input, size_t length, char *latin1_output) noexcept { - return get_default_implementation()->convert_utf32_to_latin1(input, length, - latin1_output); -} -simdutf_warn_unused size_t convert_utf32_to_utf16le( - const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf16le(buf, len, - utf16_buffer); -} -simdutf_warn_unused size_t convert_utf32_to_utf16be( - const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf16be(buf, len, - utf16_buffer); -} -simdutf_warn_unused result convert_utf32_to_utf16_with_errors( - const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { -#if 
SIMDUTF_IS_BIG_ENDIAN - return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer); -#else - return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer); -#endif -} -simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( - const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf16le_with_errors( - buf, len, utf16_buffer); -} -simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( - const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { - return get_default_implementation()->convert_utf32_to_utf16be_with_errors( - buf, len, utf16_buffer); -} -simdutf_warn_unused size_t convert_valid_utf32_to_utf16( - const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer); -#else - return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer); #endif +#if SIMDUTF_IMPLEMENTATION_PPC64 +static const ppc64::implementation *get_ppc64_singleton() { + static const ppc64::implementation ppc64_singleton{}; + return &ppc64_singleton; } -simdutf_warn_unused size_t convert_valid_utf32_to_utf16le( - const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { - return get_default_implementation()->convert_valid_utf32_to_utf16le( - buf, len, utf16_buffer); -} -simdutf_warn_unused size_t convert_valid_utf32_to_utf16be( - const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { - return get_default_implementation()->convert_valid_utf32_to_utf16be( - buf, len, utf16_buffer); -} -simdutf_warn_unused size_t convert_utf16_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_utf32(buf, len, utf32_buffer); -#else - return convert_utf16le_to_utf32(buf, len, utf32_buffer); #endif +#if SIMDUTF_IMPLEMENTATION_RVV +static const rvv::implementation *get_rvv_singleton() { + static const rvv::implementation rvv_singleton{}; + return &rvv_singleton; } -simdutf_warn_unused size_t convert_utf16le_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_utf32(buf, len, - utf32_buffer); -} -simdutf_warn_unused size_t convert_utf16be_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_utf32(buf, len, - utf32_buffer); -} -simdutf_warn_unused result convert_utf16_to_utf32_with_errors( - const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer); -#else - return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer); #endif +#if SIMDUTF_IMPLEMENTATION_LSX +static const lsx::implementation *get_lsx_singleton() { + static const lsx::implementation lsx_singleton{}; + return &lsx_singleton; } -simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( - const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { - return get_default_implementation()->convert_utf16le_to_utf32_with_errors( - buf, len, utf32_buffer); -} -simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( - const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { - return get_default_implementation()->convert_utf16be_to_utf32_with_errors( - buf, len, utf32_buffer); -} -simdutf_warn_unused size_t convert_valid_utf16_to_utf32( - const char16_t *buf, size_t 
len, char32_t *utf32_buffer) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer); -#else - return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer); #endif +#if SIMDUTF_IMPLEMENTATION_LASX +static const lasx::implementation *get_lasx_singleton() { + static const lasx::implementation lasx_singleton{}; + return &lasx_singleton; } -simdutf_warn_unused size_t convert_valid_utf16le_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16le_to_utf32( - buf, len, utf32_buffer); -} -simdutf_warn_unused size_t convert_valid_utf16be_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { - return get_default_implementation()->convert_valid_utf16be_to_utf32( - buf, len, utf32_buffer); -} -void change_endianness_utf16(const char16_t *input, size_t length, - char16_t *output) noexcept { - get_default_implementation()->change_endianness_utf16(input, length, output); -} -simdutf_warn_unused size_t count_utf16(const char16_t *input, - size_t length) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return count_utf16be(input, length); -#else - return count_utf16le(input, length); #endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK +static const fallback::implementation *get_fallback_singleton() { + static const fallback::implementation fallback_singleton{}; + return &fallback_singleton; } -simdutf_warn_unused size_t count_utf16le(const char16_t *input, - size_t length) noexcept { - return get_default_implementation()->count_utf16le(input, length); -} -simdutf_warn_unused size_t count_utf16be(const char16_t *input, - size_t length) noexcept { - return get_default_implementation()->count_utf16be(input, length); -} -simdutf_warn_unused size_t count_utf8(const char *input, - size_t length) noexcept { - return get_default_implementation()->count_utf8(input, length); -} -simdutf_warn_unused size_t latin1_length_from_utf8(const char *buf, - size_t len) noexcept { - return get_default_implementation()->latin1_length_from_utf8(buf, len); -} -simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) noexcept { - return get_default_implementation()->latin1_length_from_utf16(len); -} -simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) noexcept { - return get_default_implementation()->latin1_length_from_utf32(len); -} -simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf, - size_t len) noexcept { - return get_default_implementation()->utf8_length_from_latin1(buf, len); -} -simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input, - size_t length) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return utf8_length_from_utf16be(input, length); -#else - return utf8_length_from_utf16le(input, length); #endif + +#if SIMDUTF_SINGLE_IMPLEMENTATION +static const implementation *get_single_implementation() { + return + #if SIMDUTF_IMPLEMENTATION_ICELAKE + get_icelake_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_HASWELL + get_haswell_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_WESTMERE + get_westmere_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_ARM64 + get_arm64_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_PPC64 + get_ppc64_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_LSX + get_lsx_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_LASX + get_lasx_singleton(); + #endif + #if SIMDUTF_IMPLEMENTATION_FALLBACK + get_fallback_singleton(); + #endif } -simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input, 
- size_t length) noexcept { - return get_default_implementation()->utf8_length_from_utf16le(input, length); -} -simdutf_warn_unused size_t utf8_length_from_utf16be(const char16_t *input, - size_t length) noexcept { - return get_default_implementation()->utf8_length_from_utf16be(input, length); -} -simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input, - size_t length) noexcept { -#if SIMDUTF_IS_BIG_ENDIAN - return utf32_length_from_utf16be(input, length); -#else - return utf32_length_from_utf16le(input, length); #endif -} -simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input, - size_t length) noexcept { - return get_default_implementation()->utf32_length_from_utf16le(input, length); -} -simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input, - size_t length) noexcept { - return get_default_implementation()->utf32_length_from_utf16be(input, length); -} -simdutf_warn_unused size_t utf16_length_from_utf8(const char *input, - size_t length) noexcept { - return get_default_implementation()->utf16_length_from_utf8(input, length); -} -simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept { - return get_default_implementation()->utf16_length_from_latin1(length); -} -simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input, - size_t length) noexcept { - return get_default_implementation()->utf8_length_from_utf32(input, length); -} -simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input, - size_t length) noexcept { - return get_default_implementation()->utf16_length_from_utf32(input, length); -} -simdutf_warn_unused size_t utf32_length_from_utf8(const char *input, - size_t length) noexcept { - return get_default_implementation()->utf32_length_from_utf8(input, length); -} -simdutf_warn_unused size_t -maximal_binary_length_from_base64(const char *input, size_t length) noexcept { - return get_default_implementation()->maximal_binary_length_from_base64( - input, length); -} +/** + * @private Detects best supported implementation on first use, and sets it + */ +class detect_best_supported_implementation_on_first_use final + : public implementation { +public: + std::string name() const noexcept final { return set_best()->name(); } + std::string description() const noexcept final { + return set_best()->description(); + } + uint32_t required_instruction_sets() const noexcept final { + return set_best()->required_instruction_sets(); + } -simdutf_warn_unused result base64_to_binary( - const char *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_handling_options) noexcept { - return get_default_implementation()->base64_to_binary( - input, length, output, options, last_chunk_handling_options); -} + simdutf_warn_unused int + detect_encodings(const char *input, size_t length) const noexcept override { + return set_best()->detect_encodings(input, length); + } -simdutf_warn_unused size_t maximal_binary_length_from_base64( - const char16_t *input, size_t length) noexcept { - return get_default_implementation()->maximal_binary_length_from_base64( - input, length); -} + simdutf_warn_unused bool + validate_utf8(const char *buf, size_t len) const noexcept final override { + return set_best()->validate_utf8(buf, len); + } -simdutf_warn_unused result base64_to_binary( - const char16_t *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_handling_options) noexcept { - return 
get_default_implementation()->base64_to_binary( - input, length, output, options, last_chunk_handling_options); -} + simdutf_warn_unused result validate_utf8_with_errors( + const char *buf, size_t len) const noexcept final override { + return set_best()->validate_utf8_with_errors(buf, len); + } -template -simdutf_warn_unused result base64_to_binary_safe_impl( - const chartype *input, size_t length, char *output, size_t &outlen, - base64_options options, - last_chunk_handling_options last_chunk_handling_options) noexcept { - static_assert(std::is_same::value || - std::is_same::value, - "Only char and char16_t are supported."); - // The implementation could be nicer, but we expect that most times, the user - // will provide us with a buffer that is large enough. - size_t max_length = maximal_binary_length_from_base64(input, length); - if (outlen >= max_length) { - // fast path - full_result r = get_default_implementation()->base64_to_binary_details( - input, length, output, options, last_chunk_handling_options); - if (r.error != error_code::INVALID_BASE64_CHARACTER && - r.error != error_code::BASE64_EXTRA_BITS) { - outlen = r.output_count; - if (last_chunk_handling_options == stop_before_partial) { - if ((r.output_count % 3) != 0) { - bool empty_trail = true; - for (size_t i = r.input_count; i < length; i++) { - if (!scalar::base64::is_ascii_white_space_or_padding(input[i])) { - empty_trail = false; - break; - } - } - if (empty_trail) { - r.input_count = length; - } - } - return {r.error, r.input_count}; - } - return {r.error, length}; - } - return r; + simdutf_warn_unused bool + validate_ascii(const char *buf, size_t len) const noexcept final override { + return set_best()->validate_ascii(buf, len); } - // The output buffer is maybe too small. We will decode a truncated version of - // the input. - size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3 - size_t safe_input = base64_length_from_binary(outlen3, options); - full_result r = get_default_implementation()->base64_to_binary_details( - input, safe_input, output, options, loose); - if (r.error == error_code::INVALID_BASE64_CHARACTER) { - return r; + + simdutf_warn_unused result validate_ascii_with_errors( + const char *buf, size_t len) const noexcept final override { + return set_best()->validate_ascii_with_errors(buf, len); } - size_t offset = - (r.error == error_code::BASE64_INPUT_REMAINDER) - ? 1 - : ((r.output_count % 3) == 0 ? 0 : (r.output_count % 3) + 1); - size_t output_index = r.output_count - (r.output_count % 3); - size_t input_index = safe_input; - // offset is a value that is no larger than 3. We backtrack - // by up to offset characters + an undetermined number of - // white space characters. It is expected that the next loop - // runs at most 3 times + the number of white space characters - // in between them, so we are not worried about performance. 
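A short aside on the length arithmetic in the truncated "safe" decode path above: the caller's output capacity is rounded down to a multiple of 3 (`outlen / 3 * 3`) and mapped back to a base64 character count via `base64_length_from_binary`, since every 4 base64 characters decode to exactly 3 bytes. The sketch below reproduces only that arithmetic under the assumption of padded base64; the helper names are hypothetical, not simdutf APIs.

```cpp
// Sketch of the capacity arithmetic behind the truncated decode above:
// 4 base64 characters decode to 3 bytes, so an output capacity of `outlen`
// bytes can safely absorb the decode of chars_for(outlen / 3 * 3) characters.
// Helper names are hypothetical; illustration only.
#include <cstddef>

constexpr std::size_t base64_chars_for(std::size_t binary_len) {
  return (binary_len + 2) / 3 * 4;  // padded base64 length for binary_len bytes
}

constexpr std::size_t safe_base64_prefix(std::size_t outlen) {
  return base64_chars_for(outlen / 3 * 3);  // round capacity down to 3-byte groups
}

static_assert(base64_chars_for(3) == 4, "3 bytes -> 4 base64 characters");
static_assert(safe_base64_prefix(10) == 12, "9 usable bytes -> 12 characters");
```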
- while (offset > 0 && input_index > 0) { - chartype c = input[--input_index]; - if (scalar::base64::is_ascii_white_space(c)) { - // skipping - } else { - offset--; - } + + simdutf_warn_unused bool + validate_utf16le(const char16_t *buf, + size_t len) const noexcept final override { + return set_best()->validate_utf16le(buf, len); } - size_t remaining_out = outlen - output_index; - const chartype *tail_input = input + input_index; - size_t tail_length = length - input_index; - while (tail_length > 0 && - scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) { - tail_length--; + + simdutf_warn_unused bool + validate_utf16be(const char16_t *buf, + size_t len) const noexcept final override { + return set_best()->validate_utf16be(buf, len); } - size_t padding_characts = 0; - if (tail_length > 0 && tail_input[tail_length - 1] == '=') { - tail_length--; - padding_characts++; - while (tail_length > 0 && - scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) { - tail_length--; - } - if (tail_length > 0 && tail_input[tail_length - 1] == '=') { - tail_length--; - padding_characts++; - } + + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept final override { + return set_best()->validate_utf16le_with_errors(buf, len); } - // this will advance tail_input and tail_length - result rr = scalar::base64::base64_tail_decode_safe( - output + output_index, remaining_out, tail_input, tail_length, - padding_characts, options, last_chunk_handling_options); - outlen = output_index + remaining_out; - if (last_chunk_handling_options != stop_before_partial && - rr.error == error_code::SUCCESS && padding_characts > 0) { - // additional checks - if ((outlen % 3 == 0) || ((outlen % 3) + 1 + padding_characts != 4)) { - rr.error = error_code::INVALID_BASE64_CHARACTER; - } + + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept final override { + return set_best()->validate_utf16be_with_errors(buf, len); } - if (rr.error == error_code::SUCCESS && - last_chunk_handling_options == stop_before_partial) { - if (tail_input > input + input_index) { - rr.count = tail_input - input; - } else if (r.input_count > 0) { - rr.count = r.input_count + rr.count; - } - return rr; + + simdutf_warn_unused bool + validate_utf32(const char32_t *buf, + size_t len) const noexcept final override { + return set_best()->validate_utf32(buf, len); } - rr.count += input_index; - return rr; -} -simdutf_warn_unused size_t convert_latin1_to_utf8_safe( - const char *buf, size_t len, char *utf8_output, size_t utf8_len) noexcept { - const auto start{utf8_output}; + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept final override { + return set_best()->validate_utf32_with_errors(buf, len); + } - while (true) { - // convert_latin1_to_utf8 will never write more than input length * 2 - auto read_len = std::min(len, utf8_len >> 1); - if (read_len <= 16) { - break; - } + simdutf_warn_unused size_t + convert_latin1_to_utf8(const char *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_latin1_to_utf8(buf, len, utf8_output); + } - const auto write_len = - simdutf::convert_latin1_to_utf8(buf, read_len, utf8_output); + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_latin1_to_utf16le(buf, len, utf16_output); 
+ } - utf8_output += write_len; - utf8_len -= write_len; - buf += read_len; - len -= read_len; + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_latin1_to_utf16be(buf, len, utf16_output); } - utf8_output += - scalar::latin1_to_utf8::convert_safe(buf, len, utf8_output, utf8_len); + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, + char32_t *latin1_output) const noexcept final override { + return set_best()->convert_latin1_to_utf32(buf, len, latin1_output); + } - return utf8_output - start; -} + simdutf_warn_unused size_t + convert_utf8_to_latin1(const char *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf8_to_latin1(buf, len, latin1_output); + } -simdutf_warn_unused result base64_to_binary_safe( - const char *input, size_t length, char *output, size_t &outlen, - base64_options options, - last_chunk_handling_options last_chunk_handling_options) noexcept { - return base64_to_binary_safe_impl(input, length, output, outlen, - options, last_chunk_handling_options); -} -simdutf_warn_unused result base64_to_binary_safe( - const char16_t *input, size_t length, char *output, size_t &outlen, - base64_options options, - last_chunk_handling_options last_chunk_handling_options) noexcept { - return base64_to_binary_safe_impl( - input, length, output, outlen, options, last_chunk_handling_options); -} + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf8_to_latin1_with_errors(buf, len, + latin1_output); + } -simdutf_warn_unused size_t -base64_length_from_binary(size_t length, base64_options options) noexcept { - return get_default_implementation()->base64_length_from_binary(length, - options); -} + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_latin1(buf, len, latin1_output); + } -size_t binary_to_base64(const char *input, size_t length, char *output, - base64_options options) noexcept { - return get_default_implementation()->binary_to_base64(input, length, output, - options); -} + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16le(buf, len, utf16_output); + } -simdutf_warn_unused simdutf::encoding_type -autodetect_encoding(const char *buf, size_t length) noexcept { - return get_default_implementation()->autodetect_encoding(buf, length); -} -simdutf_warn_unused int detect_encodings(const char *buf, - size_t length) noexcept { - return get_default_implementation()->detect_encodings(buf, length); -} -const implementation *builtin_implementation() { - static const implementation *builtin_impl = - get_available_implementations()[SIMDUTF_STRINGIFY( - SIMDUTF_BUILTIN_IMPLEMENTATION)]; - return builtin_impl; -} + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16be(buf, len, utf16_output); + } -simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length) { - return scalar::utf8::trim_partial_utf8(input, length); -} + simdutf_warn_unused result 
convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16le_with_errors(buf, len, + utf16_output); + } -simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input, - size_t length) { - return scalar::utf16::trim_partial_utf16(input, length); -} + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf8_to_utf16be_with_errors(buf, len, + utf16_output); + } -simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input, - size_t length) { - return scalar::utf16::trim_partial_utf16(input, length); -} + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_utf16le(buf, len, utf16_output); + } -simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input, - size_t length) { -#if SIMDUTF_IS_BIG_ENDIAN - return trim_partial_utf16be(input, length); -#else - return trim_partial_utf16le(input, length); -#endif -} + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_utf16be(buf, len, utf16_output); + } -} // namespace simdutf -/* end file src/implementation.cpp */ -/* begin file src/encoding_types.cpp */ + simdutf_warn_unused size_t + convert_utf8_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf8_to_utf32(buf, len, utf32_output); + } -namespace simdutf { -bool match_system(endianness e) { -#if SIMDUTF_IS_BIG_ENDIAN - return e == endianness::BIG; -#else - return e == endianness::LITTLE; -#endif -} + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf8_to_utf32_with_errors(buf, len, + utf32_output); + } -std::string to_string(encoding_type bom) { - switch (bom) { - case UTF16_LE: - return "UTF16 little-endian"; - case UTF16_BE: - return "UTF16 big-endian"; - case UTF32_LE: - return "UTF32 little-endian"; - case UTF32_BE: - return "UTF32 big-endian"; - case UTF8: - return "UTF8"; - case unspecified: - return "unknown"; - default: - return "error"; + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_valid_utf8_to_utf32(buf, len, utf32_output); } -} -namespace BOM { -// Note that BOM for UTF8 is discouraged. 
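A hedged aside on the BOM-sniffing helper that the hunk below removes from this spot in the amalgamation: the 4-byte UTF-32LE BOM (FF FE 00 00) has the 2-byte UTF-16LE BOM (FF FE) as a prefix, so it must be tested first or UTF-32LE input is misclassified. The standalone sketch below illustrates only that ordering; the enum, function name, and the minimal 3-byte UTF-8 BOM check are my own, not the vendored code.

```cpp
// Standalone sketch of the BOM-sniffing order used by check_bom() below:
// test the longer UTF-32LE BOM before the UTF-16LE BOM it contains as a prefix.
// Illustration only; enum and function names are invented for the example.
#include <cstddef>
#include <cstdint>

enum class sniffed { utf8, utf16_le, utf16_be, utf32_le, utf32_be, unknown };

sniffed sniff_bom(const std::uint8_t *b, std::size_t n) {
  if (n >= 4 && b[0] == 0xFF && b[1] == 0xFE && b[2] == 0x00 && b[3] == 0x00)
    return sniffed::utf32_le;  // longer BOM first, else UTF-32LE looks like UTF-16LE
  if (n >= 2 && b[0] == 0xFF && b[1] == 0xFE) return sniffed::utf16_le;
  if (n >= 2 && b[0] == 0xFE && b[1] == 0xFF) return sniffed::utf16_be;
  if (n >= 4 && b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF)
    return sniffed::utf32_be;
  if (n >= 3 && b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF)
    return sniffed::utf8;  // the UTF-8 BOM is discouraged but still recognized
  return sniffed::unknown;
}
```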
-encoding_type check_bom(const uint8_t *byte, size_t length) { - if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) { - if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) { - return encoding_type::UTF32_LE; - } else { - return encoding_type::UTF16_LE; - } - } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) { - return encoding_type::UTF16_BE; - } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and - byte[2] == 0xfe and byte[3] == 0xff) { - return encoding_type::UTF32_BE; - } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and - byte[2] == 0xbf) { - return encoding_type::UTF8; + simdutf_warn_unused size_t + convert_utf16le_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf16le_to_latin1(buf, len, latin1_output); } - return encoding_type::unspecified; -} -encoding_type check_bom(const char *byte, size_t length) { - return check_bom(reinterpret_cast(byte), length); -} + simdutf_warn_unused size_t + convert_utf16be_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf16be_to_latin1(buf, len, latin1_output); + } -size_t bom_byte_size(encoding_type bom) { - switch (bom) { - case UTF16_LE: - return 2; - case UTF16_BE: - return 2; - case UTF32_LE: - return 4; - case UTF32_BE: - return 4; - case UTF8: - return 3; - case unspecified: - return 0; - default: - return 0; + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf16le_to_latin1_with_errors(buf, len, + latin1_output); } -} -} // namespace BOM -} // namespace simdutf -/* end file src/encoding_types.cpp */ -/* begin file src/error.cpp */ -namespace simdutf { -// deliberately empty -} -/* end file src/error.cpp */ -// The large tables should be included once and they -// should not depend on a kernel. -/* begin file src/tables/utf8_to_utf16_tables.h */ -#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H -#define SIMDUTF_UTF8_TO_UTF16_TABLES_H -#include + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf16be_to_latin1_with_errors(buf, len, + latin1_output); + } -namespace simdutf { -namespace { -namespace tables { -namespace utf8_to_utf16 { -/** - * utf8bigindex uses about 8 kB - * shufutf8 uses about 3344 B - * - * So we use a bit over 11 kB. It would be - * easy to save about 4 kB by only - * storing the index in utf8bigindex, and - * deriving the consumed bytes otherwise. - * However, this may come at a significant (10% to 20%) - * performance penalty. 
- */ + simdutf_warn_unused size_t convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_valid_utf16le_to_latin1(buf, len, latin1_output); + } -const uint8_t shufutf8[209][16] = { - {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, - {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, - {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, - {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, - 
{1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0}, - {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0}, - {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, - {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255}, - {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, - {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, - {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 
255, 9, 8, 7, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255}, - {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, - {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255}, - {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255}, - {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}, - {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0}, - {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, - {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0}, - {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0}, - {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0}, - {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 
0, 0}, - {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, - {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, - {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0}, - {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0}, - {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0}, - {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, - {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, - {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0}, - {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0}, - {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0}, - {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, - {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0}, - {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0}, - {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0}, - {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}}; -/* number of two bytes : 64 */ -/* number of two + three bytes : 145 */ -/* number of two + three + four bytes : 209 */ -const uint8_t utf8bigindex[4096][2] = { - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, - {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, - {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, - {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, - {148, 6}, {209, 12}, {151, 6}, {163, 6}, {66, 6}, {209, 12}, {154, 6}, - {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, - {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, - {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, 
{73, 5}, {91, 5}, - {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {152, 7}, - {164, 7}, {145, 3}, {209, 12}, {155, 7}, {167, 7}, {69, 7}, {179, 7}, - {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {170, 7}, {71, 7}, - {182, 7}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, - {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, - {173, 7}, {148, 6}, {185, 7}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, - {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, - {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, - {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, - {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {145, 3}, {209, 12}, {156, 8}, {168, 8}, {146, 4}, - {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {171, 8}, - {72, 8}, {183, 8}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, - {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, - {209, 12}, {174, 8}, {148, 6}, {186, 8}, {80, 8}, {98, 8}, {66, 6}, - {198, 8}, {86, 8}, {104, 8}, {68, 6}, {122, 8}, {74, 6}, {92, 6}, - {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {76, 6}, - {94, 6}, {5, 8}, {193, 6}, {82, 6}, {100, 6}, {9, 8}, {118, 6}, - {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, - {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, - {112, 8}, {71, 7}, {130, 8}, {77, 7}, {95, 7}, {6, 8}, {194, 7}, - {83, 7}, {101, 7}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, - {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, - {66, 6}, {197, 7}, {85, 7}, {103, 7}, {12, 8}, {121, 7}, {20, 8}, - {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, - {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, - {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, - {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, - {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, - {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, - {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {187, 9}, {81, 9}, - {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, - {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, - {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, - {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, - {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, - {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {77, 7}, {95, 7}, - {7, 9}, {194, 7}, {83, 7}, {101, 7}, {11, 9}, {119, 7}, {19, 9}, - {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, - {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {13, 9}, - {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, - {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, - {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, - {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, - {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, - {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, - {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, 
{174, 8}, {148, 6}, - {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8}, - {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, - {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, - {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, - {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, - {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, - {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, - {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, - {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, - {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, - {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, - {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, - {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, - {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, - {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, - {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, - {176, 10}, {148, 6}, {188, 10}, {151, 6}, {163, 6}, {66, 6}, {200, 10}, - {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, - {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, - {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, - {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10}, - {152, 7}, {164, 7}, {145, 3}, {203, 10}, {90, 10}, {108, 10}, {69, 7}, - {126, 10}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {114, 10}, - {71, 7}, {132, 10}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, - {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, - {209, 12}, {173, 7}, {148, 6}, {138, 10}, {79, 7}, {97, 7}, {66, 6}, - {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, - {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, - {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, - {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {145, 3}, {206, 10}, {156, 8}, {168, 8}, - {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, - {116, 10}, {72, 8}, {134, 10}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, - {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, - {209, 12}, {209, 12}, {174, 8}, {148, 6}, {140, 10}, {80, 8}, {98, 8}, - {66, 6}, {198, 8}, {86, 8}, {104, 8}, {15, 10}, {122, 8}, {23, 10}, - {39, 10}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, - {27, 10}, {43, 10}, {5, 8}, {193, 6}, {82, 6}, {51, 10}, {9, 8}, - {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, - {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, - {158, 7}, {112, 8}, {71, 7}, {130, 8}, {29, 10}, {45, 10}, {6, 8}, - {194, 7}, {83, 7}, {53, 10}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, - {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, - {97, 7}, {66, 6}, {197, 7}, {85, 7}, {57, 10}, {12, 8}, {121, 7}, - {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, - {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, - {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 
12}, {209, 12}, {145, 3}, {209, 12}, - {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, - {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, - {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, - {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {142, 10}, - {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, - {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, - {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, - {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, - {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, - {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {30, 10}, - {46, 10}, {7, 9}, {194, 7}, {83, 7}, {54, 10}, {11, 9}, {119, 7}, - {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, - {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {58, 10}, - {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, - {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, - {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, - {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, - {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, - {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, - {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, - {60, 10}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, - {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, - {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, - {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, - {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, - {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, - {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, - {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, - {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, - {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, - {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, - {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, - {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, - {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, - {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, - {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, - {209, 12}, {209, 12}, {148, 6}, {209, 12}, {151, 6}, {163, 6}, {66, 6}, - {209, 12}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, - {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, - {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, - {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {192, 11}, {152, 7}, {164, 7}, {145, 3}, {204, 11}, {155, 7}, {167, 7}, - {69, 7}, {179, 7}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, - {170, 7}, {71, 7}, {182, 7}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, - {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, - {209, 12}, {209, 12}, {173, 7}, {148, 6}, {185, 7}, {79, 7}, {97, 7}, - {66, 6}, {197, 7}, 
{85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, - {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, - {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, - {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {207, 11}, {156, 8}, - {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, - {159, 8}, {117, 11}, {72, 8}, {135, 11}, {78, 8}, {96, 8}, {65, 5}, - {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, - {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {141, 11}, {80, 8}, - {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8}, {68, 6}, {122, 8}, - {74, 6}, {92, 6}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, - {128, 8}, {76, 6}, {94, 6}, {5, 8}, {193, 6}, {82, 6}, {100, 6}, - {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, - {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, - {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {77, 7}, {95, 7}, - {6, 8}, {194, 7}, {83, 7}, {101, 7}, {10, 8}, {119, 7}, {18, 8}, - {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, - {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {12, 8}, - {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, - {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, - {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, - {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, - {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, - {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, - {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, - {143, 11}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, - {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, - {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, - {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, - {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, - {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, - {31, 11}, {47, 11}, {7, 9}, {194, 7}, {83, 7}, {55, 11}, {11, 9}, - {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, - {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, - {59, 11}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, - {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, - {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, - {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, - {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, - {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, - {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, - {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, - {86, 8}, {61, 11}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, - {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, - {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, - {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, - {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, - {124, 8}, {75, 7}, 
{93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, - {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, - {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, - {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, - {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, - {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, - {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, - {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, - {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, - {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, - {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, - {209, 12}, {209, 12}, {176, 10}, {148, 6}, {188, 10}, {151, 6}, {163, 6}, - {66, 6}, {200, 10}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, - {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, - {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, - {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {191, 10}, {152, 7}, {164, 7}, {145, 3}, {203, 10}, {90, 10}, - {108, 10}, {69, 7}, {126, 10}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, - {158, 7}, {114, 10}, {71, 7}, {132, 10}, {77, 7}, {95, 7}, {65, 5}, - {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, - {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {138, 10}, {79, 7}, - {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, - {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, - {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, - {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {206, 10}, - {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, - {209, 12}, {159, 8}, {116, 10}, {72, 8}, {134, 10}, {78, 8}, {96, 8}, - {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, - {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {140, 10}, - {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {62, 11}, {15, 10}, - {122, 8}, {23, 10}, {39, 10}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, - {70, 6}, {128, 8}, {27, 10}, {43, 10}, {5, 8}, {193, 6}, {82, 6}, - {51, 10}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, - {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, - {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {29, 10}, - {45, 10}, {6, 8}, {194, 7}, {83, 7}, {53, 10}, {10, 8}, {119, 7}, - {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, - {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {57, 10}, - {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, - {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, - {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, - {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, - {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, - {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, - {148, 6}, {142, 10}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, - {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, 
{209, 12}, - {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, - {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, - {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, - {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, - {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, - {131, 9}, {30, 10}, {46, 10}, {7, 9}, {194, 7}, {83, 7}, {54, 10}, - {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, - {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, - {85, 7}, {58, 10}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, - {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, - {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, - {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, - {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, - {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, - {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, - {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, - {198, 8}, {86, 8}, {60, 10}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, - {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, - {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, - {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, - {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, - {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, - {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, - {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, - {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, - {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, - {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, - {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, - {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, - {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, - {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, - {64, 4}, {209, 12}, {209, 12}, {209, 12}, {148, 6}, {209, 12}, {151, 6}, - {163, 6}, {66, 6}, {209, 12}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, - {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, - {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, - {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {152, 7}, {164, 7}, {145, 3}, {209, 12}, - {155, 7}, {167, 7}, {69, 7}, {179, 7}, {75, 7}, {93, 7}, {64, 4}, - {209, 12}, {158, 7}, {170, 7}, {71, 7}, {182, 7}, {77, 7}, {95, 7}, - {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, - {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {185, 7}, - {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, - {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, - {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, - {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, - {208, 12}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, - {64, 4}, {209, 12}, {159, 8}, {171, 8}, {72, 8}, 
{183, 8}, {78, 8}, - {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, - {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, - {186, 8}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8}, - {68, 6}, {122, 8}, {74, 6}, {92, 6}, {3, 8}, {209, 12}, {157, 6}, - {110, 8}, {70, 6}, {128, 8}, {76, 6}, {94, 6}, {5, 8}, {193, 6}, - {82, 6}, {100, 6}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, - {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, - {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, - {77, 7}, {95, 7}, {6, 8}, {194, 7}, {83, 7}, {101, 7}, {10, 8}, - {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, - {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, - {103, 7}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, - {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, - {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, - {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, - {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, - {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, - {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, - {175, 9}, {148, 6}, {144, 12}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, - {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, - {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, - {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, - {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, - {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, - {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, - {71, 7}, {131, 9}, {77, 7}, {95, 7}, {7, 9}, {194, 7}, {83, 7}, - {101, 7}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, - {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, - {197, 7}, {85, 7}, {103, 7}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, - {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, - {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, - {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, - {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, - {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, - {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, - {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, - {66, 6}, {198, 8}, {86, 8}, {104, 8}, {14, 9}, {122, 8}, {22, 9}, - {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, - {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, - {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, - {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, - {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, - {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, - {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, - {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, - {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, - {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, - 
{8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, - {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, - {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, - {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, - {91, 5}, {64, 4}, {209, 12}, {209, 12}, {176, 10}, {148, 6}, {188, 10}, - {151, 6}, {163, 6}, {66, 6}, {200, 10}, {154, 6}, {166, 6}, {68, 6}, - {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, - {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, - {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7}, {164, 7}, {145, 3}, - {203, 10}, {90, 10}, {108, 10}, {69, 7}, {126, 10}, {75, 7}, {93, 7}, - {64, 4}, {209, 12}, {158, 7}, {114, 10}, {71, 7}, {132, 10}, {77, 7}, - {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, - {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, - {138, 10}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, - {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, - {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, - {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {145, 3}, {206, 10}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, - {161, 4}, {64, 4}, {209, 12}, {159, 8}, {116, 10}, {72, 8}, {134, 10}, - {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, - {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, - {148, 6}, {140, 10}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, - {63, 12}, {15, 10}, {122, 8}, {23, 10}, {39, 10}, {3, 8}, {209, 12}, - {157, 6}, {110, 8}, {70, 6}, {128, 8}, {27, 10}, {43, 10}, {5, 8}, - {193, 6}, {82, 6}, {51, 10}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, - {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, - {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, - {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, - {130, 8}, {29, 10}, {45, 10}, {6, 8}, {194, 7}, {83, 7}, {53, 10}, - {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, - {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, - {85, 7}, {57, 10}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, - {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, - {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, - {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, - {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, - {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, - {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, - {209, 12}, {175, 9}, {148, 6}, {142, 10}, {81, 9}, {99, 9}, {66, 6}, - {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, - {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, - {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, - {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, - {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, - {113, 9}, {71, 7}, {131, 9}, {30, 10}, {46, 10}, {7, 9}, {194, 7}, - {83, 7}, {54, 10}, {11, 9}, 
{119, 7}, {19, 9}, {35, 9}, {1, 7}, - {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, - {66, 6}, {197, 7}, {85, 7}, {58, 10}, {13, 9}, {121, 7}, {21, 9}, - {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, - {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, - {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, - {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, - {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, - {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, - {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, - {98, 8}, {66, 6}, {198, 8}, {86, 8}, {60, 10}, {14, 9}, {122, 8}, - {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, - {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, - {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, - {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, - {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, - {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, - {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, - {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, - {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, - {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, - {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, - {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, - {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, - {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, - {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {148, 6}, - {209, 12}, {151, 6}, {163, 6}, {66, 6}, {209, 12}, {154, 6}, {166, 6}, - {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, - {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, - {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {192, 11}, {152, 7}, {164, 7}, - {145, 3}, {204, 11}, {155, 7}, {167, 7}, {69, 7}, {179, 7}, {75, 7}, - {93, 7}, {64, 4}, {209, 12}, {158, 7}, {170, 7}, {71, 7}, {182, 7}, - {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, - {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, - {148, 6}, {185, 7}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, - {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, - {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, - {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, - {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {145, 3}, {207, 11}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, - {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {117, 11}, {72, 8}, - {135, 11}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, - {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, - {174, 8}, {148, 6}, {141, 11}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, - {86, 8}, {104, 8}, {68, 6}, {122, 8}, {74, 6}, {92, 6}, {3, 8}, - {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {76, 6}, {94, 6}, - {5, 8}, {193, 6}, {82, 6}, {100, 6}, {9, 8}, {118, 6}, {17, 8}, - {33, 8}, {0, 6}, {209, 
12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, - {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, - {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, - {71, 7}, {130, 8}, {77, 7}, {95, 7}, {6, 8}, {194, 7}, {83, 7}, - {101, 7}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, - {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, - {197, 7}, {85, 7}, {103, 7}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, - {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, - {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, - {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, - {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, - {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, - {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, - {209, 12}, {209, 12}, {175, 9}, {148, 6}, {143, 11}, {81, 9}, {99, 9}, - {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, - {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, - {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, - {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, - {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, - {158, 7}, {113, 9}, {71, 7}, {131, 9}, {31, 11}, {47, 11}, {7, 9}, - {194, 7}, {83, 7}, {55, 11}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, - {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, - {97, 7}, {66, 6}, {197, 7}, {85, 7}, {59, 11}, {13, 9}, {121, 7}, - {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, - {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, - {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, - {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, - {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, - {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, - {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, - {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {61, 11}, {14, 9}, - {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, - {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, - {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, - {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, - {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, - {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, - {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, - {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, - {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, - {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, - {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, - {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, - {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, - {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {176, 10}, - {148, 6}, 
{188, 10}, {151, 6}, {163, 6}, {66, 6}, {200, 10}, {154, 6}, - {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, - {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, - {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, - {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7}, - {164, 7}, {145, 3}, {203, 10}, {90, 10}, {108, 10}, {69, 7}, {126, 10}, - {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {114, 10}, {71, 7}, - {132, 10}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, - {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, - {173, 7}, {148, 6}, {138, 10}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, - {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, - {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, - {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, - {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {145, 3}, {206, 10}, {156, 8}, {168, 8}, {146, 4}, - {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {116, 10}, - {72, 8}, {134, 10}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, - {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, - {209, 12}, {174, 8}, {148, 6}, {140, 10}, {80, 8}, {98, 8}, {66, 6}, - {198, 8}, {86, 8}, {62, 11}, {15, 10}, {122, 8}, {23, 10}, {39, 10}, - {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {27, 10}, - {43, 10}, {5, 8}, {193, 6}, {82, 6}, {51, 10}, {9, 8}, {118, 6}, - {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, - {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, - {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, - {112, 8}, {71, 7}, {130, 8}, {29, 10}, {45, 10}, {6, 8}, {194, 7}, - {83, 7}, {53, 10}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, - {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, - {66, 6}, {197, 7}, {85, 7}, {57, 10}, {12, 8}, {121, 7}, {20, 8}, - {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, - {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, - {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, - {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, - {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, - {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, - {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {142, 10}, {81, 9}, - {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, - {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, - {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, - {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, - {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, - {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, - {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {30, 10}, {46, 10}, - {7, 9}, {194, 7}, {83, 7}, {54, 10}, {11, 9}, {119, 7}, {19, 9}, - {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, - {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {58, 10}, {13, 9}, - {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, - {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, - {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, 
{145, 3}, - {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, - {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, - {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, - {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, - {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {60, 10}, - {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, - {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, - {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, - {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, - {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, - {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, - {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, - {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, - {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, - {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, - {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, - {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, - {0, 6}}; -} // namespace utf8_to_utf16 -} // namespace tables -} // unnamed namespace -} // namespace simdutf + simdutf_warn_unused size_t convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_valid_utf16be_to_latin1(buf, len, latin1_output); + } -#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H -/* end file src/tables/utf8_to_utf16_tables.h */ -/* begin file src/tables/utf16_to_utf8_tables.h */ -// file generated by scripts/sse_convert_utf16_to_utf8.py -#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H -#define SIMDUTF_UTF16_TO_UTF8_TABLES_H + simdutf_warn_unused size_t + convert_utf16le_to_utf8(const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf8(buf, len, utf8_output); + } -namespace simdutf { -namespace { -namespace tables { -namespace utf16_to_utf8 { + simdutf_warn_unused size_t + convert_utf16be_to_utf8(const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf8(buf, len, utf8_output); + } -// 1 byte for length, 16 bytes for mask + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf8_with_errors(buf, len, + utf8_output); + } + + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf8_with_errors(buf, len, + utf8_output); + } + + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_valid_utf16le_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_valid_utf16be_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused size_t + convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf32_to_latin1(buf, len, latin1_output); + } + + simdutf_warn_unused result 
convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf32_to_latin1_with_errors(buf, len, + latin1_output); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, + char *latin1_output) const noexcept final override { + return set_best()->convert_utf32_to_latin1(buf, len, latin1_output); + } + + simdutf_warn_unused size_t + convert_utf32_to_utf8(const char32_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf32_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + } + + simdutf_warn_unused size_t + convert_valid_utf32_to_utf8(const char32_t *buf, size_t len, + char *utf8_output) const noexcept final override { + return set_best()->convert_valid_utf32_to_utf8(buf, len, utf8_output); + } + + simdutf_warn_unused size_t convert_utf32_to_utf16le( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16le(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_utf32_to_utf16be( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16be(buf, len, utf16_output); + } + + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16le_with_errors(buf, len, + utf16_output); + } + + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_utf32_to_utf16be_with_errors(buf, len, + utf16_output); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_valid_utf32_to_utf16le(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, + char16_t *utf16_output) const noexcept final override { + return set_best()->convert_valid_utf32_to_utf16be(buf, len, utf16_output); + } + + simdutf_warn_unused size_t convert_utf16le_to_utf32( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf32(buf, len, utf32_output); + } + + simdutf_warn_unused size_t convert_utf16be_to_utf32( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf32(buf, len, utf32_output); + } + + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf16le_to_utf32_with_errors(buf, len, + utf32_output); + } + + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_utf16be_to_utf32_with_errors(buf, len, + utf32_output); + } + + simdutf_warn_unused size_t 
convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_valid_utf16le_to_utf32(buf, len, utf32_output); + } + + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, + char32_t *utf32_output) const noexcept final override { + return set_best()->convert_valid_utf16be_to_utf32(buf, len, utf32_output); + } + + void change_endianness_utf16(const char16_t *buf, size_t len, + char16_t *output) const noexcept final override { + set_best()->change_endianness_utf16(buf, len, output); + } + + simdutf_warn_unused size_t + count_utf16le(const char16_t *buf, size_t len) const noexcept final override { + return set_best()->count_utf16le(buf, len); + } + + simdutf_warn_unused size_t + count_utf16be(const char16_t *buf, size_t len) const noexcept final override { + return set_best()->count_utf16be(buf, len); + } + + simdutf_warn_unused size_t + count_utf8(const char *buf, size_t len) const noexcept final override { + return set_best()->count_utf8(buf, len); + } + + simdutf_warn_unused size_t + latin1_length_from_utf8(const char *buf, size_t len) const noexcept override { + return set_best()->latin1_length_from_utf8(buf, len); + } + + simdutf_warn_unused size_t + latin1_length_from_utf16(size_t len) const noexcept override { + return set_best()->latin1_length_from_utf16(len); + } + + simdutf_warn_unused size_t + latin1_length_from_utf32(size_t len) const noexcept override { + return set_best()->latin1_length_from_utf32(len); + } + + simdutf_warn_unused size_t + utf8_length_from_latin1(const char *buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_latin1(buf, len); + } + + simdutf_warn_unused size_t utf8_length_from_utf16le( + const char16_t *buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_utf16le(buf, len); + } + + simdutf_warn_unused size_t utf8_length_from_utf16be( + const char16_t *buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_utf16be(buf, len); + } + + simdutf_warn_unused size_t + utf16_length_from_latin1(size_t len) const noexcept override { + return set_best()->utf16_length_from_latin1(len); + } + + simdutf_warn_unused size_t + utf32_length_from_latin1(size_t len) const noexcept override { + return set_best()->utf32_length_from_latin1(len); + } + + simdutf_warn_unused size_t utf32_length_from_utf16le( + const char16_t *buf, size_t len) const noexcept override { + return set_best()->utf32_length_from_utf16le(buf, len); + } + + simdutf_warn_unused size_t utf32_length_from_utf16be( + const char16_t *buf, size_t len) const noexcept override { + return set_best()->utf32_length_from_utf16be(buf, len); + } + + simdutf_warn_unused size_t + utf16_length_from_utf8(const char *buf, size_t len) const noexcept override { + return set_best()->utf16_length_from_utf8(buf, len); + } + + simdutf_warn_unused size_t utf8_length_from_utf32( + const char32_t *buf, size_t len) const noexcept override { + return set_best()->utf8_length_from_utf32(buf, len); + } + + simdutf_warn_unused size_t utf16_length_from_utf32( + const char32_t *buf, size_t len) const noexcept override { + return set_best()->utf16_length_from_utf32(buf, len); + } + + simdutf_warn_unused size_t + utf32_length_from_utf8(const char *buf, size_t len) const noexcept override { + return set_best()->utf32_length_from_utf8(buf, len); + } + + simdutf_warn_unused size_t maximal_binary_length_from_base64( + const char *input, 
size_t length) const noexcept override { + return set_best()->maximal_binary_length_from_base64(input, length); + } + + simdutf_warn_unused result base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_handling_options = + last_chunk_handling_options::loose) const noexcept override { + return set_best()->base64_to_binary(input, length, output, options, + last_chunk_handling_options); + } + + simdutf_warn_unused full_result base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_handling_options = + last_chunk_handling_options::loose) const noexcept override { + return set_best()->base64_to_binary_details(input, length, output, options, + last_chunk_handling_options); + } + + simdutf_warn_unused size_t maximal_binary_length_from_base64( + const char16_t *input, size_t length) const noexcept override { + return set_best()->maximal_binary_length_from_base64(input, length); + } + + simdutf_warn_unused result base64_to_binary( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_handling_options = + last_chunk_handling_options::loose) const noexcept override { + return set_best()->base64_to_binary(input, length, output, options, + last_chunk_handling_options); + } + + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *input, size_t length, char *output, + base64_options options, + last_chunk_handling_options last_chunk_handling_options = + last_chunk_handling_options::loose) const noexcept override { + return set_best()->base64_to_binary_details(input, length, output, options, + last_chunk_handling_options); + } + + simdutf_warn_unused size_t base64_length_from_binary( + size_t length, base64_options options) const noexcept override { + return set_best()->base64_length_from_binary(length, options); + } + + size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) const noexcept override { + return set_best()->binary_to_base64(input, length, output, options); + } + + simdutf_really_inline + detect_best_supported_implementation_on_first_use() noexcept + : implementation("best_supported_detector", + "Detects the best supported implementation and sets it", + 0) {} + +private: + const implementation *set_best() const noexcept; +}; + +static_assert(std::is_trivially_destructible< + detect_best_supported_implementation_on_first_use>::value, + "detect_best_supported_implementation_on_first_use should be " + "trivially destructible"); + +static const std::initializer_list & +get_available_implementation_pointers() { + static const std::initializer_list + available_implementation_pointers{ +#if SIMDUTF_IMPLEMENTATION_ICELAKE + get_icelake_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_HASWELL + get_haswell_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_WESTMERE + get_westmere_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_ARM64 + get_arm64_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 + get_ppc64_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_RVV + get_rvv_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_LSX + get_lsx_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_LASX + get_lasx_singleton(), +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK + get_fallback_singleton(), +#endif + }; // available_implementation_pointers + return available_implementation_pointers; +} + +// So we can return 
UNSUPPORTED_ARCHITECTURE from the parser when there is no +// support +class unsupported_implementation final : public implementation { +public: + simdutf_warn_unused int detect_encodings(const char *, + size_t) const noexcept override { + return encoding_type::unspecified; + } + + simdutf_warn_unused bool validate_utf8(const char *, + size_t) const noexcept final override { + return false; // Just refuse to validate. Given that we have a fallback + // implementation + // it seems unlikely that unsupported_implementation will ever be used. If + // it is used, then it will flag all strings as invalid. The alternative is + // to return an error_code from which the user has to figure out whether the + // string is valid UTF-8... which seems like a lot of work just to handle + // the very unlikely case that we have an unsupported implementation. And, + // when it does happen (that we have an unsupported implementation), what + // are the chances that the programmer has a fallback? Given that *we* + // provide the fallback, it implies that the programmer would need a + // fallback for our fallback. + } + + simdutf_warn_unused result validate_utf8_with_errors( + const char *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused bool + validate_ascii(const char *, size_t) const noexcept final override { + return false; + } + + simdutf_warn_unused result validate_ascii_with_errors( + const char *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused bool + validate_utf16le(const char16_t *, size_t) const noexcept final override { + return false; + } + + simdutf_warn_unused bool + validate_utf16be(const char16_t *, size_t) const noexcept final override { + return false; + } + + simdutf_warn_unused result validate_utf16le_with_errors( + const char16_t *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result validate_utf16be_with_errors( + const char16_t *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused bool + validate_utf32(const char32_t *, size_t) const noexcept final override { + return false; + } + + simdutf_warn_unused result validate_utf32_with_errors( + const char32_t *, size_t) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_latin1_to_utf8( + const char *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_latin1_to_utf16le( + const char *, size_t, char16_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *, size_t, char16_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *, size_t, char32_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *, size_t, char16_t *) const noexcept final override { + 
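// Like the other conversion stubs in unsupported_implementation, report that
// nothing was converted (0 code units written): no usable SIMD implementation
// is available on this CPU.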
return 0; + } + + simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *, size_t, char16_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *, size_t, char16_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *, size_t, char16_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *, size_t, char16_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *, size_t, char16_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *, size_t, char32_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *, size_t, char32_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *, size_t, char32_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16le_to_latin1( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16be_to_latin1( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf16le_to_latin1( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf16be_to_latin1( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16le_to_utf8( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16be_to_utf8( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf32_to_latin1( + const char32_t *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf32_to_latin1_with_errors( + const char32_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_latin1( + const char32_t *, size_t, char *) 
const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf32_to_utf8( + const char32_t *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *, size_t, char *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *, size_t, char *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf32_to_utf16le( + const char32_t *, size_t, char16_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf32_to_utf16be( + const char32_t *, size_t, char16_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *, size_t, char16_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *, size_t, char16_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf32_to_utf16le( + const char32_t *, size_t, char16_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf32_to_utf16be( + const char32_t *, size_t, char16_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16le_to_utf32( + const char16_t *, size_t, char32_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_utf16be_to_utf32( + const char16_t *, size_t, char32_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *, size_t, char32_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *, size_t, char32_t *) const noexcept final override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused size_t convert_valid_utf16le_to_utf32( + const char16_t *, size_t, char32_t *) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t convert_valid_utf16be_to_utf32( + const char16_t *, size_t, char32_t *) const noexcept final override { + return 0; + } + + void change_endianness_utf16(const char16_t *, size_t, + char16_t *) const noexcept final override {} + + simdutf_warn_unused size_t + count_utf16le(const char16_t *, size_t) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t + count_utf16be(const char16_t *, size_t) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t count_utf8(const char *, + size_t) const noexcept final override { + return 0; + } + + simdutf_warn_unused size_t + latin1_length_from_utf8(const char *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t + latin1_length_from_utf16(size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t + latin1_length_from_utf32(size_t) const noexcept override { + return 0; + } + simdutf_warn_unused size_t + utf8_length_from_latin1(const char *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t + utf8_length_from_utf16le(const char16_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t + utf8_length_from_utf16be(const char16_t *, size_t) 
const noexcept override { + return 0; + } + + simdutf_warn_unused size_t + utf32_length_from_utf16le(const char16_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t + utf32_length_from_utf16be(const char16_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t + utf32_length_from_latin1(size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t + utf16_length_from_utf8(const char *, size_t) const noexcept override { + return 0; + } + simdutf_warn_unused size_t + utf16_length_from_latin1(size_t) const noexcept override { + return 0; + } + simdutf_warn_unused size_t + utf8_length_from_utf32(const char32_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t + utf16_length_from_utf32(const char32_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t + utf32_length_from_utf8(const char *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused size_t maximal_binary_length_from_base64( + const char *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused result + base64_to_binary(const char *, size_t, char *, base64_options, + last_chunk_handling_options) const noexcept override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused full_result base64_to_binary_details( + const char *, size_t, char *, base64_options, + last_chunk_handling_options) const noexcept override { + return full_result(error_code::OTHER, 0, 0); + } + + simdutf_warn_unused size_t maximal_binary_length_from_base64( + const char16_t *, size_t) const noexcept override { + return 0; + } + + simdutf_warn_unused result + base64_to_binary(const char16_t *, size_t, char *, base64_options, + last_chunk_handling_options) const noexcept override { + return result(error_code::OTHER, 0); + } + + simdutf_warn_unused full_result base64_to_binary_details( + const char16_t *, size_t, char *, base64_options, + last_chunk_handling_options) const noexcept override { + return full_result(error_code::OTHER, 0, 0); + } + + simdutf_warn_unused size_t + base64_length_from_binary(size_t, base64_options) const noexcept override { + return 0; + } + + size_t binary_to_base64(const char *, size_t, char *, + base64_options) const noexcept override { + return 0; + } + + unsupported_implementation() + : implementation("unsupported", + "Unsupported CPU (no detected SIMD instructions)", 0) {} +}; + +const unsupported_implementation *get_unsupported_singleton() { + static const unsupported_implementation unsupported_singleton{}; + return &unsupported_singleton; +} +static_assert(std::is_trivially_destructible::value, + "unsupported_singleton should be trivially destructible"); + +size_t available_implementation_list::size() const noexcept { + return internal::get_available_implementation_pointers().size(); +} +const implementation *const * +available_implementation_list::begin() const noexcept { + return internal::get_available_implementation_pointers().begin(); +} +const implementation *const * +available_implementation_list::end() const noexcept { + return internal::get_available_implementation_pointers().end(); +} +const implementation * +available_implementation_list::detect_best_supported() const noexcept { + // They are prelisted in priority order, so we just go down the list + uint32_t supported_instruction_sets = + internal::detect_supported_architectures(); + for (const implementation *impl : + internal::get_available_implementation_pointers()) { + 
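// A candidate is usable only if every instruction set it requires is present
// in the detected CPU feature mask; because the list above is in priority
// order, the first usable candidate wins.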
uint32_t required_instruction_sets = impl->required_instruction_sets(); + if ((supported_instruction_sets & required_instruction_sets) == + required_instruction_sets) { + return impl; + } + } + return get_unsupported_singleton(); // this should never happen? +} + +const implementation * +detect_best_supported_implementation_on_first_use::set_best() const noexcept { + SIMDUTF_PUSH_DISABLE_WARNINGS + SIMDUTF_DISABLE_DEPRECATED_WARNING // Disable CRT_SECURE warning on MSVC: + // manually verified this is safe + char *force_implementation_name = getenv("SIMDUTF_FORCE_IMPLEMENTATION"); + SIMDUTF_POP_DISABLE_WARNINGS + + if (force_implementation_name) { + auto force_implementation = + get_available_implementations()[force_implementation_name]; + if (force_implementation) { + return get_active_implementation() = force_implementation; + } else { + // Note: abort() and stderr usage within the library is forbidden. + return get_active_implementation() = get_unsupported_singleton(); + } + } + return get_active_implementation() = + get_available_implementations().detect_best_supported(); +} + +} // namespace internal + +/** + * The list of available implementations compiled into simdutf. + */ +SIMDUTF_DLLIMPORTEXPORT const internal::available_implementation_list & +get_available_implementations() { + static const internal::available_implementation_list + available_implementations{}; + return available_implementations; +} + +/** + * The active implementation. + */ +SIMDUTF_DLLIMPORTEXPORT internal::atomic_ptr & +get_active_implementation() { +#if SIMDUTF_SINGLE_IMPLEMENTATION + // skip runtime detection + static internal::atomic_ptr active_implementation{ + internal::get_single_implementation()}; + return active_implementation; +#else + static const internal::detect_best_supported_implementation_on_first_use + detect_best_supported_implementation_on_first_use_singleton; + static internal::atomic_ptr active_implementation{ + &detect_best_supported_implementation_on_first_use_singleton}; + return active_implementation; +#endif +} + +#if SIMDUTF_SINGLE_IMPLEMENTATION +const implementation *get_default_implementation() { + return internal::get_single_implementation(); +} +#else +internal::atomic_ptr &get_default_implementation() { + return get_active_implementation(); +} +#endif +#define SIMDUTF_GET_CURRENT_IMPLEMENTION + +simdutf_warn_unused bool validate_utf8(const char *buf, size_t len) noexcept { + return get_default_implementation()->validate_utf8(buf, len); +} +simdutf_warn_unused result validate_utf8_with_errors(const char *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf8_with_errors(buf, len); +} +simdutf_warn_unused bool validate_ascii(const char *buf, size_t len) noexcept { + return get_default_implementation()->validate_ascii(buf, len); +} +simdutf_warn_unused result validate_ascii_with_errors(const char *buf, + size_t len) noexcept { + return get_default_implementation()->validate_ascii_with_errors(buf, len); +} +simdutf_warn_unused size_t convert_utf8_to_utf16( + const char *input, size_t length, char16_t *utf16_output) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_utf8_to_utf16be(input, length, utf16_output); +#else + return convert_utf8_to_utf16le(input, length, utf16_output); +#endif +} +simdutf_warn_unused size_t convert_latin1_to_utf8(const char *buf, size_t len, + char *utf8_output) noexcept { + return get_default_implementation()->convert_latin1_to_utf8(buf, len, + utf8_output); +} +simdutf_warn_unused size_t convert_latin1_to_utf16le( + const 
char *buf, size_t len, char16_t *utf16_output) noexcept { + return get_default_implementation()->convert_latin1_to_utf16le(buf, len, + utf16_output); +} +simdutf_warn_unused size_t convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) noexcept { + return get_default_implementation()->convert_latin1_to_utf16be(buf, len, + utf16_output); +} +simdutf_warn_unused size_t convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *latin1_output) noexcept { + return get_default_implementation()->convert_latin1_to_utf32(buf, len, + latin1_output); +} +simdutf_warn_unused size_t convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) noexcept { + return get_default_implementation()->convert_utf8_to_latin1(buf, len, + latin1_output); +} +simdutf_warn_unused result convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) noexcept { + return get_default_implementation()->convert_utf8_to_latin1_with_errors( + buf, len, latin1_output); +} +simdutf_warn_unused size_t convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) noexcept { + return get_default_implementation()->convert_valid_utf8_to_latin1( + buf, len, latin1_output); +} +simdutf_warn_unused size_t convert_utf8_to_utf16le( + const char *input, size_t length, char16_t *utf16_output) noexcept { + return get_default_implementation()->convert_utf8_to_utf16le(input, length, + utf16_output); +} +simdutf_warn_unused size_t convert_utf8_to_utf16be( + const char *input, size_t length, char16_t *utf16_output) noexcept { + return get_default_implementation()->convert_utf8_to_utf16be(input, length, + utf16_output); +} +simdutf_warn_unused result convert_utf8_to_utf16_with_errors( + const char *input, size_t length, char16_t *utf16_output) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_utf8_to_utf16be_with_errors(input, length, utf16_output); +#else + return convert_utf8_to_utf16le_with_errors(input, length, utf16_output); +#endif +} +simdutf_warn_unused result convert_utf8_to_utf16le_with_errors( + const char *input, size_t length, char16_t *utf16_output) noexcept { + return get_default_implementation()->convert_utf8_to_utf16le_with_errors( + input, length, utf16_output); +} +simdutf_warn_unused result convert_utf8_to_utf16be_with_errors( + const char *input, size_t length, char16_t *utf16_output) noexcept { + return get_default_implementation()->convert_utf8_to_utf16be_with_errors( + input, length, utf16_output); +} +simdutf_warn_unused size_t convert_utf8_to_utf32( + const char *input, size_t length, char32_t *utf32_output) noexcept { + return get_default_implementation()->convert_utf8_to_utf32(input, length, + utf32_output); +} +simdutf_warn_unused result convert_utf8_to_utf32_with_errors( + const char *input, size_t length, char32_t *utf32_output) noexcept { + return get_default_implementation()->convert_utf8_to_utf32_with_errors( + input, length, utf32_output); +} +simdutf_warn_unused bool validate_utf16(const char16_t *buf, + size_t len) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return validate_utf16be(buf, len); +#else + return validate_utf16le(buf, len); +#endif +} +simdutf_warn_unused bool validate_utf16le(const char16_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf16le(buf, len); +} +simdutf_warn_unused bool validate_utf16be(const char16_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf16be(buf, len); +} +simdutf_warn_unused result 
validate_utf16_with_errors(const char16_t *buf, + size_t len) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return validate_utf16be_with_errors(buf, len); +#else + return validate_utf16le_with_errors(buf, len); +#endif +} +simdutf_warn_unused result validate_utf16le_with_errors(const char16_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf16le_with_errors(buf, len); +} +simdutf_warn_unused result validate_utf16be_with_errors(const char16_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf16be_with_errors(buf, len); +} +simdutf_warn_unused bool validate_utf32(const char32_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf32(buf, len); +} +simdutf_warn_unused result validate_utf32_with_errors(const char32_t *buf, + size_t len) noexcept { + return get_default_implementation()->validate_utf32_with_errors(buf, len); +} +simdutf_warn_unused size_t convert_valid_utf8_to_utf16( + const char *input, size_t length, char16_t *utf16_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf8_to_utf16be(input, length, utf16_buffer); +#else + return convert_valid_utf8_to_utf16le(input, length, utf16_buffer); +#endif +} +simdutf_warn_unused size_t convert_valid_utf8_to_utf16le( + const char *input, size_t length, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_valid_utf8_to_utf16le( + input, length, utf16_buffer); +} +simdutf_warn_unused size_t convert_valid_utf8_to_utf16be( + const char *input, size_t length, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_valid_utf8_to_utf16be( + input, length, utf16_buffer); +} +simdutf_warn_unused size_t convert_valid_utf8_to_utf32( + const char *input, size_t length, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_valid_utf8_to_utf32( + input, length, utf32_buffer); +} +simdutf_warn_unused size_t convert_utf16_to_utf8(const char16_t *buf, + size_t len, + char *utf8_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf8(buf, len, utf8_buffer); +#else + return convert_utf16le_to_utf8(buf, len, utf8_buffer); +#endif +} +simdutf_warn_unused size_t convert_utf16_to_latin1( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_latin1(buf, len, latin1_buffer); +#else + return convert_utf16le_to_latin1(buf, len, latin1_buffer); +#endif +} +simdutf_warn_unused size_t convert_latin1_to_utf16( + const char *buf, size_t len, char16_t *utf16_output) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_latin1_to_utf16be(buf, len, utf16_output); +#else + return convert_latin1_to_utf16le(buf, len, utf16_output); +#endif +} +simdutf_warn_unused size_t convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_latin1(buf, len, + latin1_buffer); +} +simdutf_warn_unused size_t convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_utf16le_to_latin1(buf, len, + latin1_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16be_to_latin1( + buf, len, latin1_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char 
*latin1_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16le_to_latin1( + buf, len, latin1_buffer); +} +simdutf_warn_unused result convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_utf16le_to_latin1_with_errors( + buf, len, latin1_buffer); +} +simdutf_warn_unused result convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_latin1_with_errors( + buf, len, latin1_buffer); +} +simdutf_warn_unused size_t convert_utf16le_to_utf8(const char16_t *buf, + size_t len, + char *utf8_buffer) noexcept { + return get_default_implementation()->convert_utf16le_to_utf8(buf, len, + utf8_buffer); +} +simdutf_warn_unused size_t convert_utf16be_to_utf8(const char16_t *buf, + size_t len, + char *utf8_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_utf8(buf, len, + utf8_buffer); +} +simdutf_warn_unused result convert_utf16_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf8_with_errors(buf, len, utf8_buffer); +#else + return convert_utf16le_to_utf8_with_errors(buf, len, utf8_buffer); +#endif +} +simdutf_warn_unused result convert_utf16_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_latin1_with_errors(buf, len, latin1_buffer); +#else + return convert_utf16le_to_latin1_with_errors(buf, len, latin1_buffer); +#endif +} +simdutf_warn_unused result convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { + return get_default_implementation()->convert_utf16le_to_utf8_with_errors( + buf, len, utf8_buffer); +} +simdutf_warn_unused result convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_utf8_with_errors( + buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf16be_to_utf8(buf, len, utf8_buffer); +#else + return convert_valid_utf16le_to_utf8(buf, len, utf8_buffer); +#endif +} +simdutf_warn_unused size_t convert_valid_utf16_to_latin1( + const char16_t *buf, size_t len, char *latin1_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf16be_to_latin1(buf, len, latin1_buffer); +#else + return convert_valid_utf16le_to_latin1(buf, len, latin1_buffer); +#endif +} +simdutf_warn_unused size_t convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16le_to_utf8( + buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16be_to_utf8( + buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_utf32_to_utf8(const char32_t *buf, + size_t len, + char *utf8_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_utf8(buf, len, + utf8_buffer); +} +simdutf_warn_unused result convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_buffer) noexcept { + return 
get_default_implementation()->convert_utf32_to_utf8_with_errors( + buf, len, utf8_buffer); +} +simdutf_warn_unused size_t convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_buffer) noexcept { + return get_default_implementation()->convert_valid_utf32_to_utf8(buf, len, + utf8_buffer); +} +simdutf_warn_unused size_t convert_utf32_to_utf16( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_utf32_to_utf16be(buf, len, utf16_buffer); +#else + return convert_utf32_to_utf16le(buf, len, utf16_buffer); +#endif +} +simdutf_warn_unused size_t convert_utf32_to_latin1( + const char32_t *input, size_t length, char *latin1_output) noexcept { + return get_default_implementation()->convert_utf32_to_latin1(input, length, + latin1_output); +} +simdutf_warn_unused size_t convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_utf16le(buf, len, + utf16_buffer); +} +simdutf_warn_unused size_t convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_utf16be(buf, len, + utf16_buffer); +} +simdutf_warn_unused result convert_utf32_to_utf16_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_utf32_to_utf16be_with_errors(buf, len, utf16_buffer); +#else + return convert_utf32_to_utf16le_with_errors(buf, len, utf16_buffer); +#endif +} +simdutf_warn_unused result convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_utf16le_with_errors( + buf, len, utf16_buffer); +} +simdutf_warn_unused result convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_utf32_to_utf16be_with_errors( + buf, len, utf16_buffer); +} +simdutf_warn_unused size_t convert_valid_utf32_to_utf16( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf32_to_utf16be(buf, len, utf16_buffer); +#else + return convert_valid_utf32_to_utf16le(buf, len, utf16_buffer); +#endif +} +simdutf_warn_unused size_t convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_valid_utf32_to_utf16le( + buf, len, utf16_buffer); +} +simdutf_warn_unused size_t convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_buffer) noexcept { + return get_default_implementation()->convert_valid_utf32_to_utf16be( + buf, len, utf16_buffer); +} +simdutf_warn_unused size_t convert_utf16_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf32(buf, len, utf32_buffer); +#else + return convert_utf16le_to_utf32(buf, len, utf32_buffer); +#endif +} +simdutf_warn_unused size_t convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_utf16le_to_utf32(buf, len, + utf32_buffer); +} +simdutf_warn_unused size_t convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_utf32(buf, len, + utf32_buffer); +} 
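[Editorial aside, not part of this patch or of upstream simdutf] The forwarding functions above all resolve through get_default_implementation(), and the endianness-neutral entry points pick the LE or BE variant at compile time via SIMDUTF_IS_BIG_ENDIAN. A minimal usage sketch of that front end, assuming the amalgamated `simdutf.h` header shipped with this vendored copy:

```cpp
// Illustrative sketch only (editor's example, not library code).
#include <string>
#include "simdutf.h" // assumed amalgamated header of the vendored library

// Convert UTF-8 to host-endian UTF-16; returns an empty string for invalid input.
std::u16string utf8_to_utf16(const std::string &utf8) {
  if (!simdutf::validate_utf8(utf8.data(), utf8.size())) {
    return {};
  }
  // Size the destination first, then convert; convert_utf8_to_utf16 dispatches
  // to convert_utf8_to_utf16le or convert_utf8_to_utf16be to match the host.
  std::u16string out(simdutf::utf16_length_from_utf8(utf8.data(), utf8.size()),
                     u'\0');
  const size_t written =
      simdutf::convert_utf8_to_utf16(utf8.data(), utf8.size(), out.data());
  out.resize(written);
  return out;
}
```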
+simdutf_warn_unused result convert_utf16_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_utf16be_to_utf32_with_errors(buf, len, utf32_buffer); +#else + return convert_utf16le_to_utf32_with_errors(buf, len, utf32_buffer); +#endif +} +simdutf_warn_unused result convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_utf16le_to_utf32_with_errors( + buf, len, utf32_buffer); +} +simdutf_warn_unused result convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_utf16be_to_utf32_with_errors( + buf, len, utf32_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return convert_valid_utf16be_to_utf32(buf, len, utf32_buffer); +#else + return convert_valid_utf16le_to_utf32(buf, len, utf32_buffer); +#endif +} +simdutf_warn_unused size_t convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16le_to_utf32( + buf, len, utf32_buffer); +} +simdutf_warn_unused size_t convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_buffer) noexcept { + return get_default_implementation()->convert_valid_utf16be_to_utf32( + buf, len, utf32_buffer); +} +void change_endianness_utf16(const char16_t *input, size_t length, + char16_t *output) noexcept { + get_default_implementation()->change_endianness_utf16(input, length, output); +} +simdutf_warn_unused size_t count_utf16(const char16_t *input, + size_t length) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return count_utf16be(input, length); +#else + return count_utf16le(input, length); +#endif +} +simdutf_warn_unused size_t count_utf16le(const char16_t *input, + size_t length) noexcept { + return get_default_implementation()->count_utf16le(input, length); +} +simdutf_warn_unused size_t count_utf16be(const char16_t *input, + size_t length) noexcept { + return get_default_implementation()->count_utf16be(input, length); +} +simdutf_warn_unused size_t count_utf8(const char *input, + size_t length) noexcept { + return get_default_implementation()->count_utf8(input, length); +} +simdutf_warn_unused size_t latin1_length_from_utf8(const char *buf, + size_t len) noexcept { + return get_default_implementation()->latin1_length_from_utf8(buf, len); +} +simdutf_warn_unused size_t latin1_length_from_utf16(size_t len) noexcept { + return get_default_implementation()->latin1_length_from_utf16(len); +} +simdutf_warn_unused size_t latin1_length_from_utf32(size_t len) noexcept { + return get_default_implementation()->latin1_length_from_utf32(len); +} +simdutf_warn_unused size_t utf8_length_from_latin1(const char *buf, + size_t len) noexcept { + return get_default_implementation()->utf8_length_from_latin1(buf, len); +} +simdutf_warn_unused size_t utf8_length_from_utf16(const char16_t *input, + size_t length) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN + return utf8_length_from_utf16be(input, length); +#else + return utf8_length_from_utf16le(input, length); +#endif +} +simdutf_warn_unused size_t utf8_length_from_utf16le(const char16_t *input, + size_t length) noexcept { + return get_default_implementation()->utf8_length_from_utf16le(input, length); +} +simdutf_warn_unused size_t 
utf8_length_from_utf16be(const char16_t *input, +                         size_t length) noexcept { +  return get_default_implementation()->utf8_length_from_utf16be(input, length); +} +simdutf_warn_unused size_t utf32_length_from_utf16(const char16_t *input, +                                                   size_t length) noexcept { +#if SIMDUTF_IS_BIG_ENDIAN +  return utf32_length_from_utf16be(input, length); +#else +  return utf32_length_from_utf16le(input, length); +#endif +} +simdutf_warn_unused size_t utf32_length_from_utf16le(const char16_t *input, +                                                     size_t length) noexcept { +  return get_default_implementation()->utf32_length_from_utf16le(input, length); +} +simdutf_warn_unused size_t utf32_length_from_utf16be(const char16_t *input, +                                                     size_t length) noexcept { +  return get_default_implementation()->utf32_length_from_utf16be(input, length); +} +simdutf_warn_unused size_t utf16_length_from_utf8(const char *input, +                                                  size_t length) noexcept { +  return get_default_implementation()->utf16_length_from_utf8(input, length); +} +simdutf_warn_unused size_t utf16_length_from_latin1(size_t length) noexcept { +  return get_default_implementation()->utf16_length_from_latin1(length); +} +simdutf_warn_unused size_t utf8_length_from_utf32(const char32_t *input, +                                                  size_t length) noexcept { +  return get_default_implementation()->utf8_length_from_utf32(input, length); +} +simdutf_warn_unused size_t utf16_length_from_utf32(const char32_t *input, +                                                   size_t length) noexcept { +  return get_default_implementation()->utf16_length_from_utf32(input, length); +} +simdutf_warn_unused size_t utf32_length_from_utf8(const char *input, +                                                  size_t length) noexcept { +  return get_default_implementation()->utf32_length_from_utf8(input, length); +} + +simdutf_warn_unused size_t +maximal_binary_length_from_base64(const char *input, size_t length) noexcept { +  return get_default_implementation()->maximal_binary_length_from_base64( +      input, length); +} + +simdutf_warn_unused result base64_to_binary( +    const char *input, size_t length, char *output, base64_options options, +    last_chunk_handling_options last_chunk_handling_options) noexcept { +  return get_default_implementation()->base64_to_binary( +      input, length, output, options, last_chunk_handling_options); +} + +simdutf_warn_unused size_t maximal_binary_length_from_base64( +    const char16_t *input, size_t length) noexcept { +  return get_default_implementation()->maximal_binary_length_from_base64( +      input, length); +} + +simdutf_warn_unused result base64_to_binary( +    const char16_t *input, size_t length, char *output, base64_options options, +    last_chunk_handling_options last_chunk_handling_options) noexcept { +  return get_default_implementation()->base64_to_binary( +      input, length, output, options, last_chunk_handling_options); +} + +template <typename chartype> +simdutf_warn_unused result base64_to_binary_safe_impl( +    const chartype *input, size_t length, char *output, size_t &outlen, +    base64_options options, +    last_chunk_handling_options last_chunk_handling_options) noexcept { +  static_assert(std::is_same<chartype, char>::value || +                    std::is_same<chartype, char16_t>::value, +                "Only char and char16_t are supported."); +  // The implementation could be nicer, but we expect that most times, the user +  // will provide us with a buffer that is large enough.
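// [Editor's note, not upstream code] Worked example of the size check below: for a
// 16-character base64 input, maximal_binary_length_from_base64 reports at most
// 16 / 4 * 3 = 12 decoded bytes, so a caller passing outlen >= 12 takes the fast
// path; a smaller outlen falls through to the truncated decode plus the scalar
// tail handling further down.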
+ size_t max_length = maximal_binary_length_from_base64(input, length); + if (outlen >= max_length) { + // fast path + full_result r = get_default_implementation()->base64_to_binary_details( + input, length, output, options, last_chunk_handling_options); + if (r.error != error_code::INVALID_BASE64_CHARACTER && + r.error != error_code::BASE64_EXTRA_BITS) { + outlen = r.output_count; + if (last_chunk_handling_options == stop_before_partial) { + if ((r.output_count % 3) != 0) { + bool empty_trail = true; + for (size_t i = r.input_count; i < length; i++) { + if (!scalar::base64::is_ascii_white_space_or_padding(input[i])) { + empty_trail = false; + break; + } + } + if (empty_trail) { + r.input_count = length; + } + } + return {r.error, r.input_count}; + } + return {r.error, length}; + } + return r; + } + // The output buffer is maybe too small. We will decode a truncated version of + // the input. + size_t outlen3 = outlen / 3 * 3; // round down to multiple of 3 + size_t safe_input = base64_length_from_binary(outlen3, options); + full_result r = get_default_implementation()->base64_to_binary_details( + input, safe_input, output, options, loose); + if (r.error == error_code::INVALID_BASE64_CHARACTER) { + return r; + } + size_t offset = + (r.error == error_code::BASE64_INPUT_REMAINDER) + ? 1 + : ((r.output_count % 3) == 0 ? 0 : (r.output_count % 3) + 1); + size_t output_index = r.output_count - (r.output_count % 3); + size_t input_index = safe_input; + // offset is a value that is no larger than 3. We backtrack + // by up to offset characters + an undetermined number of + // white space characters. It is expected that the next loop + // runs at most 3 times + the number of white space characters + // in between them, so we are not worried about performance. 
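// [Editor's note, not upstream code] For instance, if the truncated decode ended
// with r.output_count % 3 == 1, offset is set to 2 above, so the loop below walks
// input_index back across two non-whitespace base64 characters (ignoring any
// interior whitespace) before the scalar tail decoder re-reads them together with
// the rest of the input.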
+ while (offset > 0 && input_index > 0) { + chartype c = input[--input_index]; + if (scalar::base64::is_ascii_white_space(c)) { + // skipping + } else { + offset--; + } + } + size_t remaining_out = outlen - output_index; + const chartype *tail_input = input + input_index; + size_t tail_length = length - input_index; + while (tail_length > 0 && + scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) { + tail_length--; + } + size_t padding_characts = 0; + if (tail_length > 0 && tail_input[tail_length - 1] == '=') { + tail_length--; + padding_characts++; + while (tail_length > 0 && + scalar::base64::is_ascii_white_space(tail_input[tail_length - 1])) { + tail_length--; + } + if (tail_length > 0 && tail_input[tail_length - 1] == '=') { + tail_length--; + padding_characts++; + } + } + // this will advance tail_input and tail_length + result rr = scalar::base64::base64_tail_decode_safe( + output + output_index, remaining_out, tail_input, tail_length, + padding_characts, options, last_chunk_handling_options); + outlen = output_index + remaining_out; + if (last_chunk_handling_options != stop_before_partial && + rr.error == error_code::SUCCESS && padding_characts > 0) { + // additional checks + if ((outlen % 3 == 0) || ((outlen % 3) + 1 + padding_characts != 4)) { + rr.error = error_code::INVALID_BASE64_CHARACTER; + } + } + if (rr.error == error_code::SUCCESS && + last_chunk_handling_options == stop_before_partial) { + if (tail_input > input + input_index) { + rr.count = tail_input - input; + } else if (r.input_count > 0) { + rr.count = r.input_count + rr.count; + } + return rr; + } + rr.count += input_index; + return rr; +} + +simdutf_warn_unused size_t convert_latin1_to_utf8_safe( + const char *buf, size_t len, char *utf8_output, size_t utf8_len) noexcept { + const auto start{utf8_output}; + + while (true) { + // convert_latin1_to_utf8 will never write more than input length * 2 + auto read_len = std::min(len, utf8_len >> 1); + if (read_len <= 16) { + break; + } + + const auto write_len = + simdutf::convert_latin1_to_utf8(buf, read_len, utf8_output); + + utf8_output += write_len; + utf8_len -= write_len; + buf += read_len; + len -= read_len; + } + + utf8_output += + scalar::latin1_to_utf8::convert_safe(buf, len, utf8_output, utf8_len); + + return utf8_output - start; +} + +simdutf_warn_unused result base64_to_binary_safe( + const char *input, size_t length, char *output, size_t &outlen, + base64_options options, + last_chunk_handling_options last_chunk_handling_options) noexcept { + return base64_to_binary_safe_impl(input, length, output, outlen, + options, last_chunk_handling_options); +} +simdutf_warn_unused result base64_to_binary_safe( + const char16_t *input, size_t length, char *output, size_t &outlen, + base64_options options, + last_chunk_handling_options last_chunk_handling_options) noexcept { + return base64_to_binary_safe_impl( + input, length, output, outlen, options, last_chunk_handling_options); +} + +simdutf_warn_unused size_t +base64_length_from_binary(size_t length, base64_options options) noexcept { + return get_default_implementation()->base64_length_from_binary(length, + options); +} + +size_t binary_to_base64(const char *input, size_t length, char *output, + base64_options options) noexcept { + return get_default_implementation()->binary_to_base64(input, length, output, + options); +} + +simdutf_warn_unused simdutf::encoding_type +autodetect_encoding(const char *buf, size_t length) noexcept { + return get_default_implementation()->autodetect_encoding(buf, length); 
+} +simdutf_warn_unused int detect_encodings(const char *buf, +                                         size_t length) noexcept { +  return get_default_implementation()->detect_encodings(buf, length); +} +const implementation *builtin_implementation() { +  static const implementation *builtin_impl = +      get_available_implementations()[SIMDUTF_STRINGIFY( +          SIMDUTF_BUILTIN_IMPLEMENTATION)]; +  return builtin_impl; +} + +simdutf_warn_unused size_t trim_partial_utf8(const char *input, size_t length) { +  return scalar::utf8::trim_partial_utf8(input, length); +} + +simdutf_warn_unused size_t trim_partial_utf16be(const char16_t *input, +                                                size_t length) { +  return scalar::utf16::trim_partial_utf16(input, length); +} + +simdutf_warn_unused size_t trim_partial_utf16le(const char16_t *input, +                                                size_t length) { +  return scalar::utf16::trim_partial_utf16(input, length); +} + +simdutf_warn_unused size_t trim_partial_utf16(const char16_t *input, +                                              size_t length) { +#if SIMDUTF_IS_BIG_ENDIAN +  return trim_partial_utf16be(input, length); +#else +  return trim_partial_utf16le(input, length); +#endif +} + +} // namespace simdutf +/* end file src/implementation.cpp */ +/* begin file src/encoding_types.cpp */ + +namespace simdutf { +bool match_system(endianness e) { +#if SIMDUTF_IS_BIG_ENDIAN +  return e == endianness::BIG; +#else +  return e == endianness::LITTLE; +#endif +} + +std::string to_string(encoding_type bom) { +  switch (bom) { +  case UTF16_LE: +    return "UTF16 little-endian"; +  case UTF16_BE: +    return "UTF16 big-endian"; +  case UTF32_LE: +    return "UTF32 little-endian"; +  case UTF32_BE: +    return "UTF32 big-endian"; +  case UTF8: +    return "UTF8"; +  case unspecified: +    return "unknown"; +  default: +    return "error"; +  } +} + +namespace BOM { +// Note that BOM for UTF8 is discouraged. +encoding_type check_bom(const uint8_t *byte, size_t length) { +  if (length >= 2 && byte[0] == 0xff and byte[1] == 0xfe) { +    if (length >= 4 && byte[2] == 0x00 and byte[3] == 0x0) { +      return encoding_type::UTF32_LE; +    } else { +      return encoding_type::UTF16_LE; +    } +  } else if (length >= 2 && byte[0] == 0xfe and byte[1] == 0xff) { +    return encoding_type::UTF16_BE; +  } else if (length >= 4 && byte[0] == 0x00 and byte[1] == 0x00 and +             byte[2] == 0xfe and byte[3] == 0xff) { +    return encoding_type::UTF32_BE; +  } else if (length >= 4 && byte[0] == 0xef and byte[1] == 0xbb and +             byte[2] == 0xbf) { +    return encoding_type::UTF8; +  } +  return encoding_type::unspecified; +} + +encoding_type check_bom(const char *byte, size_t length) { +  return check_bom(reinterpret_cast<const uint8_t *>(byte), length); +} + +size_t bom_byte_size(encoding_type bom) { +  switch (bom) { +  case UTF16_LE: +    return 2; +  case UTF16_BE: +    return 2; +  case UTF32_LE: +    return 4; +  case UTF32_BE: +    return 4; +  case UTF8: +    return 3; +  case unspecified: +    return 0; +  default: +    return 0; +  } +} + +} // namespace BOM +} // namespace simdutf +/* end file src/encoding_types.cpp */ +/* begin file src/error.cpp */ +namespace simdutf { +// deliberately empty +} +/* end file src/error.cpp */ +// The large tables should be included once and they +// should not depend on a kernel. +/* begin file src/tables/utf8_to_utf16_tables.h */ +#ifndef SIMDUTF_UTF8_TO_UTF16_TABLES_H +#define SIMDUTF_UTF8_TO_UTF16_TABLES_H +#include <cstdint> + +namespace simdutf { +namespace { +namespace tables { +namespace utf8_to_utf16 { +/** + * utf8bigindex uses about 8 kB + * shufutf8 uses about 3344 B + * + * So we use a bit over 11 kB. It would be + * easy to save about 4 kB by only + * storing the index in utf8bigindex, and + * deriving the consumed bytes otherwise.
+ * However, this may come at a significant (10% to 20%) + * performance penalty. + */ + +const uint8_t shufutf8[209][16] = { + {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 5, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 4, 255, 6, 5, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 3, 255, 5, 4, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 2, 255, 4, 3, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 1, 255, 3, 2, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {0, 255, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 6, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 5, 255, 7, 6, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 4, 255, 6, 5, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 3, 255, 5, 4, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 2, 255, 4, 3, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 7, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 6, 255, 8, 7, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 5, 255, 7, 6, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 4, 255, 6, 5, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 8, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 7, 255, 9, 8, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 9, 
255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 6, 255, 8, 7, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 9, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 8, 255, 10, 9, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 255, 0, 0, 0, 0}, + {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 4, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 5, 4, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 6, 5, 4, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 5, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 6, 5, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 7, 6, 5, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 6, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 7, 6, 
255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 8, 7, 6, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 7, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 8, 7, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 9, 8, 7, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 8, 255, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 9, 8, 255, 255}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 5, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 6, 5, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 7, 6, 5, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 6, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 7, 6, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 8, 7, 6, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 7, 6, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 8, 7, 6, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 8, 255, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 9, 8, 255, 255}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 7, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 8, 7, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 9, 8, 7, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 8, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 9, 8, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 10, 9, 8, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 9, 255, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 10, 9, 255, 255}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 11, 10, 9, 255}, + {0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 3, 2, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 4, 3, 2, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 1, 255, 255, 255, 5, 4, 3, 2, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 2, 1, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 3, 2, 1, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 5, 255, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 6, 5, 255, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 7, 6, 5, 255, 0, 0, 0, 0}, + {0, 255, 255, 255, 4, 3, 2, 1, 8, 7, 6, 5, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 4, 3, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 5, 4, 3, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 2, 255, 255, 255, 6, 5, 4, 3, 0, 0, 0, 0}, + {1, 0, 255, 
255, 3, 2, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 3, 2, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 4, 3, 2, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 6, 255, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 7, 6, 255, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 8, 7, 6, 255, 0, 0, 0, 0}, + {1, 0, 255, 255, 5, 4, 3, 2, 9, 8, 7, 6, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 4, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 5, 4, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 6, 5, 4, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 3, 255, 255, 255, 7, 6, 5, 4, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 4, 3, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 6, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 7, 6, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 8, 7, 6, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 5, 4, 3, 255, 9, 8, 7, 6, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 7, 255, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 8, 7, 255, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 9, 8, 7, 255, 0, 0, 0, 0}, + {2, 1, 0, 255, 6, 5, 4, 3, 10, 9, 8, 7, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 5, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 6, 5, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 7, 6, 5, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 4, 255, 255, 255, 8, 7, 6, 5, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 6, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 7, 6, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 8, 7, 6, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 5, 4, 255, 255, 9, 8, 7, 6, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 7, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 8, 7, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 9, 8, 7, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 6, 5, 4, 255, 10, 9, 8, 7, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 8, 255, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 9, 8, 255, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 10, 9, 8, 255, 0, 0, 0, 0}, + {3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 0, 0, 0, 0}}; +/* number of two bytes : 64 */ +/* number of two + three bytes : 145 */ +/* number of two + three + four bytes : 209 */ +const uint8_t utf8bigindex[4096][2] = { + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, + {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, + {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, + {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, + {148, 6}, {209, 12}, {151, 6}, {163, 6}, {66, 6}, {209, 12}, {154, 6}, + {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, + {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, + {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {152, 7}, + {164, 7}, {145, 3}, {209, 12}, {155, 7}, {167, 7}, 
{69, 7}, {179, 7}, + {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {170, 7}, {71, 7}, + {182, 7}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, + {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, + {173, 7}, {148, 6}, {185, 7}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, + {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, + {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, + {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, + {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {145, 3}, {209, 12}, {156, 8}, {168, 8}, {146, 4}, + {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {171, 8}, + {72, 8}, {183, 8}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, + {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, + {209, 12}, {174, 8}, {148, 6}, {186, 8}, {80, 8}, {98, 8}, {66, 6}, + {198, 8}, {86, 8}, {104, 8}, {68, 6}, {122, 8}, {74, 6}, {92, 6}, + {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {76, 6}, + {94, 6}, {5, 8}, {193, 6}, {82, 6}, {100, 6}, {9, 8}, {118, 6}, + {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, + {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, + {112, 8}, {71, 7}, {130, 8}, {77, 7}, {95, 7}, {6, 8}, {194, 7}, + {83, 7}, {101, 7}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, + {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, + {66, 6}, {197, 7}, {85, 7}, {103, 7}, {12, 8}, {121, 7}, {20, 8}, + {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, + {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, + {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, + {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, + {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, + {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, + {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {187, 9}, {81, 9}, + {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, + {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, + {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, + {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, + {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, + {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {77, 7}, {95, 7}, + {7, 9}, {194, 7}, {83, 7}, {101, 7}, {11, 9}, {119, 7}, {19, 9}, + {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, + {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {13, 9}, + {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, + {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, + {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, + {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, + {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, + {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, + {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, + {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8}, + {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, 
{157, 6}, + {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, + {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, + {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, + {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, + {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, + {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, + {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, + {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, + {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, + {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, + {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, + {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, + {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, + {176, 10}, {148, 6}, {188, 10}, {151, 6}, {163, 6}, {66, 6}, {200, 10}, + {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, + {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, + {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, + {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10}, + {152, 7}, {164, 7}, {145, 3}, {203, 10}, {90, 10}, {108, 10}, {69, 7}, + {126, 10}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {114, 10}, + {71, 7}, {132, 10}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, + {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, + {209, 12}, {173, 7}, {148, 6}, {138, 10}, {79, 7}, {97, 7}, {66, 6}, + {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, + {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, + {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, + {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {145, 3}, {206, 10}, {156, 8}, {168, 8}, + {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, + {116, 10}, {72, 8}, {134, 10}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, + {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, + {209, 12}, {209, 12}, {174, 8}, {148, 6}, {140, 10}, {80, 8}, {98, 8}, + {66, 6}, {198, 8}, {86, 8}, {104, 8}, {15, 10}, {122, 8}, {23, 10}, + {39, 10}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, + {27, 10}, {43, 10}, {5, 8}, {193, 6}, {82, 6}, {51, 10}, {9, 8}, + {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, + {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, + {158, 7}, {112, 8}, {71, 7}, {130, 8}, {29, 10}, {45, 10}, {6, 8}, + {194, 7}, {83, 7}, {53, 10}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, + {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, + {97, 7}, {66, 6}, {197, 7}, {85, 7}, {57, 10}, {12, 8}, {121, 7}, + {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, + {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, + {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, + {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, + {209, 12}, {160, 9}, {172, 9}, 
{147, 5}, {184, 9}, {150, 5}, {162, 5}, + {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, + {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {142, 10}, + {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, + {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, + {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, + {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, + {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, + {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {30, 10}, + {46, 10}, {7, 9}, {194, 7}, {83, 7}, {54, 10}, {11, 9}, {119, 7}, + {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, + {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {58, 10}, + {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, + {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, + {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, + {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, + {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, + {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, + {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, + {60, 10}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, + {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, + {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, + {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, + {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, + {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, + {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, + {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, + {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, + {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, + {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, + {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, + {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, + {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, + {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, + {209, 12}, {209, 12}, {148, 6}, {209, 12}, {151, 6}, {163, 6}, {66, 6}, + {209, 12}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, + {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, {76, 6}, + {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, + {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {192, 11}, {152, 7}, {164, 7}, {145, 3}, {204, 11}, {155, 7}, {167, 7}, + {69, 7}, {179, 7}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, + {170, 7}, {71, 7}, {182, 7}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, + {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, + {209, 12}, {209, 12}, {173, 7}, {148, 6}, {185, 7}, {79, 7}, {97, 7}, + {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, + {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, + {76, 6}, {94, 6}, {4, 7}, 
{193, 6}, {82, 6}, {100, 6}, {8, 7}, + {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {207, 11}, {156, 8}, + {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, + {159, 8}, {117, 11}, {72, 8}, {135, 11}, {78, 8}, {96, 8}, {65, 5}, + {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, + {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {141, 11}, {80, 8}, + {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8}, {68, 6}, {122, 8}, + {74, 6}, {92, 6}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, + {128, 8}, {76, 6}, {94, 6}, {5, 8}, {193, 6}, {82, 6}, {100, 6}, + {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, + {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, + {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {77, 7}, {95, 7}, + {6, 8}, {194, 7}, {83, 7}, {101, 7}, {10, 8}, {119, 7}, {18, 8}, + {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, + {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {12, 8}, + {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, + {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, + {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, + {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, + {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, + {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, + {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, + {143, 11}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, + {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, + {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, + {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, + {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, + {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, + {31, 11}, {47, 11}, {7, 9}, {194, 7}, {83, 7}, {55, 11}, {11, 9}, + {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, + {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, + {59, 11}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, + {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, + {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, + {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, + {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, + {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, + {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, + {86, 8}, {61, 11}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, + {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, + {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, + {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, + {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, + {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, + {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, + {52, 9}, {10, 8}, {119, 7}, 
{18, 8}, {34, 8}, {1, 7}, {209, 12}, + {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, + {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, + {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, + {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, + {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, + {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {209, 12}, + {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, {209, 12}, + {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, + {209, 12}, {209, 12}, {176, 10}, {148, 6}, {188, 10}, {151, 6}, {163, 6}, + {66, 6}, {200, 10}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, {74, 6}, + {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, {181, 6}, + {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, + {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {191, 10}, {152, 7}, {164, 7}, {145, 3}, {203, 10}, {90, 10}, + {108, 10}, {69, 7}, {126, 10}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, + {158, 7}, {114, 10}, {71, 7}, {132, 10}, {77, 7}, {95, 7}, {65, 5}, + {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, {91, 5}, + {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {138, 10}, {79, 7}, + {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, {121, 7}, + {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, + {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, {100, 6}, + {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {206, 10}, + {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, + {209, 12}, {159, 8}, {116, 10}, {72, 8}, {134, 10}, {78, 8}, {96, 8}, + {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, + {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {140, 10}, + {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {62, 11}, {15, 10}, + {122, 8}, {23, 10}, {39, 10}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, + {70, 6}, {128, 8}, {27, 10}, {43, 10}, {5, 8}, {193, 6}, {82, 6}, + {51, 10}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, + {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, + {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {29, 10}, + {45, 10}, {6, 8}, {194, 7}, {83, 7}, {53, 10}, {10, 8}, {119, 7}, + {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, + {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {57, 10}, + {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, + {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, + {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, + {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, {184, 9}, + {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, {67, 5}, + {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {175, 9}, + {148, 6}, {142, 10}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, {87, 9}, + {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, + {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, {65, 5}, + {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, + 
{0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, {152, 7}, + {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, {125, 9}, + {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, {71, 7}, + {131, 9}, {30, 10}, {46, 10}, {7, 9}, {194, 7}, {83, 7}, {54, 10}, + {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, {209, 12}, + {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, + {85, 7}, {58, 10}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, {2, 7}, + {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, {41, 9}, + {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, {16, 7}, + {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, {146, 4}, + {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {115, 9}, + {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, + {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, + {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, {66, 6}, + {198, 8}, {86, 8}, {60, 10}, {14, 9}, {122, 8}, {22, 9}, {38, 9}, + {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {26, 9}, + {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, {118, 6}, + {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, + {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, + {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, {194, 7}, + {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, + {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, + {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, {20, 8}, + {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, + {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, + {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, + {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, + {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, {65, 5}, + {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, + {64, 4}, {209, 12}, {209, 12}, {209, 12}, {148, 6}, {209, 12}, {151, 6}, + {163, 6}, {66, 6}, {209, 12}, {154, 6}, {166, 6}, {68, 6}, {178, 6}, + {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, {70, 6}, + {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, + {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {152, 7}, {164, 7}, {145, 3}, {209, 12}, + {155, 7}, {167, 7}, {69, 7}, {179, 7}, {75, 7}, {93, 7}, {64, 4}, + {209, 12}, {158, 7}, {170, 7}, {71, 7}, {182, 7}, {77, 7}, {95, 7}, + {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, {73, 5}, + {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {185, 7}, + {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, {68, 6}, + {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, + {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, {82, 6}, + {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, + {208, 12}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, + {64, 4}, {209, 12}, {159, 8}, {171, 8}, {72, 8}, {183, 8}, {78, 8}, + {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, + {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, 
{148, 6}, + {186, 8}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {104, 8}, + {68, 6}, {122, 8}, {74, 6}, {92, 6}, {3, 8}, {209, 12}, {157, 6}, + {110, 8}, {70, 6}, {128, 8}, {76, 6}, {94, 6}, {5, 8}, {193, 6}, + {82, 6}, {100, 6}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, + {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, + {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, + {77, 7}, {95, 7}, {6, 8}, {194, 7}, {83, 7}, {101, 7}, {10, 8}, + {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, + {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, + {103, 7}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, + {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, + {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, + {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, {147, 5}, + {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, {165, 5}, + {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, + {175, 9}, {148, 6}, {144, 12}, {81, 9}, {99, 9}, {66, 6}, {199, 9}, + {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, {64, 4}, + {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, {94, 6}, + {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, + {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {190, 9}, + {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, {69, 7}, + {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {113, 9}, + {71, 7}, {131, 9}, {77, 7}, {95, 7}, {7, 9}, {194, 7}, {83, 7}, + {101, 7}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, {209, 12}, + {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, {66, 6}, + {197, 7}, {85, 7}, {103, 7}, {13, 9}, {121, 7}, {21, 9}, {37, 9}, + {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {25, 9}, + {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, {118, 6}, + {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, {168, 8}, + {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, + {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, + {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, + {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, {98, 8}, + {66, 6}, {198, 8}, {86, 8}, {104, 8}, {14, 9}, {122, 8}, {22, 9}, + {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, + {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, {9, 8}, + {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, + {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, + {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, {6, 8}, + {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, + {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, + {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, {121, 7}, + {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, + {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, + {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, + 
{209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, + {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, {162, 5}, + {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, + {91, 5}, {64, 4}, {209, 12}, {209, 12}, {176, 10}, {148, 6}, {188, 10}, + {151, 6}, {163, 6}, {66, 6}, {200, 10}, {154, 6}, {166, 6}, {68, 6}, + {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {169, 6}, + {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, + {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7}, {164, 7}, {145, 3}, + {203, 10}, {90, 10}, {108, 10}, {69, 7}, {126, 10}, {75, 7}, {93, 7}, + {64, 4}, {209, 12}, {158, 7}, {114, 10}, {71, 7}, {132, 10}, {77, 7}, + {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, {119, 7}, + {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, + {138, 10}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {103, 7}, + {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, {157, 6}, + {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, {193, 6}, + {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {145, 3}, {206, 10}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, + {161, 4}, {64, 4}, {209, 12}, {159, 8}, {116, 10}, {72, 8}, {134, 10}, + {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, + {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, + {148, 6}, {140, 10}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, + {63, 12}, {15, 10}, {122, 8}, {23, 10}, {39, 10}, {3, 8}, {209, 12}, + {157, 6}, {110, 8}, {70, 6}, {128, 8}, {27, 10}, {43, 10}, {5, 8}, + {193, 6}, {82, 6}, {51, 10}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, + {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, + {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, + {130, 8}, {29, 10}, {45, 10}, {6, 8}, {194, 7}, {83, 7}, {53, 10}, + {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, + {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, + {85, 7}, {57, 10}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, + {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, + {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, + {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, + {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, {172, 9}, + {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, {153, 5}, + {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, + {209, 12}, {175, 9}, {148, 6}, {142, 10}, {81, 9}, {99, 9}, {66, 6}, + {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, {92, 6}, + {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, {76, 6}, + {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, + {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, {107, 9}, + {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, + {113, 9}, {71, 7}, {131, 9}, {30, 10}, {46, 10}, {7, 9}, {194, 7}, + {83, 7}, {54, 10}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, {1, 7}, + {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, {97, 7}, + {66, 6}, {197, 7}, {85, 7}, {58, 10}, 
{13, 9}, {121, 7}, {21, 9}, + {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, + {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, {8, 7}, + {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, {156, 8}, + {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, + {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, {65, 5}, + {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, + {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, {80, 8}, + {98, 8}, {66, 6}, {198, 8}, {86, 8}, {60, 10}, {14, 9}, {122, 8}, + {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, + {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, {50, 9}, + {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, + {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, + {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, {44, 9}, + {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, {18, 8}, + {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, + {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, {12, 8}, + {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, + {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, + {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, + {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, + {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, {150, 5}, + {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, + {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {148, 6}, + {209, 12}, {151, 6}, {163, 6}, {66, 6}, {209, 12}, {154, 6}, {166, 6}, + {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, + {169, 6}, {70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, + {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {192, 11}, {152, 7}, {164, 7}, + {145, 3}, {204, 11}, {155, 7}, {167, 7}, {69, 7}, {179, 7}, {75, 7}, + {93, 7}, {64, 4}, {209, 12}, {158, 7}, {170, 7}, {71, 7}, {182, 7}, + {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, {67, 5}, + {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, + {148, 6}, {185, 7}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, + {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, {209, 12}, + {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, {4, 7}, + {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {145, 3}, {207, 11}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, + {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {117, 11}, {72, 8}, + {135, 11}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, + {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, + {174, 8}, {148, 6}, {141, 11}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, + {86, 8}, {104, 8}, {68, 6}, {122, 8}, {74, 6}, {92, 6}, {3, 8}, + {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {76, 6}, {94, 6}, + {5, 8}, {193, 6}, {82, 6}, {100, 6}, {9, 8}, {118, 6}, {17, 8}, + {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, + {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, + {124, 8}, {75, 7}, {93, 7}, 
{64, 4}, {209, 12}, {158, 7}, {112, 8}, + {71, 7}, {130, 8}, {77, 7}, {95, 7}, {6, 8}, {194, 7}, {83, 7}, + {101, 7}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, + {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, + {197, 7}, {85, 7}, {103, 7}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, + {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, + {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, + {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, {209, 12}, + {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {160, 9}, + {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, {196, 9}, + {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, {64, 4}, + {209, 12}, {209, 12}, {175, 9}, {148, 6}, {143, 11}, {81, 9}, {99, 9}, + {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, {74, 6}, + {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, {129, 9}, + {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, {67, 5}, + {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, {89, 9}, + {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, + {158, 7}, {113, 9}, {71, 7}, {131, 9}, {31, 11}, {47, 11}, {7, 9}, + {194, 7}, {83, 7}, {55, 11}, {11, 9}, {119, 7}, {19, 9}, {35, 9}, + {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, {79, 7}, + {97, 7}, {66, 6}, {197, 7}, {85, 7}, {59, 11}, {13, 9}, {121, 7}, + {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, + {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, {49, 9}, + {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {205, 9}, + {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, {64, 4}, + {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, {96, 8}, + {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, {73, 5}, + {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, {139, 9}, + {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {61, 11}, {14, 9}, + {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, {110, 8}, + {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, {82, 6}, + {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, {145, 3}, + {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, {93, 7}, + {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, {28, 9}, + {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, {119, 7}, + {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, + {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {56, 9}, + {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, {157, 6}, + {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, {193, 6}, + {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {145, 3}, {209, 12}, {209, 12}, {209, 12}, {146, 4}, {209, 12}, {149, 4}, + {161, 4}, {64, 4}, {209, 12}, {209, 12}, {209, 12}, {147, 5}, {209, 12}, + {150, 5}, {162, 5}, {65, 5}, {209, 12}, {153, 5}, {165, 5}, {67, 5}, + {177, 5}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {176, 10}, + {148, 6}, {188, 10}, {151, 6}, {163, 6}, {66, 6}, {200, 10}, {154, 6}, + {166, 6}, {68, 6}, {178, 6}, {74, 6}, {92, 6}, {64, 4}, {209, 12}, + {157, 6}, {169, 6}, 
{70, 6}, {181, 6}, {76, 6}, {94, 6}, {65, 5}, + {193, 6}, {82, 6}, {100, 6}, {67, 5}, {118, 6}, {73, 5}, {91, 5}, + {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {191, 10}, {152, 7}, + {164, 7}, {145, 3}, {203, 10}, {90, 10}, {108, 10}, {69, 7}, {126, 10}, + {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, {114, 10}, {71, 7}, + {132, 10}, {77, 7}, {95, 7}, {65, 5}, {194, 7}, {83, 7}, {101, 7}, + {67, 5}, {119, 7}, {73, 5}, {91, 5}, {1, 7}, {209, 12}, {209, 12}, + {173, 7}, {148, 6}, {138, 10}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, + {85, 7}, {103, 7}, {68, 6}, {121, 7}, {74, 6}, {92, 6}, {2, 7}, + {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, {76, 6}, {94, 6}, + {4, 7}, {193, 6}, {82, 6}, {100, 6}, {8, 7}, {118, 6}, {16, 7}, + {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {145, 3}, {206, 10}, {156, 8}, {168, 8}, {146, 4}, + {180, 8}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, {159, 8}, {116, 10}, + {72, 8}, {134, 10}, {78, 8}, {96, 8}, {65, 5}, {195, 8}, {84, 8}, + {102, 8}, {67, 5}, {120, 8}, {73, 5}, {91, 5}, {64, 4}, {209, 12}, + {209, 12}, {174, 8}, {148, 6}, {140, 10}, {80, 8}, {98, 8}, {66, 6}, + {198, 8}, {86, 8}, {62, 11}, {15, 10}, {122, 8}, {23, 10}, {39, 10}, + {3, 8}, {209, 12}, {157, 6}, {110, 8}, {70, 6}, {128, 8}, {27, 10}, + {43, 10}, {5, 8}, {193, 6}, {82, 6}, {51, 10}, {9, 8}, {118, 6}, + {17, 8}, {33, 8}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, + {189, 8}, {152, 7}, {164, 7}, {145, 3}, {201, 8}, {88, 8}, {106, 8}, + {69, 7}, {124, 8}, {75, 7}, {93, 7}, {64, 4}, {209, 12}, {158, 7}, + {112, 8}, {71, 7}, {130, 8}, {29, 10}, {45, 10}, {6, 8}, {194, 7}, + {83, 7}, {53, 10}, {10, 8}, {119, 7}, {18, 8}, {34, 8}, {1, 7}, + {209, 12}, {209, 12}, {173, 7}, {148, 6}, {136, 8}, {79, 7}, {97, 7}, + {66, 6}, {197, 7}, {85, 7}, {57, 10}, {12, 8}, {121, 7}, {20, 8}, + {36, 8}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, {70, 6}, {127, 7}, + {24, 8}, {40, 8}, {4, 7}, {193, 6}, {82, 6}, {48, 8}, {8, 7}, + {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, {209, 12}, {209, 12}, + {209, 12}, {146, 4}, {209, 12}, {149, 4}, {161, 4}, {64, 4}, {209, 12}, + {160, 9}, {172, 9}, {147, 5}, {184, 9}, {150, 5}, {162, 5}, {65, 5}, + {196, 9}, {153, 5}, {165, 5}, {67, 5}, {177, 5}, {73, 5}, {91, 5}, + {64, 4}, {209, 12}, {209, 12}, {175, 9}, {148, 6}, {142, 10}, {81, 9}, + {99, 9}, {66, 6}, {199, 9}, {87, 9}, {105, 9}, {68, 6}, {123, 9}, + {74, 6}, {92, 6}, {64, 4}, {209, 12}, {157, 6}, {111, 9}, {70, 6}, + {129, 9}, {76, 6}, {94, 6}, {65, 5}, {193, 6}, {82, 6}, {100, 6}, + {67, 5}, {118, 6}, {73, 5}, {91, 5}, {0, 6}, {209, 12}, {209, 12}, + {209, 12}, {209, 12}, {190, 9}, {152, 7}, {164, 7}, {145, 3}, {202, 9}, + {89, 9}, {107, 9}, {69, 7}, {125, 9}, {75, 7}, {93, 7}, {64, 4}, + {209, 12}, {158, 7}, {113, 9}, {71, 7}, {131, 9}, {30, 10}, {46, 10}, + {7, 9}, {194, 7}, {83, 7}, {54, 10}, {11, 9}, {119, 7}, {19, 9}, + {35, 9}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, {148, 6}, {137, 9}, + {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, {58, 10}, {13, 9}, + {121, 7}, {21, 9}, {37, 9}, {2, 7}, {209, 12}, {157, 6}, {109, 7}, + {70, 6}, {127, 7}, {25, 9}, {41, 9}, {4, 7}, {193, 6}, {82, 6}, + {49, 9}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, {0, 6}, {209, 12}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {209, 12}, {145, 3}, + {205, 9}, {156, 8}, {168, 8}, {146, 4}, {180, 8}, {149, 4}, {161, 4}, + {64, 4}, {209, 12}, {159, 8}, {115, 9}, {72, 8}, {133, 9}, {78, 8}, + 
{96, 8}, {65, 5}, {195, 8}, {84, 8}, {102, 8}, {67, 5}, {120, 8}, + {73, 5}, {91, 5}, {64, 4}, {209, 12}, {209, 12}, {174, 8}, {148, 6}, + {139, 9}, {80, 8}, {98, 8}, {66, 6}, {198, 8}, {86, 8}, {60, 10}, + {14, 9}, {122, 8}, {22, 9}, {38, 9}, {3, 8}, {209, 12}, {157, 6}, + {110, 8}, {70, 6}, {128, 8}, {26, 9}, {42, 9}, {5, 8}, {193, 6}, + {82, 6}, {50, 9}, {9, 8}, {118, 6}, {17, 8}, {33, 8}, {0, 6}, + {209, 12}, {209, 12}, {209, 12}, {209, 12}, {189, 8}, {152, 7}, {164, 7}, + {145, 3}, {201, 8}, {88, 8}, {106, 8}, {69, 7}, {124, 8}, {75, 7}, + {93, 7}, {64, 4}, {209, 12}, {158, 7}, {112, 8}, {71, 7}, {130, 8}, + {28, 9}, {44, 9}, {6, 8}, {194, 7}, {83, 7}, {52, 9}, {10, 8}, + {119, 7}, {18, 8}, {34, 8}, {1, 7}, {209, 12}, {209, 12}, {173, 7}, + {148, 6}, {136, 8}, {79, 7}, {97, 7}, {66, 6}, {197, 7}, {85, 7}, + {56, 9}, {12, 8}, {121, 7}, {20, 8}, {36, 8}, {2, 7}, {209, 12}, + {157, 6}, {109, 7}, {70, 6}, {127, 7}, {24, 8}, {40, 8}, {4, 7}, + {193, 6}, {82, 6}, {48, 8}, {8, 7}, {118, 6}, {16, 7}, {32, 7}, + {0, 6}}; +} // namespace utf8_to_utf16 +} // namespace tables +} // unnamed namespace +} // namespace simdutf + +#endif // SIMDUTF_UTF8_TO_UTF16_TABLES_H +/* end file src/tables/utf8_to_utf16_tables.h */ +/* begin file src/tables/utf16_to_utf8_tables.h */ +// file generated by scripts/sse_convert_utf16_to_utf8.py +#ifndef SIMDUTF_UTF16_TO_UTF8_TABLES_H +#define SIMDUTF_UTF16_TO_UTF8_TABLES_H + +namespace simdutf { +namespace { +namespace tables { +namespace utf16_to_utf8 { + +// 1 byte for length, 16 bytes for mask const uint8_t pack_1_2_utf8_bytes[256][17] = { {16, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}, {15, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 0x80}, @@ -10734,14577 +13507,24317 @@ const uint8_t pack_1_2_utf8_bytes[256][17] = { {10, 1, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, {9, 0, 2, 4, 6, 8, 10, 12, 15, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80}, - {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, - {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, - {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, - {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80}, - {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, - {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 
0x80}, - {11, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80}, - {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, - {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {13, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 1, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {9, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, - {13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 
0x80, 0x80, 0x80, 0x80}, - {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 1, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {9, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + {15, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80}, + {14, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {14, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, + {13, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, + {13, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 4, 7, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 7, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 
0x80, 0x80}, + {12, 0, 3, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 7, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 7, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 5, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 7, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 7, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 3, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 7, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 2, 4, 7, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {14, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80}, + {13, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 2, 5, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 
0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 5, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {13, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 4, 6, 9, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 6, 8, 11, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 3, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 6, 9, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 2, 4, 6, 8, 10, 13, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, + {12, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {12, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, + {11, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {11, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {11, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 
0x80}, + {10, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 1, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 1, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}}; + +// 1 byte for length, 16 bytes for mask +const uint8_t pack_1_2_3_utf8_bytes[256][17] = { + {12, 2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80}, + {9, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {11, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, + {10, 0, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 2, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {11, 2, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 2, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 2, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 2, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {11, 2, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 0x80, + 0x80}, + {10, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 2, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 2, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 2, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {10, 2, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 0, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 2, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 2, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 2, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 2, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 2, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 0x80, + 0x80}, + {6, 0, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 2, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {3, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {5, 2, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 0, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 2, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 0, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 2, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 2, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 0, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 2, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 0, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 2, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {3, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 
6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 0, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 2, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 0, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 2, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {3, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 2, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 0, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {11, 2, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {13, 1, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80}, - {12, 0, 3, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 3, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 2, 5, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 2, 5, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 3, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 3, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 2, 5, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 1, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {9, 0, 2, 5, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {12, 1, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80}, - {11, 0, 3, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 3, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 2, 4, 6, 9, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 1, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {9, 0, 2, 4, 6, 8, 11, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + {10, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {9, 0, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {11, 1, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 
0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 3, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 1, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {9, 0, 3, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 1, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {9, 0, 2, 4, 6, 9, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {9, 1, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {8, 0, 2, 4, 6, 8, 10, 12, 14, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}}; + {8, 2, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 2, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 0, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 2, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 2, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 2, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {4, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {3, 0, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 2, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 2, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {10, 2, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 0, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 2, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 
0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 2, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 2, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 2, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 2, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {10, 2, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {7, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {9, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {8, 0, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 2, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {9, 2, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {8, 2, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 
0x80}, + {7, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 0, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 2, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80, 0x80}, + {3, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {2, 0, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 2, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {3, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 2, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 0, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {9, 2, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, + {6, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {7, 0, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 2, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {8, 2, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 2, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {6, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {5, 0, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {8, 2, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {7, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {6, 0, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 2, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {2, 
8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {4, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {3, 0, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {7, 2, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {4, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {5, 0, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {6, 2, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80}, + {3, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80, 0x80}, + {5, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}, + {4, 0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + 0x80, 0x80}}; + +} // namespace utf16_to_utf8 +} // namespace tables +} // unnamed namespace +} // namespace simdutf + +#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H +/* end file src/tables/utf16_to_utf8_tables.h */ +// End of tables. + +// The scalar routines should be included once. +/* begin file src/scalar/ascii.h */ +#ifndef SIMDUTF_ASCII_H +#define SIMDUTF_ASCII_H + +namespace simdutf { +namespace scalar { +namespace { +namespace ascii { +#if SIMDUTF_IMPLEMENTATION_FALLBACK +// Only used by the fallback kernel. +inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept { + const uint8_t *data = reinterpret_cast(buf); + uint64_t pos = 0; + // process in blocks of 16 bytes when possible + for (; pos + 16 <= len; pos += 16) { + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) != 0) { + return false; + } + } + // process the tail byte-by-byte + for (; pos < len; pos++) { + if (data[pos] >= 0b10000000) { + return false; + } + } + return true; +} +#endif + +inline simdutf_warn_unused result validate_with_errors(const char *buf, + size_t len) noexcept { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + // process in blocks of 16 bytes when possible + for (; pos + 16 <= len; pos += 16) { + uint64_t v1; + std::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) != 0) { + for (; pos < len; pos++) { + if (data[pos] >= 0b10000000) { + return result(error_code::TOO_LARGE, pos); + } + } + } + } + // process the tail byte-by-byte + for (; pos < len; pos++) { + if (data[pos] >= 0b10000000) { + return result(error_code::TOO_LARGE, pos); + } + } + return result(error_code::SUCCESS, pos); +} + +} // namespace ascii +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/ascii.h */ +/* begin file src/scalar/latin1.h */ +#ifndef SIMDUTF_LATIN1_H +#define SIMDUTF_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace latin1 { + +inline size_t utf32_length_from_latin1(size_t len) { + // We are not BOM aware. 
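// Editor's note: illustrative sketch, not part of the vendored simdutf
// sources. It restates the SWAR trick used by scalar::ascii::validate above:
// sixteen bytes are loaded as two 64-bit words via memcpy, OR-ed together, and
// a single mask test detects any byte with its high bit set (any non-ASCII
// byte). The helper name `is_ascii_block16` is hypothetical.
#include <cstdint>
#include <cstring>

inline bool is_ascii_block16(const uint8_t *p) {
  uint64_t a, b;
  std::memcpy(&a, p, sizeof(a));
  std::memcpy(&b, p + sizeof(a), sizeof(b));
  // Any byte >= 0x80 sets its top bit; that bit survives the OR and the mask.
  return ((a | b) & UINT64_C(0x8080808080808080)) == 0;
}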
+ return len; // a utf32 unit will always represent 1 latin1 character +} + +inline size_t utf8_length_from_latin1(const char *buf, size_t len) { + const uint8_t *c = reinterpret_cast(buf); + size_t answer = 0; + for (size_t i = 0; i < len; i++) { + if ((c[i] >> 7)) { + answer++; + } + } + return answer + len; +} + +inline size_t utf16_length_from_latin1(size_t len) { return len; } + +} // namespace latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/latin1.h */ + +/* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */ +#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H +#define SIMDUTF_VALID_UTF32_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf8 { + +#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64 +// only used by the fallback and POWER kernel +inline size_t convert_valid(const char32_t *buf, size_t len, + char *utf8_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char *start{utf8_output}; + while (pos < len) { + // try to convert the next block of 2 ASCII characters + if (pos + 2 <= + len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF80FFFFFF80) == 0) { + *utf8_output++ = char(buf[pos]); + *utf8_output++ = char(buf[pos + 1]); + pos += 2; + continue; + } + } + uint32_t word = data[pos]; + if ((word & 0xFFFFFF80) == 0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xFFFFF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if ((word & 0xFFFF0000) == 0) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } + } + return utf8_output - start; +} +#endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64 + +} // namespace utf32_to_utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */ +/* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */ +#ifndef SIMDUTF_UTF32_TO_UTF8_H +#define SIMDUTF_UTF32_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf8 { + +inline size_t convert(const char32_t *buf, size_t len, char *utf8_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char *start{utf8_output}; + while (pos < len) { + // try to convert the next block of 2 ASCII characters + if (pos + 2 <= + len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF80FFFFFF80) == 0) { + *utf8_output++ = char(buf[pos]); + *utf8_output++ = char(buf[pos + 1]); + pos += 2; + continue; + } + } + uint32_t word = data[pos]; + if ((word & 0xFFFFFF80) 
== 0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xFFFFF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if ((word & 0xFFFF0000) == 0) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + if (word >= 0xD800 && word <= 0xDFFF) { + return 0; + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + if (word > 0x10FFFF) { + return 0; + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } + } + return utf8_output - start; +} + +inline result convert_with_errors(const char32_t *buf, size_t len, + char *utf8_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char *start{utf8_output}; + while (pos < len) { + // try to convert the next block of 2 ASCII characters + if (pos + 2 <= + len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF80FFFFFF80) == 0) { + *utf8_output++ = char(buf[pos]); + *utf8_output++ = char(buf[pos + 1]); + pos += 2; + continue; + } + } + uint32_t word = data[pos]; + if ((word & 0xFFFFFF80) == 0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xFFFFF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if ((word & 0xFFFF0000) == 0) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + if (word >= 0xD800 && word <= 0xDFFF) { + return result(error_code::SURROGATE, pos); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + if (word > 0x10FFFF) { + return result(error_code::TOO_LARGE, pos); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } + } + return result(error_code::SUCCESS, utf8_output - start); +} + +} // namespace utf32_to_utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */ + +/* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */ +#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H +#define SIMDUTF_VALID_UTF32_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf16 { + +template +inline size_t convert_valid(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t *start{utf16_output}; + while (pos < len) { 
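// Editor's note: hypothetical standalone sketch (not part of simdutf) of the
// byte-level encoding performed by scalar::utf32_to_utf8::convert above. It
// writes a single, already-validated scalar value (no surrogate, at most
// 0x10FFFF) as 1-4 UTF-8 bytes and returns the byte count.
#include <cstdint>

inline int encode_one_utf8(uint32_t cp, char *out) {
  if (cp < 0x80) {            // 0xxxxxxx
    out[0] = char(cp);
    return 1;
  } else if (cp < 0x800) {    // 110xxxxx 10xxxxxx
    out[0] = char((cp >> 6) | 0xC0);
    out[1] = char((cp & 0x3F) | 0x80);
    return 2;
  } else if (cp < 0x10000) {  // 1110xxxx 10xxxxxx 10xxxxxx
    out[0] = char((cp >> 12) | 0xE0);
    out[1] = char(((cp >> 6) & 0x3F) | 0x80);
    out[2] = char((cp & 0x3F) | 0x80);
    return 3;
  } else {                    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    out[0] = char((cp >> 18) | 0xF0);
    out[1] = char(((cp >> 12) & 0x3F) | 0x80);
    out[2] = char(((cp >> 6) & 0x3F) | 0x80);
    out[3] = char((cp & 0x3F) | 0x80);
    return 4;
  }
}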
+ uint32_t word = data[pos]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + *utf16_output++ = !match_system(big_endian) + ? char16_t(utf16::swap_bytes(uint16_t(word))) + : char16_t(word); + pos++; + } else { + // will generate a surrogate pair + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos++; + } + } + return utf16_output - start; +} + +} // namespace utf32_to_utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */ +/* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */ +#ifndef SIMDUTF_UTF32_TO_UTF16_H +#define SIMDUTF_UTF32_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_utf16 { + +template +inline size_t convert(const char32_t *buf, size_t len, char16_t *utf16_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t *start{utf16_output}; + while (pos < len) { + uint32_t word = data[pos]; + if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return 0; + } + // will not generate a surrogate pair + *utf16_output++ = !match_system(big_endian) + ? char16_t(utf16::swap_bytes(uint16_t(word))) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return 0; + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + pos++; + } + return utf16_output - start; +} + +template +inline result convert_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const uint32_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t *start{utf16_output}; + while (pos < len) { + uint32_t word = data[pos]; + if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return result(error_code::SURROGATE, pos); + } + // will not generate a surrogate pair + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(utf16::swap_bytes(uint16_t(word))) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return result(error_code::TOO_LARGE, pos); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + pos++; + } + return result(error_code::SUCCESS, utf16_output - start); +} + +} // namespace utf32_to_utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */ + +/* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ +#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H +#define SIMDUTF_VALID_UTF16_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf8 { + +template +inline size_t convert_valid(const char16_t *buf, size_t len, + char *utf8_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char *start{utf8_output}; + while (pos < len) { + // try to convert the next block of 4 ASCII characters + if (pos + 4 <= + len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if (!match_system(big_endian)) { + v = (v >> 8) | (v << (64 - 8)); + } + if ((v & 0xFF80FF80FF80FF80) == 0) { + size_t final_pos = pos + 4; + while (pos < final_pos) { + *utf8_output++ = !match_system(big_endian) + ? char(utf16::swap_bytes(buf[pos])) + : char(buf[pos]); + pos++; + } + continue; + } + } + + uint16_t word = + !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; + if ((word & 0xFF80) == 0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if ((word & 0xF800) != 0xD800) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + uint16_t next_word = !match_system(big_endian) + ? 
utf16::swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + uint32_t value = (diff << 10) + diff2 + 0x10000; + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + pos += 2; + } + } + return utf8_output - start; +} + +} // namespace utf16_to_utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ +/* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ +#ifndef SIMDUTF_UTF16_TO_UTF8_H +#define SIMDUTF_UTF16_TO_UTF8_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf8 { + +template +inline size_t convert(const char16_t *buf, size_t len, char *utf8_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char *start{utf8_output}; + while (pos < len) { + // try to convert the next block of 8 bytes + if (pos + 4 <= + len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if (!match_system(big_endian)) { + v = (v >> 8) | (v << (64 - 8)); + } + if ((v & 0xFF80FF80FF80FF80) == 0) { + size_t final_pos = pos + 4; + while (pos < final_pos) { + *utf8_output++ = !match_system(big_endian) + ? char(utf16::swap_bytes(buf[pos])) + : char(buf[pos]); + pos++; + } + continue; + } + } + uint16_t word = + !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; + if ((word & 0xFF80) == 0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if ((word & 0xF800) != 0xD800) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // must be a surrogate pair + if (pos + 1 >= len) { + return 0; + } + uint16_t diff = uint16_t(word - 0xD800); + if (diff > 0x3FF) { + return 0; + } + uint16_t next_word = !match_system(big_endian) + ? 
utf16::swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return 0; + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + pos += 2; + } + } + return utf8_output - start; +} + +template +inline result convert_with_errors(const char16_t *buf, size_t len, + char *utf8_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char *start{utf8_output}; + while (pos < len) { + // try to convert the next block of 8 bytes + if (pos + 4 <= + len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if (!match_system(big_endian)) + v = (v >> 8) | (v << (64 - 8)); + if ((v & 0xFF80FF80FF80FF80) == 0) { + size_t final_pos = pos + 4; + while (pos < final_pos) { + *utf8_output++ = !match_system(big_endian) + ? char(utf16::swap_bytes(buf[pos])) + : char(buf[pos]); + pos++; + } + continue; + } + } + uint16_t word = + !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; + if ((word & 0xFF80) == 0) { + // will generate one UTF-8 bytes + *utf8_output++ = char(word); + pos++; + } else if ((word & 0xF800) == 0) { + // will generate two UTF-8 bytes + // we have 0b110XXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else if ((word & 0xF800) != 0xD800) { + // will generate three UTF-8 bytes + // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + pos++; + } else { + // must be a surrogate pair + if (pos + 1 >= len) { + return result(error_code::SURROGATE, pos); + } + uint16_t diff = uint16_t(word - 0xD800); + if (diff > 0x3FF) { + return result(error_code::SURROGATE, pos); + } + uint16_t next_word = !match_system(big_endian) + ? 
utf16::swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return result(error_code::SURROGATE, pos); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + pos += 2; + } + } + return result(error_code::SUCCESS, utf8_output - start); +} + +} // namespace utf16_to_utf8 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ + +/* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ +#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H +#define SIMDUTF_VALID_UTF16_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf32 { + +template +inline size_t convert_valid(const char16_t *buf, size_t len, + char32_t *utf32_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t *start{utf32_output}; + while (pos < len) { + uint16_t word = + !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; + if ((word & 0xF800) != 0xD800) { + // No surrogate pair, extend 16-bit word to 32-bit word + *utf32_output++ = char32_t(word); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + uint16_t next_word = !match_system(big_endian) + ? utf16::swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + pos += 2; + } + } + return utf32_output - start; +} + +} // namespace utf16_to_utf32 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ +/* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */ +#ifndef SIMDUTF_UTF16_TO_UTF32_H +#define SIMDUTF_UTF16_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_utf32 { + +template +inline size_t convert(const char16_t *buf, size_t len, char32_t *utf32_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t *start{utf32_output}; + while (pos < len) { + uint16_t word = + !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; + if ((word & 0xF800) != 0xD800) { + // No surrogate pair, extend 16-bit word to 32-bit word + *utf32_output++ = char32_t(word); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if (diff > 0x3FF) { + return 0; + } + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + uint16_t next_word = !match_system(big_endian) + ? 
utf16::swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return 0; + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + pos += 2; + } + } + return utf32_output - start; +} + +template +inline result convert_with_errors(const char16_t *buf, size_t len, + char32_t *utf32_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t *start{utf32_output}; + while (pos < len) { + uint16_t word = + !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; + if ((word & 0xF800) != 0xD800) { + // No surrogate pair, extend 16-bit word to 32-bit word + *utf32_output++ = char32_t(word); + pos++; + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + if (diff > 0x3FF) { + return result(error_code::SURROGATE, pos); + } + if (pos + 1 >= len) { + return result(error_code::SURROGATE, pos); + } // minimal bound checking + uint16_t next_word = !match_system(big_endian) + ? utf16::swap_bytes(data[pos + 1]) + : data[pos + 1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if (diff2 > 0x3FF) { + return result(error_code::SURROGATE, pos); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + pos += 2; + } + } + return result(error_code::SUCCESS, utf32_output - start); +} + +} // namespace utf16_to_utf32 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */ + +/* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ +#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H +#define SIMDUTF_VALID_UTF8_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf16 { + +template +inline size_t convert_valid(const char *buf, size_t len, + char16_t *utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t *start{utf16_output}; + while (pos < len) { + // try to convert the next block of 8 ASCII bytes + if (pos + 8 <= + len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 8; + while (pos < final_pos) { + *utf16_output++ = !match_system(big_endian) + ? char16_t(utf16::swap_bytes(buf[pos])) + : char16_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf16_output++ = !match_system(big_endian) + ? char16_t(utf16::swap_bytes(leading_byte)) + : char16_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8, it should become + // a single UTF-16 word. + if (pos + 1 >= len) { + break; + } // minimal bound checking + uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) | + (data[pos + 1] & 0b00111111)); + if (!match_system(big_endian)) { + code_point = utf16::swap_bytes(uint16_t(code_point)); + } + *utf16_output++ = char16_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. 
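// Editor's note: hypothetical sketch of the surrogate-pair arithmetic used by
// the scalar utf16_to_utf32 routines above. A high surrogate in
// [0xD800, 0xDBFF] and a low surrogate in [0xDC00, 0xDFFF] combine into a
// supplementary code point in [0x10000, 0x10FFFF].
#include <cstdint>

inline bool decode_surrogate_pair(uint16_t high, uint16_t low, uint32_t *cp) {
  uint16_t d1 = uint16_t(high - 0xD800);
  uint16_t d2 = uint16_t(low - 0xDC00);
  if (d1 > 0x3FF || d2 > 0x3FF) {
    return false;  // not a well-formed high/low surrogate pair
  }
  *cp = (uint32_t(d1) << 10) + d2 + 0x10000;
  return true;
}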
+ if (pos + 2 >= len) { + break; + } // minimal bound checking + uint16_t code_point = uint16_t(((leading_byte & 0b00001111) << 12) | + ((data[pos + 1] & 0b00111111) << 6) | + (data[pos + 2] & 0b00111111)); + if (!match_system(big_endian)) { + code_point = utf16::swap_bytes(uint16_t(code_point)); + } + *utf16_output++ = char16_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if (pos + 3 >= len) { + break; + } // minimal bound checking + uint32_t code_point = ((leading_byte & 0b00000111) << 18) | + ((data[pos + 1] & 0b00111111) << 12) | + ((data[pos + 2] & 0b00111111) << 6) | + (data[pos + 3] & 0b00111111); + code_point -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos += 4; + } else { + // we may have a continuation but we do not do error checking + return 0; + } + } + return utf16_output - start; +} + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ +#ifndef SIMDUTF_UTF8_TO_UTF16_H +#define SIMDUTF_UTF8_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf16 { + +template +inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t *start{utf16_output}; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= + len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while (pos < final_pos) { + *utf16_output++ = !match_system(big_endian) + ? char16_t(utf16::swap_bytes(buf[pos])) + : char16_t(buf[pos]); + pos++; + } + continue; + } + } + + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf16_output++ = !match_system(big_endian) + ? char16_t(utf16::swap_bytes(leading_byte)) + : char16_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8, it should become + // a single UTF-16 word. + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } + // range check + uint32_t code_point = + (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { + return 0; + } + if (!match_system(big_endian)) { + code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); + } + *utf16_output++ = char16_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. 
+ if (pos + 2 >= len) { + return 0; + } // minimal bound checking + + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return 0; + } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) { + return 0; + } + if (!match_system(big_endian)) { + code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); + } + *utf16_output++ = char16_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if (pos + 3 >= len) { + return 0; + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return 0; + } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { + return 0; + } + + // range check + uint32_t code_point = (leading_byte & 0b00000111) << 18 | + (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | + (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff || 0x10ffff < code_point) { + return 0; + } + code_point -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos += 4; + } else { + return 0; + } + } + return utf16_output - start; +} + +template +inline result convert_with_errors(const char *buf, size_t len, + char16_t *utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t *start{utf16_output}; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= + len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while (pos < final_pos) { + *utf16_output++ = !match_system(big_endian) + ? char16_t(utf16::swap_bytes(buf[pos])) + : char16_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf16_output++ = !match_system(big_endian) + ? char16_t(utf16::swap_bytes(leading_byte)) + : char16_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8, it should become + // a single UTF-16 word. 
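// Editor's note: illustrative helpers (hypothetical, not part of simdutf)
// mirroring the leading-byte tests used throughout the scalar UTF-8 decoders
// above: the top bits of the first byte determine the sequence length, and
// every continuation byte must match 10xxxxxx.
#include <cstdint>

inline int utf8_sequence_length(uint8_t leading_byte) {
  if (leading_byte < 0x80) return 1;            // 0xxxxxxx: ASCII
  if ((leading_byte & 0xE0) == 0xC0) return 2;  // 110xxxxx
  if ((leading_byte & 0xF0) == 0xE0) return 3;  // 1110xxxx
  if ((leading_byte & 0xF8) == 0xF0) return 4;  // 11110xxx
  return 0;  // a stray continuation byte or invalid header bits
}

inline bool is_utf8_continuation(uint8_t byte) {
  return (byte & 0xC0) == 0x80;  // 10xxxxxx
}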
+ if (pos + 1 >= len) { + return result(error_code::TOO_SHORT, pos); + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + // range check + uint32_t code_point = + (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { + return result(error_code::OVERLONG, pos); + } + if (!match_system(big_endian)) { + code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); + } + *utf16_output++ = char16_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8, it should become + // a single UTF-16 word. + if (pos + 2 >= len) { + return result(error_code::TOO_SHORT, pos); + } // minimal bound checking + + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if ((code_point < 0x800) || (0xffff < code_point)) { + return result(error_code::OVERLONG, pos); + } + if (0xd7ff < code_point && code_point < 0xe000) { + return result(error_code::SURROGATE, pos); + } + if (!match_system(big_endian)) { + code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); + } + *utf16_output++ = char16_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if (pos + 3 >= len) { + return result(error_code::TOO_SHORT, pos); + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + + // range check + uint32_t code_point = (leading_byte & 0b00000111) << 18 | + (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | + (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff) { + return result(error_code::OVERLONG, pos); + } + if (0x10ffff < code_point) { + return result(error_code::TOO_LARGE, pos); + } + code_point -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = utf16::swap_bytes(high_surrogate); + low_surrogate = utf16::swap_bytes(low_surrogate); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + pos += 4; + } else { + // we either have too many continuation bytes or an invalid leading byte + if ((leading_byte & 0b11000000) == 0b10000000) { + return result(error_code::TOO_LONG, pos); + } else { + return result(error_code::HEADER_BITS, pos); + } + } + } + return result(error_code::SUCCESS, utf16_output - start); +} + +/** + * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and + * we have up to len input bytes left, and we encountered some error. It is + * possible that the error is at 'buf' exactly, but it could also be in the + * previous bytes (up to 3 bytes back). + * + * prior_bytes indicates how many bytes, prior to 'buf' may belong to the + * current memory section and can be safely accessed. 
We prior_bytes to access + * safely up to three bytes before 'buf'. + * + * The caller is responsible to ensure that len > 0. + * + * If the error is believed to have occurred prior to 'buf', the count value + * contain in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3. + */ +template +inline result rewind_and_convert_with_errors(size_t prior_bytes, + const char *buf, size_t len, + char16_t *utf16_output) { + size_t extra_len{0}; + // We potentially need to go back in time and find a leading byte. + // In theory '3' would be sufficient, but sometimes the error can go back + // quite far. + size_t how_far_back = prior_bytes; + // size_t how_far_back = 3; // 3 bytes in the past + current position + // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; } + bool found_leading_bytes{false}; + // important: it is i <= how_far_back and not 'i < how_far_back'. + for (size_t i = 0; i <= how_far_back; i++) { + unsigned char byte = buf[-static_cast(i)]; + found_leading_bytes = ((byte & 0b11000000) != 0b10000000); + if (found_leading_bytes) { + if (i > 0 && byte < 128) { + // If we had to go back and the leading byte is ascii + // then we can stop right away. + return result(error_code::TOO_LONG, 0 - i + 1); + } + buf -= i; + extra_len = i; + break; + } + } + // + // It is possible for this function to return a negative count in its result. + // C++ Standard Section 18.1 defines size_t is in which is described + // in C Standard as . C Standard Section 4.1.5 defines size_t as an + // unsigned integral type of the result of the sizeof operator + // + // An unsigned type will simply wrap round arithmetically (well defined). + // + if (!found_leading_bytes) { + // If how_far_back == 3, we may have four consecutive continuation bytes!!! + // [....] [continuation] [continuation] [continuation] | [buf is + // continuation] Or we possibly have a stream that does not start with a + // leading byte. + return result(error_code::TOO_LONG, 0 - how_far_back); + } + result res = convert_with_errors(buf, len + extra_len, utf16_output); + if (res.error) { + res.count -= extra_len; + } + return res; +} + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ + +/* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */ +#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H +#define SIMDUTF_VALID_UTF8_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf32 { + +inline size_t convert_valid(const char *buf, size_t len, + char32_t *utf32_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t *start{utf32_output}; + while (pos < len) { + // try to convert the next block of 8 ASCII bytes + if (pos + 8 <= + len) { // if it is safe to read 8 more bytes, check that they are ascii + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 8; + while (pos < final_pos) { + *utf32_output++ = char32_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! 
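// Editor's note: hypothetical sketch of the backward scan performed by
// rewind_and_convert_with_errors above. Starting from an error position, it
// walks back over at most `prior_bytes` continuation bytes (10xxxxxx) to find
// the leading byte of the sequence that should be re-decoded; the caller
// guarantees that `prior_bytes` bytes before `buf` are safely readable.
#include <cstddef>

inline size_t bytes_to_rewind(const char *buf, size_t prior_bytes) {
  for (size_t i = 0; i <= prior_bytes; i++) {
    unsigned char byte =
        static_cast<unsigned char>(buf[-static_cast<std::ptrdiff_t>(i)]);
    if ((byte & 0xC0) != 0x80) {
      return i;  // found a leading (non-continuation) byte i positions back
    }
  }
  return prior_bytes + 1;  // no leading byte within reach: stream is malformed
}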
+ *utf32_output++ = char32_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8 + if (pos + 1 >= len) { + break; + } // minimal bound checking + *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) | + (data[pos + 1] & 0b00111111)); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8 + if (pos + 2 >= len) { + break; + } // minimal bound checking + *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) | + ((data[pos + 1] & 0b00111111) << 6) | + (data[pos + 2] & 0b00111111)); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + if (pos + 3 >= len) { + break; + } // minimal bound checking + uint32_t code_word = ((leading_byte & 0b00000111) << 18) | + ((data[pos + 1] & 0b00111111) << 12) | + ((data[pos + 2] & 0b00111111) << 6) | + (data[pos + 3] & 0b00111111); + *utf32_output++ = char32_t(code_word); + pos += 4; + } else { + // we may have a continuation but we do not do error checking + return 0; + } + } + return utf32_output - start; +} + +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */ +#ifndef SIMDUTF_UTF8_TO_UTF32_H +#define SIMDUTF_UTF8_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_utf32 { + +inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t *start{utf32_output}; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= + len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while (pos < final_pos) { + *utf32_output++ = char32_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf32_output++ = char32_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8 + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } + // range check + uint32_t code_point = + (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { + return 0; + } + *utf32_output++ = char32_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8 + if (pos + 2 >= len) { + return 0; + } // minimal bound checking + + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return 0; + } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point || + (0xd7ff < code_point && code_point < 0xe000)) { + return 0; + } + *utf32_output++ = char32_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. 
+ if (pos + 3 >= len) { + return 0; + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return 0; + } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { + return 0; + } + + // range check + uint32_t code_point = (leading_byte & 0b00000111) << 18 | + (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | + (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff || 0x10ffff < code_point) { + return 0; + } + *utf32_output++ = char32_t(code_point); + pos += 4; + } else { + return 0; + } + } + return utf32_output - start; +} + +inline result convert_with_errors(const char *buf, size_t len, + char32_t *utf32_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char32_t *start{utf32_output}; + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= + len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; + if ((v & 0x8080808080808080) == 0) { + size_t final_pos = pos + 16; + while (pos < final_pos) { + *utf32_output++ = char32_t(buf[pos]); + pos++; + } + continue; + } + } + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *utf32_output++ = char32_t(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == 0b11000000) { + // We have a two-byte UTF-8 + if (pos + 1 >= len) { + return result(error_code::TOO_SHORT, pos); + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + // range check + uint32_t code_point = + (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0x7ff < code_point) { + return result(error_code::OVERLONG, pos); + } + *utf32_output++ = char32_t(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8 + if (pos + 2 >= len) { + return result(error_code::TOO_SHORT, pos); + } // minimal bound checking + + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + // range check + uint32_t code_point = (leading_byte & 0b00001111) << 12 | + (data[pos + 1] & 0b00111111) << 6 | + (data[pos + 2] & 0b00111111); + if (code_point < 0x800 || 0xffff < code_point) { + return result(error_code::OVERLONG, pos); + } + if (0xd7ff < code_point && code_point < 0xe000) { + return result(error_code::SURROGATE, pos); + } + *utf32_output++ = char32_t(code_point); + pos += 3; + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. 
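// Editor's note: hypothetical summary (not simdutf API) of the range checks
// applied by the validating UTF-8 decoders above once a candidate code point
// has been assembled from `encoded_len` bytes (1..4). The enum values mirror
// the OVERLONG / SURROGATE / TOO_LARGE error codes used in this file.
#include <cstdint>

enum class cp_error { ok, overlong, surrogate, too_large };

inline cp_error classify_code_point(uint32_t cp, int encoded_len) {
  // Smallest code point that genuinely needs 1, 2, 3 or 4 bytes.
  static const uint32_t min_for_len[5] = {0, 0, 0x80, 0x800, 0x10000};
  if (cp < min_for_len[encoded_len]) return cp_error::overlong;
  if (cp >= 0xD800 && cp <= 0xDFFF) return cp_error::surrogate;  // UTF-16 reserved
  if (cp > 0x10FFFF) return cp_error::too_large;                 // beyond Unicode
  return cp_error::ok;
}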
+ if (pos + 3 >= len) { + return result(error_code::TOO_SHORT, pos); + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 2] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + if ((data[pos + 3] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } + + // range check + uint32_t code_point = (leading_byte & 0b00000111) << 18 | + (data[pos + 1] & 0b00111111) << 12 | + (data[pos + 2] & 0b00111111) << 6 | + (data[pos + 3] & 0b00111111); + if (code_point <= 0xffff) { + return result(error_code::OVERLONG, pos); + } + if (0x10ffff < code_point) { + return result(error_code::TOO_LARGE, pos); + } + *utf32_output++ = char32_t(code_point); + pos += 4; + } else { + // we either have too many continuation bytes or an invalid leading byte + if ((leading_byte & 0b11000000) == 0b10000000) { + return result(error_code::TOO_LONG, pos); + } else { + return result(error_code::HEADER_BITS, pos); + } + } + } + return result(error_code::SUCCESS, utf32_output - start); +} + +/** + * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and + * we have up to len input bytes left, and we encountered some error. It is + * possible that the error is at 'buf' exactly, but it could also be in the + * previous bytes location (up to 3 bytes back). + * + * prior_bytes indicates how many bytes, prior to 'buf' may belong to the + * current memory section and can be safely accessed. We prior_bytes to access + * safely up to three bytes before 'buf'. + * + * The caller is responsible to ensure that len > 0. + * + * If the error is believed to have occurred prior to 'buf', the count value + * contain in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3. + */ +inline result rewind_and_convert_with_errors(size_t prior_bytes, + const char *buf, size_t len, + char32_t *utf32_output) { + size_t extra_len{0}; + // We potentially need to go back in time and find a leading byte. + size_t how_far_back = 3; // 3 bytes in the past + current position + if (how_far_back > prior_bytes) { + how_far_back = prior_bytes; + } + bool found_leading_bytes{false}; + // important: it is i <= how_far_back and not 'i < how_far_back'. + for (size_t i = 0; i <= how_far_back; i++) { + unsigned char byte = buf[-static_cast(i)]; + found_leading_bytes = ((byte & 0b11000000) != 0b10000000); + if (found_leading_bytes) { + if (i > 0 && byte < 128) { + // If we had to go back and the leading byte is ascii + // then we can stop right away. + return result(error_code::TOO_LONG, 0 - i + 1); + } + buf -= i; + extra_len = i; + break; + } + } + // + // It is possible for this function to return a negative count in its result. + // C++ Standard Section 18.1 defines size_t is in which is described + // in C Standard as . C Standard Section 4.1.5 defines size_t as an + // unsigned integral type of the result of the sizeof operator + // + // An unsigned type will simply wrap round arithmetically (well defined). + // + if (!found_leading_bytes) { + // If how_far_back == 3, we may have four consecutive continuation bytes!!! + // [....] [continuation] [continuation] [continuation] | [buf is + // continuation] Or we possibly have a stream that does not start with a + // leading byte. 
+ return result(error_code::TOO_LONG, 0 - how_far_back); + } + + result res = convert_with_errors(buf, len + extra_len, utf32_output); + if (res.error) { + res.count -= extra_len; + } + return res; +} + +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */ + +/* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */ +#ifndef SIMDUTF_LATIN1_TO_UTF16_H +#define SIMDUTF_LATIN1_TO_UTF16_H + +namespace simdutf { +namespace scalar { +namespace { +namespace latin1_to_utf16 { + +template +inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t *start{utf16_output}; + + while (pos < len) { + uint16_t word = + uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point + *utf16_output++ = + char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word)); + pos++; + } + + return utf16_output - start; +} + +template +inline result convert_with_errors(const char *buf, size_t len, + char16_t *utf16_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char16_t *start{utf16_output}; + + while (pos < len) { + uint16_t word = + uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point + *utf16_output++ = + char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word)); + pos++; + } + + return result(error_code::SUCCESS, utf16_output - start); +} + +} // namespace latin1_to_utf16 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/latin1_to_utf16/latin1_to_utf16.h */ +/* begin file src/scalar/latin1_to_utf32/latin1_to_utf32.h */ +#ifndef SIMDUTF_LATIN1_TO_UTF32_H +#define SIMDUTF_LATIN1_TO_UTF32_H + +namespace simdutf { +namespace scalar { +namespace { +namespace latin1_to_utf32 { + +inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) { + const unsigned char *data = reinterpret_cast(buf); + char32_t *start{utf32_output}; + for (size_t i = 0; i < len; i++) { + *utf32_output++ = (char32_t)data[i]; + } + return utf32_output - start; +} + +} // namespace latin1_to_utf32 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/latin1_to_utf32/latin1_to_utf32.h */ + +/* begin file src/scalar/utf8_to_latin1/utf8_to_latin1.h */ +#ifndef SIMDUTF_UTF8_TO_LATIN1_H +#define SIMDUTF_UTF8_TO_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_latin1 { + +inline size_t convert(const char *buf, size_t len, char *latin_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char *start{latin_output}; + + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= + len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 + // 1000 1000 .... etc + if ((v & 0x8080808080808080) == + 0) { // if NONE of these are set, e.g. 
all of them are zero, then + // everything is ASCII + size_t final_pos = pos + 16; + while (pos < final_pos) { + *latin_output++ = char(buf[pos]); + pos++; + } + continue; + } + } + + // suppose it is not an all ASCII byte sequence + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *latin_output++ = char(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == + 0b11000000) { // the first three bits indicate: + // We have a two-byte UTF-8 + if (pos + 1 >= len) { + return 0; + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } // checks if the next byte is a valid continuation byte in UTF-8. A + // valid continuation byte starts with 10. + // range check - + uint32_t code_point = + (leading_byte & 0b00011111) << 6 | + (data[pos + 1] & + 0b00111111); // assembles the Unicode code point from the two bytes. + // It does this by discarding the leading 110 and 10 + // bits from the two bytes, shifting the remaining bits + // of the first byte, and then combining the results + // with a bitwise OR operation. + if (code_point < 0x80 || 0xFF < code_point) { + return 0; // We only care about the range 129-255 which is Non-ASCII + // latin1 characters. A code_point beneath 0x80 is invalid as + // it is already covered by bytes whose leading bit is zero. + } + *latin_output++ = char(code_point); + pos += 2; + } else { + return 0; + } + } + return latin_output - start; +} + +inline result convert_with_errors(const char *buf, size_t len, + char *latin_output) { + const uint8_t *data = reinterpret_cast(buf); + size_t pos = 0; + char *start{latin_output}; + + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= + len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 + // 1000 1000...etc + if ((v & 0x8080808080808080) == + 0) { // if NONE of these are set, e.g. all of them are zero, then + // everything is ASCII + size_t final_pos = pos + 16; + while (pos < final_pos) { + *latin_output++ = char(buf[pos]); + pos++; + } + continue; + } + } + // suppose it is not an all ASCII byte sequence + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *latin_output++ = char(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == + 0b11000000) { // the first three bits indicate: + // We have a two-byte UTF-8 + if (pos + 1 >= len) { + return result(error_code::TOO_SHORT, pos); + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return result(error_code::TOO_SHORT, pos); + } // checks if the next byte is a valid continuation byte in UTF-8. A + // valid continuation byte starts with 10. + // range check - + uint32_t code_point = + (leading_byte & 0b00011111) << 6 | + (data[pos + 1] & + 0b00111111); // assembles the Unicode code point from the two bytes. + // It does this by discarding the leading 110 and 10 + // bits from the two bytes, shifting the remaining bits + // of the first byte, and then combining the results + // with a bitwise OR operation. 
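// Editor's note: hypothetical sketch of the two-byte case handled by
// scalar::utf8_to_latin1::convert above. Only code points up to 0xFF are
// representable in Latin-1, so a two-byte UTF-8 sequence is accepted only
// when it decodes into the range [0x80, 0xFF].
#include <cstdint>

inline bool decode_two_byte_to_latin1(uint8_t b0, uint8_t b1, char *out) {
  if ((b0 & 0xE0) != 0xC0 || (b1 & 0xC0) != 0x80) {
    return false;  // malformed header or continuation byte
  }
  uint32_t cp = uint32_t(b0 & 0x1F) << 6 | (b1 & 0x3F);
  if (cp < 0x80 || cp > 0xFF) {
    return false;  // overlong encoding or beyond Latin-1
  }
  *out = char(cp);
  return true;
}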
+ if (code_point < 0x80) { + return result(error_code::OVERLONG, pos); + } + if (0xFF < code_point) { + return result(error_code::TOO_LARGE, pos); + } // We only care about the range 129-255 which is Non-ASCII latin1 + // characters + *latin_output++ = char(code_point); + pos += 2; + } else if ((leading_byte & 0b11110000) == 0b11100000) { + // We have a three-byte UTF-8 + return result(error_code::TOO_LARGE, pos); + } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 + // we have a 4-byte UTF-8 word. + return result(error_code::TOO_LARGE, pos); + } else { + // we either have too many continuation bytes or an invalid leading byte + if ((leading_byte & 0b11000000) == 0b10000000) { + return result(error_code::TOO_LONG, pos); + } + + return result(error_code::HEADER_BITS, pos); + } + } + return result(error_code::SUCCESS, latin_output - start); +} + +inline result rewind_and_convert_with_errors(size_t prior_bytes, + const char *buf, size_t len, + char *latin1_output) { + size_t extra_len{0}; + // We potentially need to go back in time and find a leading byte. + // In theory '3' would be sufficient, but sometimes the error can go back + // quite far. + size_t how_far_back = prior_bytes; + // size_t how_far_back = 3; // 3 bytes in the past + current position + // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; } + bool found_leading_bytes{false}; + // important: it is i <= how_far_back and not 'i < how_far_back'. + for (size_t i = 0; i <= how_far_back; i++) { + unsigned char byte = buf[-static_cast(i)]; + found_leading_bytes = ((byte & 0b11000000) != 0b10000000); + if (found_leading_bytes) { + if (i > 0 && byte < 128) { + // If we had to go back and the leading byte is ascii + // then we can stop right away. + return result(error_code::TOO_LONG, 0 - i + 1); + } + buf -= i; + extra_len = i; + break; + } + } + // + // It is possible for this function to return a negative count in its result. + // C++ Standard Section 18.1 defines size_t is in which is described + // in C Standard as . C Standard Section 4.1.5 defines size_t as an + // unsigned integral type of the result of the sizeof operator + // + // An unsigned type will simply wrap round arithmetically (well defined). + // + if (!found_leading_bytes) { + // If how_far_back == 3, we may have four consecutive continuation bytes!!! + // [....] [continuation] [continuation] [continuation] | [buf is + // continuation] Or we possibly have a stream that does not start with a + // leading byte. + return result(error_code::TOO_LONG, 0 - how_far_back); + } + result res = convert_with_errors(buf, len + extra_len, latin1_output); + if (res.error) { + res.count -= extra_len; + } + return res; +} + +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/scalar/utf16_to_latin1/utf16_to_latin1.h */ +#ifndef SIMDUTF_UTF16_TO_LATIN1_H +#define SIMDUTF_UTF16_TO_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_latin1 { + +#include // for std::memcpy + +template +inline size_t convert(const char16_t *buf, size_t len, char *latin_output) { + if (len == 0) { + return 0; + } + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char *current_write = latin_output; + uint16_t word = 0; + uint16_t too_large = 0; + + while (pos < len) { + word = !match_system(big_endian) ? 
utf16::swap_bytes(data[pos]) : data[pos]; + too_large |= word; + *current_write++ = char(word & 0xFF); + pos++; + } + if ((too_large & 0xFF00) != 0) { + return 0; + } + + return current_write - latin_output; +} + +template +inline result convert_with_errors(const char16_t *buf, size_t len, + char *latin_output) { + if (len == 0) { + return result(error_code::SUCCESS, 0); + } + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char *start{latin_output}; + uint16_t word; + + while (pos < len) { + if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check that + // they are Latin1 + uint64_t v1, v2, v3, v4; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + ::memcpy(&v2, data + pos + 4, sizeof(uint64_t)); + ::memcpy(&v3, data + pos + 8, sizeof(uint64_t)); + ::memcpy(&v4, data + pos + 12, sizeof(uint64_t)); + + if (!match_system(big_endian)) { + v1 = (v1 >> 8) | (v1 << (64 - 8)); + } + if (!match_system(big_endian)) { + v2 = (v2 >> 8) | (v2 << (64 - 8)); + } + if (!match_system(big_endian)) { + v3 = (v3 >> 8) | (v3 << (64 - 8)); + } + if (!match_system(big_endian)) { + v4 = (v4 >> 8) | (v4 << (64 - 8)); + } + + if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) { + size_t final_pos = pos + 16; + while (pos < final_pos) { + *latin_output++ = !match_system(big_endian) + ? char(utf16::swap_bytes(data[pos])) + : char(data[pos]); + pos++; + } + continue; + } + } + word = !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; + if ((word & 0xFF00) == 0) { + *latin_output++ = char(word & 0xFF); + pos++; + } else { + return result(error_code::TOO_LARGE, pos); + } + } + return result(error_code::SUCCESS, latin_output - start); +} + +} // namespace utf16_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_latin1/utf16_to_latin1.h */ +/* begin file src/scalar/utf32_to_latin1/utf32_to_latin1.h */ +#ifndef SIMDUTF_UTF32_TO_LATIN1_H +#define SIMDUTF_UTF32_TO_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_latin1 { + +inline size_t convert(const char32_t *buf, size_t len, char *latin1_output) { + const uint32_t *data = reinterpret_cast(buf); + char *start = latin1_output; + uint32_t utf32_char; + size_t pos = 0; + uint32_t too_large = 0; + + while (pos < len) { + utf32_char = (uint32_t)data[pos]; + too_large |= utf32_char; + *latin1_output++ = (char)(utf32_char & 0xFF); + pos++; + } + if ((too_large & 0xFFFFFF00) != 0) { + return 0; + } + return latin1_output - start; +} + +inline result convert_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const uint32_t *data = reinterpret_cast(buf); + char *start{latin1_output}; + size_t pos = 0; + while (pos < len) { + if (pos + 2 <= + len) { // if it is safe to read 8 more bytes, check that they are Latin1 + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF00FFFFFF00) == 0) { + *latin1_output++ = char(buf[pos]); + *latin1_output++ = char(buf[pos + 1]); + pos += 2; + continue; + } + } + uint32_t utf32_char = data[pos]; + if ((utf32_char & 0xFFFFFF00) == + 0) { // Check if the character can be represented in Latin-1 + *latin1_output++ = (char)(utf32_char & 0xFF); + pos++; + } else { + return result(error_code::TOO_LARGE, pos); + }; + } + return result(error_code::SUCCESS, latin1_output - start); +} + +} // namespace utf32_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf32_to_latin1/utf32_to_latin1.h */ 
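Aside (editor's illustration, not part of the vendored simdutf sources or of this patch): the scalar utf16_to_latin1 and utf32_to_latin1 converters above use a branch-free validity check in their happy path: every code unit is OR-ed into an accumulator and the bits above 0xFF are inspected once after the loop, so the hot loop writes the low byte optimistically and never branches per unit. A minimal standalone C++ sketch of that idea follows; the function name latin1_from_utf16_sketch and the "return 0 on failure" convention (borrowed from the converters above) are assumptions made for this example only.

#include <cstddef>
#include <cstdint>

// Returns the number of bytes written, or 0 if any code unit does not fit
// in Latin-1. Mirrors the accumulate-then-check trick used by the scalar
// utf16_to_latin1::convert routine above (host endianness assumed).
inline size_t latin1_from_utf16_sketch(const char16_t *in, size_t len,
                                       char *out) {
  uint16_t too_large = 0;            // OR of every code unit seen so far
  for (size_t i = 0; i < len; i++) {
    too_large |= uint16_t(in[i]);    // any bit >= 0x100 records an error
    out[i] = char(in[i] & 0xFF);     // write the low byte optimistically
  }
  // A single check after the loop decides success or failure.
  return (too_large & 0xFF00) ? 0 : len;
}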
+ +/* begin file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */ +#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H +#define SIMDUTF_VALID_UTF8_TO_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf8_to_latin1 { + +inline size_t convert_valid(const char *buf, size_t len, char *latin_output) { + const uint8_t *data = reinterpret_cast(buf); + + size_t pos = 0; + char *start{latin_output}; + + while (pos < len) { + // try to convert the next block of 16 ASCII bytes + if (pos + 16 <= + len) { // if it is safe to read 16 more bytes, check that they are ascii + uint64_t v1; + ::memcpy(&v1, data + pos, sizeof(uint64_t)); + uint64_t v2; + ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); + uint64_t v{v1 | + v2}; // We are only interested in these bits: 1000 1000 1000 + // 1000, so it makes sense to concatenate everything + if ((v & 0x8080808080808080) == + 0) { // if NONE of these are set, e.g. all of them are zero, then + // everything is ASCII + size_t final_pos = pos + 16; + while (pos < final_pos) { + *latin_output++ = char(buf[pos]); + pos++; + } + continue; + } + } + + // suppose it is not an all ASCII byte sequence + uint8_t leading_byte = data[pos]; // leading byte + if (leading_byte < 0b10000000) { + // converting one ASCII byte !!! + *latin_output++ = char(leading_byte); + pos++; + } else if ((leading_byte & 0b11100000) == + 0b11000000) { // the first three bits indicate: + // We have a two-byte UTF-8 + if (pos + 1 >= len) { + break; + } // minimal bound checking + if ((data[pos + 1] & 0b11000000) != 0b10000000) { + return 0; + } // checks if the next byte is a valid continuation byte in UTF-8. A + // valid continuation byte starts with 10. + // range check - + uint32_t code_point = + (leading_byte & 0b00011111) << 6 | + (data[pos + 1] & + 0b00111111); // assembles the Unicode code point from the two bytes. + // It does this by discarding the leading 110 and 10 + // bits from the two bytes, shifting the remaining bits + // of the first byte, and then combining the results + // with a bitwise OR operation. + *latin_output++ = char(code_point); + pos += 2; + } else { + // we may have a continuation but we do not do error checking + return 0; + } + } + return latin_output - start; +} + +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */ +/* begin file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */ +#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H +#define SIMDUTF_VALID_UTF16_TO_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf16_to_latin1 { + +template +inline size_t convert_valid(const char16_t *buf, size_t len, + char *latin_output) { + const uint16_t *data = reinterpret_cast(buf); + size_t pos = 0; + char *start{latin_output}; + uint16_t word = 0; + + while (pos < len) { + word = !match_system(big_endian) ? 
utf16::swap_bytes(data[pos]) : data[pos]; + *latin_output++ = char(word); + pos++; + } + + return latin_output - start; +} + +} // namespace utf16_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */ +/* begin file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */ +#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H +#define SIMDUTF_VALID_UTF32_TO_LATIN1_H + +namespace simdutf { +namespace scalar { +namespace { +namespace utf32_to_latin1 { + +inline size_t convert_valid(const char32_t *buf, size_t len, + char *latin1_output) { + const uint32_t *data = reinterpret_cast(buf); + char *start = latin1_output; + uint32_t utf32_char; + size_t pos = 0; + + while (pos < len) { + utf32_char = (uint32_t)data[pos]; + + if (pos + 2 <= + len) { // if it is safe to read 8 more bytes, check that they are Latin1 + uint64_t v; + ::memcpy(&v, data + pos, sizeof(uint64_t)); + if ((v & 0xFFFFFF00FFFFFF00) == 0) { + *latin1_output++ = char(buf[pos]); + *latin1_output++ = char(buf[pos + 1]); + pos += 2; + continue; + } else { + // output can not be represented in latin1 + return 0; + } + } + if ((utf32_char & 0xFFFFFF00) == 0) { + *latin1_output++ = char(utf32_char); + } else { + // output can not be represented in latin1 + return 0; + } + pos++; + } + return latin1_output - start; +} + +} // namespace utf32_to_latin1 +} // unnamed namespace +} // namespace scalar +} // namespace simdutf + +#endif +/* end file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */ + +SIMDUTF_PUSH_DISABLE_WARNINGS +SIMDUTF_DISABLE_UNDESIRED_WARNINGS + +#if SIMDUTF_IMPLEMENTATION_ARM64 +/* begin file src/arm64/implementation.cpp */ +/* begin file src/simdutf/arm64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "arm64" +// #define SIMDUTF_IMPLEMENTATION arm64 +/* end file src/simdutf/arm64/begin.h */ +namespace simdutf { +namespace arm64 { +namespace { +#ifndef SIMDUTF_ARM64_H + #error "arm64.h must be included" +#endif +using namespace simd; + +simdutf_really_inline bool is_ascii(const simd8x64 &input) { + simd8 bits = input.reduce_or(); + return bits.max_val() < 0b10000000u; +} + +simdutf_unused simdutf_really_inline simd8 +must_be_continuation(const simd8 prev1, const simd8 prev2, + const simd8 prev3) { + simd8 is_second_byte = prev1 >= uint8_t(0b11000000u); + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller + // is using ^ as well. This will work fine because we only have to report + // errors for cases with 0-1 lead bytes. Multiple lead bytes implies 2 + // overlapping multibyte characters, and if that happens, there is guaranteed + // to be at least *one* lead byte that is part of only 1 other multibyte + // character. The error will be detected there. 
+ return is_second_byte ^ is_third_byte ^ is_fourth_byte; +} + +simdutf_really_inline simd8 +must_be_2_3_continuation(const simd8 prev2, + const simd8 prev3) { + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + return is_third_byte ^ is_fourth_byte; +} + +// common functions for utf8 conversions +simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) { + // Low half contains 10cccccc|1110aaaa + // High half contains 10bbbbbb|10bbbbbb +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t sh = simdutf_make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1, + 4, 4, 7, 7, 10, 10); +#else + const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10}; +#endif + uint8x16_t perm = vqtbl1q_u8(in, sh); + // Split into half vectors. + // 10cccccc|1110aaaa + uint8x8_t perm_low = vget_low_u8(perm); // no-op + // 10bbbbbb|10bbbbbb + uint8x8_t perm_high = vget_high_u8(perm); + // xxxxxxxx 10bbbbbb + uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op + // xxxxxxxx 1110aaaa + uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op + // Assemble with shift left insert. + // xxxxxxaa aabbbbbb + uint16x4_t mid_high = vsli_n_u16(mid, high, 6); + // (perm_low << 8) | (perm_low >> 8) + // xxxxxxxx 10cccccc + uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low)); + // Shift left insert into the low bits + // aaaabbbb bbcccccc + uint16x4_t composed = vsli_n_u16(low, mid_high, 6); + return composed; +} + +simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) { + // Converts 6 2 byte UTF-8 characters to 6 UTF-16 characters. + // Technically this calculates 8, but 6 does better and happens more often + // (The languages which use these codepoints use ASCII spaces so 8 would need + // to be in the middle of a very long word). + + // 10bbbbbb 110aaaaa + uint16x8_t upper = vreinterpretq_u16_u8(in); + // (in << 8) | (in >> 8) + // 110aaaaa 10bbbbbb + uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in)); + // 00000000 000aaaaa + uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F)); + // Assemble with shift left insert. + // 00000aaa aabbbbbb + uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6); + return composed; +} + +simdutf_really_inline uint16x8_t +convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) { + // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters. + // This is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. 
+ uint8x16_t sh = vld1q_u8(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx])); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh)); + // Mask + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000000 00bbbbbb + uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits + // 1 byte: 00000000 00000000 + // 2 byte: 000aaaaa 00000000 + uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits + // Combine with a shift right accumulate + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000aaa aabbbbbb + uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2); + return composed; +} + +/* begin file src/arm64/arm_validate_utf16.cpp */ +template +const char16_t *arm_validate_utf16(const char16_t *input, size_t size) { + const char16_t *end = input + size; + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + while (end - input >= 16) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = + simd16(input + simd16::SIZE / sizeof(char16_t)); + if (!match_system(big_endian)) { + in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0))); + in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1))); + } + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const simd8 in = simd16::pack(t0, t1); + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); + if (surrogates_wordmask == 0) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint64_t V = ~surrogates_wordmask; + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = ((in & v_fc) == v_dc); + const uint64_t H = vH.to_bitmask64(); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint64_t L = ~H & surrogates_wordmask; + + const uint64_t a = + L & (H >> 4); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint64_t b = + a << 4; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint64_t c = V | a | b; // Combine all the masks into the final one. + if (c == ~0ull) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += 16; + } else if (c == 0xfffffffffffffffull) { + // The 15 lower code units of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. 
+ input += 15; + } else { + return nullptr; + } + } + } + return input; +} + +template +const result arm_validate_utf16_with_errors(const char16_t *input, + size_t size) { + const char16_t *start = input; + const char16_t *end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + while (input + 16 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = + simd16(input + simd16::SIZE / sizeof(char16_t)); + + if (!match_system(big_endian)) { + in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0))); + in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1))); + } + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + const simd8 in = simd16::pack(t0, t1); + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); + if (surrogates_wordmask == 0) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint64_t V = ~surrogates_wordmask; + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = ((in & v_fc) == v_dc); + const uint64_t H = vH.to_bitmask64(); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint64_t L = ~H & surrogates_wordmask; + + const uint64_t a = + L & (H >> 4); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint64_t b = + a << 4; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint64_t c = V | a | b; // Combine all the masks into the final one. + if (c == ~0ull) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += 16; + } else if (c == 0xfffffffffffffffull) { + // The 15 lower code units of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. 
+ input += 15; + } else { + return result(error_code::SURROGATE, input - start); + } + } + } + return result(error_code::SUCCESS, input - start); +} +/* end file src/arm64/arm_validate_utf16.cpp */ +/* begin file src/arm64/arm_validate_utf32le.cpp */ + +const char32_t *arm_validate_utf32le(const char32_t *input, size_t size) { + const char32_t *end = input + size; + + const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); + const uint32x4_t offset = vmovq_n_u32(0xffff2000); + const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); + uint32x4_t currentmax = vmovq_n_u32(0x0); + uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); + + while (end - input >= 4) { + const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); + currentmax = vmaxq_u32(in, currentmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); + input += 4; + } + + uint32x4_t is_zero = + veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); + if (vmaxvq_u32(is_zero) != 0) { + return nullptr; + } + + is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (vmaxvq_u32(is_zero) != 0) { + return nullptr; + } + + return input; +} + +const result arm_validate_utf32le_with_errors(const char32_t *input, + size_t size) { + const char32_t *start = input; + const char32_t *end = input + size; + + const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); + const uint32x4_t offset = vmovq_n_u32(0xffff2000); + const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); + uint32x4_t currentmax = vmovq_n_u32(0x0); + uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); + + while (end - input >= 4) { + const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); + currentmax = vmaxq_u32(in, currentmax); + currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); + + uint32x4_t is_zero = + veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); + if (vmaxvq_u32(is_zero) != 0) { + return result(error_code::TOO_LARGE, input - start); + } + + is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (vmaxvq_u32(is_zero) != 0) { + return result(error_code::SURROGATE, input - start); + } + + input += 4; + } + + return result(error_code::SUCCESS, input - start); +} +/* end file src/arm64/arm_validate_utf32le.cpp */ + +/* begin file src/arm64/arm_convert_latin1_to_utf16.cpp */ +template +std::pair +arm_convert_latin1_to_utf16(const char *buf, size_t len, + char16_t *utf16_output) { + const char *end = buf + len; + + while (end - buf >= 16) { + uint8x16_t in8 = vld1q_u8(reinterpret_cast(buf)); + uint16x8_t inlow = vmovl_u8(vget_low_u8(in8)); + if (!match_system(big_endian)) { + inlow = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inlow))); + } + vst1q_u16(reinterpret_cast(utf16_output), inlow); + uint16x8_t inhigh = vmovl_u8(vget_high_u8(in8)); + if (!match_system(big_endian)) { + inhigh = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inhigh))); + } + vst1q_u16(reinterpret_cast(utf16_output + 8), inhigh); + utf16_output += 16; + buf += 16; + } + + return std::make_pair(buf, utf16_output); +} +/* end file src/arm64/arm_convert_latin1_to_utf16.cpp */ +/* begin file src/arm64/arm_convert_latin1_to_utf32.cpp */ +std::pair +arm_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + const char *end = buf + len; + + while (end - buf >= 16) { + uint8x16_t in8 = vld1q_u8(reinterpret_cast(buf)); + uint16x8_t in8low = vmovl_u8(vget_low_u8(in8)); + uint32x4_t in16lowlow = vmovl_u16(vget_low_u16(in8low)); + 
uint32x4_t in16lowhigh = vmovl_u16(vget_high_u16(in8low)); + uint16x8_t in8high = vmovl_u8(vget_high_u8(in8)); + uint32x4_t in8highlow = vmovl_u16(vget_low_u16(in8high)); + uint32x4_t in8highhigh = vmovl_u16(vget_high_u16(in8high)); + vst1q_u32(reinterpret_cast(utf32_output), in16lowlow); + vst1q_u32(reinterpret_cast(utf32_output + 4), in16lowhigh); + vst1q_u32(reinterpret_cast(utf32_output + 8), in8highlow); + vst1q_u32(reinterpret_cast(utf32_output + 12), in8highhigh); + + utf32_output += 16; + buf += 16; + } + + return std::make_pair(buf, utf32_output); +} +/* end file src/arm64/arm_convert_latin1_to_utf32.cpp */ +/* begin file src/arm64/arm_convert_latin1_to_utf8.cpp */ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +std::pair +arm_convert_latin1_to_utf8(const char *latin1_input, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char *end = latin1_input + len; + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + // We always write 16 bytes, of which more than the first 8 bytes + // are valid. A safety margin of 8 is more than sufficient. + while (end - latin1_input >= 16 + 8) { + uint8x16_t in8 = vld1q_u8(reinterpret_cast(latin1_input)); + if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!! + vst1q_u8(utf8_output, in8); + utf8_output += 16; + latin1_input += 16; + continue; + } + + // We just fallback on UTF-16 code. This could be optimized/simplified + // further. + uint16x8_t in16 = vmovl_u8(vget_low_u8(in8)); + // 1. prepare 2-byte values + // input 8-bit word : [aabb|bbbb] x 8 + // expected output : [1100|00aa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [0000|00aa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in16, 2); + // t1 = [0000|00aa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in16, v_003f); + // t3 = [0000|00aa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [1100|00aa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in16, v_007f); + const uint8x16_t utf8_unpacked = + vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in16, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); +#else + const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080}; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + // 6. adjust pointers + latin1_input += 8; + utf8_output += row[0]; + + } // while + + return std::make_pair(latin1_input, reinterpret_cast(utf8_output)); +} +/* end file src/arm64/arm_convert_latin1_to_utf8.cpp */ + +/* begin file src/arm64/arm_convert_utf8_to_latin1.cpp */ +// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. 
Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 16, usually 12). +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint8x16_t in = vld1q_u8(reinterpret_cast(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + + // We first try a few fast paths. + // The obvious first test is ASCII, which actually consumes the full 16. + if (utf8_end_of_code_point_mask == 0xfff) { + // We process in chunks of 12 bytes + vst1q_u8(reinterpret_cast(latin1_output), in); + latin1_output += 12; // We wrote 12 18-bit characters. + return 12; // We consumed 12 bytes. + } + /// We do not have a fast path available, or the fast path is unimportant, so + /// we fallback. + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if (idx >= 64) { + return consumed; + } + // Here we should have (idx < 64), if not, there is a bug in the validation or + // elsewhere. SIX (6) input code-code units this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. Converts 6 + // 1-2 byte UTF-8 characters to 6 UTF-16 characters. This is a relatively easy + // scenario we process SIX (6) input code-code units. The max length in bytes + // of six code code units spanning between 1 and 2 bytes each is 12 bytes. + uint8x16_t sh = vld1q_u8(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh)); + // Mask + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000000 00bbbbbb + uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits + // 1 byte: 00000000 00000000 + // 2 byte: 000aaaaa 00000000 + uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits + // Combine with a shift right accumulate + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000aaa aabbbbbb + uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2); + // writing 8 bytes even though we only care about the first 6 bytes. + uint8x8_t latin1_packed = vmovn_u16(composed); + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + latin1_output += 6; // We wrote 6 bytes. + return consumed; +} +/* end file src/arm64/arm_convert_utf8_to_latin1.cpp */ +/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */ +// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 16, usually 12). 
+template +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint8x16_t in = vld1q_u8(reinterpret_cast(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + + // We first try a few fast paths. + // The obvious first test is ASCII, which actually consumes the full 16. + if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) { + // We process in chunks of 16 bytes + // The routine in simd.h is reused. + simd8 temp{vreinterpretq_s8_u8(in)}; + temp.store_ascii_as_utf16(utf16_output); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } + + // 3 byte sequences are the next most common, as seen in CJK, which has long + // sequences of these. + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte + // UTF-16 code units. + uint16x4_t composed = convert_utf8_3_byte_to_utf16(in); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed))); + } + vst1_u16(reinterpret_cast(utf16_output), composed); + utf16_output += 4; // We wrote 4 16-bit characters. + return 12; // We consumed 12 bytes. + } + + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte + // UTF-16 code units. + uint16x8_t composed = convert_utf8_2_byte_to_utf16(in); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = + vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); + } + vst1q_u16(reinterpret_cast(utf16_output), composed); + + utf16_output += 6; // We wrote 6 16-bit characters. + return 12; // We consumed 12 bytes. + } + + /// We do not have a fast path available, or the fast path is unimportant, so + /// we fallback. + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + + if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = + vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); + } + // Store + vst1q_u16(reinterpret_cast(utf16_output), composed); + utf16_output += 6; // We wrote 6 16-bit characters. + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + uint8x16_t sh = vld1q_u8(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // XXX: depending on the system scalar instructions might be faster. 
+ // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // 1 byte: 00000000 0ccccccc + // 2 byte: xx0bbbbb x0cccccc + // 3 byte: xxbbbbbb x0cccccc + uint16x4_t lowperm = vmovn_u32(perm); + // Partially mask with bic (doesn't require a temporary register unlike and) + // The shift left insert below will clear the top bits. + // 1 byte: 00000000 00000000 + // 2 byte: xx0bbbbb 00000000 + // 3 byte: xxbbbbbb 00000000 + uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00))); + // ASCII + // 1 byte: 00000000 0ccccccc + // 2+byte: 00000000 00cccccc + uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F)); + // Split into narrow vectors. + // 2 byte: 00000000 00000000 + // 3 byte: 00000000 xxxxaaaa + uint16x4_t highperm = vshrn_n_u32(perm, 16); + // Shift right accumulate the middle byte + // 1 byte: 00000000 0ccccccc + // 2 byte: 00xx0bbb bbcccccc + // 3 byte: 00xxbbbb bbcccccc + uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2); + // Shift left and insert the top 4 bits, overwriting the garbage + // 1 byte: 00000000 0ccccccc + // 2 byte: 00000bbb bbcccccc + // 3 byte: aaaabbbb bbcccccc + uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed))); + } + vst1_u16(reinterpret_cast(utf16_output), composed); + + utf16_output += 4; // We wrote 4 16-bit codepoints + return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but + // it is easier when we can assume they are all pairs. This version does + // not use the LUT, but 4 byte sequences are less common and the overhead + // of the extra memory access is less important than the early branch + // overhead in shorter sequences. + + // Swap byte pairs + // 10dddddd 10cccccc|10bbbbbb 11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + uint8x16_t swap = vrev16q_u8(in); + // Shift left 2 bits + // cccccc00 dddddd00 xxxxxxxx bbbbbb00 + uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2)); + // Create a magic number containing the low 2 bits of the trail surrogate + // and all the corrections needed to create the pair. 
UTF-8 4b prefix = + // -0x0000|0xF000 surrogate offset = -0x0000|0x0040 (0x10000 << 6) + // surrogate high = +0x0000|0xD800 + // surrogate low = +0xDC00|0x0000 + // ------------------------------- + // = +0xDC00|0xE7C0 + uint32x4_t magic = vmovq_n_u32(0xDC00E7C0); + // Generate unadjusted trail surrogate minus lowest 2 bits + // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00 + uint32x4_t trail = + vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift); + // Insert low 2 bits of trail surrogate to magic number for later + // 11011100 00000000 11100111 110000cc + uint16x8_t magic_with_low_2 = + vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30)); + // Generate lead surrogate + // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx + uint32x4_t lead = vreinterpretq_u32_u16( + vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6)); + // Mask out lead + // 000000cc ccdddddd|xxxxxxxx xxxxxxxx + lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF))); + // Blend pairs + // 000000cc ccdddddd|11110aaa bbbbbb00 + uint16x8_t blend = vreinterpretq_u16_u32( + vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead)); + // Add magic number to finish the result + // 110111CC CCDDDDDD|110110AA BBBBBBCC + uint16x8_t composed = vaddq_u16(blend, magic_with_low_2); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = + vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); + } + uint16_t buffer[8]; + vst1q_u16(reinterpret_cast(buffer), composed); + for (int k = 0; k < 6; k++) { + utf16_output[k] = buffer[k]; + } // the loop might compiler to a couple of instructions. + utf16_output += 6; // We wrote 3 32-bit surrogate pairs. + return 12; // We consumed 12 bytes. + } + // 3 1-4 byte sequences + uint8x16_t sh = vld1q_u8(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx])); + + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 3 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // added to fix issue https://github.com/simdutf/simdutf/issues/514 + // We only want to write 2 * 16-bit code units when that is actually what we + // have. Unfortunately, we cannot trust the input. So it is possible to get + // 0xff as an input byte and it should not result in a surrogate pair. We + // need to check for that. + uint32_t permbuffer[4]; + vst1q_u32(permbuffer, perm); + // Mask the low and middle bytes + // 00000000 00000000 00000000 0ddddddd + uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f)); + // Because the surrogates need more work, the high surrogate is computed + // first. + uint32x4_t middlehigh = vshlq_n_u32(perm, 2); + // 00000000 00000000 00cccccc 00000000 + uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00)); + // Start assembling the sequence. Since the 4th byte is in the same position + // as it would be in a surrogate and there is no dependency, shift left + // instead of right. 
3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte: + // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx + uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh); + // Top 16 bits contains the high ten bits of the surrogate pair before + // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa + // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction + uint32x4_t abc = + vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4)); + // Combine the low 6 or 7 bits by a shift right accumulate + // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct + // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o + // correction + uint32x4_t composed = vsraq_n_u32(ascii, abc, 6); + // After this is for surrogates + // Blend the low and high surrogates + // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd + uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed); + // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits + // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte: + // 11110aaa bbbbbbcc|000000cc ccdddddd + uint16x8_t masked_pair = vreinterpretq_u16_u32( + vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF)))); + // Correct the remaining UTF-8 prefix, surrogate offset, and add the + // surrogate prefixes in one magic 16-bit addition. similar magic number but + // without the continue byte adjust and halfword swapped UTF-8 4b prefix = + // -0xF000|0x0000 surrogate offset = -0x0040|0x0000 (0x10000 << 6) + // surrogate high = +0xD800|0x0000 + // surrogate low = +0x0000|0xDC00 + // ----------------------------------- + // = +0xE7C0|0xDC00 + uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00)); + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete + uint32x4_t surrogates = + vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic)); + // If the high bit is 1 (s32 less than zero), this needs a surrogate pair + uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm)); + + // Select either the 4 byte surrogate pair or the 2 byte solo codepoint + // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD + uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed); + // Byte swap if necessary + if (!match_system(big_endian)) { + selected = + vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected))); + } + // Attempting to shuffle and store would be complex, just scalarize. + uint32_t buffer[4]; + vst1q_u32(buffer, selected); + // Test for the top bit of the surrogate mask. Remove due to issue 514 + // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : + // 0x00800000; + for (size_t i = 0; i < 3; i++) { + // Surrogate + // Used to be if (buffer[i] & SURROGATE_MASK) { + // See discussion above. + // patch for issue https://github.com/simdutf/simdutf/issues/514 + if ((permbuffer[i] & 0xf8000000) == 0xf0000000) { + utf16_output[0] = uint16_t(buffer[i] >> 16); + utf16_output[1] = uint16_t(buffer[i] & 0xFFFF); + utf16_output += 2; + } else { + utf16_output[0] = uint16_t(buffer[i] & 0xFFFF); + utf16_output++; + } + } + return consumed; + } else { + // here we know that there is an error but we do not handle errors + return 12; + } +} +/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */ +/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */ +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. 
Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_out) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint32_t *&utf32_output = reinterpret_cast(utf32_out); + uint8x16_t in = vld1q_u8(reinterpret_cast(input)); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + if (utf8_end_of_code_point_mask == 0xfff) { + // We process in chunks of 12 bytes. + // use fast implementation in src/simdutf/arm64/simd.h + // Ideally the compiler can keep the tables in registers. + simd8 temp{vreinterpretq_s8_u8(in)}; + temp.store_ascii_as_utf32_tbl(utf32_out); + utf32_output += 12; // We wrote 12 32-bit characters. + return 12; // We consumed 12 bytes. + } + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte + // UTF-32 code units. Convert to UTF-16 + uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in); + // Zero extend and store via ST2 with a zero. + uint16x4x2_t interleaver = {{composed_utf16, vmov_n_u16(0)}}; + vst2_u16(reinterpret_cast(utf32_output), interleaver); + utf32_output += 4; // We wrote 4 32-bit characters. + return 12; // We consumed 12 bytes. + } + + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if (input_utf8_end_of_code_point_mask == 0xaaa) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte + // UTF-32 code units. Convert to UTF-16 + uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in); + // Zero extend and store via ST2 with a zero. + uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}}; + vst2q_u16(reinterpret_cast(utf32_output), interleaver); + utf32_output += 6; // We wrote 6 32-bit characters. + return 12; // We consumed 12 bytes. + } + /// Either no fast path or an unimportant fast path. + + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + + if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx); + // Zero extend and store with ST2 and zero + uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}}; + vst2q_u16(reinterpret_cast(utf32_output), interleaver); + utf32_output += 6; // We wrote 6 32-bit characters. + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. 
+ uint8x16_t sh = vld1q_u8(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // Shuffle + // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // Split + // 00000000 00000000 0ccccccc + uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); // 6 or 7 bits + // Note: unmasked + // xxxxxxxx aaaaxxxx xxxxxxxx + uint32x4_t high = vshrq_n_u32(perm, 4); // 4 bits + // Use 16 bit bic instead of and. + // The top bits will be corrected later in the bsl + // 00000000 10bbbbbb 00000000 + uint32x4_t middle = vreinterpretq_u32_u16( + vbicq_u16(vreinterpretq_u16_u32(perm), + vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits + // Combine low and middle with shift right accumulate + // 00000000 00xxbbbb bbcccccc + uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2); + // Insert top 4 bits from high byte with bitwise select + // 00000000 aaaabbbb bbcccccc + uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid); + vst1q_u32(utf32_output, composed); + utf32_output += 4; // We wrote 4 32-bit characters. + return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-32 code units. This uses the same method as the fixed 3 byte + // version, reversing and shift left insert. However, there is no need for + // a shuffle mask now, just rev16 and rev32. + // + // This version does not use the LUT, but 4 byte sequences are less common + // and the overhead of the extra memory access is less important than the + // early branch overhead in shorter sequences, so it comes last. + + // Swap pairs of bytes + // 10dddddd|10cccccc|10bbbbbb|11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in)); + // Shift left and insert + // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb + uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6); + // Swap 16-bit lanes + // xxxxcccc ccdddddd xxxxxxxa aabbbbbb + // xxxxxxxa aabbbbbb xxxxcccc ccdddddd + uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1)); + // Shift insert again + // xxxxxxxx xxxaaabb bbbbcccc ccdddddd + uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12); + // Clear the garbage + // 00000000 000aaabb bbbbcccc ccdddddd + uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF)); + // Store + vst1q_u32(utf32_output, composed); + + utf32_output += 3; // We wrote 3 32-bit characters. + return 12; // We consumed 12 bytes. + } + // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit + // due to surrogates no longer being involved. + uint8x16_t sh = vld1q_u8(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx])); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 2 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); + // Ascii + uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); + uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00)); + // When converting the way we do, the 3 byte prefix will be interpreted as + // the 18th bit being set, since the code would interpret the lead byte + // (0b1110bbbb) as a continuation byte (0b10bbbbbb). 
To fix this, we can + // either xor or do an 8 bit add of the 6th bit shifted right by 1. Since + // NEON has shift right accumulate, we use that. + // 4 byte 3 byte + // 10bbbbbb 1110bbbb + // 00000000 01000000 6th bit + // 00000000 00100000 shift right + // 10bbbbbb 0000bbbb add + // 00bbbbbb 0000bbbb mask + uint8x16_t correction = + vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000))); + uint32x4_t corrected = vreinterpretq_u32_u8( + vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1)); + // 00000000 00000000 0000cccc ccdddddd + uint32x4_t cd = vsraq_n_u32(ascii, middle, 2); + // Insert twice + // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx + uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6), + vshrq_n_u32(corrected, 4)); + // 00000000 000aaabb bbbbcccc ccdddddd + uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab); + // Store + vst1q_u32(utf32_output, composed); + utf32_output += 3; // We wrote 3 32-bit characters. + return consumed; + } else { + // here we know that there is an error but we do not handle errors + return 12; + } +} +/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */ + +/* begin file src/arm64/arm_convert_utf16_to_latin1.cpp */ + +template +std::pair +arm_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + while (end - buf >= 8) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); + } + if (vmaxvq_u16(in) <= 0xff) { + // 1. pack the bytes + uint8x8_t latin1_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. adjust pointers + buf += 8; + latin1_output += 8; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} + +template +std::pair +arm_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + while (end - buf >= 8) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); + } + if (vmaxvq_u16(in) <= 0xff) { + // 1. pack the bytes + uint8x8_t latin1_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. adjust pointers + buf += 8; + latin1_output += 8; + } else { + // Let us do a scalar fallback. + for (int k = 0; k < 8; k++) { + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); +} +/* end file src/arm64/arm_convert_utf16_to_latin1.cpp */ +/* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. 
+ + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it is an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair +arm_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + const char16_t *end = buf + len; + + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + + while (end - buf >= 8) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); + } + + const uint16x8_t surrogates_bytemask = + vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code + // units + vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); + vst1q_u32(utf32_output + 4, vmovl_high_u16(in)); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(buf, reinterpret_cast(utf32_output)); +} + +/* + Returns a pair: a result struct and utf8_output. 
+ If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ +template +std::pair +arm_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + const char16_t *start = buf; + const char16_t *end = buf + len; + + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + + while ((end - buf) >= 8) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); + } + + const uint16x8_t surrogates_bytemask = + vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code + // units + vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); + vst1q_u32(utf32_output + 4, vmovl_high_u16(in)); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf32_output)); +} +/* end file src/arm64/arm_convert_utf16_to_utf32.cpp */ +/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. + + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. + + Ad 1. + + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it is an ASCII + char) or 2) two UTF8 bytes. + + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. 
+ + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair +arm_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char16_t *end = buf + len; + + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); + } + if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII + // characters. + uint16x8_t nextin = + vld1q_u16(reinterpret_cast(buf) + 8); + if (!match_system(big_endian)) { + nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin))); + } + if (vmaxvq_u16(nextin) > 0x7F) { + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); + // 2. store (16 bytes) + vst1q_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + } + + if (vmaxvq_u16(in) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); + const uint8x16_t utf8_unpacked = + vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); + // 3. 
prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); +#else + const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080}; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } + const uint16x8_t surrogates_bytemask = + vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = simdutf_make_uint16x8_t( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
+      */
+      /**
+       * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce:
+       * t2 => [0ccc|cccc] [10cc|cccc]
+       * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb])
+       */
+#define simdutf_vec(x) vmovq_n_u16(static_cast<uint16_t>(x))
+      // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc]
+      const uint16x8_t t0 = vreinterpretq_u16_u8(
+          vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even)));
+      // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc]
+      const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111));
+      // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc]
+      const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000));
+
+      // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa]
+      const uint16x8_t s0 = vshrq_n_u16(in, 12);
+      // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000]
+      const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000));
+      // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000]
+      const uint16x8_t s1s = vshlq_n_u16(s1, 2);
+      // [00bb|bbbb|0000|aaaa]
+      const uint16x8_t s2 = vorrq_u16(s0, s1s);
+      // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa]
+      const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000));
+      const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF);
+      const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff);
+      const uint16x8_t m0 =
+          vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask);
+      const uint16x8_t s4 = veorq_u16(s3, m0);
+#undef simdutf_vec
+
+      // 4. expand code units 16-bit => 32-bit
+      const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4));
+      const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4));
+
+      // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle
+      const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F);
+      const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f);
+#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO
+      const uint16x8_t onemask = simdutf_make_uint16x8_t(
+          0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000);
+      const uint16x8_t twomask = simdutf_make_uint16x8_t(
+          0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000);
+#else
+      const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040,
+                                  0x0100, 0x0400, 0x1000, 0x4000};
+      const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080,
+                                  0x0200, 0x0800, 0x2000, 0x8000};
+#endif
+      const uint16x8_t combined =
+          vorrq_u16(vandq_u16(one_byte_bytemask, onemask),
+                    vandq_u16(one_or_two_bytes_bytemask, twomask));
+      const uint16_t mask = vaddvq_u16(combined);
+      // The following fast path may or may not be beneficial.
+      /*if(mask == 0) {
+        // We only have three-byte code units. Use fast path.
+ const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(buf, reinterpret_cast(utf8_output)); +} + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. 
+*/ +template +std::pair +arm_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char16_t *start = buf; + const char16_t *end = buf + len; + + const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); + if (!match_system(big_endian)) { + in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); + } + if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII + // characters. + uint16x8_t nextin = + vld1q_u16(reinterpret_cast(buf) + 8); + if (!match_system(big_endian)) { + nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin))); + } + if (vmaxvq_u16(nextin) > 0x7F) { + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(in); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); + // 2. store (16 bytes) + vst1q_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + } + + if (vmaxvq_u16(in) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); + const uint8x16_t utf8_unpacked = + vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); +#else + const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080}; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } + const uint16x8_t surrogates_bytemask = + vceqq_u16(vandq_u16(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. 
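Both the SIMD paths and the scalar fallbacks in these UTF-16 to UTF-8 routines implement the same code-unit mapping: one byte below 0x80, two bytes below 0x800, three bytes for the rest of the BMP, and four bytes for a surrogate pair. For reference, here is a minimal standalone sketch of that mapping; it is not part of the vendored simdutf sources, and `encode_one` and the sample input are illustrative only.

```cpp
// Standalone illustration, not simdutf code: the scalar UTF-16 -> UTF-8
// mapping that the vector paths and scalar fallbacks in this file share.
#include <cstdint>
#include <cstdio>
#include <string>

// Encodes the code point starting at in[0] (consuming a low surrogate from
// in[1] when needed). Returns the number of char16_t consumed, 0 on error.
static size_t encode_one(const char16_t *in, size_t available, std::string &out) {
  const uint16_t word = uint16_t(in[0]);
  if ((word & 0xFF80) == 0) { // 1 byte: 0xxxxxxx
    out += char(word);
    return 1;
  } else if ((word & 0xF800) == 0) { // 2 bytes: 110xxxxx 10xxxxxx
    out += char((word >> 6) | 0b11000000);
    out += char((word & 0b111111) | 0b10000000);
    return 1;
  } else if ((word & 0xF800) != 0xD800) { // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
    out += char((word >> 12) | 0b11100000);
    out += char(((word >> 6) & 0b111111) | 0b10000000);
    out += char((word & 0b111111) | 0b10000000);
    return 1;
  }
  // Surrogate pair -> 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  if (available < 2) return 0;
  const uint16_t diff = uint16_t(word - 0xD800);
  const uint16_t diff2 = uint16_t(uint16_t(in[1]) - 0xDC00);
  if ((diff | diff2) > 0x3FF) return 0; // not a valid high/low surrogate pair
  const uint32_t value = (uint32_t(diff) << 10) + diff2 + 0x10000;
  out += char((value >> 18) | 0b11110000);
  out += char(((value >> 12) & 0b111111) | 0b10000000);
  out += char(((value >> 6) & 0b111111) | 0b10000000);
  out += char((value & 0b111111) | 0b10000000);
  return 2;
}

int main() {
  const char16_t input[] = u"A\u00E9\u20AC\U0001F600"; // 1-, 2-, 3- and 4-byte outputs
  const size_t units = sizeof(input) / sizeof(input[0]) - 1; // drop the terminator
  std::string utf8;
  for (size_t i = 0; i < units;) {
    const size_t used = encode_one(input + i, units - i, utf8);
    if (used == 0) return 1;
    i += used;
  }
  std::printf("%zu UTF-8 bytes\n", utf8.size()); // prints: 10 UTF-8 bytes
}
```

The vector code reaches the same result without branching per code unit: the `one_byte_bytemask` and `one_or_two_bytes_bytemask` comparisons select between these byte layouts for eight code units at a time, and the `pack_1_2_3_utf8_bytes` table supplies the shuffle that drops the unused bytes.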
+ if (vmaxvq_u16(surrogates_bytemask) == 0) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = simdutf_make_uint16x8_t( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = vreinterpretq_u16_u8( + vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); + const uint16x8_t m0 = + vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000); + const uint16x8_t twomask = simdutf_make_uint16x8_t( + 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000); +#else + const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0100, 0x0400, 0x1000, 0x4000}; + const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080, + 0x0200, 0x0800, 0x2000, 0x8000}; +#endif + const uint16x8_t combined = + vorrq_u16(vandq_u16(one_byte_bytemask, onemask), + vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) + ? 
scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + reinterpret_cast(utf8_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); +} +/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */ + +/* begin file src/arm64/arm_base64.cpp */ +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + */ + +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + // credit: Wojciech Muła + uint8_t *out = (uint8_t *)dst; + constexpr static uint8_t source_table[64] = { + 'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D', + 'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W', + 'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p', + '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8', + 'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/', + }; + constexpr static uint8_t source_table_url[64] = { + 'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D', + 'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W', + 'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p', + '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8', + 'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_', + }; + const uint8x16_t v3f = vdupq_n_u8(0x3f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + // When trying to load a uint8_t array, Visual Studio might + // error with: error C2664: '__n128x4 neon_ld4m_q8(const char *)': + // cannot convert argument 1 from 'const uint8_t [64]' to 'const char * + const uint8x16x4_t table = vld4q_u8( + (reinterpret_cast(options & base64_url) ? source_table_url + : source_table)); +#else + const uint8x16x4_t table = + vld4q_u8((options & base64_url) ? 
source_table_url : source_table); +#endif + size_t i = 0; + for (; i + 16 * 3 <= srclen; i += 16 * 3) { + const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i); + uint8x16x4_t result; + result.val[0] = vshrq_n_u8(in.val[0], 2); + result.val[1] = + vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), v3f); + result.val[2] = + vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), v3f); + result.val[3] = vandq_u8(in.val[2], v3f); + result.val[0] = vqtbl4q_u8(table, result.val[0]); + result.val[1] = vqtbl4q_u8(table, result.val[1]); + result.val[2] = vqtbl4q_u8(table, result.val[2]); + result.val[3] = vqtbl4q_u8(table, result.val[3]); + vst4q_u8(out, result); + out += 64; + } + out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, + options); + + return size_t((char *)out - dst); +} + +static inline void compress(uint8x16_t data, uint16_t mask, char *output) { + if (mask == 0) { + vst1q_u8((uint8_t *)output, data); + return; + } + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + uint64x2_t compactmasku64 = {tables::base64::thintable_epi8[mask1], + tables::base64::thintable_epi8[mask2]}; + uint8x16_t compactmask = vreinterpretq_u8_u64(compactmasku64); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t off = + simdutf_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8); +#else + const uint8x16_t off = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; +#endif + + compactmask = vaddq_u8(compactmask, off); + uint8x16_t pruned = vqtbl1q_u8(data, compactmask); + + int pop1 = tables::base64::BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. + compactmask = vld1q_u8(tables::base64::pshufb_combine_table + pop1 * 8); + uint8x16_t answer = vqtbl1q_u8(pruned, compactmask); + vst1q_u8((uint8_t *)output, answer); +} + +struct block64 { + uint8x16_t chunks[4]; +}; + +static_assert(sizeof(block64) == 64, "block64 is not 64 bytes"); +template uint64_t to_base64_mask(block64 *b, bool *error) { + uint8x16_t v0f = vdupq_n_u8(0xf); + + uint8x16_t underscore0, underscore1, underscore2, underscore3; + if (base64_url) { + underscore0 = vceqq_u8(b->chunks[0], vdupq_n_u8(0x5f)); + underscore1 = vceqq_u8(b->chunks[1], vdupq_n_u8(0x5f)); + underscore2 = vceqq_u8(b->chunks[2], vdupq_n_u8(0x5f)); + underscore3 = vceqq_u8(b->chunks[3], vdupq_n_u8(0x5f)); + } else { + (void)underscore0; + (void)underscore1; + (void)underscore2; + (void)underscore3; + } + + uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f); + uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f); + uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f); + uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f); + + // Needed by the decoding step. 
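The `to_base64_mask` routine that begins above classifies all 64 input bytes at once, reporting which of them are not base64 characters while also remapping the valid ones to their 6-bit values further down. As a rough mental model, here is a standalone sketch of the mask it produces for the standard alphabet; this is not the library's implementation, which works from the nibble lookup tables below.

```cpp
// Standalone sketch, not simdutf code: the effect of to_base64_mask for the
// standard alphabet, computed one byte at a time. Bit i of the result is set
// when byte i of the 64-byte block is not a base64 character.
#include <cstdint>
#include <cstring>

static uint64_t scalar_bad_char_mask(const char block[64]) {
  uint64_t bad = 0;
  for (int i = 0; i < 64; i++) {
    const unsigned char c = (unsigned char)block[i];
    const bool ok = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') ||
                    (c >= '0' && c <= '9') || c == '+' || c == '/';
    if (!ok) {
      bad |= (uint64_t(1) << i);
    }
  }
  return bad;
}

int main() {
  char block[64];
  std::memset(block, 'A', sizeof(block));
  block[10] = ' '; // a space: not part of the alphabet, flagged for removal
  block[20] = '$'; // a genuinely invalid character
  return scalar_bad_char_mask(block) ==
                 ((uint64_t(1) << 10) | (uint64_t(1) << 20))
             ? 0
             : 1;
}
```

The real routine additionally distinguishes ignorable bytes (such as whitespace) from hard errors through the `error` flag, and the caller squeezes the flagged bytes out with `compress_block` before decoding.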
+ uint8x16_t hi_nibbles0 = vshrq_n_u8(b->chunks[0], 4); + uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4); + uint8x16_t hi_nibbles2 = vshrq_n_u8(b->chunks[2], 4); + uint8x16_t hi_nibbles3 = vshrq_n_u8(b->chunks[3], 4); + uint8x16_t lut_lo; +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + if (base64_url) { + lut_lo = + simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, + 0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4); + } else { + lut_lo = + simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, + 0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4); + } +#else + if (base64_url) { + lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, + 0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4}; + } else { + lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, + 0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4}; + } +#endif + uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0); + uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1); + uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2); + uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3); + uint8x16_t lut_hi; +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + if (base64_url) { + lut_hi = + simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20); + } else { + lut_hi = + simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20); + } +#else + if (base64_url) { + lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20}; + } else { + lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, + 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20}; + } +#endif + uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0); + uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1); + uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_nibbles2); + uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_nibbles3); + + if (base64_url) { + hi0 = vbicq_u8(hi0, underscore0); + hi1 = vbicq_u8(hi1, underscore1); + hi2 = vbicq_u8(hi2, underscore2); + hi3 = vbicq_u8(hi3, underscore3); + } + + uint8_t checks = + vmaxvq_u8(vorrq_u8(vorrq_u8(vandq_u8(lo0, hi0), vandq_u8(lo1, hi1)), + vorrq_u8(vandq_u8(lo2, hi2), vandq_u8(lo3, hi3)))); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint8x16_t bit_mask = + simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); +#else + const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, + 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; +#endif + uint64_t badcharmask = 0; + *error = checks > 0x3; + if (checks) { + // Add each of the elements next to each other, successively, to stuff each + // 8 byte mask into one. 
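The comment above refers to a standard AArch64 idiom: NEON has no direct equivalent of x86's `movemask`, so byte-wise comparison results are ANDed with per-lane bit weights and then summed horizontally. Below is a minimal standalone sketch of the idea for a single register; the library code just below does the same for four registers at once with a chain of `vpaddq_u8`. The sketch compiles only for AArch64 targets and is not part of simdutf.

```cpp
// AArch64-only sketch, not simdutf code: turn a 16-lane comparison result
// (0xFF or 0x00 per lane) into a 16-bit mask, one bit per lane.
#include <arm_neon.h>
#include <cstdio>

static uint16_t neon_movemask_u8(uint8x16_t cmp) {
  // Weight lane i with bit (i % 8); the low and high halves are summed
  // separately so the weights never collide.
  const uint8x16_t bit_mask = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
                               0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80};
  const uint8x16_t weighted = vandq_u8(cmp, bit_mask);
  const uint16_t lo = vaddv_u8(vget_low_u8(weighted));  // lanes 0..7
  const uint16_t hi = vaddv_u8(vget_high_u8(weighted)); // lanes 8..15
  return uint16_t(lo | (hi << 8));
}

int main() {
  uint8_t bytes[16] = {0};
  bytes[0] = bytes[5] = bytes[13] = '*';
  const uint8x16_t is_star = vceqq_u8(vld1q_u8(bytes), vdupq_n_u8('*'));
  std::printf("%#x\n", neon_movemask_u8(is_star)); // prints 0x2021 (bits 0, 5, 13)
}
```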
+ uint8x16_t test0 = vtstq_u8(lo0, hi0); + uint8x16_t test1 = vtstq_u8(lo1, hi1); + uint8x16_t test2 = vtstq_u8(lo2, hi2); + uint8x16_t test3 = vtstq_u8(lo3, hi3); + uint8x16_t sum0 = + vpaddq_u8(vandq_u8(test0, bit_mask), vandq_u8(test1, bit_mask)); + uint8x16_t sum1 = + vpaddq_u8(vandq_u8(test2, bit_mask), vandq_u8(test3, bit_mask)); + sum0 = vpaddq_u8(sum0, sum1); + sum0 = vpaddq_u8(sum0, sum0); + badcharmask = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + } + // This is the transformation step that can be done while we are waiting for + // sum0 + uint8x16_t roll_lut; +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + if (base64_url) { + roll_lut = + simdutf_make_uint8x16_t(0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0); + } else { + roll_lut = + simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0); + } +#else + if (base64_url) { + roll_lut = uint8x16_t{0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; + } else { + roll_lut = uint8x16_t{0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; + } +#endif + uint8x16_t vsecond_last = base64_url ? vdupq_n_u8(0x2d) : vdupq_n_u8(0x2f); + if (base64_url) { + hi_nibbles0 = vbicq_u8(hi_nibbles0, underscore0); + hi_nibbles1 = vbicq_u8(hi_nibbles1, underscore1); + hi_nibbles2 = vbicq_u8(hi_nibbles2, underscore2); + hi_nibbles3 = vbicq_u8(hi_nibbles3, underscore3); + } + uint8x16_t roll0 = vqtbl1q_u8( + roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], vsecond_last), hi_nibbles0)); + uint8x16_t roll1 = vqtbl1q_u8( + roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], vsecond_last), hi_nibbles1)); + uint8x16_t roll2 = vqtbl1q_u8( + roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], vsecond_last), hi_nibbles2)); + uint8x16_t roll3 = vqtbl1q_u8( + roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], vsecond_last), hi_nibbles3)); + b->chunks[0] = vaddq_u8(b->chunks[0], roll0); + b->chunks[1] = vaddq_u8(b->chunks[1], roll1); + b->chunks[2] = vaddq_u8(b->chunks[2], roll2); + b->chunks[3] = vaddq_u8(b->chunks[3], roll3); + return badcharmask; +} + +void copy_block(block64 *b, char *output) { + vst1q_u8((uint8_t *)output, b->chunks[0]); + vst1q_u8((uint8_t *)output + 16, b->chunks[1]); + vst1q_u8((uint8_t *)output + 32, b->chunks[2]); + vst1q_u8((uint8_t *)output + 48, b->chunks[3]); +} + +uint64_t compress_block(block64 *b, uint64_t mask, char *output) { + uint64_t popcounts = + vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0); + uint64_t offsets = popcounts * 0x0101010101010101; + compress(b->chunks[0], uint16_t(mask), output); + compress(b->chunks[1], uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF]); + compress(b->chunks[2], uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF]); + compress(b->chunks[3], uint16_t(mask >> 48), &output[(offsets >> 40) & 0xFF]); + return offsets >> 56; +} + +// The caller of this function is responsible to ensure that there are 64 bytes +// available from reading at src. The data is read into a block64 structure. +void load_block(block64 *b, const char *src) { + b->chunks[0] = vld1q_u8(reinterpret_cast(src)); + b->chunks[1] = vld1q_u8(reinterpret_cast(src) + 16); + b->chunks[2] = vld1q_u8(reinterpret_cast(src) + 32); + b->chunks[3] = vld1q_u8(reinterpret_cast(src) + 48); +} + +// The caller of this function is responsible to ensure that there are 32 bytes +// available from reading at data. It returns a 16-byte value, narrowing with +// saturation the 16-bit words. 
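The `load_satured` helper defined just below narrows sixteen 16-bit code units to bytes with saturating narrowing (`vqmovn`), which clamps rather than truncates. Presumably this is what keeps the char16_t base64 path safe: any code unit above 0xFF becomes 0xFF, which is not a base64 character, so it is later reported as invalid instead of silently aliasing a valid one. A small AArch64-only check of that behaviour, not part of simdutf:

```cpp
// AArch64-only sketch, not simdutf code: vqmovn_u16 clamps out-of-range
// values to 0xFF instead of keeping only their low byte.
#include <arm_neon.h>
#include <cstdio>

int main() {
  const uint16_t words[8] = {'T', 'W', 'F', 'u', 0x0100, 0x263A, 0xFFFF, '='};
  uint8_t narrowed[8];
  vst1_u8(narrowed, vqmovn_u16(vld1q_u16(words)));
  for (int i = 0; i < 8; i++) {
    // 0x0100, 0x263A and 0xFFFF all come out as 0xff.
    std::printf("0x%04x -> 0x%02x\n", words[i], narrowed[i]);
  }
}
```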
+inline uint8x16_t load_satured(const uint16_t *data) { + uint16x8_t in1 = vld1q_u16(data); + uint16x8_t in2 = vld1q_u16(data + 8); + return vqmovn_high_u16(vqmovn_u16(in1), in2); +} + +// The caller of this function is responsible to ensure that there are 128 bytes +// available from reading at src. The data is read into a block64 structure. +void load_block(block64 *b, const char16_t *src) { + b->chunks[0] = load_satured(reinterpret_cast(src)); + b->chunks[1] = load_satured(reinterpret_cast(src) + 16); + b->chunks[2] = load_satured(reinterpret_cast(src) + 32); + b->chunks[3] = load_satured(reinterpret_cast(src) + 48); +} + +// decode 64 bytes and output 48 bytes +void base64_decode_block(char *out, const char *src) { + uint8x16x4_t str = vld4q_u8((uint8_t *)src); + uint8x16x3_t outvec; + outvec.val[0] = + vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4)); + outvec.val[1] = + vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2)); + outvec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]); + vst3q_u8((uint8_t *)out, outvec); +} + +template +full_result +compress_decode_base64(char *dst, const char_type *src, size_t srclen, + base64_options options, + last_chunk_handling_options last_chunk_options) { + const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value; + size_t equallocation = + srclen; // location of the first padding character if any + // skip trailing spaces + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + to_base64[uint8_t(src[srclen - 1])] == 64) { + srclen--; + } + size_t equalsigns = 0; + if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; + srclen--; + equalsigns = 1; + // skip trailing spaces + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + to_base64[uint8_t(src[srclen - 1])] == 64) { + srclen--; + } + if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; + srclen--; + equalsigns = 2; + } + } + if (srclen == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, 0, 0}; + } + const char_type *const srcinit = src; + const char *const dstinit = dst; + const char_type *const srcend = src + srclen; + + constexpr size_t block_size = 10; + char buffer[block_size * 64]; + char *bufferptr = buffer; + if (srclen >= 64) { + const char_type *const srcend64 = src + srclen - 64; + while (src <= srcend64) { + block64 b; + load_block(&b, src); + src += 64; + bool error = false; + uint64_t badcharmask = to_base64_mask(&b, &error); + if (badcharmask) { + if (error) { + src -= 64; + while (src < srcend && scalar::base64::is_eight_byte(*src) && + to_base64[uint8_t(*src)] <= 64) { + src++; + } + if (src < srcend) { + // should never happen + } + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + } + + if (badcharmask != 0) { + // optimization opportunity: check for simple masks like those made of + // continuous 1s followed by continuous 0s. And masks containing a + // single bad character. + bufferptr += compress_block(&b, badcharmask, bufferptr); + } else { + // optimization opportunity: if bufferptr == buffer and mask == 0, we + // can avoid the call to compress_block and decode directly. 
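`compress_block`, used above, writes the four compressed 16-byte chunks at the right output positions with a classic trick: the per-8-byte population counts of the keep mask are packed one count per byte of a `uint64_t`, and multiplying that value by 0x0101010101010101 turns byte k into the running total of counts 0 through k, which is exactly the set of offsets the chunks need. A standalone worked example, not simdutf code:

```cpp
// Standalone sketch, not simdutf code: prefix sums by multiplication, as used
// by compress_block to place each compressed 16-byte chunk in the output.
#include <cassert>
#include <cstdint>

static uint64_t popcount8(uint8_t v) {
  uint64_t n = 0;
  for (; v != 0; v &= uint8_t(v - 1)) {
    n++;
  }
  return n;
}

int main() {
  // Bit i of `mask` marks input byte i (of a 64-byte block) for removal.
  const uint64_t mask = 0x00000000000000F0ULL; // drop bytes 4..7
  const uint64_t kept = ~mask;

  // One popcount per group of 8 input bytes, packed one count per byte
  // (the NEON code gets this from a single vcnt_u8).
  uint64_t popcounts = 0;
  for (int group = 0; group < 8; group++) {
    popcounts |= popcount8(uint8_t(kept >> (8 * group))) << (8 * group);
  }

  // Byte k of the product is the sum of popcount bytes 0..k: a prefix sum.
  const uint64_t offsets = popcounts * 0x0101010101010101ULL;

  assert(((offsets >> 8) & 0xFF) == 12);  // chunk 1 starts after the 12 kept bytes
  assert(((offsets >> 24) & 0xFF) == 28); // chunk 2: 12 + 16
  assert(((offsets >> 40) & 0xFF) == 44); // chunk 3: 28 + 16
  assert((offsets >> 56) == 60);          // total kept bytes, compress_block's return
  return 0;
}
```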
+ copy_block(&b, bufferptr); + bufferptr += 64; + } + if (bufferptr >= (block_size - 1) * 64 + buffer) { + for (size_t i = 0; i < (block_size - 1); i++) { + base64_decode_block(dst, buffer + i * 64); + dst += 48; + } + std::memcpy(buffer, buffer + (block_size - 1) * 64, + 64); // 64 might be too much + bufferptr -= (block_size - 1) * 64; + } + } + } + char *buffer_start = buffer; + // Optimization note: if this is almost full, then it is worth our + // time, otherwise, we should just decode directly. + int last_block = (int)((bufferptr - buffer_start) % 64); + if (last_block != 0 && srcend - src + last_block >= 64) { + while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { + uint8_t val = to_base64[uint8_t(*src)]; + *bufferptr = char(val); + if (!scalar::base64::is_eight_byte(*src) || val > 64) { + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + bufferptr += (val <= 63); + src++; + } + } + + for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { + base64_decode_block(dst, buffer_start); + dst += 48; + } + if ((bufferptr - buffer_start) % 64 != 0) { + while (buffer_start + 4 < bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; + triple = scalar::utf32::swap_bytes(triple); + std::memcpy(dst, &triple, 4); + + dst += 3; + buffer_start += 4; + } + if (buffer_start + 4 <= bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; + triple = scalar::utf32::swap_bytes(triple); + std::memcpy(dst, &triple, 3); + + dst += 3; + buffer_start += 4; + } + // we may have 1, 2 or 3 bytes left and we need to decode them so let us + // backtrack + int leftover = int(bufferptr - buffer_start); + while (leftover > 0) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + src--; + leftover--; + } + } + if (src < srcend + equalsigns) { + full_result r = scalar::base64::base64_tail_decode( + dst, src, srcend - src, equalsigns, options, last_chunk_options); + r.input_count += size_t(src - srcinit); + if (r.error == error_code::INVALID_BASE64_CHARACTER || + r.error == error_code::BASE64_EXTRA_BITS) { + return r; + } else { + r.output_count += size_t(dst - dstinit); + } + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.output_count % 3 == 0) || + ((r.output_count % 3) + 1 + equalsigns != 4)) { + r.error = error_code::INVALID_BASE64_CHARACTER; + r.input_count = equallocation; + } + } + return r; + } + if (equalsigns > 0) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; + } + } + return {SUCCESS, srclen, size_t(dst - dstinit)}; +} +/* end file src/arm64/arm_base64.cpp */ +/* begin file src/arm64/arm_convert_utf32_to_latin1.cpp */ +std::pair +arm_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + while (end - buf >= 8) { + uint32x4_t in1 = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t in2 = vld1q_u32(reinterpret_cast(buf + 4)); + + uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2)); 
+ if (vmaxvq_u16(utf16_packed) <= 0xff) { + // 1. pack the bytes + uint8x8_t latin1_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. adjust pointers + buf += 8; + latin1_output += 8; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} + +std::pair +arm_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *start = buf; + const char32_t *end = buf + len; + + while (end - buf >= 8) { + uint32x4_t in1 = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t in2 = vld1q_u32(reinterpret_cast(buf + 4)); + + uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2)); + + if (vmaxvq_u16(utf16_packed) <= 0xff) { + // 1. pack the bytes + uint8x8_t latin1_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(reinterpret_cast(latin1_output), latin1_packed); + // 3. adjust pointers + buf += 8; + latin1_output += 8; + } else { + // Let us do a scalar fallback. + for (int k = 0; k < 8; k++) { + uint32_t word = buf[k]; + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); +} +/* end file src/arm64/arm_convert_utf32_to_latin1.cpp */ +/* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */ +template +std::pair +arm_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); + const char32_t *end = buf + len; + + uint16x4_t forbidden_bytemask = vmov_n_u16(0x0); + + while (end - buf >= 4) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + + // Check if no bits set above 16th + if (vmaxvq_u32(in) <= 0xFFFF) { + uint16x4_t utf16_packed = vmovn_u32(in); + + const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); + const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); + forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), + vcge_u16(utf16_packed, v_d800)), + forbidden_bytemask); + + if (!match_system(big_endian)) { + utf16_packed = + vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(utf16_packed))); + } + vst1_u16(utf16_output, utf16_packed); + utf16_output += 4; + buf += 4; + } else { + size_t forward = 3; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(word >> 8 | word << 8) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = + uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + // check for invalid input + if (vmaxv_u16(forbidden_bytemask) != 0) { + return std::make_pair(nullptr, reinterpret_cast(utf16_output)); + } + + return std::make_pair(buf, reinterpret_cast(utf16_output)); +} + +template +std::pair +arm_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); + const char32_t *start = buf; + const char32_t *end = buf + len; + + while (end - buf >= 4) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + + // Check if no bits set above 16th + if (vmaxvq_u32(in) <= 0xFFFF) { + uint16x4_t utf16_packed = vmovn_u32(in); + + const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); + const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); + const uint16x4_t forbidden_bytemask = vand_u16( + vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)); + if (vmaxv_u16(forbidden_bytemask) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf16_output)); + } + + if (!match_system(big_endian)) { + utf16_packed = + vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(utf16_packed))); + } + vst1_u16(utf16_output, utf16_packed); + utf16_output += 4; + buf += 4; + } else { + size_t forward = 3; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(word >> 8 | word << 8) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), + reinterpret_cast(utf16_output)); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = + uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf16_output)); +} +/* end file src/arm64/arm_convert_utf32_to_utf16.cpp */ +/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */ +std::pair +arm_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char32_t *end = buf + len; + + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + + uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin < end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf + 4)); + + // Check if no bits set above 16th + if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) + uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); + if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } + + if (vmaxvq_u16(utf16_packed) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16( + vbslq_u16(one_byte_bytemask, utf16_packed, t4)); + // 3. prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); +#else + const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080}; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. 
pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); + forbidden_bytemask = + vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), + vcgeq_u16(utf16_packed, v_d800)), + forbidden_bytemask); + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = simdutf_make_uint16x8_t( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = + vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), + vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = + vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = + vcleq_u16(utf16_packed, v_07ff); + const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), + one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000); + const uint16x8_t twomask = simdutf_make_uint16x8_t( + 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000); +#else + const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0100, 0x0400, 0x1000, 0x4000}; + const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080, + 0x0200, 0x0800, 0x2000, 0x8000}; +#endif + const uint16x8_t combined = + vorrq_u16(vandq_u16(one_byte_bytemask, onemask), + vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
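The UTF-32 converters in this part of the file all pivot on the same surrogate arithmetic: code points in 0xD800..0xDFFF and values above 0x10FFFF are rejected, and the UTF-32 to UTF-16 path above splits supplementary-plane code points into a high/low surrogate pair, which the UTF-16 readers earlier in the file recombine. A standalone round-trip check of that arithmetic, not simdutf code:

```cpp
// Standalone sketch, not simdutf code: the surrogate-pair split used by the
// UTF-32 -> UTF-16 converters above and undone by the UTF-16 readers earlier.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t code_point = 0x1F600; // above the Basic Multilingual Plane
  assert(code_point > 0xFFFF && code_point <= 0x10FFFF);

  const uint32_t v = code_point - 0x10000; // 20 significant bits remain
  const uint16_t high_surrogate = uint16_t(0xD800 + (v >> 10));  // top 10 bits
  const uint16_t low_surrogate = uint16_t(0xDC00 + (v & 0x3FF)); // bottom 10 bits
  assert(high_surrogate == 0xD83D && low_surrogate == 0xDE00);

  // Recombining, as the scalar fallbacks of the UTF-16 converters do:
  const uint32_t decoded = (uint32_t(high_surrogate - 0xD800) << 10) +
                           uint32_t(low_surrogate - 0xDC00) + 0x10000;
  assert(decoded == code_point);
  return 0;
}
```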
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + // check for invalid input + if (vmaxvq_u16(forbidden_bytemask) != 0) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); + } + return std::make_pair(buf, reinterpret_cast(utf8_output)); +} + +std::pair +arm_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char32_t *start = buf; + const char32_t *end = buf + len; + + const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin < end) { + uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); + uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf + 4)); + + // Check if no bits set above 16th + if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) + uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); + if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + uint8x8_t utf8_packed = vmovn_u16(utf16_packed); + // 2. store (8 bytes) + vst1_u8(utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } + + if (vmaxvq_u16(utf16_packed) <= 0x7FF) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); + const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + + // t0 = [000a|aaaa|bbbb|bb00] + const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const uint16x8_t t1 = vandq_u16(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const uint16x8_t t3 = vorrq_u16(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const uint16x8_t t4 = vorrq_u16(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); + const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16( + vbslq_u16(one_byte_bytemask, utf16_packed, t4)); + // 3. 
prepare bitmask for 8-bit lookup +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t mask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); +#else + const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0002, 0x0008, 0x0020, 0x0080}; +#endif + uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const uint8x16_t shuffle = vld1q_u8(row + 1); + const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + + // 5. store bytes + vst1q_u8(utf8_output, utf8_packed); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + + // check for invalid input + const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); + const uint16x8_t forbidden_bytemask = vandq_u16( + vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)); + if (vmaxvq_u16(forbidden_bytemask) != 0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf8_output)); + } + +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t dup_even = simdutf_make_uint16x8_t( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); +#else + const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; +#endif + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). 
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const uint16x8_t t0 = + vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), + vreinterpretq_u8_u16(dup_even))); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000)); + + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + const uint16x8_t s1 = + vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + const uint16x8_t s1s = vshlq_n_u16(s1, 2); + // [00bb|bbbb|0000|aaaa] + const uint16x8_t s2 = vorrq_u16(s0, s1s); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); + const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); + const uint16x8_t one_or_two_bytes_bytemask = + vcleq_u16(utf16_packed, v_07ff); + const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), + one_or_two_bytes_bytemask); + const uint16x8_t s4 = veorq_u16(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); + const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); + const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); +#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO + const uint16x8_t onemask = simdutf_make_uint16x8_t( + 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000); + const uint16x8_t twomask = simdutf_make_uint16x8_t( + 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000); +#else + const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040, + 0x0100, 0x0400, 0x1000, 0x4000}; + const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080, + 0x0200, 0x0800, 0x2000, 0x8000}; +#endif + const uint16x8_t combined = + vorrq_u16(vandq_u16(one_byte_bytemask, onemask), + vandq_u16(one_or_two_bytes_bytemask, twomask)); + const uint16_t mask = vaddvq_u16(combined); + // The following fast path may or may not be beneficial. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. 
+ const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); + vst1q_u8(utf8_output, utf8_0); + utf8_output += 12; + vst1q_u8(utf8_output, utf8_1); + utf8_output += 12; + buf += 8; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); + const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); + + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); + const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + + vst1q_u8(utf8_output, utf8_0); + utf8_output += row0[0]; + vst1q_u8(utf8_output, utf8_1); + utf8_output += row1[0]; + + buf += 8; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); +} +/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */ + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace arm64 { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with +// spaces +template struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 + * (in which case this function fills the buffer with spaces and returns 0. In + * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder + * block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. 
+ */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); + +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text_64(const uint8_t *text) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text(const simd8x64 &in) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + if (buf[i] < ' ') { + buf[i] = '_'; + } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} + +simdutf_unused static char *format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i = 0; i < 64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} + +template +simdutf_really_inline +buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) + : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, + idx{0} {} + +template +simdutf_really_inline size_t buf_block_reader::block_index() { + return idx; +} + +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} + +template +simdutf_really_inline const uint8_t * +buf_block_reader::full_block() const { + return &buf[idx]; +} + +template +simdutf_really_inline size_t +buf_block_reader::get_remainder(uint8_t *dst) const { + if (len == idx) { + return 0; + } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, + STEP_SIZE); // std::memset STEP_SIZE because it is more efficient + // to write out 8 or 16 bytes at once. 
+ std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} + +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_validation { + +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = {255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 0b11110000u - 1, + 0b11100000u - 1, + 0b11000000u - 1}; + const simd8 max_value( + &max_array[sizeof(max_array) - sizeof(simd8)]); + return input.gt_bits(max_value); +} + +struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. 
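// Illustrative sketch, not part of the simdutf sources: how the three table
// lookups in check_special_cases() above combine for one concrete invalid
// pair. For prev1 = 0xED (a three-byte lead) followed by input = 0xA0 (a
// 101_____ continuation) -- the start of a UTF-8-encoded surrogate -- the AND
// of the three rows leaves exactly the SURROGATE bit set, so the pair is
// rejected.
#include <cstdint>
#include <cstdio>

int main() {
  // Bit assignments copied from check_special_cases() above.
  const uint8_t TOO_SHORT = 1 << 0, TOO_LONG = 1 << 1, OVERLONG_3 = 1 << 2,
                TOO_LARGE = 1 << 3, SURROGATE = 1 << 4, OVERLONG_2 = 1 << 5,
                TOO_LARGE_1000 = 1 << 6, TWO_CONTS = 1 << 7;
  const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  // Row 0xE of byte_1_high (prev1 >> 4), row 0xD of byte_1_low (prev1 & 0xF)
  // and row 0xA of byte_2_high (input >> 4), transcribed from the tables above.
  const uint8_t byte_1_high = TOO_SHORT | OVERLONG_3 | SURROGATE;
  const uint8_t byte_1_low = CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE;
  const uint8_t byte_2_high =
      TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE;
  std::printf("error bits = 0x%02x\n",
              byte_1_high & byte_1_low & byte_2_high); // prints 0x10 (SURROGATE)
  return 0;
}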
+ simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. + this->error |= this->prev_incomplete; + } + + simdutf_really_inline void check_next_input(const simd8x64 &input) { + if (simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = + is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; + } + } + + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); +} + +bool generic_validate_utf8(const char *input, size_t length) { + return generic_validate_utf8( + reinterpret_cast(input), length); +} + +/** + * Validates that the string is actual UTF-8 and stops on errors. 
+ */ +template +result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input + count), length - count); + res.count += count; + return res; + } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_utf8_with_errors(const char *input, size_t length) { + return generic_validate_utf8_with_errors( + reinterpret_cast(input), length); +} + +template +bool generic_validate_ascii(const uint8_t *input, size_t length) { + buf_block_reader<64> reader(input, length); + uint8_t blocks[64]{}; + simd::simd8x64 running_or(blocks); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + running_or |= in; + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + running_or |= in; + return running_or.is_ascii(); +} + +bool generic_validate_ascii(const char *input, size_t length) { + return generic_validate_ascii( + reinterpret_cast(input), length); +} + +template +result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) { + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); + + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_ascii_with_errors(const char *input, size_t length) { + return generic_validate_ascii_with_errors( + reinterpret_cast(input), length); +} + +} // namespace utf8_validation +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 
11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + template + simdutf_really_inline size_t convert(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. 
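// Illustrative sketch, not part of the simdutf sources: the mask arithmetic
// used above, on a tiny 8-byte window instead of a 64-byte block. A byte is a
// UTF-8 continuation iff it has the form 10xxxxxx, i.e. its signed value is
// below -64; the end-of-code-point mask is the leading mask shifted right by
// one, because a code point ends exactly one position before the next leading
// byte (the top bit is left for the next window, which is why the loop above
// never consumes the final bytes of a block).
#include <cstdint>
#include <cstdio>

int main() {
  // "a", U+00E9 (C3 A9), "b", U+20AC (E2 82 AC), "c"
  const unsigned char s[8] = {0x61, 0xC3, 0xA9, 0x62, 0xE2, 0x82, 0xAC, 0x63};
  uint8_t continuation = 0;
  for (int i = 0; i < 8; i++) {
    if (int8_t(s[i]) < -64) { // bytes 0x80..0xBF
      continuation |= uint8_t(1u << i);
    }
  }
  const uint8_t leading = uint8_t(~continuation);
  const uint8_t end_of_code_point = uint8_t(leading >> 1);
  std::printf("continuation=0x%02x leading=0x%02x end_of_code_point=0x%02x\n",
              continuation, leading, end_of_code_point); // 0x64 0x9b 0x4d
  return 0;
}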
+ size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert( + in + pos, size - pos, utf16_output); + if (howmany == 0) { + return 0; + } + utf16_output += howmany; + } + return utf16_output - start; + } + + template + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. 
If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf16 { + +using namespace simd; + +template +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char16_t *utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the + // generic directory. + size_t pos = 0; + char16_t *start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the + // mask far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow + // path. Anything that is not a continuation mask is a 'leading byte', + // that is, the start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* + // of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. 
For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16( + input + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + utf16_output += scalar::utf8_to_utf16::convert_valid( + input + pos, size - pos, utf16_output); + return utf16_output - start; +} + +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// transcoding from UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf32 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | 
OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // we have an error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if (howmany == 0) { + return 0; + } + utf32_output += howmany; + } + return utf32_output - start; + } + + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. 
+ const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
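// Sketch of the arithmetic behind the efficiency figure quoted above (not
// part of the simdutf sources): with max_starting_point = (pos + 64) - 12,
// at least 64 - 12 = 52 of the 64 loaded bytes are consumed before the block
// is reloaded, i.e. 52 / 64 = 81.25% in the worst case, which the comments
// round down to "80%".
static_assert((64 - 12) * 100 / 64 == 81,
              "worst case: at least 81% of each 64-byte block is consumed");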
+ } + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if (pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } + } + return result(error_code::SUCCESS, utf32_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_utf32 { + +using namespace simd; + +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char32_t *utf32_output) noexcept { + size_t pos = 0; + char32_t *start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + size_t max_starting_point = (pos + 64) - 12; + while (pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32( + input + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, + utf32_output); + return utf32_output - start; +} + +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +// other functions +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace arm64 { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input.swap_bytes(); + } + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + + scalar::utf16::count_code_points(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
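// Illustrative sketch, not part of the simdutf sources: what the vectorized
// count_code_points() above computes, written as a scalar loop. Each UTF-16
// code point is counted exactly once by skipping low surrogates
// (0xDC00..0xDFFF): a surrogate pair contributes one through its high
// surrogate, and every BMP unit contributes one directly.
#include <cstddef>

static size_t count_utf16_code_points_scalar(const char16_t *in, size_t size) {
  size_t count = 0;
  for (size_t i = 0; i < size; i++) {
    if (in[i] < 0xDC00 || in[i] > 0xDFFF) {
      count++;
    }
  }
  return count;
}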
+ for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input.swap_bytes(); + } + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} + +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, + size_t size) { + return count_code_points(in, size); +} + +simdutf_really_inline void +change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { + size_t pos = 0; + + while (pos < size / 32 * 32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } + + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} + +} // namespace utf16 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf16.h */ +/* begin file src/generic/utf8.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8 { + +using namespace simd; + +simdutf_really_inline size_t count_code_points(const char *in, size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + +simdutf_really_inline size_t utf16_length_from_utf8(const char *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} +} // namespace utf8 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8.h */ +// transcoding from UTF-8 to Latin 1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_latin1 { +using namespace simd; + +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // For UTF-8 to Latin 1, we can allow any ASCII character, and any + // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or + // 0b11000010 and nothing else. 
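// Illustrative sketch, not part of the simdutf sources: the byte accounting
// behind the vectorized utf8_length_from_utf16() above. ASCII units cost one
// byte, other units up to 0x7FF cost two, the remaining BMP units cost three,
// and each surrogate half costs two (a pair therefore costs four) -- exactly
// the 2 * fourbyte + 3 * threebyte + 2 * twobyte + ascii sum computed above.
#include <cstddef>
#include <cstdint>

static size_t utf8_length_from_utf16_scalar(const char16_t *in, size_t size) {
  size_t bytes = 0;
  for (size_t i = 0; i < size; i++) {
    const uint16_t u = uint16_t(in[i]);
    if (u <= 0x7F) {
      bytes += 1; // ASCII
    } else if (u <= 0x7FF) {
      bytes += 2; // two-byte UTF-8
    } else if (u >= 0xD800 && u <= 0xDFFF) {
      bytes += 2; // surrogate half: the full pair becomes four UTF-8 bytes
    } else {
      bytes += 3; // remaining BMP code points
    }
  }
  return bytes;
}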
+ // + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + FORBIDDEN, + // 1110____ ________ + FORBIDDEN, + // 1111____ ________ + FORBIDDEN); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + FORBIDDEN, + // ____0101 ________ + FORBIDDEN, + // ____011_ ________ + FORBIDDEN, FORBIDDEN, + + // ____1___ ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, + // ____1101 ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. 
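As the comment above says, when the target is Latin-1 the only legal non-ASCII sequences are a 0xC2 or 0xC3 lead followed by a single continuation byte; the three lookup tables exist to flag every other combination. A scalar sketch of that restriction, with a hypothetical helper name:

#include <cstdint>
#include <optional>
#include <string>

// Decode UTF-8 that is known to stay within U+0000..U+00FF. Any lead byte
// other than ASCII, 0xC2 or 0xC3 cannot encode a Latin-1 code point.
static std::optional<std::string> utf8_to_latin1_scalar(const std::string &in) {
  std::string out;
  for (size_t i = 0; i < in.size();) {
    const uint8_t b = uint8_t(in[i]);
    if (b < 0x80) {                      // ASCII passes through unchanged
      out.push_back(char(b));
      i += 1;
    } else if ((b == 0xC2 || b == 0xC3) && i + 1 < in.size() &&
               (uint8_t(in[i + 1]) & 0xC0) == 0x80) {
      // 1100001x 10yyyyyy -> code point (x << 6) | yyyyyy, i.e. 0x80..0xFF.
      out.push_back(char(((b & 0x1F) << 6) | (uint8_t(in[i + 1]) & 0x3F)));
      i += 2;
    } else {
      return std::nullopt;               // overlong, surrogate, or above U+00FF
    }
  }
  return out;
}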
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 16; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. 
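The margin computation above reads more easily on its own: walk backwards from the end of the input until 16 leading bytes (8 in the variants that follow) have been seen, and keep the SIMD loop away from that tail so that even invalid input cannot make a masked write overrun the output. A standalone restatement with our own names:

#include <cstddef>
#include <cstdint>

// How many trailing bytes the SIMD loop must leave to the scalar tail so that
// at most `wanted_leading` leading bytes remain past the last SIMD block.
static size_t utf8_safety_margin(const char *in, size_t size,
                                 size_t wanted_leading) {
  size_t leading = 0;
  size_t margin = size;
  while (margin > 0 && leading < wanted_leading) {
    // A leading (or ASCII) byte has a signed value greater than -65.
    leading += (int8_t(in[margin - 1]) > -65);
    margin--;
  }
  return size - margin + 1; // same quantity as `safety_margin` above
}

The returned value is what bounds the main loop, as in the `pos + 64 + safety_margin <= size` condition above.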
+ size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if (howmany == 0) { + return 0; + } + latin1_output += howmany; + } + return latin1_output - start; + } + + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. 
If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + latin1_output += res.count; + } + } + return result(error_code::SUCCESS, latin1_output - start); + } + + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } + +}; // struct utf8_checker +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace arm64 +} // namespace simdutf +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ + +namespace simdutf { +namespace arm64 { +namespace { +namespace utf8_to_latin1 { +using namespace simd; + +simdutf_really_inline size_t convert_valid(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last + // 16 bytes, and if the data is valid, then it is entirely safe because 16 + // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally + // assume that you have valid UTF-8 input, so we are going to go back from the + // end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. 
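The convention used here is that a returned result carries the error position in the input when error is set, and the number of code units written on success. A hedged usage sketch, assuming the usual simdutf front-end entry point of the same name as the member function above:

#include <cstdio>
#include <string>
#include <vector>
#include "simdutf.h"

int main() {
  const std::string utf8 = "caf\xC3\xA9";  // "café"
  std::vector<char> latin1(utf8.size());   // Latin-1 never needs more bytes
  const simdutf::result res = simdutf::convert_utf8_to_latin1_with_errors(
      utf8.data(), utf8.size(), latin1.data());
  if (res.error != simdutf::error_code::SUCCESS) {
    std::printf("bad byte near input offset %zu\n", res.count);
    return 1;
  }
  std::printf("wrote %zu Latin-1 bytes\n", res.count); // 4 for "café"
}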
+ uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (pos < size) { + size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, + latin1_output); + latin1_output += howmany; + } + return latin1_output - start; +} + +} // namespace utf8_to_latin1 +} // namespace +} // namespace arm64 +} // namespace simdutf + // namespace simdutf +/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ + +// placeholder scalars + +// +// Implementation-specific overrides +// +namespace simdutf { +namespace arm64 { + +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + // todo: reimplement as a one-pass algorithm. 
+ int out = 0; + if (validate_utf8(input, length)) { + out |= encoding_type::UTF8; + } + if ((length % 2) == 0) { + if (validate_utf16le(reinterpret_cast(input), + length / 2)) { + out |= encoding_type::UTF16_LE; + } + } + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { + out |= encoding_type::UTF32_LE; + } + } + return out; +} + +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_utf8(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len); +} + +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_ascii(buf, len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return arm64::utf8_validation::generic_validate_ascii_with_errors(buf, len); +} + +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. + return true; + } + const char16_t *tail = arm_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, + len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. + return true; + } + const char16_t *tail = arm_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = arm_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = arm_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. 
+ return true; + } + const char32_t *tail = arm_validate_utf32le(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + return false; + } +} + +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = arm_validate_utf32le_with_errors(buf, len); + if (res.count != len) { + result scalar_res = + scalar::utf32::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + arm_convert_latin1_to_utf8(buf, len, utf8_output); + size_t converted_chars = ret.second - utf8_output; + + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + arm_convert_latin1_to_utf16(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + arm_convert_latin1_to_utf16(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + arm_convert_latin1_to_utf32(buf, len, utf32_output); + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + return arm64::utf8_to_latin1::convert_valid(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) 
const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, + utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output); + if (ret.first.error) { + 
return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_latin1_with_errors(buf, len, + latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function. + return convert_utf16be_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function. 
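Every *_with_errors converter in this block follows the same recipe: run the SIMD kernel over a prefix, let the scalar converter finish (or locate the error in) whatever remains, then rewrite count from an input position into the number of output units written. A schematic version with placeholder types and callbacks of our own:

#include <cstddef>

// Minimal result type mirroring the fields used above.
struct Result {
  int error;     // 0 == success
  size_t count;  // on error: input position; on success: output units written
};

// Schematic composition of a SIMD prefix converter and a scalar tail
// converter; both callbacks stand in for the kernels above.
template <class SimdFn, class ScalarFn>
Result convert_with_errors(const char16_t *in, size_t len, char *out,
                           SimdFn simd_prefix, ScalarFn scalar_tail) {
  auto ret = simd_prefix(in, len, out);   // returns {result, end-of-output ptr}
  if (ret.first.error) {
    return ret.first;                     // error position already correct
  }
  if (ret.first.count != len) {           // valid so far, but not finished
    Result tail = scalar_tail(in + ret.first.count, len - ret.first.count,
                              ret.second);
    if (tail.error) {
      tail.count += ret.first.count;      // report position in the full input
      return tail;
    }
    ret.second += tail.count;
  }
  ret.first.count = size_t(ret.second - out); // now: units written
  return ret.first;
}

The final assignment is why the comments above stress that count is "always the position in the buffer" until the very end.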
+ return convert_utf16le_to_latin1(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} + 
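For reference, the transformation that these UTF-16 to UTF-8 kernels and their scalar fallbacks implement boils down to the following loop, including the surrogate-pair case; this is an illustration of the encoding rules under an assumed little-endian layout, not a copy of the library's scalar path:

#include <cstddef>
#include <cstdint>

// Convert UTF-16 (native/little-endian char16_t values) to UTF-8; returns
// bytes written, or 0 on a lone or ill-formed surrogate, mirroring the
// "return 0" convention of the converters above.
static size_t utf16_to_utf8_scalar(const char16_t *in, size_t len, char *out) {
  char *p = out;
  for (size_t i = 0; i < len; i++) {
    uint32_t cp = uint16_t(in[i]);
    if (cp >= 0xD800 && cp <= 0xDBFF) {          // high surrogate
      if (i + 1 >= len) return 0;
      const uint32_t lo = uint16_t(in[++i]);
      if (lo < 0xDC00 || lo > 0xDFFF) return 0;
      cp = 0x10000 + ((cp - 0xD800) << 10) + (lo - 0xDC00);
    } else if (cp >= 0xDC00 && cp <= 0xDFFF) {
      return 0;                                  // lone low surrogate
    }
    if (cp <= 0x7F) {
      *p++ = char(cp);
    } else if (cp <= 0x7FF) {
      *p++ = char(0xC0 | (cp >> 6));
      *p++ = char(0x80 | (cp & 0x3F));
    } else if (cp <= 0xFFFF) {
      *p++ = char(0xE0 | (cp >> 12));
      *p++ = char(0x80 | ((cp >> 6) & 0x3F));
      *p++ = char(0x80 | (cp & 0x3F));
    } else {
      *p++ = char(0xF0 | (cp >> 18));
      *p++ = char(0x80 | ((cp >> 12) & 0x3F));
      *p++ = char(0x80 | ((cp >> 6) & 0x3F));
      *p++ = char(0x80 | (cp & 0x3F));
    }
  }
  return size_t(p - out);
}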
+simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + if (simdutf_unlikely(len == 0)) { + return 0; + } + std::pair ret = + arm_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + arm_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf32_output; // Set count 
to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + arm_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid( + ret.first, len - (ret.first - buf), ret.second); + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + // optimization opportunity: implement a custom function. 
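The UTF-32 to UTF-8 paths here, and the utf8_length_from_utf32 function further down, rely on a fixed size class per code point: one byte up to U+007F, two up to U+07FF, three up to U+FFFF, four otherwise. A scalar length computation using that rule (helper name ours, valid input assumed):

#include <cstddef>
#include <cstdint>

// UTF-8 bytes needed per UTF-32 code point (assumes valid input, i.e. no
// surrogates and nothing above U+10FFFF).
static size_t utf8_length_from_utf32_scalar(const char32_t *in, size_t len) {
  size_t bytes = 0;
  for (size_t i = 0; i < len; i++) {
    const uint32_t cp = uint32_t(in[i]);
    if (cp <= 0x7F)        bytes += 1;
    else if (cp <= 0x7FF)  bytes += 2;
    else if (cp <= 0xFFFF) bytes += 3;
    else                   bytes += 4;
  }
  return bytes;
}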
+ return convert_utf32_to_utf8(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + arm_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + arm_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + arm_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16le_to_utf32(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const 
char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); +} + +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return count_utf8(buf, len); +} + +simdutf_warn_unused size_t +implementation::latin1_length_from_utf16(size_t length) const noexcept { + return scalar::utf16::latin1_length_from_utf16(length); +} + +simdutf_warn_unused size_t +implementation::latin1_length_from_utf32(size_t length) const noexcept { + return scalar::utf32::latin1_length_from_utf32(length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t length) const noexcept { + // See + // https://lemire.me/blog/2023/05/15/computing-the-utf-8-size-of-a-latin-1-string-quickly-arm-neon-edition/ + // credit to Pete Cawley + const uint8_t *data = reinterpret_cast(input); + uint64_t result = 0; + const int lanes = sizeof(uint8x16_t); + uint8_t rem = length % lanes; + const uint8_t *simd_end = data + (length / lanes) * lanes; + const uint8x16_t threshold = vdupq_n_u8(0x80); + for (; data < simd_end; data += lanes) { + // load 16 bytes + uint8x16_t input_vec = vld1q_u8(data); + // compare to threshold (0x80) + uint8x16_t withhighbit = vcgeq_u8(input_vec, threshold); + // vertical addition + result -= vaddvq_s8(vreinterpretq_s8_u8(withhighbit)); + } + return result + (length / lanes) * lanes + + scalar::latin1::utf8_length_from_latin1((const char *)simd_end, rem); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t +implementation::utf16_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf16_length_from_latin1(length); +} + +simdutf_warn_unused size_t +implementation::utf32_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf32_length_from_latin1(length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +simdutf_warn_unused size_t 
implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f); + const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff); + const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); + const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); + size_t pos = 0; + size_t count = 0; + for (; pos + 4 <= length; pos += 4) { + uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); + const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f); + const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff); + const uint32x4_t two_bytes_bytemask = + veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask); + const uint32x4_t three_bytes_bytemask = + veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask); + + const uint16x8_t reduced_ascii_bytes_bytemask = + vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1)); + const uint16x8_t reduced_two_bytes_bytemask = + vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1)); + const uint16x8_t reduced_three_bytes_bytemask = + vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1)); + + const uint16x8_t compressed_bytemask0 = + vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask); + const uint16x8_t compressed_bytemask1 = + vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask); + + size_t ascii_count = count_ones( + vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0)); + size_t two_bytes_count = count_ones( + vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1)); + size_t three_bytes_count = count_ones( + vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0)); + + count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; + } + return count + + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); + const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); + size_t pos = 0; + size_t count = 0; + for (; pos + 4 <= length; pos += 4) { + uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); + const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff); + const uint16x8_t reduced_bytemask = + vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1)); + const uint16x8_t compressed_bytemask = + vpaddq_u16(reduced_bytemask, reduced_bytemask); + size_t surrogate_count = count_ones( + vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0)); + count += 4 + surrogate_count; + } + return count + + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? 
compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char16_t *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused size_t implementation::base64_length_from_binary( + size_t length, base64_options options) const noexcept { + return scalar::base64::base64_length_from_binary(length, options); +} + +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + return encode_base64(output, input, length, options); +} + +} // namespace arm64 +} // namespace simdutf + +/* begin file src/simdutf/arm64/end.h */ +/* end file src/simdutf/arm64/end.h */ +/* end file src/arm64/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_FALLBACK +/* begin file src/fallback/implementation.cpp */ +/* begin file src/simdutf/fallback/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "fallback" +// #define SIMDUTF_IMPLEMENTATION fallback +/* end file src/simdutf/fallback/begin.h */ + + + + + + + + +#include +#include + +namespace simdutf { +namespace fallback { + +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + // todo: reimplement as a one-pass algorithm. 
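Both detect_encodings implementations return a bitmask of plausible encodings after trusting a BOM when one is present, so callers should test individual bits rather than compare for equality. A hedged usage sketch, assuming the simdutf front-end function of the same name:

#include <cstdio>
#include <string>
#include "simdutf.h"

int main() {
  const std::string data = "plain ASCII is valid UTF-8";
  // detect_encodings() ORs together encoding_type flags, as in the kernels above.
  const int enc = simdutf::detect_encodings(data.data(), data.size());
  if (enc & simdutf::encoding_type::UTF8) {
    std::puts("could be UTF-8");
  }
  if (enc & simdutf::encoding_type::UTF16_LE) {
    std::puts("could be UTF-16LE");
  }
  if (enc & simdutf::encoding_type::UTF32_LE) {
    std::puts("could be UTF-32LE");
  }
}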
+ int out = 0; + if (validate_utf8(input, length)) { + out |= encoding_type::UTF8; + } + if ((length % 2) == 0) { + if (validate_utf16le(reinterpret_cast(input), + length / 2)) { + out |= encoding_type::UTF16_LE; + } + } + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { + out |= encoding_type::UTF32_LE; + } + } + return out; +} + +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return scalar::utf8::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return scalar::utf8::validate_with_errors(buf, len); +} + +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return scalar::ascii::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return scalar::ascii::validate_with_errors(buf, len); +} + +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} + +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + return scalar::utf32::validate(buf, len); +} + +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + return scalar::utf32::validate_with_errors(buf, len); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + return scalar::latin1_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::latin1_to_utf16::convert(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::latin1_to_utf16::convert(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::latin1_to_utf32::convert(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf8_to_latin1::convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + return 
scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert(buf, len, + utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_with_errors( + buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_with_errors( + buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_valid(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf8_to_utf16::convert_valid(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert(buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert(buf, len, + latin1_output); +} -// 1 byte for length, 16 bytes for mask -const uint8_t pack_1_2_3_utf8_bytes[256][17] = { - {12, 2, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80}, - {9, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {11, 3, 1, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, - {10, 0, 6, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {9, 2, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {8, 3, 1, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 0, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {11, 2, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, - {8, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {10, 3, 1, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {9, 0, 7, 5, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {10, 2, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {7, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - 
{9, 3, 1, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {8, 0, 4, 10, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {9, 2, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {6, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {8, 3, 1, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 0, 6, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 2, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {3, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {5, 3, 1, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 0, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {8, 2, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 7, 5, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 2, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {6, 3, 1, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {5, 0, 4, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {11, 2, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, - {8, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {10, 3, 1, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {9, 0, 6, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {8, 2, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {10, 2, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {7, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {9, 3, 1, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {8, 0, 7, 5, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {9, 2, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {8, 3, 1, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 0, 4, 11, 9, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {10, 2, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {7, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {9, 3, 1, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {8, 0, 6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 2, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {6, 3, 1, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 
0x80}, - {5, 0, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {9, 2, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {6, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {8, 3, 1, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 0, 7, 5, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {8, 2, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 4, 8, 14, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {9, 2, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {6, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {8, 3, 1, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 0, 6, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 2, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {3, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {5, 3, 1, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 0, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {8, 2, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 7, 5, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 2, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {6, 3, 1, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {5, 0, 4, 10, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {6, 2, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {3, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {5, 3, 1, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 0, 6, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {3, 2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80}, - {2, 3, 1, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80}, - {5, 2, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {2, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {4, 3, 1, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {3, 0, 7, 5, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {4, 2, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 
0x80}, - {3, 3, 1, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {2, 0, 4, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {8, 2, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 6, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 2, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {2, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {4, 3, 1, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {3, 0, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {7, 2, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {6, 3, 1, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 0, 7, 5, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {6, 2, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {3, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {5, 3, 1, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 0, 4, 11, 9, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 2, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {6, 3, 1, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 0, 6, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 2, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80}, - {3, 3, 1, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {2, 0, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {6, 2, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {3, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {5, 3, 1, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 0, 7, 5, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {5, 2, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {2, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {4, 3, 1, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {3, 0, 4, 8, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {11, 2, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80}, - {8, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {10, 3, 1, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {9, 0, 6, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {8, 2, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - 
{5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {10, 2, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {7, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {9, 3, 1, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {8, 0, 7, 5, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {9, 2, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {8, 3, 1, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 0, 4, 10, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {8, 2, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 6, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {5, 2, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {2, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80}, - {4, 3, 1, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {3, 0, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {7, 2, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {6, 3, 1, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {5, 0, 7, 5, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {6, 2, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {3, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {5, 3, 1, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 0, 4, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {10, 2, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {7, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {9, 3, 1, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {8, 0, 6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 2, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {6, 3, 1, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {5, 0, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {9, 2, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {6, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {8, 3, 1, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 0, 7, 5, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {8, 2, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 4, 11, 9, 
15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 4, 11, 9, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {9, 2, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {6, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {8, 3, 1, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 0, 6, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 2, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {3, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {5, 3, 1, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 0, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {8, 2, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 7, 5, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 2, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {6, 3, 1, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {5, 0, 4, 8, 15, 13, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {10, 2, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {7, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {9, 3, 1, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {8, 0, 6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 2, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {6, 3, 1, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {5, 0, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {9, 2, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {6, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {8, 3, 1, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 0, 7, 5, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {8, 2, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 4, 10, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 2, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {6, 3, 1, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 0, 6, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 2, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80, 0x80}, - {3, 3, 1, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {2, 0, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {6, 2, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {3, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {5, 3, 1, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 0, 7, 5, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {5, 2, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {2, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {4, 3, 1, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {3, 0, 4, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {9, 2, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}, - {6, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {8, 3, 1, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {7, 0, 6, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 2, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {3, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {5, 3, 1, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 0, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {8, 2, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 7, 5, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 2, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {6, 3, 1, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {5, 0, 4, 11, 9, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {8, 2, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {7, 3, 1, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {6, 0, 6, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 2, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {2, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {4, 3, 1, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {3, 0, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {7, 2, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {4, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {6, 3, 1, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {5, 0, 7, 5, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {6, 2, 3, 1, 4, 8, 12, 0x80, 
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80}, - {3, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80, 0x80}, - {5, 3, 1, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}, - {4, 0, 4, 8, 12, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, - 0x80, 0x80}}; +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert(buf, len, + latin1_output); +} + +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_with_errors( + buf, len, latin1_output); +} + +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_with_errors( + buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_valid( + buf, len, latin1_output); +} -} // namespace utf16_to_utf8 -} // namespace tables -} // unnamed namespace -} // namespace simdutf +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf16_to_latin1::convert_valid(buf, len, + latin1_output); +} -#endif // SIMDUTF_UTF16_TO_UTF8_TABLES_H -/* end file src/tables/utf16_to_utf8_tables.h */ -// End of tables. +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, + utf8_output); +} -// The scalar routines should be included once. -/* begin file src/scalar/ascii.h */ -#ifndef SIMDUTF_ASCII_H -#define SIMDUTF_ASCII_H +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} -namespace simdutf { -namespace scalar { -namespace { -namespace ascii { -#if SIMDUTF_IMPLEMENTATION_FALLBACK -// Only used by the fallback kernel. 
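// [editor's aside - illustrative sketch, not part of the vendored simdutf
// sources] The ASCII validation routines in this hunk rely on one property:
// a byte is ASCII exactly when its most significant bit is clear. Two 8-byte
// loads are OR-ed together and tested against 0x8080808080808080, so sixteen
// bytes are checked per iteration without per-byte branching. A minimal
// standalone version of the same check (the helper name is hypothetical,
// assumes <cstdint> and <cstring> are available):
static inline bool is_ascii_block16(const unsigned char *p) {
  uint64_t lo, hi;
  std::memcpy(&lo, p, sizeof(lo));     // first 8 bytes
  std::memcpy(&hi, p + 8, sizeof(hi)); // next 8 bytes
  // any set top bit in either half means a non-ASCII byte was seen
  return ((lo | hi) & UINT64_C(0x8080808080808080)) == 0;
}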
-inline simdutf_warn_unused bool validate(const char *buf, size_t len) noexcept { - const uint8_t *data = reinterpret_cast(buf); - uint64_t pos = 0; - // process in blocks of 16 bytes when possible - for (; pos + 16 <= len; pos += 16) { - uint64_t v1; - std::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) != 0) { - return false; - } - } - // process the tail byte-by-byte - for (; pos < len; pos++) { - if (data[pos] >= 0b10000000) { - return false; - } - } - return true; +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors( + buf, len, utf8_output); } -#endif -inline simdutf_warn_unused result validate_with_errors(const char *buf, - size_t len) noexcept { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - // process in blocks of 16 bytes when possible - for (; pos + 16 <= len; pos += 16) { - uint64_t v1; - std::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - std::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) != 0) { - for (; pos < len; pos++) { - if (data[pos] >= 0b10000000) { - return result(error_code::TOO_LARGE, pos); - } - } - } +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors( + buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, + utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, + utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf32_to_latin1::convert(buf, len, latin1_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert(buf, len, utf8_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, 
char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, + utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors( + buf, len, utf16_output); +} + +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors( + buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid( + buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, + utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, + utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors( + buf, len, utf32_output); +} + +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors( + buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid( + buf, len, utf32_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid(buf, len, + utf32_output); +} + +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + scalar::utf16::change_endianness_utf16(input, length, output); +} + +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} + +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + return scalar::utf8::count_code_points(input, length); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return 
scalar::utf8::count_code_points(buf, len); +} + +simdutf_warn_unused size_t +implementation::latin1_length_from_utf16(size_t length) const noexcept { + return scalar::utf16::latin1_length_from_utf16(length); +} + +simdutf_warn_unused size_t +implementation::latin1_length_from_utf32(size_t length) const noexcept { + return length; +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t length) const noexcept { + size_t answer = length; + size_t i = 0; + auto pop = [](uint64_t v) { + return (size_t)(((v >> 7) & UINT64_C(0x0101010101010101)) * + UINT64_C(0x0101010101010101) >> + 56); + }; + for (; i + 32 <= length; i += 32) { + uint64_t v; + memcpy(&v, input + i, 8); + answer += pop(v); + memcpy(&v, input + i + 8, sizeof(v)); + answer += pop(v); + memcpy(&v, input + i + 16, sizeof(v)); + answer += pop(v); + memcpy(&v, input + i + 24, sizeof(v)); + answer += pop(v); } - // process the tail byte-by-byte - for (; pos < len; pos++) { - if (data[pos] >= 0b10000000) { - return result(error_code::TOO_LARGE, pos); - } + for (; i + 8 <= length; i += 8) { + uint64_t v; + memcpy(&v, input + i, sizeof(v)); + answer += pop(v); } - return result(error_code::SUCCESS, pos); + for (; i + 1 <= length; i += 1) { + answer += static_cast(input[i]) >> 7; + } + return answer; } -} // namespace ascii -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, + length); +} -#endif -/* end file src/scalar/ascii.h */ -/* begin file src/scalar/latin1.h */ -#ifndef SIMDUTF_LATIN1_H -#define SIMDUTF_LATIN1_H +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); +} -namespace simdutf { -namespace scalar { -namespace { -namespace latin1 { +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, + length); +} -inline size_t utf32_length_from_latin1(size_t len) { - // We are not BOM aware. 
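// [editor's aside - illustrative sketch, not part of the vendored simdutf
// sources] utf8_length_from_latin1() above starts from `length` (one UTF-8
// byte per Latin-1 byte) and adds one extra byte for every input byte >= 0x80,
// since those need a two-byte UTF-8 encoding. The `pop` lambda counts such
// bytes eight at a time: (v >> 7) & 0x0101...01 leaves a 1 in each byte whose
// top bit was set, and multiplying by 0x0101...01 then shifting right by 56
// sums those (at most eight) ones into the top byte. A scalar equivalent of
// the same computation (the helper name is hypothetical, assumes <cstdint>):
static inline size_t count_non_ascii_swar(uint64_t v) {
  v = (v >> 7) & UINT64_C(0x0101010101010101);             // 1 per byte >= 0x80
  return size_t((v * UINT64_C(0x0101010101010101)) >> 56); // horizontal sum
}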
- return len; // a utf32 unit will always represent 1 latin1 character +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, length); } -inline size_t utf8_length_from_latin1(const char *buf, size_t len) { - const uint8_t *c = reinterpret_cast(buf); - size_t answer = 0; - for (size_t i = 0; i < len; i++) { - if ((c[i] >> 7)) { - answer++; - } - } - return answer + len; +simdutf_warn_unused size_t +implementation::utf16_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf16_length_from_latin1(length); } -inline size_t utf16_length_from_latin1(size_t len) { return len; } +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return scalar::utf8::utf16_length_from_utf8(input, length); +} -} // namespace latin1 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return scalar::utf32::utf8_length_from_utf32(input, length); +} -#endif -/* end file src/scalar/latin1.h */ +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return scalar::utf32::utf16_length_from_utf32(input, length); +} -/* begin file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */ -#ifndef SIMDUTF_VALID_UTF32_TO_UTF8_H -#define SIMDUTF_VALID_UTF32_TO_UTF8_H +simdutf_warn_unused size_t +implementation::utf32_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf32_length_from_latin1(length); +} -namespace simdutf { -namespace scalar { -namespace { -namespace utf32_to_utf8 { +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return scalar::utf8::count_code_points(input, length); +} -#if SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64 -// only used by the fallback and POWER kernel -inline size_t convert_valid(const char32_t *buf, size_t len, - char *utf8_output) { - const uint32_t *data = reinterpret_cast(buf); - size_t pos = 0; - char *start{utf8_output}; - while (pos < len) { - // try to convert the next block of 2 ASCII characters - if (pos + 2 <= - len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0xFFFFFF80FFFFFF80) == 0) { - *utf8_output++ = char(buf[pos]); - *utf8_output++ = char(buf[pos + 1]); - pos += 2; - continue; - } +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + size_t equallocation = + length; // location of the first padding character if any + size_t equalsigns = 0; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + length -= 1; + equalsigns++; + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + if (length > 0 && input[length - 1] == '=') 
{ + equallocation = length - 1; + equalsigns++; + length -= 1; + } + } + if (length == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation}; + } + return {SUCCESS, 0}; + } + result r = scalar::base64::base64_tail_decode( + output, input, length, equalsigns, options, last_chunk_options); + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation}; + } + } + return r; +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + size_t equallocation = + length; // location of the first padding character if any + size_t equalsigns = 0; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + length -= 1; + equalsigns++; + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + equalsigns++; + length -= 1; + } + } + if (length == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, 0, 0}; + } + full_result r = scalar::base64::base64_tail_decode( + output, input, length, equalsigns, options, last_chunk_options); + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.output_count % 3 == 0) || + ((r.output_count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; + } + } + return r; +} + +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char16_t *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + size_t equallocation = + length; // location of the first padding character if any + size_t equalsigns = 0; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + length -= 1; + equalsigns++; + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + equalsigns++; + length -= 1; + } + } + if (length == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation}; + } + return {SUCCESS, 0}; + } + result r = scalar::base64::base64_tail_decode( + output, input, length, equalsigns, options, last_chunk_options); + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation}; + } + } + return r; +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + 
last_chunk_handling_options last_chunk_options) const noexcept { + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + size_t equallocation = + length; // location of the first padding character if any + size_t equalsigns = 0; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + length -= 1; + equalsigns++; + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; } - uint32_t word = data[pos]; - if ((word & 0xFFFFFF80) == 0) { - // will generate one UTF-8 bytes - *utf8_output++ = char(word); - pos++; - } else if ((word & 0xFFFFF800) == 0) { - // will generate two UTF-8 bytes - // we have 0b110XXXXX 0b10XXXXXX - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else if ((word & 0xFFFF0000) == 0) { - // will generate three UTF-8 bytes - // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else { - // will generate four UTF-8 bytes - // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((word >> 18) | 0b11110000); - *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + equalsigns++; + length -= 1; } } - return utf8_output - start; + if (length == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, 0, 0}; + } + full_result r = scalar::base64::base64_tail_decode( + output, input, length, equalsigns, options, last_chunk_options); + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.output_count % 3 == 0) || + ((r.output_count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; + } + } + return r; } -#endif // SIMDUTF_IMPLEMENTATION_FALLBACK || SIMDUTF_IMPLEMENTATION_PPC64 -} // namespace utf32_to_utf8 -} // unnamed namespace -} // namespace scalar +simdutf_warn_unused size_t implementation::base64_length_from_binary( + size_t length, base64_options options) const noexcept { + return scalar::base64::base64_length_from_binary(length, options); +} + +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + return scalar::base64::tail_encode_base64(output, input, length, options); +} +} // namespace fallback } // namespace simdutf +/* begin file src/simdutf/fallback/end.h */ +/* end file src/simdutf/fallback/end.h */ +/* end file src/fallback/implementation.cpp */ #endif -/* end file src/scalar/utf32_to_utf8/valid_utf32_to_utf8.h */ -/* begin file src/scalar/utf32_to_utf8/utf32_to_utf8.h */ -#ifndef SIMDUTF_UTF32_TO_UTF8_H -#define SIMDUTF_UTF32_TO_UTF8_H +#if SIMDUTF_IMPLEMENTATION_ICELAKE +/* begin file src/icelake/implementation.cpp */ + +/* begin file src/simdutf/icelake/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "icelake" +// #define SIMDUTF_IMPLEMENTATION icelake + +#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE +// nothing needed. 
+#else +SIMDUTF_TARGET_ICELAKE +#endif + +#if SIMDUTF_GCC11ORMORE // workaround for + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +// clang-format off +SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) +// clang-format on +#endif // end of workaround +/* end file src/simdutf/icelake/begin.h */ namespace simdutf { -namespace scalar { +namespace icelake { namespace { -namespace utf32_to_utf8 { +#ifndef SIMDUTF_ICELAKE_H + #error "icelake.h must be included" +#endif +/* begin file src/icelake/icelake_utf8_common.inl.cpp */ +// Common procedures for both validating and non-validating conversions from +// UTF-8. +enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL }; -inline size_t convert(const char32_t *buf, size_t len, char *utf8_output) { - const uint32_t *data = reinterpret_cast(buf); - size_t pos = 0; - char *start{utf8_output}; - while (pos < len) { - // try to convert the next block of 2 ASCII characters - if (pos + 2 <= - len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0xFFFFFF80FFFFFF80) == 0) { - *utf8_output++ = char(buf[pos]); - *utf8_output++ = char(buf[pos + 1]); - pos += 2; - continue; +using utf8_to_utf16_result = std::pair; +using utf8_to_utf32_result = std::pair; + +/* + process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8 + to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes) + might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which + indicates how many input bytes are relevant. + + Returns true when the result is correct, otherwise it returns false. + + The provided in and out pointers are advanced according to how many input + bytes have been processed, upon success. +*/ +template +simdutf_really_inline bool +process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) { + // constants + __m512i mask_identity = _mm512_set_epi8( + 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, + 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, + 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1, 0); + __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0); + __m512i mask_80808080 = _mm512_set1_epi32(0x80808080); + __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0); + __m512i mask_dfdfdfdf_tail = _mm512_set_epi64( + 0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, + 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, + 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf); + __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2); + __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff); + __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0); + __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00); + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + // Note that 'tail' is a compile-time constant ! + __mmask64 b = + (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1; + __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) + : _mm512_maskz_loadu_epi8(b, in); + __mmask64 m1 = (tail == SIMDUTF_FULL) + ? 
_mm512_cmplt_epu8_mask(input, mask_80808080) + : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080); + if (_ktestc_mask64_u8(m1, + b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII + // alternatively, we could do 'if (m1 == b) { ' + if (tail == SIMDUTF_FULL) { + in += 64; // consumed 64 bytes + // we convert a full 64-byte block, writing 128 bytes. + __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if (big_endian) { + input1 = _mm512_shuffle_epi8(input1, byteflip); } - } - uint32_t word = data[pos]; - if ((word & 0xFFFFFF80) == 0) { - // will generate one UTF-8 bytes - *utf8_output++ = char(word); - pos++; - } else if ((word & 0xFFFFF800) == 0) { - // will generate two UTF-8 bytes - // we have 0b110XXXXX 0b10XXXXXX - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else if ((word & 0xFFFF0000) == 0) { - // will generate three UTF-8 bytes - // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX - if (word >= 0xD800 && word <= 0xDFFF) { - return 0; + _mm512_storeu_si512(out, input1); + out += 32; + __m512i input2 = + _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); + if (big_endian) { + input2 = _mm512_shuffle_epi8(input2, byteflip); } - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; + _mm512_storeu_si512(out, input2); + out += 32; + return true; // we are done } else { - // will generate four UTF-8 bytes - // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX - if (word > 0x10FFFF) { - return 0; + in += gap; + if (gap <= 32) { + __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if (big_endian) { + input1 = _mm512_shuffle_epi8(input1, byteflip); + } + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), + input1); + out += gap; + } else { + __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); + if (big_endian) { + input1 = _mm512_shuffle_epi8(input1, byteflip); + } + _mm512_storeu_si512(out, input1); + out += 32; + __m512i input2 = + _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); + if (big_endian) { + input2 = _mm512_shuffle_epi8(input2, byteflip); + } + _mm512_mask_storeu_epi16( + out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2); + out += gap - 32; } - *utf8_output++ = char((word >> 18) | 0b11110000); - *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; + return true; // we are done } } - return utf8_output - start; -} + // classify characters further + __mmask64 m234 = _mm512_cmp_epu8_mask( + mask_c0c0c0c0, input, + _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte + __mmask64 m34 = + _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input, + _MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte -inline result convert_with_errors(const char32_t *buf, size_t len, - char *utf8_output) { - const uint32_t *data = reinterpret_cast(buf); - size_t pos = 0; - char *start{utf8_output}; - while (pos < len) { - // try to convert the next block of 2 ASCII characters - if (pos + 2 <= - len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0xFFFFFF80FFFFFF80) == 0) { - *utf8_output++ = char(buf[pos]); - *utf8_output++ = char(buf[pos + 1]); - pos += 2; - continue; - } - } - 
uint32_t word = data[pos]; - if ((word & 0xFFFFFF80) == 0) { - // will generate one UTF-8 bytes - *utf8_output++ = char(word); - pos++; - } else if ((word & 0xFFFFF800) == 0) { - // will generate two UTF-8 bytes - // we have 0b110XXXXX 0b10XXXXXX - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else if ((word & 0xFFFF0000) == 0) { - // will generate three UTF-8 bytes - // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX - if (word >= 0xD800 && word <= 0xDFFF) { - return result(error_code::SURROGATE, pos); + __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask( + m234, input, mask_c2c2c2c2, + _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence) + // Overlong 2-byte sequence + if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) { + // Overlong 2-byte sequence + return false; + } + if (_ktestz_mask64_u8(m34, m34) == 0) { + // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a + // 4-byte sequence! + __mmask64 m4 = _mm512_cmp_epu8_mask( + input, mask_f0f0f0f0, + _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes) + + __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) + ? _knot_mask64(m1) + : _kand_mask64(_knot_mask64(m1), b); + + __mmask64 mp1 = _kshiftli_mask64(m234, 1); + __mmask64 mp2 = _kshiftli_mask64(m34, 2); + // We could do it as follows... + // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit + // masks a and b and return 1 if all zeroes but GCC generates better code + // when we do: + if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and + // return 1 if all zeroes + // Fast path with 1,2,3 bytes + __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes + __mmask64 m1234 = _kor_mask64(m1, m234); + // mismatched continuation bytes: + if (tail == SIMDUTF_FULL) { + __mmask64 xnormcm1234 = _kxnor_mask64( + mc, + m1234); // XNOR of mc and m1234 should be all zero if they differ + // the presence of a 1 bit indicates that they overlap. + // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return + // 1 if all zeroes. 
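// [editor's aside, not part of the vendored simdutf sources] The check below
// works because, in a valid block, every byte is either an ASCII/lead byte
// (set in m1234) or a continuation byte, and the continuation bytes must sit
// exactly where the lead bytes predict them: mc = (m234 << 1) | (m34 << 2),
// plus (m4 << 3) in the general path. Worked example for the two-byte
// sequence 0xC3 0xA9 ("é"): m1234 has a bit at the lead position and mc has a
// bit at the following position, so within those two bytes mc == ~m1234 and
// XNOR(mc, m1234) is all zero; a missing or stray continuation byte leaves a
// 1 bit in the XNOR and the block is rejected.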
+ if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { + return false; + } + } else { + __mmask64 bxorm1234 = _kxor_mask64(b, m1234); + if (mc != bxorm1234) { + return false; + } } - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else { - // will generate four UTF-8 bytes - // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX - if (word > 0x10FFFF) { - return result(error_code::TOO_LARGE, pos); + // mend: identifying the last bytes of each sequence to be decoded + __mmask64 mend = _kshiftri_mask64(m1234, 1); + if (tail != SIMDUTF_FULL) { + mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1))); } - *utf8_output++ = char((word >> 18) | 0b11110000); - *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } - } - return result(error_code::SUCCESS, utf8_output - start); -} -} // namespace utf32_to_utf8 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); + __m512i last_and_thirdu16 = + _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); -#endif -/* end file src/scalar/utf32_to_utf8/utf32_to_utf8.h */ + __m512i nonasciitags = _mm512_maskz_mov_epi8( + mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 + __m512i clearedbytes = _mm512_andnot_si512( + nonasciitags, input); // high two bits cleared where not ASCII + __m512i lastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, last_and_thirdu16, + clearedbytes); // the last byte of each character -/* begin file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */ -#ifndef SIMDUTF_VALID_UTF32_TO_UTF16_H -#define SIMDUTF_VALID_UTF32_TO_UTF16_H + __mmask64 mask_before_non_ascii = _kshiftri_mask64( + mask_not_ascii, 1); // bytes that precede non-ASCII bytes + __m512i indexofsecondlastbytes = _mm512_add_epi16( + mask_ffffffff, last_and_thirdu16); // indices of the second last bytes + __m512i beforeasciibytes = + _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); + __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, indexofsecondlastbytes, + beforeasciibytes); // the second last bytes (of two, three byte seq, + // surrogates) + secondlastbytes = + _mm512_slli_epi16(secondlastbytes, 6); // shifted into position -namespace simdutf { -namespace scalar { -namespace { -namespace utf32_to_utf16 { + __m512i indexofthirdlastbytes = _mm512_add_epi16( + mask_ffffffff, + indexofsecondlastbytes); // indices of the second last bytes + __m512i thirdlastbyte = + _mm512_maskz_mov_epi8(m34, + clearedbytes); // only those that are the third + // last byte of a sequence + __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, indexofthirdlastbytes, + thirdlastbyte); // the third last bytes (of three byte sequences, hi + // surrogate) + thirdlastbytes = + _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position + __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, + thirdlastbytes, 254); + // the elements of Wout excluding the last element if it happens to be a + // high surrogate: -template -inline size_t convert_valid(const char32_t *buf, size_t len, - char16_t *utf16_output) { - const uint32_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t *start{utf16_output}; - while (pos < 
len) { - uint32_t word = data[pos]; - if ((word & 0xFFFF0000) == 0) { - // will not generate a surrogate pair - *utf16_output++ = !match_system(big_endian) - ? char16_t(utf16::swap_bytes(uint16_t(word))) - : char16_t(word); - pos++; - } else { - // will generate a surrogate pair - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = utf16::swap_bytes(high_surrogate); - low_surrogate = utf16::swap_bytes(low_surrogate); + __mmask64 mprocessed = + (tail == SIMDUTF_FULL) + ? _pdep_u64(0xFFFFFFFF, mend) + : _pdep_u64( + 0xFFFFFFFF, + _kand_mask64( + mend, b)); // we adjust mend at the end of the output. + + // Encodings out of range... + { + // the location of 3-byte sequence start bytes in the input + __mmask64 m3 = m34 & (b ^ m4); + // code units in Wout corresponding to 3-byte sequences. + __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); + __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); + __mmask32 Msmall800 = + _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); + __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); + __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); + __mmask32 M3s = + _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); + if (_kor_mask32(Msmall800, M3s)) { + return false; + } } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - pos++; + int64_t nout = _mm_popcnt_u64(mprocessed); + in += 64 - _lzcnt_u64(mprocessed); + if (big_endian) { + Wout = _mm512_shuffle_epi8(Wout, byteflip); + } + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); + out += nout; + return true; // ok } - } - return utf16_output - start; -} + // + // We have a 4-byte sequence, this is the general case. + // Slow! 
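// [editor's aside, not part of the vendored simdutf sources] In this general
// path, code points at or above U+10000 are emitted as UTF-16 surrogate
// pairs. For U >= 0x10000: high = 0xD800 + ((U - 0x10000) >> 10) and
// low = 0xDC00 + (U & 0x3FF). Since 0xD800 - (0x10000 >> 10) = 0xD7C0, the
// high surrogate can also be written as (U >> 10) + 0xD7C0 without the
// explicit subtraction; that same offset underlies the 0xd7c0d7c0 and
// 0xdc00dc00 constants used below, and the removed scalar utf32_to_utf16
// routine in this hunk performs the equivalent computation one code point at
// a time.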
+ __mmask64 mp3 = _kshiftli_mask64(m4, 3); + __mmask64 mc = + _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes + __mmask64 m1234 = _kor_mask64(m1, m234); -} // namespace utf32_to_utf16 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + // mend: identifying the last bytes of each sequence to be decoded + __mmask64 mend = + _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3); + if (tail != SIMDUTF_FULL) { + mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1))); + } + __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); + __m512i last_and_thirdu16 = + _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); -#endif -/* end file src/scalar/utf32_to_utf16/valid_utf32_to_utf16.h */ -/* begin file src/scalar/utf32_to_utf16/utf32_to_utf16.h */ -#ifndef SIMDUTF_UTF32_TO_UTF16_H -#define SIMDUTF_UTF32_TO_UTF16_H + __m512i nonasciitags = _mm512_maskz_mov_epi8( + mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 + __m512i clearedbytes = _mm512_andnot_si512( + nonasciitags, input); // high two bits cleared where not ASCII + __m512i lastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, last_and_thirdu16, + clearedbytes); // the last byte of each character -namespace simdutf { -namespace scalar { -namespace { -namespace utf32_to_utf16 { + __mmask64 mask_before_non_ascii = _kshiftri_mask64( + mask_not_ascii, 1); // bytes that precede non-ASCII bytes + __m512i indexofsecondlastbytes = _mm512_add_epi16( + mask_ffffffff, last_and_thirdu16); // indices of the second last bytes + __m512i beforeasciibytes = + _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); + __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, indexofsecondlastbytes, + beforeasciibytes); // the second last bytes (of two, three byte seq, + // surrogates) + secondlastbytes = + _mm512_slli_epi16(secondlastbytes, 6); // shifted into position -template -inline size_t convert(const char32_t *buf, size_t len, char16_t *utf16_output) { - const uint32_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t *start{utf16_output}; - while (pos < len) { - uint32_t word = data[pos]; - if ((word & 0xFFFF0000) == 0) { - if (word >= 0xD800 && word <= 0xDFFF) { - return 0; + __m512i indexofthirdlastbytes = _mm512_add_epi16( + mask_ffffffff, + indexofsecondlastbytes); // indices of the second last bytes + __m512i thirdlastbyte = _mm512_maskz_mov_epi8( + m34, + clearedbytes); // only those that are the third last byte of a sequence + __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8( + 0x5555555555555555, indexofthirdlastbytes, + thirdlastbyte); // the third last bytes (of three byte sequences, hi + // surrogate) + thirdlastbytes = + _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position + __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32( + lastbytes, secondlastbytes, thirdlastbytes, 254); + uint64_t Mlo_uint64 = _pext_u64(mp3, mend); + __mmask32 Mlo = __mmask32(Mlo_uint64); + __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1); + __m512i lo_surr_mask = _mm512_maskz_mov_epi16( + Mlo, + mask_dc00dc00); // lo surr: 1101110000000000, other: 0000000000000000 + __m512i shifted4_thirdsecondandlastbytes = + _mm512_srli_epi16(thirdsecondandlastbytes, + 4); // hi surr: 00000WVUTSRQPNML vuts = WVUTS - 1 + __m512i tagged_lo_surrogates = _mm512_or_si512( + thirdsecondandlastbytes, + lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other: unchanged + __m512i Wout = _mm512_mask_add_epi16( + 
tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes, + mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other: unchanged + // the elements of Wout excluding the last element if it happens to be a + // high surrogate: + __mmask32 Mout = ~(Mhi & 0x80000000); + __mmask64 mprocessed = + (tail == SIMDUTF_FULL) + ? _pdep_u64(Mout, mend) + : _pdep_u64( + Mout, + _kand_mask64(mend, + b)); // we adjust mend at the end of the output. + + // mismatched continuation bytes: + if (tail == SIMDUTF_FULL) { + __mmask64 xnormcm1234 = _kxnor_mask64( + mc, m1234); // XNOR of mc and m1234 should be all zero if they differ + // the presence of a 1 bit indicates that they overlap. + // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 + // if all zeroes. + if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { + return false; } - // will not generate a surrogate pair - *utf16_output++ = !match_system(big_endian) - ? char16_t(utf16::swap_bytes(uint16_t(word))) - : char16_t(word); } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { - return 0; + __mmask64 bxorm1234 = _kxor_mask64(b, m1234); + if (mc != bxorm1234) { + return false; } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = utf16::swap_bytes(high_surrogate); - low_surrogate = utf16::swap_bytes(low_surrogate); + } + // Encodings out of range... + { + // the location of 3-byte sequence start bytes in the input + __mmask64 m3 = m34 & (b ^ m4); + // code units in Wout corresponding to 3-byte sequences. + __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); + __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); + __mmask32 Msmall800 = + _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); + __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); + __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); + __mmask32 M3s = + _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); + __m512i mask_04000400 = _mm512_set1_epi32(0x04000400); + __mmask32 M4s = + _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400); + if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { + return false; } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); } - pos++; + in += 64 - _lzcnt_u64(mprocessed); + int64_t nout = _mm_popcnt_u64(mprocessed); + if (big_endian) { + Wout = _mm512_shuffle_epi8(Wout, byteflip); + } + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); + out += nout; + return true; // ok } - return utf16_output - start; -} - -template -inline result convert_with_errors(const char32_t *buf, size_t len, - char16_t *utf16_output) { - const uint32_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t *start{utf16_output}; - while (pos < len) { - uint32_t word = data[pos]; - if ((word & 0xFFFF0000) == 0) { - if (word >= 0xD800 && word <= 0xDFFF) { - return result(error_code::SURROGATE, pos); - } - // will not generate a surrogate pair - *utf16_output++ = !match_system(big_endian) - ? 
char16_t(utf16::swap_bytes(uint16_t(word))) - : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { - return result(error_code::TOO_LARGE, pos); - } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = utf16::swap_bytes(high_surrogate); - low_surrogate = utf16::swap_bytes(low_surrogate); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); + // Fast path 2: all ASCII or 2 byte + __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) + ? _knot_mask64(m234) + : _kand_mask64(_knot_mask64(m234), b); + // on top of -0xc0 we subtract -2 which we get back later of the + // continuation byte tags + __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2); + __mmask64 leading = tail == (tail == SIMDUTF_FULL) + ? _kor_mask64(m1, m234) + : _kand_mask64(_kor_mask64(m1, m234), + b); // first bytes of each sequence + if (tail == SIMDUTF_FULL) { + __mmask64 xnor234leading = + _kxnor_mask64(_kshiftli_mask64(m234, 1), leading); + if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { + return false; + } + } else { + __mmask64 bxorleading = _kxor_mask64(b, leading); + if (_kshiftli_mask64(m234, 1) != bxorleading) { + return false; } - pos++; } - return result(error_code::SUCCESS, utf16_output - start); -} + // + if (tail == SIMDUTF_FULL) { + // In the two-byte/ASCII scenario, we are easily latency bound, so we want + // to increment the input buffer as quickly as possible. + // We process 32 bytes unless the byte at index 32 is a continuation byte, + // in which case we include it as well for a total of 33 bytes. + // Note that if x is an ASCII byte, then the following is false: + // int8_t(x) <= int8_t(0xc0) under two's complement. + in += 32; + if (int8_t(*in) <= int8_t(0xc0)) + in++; + // The alternative is to do + // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); + // but it requires loading the input, doing the mask computation, and + // converting back the mask to a general register. It just takes too long, + // leaving the processor likely to be idle. + } else { + in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); + } + __m512i lead = _mm512_maskz_compress_epi8( + leading, leading2byte); // will contain zero for ascii, and the data + lead = _mm512_cvtepu8_epi16( + _mm512_castsi512_si256(lead)); // ... zero extended into code units + __m512i follow = _mm512_maskz_compress_epi8( + continuation_or_ascii, input); // the last bytes of each sequence + follow = _mm512_cvtepu8_epi16( + _mm512_castsi512_si256(follow)); // ... zero extended into code units + lead = _mm512_slli_epi16(lead, 6); // shifted into position + __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow -} // namespace utf32_to_utf16 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + if (big_endian) { + final = _mm512_shuffle_epi8(final, byteflip); + } + if (tail == SIMDUTF_FULL) { + // Next part is UTF-16 specific and can be generalized to UTF-32. + int nout = _mm_popcnt_u32(uint32_t(leading)); + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final); + out += nout; // UTF-8 to UTF-16 is only expansionary in this case. 
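    // (Editor's note, illustrative only, not part of the patch: `nout` is the
    // number of character-start bytes among the 32 input bytes just consumed.
    // If they held, say, 24 characters, the store mask is
    // (uint64_t(1) << 24) - 1 == 0x00FFFFFF, so exactly 24 UTF-16 code units
    // are written and nothing past out + nout is touched.)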
+ } else { + int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading))); + _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final); + out += nout; // UTF-8 to UTF-16 is only expansionary in this case. + } -#endif -/* end file src/scalar/utf32_to_utf16/utf32_to_utf16.h */ + return true; // we are fine. +} -/* begin file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ -#ifndef SIMDUTF_VALID_UTF16_TO_UTF8_H -#define SIMDUTF_VALID_UTF16_TO_UTF8_H +/* + utf32_to_utf16_masked converts `count` lower UTF-32 code units + from input `utf32` into UTF-16. It differs from utf32_to_utf16 + in that it 'masks' the writes. -namespace simdutf { -namespace scalar { -namespace { -namespace utf16_to_utf8 { + Returns how many 16-bit code units were stored. + byteflip is used for flipping 16-bit code units, and it should be + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + We pass it to the (always inlined) function to encourage the compiler to + keep the value in a (constant) register. +*/ template -inline size_t convert_valid(const char16_t *buf, size_t len, - char *utf8_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char *start{utf8_output}; - while (pos < len) { - // try to convert the next block of 4 ASCII characters - if (pos + 4 <= - len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if (!match_system(big_endian)) { - v = (v >> 8) | (v << (64 - 8)); - } - if ((v & 0xFF80FF80FF80FF80) == 0) { - size_t final_pos = pos + 4; - while (pos < final_pos) { - *utf8_output++ = !match_system(big_endian) - ? char(utf16::swap_bytes(buf[pos])) - : char(buf[pos]); - pos++; - } - continue; - } - } +simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, + __m512i utf32, + unsigned int count, + char16_t *output) { + + const __mmask16 valid = uint16_t((1 << count) - 1); + // 1. check if we have any surrogate pairs + const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); + const __mmask16 sp_mask = + _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff); + + if (sp_mask == 0) { + if (big_endian) { + _mm256_mask_storeu_epi16( + (__m256i *)output, valid, + _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), + _mm512_castsi512_si256(byteflip))); - uint16_t word = - !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - if ((word & 0xFF80) == 0) { - // will generate one UTF-8 bytes - *utf8_output++ = char(word); - pos++; - } else if ((word & 0xF800) == 0) { - // will generate two UTF-8 bytes - // we have 0b110XXXXX 0b10XXXXXX - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else if ((word & 0xF800) != 0xD800) { - // will generate three UTF-8 bytes - // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - if (pos + 1 >= len) { - return 0; - } // minimal bound checking - uint16_t next_word = !match_system(big_endian) - ? 
utf16::swap_bytes(data[pos + 1]) - : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - uint32_t value = (diff << 10) + diff2 + 0x10000; - // will generate four UTF-8 bytes - // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((value >> 18) | 0b11110000); - *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - pos += 2; + _mm256_mask_storeu_epi16((__m256i *)output, valid, + _mm512_cvtepi32_epi16(utf32)); } + return count; } - return utf8_output - start; -} -} // namespace utf16_to_utf8 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + { + // build surrogate pair code units in 32-bit lanes -#endif -/* end file src/scalar/utf16_to_utf8/valid_utf16_to_utf8.h */ -/* begin file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ -#ifndef SIMDUTF_UTF16_TO_UTF8_H -#define SIMDUTF_UTF16_TO_UTF8_H + // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] + const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); + const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); -namespace simdutf { -namespace scalar { -namespace { -namespace utf16_to_utf8 { + // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] + const __m512i t1 = _mm512_slli_epi32(t0, 6); -template -inline size_t convert(const char16_t *buf, size_t len, char *utf8_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char *start{utf8_output}; - while (pos < len) { - // try to convert the next block of 8 bytes - if (pos + 4 <= - len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if (!match_system(big_endian)) { - v = (v >> 8) | (v << (64 - 8)); - } - if ((v & 0xFF80FF80FF80FF80) == 0) { - size_t final_pos = pos + 4; - while (pos < final_pos) { - *utf8_output++ = !match_system(big_endian) - ? char(utf16::swap_bytes(buf[pos])) - : char(buf[pos]); - pos++; - } - continue; - } - } - uint16_t word = - !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - if ((word & 0xFF80) == 0) { - // will generate one UTF-8 bytes - *utf8_output++ = char(word); - pos++; - } else if ((word & 0xF800) == 0) { - // will generate two UTF-8 bytes - // we have 0b110XXXXX 0b10XXXXXX - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else if ((word & 0xF800) != 0xD800) { - // will generate three UTF-8 bytes - // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else { - // must be a surrogate pair - if (pos + 1 >= len) { - return 0; - } - uint16_t diff = uint16_t(word - 0xD800); - if (diff > 0x3FF) { - return 0; - } - uint16_t next_word = !match_system(big_endian) - ? 
utf16::swap_bytes(data[pos + 1]) - : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if (diff2 > 0x3FF) { - return 0; - } - uint32_t value = (diff << 10) + diff2 + 0x10000; - // will generate four UTF-8 bytes - // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((value >> 18) | 0b11110000); - *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - pos += 2; - } - } - return utf8_output - start; -} + // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 + // to t0 + // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) + const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); + const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); -template -inline result convert_with_errors(const char16_t *buf, size_t len, - char *utf8_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char *start{utf8_output}; - while (pos < len) { - // try to convert the next block of 8 bytes - if (pos + 4 <= - len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if (!match_system(big_endian)) - v = (v >> 8) | (v << (64 - 8)); - if ((v & 0xFF80FF80FF80FF80) == 0) { - size_t final_pos = pos + 4; - while (pos < final_pos) { - *utf8_output++ = !match_system(big_endian) - ? char(utf16::swap_bytes(buf[pos])) - : char(buf[pos]); - pos++; - } - continue; - } - } - uint16_t word = - !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - if ((word & 0xFF80) == 0) { - // will generate one UTF-8 bytes - *utf8_output++ = char(word); - pos++; - } else if ((word & 0xF800) == 0) { - // will generate two UTF-8 bytes - // we have 0b110XXXXX 0b10XXXXXX - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else if ((word & 0xF800) != 0xD800) { - // will generate three UTF-8 bytes - // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - pos++; - } else { - // must be a surrogate pair - if (pos + 1 >= len) { - return result(error_code::SURROGATE, pos); - } - uint16_t diff = uint16_t(word - 0xD800); - if (diff > 0x3FF) { - return result(error_code::SURROGATE, pos); - } - uint16_t next_word = !match_system(big_endian) - ? 
utf16::swap_bytes(data[pos + 1]) - : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if (diff2 > 0x3FF) { - return result(error_code::SURROGATE, pos); - } - uint32_t value = (diff << 10) + diff2 + 0x10000; - // will generate four UTF-8 bytes - // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX - *utf8_output++ = char((value >> 18) | 0b11110000); - *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - pos += 2; + // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 + // to t0 + // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 + const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); + const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); + const __m512i t3 = + _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); + const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); + __m512i t5 = _mm512_ror_epi32(t4, 16); + // Here we want to trim all of the upper 16-bit code units from the 2-byte + // characters represented as 4-byte values. We can compute it from + // sp_mask or the following... It can be more optimized! + const __mmask32 nonzero = _kor_mask32( + 0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); + const __mmask32 nonzero_masked = + _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1)); + if (big_endian) { + t5 = _mm512_shuffle_epi8(t5, byteflip); } + // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability + // (zen4) + __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5); + _mm512_mask_storeu_epi16( + output, + (1 << (count + static_cast(count_ones(sp_mask)))) - 1, + compressed); + //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5); } - return result(error_code::SUCCESS, utf8_output - start); -} - -} // namespace utf16_to_utf8 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf -#endif -/* end file src/scalar/utf16_to_utf8/utf16_to_utf8.h */ + return count + static_cast(count_ones(sp_mask)); +} -/* begin file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ -#ifndef SIMDUTF_VALID_UTF16_TO_UTF32_H -#define SIMDUTF_VALID_UTF16_TO_UTF32_H +/* + utf32_to_utf16 converts `count` lower UTF-32 code units + from input `utf32` into UTF-16. It may overflow. -namespace simdutf { -namespace scalar { -namespace { -namespace utf16_to_utf32 { + Returns how many 16-bit code units were stored. + byteflip is used for flipping 16-bit code units, and it should be + __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809, + 0x0607040502030001, + 0x0e0f0c0d0a0b0809 + ); + We pass it to the (always inlined) function to encourage the compiler to + keep the value in a (constant) register. +*/ template -inline size_t convert_valid(const char16_t *buf, size_t len, - char32_t *utf32_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char32_t *start{utf32_output}; - while (pos < len) { - uint16_t word = - !match_system(big_endian) ? 
utf16::swap_bytes(data[pos]) : data[pos]; - if ((word & 0xF800) != 0xD800) { - // No surrogate pair, extend 16-bit word to 32-bit word - *utf32_output++ = char32_t(word); - pos++; +simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, + __m512i utf32, unsigned int count, + char16_t *output) { + // check if we have any surrogate pairs + const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); + const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); + + if (sp_mask == 0) { + // technically, it should be _mm256_storeu_epi16 + if (big_endian) { + _mm256_storeu_si256( + (__m256i *)output, + _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), + _mm512_castsi512_si256(byteflip))); } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - if (pos + 1 >= len) { - return 0; - } // minimal bound checking - uint16_t next_word = !match_system(big_endian) - ? utf16::swap_bytes(data[pos + 1]) - : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - pos += 2; + _mm256_storeu_si256((__m256i *)output, _mm512_cvtepi32_epi16(utf32)); } + return count; } - return utf32_output - start; -} -} // namespace utf16_to_utf32 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + { + // build surrogate pair code units in 32-bit lanes -#endif -/* end file src/scalar/utf16_to_utf32/valid_utf16_to_utf32.h */ -/* begin file src/scalar/utf16_to_utf32/utf16_to_utf32.h */ -#ifndef SIMDUTF_UTF16_TO_UTF32_H -#define SIMDUTF_UTF16_TO_UTF32_H + // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] + const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); + const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); -namespace simdutf { -namespace scalar { -namespace { -namespace utf16_to_utf32 { + // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] + const __m512i t1 = _mm512_slli_epi32(t0, 6); -template -inline size_t convert(const char16_t *buf, size_t len, char32_t *utf32_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char32_t *start{utf32_output}; - while (pos < len) { - uint16_t word = - !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - if ((word & 0xF800) != 0xD800) { - // No surrogate pair, extend 16-bit word to 32-bit word - *utf32_output++ = char32_t(word); - pos++; - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - if (diff > 0x3FF) { - return 0; - } - if (pos + 1 >= len) { - return 0; - } // minimal bound checking - uint16_t next_word = !match_system(big_endian) - ? 
utf16::swap_bytes(data[pos + 1]) - : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if (diff2 > 0x3FF) { - return 0; - } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - pos += 2; + // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 + // to t0 + // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) + const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); + const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); + + // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 + // to t0 + // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 + const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); + const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); + const __m512i t3 = + _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); + const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); + __m512i t5 = _mm512_ror_epi32(t4, 16); + const __mmask32 nonzero = _kor_mask32( + 0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); + if (big_endian) { + t5 = _mm512_shuffle_epi8(t5, byteflip); } + // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability + // (zen4) + __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5); + _mm512_mask_storeu_epi16( + output, + (1 << (count + static_cast(count_ones(sp_mask)))) - 1, + compressed); + //_mm512_mask_compressstoreu_epi16(output, nonzero, t5); } - return utf32_output - start; + + return count + static_cast(count_ones(sp_mask)); } -template -inline result convert_with_errors(const char16_t *buf, size_t len, - char32_t *utf32_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char32_t *start{utf32_output}; - while (pos < len) { - uint16_t word = - !match_system(big_endian) ? utf16::swap_bytes(data[pos]) : data[pos]; - if ((word & 0xF800) != 0xD800) { - // No surrogate pair, extend 16-bit word to 32-bit word - *utf32_output++ = char32_t(word); - pos++; - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - if (diff > 0x3FF) { - return result(error_code::SURROGATE, pos); - } - if (pos + 1 >= len) { - return result(error_code::SURROGATE, pos); - } // minimal bound checking - uint16_t next_word = !match_system(big_endian) - ? utf16::swap_bytes(data[pos + 1]) - : data[pos + 1]; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if (diff2 > 0x3FF) { - return result(error_code::SURROGATE, pos); - } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - pos += 2; - } +/** + * Store the last N bytes of previous followed by 512-N bytes from input. 
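 * (Editor's note, illustrative only, not part of the patch: byte i of the
 * result is byte i-N of `input` for i >= N, and the low N bytes come from the
 * end of `previous`. For example, prev<1>(input, previous) lets the UTF-8
 * checker compare every byte with the byte that precedes it in the stream.)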
+ */ +template __m512i prev(__m512i input, __m512i previous) { + static_assert(N <= 32, "N must be no larger than 32"); + const __m512i movemask = + _mm512_setr_epi32(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); + const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous); +#if SIMDUTF_GCC8 || SIMDUTF_GCC9 + constexpr int shift = 16 - N; // workaround for GCC8,9 + return _mm512_alignr_epi8(input, rotated, shift); +#else + return _mm512_alignr_epi8(input, rotated, 16 - N); +#endif // SIMDUTF_GCC8 || SIMDUTF_GCC9 +} + +template +__m512i shuffle_epi128(__m512i v) { + static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3"); + static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3"); + static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3"); + static_assert((idx3 >= 0 && idx3 <= 3), "idx3 must be in range 0..3"); + + constexpr unsigned shuffle = idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6); + return _mm512_shuffle_i32x4(v, v, shuffle); +} + +template constexpr __m512i broadcast_epi128(__m512i v) { + return shuffle_epi128(v); +} + +/** + * Current unused. + */ +template __m512i rotate_by_N_epi8(const __m512i input) { + + // lanes order: 1, 2, 3, 0 => 0b00_11_10_01 + const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39); + + return _mm512_alignr_epi8(permuted, input, N); +} + +/* + expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`) + stored at separate 32-bit lanes. + + For each lane we have also a character class (`char_class), given in form + 0x8080800N, where N is 4 highest bits from the leading byte; 0x80 resets + corresponding bytes during pshufb. +*/ +simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, + __m512i utf8) { + /* + Input: + - utf8: bytes stored at separate 32-bit code units + - valid: which code units have valid UTF-8 characters + + Bit layout of single word. We show 4 cases for each possible + UTF-8 character encoding. The `?` denotes bits we must not + assume their value. + + |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char + |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char + |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char + |????.????|????.????|????.????|0aaa.aaaa| ASCII char + byte 3 byte 2 byte 1 byte 0 + */ + + /* 1. Reset control bits of continuation bytes and the MSB + of the leading byte; this makes all bytes unsigned (and + does not alter ASCII char). + + |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char + |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char + |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char + |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char + ^^ ^^ ^^ ^ + */ + __m512i values; + const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f); + values = _mm512_and_si512(utf8, v_3f3f_3f7f); + + /* 2. Swap and join fields A-B and C-D + + |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char + |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char + |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char + |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */ + const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140); + values = _mm512_maddubs_epi16(values, v_0140_0140); + + /* 3. 
Swap and join fields AB & CD + + |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char + |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char + |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char + |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */ + const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000); + values = _mm512_madd_epi16(values, v_0001_1000); + + /* 4. Shift left the values by variable amounts to reset highest UTF-8 bits + |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11 + |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10 + |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9 + |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */ + { + /** pshufb + + continuation = 0 + ascii = 7 + _2_bytes = 9 + _3_bytes = 10 + _4_bytes = 11 + + shift_left_v3 = 4 * [ + ascii, # 0000 + ascii, # 0001 + ascii, # 0010 + ascii, # 0011 + ascii, # 0100 + ascii, # 0101 + ascii, # 0110 + ascii, # 0111 + continuation, # 1000 + continuation, # 1001 + continuation, # 1010 + continuation, # 1011 + _2_bytes, # 1100 + _2_bytes, # 1101 + _3_bytes, # 1110 + _4_bytes, # 1111 + ] */ + const __m512i shift_left_v3 = _mm512_setr_epi64( + 0x0707070707070707, 0x0b0a090900000000, 0x0707070707070707, + 0x0b0a090900000000, 0x0707070707070707, 0x0b0a090900000000, + 0x0707070707070707, 0x0b0a090900000000); + + const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class); + values = _mm512_sllv_epi32(values, shift); } - return result(error_code::SUCCESS, utf32_output - start); + + /* 5. Shift right the values by variable amounts to reset lowest bits + |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11 + |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16 + |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21 + |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */ + { + // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11] + const __m512i shift_right = _mm512_setr_epi64( + 0x1919191919191919, 0x0b10151500000000, 0x1919191919191919, + 0x0b10151500000000, 0x1919191919191919, 0x0b10151500000000, + 0x1919191919191919, 0x0b10151500000000); + + const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class); + values = _mm512_srlv_epi32(values, shift); + } + + return values; } -} // namespace utf16_to_utf32 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, + int &count) { + const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1); + const __m512i expand_ver2 = _mm512_setr_epi64( + 0x0403020103020100, 0x0605040305040302, 0x0807060507060504, + 0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a, + 0x000f0e0d0f0e0d0c, 0x0201000f01000f0e); + const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); + const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); + const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); + const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); + const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); + count = static_cast(count_ones(leading_bytes)); + return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, + input); +} -#endif -/* end file src/scalar/utf16_to_utf32/utf16_to_utf32.h */ +simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) { + __m512i char_class = _mm512_srli_epi32(input, 4); + /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ + const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); + const __m512i 
v_8080_8000 = _mm512_set1_epi32(0x80808000); + char_class = + _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); + return expanded_utf8_to_utf32(char_class, input); +} +/* end file src/icelake/icelake_utf8_common.inl.cpp */ +/* begin file src/icelake/icelake_macros.inl.cpp */ -/* begin file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ -#ifndef SIMDUTF_VALID_UTF8_TO_UTF16_H -#define SIMDUTF_VALID_UTF8_TO_UTF16_H +/* + This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a + UTF-8 string) and loads all possible 4-byte substring into an AVX512 + register. -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_utf16 { + For example if we have bytes abcdefgh... we create following 32-bit lanes -template -inline size_t convert_valid(const char *buf, size_t len, - char16_t *utf16_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t *start{utf16_output}; - while (pos < len) { - // try to convert the next block of 8 ASCII bytes - if (pos + 8 <= - len) { // if it is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 8; - while (pos < final_pos) { - *utf16_output++ = !match_system(big_endian) - ? char16_t(utf16::swap_bytes(buf[pos])) - : char16_t(buf[pos]); - pos++; - } - continue; - } - } - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *utf16_output++ = !match_system(big_endian) - ? char16_t(utf16::swap_bytes(leading_byte)) - : char16_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8, it should become - // a single UTF-16 word. - if (pos + 1 >= len) { - break; - } // minimal bound checking - uint16_t code_point = uint16_t(((leading_byte & 0b00011111) << 6) | - (data[pos + 1] & 0b00111111)); - if (!match_system(big_endian)) { - code_point = utf16::swap_bytes(uint16_t(code_point)); - } - *utf16_output++ = char16_t(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8, it should become - // a single UTF-16 word. - if (pos + 2 >= len) { - break; - } // minimal bound checking - uint16_t code_point = uint16_t(((leading_byte & 0b00001111) << 12) | - ((data[pos + 1] & 0b00111111) << 6) | - (data[pos + 2] & 0b00111111)); - if (!match_system(big_endian)) { - code_point = utf16::swap_bytes(uint16_t(code_point)); - } - *utf16_output++ = char16_t(code_point); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - if (pos + 3 >= len) { - break; - } // minimal bound checking - uint32_t code_point = ((leading_byte & 0b00000111) << 18) | - ((data[pos + 1] & 0b00111111) << 12) | - ((data[pos + 2] & 0b00111111) << 6) | - (data[pos + 3] & 0b00111111); - code_point -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = utf16::swap_bytes(high_surrogate); - low_surrogate = utf16::swap_bytes(low_surrogate); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - pos += 4; - } else { - // we may have a continuation but we do not do error checking - return 0; - } + [abcd|bcde|cdef|defg|efgh|...] 
+ ^ ^ + byte 0 of reg byte 63 of reg +*/ +/** pshufb + # lane{0,1,2} have got bytes: [ 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, + 11, 12, 13, 14, 15] # lane3 has got bytes: [ 16, 17, 18, 19, 4, 5, + 6, 8, 9, 10, 11, 12, 13, 14, 15] + + expand_ver2 = [ + # lane 0: + 0, 1, 2, 3, + 1, 2, 3, 4, + 2, 3, 4, 5, + 3, 4, 5, 6, + + # lane 1: + 4, 5, 6, 7, + 5, 6, 7, 8, + 6, 7, 8, 9, + 7, 8, 9, 10, + + # lane 2: + 8, 9, 10, 11, + 9, 10, 11, 12, + 10, 11, 12, 13, + 11, 12, 13, 14, + + # lane 3 order: 13, 14, 15, 16 14, 15, 16, 17, 15, 16, 17, 18, 16, + 17, 18, 19 12, 13, 14, 15, 13, 14, 15, 0, 14, 15, 0, 1, 15, 0, 1, 2, + ] +*/ + +#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED) \ + { \ + const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1); \ + const __m512i expand_ver2 = _mm512_setr_epi64( \ + 0x0403020103020100, 0x0605040305040302, 0x0807060507060504, \ + 0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a, \ + 0x000f0e0d0f0e0d0c, 0x0201000f01000f0e); \ + const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); \ + \ + __mmask16 leading_bytes; \ + const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); \ + const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); \ + const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); \ + leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); \ + \ + __m512i char_class; \ + char_class = _mm512_srli_epi32(input, 4); \ + /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ \ + const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); \ + const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); \ + char_class = \ + _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); \ + \ + const int valid_count = static_cast(count_ones(leading_bytes)); \ + const __m512i utf32 = expanded_utf8_to_utf32(char_class, input); \ + \ + const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), \ + leading_bytes, utf32); \ + \ + if (UTF32) { \ + if (MASKED) { \ + const __mmask16 valid = uint16_t((1 << valid_count) - 1); \ + _mm512_mask_storeu_epi32((__m512i *)output, valid, out); \ + } else { \ + _mm512_storeu_si512((__m512i *)output, out); \ + } \ + output += valid_count; \ + } else { \ + if (MASKED) { \ + output += utf32_to_utf16_masked( \ + byteflip, out, valid_count, reinterpret_cast(output)); \ + } else { \ + output += utf32_to_utf16( \ + byteflip, out, valid_count, reinterpret_cast(output)); \ + } \ + } \ } - return utf16_output - start; -} -} // namespace utf8_to_utf16 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED) \ + { \ + if (UTF32) { \ + if (MASKED) { \ + const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1); \ + _mm512_mask_storeu_epi32((__m512i *)output, valid_mask, INPUT); \ + } else { \ + _mm512_storeu_si512((__m512i *)output, INPUT); \ + } \ + output += VALID_COUNT; \ + } else { \ + if (MASKED) { \ + output += utf32_to_utf16_masked( \ + byteflip, INPUT, VALID_COUNT, \ + reinterpret_cast(output)); \ + } else { \ + output += \ + utf32_to_utf16(byteflip, INPUT, VALID_COUNT, \ + reinterpret_cast(output)); \ + } \ + } \ + } -#endif -/* end file src/scalar/utf8_to_utf16/valid_utf8_to_utf16.h */ -/* begin file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ -#ifndef SIMDUTF_UTF8_TO_UTF16_H -#define SIMDUTF_UTF8_TO_UTF16_H +#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) \ + if (UTF32) { \ + const __m128i t0 = _mm512_castsi512_si128(utf8); \ + const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1); \ + const __m128i t2 = 
_mm512_extracti32x4_epi32(utf8, 2); \ + const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3); \ + _mm512_storeu_si512((__m512i *)(output + 0 * 16), \ + _mm512_cvtepu8_epi32(t0)); \ + _mm512_storeu_si512((__m512i *)(output + 1 * 16), \ + _mm512_cvtepu8_epi32(t1)); \ + _mm512_storeu_si512((__m512i *)(output + 2 * 16), \ + _mm512_cvtepu8_epi32(t2)); \ + _mm512_storeu_si512((__m512i *)(output + 3 * 16), \ + _mm512_cvtepu8_epi32(t3)); \ + } else { \ + const __m256i h0 = _mm512_castsi512_si256(utf8); \ + const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1); \ + if (big_endian) { \ + _mm512_storeu_si512( \ + (__m512i *)(output + 0 * 16), \ + _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \ + _mm512_storeu_si512( \ + (__m512i *)(output + 2 * 16), \ + _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \ + } else { \ + _mm512_storeu_si512((__m512i *)(output + 0 * 16), \ + _mm512_cvtepu8_epi16(h0)); \ + _mm512_storeu_si512((__m512i *)(output + 2 * 16), \ + _mm512_cvtepu8_epi16(h1)); \ + } \ + } +/* end file src/icelake/icelake_macros.inl.cpp */ +/* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */ +// file included directly -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_utf16 { +// File contains conversion procedure from VALID UTF-8 strings. -template -inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t *start{utf16_output}; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= - len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 16; - while (pos < final_pos) { - *utf16_output++ = !match_system(big_endian) - ? char16_t(utf16::swap_bytes(buf[pos])) - : char16_t(buf[pos]); - pos++; - } - continue; - } - } +/* + valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32. - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *utf16_output++ = !match_system(big_endian) - ? char16_t(utf16::swap_bytes(leading_byte)) - : char16_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8, it should become - // a single UTF-16 word. - if (pos + 1 >= len) { - return 0; - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return 0; - } - // range check - uint32_t code_point = - (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if (code_point < 0x80 || 0x7ff < code_point) { - return 0; - } - if (!match_system(big_endian)) { - code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); - } - *utf16_output++ = char16_t(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8, it should become - // a single UTF-16 word. - if (pos + 2 >= len) { - return 0; - } // minimal bound checking + The `OUTPUT` template type decides what to do with UTF-32: store + it directly or convert into UTF-16 (with AVX512). 
- if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return 0; - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return 0; - } - // range check - uint32_t code_point = (leading_byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if (code_point < 0x800 || 0xffff < code_point || - (0xd7ff < code_point && code_point < 0xe000)) { - return 0; - } - if (!match_system(big_endian)) { - code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); - } - *utf16_output++ = char16_t(code_point); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - if (pos + 3 >= len) { - return 0; - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return 0; - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return 0; - } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { - return 0; - } + Input: + - str - valid UTF-8 string + - len - string length + - out_buffer - output buffer - // range check - uint32_t code_point = (leading_byte & 0b00000111) << 18 | - (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | - (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff || 0x10ffff < code_point) { - return 0; - } - code_point -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = utf16::swap_bytes(high_surrogate); - low_surrogate = utf16::swap_bytes(low_surrogate); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - pos += 4; - } else { - return 0; - } - } - return utf16_output - start; -} + Result: + - pair.first - the first unprocessed input byte + - pair.second - the first unprocessed output word +*/ +template +std::pair +valid_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) { + constexpr bool UTF32 = std::is_same::value; + constexpr bool UTF16 = std::is_same::value; + static_assert( + UTF32 or UTF16, + "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); + static_assert(!(UTF32 and big_endian), + "we do not currently support big-endian UTF-32"); -template -inline result convert_with_errors(const char *buf, size_t len, - char16_t *utf16_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t *start{utf16_output}; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= - len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 16; - while (pos < final_pos) { - *utf16_output++ = !match_system(big_endian) - ? char16_t(utf16::swap_bytes(buf[pos])) - : char16_t(buf[pos]); - pos++; - } - continue; - } - } - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *utf16_output++ = !match_system(big_endian) - ? char16_t(utf16::swap_bytes(leading_byte)) - : char16_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8, it should become - // a single UTF-16 word. 
- if (pos + 1 >= len) { - return result(error_code::TOO_SHORT, pos); - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - // range check - uint32_t code_point = - (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if (code_point < 0x80 || 0x7ff < code_point) { - return result(error_code::OVERLONG, pos); - } - if (!match_system(big_endian)) { - code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); - } - *utf16_output++ = char16_t(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8, it should become - // a single UTF-16 word. - if (pos + 2 >= len) { - return result(error_code::TOO_SHORT, pos); - } // minimal bound checking + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + const char *ptr = str; + const char *end = ptr + len; - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - // range check - uint32_t code_point = (leading_byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if ((code_point < 0x800) || (0xffff < code_point)) { - return result(error_code::OVERLONG, pos); - } - if (0xd7ff < code_point && code_point < 0xe000) { - return result(error_code::SURROGATE, pos); - } - if (!match_system(big_endian)) { - code_point = uint32_t(utf16::swap_bytes(uint16_t(code_point))); - } - *utf16_output++ = char16_t(code_point); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - if (pos + 3 >= len) { - return result(error_code::TOO_SHORT, pos); - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } + OUTPUT *output = dwords; + /** + * In the main loop, we consume 64 bytes per iteration, + * but we access 64 + 4 bytes. + * We check for ptr + 64 + 64 <= end because + * we want to be do maskless writes without overruns. 
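   * (Editor's note, illustrative only, not part of the patch: the loop
   * condition as written below is `end - ptr >= 64 + 4`, i.e. at least 68
   * readable bytes remain, which keeps the 4 look-ahead bytes read at
   * `ptr + 64` in bounds.)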
+ */ + while (end - ptr >= 64 + 4) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + const __m512i v_80 = _mm512_set1_epi8(char(0x80)); + const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80); + if (ascii == 0) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + continue; + } - // range check - uint32_t code_point = (leading_byte & 0b00000111) << 18 | - (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | - (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff) { - return result(error_code::OVERLONG, pos); - } - if (0x10ffff < code_point) { - return result(error_code::TOO_LARGE, pos); - } - code_point -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (code_point >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (code_point & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = utf16::swap_bytes(high_surrogate); - low_surrogate = utf16::swap_bytes(low_surrogate); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - pos += 4; + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) } else { - // we either have too many continuation bytes or an invalid leading byte - if ((leading_byte & 0b11000000) == 0b10000000) { - return result(error_code::TOO_LONG, pos); - } else { - return result(error_code::HEADER_BITS, pos); - } + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) } - } - return result(error_code::SUCCESS, utf16_output - start); -} - -/** - * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and - * we have up to len input bytes left, and we encountered some error. It is - * possible that the error is at 'buf' exactly, but it could also be in the - * previous bytes (up to 3 bytes back). - * - * prior_bytes indicates how many bytes, prior to 'buf' may belong to the - * current memory section and can be safely accessed. We prior_bytes to access - * safely up to three bytes before 'buf'. - * - * The caller is responsible to ensure that len > 0. - * - * If the error is believed to have occurred prior to 'buf', the count value - * contain in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3. - */ -template -inline result rewind_and_convert_with_errors(size_t prior_bytes, - const char *buf, size_t len, - char16_t *utf16_output) { - size_t extra_len{0}; - // We potentially need to go back in time and find a leading byte. - // In theory '3' would be sufficient, but sometimes the error can go back - // quite far. - size_t how_far_back = prior_bytes; - // size_t how_far_back = 3; // 3 bytes in the past + current position - // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; } - bool found_leading_bytes{false}; - // important: it is i <= how_far_back and not 'i < how_far_back'. 
- for (size_t i = 0; i <= how_far_back; i++) { - unsigned char byte = buf[-static_cast(i)]; - found_leading_bytes = ((byte & 0b11000000) != 0b10000000); - if (found_leading_bytes) { - if (i > 0 && byte < 128) { - // If we had to go back and the leading byte is ascii - // then we can stop right away. - return result(error_code::TOO_LONG, 0 - i + 1); - } - buf -= i; - extra_len = i; - break; + const __m512i lane3 = broadcast_epi128<3>(utf8); + int valid_count2; + __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); + uint32_t tmp1; + ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); + const __m512i lane4 = _mm512_set1_epi32(tmp1); + int valid_count3; + __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); + if (valid_count2 + valid_count3 <= 16) { + vec2 = _mm512_mask_expand_epi32( + vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3); + valid_count2 += valid_count3; + vec2 = expand_utf8_to_utf32(vec2); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + } else { + vec2 = expand_utf8_to_utf32(vec2); + vec3 = expand_utf8_to_utf32(vec3); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true) } + ptr += 4 * 16; } - // - // It is possible for this function to return a negative count in its result. - // C++ Standard Section 18.1 defines size_t is in which is described - // in C Standard as . C Standard Section 4.1.5 defines size_t as an - // unsigned integral type of the result of the sizeof operator - // - // An unsigned type will simply wrap round arithmetically (well defined). - // - if (!found_leading_bytes) { - // If how_far_back == 3, we may have four consecutive continuation bytes!!! - // [....] [continuation] [continuation] [continuation] | [buf is - // continuation] Or we possibly have a stream that does not start with a - // leading byte. 
- return result(error_code::TOO_LONG, 0 - how_far_back); - } - result res = convert_with_errors(buf, len + extra_len, utf16_output); - if (res.error) { - res.count -= extra_len; + + if (end - ptr >= 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + const __m512i v_80 = _mm512_set1_epi8(char(0x80)); + const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80); + if (ascii == 0) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + } else { + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + } else { + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) + } + + const __m512i lane3 = broadcast_epi128<3>(utf8); + SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + + ptr += 3 * 16; + } } - return res; + return {ptr, output}; } -} // namespace utf8_to_utf16 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +using utf8_to_utf16_result = std::pair; +/* end file src/icelake/icelake_from_valid_utf8.inl.cpp */ +/* begin file src/icelake/icelake_utf8_validation.inl.cpp */ +// file included directly -#endif -/* end file src/scalar/utf8_to_utf16/utf8_to_utf16.h */ +simdutf_really_inline __m512i check_special_cases(__m512i input, + const __m512i prev1) { + __m512i mask1 = _mm512_setr_epi64(0x0202020202020202, 0x4915012180808080, + 0x0202020202020202, 0x4915012180808080, + 0x0202020202020202, 0x4915012180808080, + 0x0202020202020202, 0x4915012180808080); + const __m512i v_0f = _mm512_set1_epi8(0x0f); + __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f); -/* begin file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */ -#ifndef SIMDUTF_VALID_UTF8_TO_UTF32_H -#define SIMDUTF_VALID_UTF8_TO_UTF32_H + __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1); + __m512i mask2 = _mm512_setr_epi64(0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb, + 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb, + 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb, + 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb); + __m512i index2 = _mm512_and_si512(prev1, v_0f); -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_utf32 { + __m512i byte_1_low = _mm512_shuffle_epi8(mask2, index2); + __m512i mask3 = + _mm512_setr_epi64(0x101010101010101, 0x1010101babaaee6, 0x101010101010101, + 0x1010101babaaee6, 0x101010101010101, 0x1010101babaaee6, + 0x101010101010101, 0x1010101babaaee6); + __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f); + __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3); + return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128); +} -inline size_t convert_valid(const char *buf, size_t len, - char32_t *utf32_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char32_t *start{utf32_output}; - while (pos < len) { - // try to convert the next block of 8 ASCII bytes - if (pos + 8 <= - len) { // if it 
is safe to read 8 more bytes, check that they are ascii - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 8; - while (pos < final_pos) { - *utf32_output++ = char32_t(buf[pos]); - pos++; - } - continue; - } - } - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *utf32_output++ = char32_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8 - if (pos + 1 >= len) { - break; - } // minimal bound checking - *utf32_output++ = char32_t(((leading_byte & 0b00011111) << 6) | - (data[pos + 1] & 0b00111111)); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8 - if (pos + 2 >= len) { - break; - } // minimal bound checking - *utf32_output++ = char32_t(((leading_byte & 0b00001111) << 12) | - ((data[pos + 1] & 0b00111111) << 6) | - (data[pos + 2] & 0b00111111)); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - if (pos + 3 >= len) { - break; - } // minimal bound checking - uint32_t code_word = ((leading_byte & 0b00000111) << 18) | - ((data[pos + 1] & 0b00111111) << 12) | - ((data[pos + 2] & 0b00111111) << 6) | - (data[pos + 3] & 0b00111111); - *utf32_output++ = char32_t(code_word); - pos += 4; - } else { - // we may have a continuation but we do not do error checking - return 0; - } - } - return utf32_output - start; +simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input, + const __m512i prev_input, + const __m512i sc) { + __m512i prev2 = prev<2>(input, prev_input); + __m512i prev3 = prev<3>(input, prev_input); + __m512i is_third_byte = _mm512_subs_epu8( + prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0 + __m512i is_fourth_byte = _mm512_subs_epu8( + prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0 + __m512i is_third_or_fourth_byte = + _mm512_or_si512(is_third_byte, is_fourth_byte); + const __m512i v_7f = _mm512_set1_epi8(char(0x7f)); + is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte); + // We want to compute (is_third_or_fourth_byte AND v80) XOR sc. + const __m512i v_80 = _mm512_set1_epi8(char(0x80)); + return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc, + 0b1101010); + //__m512i is_third_or_fourth_byte_mask = + //_mm512_and_si512(is_third_or_fourth_byte, v_80); return + // _mm512_xor_si512(is_third_or_fourth_byte_mask, sc); +} +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline __m512i is_incomplete(const __m512i input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + __m512i max_value = _mm512_setr_epi64(0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xbfdfefffffffffff); + return _mm512_subs_epu8(input, max_value); } -} // namespace utf8_to_utf32 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +struct avx512_utf8_checker { + // If this is nonzero, there has been a UTF-8 error. 
+ __m512i error{}; -#endif -/* end file src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h */ -/* begin file src/scalar/utf8_to_utf32/utf8_to_utf32.h */ -#ifndef SIMDUTF_UTF8_TO_UTF32_H -#define SIMDUTF_UTF8_TO_UTF32_H + // The last input we received + __m512i prev_input_block{}; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + __m512i prev_incomplete{}; -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_utf32 { + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const __m512i input, + const __m512i prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + __m512i prev1 = prev<1>(input, prev_input); + __m512i sc = check_special_cases(input, prev1); + this->error = _mm512_or_si512( + check_multibyte_lengths(input, prev_input, sc), this->error); + } -inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char32_t *start{utf32_output}; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= - len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 16; - while (pos < final_pos) { - *utf32_output++ = char32_t(buf[pos]); - pos++; - } - continue; - } + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. + this->error = _mm512_or_si512(this->error, this->prev_incomplete); + } + + // returns true if ASCII. + simdutf_really_inline bool check_next_input(const __m512i input) { + const __m512i v_80 = _mm512_set1_epi8(char(0x80)); + const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80); + if (ascii == 0) { + this->error = _mm512_or_si512(this->error, this->prev_incomplete); + return true; + } else { + this->check_utf8_bytes(input, this->prev_input_block); + this->prev_incomplete = is_incomplete(input); + this->prev_input_block = input; + return false; } - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *utf32_output++ = char32_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8 - if (pos + 1 >= len) { - return 0; - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return 0; - } - // range check - uint32_t code_point = - (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if (code_point < 0x80 || 0x7ff < code_point) { - return 0; - } - *utf32_output++ = char32_t(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8 - if (pos + 2 >= len) { - return 0; - } // minimal bound checking + } + // do not forget to call check_eof! 
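  // Typical usage, as in validating_utf8_to_fixed_length further down
  // (sketch only): construct an avx512_utf8_checker, feed each full 64-byte
  // block to check_next_input(), feed the final partial block via a masked
  // load, then call check_eof() before querying errors().
  //
  //   avx512_utf8_checker checker{};
  //   while (end - ptr >= 64) {
  //     checker.check_next_input(_mm512_loadu_si512((const __m512i *)ptr));
  //     ptr += 64;
  //   }
  //   if (ptr != end) {
  //     checker.check_next_input(_mm512_maskz_loadu_epi8(
  //         ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr));
  //   }
  //   checker.check_eof();
  //   bool ok = !checker.errors();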
+ simdutf_really_inline bool errors() const { + return _mm512_test_epi8_mask(this->error, this->error) != 0; + } +}; // struct avx512_utf8_checker +/* end file src/icelake/icelake_utf8_validation.inl.cpp */ +/* begin file src/icelake/icelake_from_utf8.inl.cpp */ +// file included directly - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return 0; - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return 0; - } - // range check - uint32_t code_point = (leading_byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if (code_point < 0x800 || 0xffff < code_point || - (0xd7ff < code_point && code_point < 0xe000)) { - return 0; - } - *utf32_output++ = char32_t(code_point); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - if (pos + 3 >= len) { - return 0; - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return 0; - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return 0; - } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { - return 0; - } +// File contains conversion procedure from possibly invalid UTF-8 strings. - // range check - uint32_t code_point = (leading_byte & 0b00000111) << 18 | - (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | - (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff || 0x10ffff < code_point) { - return 0; - } - *utf32_output++ = char32_t(code_point); - pos += 4; +/** + * Attempts to convert up to len 1-byte code units from in (in UTF-8 format) to + * out. + * Returns the position of the input and output after the processing is + * completed. Upon error, the output is set to null. + */ + +template +utf8_to_utf16_result +fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) { + const char *const final_in = in + len; + bool result = true; + while (result) { + if (final_in - in >= 64) { + result = process_block_utf8_to_utf16( + in, out, final_in - in); + } else if (in < final_in) { + result = process_block_utf8_to_utf16( + in, out, final_in - in); } else { - return 0; + break; } } - return utf32_output - start; + if (!result) { + out = nullptr; + } + return std::make_pair(in, out); } -inline result convert_with_errors(const char *buf, size_t len, - char32_t *utf32_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char32_t *start{utf32_output}; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= - len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; - if ((v & 0x8080808080808080) == 0) { - size_t final_pos = pos + 16; - while (pos < final_pos) { - *utf32_output++ = char32_t(buf[pos]); - pos++; - } - continue; - } +template +simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, + size_t len, + char16_t *out) { + const char *const init_in = in; + const char16_t *const init_out = out; + const char *const final_in = in + len; + bool result = true; + while (result) { + if (final_in - in >= 64) { + result = process_block_utf8_to_utf16( + in, out, final_in - in); + } else if (in < final_in) { + result = process_block_utf8_to_utf16( + in, out, final_in - in); + } else { + break; } - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one 
ASCII byte !!! - *utf32_output++ = char32_t(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == 0b11000000) { - // We have a two-byte UTF-8 - if (pos + 1 >= len) { - return result(error_code::TOO_SHORT, pos); - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - // range check - uint32_t code_point = - (leading_byte & 0b00011111) << 6 | (data[pos + 1] & 0b00111111); - if (code_point < 0x80 || 0x7ff < code_point) { - return result(error_code::OVERLONG, pos); + } + if (!result) { + size_t pos = size_t(in - init_in); + if (pos < len && (init_in[pos] & 0xc0) == 0x80 && pos >= 64) { + // We must check whether we are the fourth continuation byte + bool c1 = (init_in[pos - 1] & 0xc0) == 0x80; + bool c2 = (init_in[pos - 2] & 0xc0) == 0x80; + bool c3 = (init_in[pos - 3] & 0xc0) == 0x80; + if (c1 && c2 && c3) { + return {simdutf::TOO_LONG, pos}; } - *utf32_output++ = char32_t(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8 - if (pos + 2 >= len) { - return result(error_code::TOO_SHORT, pos); - } // minimal bound checking + } + // rewind_and_convert_with_errors will seek a potential error from in + // onward, with the ability to go back up to in - init_in bytes, and read + // final_in - in bytes forward. + simdutf::result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + in - init_in, in, final_in - in, out); + res.count += (in - init_in); + return res; + } else { + return simdutf::result(error_code::SUCCESS, out - init_out); + } +} - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - // range check - uint32_t code_point = (leading_byte & 0b00001111) << 12 | - (data[pos + 1] & 0b00111111) << 6 | - (data[pos + 2] & 0b00111111); - if (code_point < 0x800 || 0xffff < code_point) { - return result(error_code::OVERLONG, pos); - } - if (0xd7ff < code_point && code_point < 0xe000) { - return result(error_code::SURROGATE, pos); - } - *utf32_output++ = char32_t(code_point); - pos += 3; - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - if (pos + 3 >= len) { - return result(error_code::TOO_SHORT, pos); - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 2] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } - if ((data[pos + 3] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } +template +// todo: replace with the utf-8 to utf-16 routine adapted to utf-32. This code +// is legacy. 
+std::pair +validating_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) { + constexpr bool UTF32 = std::is_same::value; + constexpr bool UTF16 = std::is_same::value; + static_assert( + UTF32 or UTF16, + "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); + static_assert(!(UTF32 and big_endian), + "we do not currently support big-endian UTF-32"); - // range check - uint32_t code_point = (leading_byte & 0b00000111) << 18 | - (data[pos + 1] & 0b00111111) << 12 | - (data[pos + 2] & 0b00111111) << 6 | - (data[pos + 3] & 0b00111111); - if (code_point <= 0xffff) { - return result(error_code::OVERLONG, pos); - } - if (0x10ffff < code_point) { - return result(error_code::TOO_LARGE, pos); - } - *utf32_output++ = char32_t(code_point); - pos += 4; + const char *ptr = str; + const char *end = ptr + len; + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + OUTPUT *output = dwords; + avx512_utf8_checker checker{}; + /** + * In the main loop, we consume 64 bytes per iteration, + * but we access 64 + 4 bytes. + * We use masked writes to avoid overruns, see + * https://github.com/simdutf/simdutf/issues/471 + */ + while (end - ptr >= 64 + 4) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + if (checker.check_next_input(utf8)) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + continue; + } + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + } else { + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) + } + const __m512i lane3 = broadcast_epi128<3>(utf8); + int valid_count2; + __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); + uint32_t tmp1; + ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); + const __m512i lane4 = _mm512_set1_epi32(tmp1); + int valid_count3; + __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); + if (valid_count2 + valid_count3 <= 16) { + vec2 = _mm512_mask_expand_epi32( + vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3); + valid_count2 += valid_count3; + vec2 = expand_utf8_to_utf32(vec2); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) } else { - // we either have too many continuation bytes or an invalid leading byte - if ((leading_byte & 0b11000000) == 0b10000000) { - return result(error_code::TOO_LONG, pos); - } else { - return result(error_code::HEADER_BITS, pos); - } + vec2 = expand_utf8_to_utf32(vec2); + vec3 = expand_utf8_to_utf32(vec3); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true) } + ptr += 4 * 16; } - return result(error_code::SUCCESS, utf32_output - start); -} + const char *validatedptr = ptr; // validated up 
to ptr -/** - * When rewind_and_convert_with_errors is called, we are pointing at 'buf' and - * we have up to len input bytes left, and we encountered some error. It is - * possible that the error is at 'buf' exactly, but it could also be in the - * previous bytes location (up to 3 bytes back). - * - * prior_bytes indicates how many bytes, prior to 'buf' may belong to the - * current memory section and can be safely accessed. We prior_bytes to access - * safely up to three bytes before 'buf'. - * - * The caller is responsible to ensure that len > 0. - * - * If the error is believed to have occurred prior to 'buf', the count value - * contain in the result will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3. - */ -inline result rewind_and_convert_with_errors(size_t prior_bytes, - const char *buf, size_t len, - char32_t *utf32_output) { - size_t extra_len{0}; - // We potentially need to go back in time and find a leading byte. - size_t how_far_back = 3; // 3 bytes in the past + current position - if (how_far_back > prior_bytes) { - how_far_back = prior_bytes; - } - bool found_leading_bytes{false}; - // important: it is i <= how_far_back and not 'i < how_far_back'. - for (size_t i = 0; i <= how_far_back; i++) { - unsigned char byte = buf[-static_cast(i)]; - found_leading_bytes = ((byte & 0b11000000) != 0b10000000); - if (found_leading_bytes) { - if (i > 0 && byte < 128) { - // If we had to go back and the leading byte is ascii - // then we can stop right away. - return result(error_code::TOO_LONG, 0 - i + 1); + // For the final pass, we validate 64 bytes, but we only transcode + // 3*16 bytes, so we may end up double-validating 16 bytes. + if (end - ptr >= 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + if (checker.check_next_input(utf8)) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + } else { + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + } else { + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) } - buf -= i; - extra_len = i; - break; + + const __m512i lane3 = broadcast_epi128<3>(utf8); + SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + + ptr += 3 * 16; } + validatedptr += 4 * 16; } - // - // It is possible for this function to return a negative count in its result. - // C++ Standard Section 18.1 defines size_t is in which is described - // in C Standard as . C Standard Section 4.1.5 defines size_t as an - // unsigned integral type of the result of the sizeof operator - // - // An unsigned type will simply wrap round arithmetically (well defined). - // - if (!found_leading_bytes) { - // If how_far_back == 3, we may have four consecutive continuation bytes!!! - // [....] [continuation] [continuation] [continuation] | [buf is - // continuation] Or we possibly have a stream that does not start with a - // leading byte. 
- return result(error_code::TOO_LONG, 0 - how_far_back); + if (end != validatedptr) { + const __m512i utf8 = + _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)), + (const __m512i *)validatedptr); + checker.check_next_input(utf8); } - - result res = convert_with_errors(buf, len + extra_len, utf32_output); - if (res.error) { - res.count -= extra_len; + checker.check_eof(); + if (checker.errors()) { + return {ptr, nullptr}; // We found an error. } - return res; + return {ptr, output}; } -} // namespace utf8_to_utf32 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf - -#endif -/* end file src/scalar/utf8_to_utf32/utf8_to_utf32.h */ +// Like validating_utf8_to_fixed_length but returns as soon as an error is +// identified todo: replace with the utf-8 to utf-16 routine adapted to utf-32. +// This code is legacy. +template +std::tuple +validating_utf8_to_fixed_length_with_constant_checks(const char *str, + size_t len, + OUTPUT *dwords) { + constexpr bool UTF32 = std::is_same::value; + constexpr bool UTF16 = std::is_same::value; + static_assert( + UTF32 or UTF16, + "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); + static_assert(!(UTF32 and big_endian), + "we do not currently support big-endian UTF-32"); -/* begin file src/scalar/latin1_to_utf16/latin1_to_utf16.h */ -#ifndef SIMDUTF_LATIN1_TO_UTF16_H -#define SIMDUTF_LATIN1_TO_UTF16_H + const char *ptr = str; + const char *end = ptr + len; + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + OUTPUT *output = dwords; + avx512_utf8_checker checker{}; + /** + * In the main loop, we consume 64 bytes per iteration, + * but we access 64 + 4 bytes. + */ + while (end - ptr >= 4 + 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + bool ascii = checker.check_next_input(utf8); + if (checker.errors()) { + return {ptr, output, false}; // We found an error. 
+ } + if (ascii) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + continue; + } + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + } else { + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) + } + const __m512i lane3 = broadcast_epi128<3>(utf8); + int valid_count2; + __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); + uint32_t tmp1; + ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); + const __m512i lane4 = _mm512_set1_epi32(tmp1); + int valid_count3; + __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); + if (valid_count2 + valid_count3 <= 16) { + vec2 = _mm512_mask_expand_epi32( + vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3); + valid_count2 += valid_count3; + vec2 = expand_utf8_to_utf32(vec2); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + } else { + vec2 = expand_utf8_to_utf32(vec2); + vec3 = expand_utf8_to_utf32(vec3); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true) + } + ptr += 4 * 16; + } + const char *validatedptr = ptr; // validated up to ptr -namespace simdutf { -namespace scalar { -namespace { -namespace latin1_to_utf16 { + // For the final pass, we validate 64 bytes, but we only transcode + // 3*16 bytes, so we may end up double-validating 16 bytes. + if (end - ptr >= 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + bool ascii = checker.check_next_input(utf8); + if (checker.errors()) { + return {ptr, output, false}; // We found an error. 
+ } + if (ascii) { + SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) + output += 64; + ptr += 64; + } else { + const __m512i lane0 = broadcast_epi128<0>(utf8); + const __m512i lane1 = broadcast_epi128<1>(utf8); + int valid_count0; + __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); + const __m512i lane2 = broadcast_epi128<2>(utf8); + int valid_count1; + __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); + if (valid_count0 + valid_count1 <= 16) { + vec0 = _mm512_mask_expand_epi32( + vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); + valid_count0 += valid_count1; + vec0 = expand_utf8_to_utf32(vec0); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + } else { + vec0 = expand_utf8_to_utf32(vec0); + vec1 = expand_utf8_to_utf32(vec1); + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) + SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) + } -template -inline size_t convert(const char *buf, size_t len, char16_t *utf16_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t *start{utf16_output}; + const __m512i lane3 = broadcast_epi128<3>(utf8); + SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) - while (pos < len) { - uint16_t word = - uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point - *utf16_output++ = - char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word)); - pos++; + ptr += 3 * 16; + } + validatedptr += 4 * 16; } - - return utf16_output - start; + if (end != validatedptr) { + const __m512i utf8 = + _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)), + (const __m512i *)validatedptr); + checker.check_next_input(utf8); + } + checker.check_eof(); + if (checker.errors()) { + return {ptr, output, false}; // We found an error. + } + return {ptr, output, true}; } +/* end file src/icelake/icelake_from_utf8.inl.cpp */ +/* begin file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */ +// file included directly -template -inline result convert_with_errors(const char *buf, size_t len, - char16_t *utf16_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char16_t *start{utf16_output}; +// File contains conversion procedure from possibly invalid UTF-8 strings. - while (pos < len) { - uint16_t word = - uint16_t(data[pos]); // extend Latin-1 char to 16-bit Unicode code point - *utf16_output++ = - char16_t(match_system(big_endian) ? word : utf16::swap_bytes(word)); - pos++; +template +simdutf_really_inline size_t process_block_from_utf8_to_latin1( + const char *buf, size_t len, char *latin_output, __m512i minus64, + __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) { + __mmask64 load_mask = + is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; + __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); + __mmask64 nonascii = _mm512_movepi8_mask(input); + if (nonascii == 0) { + if (*next_leading_ptr) { // If we ended with a leading byte, it is an error. + return 0; // Indicates error + } + is_remaining + ? 
_mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input) + : _mm512_storeu_si512((__m512i *)latin_output, input); + return len; } - return result(error_code::SUCCESS, utf16_output - start); -} - -} // namespace latin1_to_utf16 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf - -#endif -/* end file src/scalar/latin1_to_utf16/latin1_to_utf16.h */ -/* begin file src/scalar/latin1_to_utf32/latin1_to_utf32.h */ -#ifndef SIMDUTF_LATIN1_TO_UTF32_H -#define SIMDUTF_LATIN1_TO_UTF32_H + const __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); -namespace simdutf { -namespace scalar { -namespace { -namespace latin1_to_utf32 { + __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); + __mmask64 invalid_leading_bytes = + _mm512_mask_cmpgt_epu8_mask(leading, highbits, one); -inline size_t convert(const char *buf, size_t len, char32_t *utf32_output) { - const unsigned char *data = reinterpret_cast(buf); - char32_t *start{utf32_output}; - for (size_t i = 0; i < len; i++) { - *utf32_output++ = (char32_t)data[i]; + if (invalid_leading_bytes) { + return 0; // Indicates error } - return utf32_output - start; -} -} // namespace latin1_to_utf32 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + __mmask64 leading_shift = (leading << 1) | *next_leading_ptr; -#endif -/* end file src/scalar/latin1_to_utf32/latin1_to_utf32.h */ + if ((nonascii ^ leading) != leading_shift) { + return 0; // Indicates error + } -/* begin file src/scalar/utf8_to_latin1/utf8_to_latin1.h */ -#ifndef SIMDUTF_UTF8_TO_LATIN1_H -#define SIMDUTF_UTF8_TO_LATIN1_H + const __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); + input = + _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_latin1 { + __mmask64 retain = ~leading & load_mask; + __m512i output = _mm512_maskz_compress_epi8(retain, input); + int64_t written_out = count_ones(retain); + if (written_out == 0) { + return 0; // Indicates error + } + *next_bit6_ptr = bit6 >> 63; + *next_leading_ptr = leading >> 63; -inline size_t convert(const char *buf, size_t len, char *latin_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char *start{latin_output}; + __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out); - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= - len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 - // 1000 1000 .... etc - if ((v & 0x8080808080808080) == - 0) { // if NONE of these are set, e.g. all of them are zero, then - // everything is ASCII - size_t final_pos = pos + 16; - while (pos < final_pos) { - *latin_output++ = char(buf[pos]); - pos++; - } - continue; - } - } + _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output); - // suppose it is not an all ASCII byte sequence - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! 
- *latin_output++ = char(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == - 0b11000000) { // the first three bits indicate: - // We have a two-byte UTF-8 - if (pos + 1 >= len) { - return 0; - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return 0; - } // checks if the next byte is a valid continuation byte in UTF-8. A - // valid continuation byte starts with 10. - // range check - - uint32_t code_point = - (leading_byte & 0b00011111) << 6 | - (data[pos + 1] & - 0b00111111); // assembles the Unicode code point from the two bytes. - // It does this by discarding the leading 110 and 10 - // bits from the two bytes, shifting the remaining bits - // of the first byte, and then combining the results - // with a bitwise OR operation. - if (code_point < 0x80 || 0xFF < code_point) { - return 0; // We only care about the range 129-255 which is Non-ASCII - // latin1 characters. A code_point beneath 0x80 is invalid as - // it is already covered by bytes whose leading bit is zero. - } - *latin_output++ = char(code_point); - pos += 2; - } else { - return 0; - } - } - return latin_output - start; + return written_out; } -inline result convert_with_errors(const char *buf, size_t len, - char *latin_output) { - const uint8_t *data = reinterpret_cast(buf); - size_t pos = 0; - char *start{latin_output}; - - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= - len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | v2}; // We are only interested in these bits: 1000 1000 - // 1000 1000...etc - if ((v & 0x8080808080808080) == - 0) { // if NONE of these are set, e.g. all of them are zero, then - // everything is ASCII - size_t final_pos = pos + 16; - while (pos < final_pos) { - *latin_output++ = char(buf[pos]); - pos++; - } - continue; - } - } - // suppose it is not an all ASCII byte sequence - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *latin_output++ = char(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == - 0b11000000) { // the first three bits indicate: - // We have a two-byte UTF-8 - if (pos + 1 >= len) { - return result(error_code::TOO_SHORT, pos); - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return result(error_code::TOO_SHORT, pos); - } // checks if the next byte is a valid continuation byte in UTF-8. A - // valid continuation byte starts with 10. - // range check - - uint32_t code_point = - (leading_byte & 0b00011111) << 6 | - (data[pos + 1] & - 0b00111111); // assembles the Unicode code point from the two bytes. - // It does this by discarding the leading 110 and 10 - // bits from the two bytes, shifting the remaining bits - // of the first byte, and then combining the results - // with a bitwise OR operation. 
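      // Illustrative worked example of the assembly above (not part of
      // simdutf): for the two-byte sequence 0xC3 0xA9 ("é", U+00E9),
      //   (0xC3 & 0b00011111) << 6 | (0xA9 & 0b00111111) == 0xE9,
      // which passes the range checks below and is stored as Latin-1 0xE9.
      // For 0xC5 0x93 ("œ", U+0153) the same arithmetic yields 0x153 > 0xFF,
      // so the TOO_LARGE branch below reports an error instead.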
- if (code_point < 0x80) { - return result(error_code::OVERLONG, pos); - } - if (0xFF < code_point) { - return result(error_code::TOO_LARGE, pos); - } // We only care about the range 129-255 which is Non-ASCII latin1 - // characters - *latin_output++ = char(code_point); - pos += 2; - } else if ((leading_byte & 0b11110000) == 0b11100000) { - // We have a three-byte UTF-8 - return result(error_code::TOO_LARGE, pos); - } else if ((leading_byte & 0b11111000) == 0b11110000) { // 0b11110000 - // we have a 4-byte UTF-8 word. - return result(error_code::TOO_LARGE, pos); - } else { - // we either have too many continuation bytes or an invalid leading byte - if ((leading_byte & 0b11000000) == 0b10000000) { - return result(error_code::TOO_LONG, pos); - } +size_t utf8_to_latin1_avx512(const char *&inbuf, size_t len, + char *&inlatin_output) { + const char *buf = inbuf; + char *latin_output = inlatin_output; + char *start = latin_output; + size_t pos = 0; + __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000 + __m512i one = _mm512_set1_epi8(1); + __mmask64 next_leading = 0; + __mmask64 next_bit6 = 0; - return result(error_code::HEADER_BITS, pos); + while (pos + 64 <= len) { + size_t written = process_block_from_utf8_to_latin1( + buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6); + if (written == 0) { + inlatin_output = latin_output; + inbuf = buf + pos - next_leading; + return 0; // Indicates error at pos or after, or just before pos (too + // short error) } + latin_output += written; + pos += 64; } - return result(error_code::SUCCESS, latin_output - start); -} -inline result rewind_and_convert_with_errors(size_t prior_bytes, - const char *buf, size_t len, - char *latin1_output) { - size_t extra_len{0}; - // We potentially need to go back in time and find a leading byte. - // In theory '3' would be sufficient, but sometimes the error can go back - // quite far. - size_t how_far_back = prior_bytes; - // size_t how_far_back = 3; // 3 bytes in the past + current position - // if(how_far_back >= prior_bytes) { how_far_back = prior_bytes; } - bool found_leading_bytes{false}; - // important: it is i <= how_far_back and not 'i < how_far_back'. - for (size_t i = 0; i <= how_far_back; i++) { - unsigned char byte = buf[-static_cast(i)]; - found_leading_bytes = ((byte & 0b11000000) != 0b10000000); - if (found_leading_bytes) { - if (i > 0 && byte < 128) { - // If we had to go back and the leading byte is ascii - // then we can stop right away. - return result(error_code::TOO_LONG, 0 - i + 1); - } - buf -= i; - extra_len = i; - break; + if (pos < len) { + size_t remaining = len - pos; + size_t written = process_block_from_utf8_to_latin1( + buf + pos, remaining, latin_output, minus64, one, &next_leading, + &next_bit6); + if (written == 0) { + inbuf = buf + pos - next_leading; + inlatin_output = latin_output; + return 0; // Indicates error at pos or after, or just before pos (too + // short error) } + latin_output += written; } - // - // It is possible for this function to return a negative count in its result. - // C++ Standard Section 18.1 defines size_t is in which is described - // in C Standard as . C Standard Section 4.1.5 defines size_t as an - // unsigned integral type of the result of the sizeof operator - // - // An unsigned type will simply wrap round arithmetically (well defined). - // - if (!found_leading_bytes) { - // If how_far_back == 3, we may have four consecutive continuation bytes!!! - // [....] 
[continuation] [continuation] [continuation] | [buf is - // continuation] Or we possibly have a stream that does not start with a - // leading byte. - return result(error_code::TOO_LONG, 0 - how_far_back); - } - result res = convert_with_errors(buf, len + extra_len, latin1_output); - if (res.error) { - res.count -= extra_len; + if (next_leading) { + inbuf = buf + len - next_leading; + inlatin_output = latin_output; + return 0; // Indicates error at end of buffer } - return res; + inlatin_output = latin_output; + inbuf += len; + return size_t(latin_output - start); } +/* end file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */ +/* begin file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */ +// file included directly -} // namespace utf8_to_latin1 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf +// File contains conversion procedure from valid UTF-8 strings. -#endif -/* end file src/scalar/utf8_to_latin1/utf8_to_latin1.h */ -/* begin file src/scalar/utf16_to_latin1/utf16_to_latin1.h */ -#ifndef SIMDUTF_UTF16_TO_LATIN1_H -#define SIMDUTF_UTF16_TO_LATIN1_H +template +simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1( + const char *buf, size_t len, char *latin_output, __m512i minus64, + __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) { + __mmask64 load_mask = + is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; + __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); + __mmask64 nonascii = _mm512_movepi8_mask(input); -namespace simdutf { -namespace scalar { -namespace { -namespace utf16_to_latin1 { + if (nonascii == 0) { + is_remaining + ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input) + : _mm512_storeu_si512((__m512i *)latin_output, input); + return len; + } -#include // for std::memcpy + __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); -template -inline size_t convert(const char16_t *buf, size_t len, char *latin_output) { - if (len == 0) { - return 0; + __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); + + *next_leading_ptr = leading >> 63; + + __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); + input = + _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); + *next_bit6_ptr = bit6 >> 63; + + __mmask64 retain = ~leading & load_mask; + __m512i output = _mm512_maskz_compress_epi8(retain, input); + int64_t written_out = count_ones(retain); + if (written_out == 0) { + return 0; // Indicates error } - const uint16_t *data = reinterpret_cast(buf); + __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out); + // Optimization opportunity: sometimes, masked writes are not needed. + _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output); + return written_out; +} + +size_t valid_utf8_to_latin1_avx512(const char *buf, size_t len, + char *latin_output) { + char *start = latin_output; size_t pos = 0; - char *current_write = latin_output; - uint16_t word = 0; - uint16_t too_large = 0; + __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 1100 0000 + __m512i one = _mm512_set1_epi8(1); + __mmask64 next_leading = 0; + __mmask64 next_bit6 = 0; - while (pos < len) { - word = !match_system(big_endian) ? 
utf16::swap_bytes(data[pos]) : data[pos]; - too_large |= word; - *current_write++ = char(word & 0xFF); - pos++; + while (pos + 64 <= len) { + size_t written = process_valid_block_from_utf8_to_latin1( + buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6); + latin_output += written; + pos += 64; } - if ((too_large & 0xFF00) != 0) { - return 0; + + if (pos < len) { + size_t remaining = len - pos; + size_t written = process_valid_block_from_utf8_to_latin1( + buf + pos, remaining, latin_output, minus64, one, &next_leading, + &next_bit6); + latin_output += written; } - return current_write - latin_output; + return (size_t)(latin_output - start); } - +/* end file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */ +/* begin file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */ +// file included directly template -inline result convert_with_errors(const char16_t *buf, size_t len, - char *latin_output) { - if (len == 0) { - return result(error_code::SUCCESS, 0); +size_t icelake_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + __m512i v_0xFF = _mm512_set1_epi16(0xff); + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, + 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + while (end - buf >= 32) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { + return 0; + } + _mm256_storeu_si256( + (__m256i *)latin1_output, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 32; + buf += 32; } - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char *start{latin_output}; - uint16_t word; - - while (pos < len) { - if (pos + 16 <= len) { // if it is safe to read 32 more bytes, check that - // they are Latin1 - uint64_t v1, v2, v3, v4; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - ::memcpy(&v2, data + pos + 4, sizeof(uint64_t)); - ::memcpy(&v3, data + pos + 8, sizeof(uint64_t)); - ::memcpy(&v4, data + pos + 12, sizeof(uint64_t)); - - if (!match_system(big_endian)) { - v1 = (v1 >> 8) | (v1 << (64 - 8)); - } - if (!match_system(big_endian)) { - v2 = (v2 >> 8) | (v2 << (64 - 8)); - } - if (!match_system(big_endian)) { - v3 = (v3 >> 8) | (v3 << (64 - 8)); - } - if (!match_system(big_endian)) { - v4 = (v4 >> 8) | (v4 << (64 - 8)); - } - - if (((v1 | v2 | v3 | v4) & 0xFF00FF00FF00FF00) == 0) { - size_t final_pos = pos + 16; - while (pos < final_pos) { - *latin_output++ = !match_system(big_endian) - ? char(utf16::swap_bytes(data[pos])) - : char(data[pos]); - pos++; - } - continue; - } + if (buf < end) { + uint32_t mask(uint32_t(1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi16(mask, buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); } - word = !match_system(big_endian) ? 
utf16::swap_bytes(data[pos]) : data[pos]; - if ((word & 0xFF00) == 0) { - *latin_output++ = char(word & 0xFF); - pos++; - } else { - return result(error_code::TOO_LARGE, pos); + if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { + return 0; } + _mm256_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); } - return result(error_code::SUCCESS, latin_output - start); + return len; } -} // namespace utf16_to_latin1 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf - -#endif -/* end file src/scalar/utf16_to_latin1/utf16_to_latin1.h */ -/* begin file src/scalar/utf32_to_latin1/utf32_to_latin1.h */ -#ifndef SIMDUTF_UTF32_TO_LATIN1_H -#define SIMDUTF_UTF32_TO_LATIN1_H - -namespace simdutf { -namespace scalar { -namespace { -namespace utf32_to_latin1 { - -inline size_t convert(const char32_t *buf, size_t len, char *latin1_output) { - const uint32_t *data = reinterpret_cast(buf); - char *start = latin1_output; - uint32_t utf32_char; - size_t pos = 0; - uint32_t too_large = 0; - - while (pos < len) { - utf32_char = (uint32_t)data[pos]; - too_large |= utf32_char; - *latin1_output++ = (char)(utf32_char & 0xFF); - pos++; - } - if ((too_large & 0xFFFFFF00) != 0) { - return 0; +template +std::pair +icelake_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + const char16_t *start = buf; + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + __m512i v_0xFF = _mm512_set1_epi16(0xff); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, + 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + while (end - buf >= 32) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { + uint16_t word; + while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf)) + : uint16_t(*buf))) <= 0xff) { + *latin1_output++ = uint8_t(word); + buf++; + } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); + } + _mm256_storeu_si256( + (__m256i *)latin1_output, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 32; + buf += 32; } - return latin1_output - start; -} + if (buf < end) { + uint32_t mask(uint32_t(1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi16(mask, buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { -inline result convert_with_errors(const char32_t *buf, size_t len, - char *latin1_output) { - const uint32_t *data = reinterpret_cast(buf); - char *start{latin1_output}; - size_t pos = 0; - while (pos < len) { - if (pos + 2 <= - len) { // if it is safe to read 8 more bytes, check that they are Latin1 - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0xFFFFFF00FFFFFF00) == 0) { - *latin1_output++ = char(buf[pos]); - *latin1_output++ = char(buf[pos + 1]); - pos += 2; - continue; + uint16_t word; + while ((word = (big_endian ? 
scalar::utf16::swap_bytes(uint16_t(*buf)) + : uint16_t(*buf))) <= 0xff) { + *latin1_output++ = uint8_t(word); + buf++; } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); } - uint32_t utf32_char = data[pos]; - if ((utf32_char & 0xFFFFFF00) == - 0) { // Check if the character can be represented in Latin-1 - *latin1_output++ = (char)(utf32_char & 0xFF); - pos++; - } else { - return result(error_code::TOO_LARGE, pos); - }; + _mm256_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); } - return result(error_code::SUCCESS, latin1_output - start); + return std::make_pair(result(error_code::SUCCESS, len), latin1_output); } +/* end file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */ +/* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ +// file included directly -} // namespace utf32_to_latin1 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf - -#endif -/* end file src/scalar/utf32_to_latin1/utf32_to_latin1.h */ - -/* begin file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */ -#ifndef SIMDUTF_VALID_UTF8_TO_LATIN1_H -#define SIMDUTF_VALID_UTF8_TO_LATIN1_H +/** + * This function converts the input (inbuf, inlen), assumed to be valid + * UTF16 (little endian) into UTF-8 (to outbuf). The number of code units + * written is written to 'outlen' and the function reports the number of input + * word consumed. + */ +template +size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen, + unsigned char *outbuf, size_t *outlen) { + __m512i in; + __mmask32 inmask = _cvtu32_mask32(0x7fffffff); + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + const char16_t *const inbuf_orig = inbuf; + const unsigned char *const outbuf_orig = outbuf; + int adjust = 0; + int carry = 0; -namespace simdutf { -namespace scalar { -namespace { -namespace utf8_to_latin1 { + while (inlen >= 32) { + in = _mm512_loadu_si512(inbuf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + inlen -= 31; + lastiteration: + inbuf += 31; -inline size_t convert_valid(const char *buf, size_t len, char *latin_output) { - const uint8_t *data = reinterpret_cast(buf); + failiteration: + const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask( + inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT); - size_t pos = 0; - char *start{latin_output}; + if (_ktestz_mask32_u8(inmask, is234byte)) { + // fast path for ASCII only + _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in); + outbuf += 31; + carry = 0; - while (pos < len) { - // try to convert the next block of 16 ASCII bytes - if (pos + 16 <= - len) { // if it is safe to read 16 more bytes, check that they are ascii - uint64_t v1; - ::memcpy(&v1, data + pos, sizeof(uint64_t)); - uint64_t v2; - ::memcpy(&v2, data + pos + sizeof(uint64_t), sizeof(uint64_t)); - uint64_t v{v1 | - v2}; // We are only interested in these bits: 1000 1000 1000 - // 1000, so it makes sense to concatenate everything - if ((v & 0x8080808080808080) == - 0) { // if NONE of these are set, e.g. 
all of them are zero, then - // everything is ASCII - size_t final_pos = pos + 16; - while (pos < final_pos) { - *latin_output++ = char(buf[pos]); - pos++; - } + if (inlen < 32) { + goto tail; + } else { continue; } } - // suppose it is not an all ASCII byte sequence - uint8_t leading_byte = data[pos]; // leading byte - if (leading_byte < 0b10000000) { - // converting one ASCII byte !!! - *latin_output++ = char(leading_byte); - pos++; - } else if ((leading_byte & 0b11100000) == - 0b11000000) { // the first three bits indicate: - // We have a two-byte UTF-8 - if (pos + 1 >= len) { - break; - } // minimal bound checking - if ((data[pos + 1] & 0b11000000) != 0b10000000) { - return 0; - } // checks if the next byte is a valid continuation byte in UTF-8. A - // valid continuation byte starts with 10. - // range check - - uint32_t code_point = - (leading_byte & 0b00011111) << 6 | - (data[pos + 1] & - 0b00111111); // assembles the Unicode code point from the two bytes. - // It does this by discarding the leading 110 and 10 - // bits from the two bytes, shifting the remaining bits - // of the first byte, and then combining the results - // with a bitwise OR operation. - *latin_output++ = char(code_point); - pos += 2; - } else { - // we may have a continuation but we do not do error checking - return 0; - } - } - return latin_output - start; -} - -} // namespace utf8_to_latin1 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + const __mmask32 is12byte = + _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT); -#endif -/* end file src/scalar/utf8_to_latin1/valid_utf8_to_latin1.h */ -/* begin file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */ -#ifndef SIMDUTF_VALID_UTF16_TO_LATIN1_H -#define SIMDUTF_VALID_UTF16_TO_LATIN1_H + if (_ktestc_mask32_u8(is12byte, inmask)) { + // fast path for 1 and 2 byte only -namespace simdutf { -namespace scalar { -namespace { -namespace utf16_to_latin1 { + const __m512i twobytes = _mm512_ternarylogic_epi32( + _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6), + _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C + in = _mm512_mask_add_epi16(in, is234byte, twobytes, + _mm512_set1_epi16(int16_t(0x80c0))); + const __m512i cmpmask = + _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)), + _mm512_set1_epi16(0x0800)); + const __mmask64 smoosh = + _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT); + const __m512i out = _mm512_maskz_compress_epi8(smoosh, in); + _mm512_mask_storeu_epi8(outbuf, + _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), + _cvtmask64_u64(smoosh))), + out); + outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte)); + carry = 0; -template -inline size_t convert_valid(const char16_t *buf, size_t len, - char *latin_output) { - const uint16_t *data = reinterpret_cast(buf); - size_t pos = 0; - char *start{latin_output}; - uint16_t word = 0; + if (inlen < 32) { + goto tail; + } else { + continue; + } + } + __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)); + __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)); - while (pos < len) { - word = !match_system(big_endian) ? 
utf16::swap_bytes(data[pos]) : data[pos]; - *latin_output++ = char(word); - pos++; - } + __m512i taglo = _mm512_set1_epi32(0x8080e000); + __m512i taghi = taglo; - return latin_output - start; -} + const __m512i fc00masked = + _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00))); + const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask( + inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ); + const __mmask32 losurr = _mm512_cmp_epu16_mask( + fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ); -} // namespace utf16_to_latin1 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + int carryout = 0; + if (!_kortestz_mask32_u8(hisurr, losurr)) { + // handle surrogates -#endif -/* end file src/scalar/utf16_to_latin1/valid_utf16_to_latin1.h */ -/* begin file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */ -#ifndef SIMDUTF_VALID_UTF32_TO_LATIN1_H -#define SIMDUTF_VALID_UTF32_TO_LATIN1_H + __m512i los = _mm512_alignr_epi32(hi, lo, 1); + __m512i his = _mm512_alignr_epi32(lo, hi, 1); -namespace simdutf { -namespace scalar { -namespace { -namespace utf32_to_latin1 { + const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16); + taglo = _mm512_mask_mov_epi32(taglo, __mmask16(hisurr), + _mm512_set1_epi32(0x808080f0)); + taghi = _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), + _mm512_set1_epi32(0x808080f0)); -inline size_t convert_valid(const char32_t *buf, size_t len, - char *latin1_output) { - const uint32_t *data = reinterpret_cast(buf); - char *start = latin1_output; - uint32_t utf32_char; - size_t pos = 0; + lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10); + hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10); + los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400)); + his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400)); + lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los); + hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his); - while (pos < len) { - utf32_char = (uint32_t)data[pos]; + carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30)); - if (pos + 2 <= - len) { // if it is safe to read 8 more bytes, check that they are Latin1 - uint64_t v; - ::memcpy(&v, data + pos, sizeof(uint64_t)); - if ((v & 0xFFFFFF00FFFFFF00) == 0) { - *latin1_output++ = char(buf[pos]); - *latin1_output++ = char(buf[pos + 1]); - pos += 2; - continue; - } else { - // output can not be represented in latin1 - return 0; + const uint32_t h = _cvtmask32_u32(hisurr); + const uint32_t l = _cvtmask32_u32(losurr); + // check for mismatched surrogates + if ((h + h + carry) ^ l) { + const uint32_t lonohi = l & ~(h + h + carry); + const uint32_t hinolo = h & ~(l >> 1); + inlen = _tzcnt_u32(hinolo | lonohi); + inmask = __mmask32(0x7fffffff & ((1U << inlen) - 1)); + in = _mm512_maskz_mov_epi16(inmask, in); + adjust = (int)inlen - 31; + inlen = 0; + goto failiteration; } } - if ((utf32_char & 0xFFFFFF00) == 0) { - *latin1_output++ = char(utf32_char); - } else { - // output can not be represented in latin1 - return 0; - } - pos++; - } - return latin1_output - start; -} -} // namespace utf32_to_latin1 -} // unnamed namespace -} // namespace scalar -} // namespace simdutf + hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff), hi); + carry = carryout; -#endif -/* end file src/scalar/utf32_to_latin1/valid_utf32_to_latin1.h */ + __m512i mslo = + _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo); -SIMDUTF_PUSH_DISABLE_WARNINGS -SIMDUTF_DISABLE_UNDESIRED_WARNINGS + __m512i mshi = + 
_mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi); -#if SIMDUTF_IMPLEMENTATION_ARM64 -/* begin file src/arm64/implementation.cpp */ -/* begin file src/simdutf/arm64/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "arm64" -// #define SIMDUTF_IMPLEMENTATION arm64 -/* end file src/simdutf/arm64/begin.h */ -namespace simdutf { -namespace arm64 { -namespace { -#ifndef SIMDUTF_ARM64_H - #error "arm64.h must be included" -#endif -using namespace simd; + const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask)); + const __mmask64 outmhi = _kshiftri_mask64(outmask, 16); -simdutf_really_inline bool is_ascii(const simd8x64 &input) { - simd8 bits = input.reduce_or(); - return bits.max_val() < 0b10000000u; -} + const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte)); + const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16); + const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16); -simdutf_unused simdutf_really_inline simd8 -must_be_continuation(const simd8 prev1, const simd8 prev2, - const simd8 prev3) { - simd8 is_second_byte = prev1 >= uint8_t(0b11000000u); - simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); - simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); - // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller - // is using ^ as well. This will work fine because we only have to report - // errors for cases with 0-1 lead bytes. Multiple lead bytes implies 2 - // overlapping multibyte characters, and if that happens, there is guaranteed - // to be at least *one* lead byte that is part of only 1 other multibyte - // character. The error will be detected there. - return is_second_byte ^ is_third_byte ^ is_fourth_byte; -} + taglo = _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), + _mm512_set1_epi32(0x80c00000)); + taghi = _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), + _mm512_set1_epi32(0x80c00000)); + __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), + _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), + _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); -simdutf_really_inline simd8 -must_be_2_3_continuation(const simd8 prev2, - const simd8 prev3) { - simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); - simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); - return is_third_byte ^ is_fourth_byte; -} + magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), + _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); + magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), + _mm512_set1_epi32(0xffffffff), + _mm512_set1_epi32(0x00010101)); -// common functions for utf8 conversions -simdutf_really_inline uint16x4_t convert_utf8_3_byte_to_utf16(uint8x16_t in) { - // Low half contains 10cccccc|1110aaaa - // High half contains 10bbbbbb|10bbbbbb -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t sh = simdutf_make_uint8x16_t(0, 2, 3, 5, 6, 8, 9, 11, 1, 1, - 4, 4, 7, 7, 10, 10); -#else - const uint8x16_t sh = {0, 2, 3, 5, 6, 8, 9, 11, 1, 1, 4, 4, 7, 7, 10, 10}; -#endif - uint8x16_t perm = vqtbl1q_u8(in, sh); - // Split into half vectors. - // 10cccccc|1110aaaa - uint8x8_t perm_low = vget_low_u8(perm); // no-op - // 10bbbbbb|10bbbbbb - uint8x8_t perm_high = vget_high_u8(perm); - // xxxxxxxx 10bbbbbb - uint16x4_t mid = vreinterpret_u16_u8(perm_high); // no-op - // xxxxxxxx 1110aaaa - uint16x4_t high = vreinterpret_u16_u8(perm_low); // no-op - // Assemble with shift left insert. 
- // xxxxxxaa aabbbbbb - uint16x4_t mid_high = vsli_n_u16(mid, high, 6); - // (perm_low << 8) | (perm_low >> 8) - // xxxxxxxx 10cccccc - uint16x4_t low = vreinterpret_u16_u8(vrev16_u8(perm_low)); - // Shift left insert into the low bits - // aaaabbbb bbcccccc - uint16x4_t composed = vsli_n_u16(low, mid_high, 6); - return composed; -} + mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo, + 0xea); // A&B|C + mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi, + 0xea); + mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24); -simdutf_really_inline uint16x8_t convert_utf8_2_byte_to_utf16(uint8x16_t in) { - // Converts 6 2 byte UTF-8 characters to 6 UTF-16 characters. - // Technically this calculates 8, but 6 does better and happens more often - // (The languages which use these codepoints use ASCII spaces so 8 would need - // to be in the middle of a very long word). + mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24); - // 10bbbbbb 110aaaaa - uint16x8_t upper = vreinterpretq_u16_u8(in); - // (in << 8) | (in >> 8) - // 110aaaaa 10bbbbbb - uint16x8_t lower = vreinterpretq_u16_u8(vrev16q_u8(in)); - // 00000000 000aaaaa - uint16x8_t upper_masked = vandq_u16(upper, vmovq_n_u16(0x1F)); - // Assemble with shift left insert. - // 00000aaa aabbbbbb - uint16x8_t composed = vsliq_n_u16(lower, upper_masked, 6); - return composed; -} + const __mmask64 wantlo = + _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT); + const __mmask64 wanthi = + _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT); + const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo); + const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi); + const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo); + const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi); -simdutf_really_inline uint16x8_t -convert_utf8_1_to_2_byte_to_utf16(uint8x16_t in, size_t shufutf8_idx) { - // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters. - // This is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six - // code code units spanning between 1 and 2 bytes each is 12 bytes. - uint8x16_t sh = vld1q_u8(reinterpret_cast( - simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx])); - // Shuffle - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 110aaaaa 10bbbbbb - uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh)); - // Mask - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 00000000 00bbbbbb - uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits - // 1 byte: 00000000 00000000 - // 2 byte: 000aaaaa 00000000 - uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits - // Combine with a shift right accumulate - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 00000aaa aabbbbbb - uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2); - return composed; + uint64_t advlo = _mm_popcnt_u64(wantlo_uint64); + uint64_t advhi = _mm_popcnt_u64(wanthi_uint64); + + _mm512_mask_storeu_epi8( + outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo); + _mm512_mask_storeu_epi8( + outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), + outhi); + outbuf += advlo + advhi; + } + outbuf += -adjust; + +tail: + if (inlen != 0) { + // We must have inlen < 31. 
+ inmask = _cvtu32_mask32((1U << inlen) - 1); + in = _mm512_maskz_loadu_epi16(inmask, inbuf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); + } + adjust = (int)inlen - 31; + inlen = 0; + goto lastiteration; + } + *outlen = (outbuf - outbuf_orig) + adjust; + return ((inbuf - inbuf_orig) + adjust); } +/* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ +/* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */ +// file included directly -/* begin file src/arm64/arm_validate_utf16.cpp */ +/* + Returns a pair: the first unprocessed byte from buf and utf32_output + A scalar routing should carry on the conversion of the tail. +*/ template -const char16_t *arm_validate_utf16(const char16_t *input, size_t size) { - const char16_t *end = input + size; - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - while (end - input >= 16) { - // 0. Load data: since the validation takes into account only higher - // byte of each word, we compress the two vectors into one which - // consists only the higher bytes. - auto in0 = simd16(input); - auto in1 = - simd16(input + simd16::SIZE / sizeof(char16_t)); - if (!match_system(big_endian)) { - in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0))); - in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1))); +std::tuple +convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_output) { + const char16_t *end = buf + len; + const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00); + const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); + const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00); + __mmask32 carry{0}; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + while (std::distance(buf, end) >= 32) { + // Always safe because buf + 32 <= end so that end - buf >= 32 bytes: + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (big_endian) { + in = _mm512_shuffle_epi8(in, byteflip); } - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - const simd8 in = simd16::pack(t0, t1); - // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). - const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); - if (surrogates_wordmask == 0) { - input += 16; - } else { - // 2. We have some surrogates that have to be distinguished: - // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) - // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) - // - // Fact: high surrogate has 11th bit set (3rd bit in the higher word) - // V - non-surrogate code units - // V = not surrogates_wordmask - const uint64_t V = ~surrogates_wordmask; + // H - bitmask for high surrogates + const __mmask32 H = + _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800); + // H - bitmask for low surrogates + const __mmask32 L = + _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00); - // H - word-mask for high surrogates: the six highest bits are 0b1101'11 - const auto vH = ((in & v_fc) == v_dc); - const uint64_t H = vH.to_bitmask64(); + if ((H | L)) { + // surrogate pair(s) in a register + const __mmask32 V = + (L ^ + (carry | (H << 1))); // A high surrogate must be followed by low one + // and a low one must be preceded by a high one. 
+ // If valid, V should be equal to 0 - // L - word mask for low surrogates - // L = not H and surrogates_wordmask - const uint64_t L = ~H & surrogates_wordmask; + if (V == 0) { + // valid case + /* + Input surrogate pair: + |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb| + low surrogate high surrogate + */ + /* 1. Expand all code units to 32-bit code units + in + |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb| + */ + const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)); + const __m512i second = + _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)); - const uint64_t a = - L & (H >> 4); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint64_t b = - a << 4; // Just mark that the opposite fact is hold, - // thanks to that we have only two masks for valid case. - const uint64_t c = V | a | b; // Combine all the masks into the final one. - if (c == ~0ull) { - // The whole input register contains valid UTF-16, i.e., - // either single code units or proper surrogate pairs. - input += 16; - } else if (c == 0xfffffffffffffffull) { - // The 15 lower code units of the input register contains valid UTF-16. - // The 15th word may be either a low or high surrogate. It the next - // iteration we 1) check if the low surrogate is followed by a high - // one, 2) reject sole high surrogate. - input += 15; + /* 2. Shift by one 16-bit word to align low surrogates with high + surrogates in + |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb| + shifted + |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa| + */ + const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1); + const __m512i shifted_second = + _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1); + + /* 3. Align all high surrogates in first and second by shifting to the + left by 10 bits + |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000| + */ + const __m512i aligned_first = + _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10); + const __m512i aligned_second = + _mm512_mask_slli_epi32(second, (__mmask16)(H >> 16), second, 10); + + /* 4. Remove surrogate prefixes and add offset 0x10000 by adding in, + shifted and constant in + |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000| + shifted + |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa| + constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000| + */ + const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400); + const __m512i added_first = _mm512_mask_add_epi32( + aligned_first, (__mmask16)H, aligned_first, shifted_first); + const __m512i utf32_first = _mm512_mask_add_epi32( + added_first, (__mmask16)H, added_first, constant); + + const __m512i added_second = + _mm512_mask_add_epi32(aligned_second, (__mmask16)(H >> 16), + aligned_second, shifted_second); + const __m512i utf32_second = _mm512_mask_add_epi32( + added_second, (__mmask16)(H >> 16), added_second, constant); + + // 5. Store all valid UTF-32 code units (low surrogate positions and + // 32nd word are invalid) + const __mmask32 valid = ~L & 0x7fffffff; + // We deliberately do a _mm512_maskz_compress_epi32 followed by + // storeu_epi32 to ease performance portability to Zen 4. 
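        // Aside (editorial illustration, not from the simdutf sources): the
        // constant 0xfca02400 used in step 4 above is the standard surrogate
        // decoding folded into a single 32-bit addition. For a valid pair
        // (hi, lo):
        //   code_point = 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00)
        //              = (hi << 10) + lo + 0xfca02400          (mod 2^32)
        // because 0xfca02400 == 0x10000 - (0xD800 << 10) - 0xDC00 (mod 2^32).
        // A scalar equivalent (hypothetical helper, pair assumed valid):
        //   uint32_t decode_pair(uint16_t hi, uint16_t lo) {
        //     return (uint32_t(hi) << 10) + lo + 0xfca02400u; // wraps by design
        //   }
        // e.g. decode_pair(0xD83D, 0xDE00) == 0x1F600 (U+1F600).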
+ const __m512i compressed_first = + _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first); + const size_t howmany1 = count_ones((uint16_t)(valid)); + _mm512_storeu_si512((__m512i *)utf32_output, compressed_first); + utf32_output += howmany1; + const __m512i compressed_second = + _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second); + const size_t howmany2 = count_ones((uint16_t)(valid >> 16)); + // The following could be unsafe in some cases? + //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second); + _mm512_mask_storeu_epi32((__m512i *)utf32_output, + __mmask16((1 << howmany2) - 1), + compressed_second); + utf32_output += howmany2; + // Only process 31 code units, but keep track if the 31st word is a high + // surrogate as a carry + buf += 31; + carry = (H >> 30) & 0x1; } else { - return nullptr; + // invalid case + return std::make_tuple(buf + carry, utf32_output, false); + } + } else { + // no surrogates + // extend all thirty-two 16-bit code units to thirty-two 32-bit code units + _mm512_storeu_si512((__m512i *)(utf32_output), + _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in))); + _mm512_storeu_si512( + (__m512i *)(utf32_output) + 1, + _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1))); + utf32_output += 32; + buf += 32; + carry = 0; + } + } // while + return std::make_tuple(buf + carry, utf32_output, true); +} +/* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */ +/* begin file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */ +// file included directly +size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + __m512i v_0xFF = _mm512_set1_epi32(0xff); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, + 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0); + while (end - buf >= 16) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + return 0; + } + _mm_storeu_si128( + (__m128i *)latin1_output, + _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 16; + buf += 16; + } + if (buf < end) { + uint16_t mask = uint16_t((1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi32(mask, buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + return 0; + } + _mm_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + } + return len; +} + +std::pair +icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + const char32_t *start = buf; + __m512i v_0xFF = _mm512_set1_epi32(0xff); + __m512i shufmask = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, + 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0); + while (end - buf >= 16) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + while (uint32_t(*buf) <= 0xff) { + *latin1_output++ = uint8_t(*buf++); } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); } + _mm_storeu_si128( + (__m128i *)latin1_output, + _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + latin1_output += 16; + buf += 16; } - return input; + if (buf < end) { + uint16_t mask = 
uint16_t((1 << (end - buf)) - 1); + __m512i in = _mm512_maskz_loadu_epi32(mask, buf); + if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { + while (uint32_t(*buf) <= 0xff) { + *latin1_output++ = uint8_t(*buf++); + } + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + latin1_output); + } + _mm_mask_storeu_epi8( + latin1_output, mask, + _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + } + return std::make_pair(result(error_code::SUCCESS, len), latin1_output); } +/* end file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */ +/* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ +// file included directly -template -const result arm_validate_utf16_with_errors(const char16_t *input, - size_t size) { - const char16_t *start = input; - const char16_t *end = input + size; +// Todo: currently, this is just the haswell code, optimize for icelake kernel. +std::pair +avx512_convert_utf32_to_utf8(const char32_t *buf, size_t len, + char *utf8_output) { + const char32_t *end = buf + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + __m256i running_max = _mm256_setzero_si256(); + __m256i forbidden_bytemask = _mm256_setzero_si256(); - const auto v_d8 = simd8::splat(0xd8); - const auto v_f8 = simd8::splat(0xf8); - const auto v_fc = simd8::splat(0xfc); - const auto v_dc = simd8::splat(0xdc); - while (input + 16 < end) { - // 0. Load data: since the validation takes into account only higher - // byte of each word, we compress the two vectors into one which - // consists only the higher bytes. - auto in0 = simd16(input); - auto in1 = - simd16(input + simd16::SIZE / sizeof(char16_t)); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 - if (!match_system(big_endian)) { - in0 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in0))); - in1 = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in1))); + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); + running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); + + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), + _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + + // Try to apply UTF-16 => UTF-8 routine on 256 bits + // (haswell/avx2_convert_utf16_to_utf8.cpp) + + if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! } - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - const simd8 in = simd16::pack(t0, t1); - // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). 
- const uint64_t surrogates_wordmask = ((in & v_f8) == v_d8).to_bitmask64(); - if (surrogates_wordmask == 0) { - input += 16; - } else { - // 2. We have some surrogates that have to be distinguished: - // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) - // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) - // - // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); - // V - non-surrogate code units - // V = not surrogates_wordmask - const uint64_t V = ~surrogates_wordmask; + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); - // H - word-mask for high surrogates: the six highest bits are 0b1101'11 - const auto vH = ((in & v_fc) == v_dc); - const uint64_t H = vH.to_bitmask64(); + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); - // L - word mask for low surrogates - // L = not H and surrogates_wordmask - const uint64_t L = ~H & surrogates_wordmask; + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); - const uint64_t a = - L & (H >> 4); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint64_t b = - a << 4; // Just mark that the opposite fact is hold, - // thanks to that we have only two masks for valid case. - const uint64_t c = V | a | b; // Combine all the masks into the final one. - if (c == ~0ull) { - // The whole input register contains valid UTF-16, i.e., - // either single code units or proper surrogate pairs. - input += 16; - } else if (c == 0xfffffffffffffffull) { - // The 15 lower code units of the input register contains valid UTF-16. - // The 15th word may be either a low or high surrogate. It the next - // iteration we 1) check if the low surrogate is followed by a high - // one, 2) reject sole high surrogate. - input += 15; - } else { - return result(error_code::SURROGATE, input - start); - } - } - } - return result(error_code::SUCCESS, input - start); -} -/* end file src/arm64/arm_validate_utf16.cpp */ -/* begin file src/arm64/arm_validate_utf32le.cpp */ + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. 
pack the bytes -const char32_t *arm_validate_utf32le(const char32_t *input, size_t size) { - const char32_t *end = input + size; + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; - const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); - const uint32x4_t offset = vmovq_n_u32(0xffff2000); - const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); - uint32x4_t currentmax = vmovq_n_u32(0x0); - uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); - while (end - input >= 4) { - const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); - currentmax = vmaxq_u32(in, currentmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); - input += 4; - } + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; - uint32x4_t is_zero = - veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); - if (vmaxvq_u32(is_zero) != 0) { - return nullptr; - } + // 6. adjust pointers + buf += 16; + continue; + } + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = + static_cast(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm256_or_si256( + forbidden_bytemask, + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); - is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), - standardoffsetmax); - if (vmaxvq_u32(is_zero) != 0) { - return nullptr; - } + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - return input; -} + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes -const result arm_validate_utf32le_with_errors(const char32_t *input, - size_t size) { - const char32_t *start = input; - const char32_t *end = input + size; + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - const uint32x4_t standardmax = vmovq_n_u32(0x10ffff); - const uint32x4_t offset = vmovq_n_u32(0xffff2000); - const uint32x4_t standardoffsetmax = vmovq_n_u32(0xfffff7ff); - uint32x4_t currentmax = vmovq_n_u32(0x0); - uint32x4_t currentoffsetmax = vmovq_n_u32(0x0); + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. 
- while (end - input >= 4) { - const uint32x4_t in = vld1q_u32(reinterpret_cast(input)); - currentmax = vmaxq_u32(in, currentmax); - currentoffsetmax = vmaxq_u32(vaddq_u32(in, offset), currentoffsetmax); + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. - uint32x4_t is_zero = - veorq_u32(vmaxq_u32(currentmax, standardmax), standardmax); - if (vmaxvq_u32(is_zero) != 0) { - return result(error_code::TOO_LARGE, input - start); - } + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); - is_zero = veorq_u32(vmaxq_u32(currentoffsetmax, standardoffsetmax), - standardoffsetmax); - if (vmaxvq_u32(is_zero) != 0) { - return result(error_code::SURROGATE, input - start); - } + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec - input += 4; - } + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); - return result(error_code::SUCCESS, input - start); -} -/* end file src/arm64/arm_validate_utf32le.cpp */ + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. 
+ const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); -/* begin file src/arm64/arm_convert_latin1_to_utf16.cpp */ -template -std::pair -arm_convert_latin1_to_utf16(const char *buf, size_t len, - char16_t *utf16_output) { - const char *end = buf + len; + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); - while (end - buf >= 16) { - uint8x16_t in8 = vld1q_u8(reinterpret_cast(buf)); - uint16x8_t inlow = vmovl_u8(vget_low_u8(in8)); - if (!match_system(big_endian)) { - inlow = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inlow))); - } - vst1q_u16(reinterpret_cast(utf16_output), inlow); - uint16x8_t inhigh = vmovl_u8(vget_high_u8(in8)); - if (!match_system(big_endian)) { - inhigh = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(inhigh))); + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will + // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem + // wasteful to use scalar code, but being efficient with SIMD may require + // large, non-trivial tables? 
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { // 2-byte + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; } - vst1q_u16(reinterpret_cast(utf16_output + 8), inhigh); - utf16_output += 16; - buf += 16; + } // while + + // check for invalid input + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + if (static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32( + _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { + return std::make_pair(nullptr, utf8_output); } - return std::make_pair(buf, utf16_output); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, utf8_output); + } + + return std::make_pair(buf, utf8_output); } -/* end file src/arm64/arm_convert_latin1_to_utf16.cpp */ -/* begin file src/arm64/arm_convert_latin1_to_utf32.cpp */ -std::pair -arm_convert_latin1_to_utf32(const char *buf, size_t len, - char32_t *utf32_output) { - const char *end = buf + len; - while (end - buf >= 16) { - uint8x16_t in8 = vld1q_u8(reinterpret_cast(buf)); - uint16x8_t in8low = vmovl_u8(vget_low_u8(in8)); - uint32x4_t in16lowlow = vmovl_u16(vget_low_u16(in8low)); - uint32x4_t in16lowhigh = vmovl_u16(vget_high_u16(in8low)); - uint16x8_t in8high = vmovl_u8(vget_high_u8(in8)); - uint32x4_t in8highlow = vmovl_u16(vget_low_u16(in8high)); - uint32x4_t in8highhigh = vmovl_u16(vget_high_u16(in8high)); - vst1q_u32(reinterpret_cast(utf32_output), in16lowlow); - vst1q_u32(reinterpret_cast(utf32_output + 4), in16lowhigh); - vst1q_u32(reinterpret_cast(utf32_output + 8), in8highlow); - vst1q_u32(reinterpret_cast(utf32_output + 12), in8highhigh); +// Todo: currently, this is just the haswell code, optimize for icelake kernel. 
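// Editorial sketch (not part of simdutf): the scalar meaning of the bitmask
// tests used by these AVX2-style UTF-32 => UTF-8 routines. After the unsigned
// pack to 16 bits, a code unit needs one UTF-8 byte when (w & 0xff80) == 0,
// at most two bytes when (w & 0xf800) == 0, and three bytes otherwise --
// unless (w & 0xf800) == 0xd800, which marks a surrogate and is invalid in
// UTF-32 input. Code points above 0xFFFF need four bytes and are handled by
// the scalar fallback; anything above 0x10FFFF is rejected (TOO_LARGE).
// The helper name below is illustrative only.
static inline int utf8_length_for_code_point(uint32_t w) {
  if ((w & 0xFFFFFF80) == 0) { return 1; }                   // ASCII
  if ((w & 0xFFFFF800) == 0) { return 2; }                   // two bytes
  if ((w & 0xFFFF0000) == 0) {
    return ((w & 0xF800) == 0xD800) ? 0 /* surrogate */ : 3; // three bytes
  }
  return (w <= 0x10FFFF) ? 4 : 0 /* too large */;            // four bytes
}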
+std::pair +avx512_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_output) { + const char32_t *end = buf + len; + const char32_t *start = buf; - utf32_output += 16; - buf += 16; - } + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); - return std::make_pair(buf, utf32_output); -} -/* end file src/arm64/arm_convert_latin1_to_utf32.cpp */ -/* begin file src/arm64/arm_convert_latin1_to_utf8.cpp */ -/* - Returns a pair: the first unprocessed byte from buf and utf8_output - A scalar routing should carry on the conversion of the tail. -*/ -std::pair -arm_convert_latin1_to_utf8(const char *latin1_input, size_t len, - char *utf8_out) { - uint8_t *utf8_output = reinterpret_cast(utf8_out); - const char *end = latin1_input + len; - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); - // We always write 16 bytes, of which more than the first 8 bytes - // are valid. A safety margin of 8 is more than sufficient. - while (end - latin1_input >= 16 + 8) { - uint8x16_t in8 = vld1q_u8(reinterpret_cast(latin1_input)); - if (vmaxvq_u8(in8) <= 0x7F) { // ASCII fast path!!!! - vst1q_u8(utf8_output, in8); - utf8_output += 16; - latin1_input += 16; - continue; + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); + // Check for too large input + const __m256i max_input = + _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); + if (static_cast(_mm256_movemask_epi8( + _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + utf8_output); } - // We just fallback on UTF-16 code. This could be optimized/simplified - // further. - uint16x8_t in16 = vmovl_u8(vget_low_u8(in8)); - // 1. prepare 2-byte values - // input 8-bit word : [aabb|bbbb] x 8 - // expected output : [1100|00aa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), + _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); - // t0 = [0000|00aa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(in16, 2); - // t1 = [0000|00aa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(in16, v_003f); - // t3 = [0000|00aa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [1100|00aa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in16, v_007f); - const uint8x16_t utf8_unpacked = - vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in16, t4)); - // 3. 
prepare bitmask for 8-bit lookup -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = simdutf_make_uint16x8_t( - 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); -#else - const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, - 0x0002, 0x0008, 0x0020, 0x0080}; -#endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. pack the bytes - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); + // Try to apply UTF-16 => UTF-8 routine on 256 bits + // (haswell/avx2_convert_utf16_to_utf8.cpp) - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); - // 6. adjust pointers - latin1_input += 8; - utf8_output += row[0]; + if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); - } // while + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); - return std::make_pair(latin1_input, reinterpret_cast(utf8_output)); -} -/* end file src/arm64/arm_convert_latin1_to_utf8.cpp */ + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); -/* begin file src/arm64/arm_convert_utf8_to_latin1.cpp */ -// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 16, usually 12). -size_t convert_masked_utf8_to_latin1(const char *input, - uint64_t utf8_end_of_code_point_mask, - char *&latin1_output) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - uint8x16_t in = vld1q_u8(reinterpret_cast(input)); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; - // - // Optimization note: our main path below is load-latency dependent. Thus it - // is maybe beneficial to have fast paths that depend on branch prediction but - // have less latency. 
This results in more instructions but, potentially, also - // higher speeds. + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); - // We first try a few fast paths. - // The obvious first test is ASCII, which actually consumes the full 16. - if (utf8_end_of_code_point_mask == 0xfff) { - // We process in chunks of 12 bytes - vst1q_u8(reinterpret_cast(latin1_output), in); - latin1_output += 12; // We wrote 12 18-bit characters. - return 12; // We consumed 12 bytes. - } - /// We do not have a fast path available, or the fast path is unimportant, so - /// we fallback. - const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex - [input_utf8_end_of_code_point_mask][0]; + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes - const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex - [input_utf8_end_of_code_point_mask][1]; - // this indicates an invalid input: - if (idx >= 64) { - return consumed; - } - // Here we should have (idx < 64), if not, there is a bug in the validation or - // elsewhere. SIX (6) input code-code units this is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six - // code code units spanning between 1 and 2 bytes each is 12 bytes. Converts 6 - // 1-2 byte UTF-8 characters to 6 UTF-16 characters. This is a relatively easy - // scenario we process SIX (6) input code-code units. The max length in bytes - // of six code code units spanning between 1 and 2 bytes each is 12 bytes. - uint8x16_t sh = vld1q_u8(reinterpret_cast( - simdutf::tables::utf8_to_utf16::shufutf8[idx])); - // Shuffle - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 110aaaaa 10bbbbbb - uint16x8_t perm = vreinterpretq_u16_u8(vqtbl1q_u8(in, sh)); - // Mask - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 00000000 00bbbbbb - uint16x8_t ascii = vandq_u16(perm, vmovq_n_u16(0x7f)); // 6 or 7 bits - // 1 byte: 00000000 00000000 - // 2 byte: 000aaaaa 00000000 - uint16x8_t highbyte = vandq_u16(perm, vmovq_n_u16(0x1f00)); // 5 bits - // Combine with a shift right accumulate - // 1 byte: 00000000 0bbbbbbb - // 2 byte: 00000aaa aabbbbbb - uint16x8_t composed = vsraq_n_u16(ascii, highbyte, 2); - // writing 8 bytes even though we only care about the first 6 bytes. - uint8x8_t latin1_packed = vmovn_u16(composed); - vst1_u8(reinterpret_cast(latin1_output), latin1_packed); - latin1_output += 6; // We wrote 6 bytes. - return consumed; -} -/* end file src/arm64/arm_convert_utf8_to_latin1.cpp */ -/* begin file src/arm64/arm_convert_utf8_to_utf16.cpp */ -// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 16, usually 12). -template -size_t convert_masked_utf8_to_utf16(const char *input, - uint64_t utf8_end_of_code_point_mask, - char16_t *&utf16_output) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - uint8x16_t in = vld1q_u8(reinterpret_cast(input)); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; - // - // Optimization note: our main path below is load-latency dependent. 
Thus it - // is maybe beneficial to have fast paths that depend on branch prediction but - // have less latency. This results in more instructions but, potentially, also - // higher speeds. + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; - // We first try a few fast paths. - // The obvious first test is ASCII, which actually consumes the full 16. - if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xffff) { - // We process in chunks of 16 bytes - // The routine in simd.h is reused. - simd8 temp{vreinterpretq_s8_u8(in)}; - temp.store_ascii_as_utf16(utf16_output); - utf16_output += 16; // We wrote 16 16-bit characters. - return 16; // We consumed 16 bytes. - } + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); - // 3 byte sequences are the next most common, as seen in CJK, which has long - // sequences of these. - if (input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte - // UTF-16 code units. - uint16x4_t composed = convert_utf8_3_byte_to_utf16(in); - // Byte swap if necessary - if (!match_system(big_endian)) { - composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed))); - } - vst1_u16(reinterpret_cast(utf16_output), composed); - utf16_output += 4; // We wrote 4 16-bit characters. - return 12; // We consumed 12 bytes. - } + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; - // 2 byte sequences occur in short bursts in languages like Greek and Russian. - if ((utf8_end_of_code_point_mask & 0xFFF) == 0xaaa) { - // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte - // UTF-16 code units. - uint16x8_t composed = convert_utf8_2_byte_to_utf16(in); - // Byte swap if necessary - if (!match_system(big_endian)) { - composed = - vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); + // 6. adjust pointers + buf += 16; + continue; } - vst1q_u16(reinterpret_cast(utf16_output), composed); + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = + static_cast(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - utf16_output += 6; // We wrote 6 16-bit characters. - return 12; // We consumed 12 bytes. - } + // Check for illegal surrogate code units + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + const __m256i forbidden_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != + 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + utf8_output); + } - /// We do not have a fast path available, or the fast path is unimportant, so - /// we fallback. 
- const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex - [input_utf8_end_of_code_point_mask][0]; + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex - [input_utf8_end_of_code_point_mask][1]; + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes - if (idx < 64) { - // SIX (6) input code-code units - // Convert to UTF-16 - uint16x8_t composed = convert_utf8_1_to_2_byte_to_utf16(in, idx); - // Byte swap if necessary - if (!match_system(big_endian)) { - composed = - vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); - } - // Store - vst1q_u16(reinterpret_cast(utf16_output), composed); - utf16_output += 6; // We wrote 6 16-bit characters. - return consumed; - } else if (idx < 145) { - // FOUR (4) input code-code units - // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. - uint8x16_t sh = vld1q_u8(reinterpret_cast( - simdutf::tables::utf8_to_utf16::shufutf8[idx])); - // XXX: depending on the system scalar instructions might be faster. - // 1 byte: 00000000 00000000 0ccccccc - // 2 byte: 00000000 110bbbbb 10cccccc - // 3 byte: 1110aaaa 10bbbbbb 10cccccc - uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); - // 1 byte: 00000000 0ccccccc - // 2 byte: xx0bbbbb x0cccccc - // 3 byte: xxbbbbbb x0cccccc - uint16x4_t lowperm = vmovn_u32(perm); - // Partially mask with bic (doesn't require a temporary register unlike and) - // The shift left insert below will clear the top bits. - // 1 byte: 00000000 00000000 - // 2 byte: xx0bbbbb 00000000 - // 3 byte: xxbbbbbb 00000000 - uint16x4_t middlebyte = vbic_u16(lowperm, vmov_n_u16(uint16_t(~0xFF00))); - // ASCII - // 1 byte: 00000000 0ccccccc - // 2+byte: 00000000 00cccccc - uint16x4_t ascii = vand_u16(lowperm, vmov_n_u16(0x7F)); - // Split into narrow vectors. - // 2 byte: 00000000 00000000 - // 3 byte: 00000000 xxxxaaaa - uint16x4_t highperm = vshrn_n_u32(perm, 16); - // Shift right accumulate the middle byte - // 1 byte: 00000000 0ccccccc - // 2 byte: 00xx0bbb bbcccccc - // 3 byte: 00xxbbbb bbcccccc - uint16x4_t middlelow = vsra_n_u16(ascii, middlebyte, 2); - // Shift left and insert the top 4 bits, overwriting the garbage - // 1 byte: 00000000 0ccccccc - // 2 byte: 00000bbb bbcccccc - // 3 byte: aaaabbbb bbcccccc - uint16x4_t composed = vsli_n_u16(middlelow, highperm, 12); - // Byte swap if necessary - if (!match_system(big_endian)) { - composed = vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(composed))); - } - vst1_u16(reinterpret_cast(utf16_output), composed); + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - utf16_output += 4; // We wrote 4 16-bit codepoints - return consumed; - } else if (idx < 209) { - // THREE (3) input code-code units - if (input_utf8_end_of_code_point_mask == 0x888) { - // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte - // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but - // it is easier when we can assume they are all pairs. 
This version does - // not use the LUT, but 4 byte sequences are less common and the overhead - // of the extra memory access is less important than the early branch - // overhead in shorter sequences. + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - // Swap byte pairs - // 10dddddd 10cccccc|10bbbbbb 11110aaa - // 10cccccc 10dddddd|11110aaa 10bbbbbb - uint8x16_t swap = vrev16q_u8(in); - // Shift left 2 bits - // cccccc00 dddddd00 xxxxxxxx bbbbbb00 - uint32x4_t shift = vreinterpretq_u32_u8(vshlq_n_u8(swap, 2)); - // Create a magic number containing the low 2 bits of the trail surrogate - // and all the corrections needed to create the pair. UTF-8 4b prefix = - // -0x0000|0xF000 surrogate offset = -0x0000|0x0040 (0x10000 << 6) - // surrogate high = +0x0000|0xD800 - // surrogate low = +0xDC00|0x0000 - // ------------------------------- - // = +0xDC00|0xE7C0 - uint32x4_t magic = vmovq_n_u32(0xDC00E7C0); - // Generate unadjusted trail surrogate minus lowest 2 bits - // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00 - uint32x4_t trail = - vbslq_u32(vmovq_n_u32(0x0000FF00), vreinterpretq_u32_u8(swap), shift); - // Insert low 2 bits of trail surrogate to magic number for later - // 11011100 00000000 11100111 110000cc - uint16x8_t magic_with_low_2 = - vreinterpretq_u16_u32(vsraq_n_u32(magic, shift, 30)); - // Generate lead surrogate - // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx - uint32x4_t lead = vreinterpretq_u32_u16( - vsliq_n_u16(vreinterpretq_u16_u8(swap), vreinterpretq_u16_u8(in), 6)); - // Mask out lead - // 000000cc ccdddddd|xxxxxxxx xxxxxxxx - lead = vbicq_u32(lead, vmovq_n_u32(uint32_t(~0x03FFFFFF))); - // Blend pairs - // 000000cc ccdddddd|11110aaa bbbbbb00 - uint16x8_t blend = vreinterpretq_u16_u32( - vbslq_u32(vmovq_n_u32(0x0000FFFF), trail, lead)); - // Add magic number to finish the result - // 110111CC CCDDDDDD|110110AA BBBBBBCC - uint16x8_t composed = vaddq_u16(blend, magic_with_low_2); - // Byte swap if necessary - if (!match_system(big_endian)) { - composed = - vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(composed))); - } - uint16_t buffer[8]; - vst1q_u16(reinterpret_cast(buffer), composed); - for (int k = 0; k < 6; k++) { - utf16_output[k] = buffer[k]; - } // the loop might compiler to a couple of instructions. - utf16_output += 6; // We wrote 3 32-bit surrogate pairs. - return 12; // We consumed 12 bytes. - } - // 3 1-4 byte sequences - uint8x16_t sh = vld1q_u8(reinterpret_cast( - simdutf::tables::utf8_to_utf16::shufutf8[idx])); + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. - // 1 byte: 00000000 00000000 00000000 0ddddddd - // 3 byte: 00000000 00000000 110ccccc 10dddddd - // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd - // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd - uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); - // added to fix issue https://github.com/simdutf/simdutf/issues/514 - // We only want to write 2 * 16-bit code units when that is actually what we - // have. Unfortunately, we cannot trust the input. So it is possible to get - // 0xff as an input byte and it should not result in a surrogate pair. We - // need to check for that. 
- uint32_t permbuffer[4]; - vst1q_u32(permbuffer, perm); - // Mask the low and middle bytes - // 00000000 00000000 00000000 0ddddddd - uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7f)); - // Because the surrogates need more work, the high surrogate is computed - // first. - uint32x4_t middlehigh = vshlq_n_u32(perm, 2); - // 00000000 00000000 00cccccc 00000000 - uint32x4_t middlebyte = vandq_u32(perm, vmovq_n_u32(0x3F00)); - // Start assembling the sequence. Since the 4th byte is in the same position - // as it would be in a surrogate and there is no dependency, shift left - // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte: - // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx - uint32x4_t ab = vbslq_u32(vmovq_n_u32(0xFF000000), perm, middlehigh); - // Top 16 bits contains the high ten bits of the surrogate pair before - // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa - // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction - uint32x4_t abc = - vbslq_u32(vmovq_n_u32(0xFFFC0000), ab, vshlq_n_u32(middlebyte, 4)); - // Combine the low 6 or 7 bits by a shift right accumulate - // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct - // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o - // correction - uint32x4_t composed = vsraq_n_u32(ascii, abc, 6); - // After this is for surrogates - // Blend the low and high surrogates - // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd - uint32x4_t mixed = vbslq_u32(vmovq_n_u32(0xFFFF0000), abc, composed); - // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits - // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte: - // 11110aaa bbbbbbcc|000000cc ccdddddd - uint16x8_t masked_pair = vreinterpretq_u16_u32( - vbicq_u32(mixed, vmovq_n_u32(uint32_t(~0xFFFF03FF)))); - // Correct the remaining UTF-8 prefix, surrogate offset, and add the - // surrogate prefixes in one magic 16-bit addition. similar magic number but - // without the continue byte adjust and halfword swapped UTF-8 4b prefix = - // -0xF000|0x0000 surrogate offset = -0x0040|0x0000 (0x10000 << 6) - // surrogate high = +0xD800|0x0000 - // surrogate low = +0x0000|0xDC00 - // ----------------------------------- - // = +0xE7C0|0xDC00 - uint16x8_t magic = vreinterpretq_u16_u32(vmovq_n_u32(0xE7C0DC00)); - // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete - uint32x4_t surrogates = - vreinterpretq_u32_u16(vaddq_u16(masked_pair, magic)); - // If the high bit is 1 (s32 less than zero), this needs a surrogate pair - uint32x4_t is_pair = vcltzq_s32(vreinterpretq_s32_u32(perm)); + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); - // Select either the 4 byte surrogate pair or the 2 byte solo codepoint - // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd - // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - uint32x4_t selected = vbslq_u32(is_pair, surrogates, composed); - // Byte swap if necessary - if (!match_system(big_endian)) { - selected = - vreinterpretq_u32_u8(vrev16q_u8(vreinterpretq_u8_u32(selected))); - } - // Attempting to shuffle and store would be complex, just scalarize. - uint32_t buffer[4]; - vst1q_u32(buffer, selected); - // Test for the top bit of the surrogate mask. Remove due to issue 514 - // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : - // 0x00800000; - for (size_t i = 0; i < 3; i++) { - // Surrogate - // Used to be if (buffer[i] & SURROGATE_MASK) { - // See discussion above. 
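For one lane, the t2/s4 construction above can be checked with plain integer arithmetic. The following standalone trace is illustrative only: the constants mirror the masks above, and the maddubs step is emulated for a single lane. It follows U+20AC through the steps and ends with the bytes E2 82 AC that the 3-byte compression pattern extracts.

#include <cassert>
#include <cstdint>

int main() {
  const uint16_t in = 0x20AC; // [aaaa|bbbb|bbcc|cccc] = 0010 000010 101100

  // t0: duplicate the low ("even") byte => [bbcc|cccc|bbcc|cccc]
  uint16_t t0 = uint16_t((in & 0xFF) * 0x0101);
  uint16_t t1 = uint16_t(t0 & 0b0011111101111111);
  uint16_t t2 = uint16_t(t1 | 0b1000000000000000); // [10cc|cccc|0bcc|cccc]

  uint16_t s0 = uint16_t(in >> 4);
  uint16_t s1 = uint16_t(s0 & 0b0000111111111100);
  // maddubs with 0x0140: low byte * 0x40 + high byte * 0x01
  uint16_t s2 = uint16_t((s1 & 0xFF) * 0x40 + (s1 >> 8) * 0x01);
  uint16_t s3 = uint16_t(s2 | 0b1100000011100000);
  bool one_or_two_bytes = in < 0x0800;             // bytemask lane is false here
  uint16_t m0 = one_or_two_bytes ? 0 : 0b0100000000000000;
  uint16_t s4 = uint16_t(s3 ^ m0);                 // [10bb|bbbb|1110|aaaa]

  assert(t2 == 0xAC2C && s4 == 0x82E2);
  // Interleaving t2 then s4 gives bytes {0x2C, 0xAC, 0xE2, 0x82}; the
  // 3-byte compression pattern keeps offsets 2, 3, 1 => E2 82 AC.
  return 0;
}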
- // patch for issue https://github.com/simdutf/simdutf/issues/514 - if ((permbuffer[i] & 0xf8000000) == 0xf0000000) { - utf16_output[0] = uint16_t(buffer[i] >> 16); - utf16_output[1] = uint16_t(buffer[i] & 0xFFFF); - utf16_output += 2; - } else { - utf16_output[0] = uint16_t(buffer[i] & 0xFFFF); - utf16_output++; + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will + // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem + // wasteful to use scalar code, but being efficient with SIMD may require + // large, non-trivial tables? + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { // 2-byte + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } } + buf += k; } - return consumed; - } else { - // here we know that there is an error but we do not handle errors - return 12; - } + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); } -/* end file src/arm64/arm_convert_utf8_to_utf16.cpp */ -/* begin file src/arm64/arm_convert_utf8_to_utf32.cpp */ -// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 12). 
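The removed header comment relies on an end-of-code-point mask whose bit i is set when byte i closes a code point, which is why the values 0xFFF, 0xAAA and 0x924 select the ASCII, 2-byte and 3-byte fast paths below. One scalar way to derive such a mask (illustrative; the SIMD kernels obtain it from their classified leading bytes):

#include <cstddef>
#include <cstdint>

// Bit i is set when byte i is the last byte of its code point, i.e. when the
// next byte is not a continuation byte (or there is no next byte).
static uint16_t end_of_code_point_mask12(const uint8_t *bytes, size_t n) {
  uint16_t mask = 0;
  for (size_t i = 0; i < n && i < 12; i++) {
    bool ends_here = (i + 1 >= n) || ((bytes[i + 1] & 0xC0) != 0x80);
    if (ends_here) {
      mask |= uint16_t(1) << i;
    }
  }
  return mask;
}
// "abcdefghijkl" (twelve ASCII bytes)          -> 0xFFF
// six copies of 0xD0 0xB0 (2-byte sequences)   -> 0xAAA
// four copies of 0xE2 0x82 0xAC (3-byte)       -> 0x924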
-size_t convert_masked_utf8_to_utf32(const char *input, - uint64_t utf8_end_of_code_point_mask, - char32_t *&utf32_out) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - uint32_t *&utf32_output = reinterpret_cast(utf32_out); - uint8x16_t in = vld1q_u8(reinterpret_cast(input)); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xFFF; - // - // Optimization note: our main path below is load-latency dependent. Thus it - // is maybe beneficial to have fast paths that depend on branch prediction but - // have less latency. This results in more instructions but, potentially, also - // higher speeds. - // - // We first try a few fast paths. - if (utf8_end_of_code_point_mask == 0xfff) { - // We process in chunks of 12 bytes. - // use fast implementation in src/simdutf/arm64/simd.h - // Ideally the compiler can keep the tables in registers. - simd8 temp{vreinterpretq_s8_u8(in)}; - temp.store_ascii_as_utf32_tbl(utf32_out); - utf32_output += 12; // We wrote 12 32-bit characters. - return 12; // We consumed 12 bytes. - } - if (input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte - // UTF-32 code units. Convert to UTF-16 - uint16x4_t composed_utf16 = convert_utf8_3_byte_to_utf16(in); - // Zero extend and store via ST2 with a zero. - uint16x4x2_t interleaver = {{composed_utf16, vmov_n_u16(0)}}; - vst2_u16(reinterpret_cast(utf32_output), interleaver); - utf32_output += 4; // We wrote 4 32-bit characters. - return 12; // We consumed 12 bytes. - } +/* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ +/* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ +// file included directly - // 2 byte sequences occur in short bursts in languages like Greek and Russian. - if (input_utf8_end_of_code_point_mask == 0xaaa) { - // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte - // UTF-32 code units. Convert to UTF-16 - uint16x8_t composed_utf16 = convert_utf8_2_byte_to_utf16(in); - // Zero extend and store via ST2 with a zero. - uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}}; - vst2q_u16(reinterpret_cast(utf32_output), interleaver); - utf32_output += 6; // We wrote 6 32-bit characters. - return 12; // We consumed 12 bytes. - } - /// Either no fast path or an unimportant fast path. +// Todo: currently, this is just the haswell code, optimize for icelake kernel. +template +std::pair +avx512_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const char32_t *end = buf + len; - const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex - [input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex - [input_utf8_end_of_code_point_mask][1]; + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + __m256i forbidden_bytemask = _mm256_setzero_si256(); - if (idx < 64) { - // SIX (6) input code-code units - // Convert to UTF-16 - uint16x8_t composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx); - // Zero extend and store with ST2 and zero - uint16x8x2_t interleaver = {{composed_utf16, vmovq_n_u16(0)}}; - vst2q_u16(reinterpret_cast(utf32_output), interleaver); - utf32_output += 6; // We wrote 6 32-bit characters. 
- return consumed; - } else if (idx < 145) { - // FOUR (4) input code-code units - // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. - uint8x16_t sh = vld1q_u8(reinterpret_cast( - simdutf::tables::utf8_to_utf16::shufutf8[idx])); - // Shuffle - // 1 byte: 00000000 00000000 0ccccccc - // 2 byte: 00000000 110bbbbb 10cccccc - // 3 byte: 1110aaaa 10bbbbbb 10cccccc - uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); - // Split - // 00000000 00000000 0ccccccc - uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); // 6 or 7 bits - // Note: unmasked - // xxxxxxxx aaaaxxxx xxxxxxxx - uint32x4_t high = vshrq_n_u32(perm, 4); // 4 bits - // Use 16 bit bic instead of and. - // The top bits will be corrected later in the bsl - // 00000000 10bbbbbb 00000000 - uint32x4_t middle = vreinterpretq_u32_u16( - vbicq_u16(vreinterpretq_u16_u32(perm), - vmovq_n_u16(uint16_t(~0xff00)))); // 5 or 6 bits - // Combine low and middle with shift right accumulate - // 00000000 00xxbbbb bbcccccc - uint32x4_t lowmid = vsraq_n_u32(ascii, middle, 2); - // Insert top 4 bits from high byte with bitwise select - // 00000000 aaaabbbb bbcccccc - uint32x4_t composed = vbslq_u32(vmovq_n_u32(0x0000F000), high, lowmid); - vst1q_u32(utf32_output, composed); - utf32_output += 4; // We wrote 4 32-bit characters. - return consumed; - } else if (idx < 209) { - // THREE (3) input code-code units - if (input_utf8_end_of_code_point_mask == 0x888) { - // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte - // UTF-32 code units. This uses the same method as the fixed 3 byte - // version, reversing and shift left insert. However, there is no need for - // a shuffle mask now, just rev16 and rev32. - // - // This version does not use the LUT, but 4 byte sequences are less common - // and the overhead of the extra memory access is less important than the - // early branch overhead in shorter sequences, so it comes last. + while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); - // Swap pairs of bytes - // 10dddddd|10cccccc|10bbbbbb|11110aaa - // 10cccccc 10dddddd|11110aaa 10bbbbbb - uint16x8_t swap1 = vreinterpretq_u16_u8(vrev16q_u8(in)); - // Shift left and insert - // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb - uint16x8_t merge1 = vsliq_n_u16(swap1, vreinterpretq_u16_u8(in), 6); - // Swap 16-bit lanes - // xxxxcccc ccdddddd xxxxxxxa aabbbbbb - // xxxxxxxa aabbbbbb xxxxcccc ccdddddd - uint32x4_t swap2 = vreinterpretq_u32_u16(vrev32q_u16(merge1)); - // Shift insert again - // xxxxxxxx xxxaaabb bbbbcccc ccdddddd - uint32x4_t merge2 = vsliq_n_u32(swap2, vreinterpretq_u32_u16(merge1), 12); - // Clear the garbage - // 00000000 000aaabb bbbbcccc ccdddddd - uint32x4_t composed = vandq_u32(merge2, vmovq_n_u32(0x1FFFFF)); - // Store - vst1q_u32(utf32_output, composed); + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); - utf32_output += 3; // We wrote 3 32-bit characters. - return 12; // We consumed 12 bytes. - } - // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit - // due to surrogates no longer being involved. 
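The ascii/high/middle/lowmid/composed sequence above can be replayed with ordinary integer operations. A standalone check (illustrative only, assuming the documented lane layout 00000000 1110aaaa 10bbbbbb 10cccccc) for a lane holding E2 82 AC, expected to compose to U+20AC:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t perm = 0x00E282AC;

  uint32_t ascii = perm & 0x7F;            // low 6 or 7 bits
  uint32_t high = perm >> 4;               // brings aaaa toward bit 12
  uint32_t middle = perm & 0xFF00FF00;     // keep the 10bbbbbb byte
  uint32_t lowmid = ascii + (middle >> 2); // shift-right accumulate
  uint32_t composed = (high & 0x0000F000)  // bit-select on 0x0000F000
                      | (lowmid & ~uint32_t(0x0000F000));

  assert(composed == 0x20AC);
  return 0;
}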
- uint8x16_t sh = vld1q_u8(reinterpret_cast( - simdutf::tables::utf8_to_utf16::shufutf8[idx])); - // 1 byte: 00000000 00000000 00000000 0ddddddd - // 2 byte: 00000000 00000000 110ccccc 10dddddd - // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd - // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd - uint32x4_t perm = vreinterpretq_u32_u8(vqtbl1q_u8(in, sh)); - // Ascii - uint32x4_t ascii = vandq_u32(perm, vmovq_n_u32(0x7F)); - uint32x4_t middle = vandq_u32(perm, vmovq_n_u32(0x3f00)); - // When converting the way we do, the 3 byte prefix will be interpreted as - // the 18th bit being set, since the code would interpret the lead byte - // (0b1110bbbb) as a continuation byte (0b10bbbbbb). To fix this, we can - // either xor or do an 8 bit add of the 6th bit shifted right by 1. Since - // NEON has shift right accumulate, we use that. - // 4 byte 3 byte - // 10bbbbbb 1110bbbb - // 00000000 01000000 6th bit - // 00000000 00100000 shift right - // 10bbbbbb 0000bbbb add - // 00bbbbbb 0000bbbb mask - uint8x16_t correction = - vreinterpretq_u8_u32(vandq_u32(perm, vmovq_n_u32(0x00400000))); - uint32x4_t corrected = vreinterpretq_u32_u8( - vsraq_n_u8(vreinterpretq_u8_u32(perm), correction, 1)); - // 00000000 00000000 0000cccc ccdddddd - uint32x4_t cd = vsraq_n_u32(ascii, middle, 2); - // Insert twice - // xxxxxxxx xxxaaabb bbbbxxxx xxxxxxxx - uint32x4_t ab = vbslq_u32(vmovq_n_u32(0x01C0000), vshrq_n_u32(corrected, 6), - vshrq_n_u32(corrected, 4)); - // 00000000 000aaabb bbbbcccc ccdddddd - uint32x4_t composed = vbslq_u32(vmovq_n_u32(0xFFE00FFF), cd, ab); - // Store - vst1q_u32(utf32_output, composed); - utf32_output += 3; // We wrote 3 32-bit characters. - return consumed; - } else { - // here we know that there is an error but we do not handle errors - return 12; - } -} -/* end file src/arm64/arm_convert_utf8_to_utf32.cpp */ + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = + static_cast(_mm256_movemask_epi8(saturation_bytemask)); -/* begin file src/arm64/arm_convert_utf16_to_latin1.cpp */ + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + forbidden_bytemask = _mm256_or_si256( + forbidden_bytemask, + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); -template -std::pair -arm_convert_utf16_to_latin1(const char16_t *buf, size_t len, - char *latin1_output) { - const char16_t *end = buf + len; - while (end - buf >= 8) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { - in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); - } - if (vmaxvq_u16(in) <= 0xff) { - // 1. pack the bytes - uint8x8_t latin1_packed = vmovn_u16(in); - // 2. store (8 bytes) - vst1_u8(reinterpret_cast(latin1_output), latin1_packed); - // 3. 
adjust pointers + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), + _mm256_extractf128_si256(in, 1)); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); + utf16_output += 8; buf += 8; - latin1_output += 8; } else { - return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + size_t forward = 7; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, utf16_output); + } + *utf16_output++ = + big_endian + ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair(nullptr, utf16_output); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = + uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = + uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; } - } // while - return std::make_pair(buf, latin1_output); + } + + // check for invalid input + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, utf16_output); + } + + return std::make_pair(buf, utf16_output); } +// Todo: currently, this is just the haswell code, optimize for icelake kernel. template -std::pair -arm_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, - char *latin1_output) { - const char16_t *start = buf; - const char16_t *end = buf + len; - while (end - buf >= 8) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { - in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); - } - if (vmaxvq_u16(in) <= 0xff) { - // 1. pack the bytes - uint8x8_t latin1_packed = vmovn_u16(in); - // 2. store (8 bytes) - vst1_u8(reinterpret_cast(latin1_output), latin1_packed); - // 3. 
adjust pointers +std::pair +avx512_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const char32_t *start = buf; + const char32_t *end = buf + len; + + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = + static_cast(_mm256_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + const __m256i forbidden_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != + 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + utf16_output); + } + + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), + _mm256_extractf128_si256(in, 1)); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); + utf16_output += 8; buf += 8; - latin1_output += 8; } else { - // Let us do a scalar fallback. - for (int k = 0; k < 8; k++) { - uint16_t word = !match_system(big_endian) - ? scalar::utf16::swap_bytes(buf[k]) - : buf[k]; - if (word <= 0xff) { - *latin1_output++ = char(word); + size_t forward = 7; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), utf16_output); + } + *utf16_output++ = + big_endian + ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) + : char16_t(word); } else { - return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), - latin1_output); + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), utf16_output); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = + uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = + uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); } } + buf += k; } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), - latin1_output); + } + + return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); +} +/* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ +/* begin file src/icelake/icelake_ascii_validation.inl.cpp */ +// file included directly + +bool validate_ascii(const char *buf, size_t len) { + const char *end = buf + len; + const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); + __m512i running_or = _mm512_setzero_si512(); + for (; end - buf >= 64; buf += 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)buf); + running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, + 0xf8); // running_or | (utf8 & ascii) + } + if (buf < end) { + const __m512i utf8 = _mm512_maskz_loadu_epi8( + (uint64_t(1) << (end - buf)) - 1, (const __m512i *)buf); + running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, + 0xf8); // running_or | (utf8 & ascii) + } + return (_mm512_test_epi8_mask(running_or, running_or) == 0); } -/* end file src/arm64/arm_convert_utf16_to_latin1.cpp */ -/* begin file src/arm64/arm_convert_utf16_to_utf32.cpp */ -/* - The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit code units. +/* end file src/icelake/icelake_ascii_validation.inl.cpp */ +/* begin file src/icelake/icelake_utf32_validation.inl.cpp */ +// file included directly - We consider three cases: - 1. an input register contains no surrogates and each value - is in range 0x0000 .. 0x07ff. - 2. an input register contains no surrogates and values are - is in range 0x0000 .. 0xffff. - 3. an input register contains surrogates --- i.e. codepoints - can have 16 or 32 bits. +const char32_t *validate_utf32(const char32_t *buf, size_t len) { + if (len < 16) { + return buf; + } + const char32_t *end = buf + len - 16; - Ad 1. + const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000); + __m512i currentmax = _mm512_setzero_si512(); + __m512i currentoffsetmax = _mm512_setzero_si512(); - When values are less than 0x0800, it means that a 16-bit code unit - can be converted into: 1) single UTF8 byte (when it is an ASCII - char) or 2) two UTF8 bytes. + while (buf <= end) { + __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf); + buf += 16; + currentoffsetmax = + _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax); + currentmax = _mm512_max_epu32(utf32, currentmax); + } - For this case we do only some shuffle to obtain these 2-byte - codes and finally compress the whole SSE register with a single - shuffle. 
+ const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff); + const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff); + __m512i is_zero = + _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax); + if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) { + return nullptr; + } + is_zero = _mm512_xor_si512( + _mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) { + return nullptr; + } - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. + return buf; +} +/* end file src/icelake/icelake_utf32_validation.inl.cpp */ +/* begin file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */ +// file included directly - Ad 2. +static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len, + char *utf8_output, + int mask_output) { + __mmask64 nonascii = _mm512_movepi8_mask(input); + size_t output_size = input_len + (size_t)count_ones(nonascii); - When values fit in 16-bit code units, but are above 0x07ff, then - a single word may produce one, two or three UTF8 bytes. + // Mask to denote whether the byte is a leading byte that is not ascii + __mmask64 sixth = _mm512_cmpge_epu8_mask( + input, _mm512_set1_epi8(-64)); // binary representation of -64: 1100 0000 - We prepare data for all these three cases in two registers. - The first register contains lower two UTF8 bytes (used in all - cases), while the second one contains just the third byte for - the three-UTF8-bytes case. + const uint64_t alternate_bits = UINT64_C(0x5555555555555555); + uint64_t ascii = ~nonascii; + // the bits in ascii are inverted and zeros are interspersed in between them + uint64_t maskA = ~_pdep_u64(ascii, alternate_bits); + uint64_t maskB = ~_pdep_u64(ascii >> 32, alternate_bits); - Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. - The bytes from the registers are compressed using two shuffles. + // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD) + __m512i input_interleaved = _mm512_permutexvar_epi8( + _mm512_set_epi32(0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818, + 0x37173616, 0x35153414, 0x33133212, 0x31113010, + 0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808, + 0x27072606, 0x25052404, 0x23032202, 0x21012000), + input); - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. + // double size of each byte, and insert the leading byte 1100 0010 + /* + upscale the bytes to 16-bit value, adding the 0b11000000 leading byte in the + process. We adjust for the bytes that have their two most significant bits. + This takes care of the first 32 bytes, assuming we interleaved the bytes. */ + __m512i outputA = + _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8); + outputA = _mm512_mask_add_epi16( + outputA, (__mmask32)sixth, outputA, + _mm512_set1_epi16(1 - 0x4000)); // 1- 0x4000 = 1100 0000 0000 0001???? - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. -*/ -/* - Returns a pair: the first unprocessed byte from buf and utf8_output - A scalar routing should carry on the conversion of the tail. 
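The masked-add and blend steps above implement the usual Latin-1 to UTF-8 expansion. A scalar reference (hypothetical helper) showing the two output shapes and why only bytes at or above 0xC0, the ones flagged by the sixth mask, need the 0xC3 lead byte:

#include <cstddef>
#include <cstdint>

static size_t latin1_to_utf8_scalar(const uint8_t *src, size_t len,
                                    uint8_t *dst) {
  size_t out = 0;
  for (size_t i = 0; i < len; i++) {
    uint8_t b = src[i];
    if (b < 0x80) {
      dst[out++] = b;                          // ASCII: copied as-is
    } else {
      dst[out++] = uint8_t(0xC0 | (b >> 6));   // 0xC2, or 0xC3 when b >= 0xC0
      dst[out++] = uint8_t(0x80 | (b & 0x3F)); // continuation byte
    }
  }
  return out; // equals len + popcount(non-ASCII bytes), as computed above
}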
-*/ -template -std::pair -arm_convert_utf16_to_utf32(const char16_t *buf, size_t len, - char32_t *utf32_out) { - uint32_t *utf32_output = reinterpret_cast(utf32_out); - const char16_t *end = buf + len; + // in the second 32-bit half, set first or second option based on whether + // original input is leading byte (second case) or not (first case) + __m512i leadingB = + _mm512_mask_blend_epi16((__mmask32)(sixth >> 32), + _mm512_set1_epi16(0x00c2), // 0000 0000 1101 0010 + _mm512_set1_epi16(0x40c3)); // 0100 0000 1100 0011 + __m512i outputB = _mm512_ternarylogic_epi32( + input_interleaved, leadingB, _mm512_set1_epi16((short)0xff00), + (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); + // prune redundant bytes + outputA = _mm512_maskz_compress_epi8(maskA, outputA); + outputB = _mm512_maskz_compress_epi8(maskB, outputB); - while (end - buf >= 8) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { - in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); - } + size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32; - const uint16x8_t surrogates_bytemask = - vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. - // However, it is likely an uncommon occurrence. - if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: no surrogate pairs, extend all 16-bit code units to 32-bit code - // units - vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); - vst1q_u32(utf32_output + 4, vmovl_high_u16(in)); - utf32_output += 8; - buf += 8; - // surrogate pair(s) in a register + if (mask_output) { + if (input_len > 32) { // is the second half of the input vector used? + __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_sizeA); + _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA); + utf8_output += output_sizeA; + write_mask = _bzhi_u64(~0ULL, (unsigned int)(output_size - output_sizeA)); + _mm512_mask_storeu_epi8(utf8_output, write_mask, outputB); } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint16_t word = !match_system(big_endian) - ? scalar::utf16::swap_bytes(buf[k]) - : buf[k]; - if ((word & 0xF800) != 0xD800) { - *utf32_output++ = char32_t(word); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) - ? scalar::utf16::swap_bytes(buf[k + 1]) - : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if ((diff | diff2) > 0x3FF) { - return std::make_pair(nullptr, - reinterpret_cast(utf32_output)); - } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - } - } - buf += k; + __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_size); + _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA); } - } // while - return std::make_pair(buf, reinterpret_cast(utf32_output)); + } else { + _mm512_storeu_si512(utf8_output, outputA); + utf8_output += output_sizeA; + _mm512_storeu_si512(utf8_output, outputB); + } + return output_size; } -/* - Returns a pair: a result struct and utf8_output. 
- If there is an error, the count field of the result is the position of the - error. Otherwise, it is the position of the first unprocessed byte in buf - (even if finished). A scalar routing should carry on the conversion of the - tail if needed. -*/ -template -std::pair -arm_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, - char32_t *utf32_out) { - uint32_t *utf32_output = reinterpret_cast(utf32_out); - const char16_t *start = buf; - const char16_t *end = buf + len; +static inline size_t latin1_to_utf8_avx512_branch(__m512i input, + char *utf8_output) { + __mmask64 nonascii = _mm512_movepi8_mask(input); + if (nonascii) { + return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0); + } else { + _mm512_storeu_si512(utf8_output, input); + return 64; + } +} - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); +size_t latin1_to_utf8_avx512_start(const char *buf, size_t len, + char *utf8_output) { + char *start = utf8_output; + size_t pos = 0; + // if there's at least 128 bytes remaining, we don't need to mask the output + for (; pos + 128 <= len; pos += 64) { + __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos)); + utf8_output += latin1_to_utf8_avx512_branch(input, utf8_output); + } + // in the last 128 bytes, the first 64 may require masking the output + if (pos + 64 <= len) { + __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos)); + utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1); + pos += 64; + } + // with the last 64 bytes, the input also needs to be masked + if (pos < len) { + __mmask64 load_mask = _bzhi_u64(~0ULL, (unsigned int)(len - pos)); + __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos)); + utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1); + } + return (size_t)(utf8_output - start); +} +/* end file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */ +/* begin file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */ +// file included directly +template +size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len, + char16_t *utf16_output) { + size_t rounded_len = len & ~0x1F; // Round down to nearest multiple of 32 - while ((end - buf) >= 8) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { - in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); + __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + for (size_t i = 0; i < rounded_len; i += 32) { + // Load 32 Latin1 characters into a 256-bit register + __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]); + // Zero extend each set of 8 Latin1 characters to 32 16-bit integers + __m512i out = _mm512_cvtepu8_epi16(in); + if (big_endian) { + out = _mm512_shuffle_epi8(out, byteflip); } + // Store the results back to memory + _mm512_storeu_si512((__m512i *)&utf16_output[i], out); + } + if (rounded_len != len) { + uint32_t mask = uint32_t(1 << (len - rounded_len)) - 1; + __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len); - const uint16x8_t surrogates_bytemask = - vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. - // However, it is likely an uncommon occurrence. 
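The scalar fallbacks above validate a surrogate pair by checking that both offsets fit in 10 bits before recombining them. A compact sketch (hypothetical helper with an ad-hoc error value):

#include <cstdint>

static uint32_t combine_surrogates(uint16_t high, uint16_t low) {
  uint16_t diff = uint16_t(high - 0xD800);  // must be 0..0x3FF
  uint16_t diff2 = uint16_t(low - 0xDC00);  // must be 0..0x3FF
  if ((diff | diff2) > 0x3FF) {
    return 0xFFFFFFFF; // mismatched or lone surrogate
  }
  return (uint32_t(diff) << 10) + diff2 + 0x10000;
}
// Example: combine_surrogates(0xD83D, 0xDE00) == 0x1F600.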
- if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: no surrogate pairs, extend all 16-bit code units to 32-bit code - // units - vst1q_u32(utf32_output, vmovl_u16(vget_low_u16(in))); - vst1q_u32(utf32_output + 4, vmovl_high_u16(in)); - utf32_output += 8; - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint16_t word = !match_system(big_endian) - ? scalar::utf16::swap_bytes(buf[k]) - : buf[k]; - if ((word & 0xF800) != 0xD800) { - *utf32_output++ = char32_t(word); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) - ? scalar::utf16::swap_bytes(buf[k + 1]) - : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if ((diff | diff2) > 0x3FF) { - return std::make_pair( - result(error_code::SURROGATE, buf - start + k - 1), - reinterpret_cast(utf32_output)); - } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf32_output++ = char32_t(value); - } - } - buf += k; + // Zero extend each set of 8 Latin1 characters to 32 16-bit integers + __m512i out = _mm512_cvtepu8_epi16(in); + if (big_endian) { + out = _mm512_shuffle_epi8(out, byteflip); } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), - reinterpret_cast(utf32_output)); + // Store the results back to memory + _mm512_mask_storeu_epi16(utf16_output + rounded_len, mask, out); + } + + return len; } -/* end file src/arm64/arm_convert_utf16_to_utf32.cpp */ -/* begin file src/arm64/arm_convert_utf16_to_utf8.cpp */ -/* - The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit code units. +/* end file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */ +/* begin file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */ +std::pair +avx512_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - We consider three cases: - 1. an input register contains no surrogates and each value - is in range 0x0000 .. 0x07ff. - 2. an input register contains no surrogates and values are - is in range 0x0000 .. 0xffff. - 3. an input register contains surrogates --- i.e. codepoints - can have 16 or 32 bits. + for (size_t i = 0; i < rounded_len; i += 16) { + // Load 16 Latin1 characters into a 128-bit register + __m128i in = _mm_loadu_si128((__m128i *)&buf[i]); - Ad 1. + // Zero extend each set of 8 Latin1 characters to 16 32-bit integers using + // vpmovzxbd + __m512i out = _mm512_cvtepu8_epi32(in); - When values are less than 0x0800, it means that a 16-bit code unit - can be converted into: 1) single UTF8 byte (when it is an ASCII - char) or 2) two UTF8 bytes. + // Store the results back to memory + _mm512_storeu_si512((__m512i *)&utf32_output[i], out); + } - For this case we do only some shuffle to obtain these 2-byte - codes and finally compress the whole SSE register with a single - shuffle. 
+ // Return pointers pointing to where we left off + return std::make_pair(buf + rounded_len, utf32_output + rounded_len); +} +/* end file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */ +/* begin file src/icelake/icelake_base64.inl.cpp */ +// file included directly +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + */ - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. +struct block64 { + __m512i chunks[1]; +}; + +template +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + // credit: Wojciech Muła + const uint8_t *input = (const uint8_t *)src; + + uint8_t *out = (uint8_t *)dst; + static const char *lookup_tbl = + base64_url + ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + + const __m512i shuffle_input = _mm512_setr_epi32( + 0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10, + 0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122, + 0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e); + const __m512i lookup = + _mm512_loadu_si512(reinterpret_cast(lookup_tbl)); + const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a)); + size_t size = srclen; + __mmask64 input_mask = 0xffffffffffff; // (1 << 48) - 1 + while (size >= 48) { + const __m512i v = _mm512_maskz_loadu_epi8( + input_mask, reinterpret_cast(input)); + const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v); + const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in); + const __m512i result = _mm512_permutexvar_epi8(indices, lookup); + _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result); + out += 64; + input += 48; + size -= 48; + } + input_mask = ((__mmask64)1 << size) - 1; + const __m512i v = _mm512_maskz_loadu_epi8( + input_mask, reinterpret_cast(input)); + const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v); + const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in); + bool padding_needed = + (((options & base64_url) == 0) ^ + ((options & base64_reverse_padding) == base64_reverse_padding)); + size_t padding_amount = ((size % 3) > 0) ? 
(3 - (size % 3)) : 0; + size_t output_len = ((size + 2) / 3) * 4; + size_t non_padded_output_len = output_len - padding_amount; + if (!padding_needed) { + output_len = non_padded_output_len; + } + __mmask64 output_mask = output_len == 64 ? (__mmask64)UINT64_MAX + : ((__mmask64)1 << output_len) - 1; + __m512i result = _mm512_mask_permutexvar_epi8( + _mm512_set1_epi8('='), ((__mmask64)1 << non_padded_output_len) - 1, + indices, lookup); + _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask, + result); + return (size_t)(out - (uint8_t *)dst) + output_len; +} + +template +static inline uint64_t to_base64_mask(block64 *b, uint64_t *error, + uint64_t input_mask = UINT64_MAX) { + __m512i input = b->chunks[0]; + const __m512i ascii_space_tbl = _mm512_set_epi8( + 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, + 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, + 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32); + __m512i lookup0; + if (base64_url) { + lookup0 = _mm512_set_epi8( + -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, + 52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, + -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1); + } else { + lookup0 = _mm512_set_epi8( + -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, + 52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128, + -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128); + } + __m512i lookup1; + if (base64_url) { + lookup1 = _mm512_set_epi8( + -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, + 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, + 63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, + 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); + } else { + lookup1 = _mm512_set_epi8( + -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, + 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, + -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, + 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); + } + + const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1); + const __m512i combined = _mm512_or_si512(translated, input); + const __mmask64 mask = _mm512_movepi8_mask(combined) & input_mask; + if (mask) { + const __mmask64 spaces = + _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(ascii_space_tbl, input), + input) & + input_mask; + *error = (mask ^ spaces); + } + b->chunks[0] = translated; + + return mask | (~input_mask); +} - Ad 2. +static inline void copy_block(block64 *b, char *output) { + _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]); +} - When values fit in 16-bit code units, but are above 0x07ff, then - a single word may produce one, two or three UTF8 bytes. +static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { + uint64_t nmask = ~mask; + __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]); + _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c); + return _mm_popcnt_u64(nmask); +} - We prepare data for all these three cases in two registers. 
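The tail handling above derives the padded output length and the number of '=' characters from size % 3. The same arithmetic in scalar form (illustrative):

#include <cstddef>

// Base64 output is written in 4-character groups; when padding is enabled
// the last group is completed with one or two '=' characters, never three.
static size_t base64_tail_lengths(size_t size, bool padding_needed,
                                  size_t *padding_amount) {
  *padding_amount = (size % 3) ? 3 - (size % 3) : 0; // 0, 1 or 2 '=' chars
  size_t output_len = ((size + 2) / 3) * 4;          // padded length
  if (!padding_needed) {
    output_len -= *padding_amount;                   // unpadded (base64url)
  }
  return output_len;
}
// e.g. size = 5: padded length 8 with one '=', unpadded length 7.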
- The first register contains lower two UTF8 bytes (used in all - cases), while the second one contains just the third byte for - the three-UTF8-bytes case. +// The caller of this function is responsible to ensure that there are 64 bytes +// available from reading at src. The data is read into a block64 structure. +static inline void load_block(block64 *b, const char *src) { + b->chunks[0] = _mm512_loadu_si512(reinterpret_cast(src)); +} - Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. - The bytes from the registers are compressed using two shuffles. +static inline void load_block_partial(block64 *b, const char *src, + __mmask64 input_mask) { + b->chunks[0] = _mm512_maskz_loadu_epi8( + input_mask, reinterpret_cast(src)); +} - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. +// The caller of this function is responsible to ensure that there are 128 bytes +// available from reading at src. The data is read into a block64 structure. +static inline void load_block(block64 *b, const char16_t *src) { + __m512i m1 = _mm512_loadu_si512(reinterpret_cast(src)); + __m512i m2 = _mm512_loadu_si512(reinterpret_cast(src + 32)); + __m512i p = _mm512_packus_epi16(m1, m2); + b->chunks[0] = + _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p); +} +static inline void load_block_partial(block64 *b, const char16_t *src, + __mmask64 input_mask) { + __m512i m1 = _mm512_maskz_loadu_epi16((__mmask32)input_mask, + reinterpret_cast(src)); + __m512i m2 = + _mm512_maskz_loadu_epi16((__mmask32)(input_mask >> 32), + reinterpret_cast(src + 32)); + __m512i p = _mm512_packus_epi16(m1, m2); + b->chunks[0] = + _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p); +} - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. -*/ -/* - Returns a pair: the first unprocessed byte from buf and utf8_output - A scalar routing should carry on the conversion of the tail. -*/ -template -std::pair -arm_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { - uint8_t *utf8_output = reinterpret_cast(utf8_out); - const char16_t *end = buf + len; +static inline void base64_decode(char *out, __m512i str) { + const __m512i merge_ab_and_bc = + _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140)); + const __m512i merged = + _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); + const __m512i pack = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58, + 52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34, + 28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4, + 5, 6, 0, 1, 2); + const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); + _mm512_mask_storeu_epi8( + (__m512i *)out, 0xffffffffffff, + shuffled); // mask would be 0xffffffffffff since we write 48 bytes. 
+} +// decode 64 bytes and output 48 bytes +static inline void base64_decode_block(char *out, const char *src) { + base64_decode(out, + _mm512_loadu_si512(reinterpret_cast(src))); +} +static inline void base64_decode_block(char *out, block64 *b) { + base64_decode(out, b->chunks[0]); +} - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); - const size_t safety_margin = - 12; // to avoid overruns, see issue - // https://github.com/simdutf/simdutf/issues/92 - while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { - in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); +template +full_result +compress_decode_base64(char *dst, const chartype *src, size_t srclen, + base64_options options, + last_chunk_handling_options last_chunk_options) { + (void)options; + const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value; + size_t equallocation = + srclen; // location of the first padding character if any + size_t equalsigns = 0; + // skip trailing spaces + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + to_base64[uint8_t(src[srclen - 1])] == 64) { + srclen--; + } + if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; + srclen--; + equalsigns = 1; + // skip trailing spaces + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + to_base64[uint8_t(src[srclen - 1])] == 64) { + srclen--; } - if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! - // It is common enough that we have sequences of 16 consecutive ASCII - // characters. - uint16x8_t nextin = - vld1q_u16(reinterpret_cast(buf) + 8); - if (!match_system(big_endian)) { - nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin))); + if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; + srclen--; + equalsigns = 2; + } + } + if (srclen == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, 0, 0}; + } + const chartype *const srcinit = src; + const char *const dstinit = dst; + const chartype *const srcend = src + srclen; + + // figure out why block_size == 2 is sometimes best??? + constexpr size_t block_size = 6; + char buffer[block_size * 64]; + char *bufferptr = buffer; + if (srclen >= 64) { + const chartype *const srcend64 = src + srclen - 64; + while (src <= srcend64) { + block64 b; + load_block(&b, src); + src += 64; + uint64_t error = 0; + uint64_t badcharmask = to_base64_mask(&b, &error); + if (error) { + src -= 64; + size_t error_offset = _tzcnt_u64(error); + return {error_code::INVALID_BASE64_CHARACTER, + size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; } - if (vmaxvq_u16(nextin) > 0x7F) { - // 1. pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(in); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - in = nextin; + if (badcharmask != 0) { + // optimization opportunity: check for simple masks like those made of + // continuous 1s followed by continuous 0s. And masks containing a + // single bad character. + bufferptr += compress_block(&b, badcharmask, bufferptr); + } else if (bufferptr != buffer) { + copy_block(&b, bufferptr); + bufferptr += 64; } else { - // 1. pack the bytes - // obviously suboptimal. 
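The maddubs/madd constants used in base64_decode above merge four translated 6-bit values into one 24-bit triple. A scalar equivalent (hypothetical helper) showing which bits end up in which output byte:

#include <cstdint>

static void decode_base64_quad(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
                               uint8_t out[3]) {
  uint32_t triple = (uint32_t(a) << 18) | (uint32_t(b) << 12) |
                    (uint32_t(c) << 6) | uint32_t(d);
  out[0] = uint8_t(triple >> 16); // (a << 2) | (b >> 4)
  out[1] = uint8_t(triple >> 8);  // ((b & 0xF) << 4) | (c >> 2)
  out[2] = uint8_t(triple);       // ((c & 0x3) << 6) | d
}
// In the vector code, maddubs(0x01400140) builds (x*64 + y) for each byte
// pair and madd(0x00011000) then builds (xy*4096 + zw), which is the same
// triple; the final permute drops the unused fourth byte of every group.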
- uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); - // 2. store (16 bytes) - vst1q_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! + base64_decode_block(dst, &b); + dst += 48; + } + if (bufferptr >= (block_size - 1) * 64 + buffer) { + for (size_t i = 0; i < (block_size - 1); i++) { + base64_decode_block(dst, buffer + i * 64); + dst += 48; + } + std::memcpy(buffer, buffer + (block_size - 1) * 64, + 64); // 64 might be too much + bufferptr -= (block_size - 1) * 64; } } + } - if (vmaxvq_u16(in) <= 0x7FF) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); - const uint8x16_t utf8_unpacked = - vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); - // 3. prepare bitmask for 8-bit lookup -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = simdutf_make_uint16x8_t( - 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); -#else - const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, - 0x0002, 0x0008, 0x0020, 0x0080}; -#endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. pack the bytes - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); - - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); - - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; + int last_block_len = (int)(srcend - src); + if (last_block_len != 0) { + __mmask64 input_mask = ((__mmask64)1 << last_block_len) - 1; + block64 b; + load_block_partial(&b, src, input_mask); + uint64_t error = 0; + uint64_t badcharmask = to_base64_mask(&b, &error, input_mask); + if (error) { + size_t error_offset = _tzcnt_u64(error); + return {error_code::INVALID_BASE64_CHARACTER, + size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; } - const uint16x8_t surrogates_bytemask = - vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. - // However, it is likely an uncommon occurrence. - if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = simdutf_make_uint16x8_t( - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); -#else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; -#endif - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two - UTF-8 bytes - 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - - three UTF-8 bytes + src += last_block_len; + bufferptr += compress_block(&b, badcharmask, bufferptr); + } - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + char *buffer_start = buffer; + for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { + base64_decode_block(dst, buffer_start); + dst += 48; + } - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + if ((bufferptr - buffer_start) != 0) { + size_t rem = (bufferptr - buffer_start); + int idx = rem % 4; + __mmask64 mask = ((__mmask64)1 << rem) - 1; + __m512i input = _mm512_maskz_loadu_epi8(mask, buffer_start); + size_t output_len = (rem / 4) * 3; + __mmask64 output_mask = mask >> (rem - output_len); + const __m512i merge_ab_and_bc = + _mm512_maddubs_epi16(input, _mm512_set1_epi32(0x01400140)); + const __m512i merged = + _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); + const __m512i pack = _mm512_set_epi8( + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58, + 52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34, + 28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4, + 5, 6, 0, 1, 2); + const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); + + if (last_chunk_options == last_chunk_handling_options::strict && + (idx != 1) && ((idx + equalsigns) & 3) != 0) { + // The partial chunk was at src - idx + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } else if (last_chunk_options == + last_chunk_handling_options::stop_before_partial && + (idx != 1) && ((idx + equalsigns) & 3) != 0) { + // Rewind src to before partial chunk + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + src -= idx; + } else { + if (idx == 2) { + if (last_chunk_options == last_chunk_handling_options::strict) { + uint32_t triple = (uint32_t(bufferptr[-2]) << 3 * 6) + + (uint32_t(bufferptr[-1]) << 2 * 6); + if (triple & 0xffff) { + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + return {BASE64_EXTRA_BITS, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + } + output_mask = (output_mask << 1) | 1; + output_len += 1; + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + } else if (idx == 3) { + if (last_chunk_options == last_chunk_handling_options::strict) { + uint32_t triple = (uint32_t(bufferptr[-3]) << 3 * 6) + + (uint32_t(bufferptr[-2]) << 2 * 6) + + (uint32_t(bufferptr[-1]) << 1 * 6); + if (triple & 0xff) { + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + return {BASE64_EXTRA_BITS, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + } + output_mask = (output_mask << 2) | 3; + output_len += 2; + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + } else if (idx == 1) { + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + return {BASE64_INPUT_REMAINDER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } else { + _mm512_mask_storeu_epi8((__m512i *)dst, output_mask, shuffled); + dst += output_len; + } + } - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - 
either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + if (last_chunk_options != stop_before_partial && equalsigns > 0) { + size_t output_count = size_t(dst - dstinit); + if ((output_count % 3 == 0) || + ((output_count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, output_count}; + } + } - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = vreinterpretq_u16_u8( - vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000)); + return {SUCCESS, srclen, size_t(dst - dstinit)}; + } - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(in, 12); - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); - const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); - const uint16x8_t m0 = - vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); -#undef simdutf_vec + if (equalsigns > 0) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; + } + } + return {SUCCESS, srclen, size_t(dst - dstinit)}; +} +/* end file src/icelake/icelake_base64.inl.cpp */ - // 4. expand code units 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); +#include - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = simdutf_make_uint16x8_t( - 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000); - const uint16x8_t twomask = simdutf_make_uint16x8_t( - 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000); -#else - const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040, - 0x0100, 0x0400, 0x1000, 0x4000}; - const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080, - 0x0200, 0x0800, 0x2000, 0x8000}; -#endif - const uint16x8_t combined = - vorrq_u16(vandq_u16(one_byte_bytemask, onemask), - vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. 
- const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); +} // namespace +} // namespace icelake +} // namespace simdutf - const uint8_t *row0 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); +namespace simdutf { +namespace icelake { - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t *row1 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + // todo: convert to a one-pass algorithm + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + int out = 0; + if (validate_utf8(input, length)) { + out |= encoding_type::UTF8; + } + if ((length % 2) == 0) { + if (validate_utf16le(reinterpret_cast(input), + length / 2)) { + out |= encoding_type::UTF16_LE; + } + } + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { + out |= encoding_type::UTF32_LE; + } + } + return out; +} - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return true; + } + avx512_utf8_checker checker{}; + const char *ptr = buf; + const char *end = ptr + len; + for (; end - ptr >= 64; ptr += 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + checker.check_next_input(utf8); + } + if (end != ptr) { + const __m512i utf8 = _mm512_maskz_loadu_epi8( + ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr); + checker.check_next_input(utf8); + } + checker.check_eof(); + return !checker.errors(); +} - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint16_t word = !match_system(big_endian) - ? scalar::utf16::swap_bytes(buf[k]) - : buf[k]; - if ((word & 0xFF80) == 0) { - *utf8_output++ = char(word); - } else if ((word & 0xF800) == 0) { - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if ((word & 0xF800) != 0xD800) { - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) - ? 
scalar::utf16::swap_bytes(buf[k + 1]) - : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if ((diff | diff2) > 0x3FF) { - return std::make_pair(nullptr, - reinterpret_cast(utf8_output)); - } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf8_output++ = char((value >> 18) | 0b11110000); - *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); - } - } - buf += k; +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, len); + } + avx512_utf8_checker checker{}; + const char *ptr = buf; + const char *end = ptr + len; + size_t count{0}; + for (; end - ptr >= 64; ptr += 64) { + const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); + checker.check_next_input(utf8); + if (checker.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(buf), + reinterpret_cast(buf + count), len - count); + res.count += count; + return res; } - } // while + count += 64; + } + if (end != ptr) { + const __m512i utf8 = _mm512_maskz_loadu_epi8( + ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr); + checker.check_next_input(utf8); + } + checker.check_eof(); + if (checker.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(buf), + reinterpret_cast(buf + count), len - count); + res.count += count; + return res; + } + return result(error_code::SUCCESS, len); +} + +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return icelake::validate_ascii(buf, len); +} - return std::make_pair(buf, reinterpret_cast(utf8_output)); +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + const char *buf_orig = buf; + const char *end = buf + len; + const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); + for (; end - buf >= 64; buf += 64) { + const __m512i input = _mm512_loadu_si512((const __m512i *)buf); + __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); + if (notascii) { + return result(error_code::TOO_LARGE, + buf - buf_orig + _tzcnt_u64(notascii)); + } + } + if (end != buf) { + const __m512i input = _mm512_maskz_loadu_epi8( + ~UINT64_C(0) >> (64 - (end - buf)), (const __m512i *)buf); + __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); + if (notascii) { + return result(error_code::TOO_LARGE, + buf - buf_orig + _tzcnt_u64(notascii)); + } + } + return result(error_code::SUCCESS, len); } -/* - Returns a pair: a result struct and utf8_output. - If there is an error, the count field of the result is the position of the - error. Otherwise, it is the position of the first unprocessed byte in buf - (even if finished). A scalar routing should carry on the conversion of the - tail if needed. 
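// Illustrative sketch (not part of the patch): the scalar contract that the
// validate_ascii_with_errors routine above implements with 64-byte masked
// loads -- report the index of the first byte with the high bit set. The
// names ascii_scan_result and scan_ascii are made up for this example.
#include <cstddef>
#include <cstdint>

struct ascii_scan_result {
  bool ok;          // true when every byte is < 0x80
  size_t position;  // index of the first offending byte when !ok
};

inline ascii_scan_result scan_ascii(const char *buf, size_t len) {
  for (size_t i = 0; i < len; i++) {
    if (static_cast<uint8_t>(buf[i]) >= 0x80) {
      return {false, i};  // same index the _tzcnt_u64(notascii) path yields
    }
  }
  return {true, len};
}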
-*/ -template -std::pair -arm_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, - char *utf8_out) { - uint8_t *utf8_output = reinterpret_cast(utf8_out); - const char16_t *start = buf; +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { const char16_t *end = buf + len; - const uint16x8_t v_f800 = vmovq_n_u16((uint16_t)0xf800); - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); - const size_t safety_margin = - 12; // to avoid overruns, see issue - // https://github.com/simdutf/simdutf/issues/92 - - while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { - uint16x8_t in = vld1q_u16(reinterpret_cast(buf)); - if (!match_system(big_endian)) { - in = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(in))); - } - if (vmaxvq_u16(in) <= 0x7F) { // ASCII fast path!!!! - // It is common enough that we have sequences of 16 consecutive ASCII - // characters. - uint16x8_t nextin = - vld1q_u16(reinterpret_cast(buf) + 8); - if (!match_system(big_endian)) { - nextin = vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(nextin))); + for (; end - buf >= 32;) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; } - if (vmaxvq_u16(nextin) > 0x7F) { - // 1. pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(in); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - in = nextin; + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if (ends_with_high) { + buf += 31; // advance only by 31 code units so that we start with the + // high surrogate on the next round. } else { - // 1. pack the bytes - // obviously suboptimal. - uint8x16_t utf8_packed = vmovn_high_u16(vmovn_u16(in), nextin); - // 2. store (16 bytes) - vst1q_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! + buf += 32; } + } else { + buf += 32; } - - if (vmaxvq_u16(in) <= 0x7FF) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); - const uint8x16_t utf8_unpacked = - vreinterpretq_u8_u16(vbslq_u16(one_byte_bytemask, in, t4)); - // 3. 
prepare bitmask for 8-bit lookup -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = simdutf_make_uint16x8_t( - 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); -#else - const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, - 0x0002, 0x0008, 0x0020, 0x0080}; -#endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. pack the bytes - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); - - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); - - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; + } + if (buf < end) { + __m512i in = + _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; + } } - const uint16x8_t surrogates_bytemask = - vceqq_u16(vandq_u16(in, v_f800), v_d800); - // It might seem like checking for surrogates_bitmask == 0xc000 could help. - // However, it is likely an uncommon occurrence. - if (vmaxvq_u16(surrogates_bytemask) == 0) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = simdutf_make_uint16x8_t( - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); -#else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; -#endif - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two - UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - - three UTF-8 bytes - - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. - - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. - - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. - - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). 
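// Illustrative sketch (not part of the patch): the pairing rule that the
// (highsurrogates << 1) == lowsurrogates mask test above enforces, written as
// a scalar loop. It assumes native-endian input; the big-endian variant only
// adds the byteflip shuffle. The helper name is made up for this example.
#include <cstddef>
#include <cstdint>

inline bool utf16_pairs_ok(const char16_t *buf, size_t len) {
  for (size_t i = 0; i < len; i++) {
    uint16_t w = uint16_t(buf[i]);
    if (w >= 0xD800 && w <= 0xDBFF) {              // high surrogate...
      if (i + 1 == len) return false;              // ...must not end the input
      uint16_t next = uint16_t(buf[i + 1]);
      if (next < 0xDC00 || next > 0xDFFF) return false;  // ...and needs a low one
      i++;                                         // consume the pair
    } else if (w >= 0xDC00 && w <= 0xDFFF) {
      return false;                                // stray low surrogate
    }
  }
  return true;
}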
- */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = vreinterpretq_u16_u8( - vqtbl1q_u8(vreinterpretq_u8_u16(in), vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000)); + } + return true; +} - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(in, 12); - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = vandq_u16(in, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); - const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = vcleq_u16(in, v_07ff); - const uint16x8_t m0 = - vbicq_u16(simdutf_vec(0b0100000000000000), one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); -#undef simdutf_vec +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + const char16_t *end = buf + len; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + for (; end - buf >= 32;) { + __m512i in = + _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; + } + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if (ends_with_high) { + buf += 31; // advance only by 31 code units so that we start with the + // high surrogate on the next round. + } else { + buf += 32; + } + } else { + buf += 32; + } + } + if (buf < end) { + __m512i in = _mm512_shuffle_epi8( + _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf), + byteflip); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + return false; + } + } + } + return true; +} - // 4. 
expand code units 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + const char16_t *start_buf = buf; + const char16_t *end = buf + len; + for (; end - buf >= 32;) { + __m512i in = _mm512_loadu_si512((__m512i *)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); + uint32_t extra_high = + _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, + (buf - start_buf) + + (extra_low < extra_high ? extra_low : extra_high)); + } + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if (ends_with_high) { + buf += 31; // advance only by 31 code units so that we start with the + // high surrogate on the next round. + } else { + buf += 32; + } + } else { + buf += 32; + } + } + if (buf < end) { + __m512i in = + _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); + uint32_t extra_high = + _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, + (buf - start_buf) + + (extra_low < extra_high ? extra_low : extra_high)); + } + } + } + return result(error_code::SUCCESS, len); +} - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(in, v_007f); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = simdutf_make_uint16x8_t( - 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000); - const uint16x8_t twomask = simdutf_make_uint16x8_t( - 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000); -#else - const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040, - 0x0100, 0x0400, 0x1000, 0x4000}; - const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080, - 0x0200, 0x0800, 0x2000, 0x8000}; -#endif - const uint16x8_t combined = - vorrq_u16(vandq_u16(one_byte_bytemask, onemask), - vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. 
- const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + const char16_t *start_buf = buf; + const char16_t *end = buf + len; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + for (; end - buf >= 32;) { + __m512i in = + _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); + uint32_t extra_high = + _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, + (buf - start_buf) + + (extra_low < extra_high ? extra_low : extra_high)); + } + bool ends_with_high = ((highsurrogates & 0x80000000) != 0); + if (ends_with_high) { + buf += 31; // advance only by 31 code units so that we start with the + // high surrogate on the next round. + } else { + buf += 32; + } + } else { + buf += 32; + } + } + if (buf < end) { + __m512i in = _mm512_shuffle_epi8( + _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf), + byteflip); + __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); + __mmask32 surrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); + if (surrogates) { + __mmask32 highsurrogates = + _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); + __mmask32 lowsurrogates = surrogates ^ highsurrogates; + // high must be followed by low + if ((highsurrogates << 1) != lowsurrogates) { + uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); + uint32_t extra_high = + _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); + return result(error_code::SURROGATE, + (buf - start_buf) + + (extra_low < extra_high ? extra_low : extra_high)); + } + } + } + return result(error_code::SUCCESS, len); +} - const uint8_t *row0 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + const char32_t *tail = icelake::validate_utf32(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + // we come here if there was an error, or buf was nullptr which may happen + // for empty input. 
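// Illustrative sketch (not part of the patch): the per-code-point rule that
// validate_utf32 above checks -- a value is valid iff it is at most U+10FFFF
// and not in the surrogate range U+D800..U+DFFF. The helper names are made up
// for this example.
#include <cstddef>
#include <cstdint>

inline bool utf32_code_point_ok(char32_t c) {
  uint32_t v = uint32_t(c);
  return v <= 0x10FFFF && !(v >= 0xD800 && v <= 0xDFFF);
}

inline bool utf32_range_ok(const char32_t *buf, size_t len) {
  for (size_t i = 0; i < len; i++) {
    if (!utf32_code_point_ok(buf[i])) return false;
  }
  return true;
}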
+ return len == 0; + } +} - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t *row1 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + const char32_t *buf_orig = buf; + if (len >= 16) { + const char32_t *end = buf + len - 16; + while (buf <= end) { + __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf); + __mmask16 outside_range = _mm512_cmp_epu32_mask( + utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT); - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; + __m512i utf32_off = + _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); - buf += 8; - // surrogate pair(s) in a register - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint16_t word = !match_system(big_endian) - ? scalar::utf16::swap_bytes(buf[k]) - : buf[k]; - if ((word & 0xFF80) == 0) { - *utf8_output++ = char(word); - } else if ((word & 0xF800) == 0) { - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if ((word & 0xF800) != 0xD800) { - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - // must be a surrogate pair - uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = !match_system(big_endian) - ? 
scalar::utf16::swap_bytes(buf[k + 1]) - : buf[k + 1]; - k++; - uint16_t diff2 = uint16_t(next_word - 0xDC00); - if ((diff | diff2) > 0x3FF) { - return std::make_pair( - result(error_code::SURROGATE, buf - start + k - 1), - reinterpret_cast(utf8_output)); - } - uint32_t value = (diff << 10) + diff2 + 0x10000; - *utf8_output++ = char((value >> 18) | 0b11110000); - *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((value & 0b111111) | 0b10000000); + __mmask16 surrogate_range = _mm512_cmp_epu32_mask( + utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT); + if ((outside_range | surrogate_range)) { + auto outside_idx = _tzcnt_u32(outside_range); + auto surrogate_idx = _tzcnt_u32(surrogate_range); + + if (outside_idx < surrogate_idx) { + return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx); } + + return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx); } - buf += k; + + buf += 16; } - } // while + } + if (len > 0) { + __m512i utf32 = _mm512_maskz_loadu_epi32( + __mmask16((1U << (buf_orig + len - buf)) - 1), (const __m512i *)buf); + __mmask16 outside_range = _mm512_cmp_epu32_mask( + utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT); + __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); - return std::make_pair(result(error_code::SUCCESS, buf - start), - reinterpret_cast(utf8_output)); + __mmask16 surrogate_range = _mm512_cmp_epu32_mask( + utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT); + if ((outside_range | surrogate_range)) { + auto outside_idx = _tzcnt_u32(outside_range); + auto surrogate_idx = _tzcnt_u32(surrogate_range); + + if (outside_idx < surrogate_idx) { + return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx); + } + + return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx); + } + } + + return result(error_code::SUCCESS, len); } -/* end file src/arm64/arm_convert_utf16_to_utf8.cpp */ -/* begin file src/arm64/arm_base64.cpp */ -/** - * References and further reading: - * - * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the - * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. - * https://arxiv.org/abs/1910.05109 - * - * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 - * Instructions, ACM Transactions on the Web 12 (3), 2018. - * https://arxiv.org/abs/1704.00605 - * - * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. - * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, - * Request for Comments: 4648. - * - * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. - * http://www.alfredklomp.com/programming/sse-base64/. (2014). - * - * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD - * acceleration. https://github.com/aklomp/base64. (2014). - * - * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). - * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ - * - * Nick Kopp. 2013. Base64 Encoding on a GPU. - * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). 
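// Illustrative sketch (not part of the patch): why the biased comparison in
// validate_utf32_with_errors above works. Adding 0xFFFF2000 with 32-bit
// wraparound maps exactly the surrogate range U+D800..U+DFFF onto values above
// 0xFFFFF7FF, so a single unsigned compare replaces the two-sided range test.
// A brute-force check of the equivalence over the code-point range:
#include <cassert>
#include <cstdint>

inline bool is_surrogate_direct(uint32_t v) { return v >= 0xD800 && v <= 0xDFFF; }

inline bool is_surrogate_biased(uint32_t v) {
  return uint32_t(v + 0xFFFF2000u) > 0xFFFFF7FFu;  // wraparound is intentional
}

int main() {
  for (uint64_t v = 0; v <= 0x110000; v++) {  // every code point plus one past the end
    assert(is_surrogate_direct(uint32_t(v)) == is_surrogate_biased(uint32_t(v)));
  }
  return 0;
}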
- */ +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + return icelake::latin1_to_utf8_avx512_start(buf, len, utf8_output); +} -size_t encode_base64(char *dst, const char *src, size_t srclen, - base64_options options) { - // credit: Wojciech Muła - uint8_t *out = (uint8_t *)dst; - constexpr static uint8_t source_table[64] = { - 'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D', - 'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W', - 'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p', - '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8', - 'N', 'd', 't', '9', 'O', 'e', 'u', '+', 'P', 'f', 'v', '/', - }; - constexpr static uint8_t source_table_url[64] = { - 'A', 'Q', 'g', 'w', 'B', 'R', 'h', 'x', 'C', 'S', 'i', 'y', 'D', - 'T', 'j', 'z', 'E', 'U', 'k', '0', 'F', 'V', 'l', '1', 'G', 'W', - 'm', '2', 'H', 'X', 'n', '3', 'I', 'Y', 'o', '4', 'J', 'Z', 'p', - '5', 'K', 'a', 'q', '6', 'L', 'b', 'r', '7', 'M', 'c', 's', '8', - 'N', 'd', 't', '9', 'O', 'e', 'u', '-', 'P', 'f', 'v', '_', - }; - const uint8x16_t v3f = vdupq_n_u8(0x3f); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - // When trying to load a uint8_t array, Visual Studio might - // error with: error C2664: '__n128x4 neon_ld4m_q8(const char *)': - // cannot convert argument 1 from 'const uint8_t [64]' to 'const char * - const uint8x16x4_t table = vld4q_u8( - (reinterpret_cast(options & base64_url) ? source_table_url - : source_table)); -#else - const uint8x16x4_t table = - vld4q_u8((options & base64_url) ? source_table_url : source_table); -#endif - size_t i = 0; - for (; i + 16 * 3 <= srclen; i += 16 * 3) { - const uint8x16x3_t in = vld3q_u8((const uint8_t *)src + i); - uint8x16x4_t result; - result.val[0] = vshrq_n_u8(in.val[0], 2); - result.val[1] = - vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), v3f); - result.val[2] = - vandq_u8(vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), v3f); - result.val[3] = vandq_u8(in.val[2], v3f); - result.val[0] = vqtbl4q_u8(table, result.val[0]); - result.val[1] = vqtbl4q_u8(table, result.val[1]); - result.val[2] = vqtbl4q_u8(table, result.val[2]); - result.val[3] = vqtbl4q_u8(table, result.val[3]); - vst4q_u8(out, result); - out += 64; +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return icelake_convert_latin1_to_utf16(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return icelake_convert_latin1_to_utf16(buf, len, + utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + avx512_convert_latin1_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; } - out += scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, - options); + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} - return size_t((char *)out - dst); +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char 
*buf, size_t len, char *latin1_output) const noexcept { + return icelake::utf8_to_latin1_avx512(buf, len, latin1_output); } -static inline void compress(uint8x16_t data, uint16_t mask, char *output) { - if (mask == 0) { - vst1q_u8((uint8_t *)output, data); - return; +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + // First, try to convert as much as possible using the SIMD implementation. + const char *obuf = buf; + char *olatin1_output = latin1_output; + size_t written = icelake::utf8_to_latin1_avx512(obuf, len, olatin1_output); + + // If we have completely converted the string + if (obuf == buf + len) { + return {simdutf::SUCCESS, written}; } - uint8_t mask1 = uint8_t(mask); // least significant 8 bits - uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits - uint64x2_t compactmasku64 = {tables::base64::thintable_epi8[mask1], - tables::base64::thintable_epi8[mask2]}; - uint8x16_t compactmask = vreinterpretq_u8_u64(compactmasku64); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t off = - simdutf_make_uint8x16_t(0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8); -#else - const uint8x16_t off = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8}; -#endif + size_t pos = obuf - buf; + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, buf + pos, len - pos, latin1_output); + res.count += pos; + return res; +} - compactmask = vaddq_u8(compactmask, off); - uint8x16_t pruned = vqtbl1q_u8(data, compactmask); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + return icelake::valid_utf8_to_latin1_avx512(buf, len, latin1_output); +} - int pop1 = tables::base64::BitsSetTable256mul2[mask1]; - // then load the corresponding mask, what it does is to write - // only the first pop1 bytes from the first 8 bytes, and then - // it fills in with the bytes from the second 8 bytes + some filling - // at the end. 
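// Illustrative sketch (not part of the patch): what the table-driven compress()
// above computes -- keep only the bytes whose bit in `mask` is clear (bit i
// corresponds to byte i), packing them to the front of the output. The return
// value is the number of bytes kept, i.e. 16 minus the popcount of the mask.
// The helper name is made up for this example.
#include <cstddef>
#include <cstdint>

inline size_t compress_scalar(const uint8_t *data, uint16_t mask, uint8_t *output) {
  size_t written = 0;
  for (size_t i = 0; i < 16; i++) {
    if ((mask & (uint16_t(1) << i)) == 0) {
      output[written++] = data[i];  // bit clear: keep the byte
    }
  }
  return written;
}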
- compactmask = vld1q_u8(tables::base64::pshufb_combine_table + pop1 * 8); - uint8x16_t answer = vqtbl1q_u8(pruned, compactmask); - vst1q_u8((uint8_t *)output, answer); +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16_result ret = + fast_avx512_convert_utf8_to_utf16(buf, len, + utf16_output); + if (ret.second == nullptr) { + return 0; + } + return ret.second - utf16_output; } -struct block64 { - uint8x16_t chunks[4]; -}; +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16( + buf, len, utf16_output); + if (ret.second == nullptr) { + return 0; + } + return ret.second - utf16_output; +} -static_assert(sizeof(block64) == 64, "block64 is not 64 bytes"); -template uint64_t to_base64_mask(block64 *b, bool *error) { - uint8x16_t v0f = vdupq_n_u8(0xf); +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return fast_avx512_convert_utf8_to_utf16_with_errors( + buf, len, utf16_output); +} - uint8x16_t underscore0, underscore1, underscore2, underscore3; - if (base64_url) { - underscore0 = vceqq_u8(b->chunks[0], vdupq_n_u8(0x5f)); - underscore1 = vceqq_u8(b->chunks[1], vdupq_n_u8(0x5f)); - underscore2 = vceqq_u8(b->chunks[2], vdupq_n_u8(0x5f)); - underscore3 = vceqq_u8(b->chunks[3], vdupq_n_u8(0x5f)); - } else { - (void)underscore0; - (void)underscore1; - (void)underscore2; - (void)underscore3; +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + return fast_avx512_convert_utf8_to_utf16_with_errors( + buf, len, utf16_output); +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16_result ret = + icelake::valid_utf8_to_fixed_length( + buf, len, utf16_output); + size_t saved_bytes = ret.second - utf16_output; + const char *end = buf + len; + if (ret.first == end) { + return saved_bytes; } - uint8x16_t lo_nibbles0 = vandq_u8(b->chunks[0], v0f); - uint8x16_t lo_nibbles1 = vandq_u8(b->chunks[1], v0f); - uint8x16_t lo_nibbles2 = vandq_u8(b->chunks[2], v0f); - uint8x16_t lo_nibbles3 = vandq_u8(b->chunks[3], v0f); + // Note: AVX512 procedure looks up 4 bytes forward, and + // correctly converts multi-byte chars even if their + // continuation bytes lie outsiede 16-byte window. + // It meas, we have to skip continuation bytes from + // the beginning ret.first, as they were already consumed. + while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { + ret.first += 1; + } - // Needed by the decoding step. 
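// Illustrative sketch (not part of the patch): the to_base64_mask() routine
// that begins above both flags invalid characters and remaps valid ones to
// their 6-bit values by adding a per-class delta (the roll_lut step further
// down). The deltas below are plain ASCII arithmetic, not values read out of
// the tables, and the function name is made up; it covers the standard
// alphabet (the URL alphabet swaps '+'/'/' for '-'/'_').
#include <cstdint>

inline int base64_value_via_offset(uint8_t c) {
  if (c >= 'A' && c <= 'Z') return c - 65;   // 'A' (65) -> 0
  if (c >= 'a' && c <= 'z') return c - 71;   // 'a' (97) -> 26
  if (c >= '0' && c <= '9') return c + 4;    // '0' (48) -> 52
  if (c == '+') return c + 19;               // '+' (43) -> 62
  if (c == '/') return c + 16;               // '/' (47) -> 63
  return -1;                                 // not in the alphabet
}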
- uint8x16_t hi_nibbles0 = vshrq_n_u8(b->chunks[0], 4); - uint8x16_t hi_nibbles1 = vshrq_n_u8(b->chunks[1], 4); - uint8x16_t hi_nibbles2 = vshrq_n_u8(b->chunks[2], 4); - uint8x16_t hi_nibbles3 = vshrq_n_u8(b->chunks[3], 4); - uint8x16_t lut_lo; -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - if (base64_url) { - lut_lo = - simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4); - } else { - lut_lo = - simdutf_make_uint8x16_t(0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4); + if (ret.first != end) { + const size_t scalar_saved_bytes = + scalar::utf8_to_utf16::convert_valid( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; } -#else - if (base64_url) { - lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xf4, 0xe5, 0xa5, 0xf4, 0xf4}; - } else { - lut_lo = uint8x16_t{0x3a, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, - 0x70, 0x61, 0xe1, 0xb4, 0xe5, 0xe5, 0xf4, 0xb4}; + + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16_result ret = + icelake::valid_utf8_to_fixed_length( + buf, len, utf16_output); + size_t saved_bytes = ret.second - utf16_output; + const char *end = buf + len; + if (ret.first == end) { + return saved_bytes; } -#endif - uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0); - uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1); - uint8x16_t lo2 = vqtbl1q_u8(lut_lo, lo_nibbles2); - uint8x16_t lo3 = vqtbl1q_u8(lut_lo, lo_nibbles3); - uint8x16_t lut_hi; -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - if (base64_url) { - lut_hi = - simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20); - } else { - lut_hi = - simdutf_make_uint8x16_t(0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20); + + // Note: AVX512 procedure looks up 4 bytes forward, and + // correctly converts multi-byte chars even if their + // continuation bytes lie outsiede 16-byte window. + // It meas, we have to skip continuation bytes from + // the beginning ret.first, as they were already consumed. 
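// Illustrative sketch (not part of the patch): the resynchronisation step the
// note above describes, as a standalone helper. UTF-8 continuation bytes have
// the form 10xxxxxx, so (byte & 0xC0) == 0x80 identifies them; the while loop
// that follows in the patch performs the same skip in place. The helper name
// is made up for this example.
#include <cstdint>

inline const char *skip_utf8_continuation_bytes(const char *p, const char *end) {
  while (p != end && (uint8_t(*p) & 0xC0) == 0x80) {
    p++;  // already consumed by the SIMD kernel, so start the scalar tail after it
  }
  return p;
}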
+ while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { + ret.first += 1; } -#else - if (base64_url) { - lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20}; - } else { - lut_hi = uint8x16_t{0x11, 0x20, 0x42, 0x80, 0x8, 0x4, 0x8, 0x4, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20}; + + if (ret.first != end) { + const size_t scalar_saved_bytes = + scalar::utf8_to_utf16::convert_valid( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; } -#endif - uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0); - uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1); - uint8x16_t hi2 = vqtbl1q_u8(lut_hi, hi_nibbles2); - uint8x16_t hi3 = vqtbl1q_u8(lut_hi, hi_nibbles3); - if (base64_url) { - hi0 = vbicq_u8(hi0, underscore0); - hi1 = vbicq_u8(hi1, underscore1); - hi2 = vbicq_u8(hi2, underscore2); - hi3 = vbicq_u8(hi3, underscore3); + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_out) const noexcept { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + utf8_to_utf32_result ret = + icelake::validating_utf8_to_fixed_length( + buf, len, utf32_output); + if (ret.second == nullptr) + return 0; + + size_t saved_bytes = ret.second - utf32_output; + const char *end = buf + len; + if (ret.first == end) { + return saved_bytes; } - uint8_t checks = - vmaxvq_u8(vorrq_u8(vorrq_u8(vandq_u8(lo0, hi0), vandq_u8(lo1, hi1)), - vorrq_u8(vandq_u8(lo2, hi2), vandq_u8(lo3, hi3)))); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint8x16_t bit_mask = - simdutf_make_uint8x16_t(0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80); -#else - const uint8x16_t bit_mask = {0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, - 0x01, 0x02, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80}; -#endif - uint64_t badcharmask = 0; - *error = checks > 0x3; - if (checks) { - // Add each of the elements next to each other, successively, to stuff each - // 8 byte mask into one. - uint8x16_t test0 = vtstq_u8(lo0, hi0); - uint8x16_t test1 = vtstq_u8(lo1, hi1); - uint8x16_t test2 = vtstq_u8(lo2, hi2); - uint8x16_t test3 = vtstq_u8(lo3, hi3); - uint8x16_t sum0 = - vpaddq_u8(vandq_u8(test0, bit_mask), vandq_u8(test1, bit_mask)); - uint8x16_t sum1 = - vpaddq_u8(vandq_u8(test2, bit_mask), vandq_u8(test3, bit_mask)); - sum0 = vpaddq_u8(sum0, sum1); - sum0 = vpaddq_u8(sum0, sum0); - badcharmask = vgetq_lane_u64(vreinterpretq_u64_u8(sum0), 0); + // Note: the AVX512 procedure looks up 4 bytes forward, and + // correctly converts multi-byte chars even if their + // continuation bytes lie outside 16-byte window. + // It means, we have to skip continuation bytes from + // the beginning ret.first, as they were already consumed. 
+ while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { + ret.first += 1; } - // This is the transformation step that can be done while we are waiting for - // sum0 - uint8x16_t roll_lut; -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - if (base64_url) { - roll_lut = - simdutf_make_uint8x16_t(0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0); - } else { - roll_lut = - simdutf_make_uint8x16_t(0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0); + if (ret.first != end) { + const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert( + ret.first, len - (ret.first - buf), utf32_out + saved_bytes); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; } -#else - if (base64_url) { - roll_lut = uint8x16_t{0xe0, 0x11, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; - } else { - roll_lut = uint8x16_t{0x0, 0x10, 0x13, 0x4, 0xbf, 0xbf, 0xb9, 0xb9, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; + + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32) const noexcept { + if (simdutf_unlikely(len == 0)) { + return {error_code::SUCCESS, 0}; + } + uint32_t *utf32_output = reinterpret_cast(utf32); + auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks< + endianness::LITTLE, uint32_t>(buf, len, utf32_output); + + if (!std::get<2>(ret)) { + size_t pos = std::get<0>(ret) - buf; + // We might have an error that occurs right before pos. + // This is only a concern if buf[pos] is not a continuation byte. + if ((buf[pos] & 0xc0) != 0x80 && pos >= 64) { + pos -= 1; + } else if ((buf[pos] & 0xc0) == 0x80 && pos >= 64) { + // We must check whether we are the fourth continuation byte + bool c1 = (buf[pos - 1] & 0xc0) == 0x80; + bool c2 = (buf[pos - 2] & 0xc0) == 0x80; + bool c3 = (buf[pos - 3] & 0xc0) == 0x80; + if (c1 && c2 && c3) { + return {simdutf::TOO_LONG, pos}; + } + } + // todo: we reset the output to utf32 instead of using std::get<2.(ret) as + // you'd expect. that is because + // validating_utf8_to_fixed_length_with_constant_checks may have processed + // data beyond the error. + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, buf + pos, len - pos, utf32); + res.count += pos; + return res; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + const char *end = buf + len; + if (std::get<0>(ret) == end) { + return {simdutf::SUCCESS, saved_bytes}; + } + + // Note: the AVX512 procedure looks up 4 bytes forward, and + // correctly converts multi-byte chars even if their + // continuation bytes lie outside 16-byte window. + // It means, we have to skip continuation bytes from + // the beginning ret.first, as they were already consumed. 
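// Illustrative sketch (not part of the patch): the rule behind the TOO_LONG
// early return above -- a UTF-8 lead byte is followed by at most three
// continuation bytes, so a continuation byte preceded by three further
// continuation bytes cannot belong to any valid sequence. The helper names are
// made up for this example.
#include <cstddef>
#include <cstdint>

inline bool is_continuation(uint8_t b) { return (b & 0xC0) == 0x80; }

// `pos` must be at least 3 so that the three preceding bytes exist.
inline bool fourth_continuation_in_a_row(const char *buf, size_t pos) {
  return is_continuation(uint8_t(buf[pos])) &&
         is_continuation(uint8_t(buf[pos - 1])) &&
         is_continuation(uint8_t(buf[pos - 2])) &&
         is_continuation(uint8_t(buf[pos - 3]));
}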
+ while (std::get<0>(ret) != end and + ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) { + std::get<0>(ret) += 1; + } + + if (std::get<0>(ret) != end) { + auto scalar_result = scalar::utf8_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), + reinterpret_cast(utf32_output) + saved_bytes); + if (scalar_result.error != simdutf::SUCCESS) { + scalar_result.count += (std::get<0>(ret) - buf); + } else { + scalar_result.count += saved_bytes; + } + return scalar_result; + } + + return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)}; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_out) const noexcept { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + utf8_to_utf32_result ret = + icelake::valid_utf8_to_fixed_length( + buf, len, utf32_output); + size_t saved_bytes = ret.second - utf32_output; + const char *end = buf + len; + if (ret.first == end) { + return saved_bytes; + } + + // Note: AVX512 procedure looks up 4 bytes forward, and + // correctly converts multi-byte chars even if their + // continuation bytes lie outsiede 16-byte window. + // It meas, we have to skip continuation bytes from + // the beginning ret.first, as they were already consumed. + while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { + ret.first += 1; } -#endif - uint8x16_t vsecond_last = base64_url ? vdupq_n_u8(0x2d) : vdupq_n_u8(0x2f); - if (base64_url) { - hi_nibbles0 = vbicq_u8(hi_nibbles0, underscore0); - hi_nibbles1 = vbicq_u8(hi_nibbles1, underscore1); - hi_nibbles2 = vbicq_u8(hi_nibbles2, underscore2); - hi_nibbles3 = vbicq_u8(hi_nibbles3, underscore3); + + if (ret.first != end) { + const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid( + ret.first, len - (ret.first - buf), utf32_out + saved_bytes); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; } - uint8x16_t roll0 = vqtbl1q_u8( - roll_lut, vaddq_u8(vceqq_u8(b->chunks[0], vsecond_last), hi_nibbles0)); - uint8x16_t roll1 = vqtbl1q_u8( - roll_lut, vaddq_u8(vceqq_u8(b->chunks[1], vsecond_last), hi_nibbles1)); - uint8x16_t roll2 = vqtbl1q_u8( - roll_lut, vaddq_u8(vceqq_u8(b->chunks[2], vsecond_last), hi_nibbles2)); - uint8x16_t roll3 = vqtbl1q_u8( - roll_lut, vaddq_u8(vceqq_u8(b->chunks[3], vsecond_last), hi_nibbles3)); - b->chunks[0] = vaddq_u8(b->chunks[0], roll0); - b->chunks[1] = vaddq_u8(b->chunks[1], roll1); - b->chunks[2] = vaddq_u8(b->chunks[2], roll2); - b->chunks[3] = vaddq_u8(b->chunks[3], roll3); - return badcharmask; + + return saved_bytes; } -void copy_block(block64 *b, char *output) { - vst1q_u8((uint8_t *)output, b->chunks[0]); - vst1q_u8((uint8_t *)output + 16, b->chunks[1]); - vst1q_u8((uint8_t *)output + 32, b->chunks[2]); - vst1q_u8((uint8_t *)output + 48, b->chunks[3]); +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1(buf, len, + latin1_output); } -uint64_t compress_block(block64 *b, uint64_t mask, char *output) { - uint64_t popcounts = - vget_lane_u64(vreinterpret_u64_u8(vcnt_u8(vcreate_u8(~mask))), 0); - uint64_t offsets = popcounts * 0x0101010101010101; - compress(b->chunks[0], uint16_t(mask), output); - compress(b->chunks[1], uint16_t(mask >> 16), &output[(offsets >> 8) & 0xFF]); - compress(b->chunks[2], uint16_t(mask >> 32), &output[(offsets >> 24) & 0xFF]); - compress(b->chunks[3], uint16_t(mask >> 48), 
&output[(offsets >> 40) & 0xFF]); - return offsets >> 56; +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1(buf, len, + latin1_output); } -// The caller of this function is responsible to ensure that there are 64 bytes -// available from reading at src. The data is read into a block64 structure. -void load_block(block64 *b, const char *src) { - b->chunks[0] = vld1q_u8(reinterpret_cast(src)); - b->chunks[1] = vld1q_u8(reinterpret_cast(src) + 16); - b->chunks[2] = vld1q_u8(reinterpret_cast(src) + 32); - b->chunks[3] = vld1q_u8(reinterpret_cast(src) + 48); +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output) + .first; } -// The caller of this function is responsible to ensure that there are 32 bytes -// available from reading at data. It returns a 16-byte value, narrowing with -// saturation the 16-bit words. -inline uint8x16_t load_satured(const uint16_t *data) { - uint16x8_t in1 = vld1q_u16(data); - uint16x8_t in2 = vld1q_u16(data + 8); - return vqmovn_high_u16(vqmovn_u16(in1), in2); +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output) + .first; } -// The caller of this function is responsible to ensure that there are 128 bytes -// available from reading at src. The data is read into a block64 structure. -void load_block(block64 *b, const char16_t *src) { - b->chunks[0] = load_satured(reinterpret_cast(src)); - b->chunks[1] = load_satured(reinterpret_cast(src) + 16); - b->chunks[2] = load_satured(reinterpret_cast(src) + 32); - b->chunks[3] = load_satured(reinterpret_cast(src) + 48); +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement custom function + return convert_utf16be_to_latin1(buf, len, latin1_output); } -// decode 64 bytes and output 48 bytes -void base64_decode_block(char *out, const char *src) { - uint8x16x4_t str = vld4q_u8((uint8_t *)src); - uint8x16x3_t outvec; - outvec.val[0] = - vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4)); - outvec.val[1] = - vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2)); - outvec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]); - vst3q_u8((uint8_t *)out, outvec); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement custom function + return convert_utf16le_to_latin1(buf, len, latin1_output); } -template -full_result -compress_decode_base64(char *dst, const char_type *src, size_t srclen, - base64_options options, - last_chunk_handling_options last_chunk_options) { - const uint8_t *to_base64 = base64_url ? 
tables::base64::to_base64_url_value - : tables::base64::to_base64_value; - size_t equallocation = - srclen; // location of the first padding character if any - // skip trailing spaces - while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && - to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; - } - size_t equalsigns = 0; - if (srclen > 0 && src[srclen - 1] == '=') { - equallocation = srclen - 1; - srclen--; - equalsigns = 1; - // skip trailing spaces - while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && - to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; - } - if (srclen > 0 && src[srclen - 1] == '=') { - equallocation = srclen - 1; - srclen--; - equalsigns = 2; - } - } - if (srclen == 0) { - if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation, 0}; - } - return {SUCCESS, 0, 0}; +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i( + buf, len, (unsigned char *)utf8_output, &outlen); + if (inlen != len) { + return 0; } - const char_type *const srcinit = src; - const char *const dstinit = dst; - const char_type *const srcend = src + srclen; - - constexpr size_t block_size = 10; - char buffer[block_size * 64]; - char *bufferptr = buffer; - if (srclen >= 64) { - const char_type *const srcend64 = src + srclen - 64; - while (src <= srcend64) { - block64 b; - load_block(&b, src); - src += 64; - bool error = false; - uint64_t badcharmask = to_base64_mask(&b, &error); - if (badcharmask) { - if (error) { - src -= 64; - while (src < srcend && scalar::base64::is_eight_byte(*src) && - to_base64[uint8_t(*src)] <= 64) { - src++; - } - if (src < srcend) { - // should never happen - } - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), - size_t(dst - dstinit)}; - } - } + return outlen; +} - if (badcharmask != 0) { - // optimization opportunity: check for simple masks like those made of - // continuous 1s followed by continuous 0s. And masks containing a - // single bad character. - bufferptr += compress_block(&b, badcharmask, bufferptr); - } else { - // optimization opportunity: if bufferptr == buffer and mask == 0, we - // can avoid the call to compress_block and decode directly. - copy_block(&b, bufferptr); - bufferptr += 64; - } - if (bufferptr >= (block_size - 1) * 64 + buffer) { - for (size_t i = 0; i < (block_size - 1); i++) { - base64_decode_block(dst, buffer + i * 64); - dst += 48; - } - std::memcpy(buffer, buffer + (block_size - 1) * 64, - 64); // 64 might be too much - bufferptr -= (block_size - 1) * 64; - } - } +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i( + buf, len, (unsigned char *)utf8_output, &outlen); + if (inlen != len) { + return 0; } - char *buffer_start = buffer; - // Optimization note: if this is almost full, then it is worth our - // time, otherwise, we should just decode directly. 
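// Illustrative sketch (not part of the patch): the prologue of the removed
// compress_decode_base64 above -- strip trailing whitespace, then record and
// strip up to two '=' padding characters, remembering where the first '=' sat
// so errors can point at it. `is_base64_space` stands in for the
// to_base64[...] == 64 test used in the patch, and the other names are made up
// for this example.
#include <cstddef>

inline bool is_base64_space(char c) {
  return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f';
}

struct trimmed_base64 {
  size_t length;         // length after trimming spaces and padding
  size_t equalsigns;     // 0, 1 or 2 '=' characters removed
  size_t equallocation;  // index of the first '=' (== original length if none)
};

inline trimmed_base64 trim_base64(const char *src, size_t srclen) {
  trimmed_base64 t{srclen, 0, srclen};
  while (t.length > 0 && is_base64_space(src[t.length - 1])) t.length--;
  for (int i = 0; i < 2 && t.length > 0 && src[t.length - 1] == '='; i++) {
    t.equallocation = t.length - 1;
    t.length--;
    t.equalsigns++;
    while (t.length > 0 && is_base64_space(src[t.length - 1])) t.length--;
  }
  return t;
}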
- int last_block = (int)((bufferptr - buffer_start) % 64); - if (last_block != 0 && srcend - src + last_block >= 64) { - while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { - uint8_t val = to_base64[uint8_t(*src)]; - *bufferptr = char(val); - if (!scalar::base64::is_eight_byte(*src) || val > 64) { - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), - size_t(dst - dstinit)}; - } - bufferptr += (val <= 63); - src++; - } + return outlen; +} + +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i( + buf, len, (unsigned char *)utf8_output, &outlen); + if (inlen != len) { + result res = scalar::utf16_to_utf8::convert_with_errors( + buf + inlen, len - inlen, utf8_output + outlen); + res.count += inlen; + return res; } + return {simdutf::SUCCESS, outlen}; +} - for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { - base64_decode_block(dst, buffer_start); - dst += 48; +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + size_t outlen; + size_t inlen = utf16_to_utf8_avx512i( + buf, len, (unsigned char *)utf8_output, &outlen); + if (inlen != len) { + result res = scalar::utf16_to_utf8::convert_with_errors( + buf + inlen, len - inlen, utf8_output + outlen); + res.count += inlen; + return res; } - if ((bufferptr - buffer_start) % 64 != 0) { - while (buffer_start + 4 < bufferptr) { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 4); + return {simdutf::SUCCESS, outlen}; +} - dst += 3; - buffer_start += 4; - } - if (buffer_start + 4 <= bufferptr) { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 3); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} - dst += 3; - buffer_start += 4; - } - // we may have 1, 2 or 3 bytes left and we need to decode them so let us - // backtrack - int leftover = int(bufferptr - buffer_start); - while (leftover > 0) { - while (to_base64[uint8_t(*(src - 1))] == 64) { - src--; - } - src--; - leftover--; - } - } - if (src < srcend + equalsigns) { - full_result r = scalar::base64::base64_tail_decode( - dst, src, srcend - src, equalsigns, options, last_chunk_options); - r.input_count += size_t(src - srcinit); - if (r.error == error_code::INVALID_BASE64_CHARACTER || - r.error == error_code::BASE64_EXTRA_BITS) { - return r; - } else { - r.output_count += size_t(dst - dstinit); - } - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.output_count % 3 == 0) || - ((r.output_count % 3) + 1 + equalsigns != 4)) { - r.error = error_code::INVALID_BASE64_CHARACTER; - r.input_count = equallocation; - } - } - return r; - } - if (equalsigns > 0) { - if ((size_t(dst - dstinit) % 3 
== 0) || - ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; - } - } - return {SUCCESS, srclen, size_t(dst - dstinit)}; +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); } -/* end file src/arm64/arm_base64.cpp */ -/* begin file src/arm64/arm_convert_utf32_to_latin1.cpp */ -std::pair -arm_convert_utf32_to_latin1(const char32_t *buf, size_t len, - char *latin1_output) { - const char32_t *end = buf + len; - while (end - buf >= 8) { - uint32x4_t in1 = vld1q_u32(reinterpret_cast(buf)); - uint32x4_t in2 = vld1q_u32(reinterpret_cast(buf + 4)); - uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2)); - if (vmaxvq_u16(utf16_packed) <= 0xff) { - // 1. pack the bytes - uint8x8_t latin1_packed = vmovn_u16(utf16_packed); - // 2. store (8 bytes) - vst1_u8(reinterpret_cast(latin1_output), latin1_packed); - // 3. adjust pointers - buf += 8; - latin1_output += 8; - } else { - return std::make_pair(nullptr, reinterpret_cast(latin1_output)); - } - } // while - return std::make_pair(buf, latin1_output); +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf32_to_latin1(buf, len, latin1_output); } -std::pair -arm_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, - char *latin1_output) { - const char32_t *start = buf; - const char32_t *end = buf + len; +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf32_to_latin1_with_errors(buf, len, latin1_output) + .first; +} - while (end - buf >= 8) { - uint32x4_t in1 = vld1q_u32(reinterpret_cast(buf)); - uint32x4_t in2 = vld1q_u32(reinterpret_cast(buf + 4)); +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return icelake_convert_utf32_to_latin1(buf, len, latin1_output); +} - uint16x8_t utf16_packed = vcombine_u16(vqmovn_u32(in1), vqmovn_u32(in2)); +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + avx512_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - if (vmaxvq_u16(utf16_packed) <= 0xff) { - // 1. pack the bytes - uint8x8_t latin1_packed = vmovn_u16(utf16_packed); - // 2. store (8 bytes) - vst1_u8(reinterpret_cast(latin1_output), latin1_packed); - // 3. 
adjust pointers - buf += 8; - latin1_output += 8; +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; } else { - // Let us do a scalar fallback. - for (int k = 0; k < 8; k++) { - uint32_t word = buf[k]; - if (word <= 0xff) { - *latin1_output++ = char(word); - } else { - return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), - latin1_output); - } - } + ret.second += scalar_res.count; } - } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), - latin1_output); + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; } -/* end file src/arm64/arm_convert_utf32_to_latin1.cpp */ -/* begin file src/arm64/arm_convert_utf32_to_utf16.cpp */ -template -std::pair -arm_convert_utf32_to_utf16(const char32_t *buf, size_t len, - char16_t *utf16_out) { - uint16_t *utf16_output = reinterpret_cast(utf16_out); - const char32_t *end = buf + len; - uint16x4_t forbidden_bytemask = vmov_n_u16(0x0); - - while (end - buf >= 4) { - uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); - - // Check if no bits set above 16th - if (vmaxvq_u32(in) <= 0xFFFF) { - uint16x4_t utf16_packed = vmovn_u32(in); - - const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); - const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); - forbidden_bytemask = vorr_u16(vand_u16(vcle_u16(utf16_packed, v_dfff), - vcge_u16(utf16_packed, v_d800)), - forbidden_bytemask); +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf32_to_utf8(buf, len, utf8_output); +} - if (!match_system(big_endian)) { - utf16_packed = - vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(utf16_packed))); - } - vst1_u16(utf16_output, utf16_packed); - utf16_output += 4; - buf += 4; - } else { - size_t forward = 3; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint32_t word = buf[k]; - if ((word & 0xFFFF0000) == 0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair(nullptr, - reinterpret_cast(utf16_output)); - } - *utf16_output++ = !match_system(big_endian) - ? 
char16_t(word >> 8 | word << 8) - : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { - return std::make_pair(nullptr, - reinterpret_cast(utf16_output)); - } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = - uint16_t(high_surrogate >> 8 | high_surrogate << 8); - low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + avx512_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + avx512_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } + saved_bytes += scalar_saved_bytes; } + return saved_bytes; +} - // check for invalid input - if (vmaxv_u16(forbidden_bytemask) != 0) { - return std::make_pair(nullptr, reinterpret_cast(utf16_output)); +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + avx512_convert_utf32_to_utf16_with_errors( + buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} - return std::make_pair(buf, reinterpret_cast(utf16_output)); +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + avx512_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code 
units written + return ret.first; } -template -std::pair -arm_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, - char16_t *utf16_out) { - uint16_t *utf16_output = reinterpret_cast(utf16_out); - const char32_t *start = buf; - const char32_t *end = buf + len; +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} - while (end - buf >= 4) { - uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} - // Check if no bits set above 16th - if (vmaxvq_u32(in) <= 0xFFFF) { - uint16x4_t utf16_packed = vmovn_u32(in); +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::tuple ret = + icelake::convert_utf16_to_utf32(buf, len, + utf32_output); + if (!std::get<2>(ret)) { + return 0; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - const uint16x4_t v_d800 = vmov_n_u16((uint16_t)0xd800); - const uint16x4_t v_dfff = vmov_n_u16((uint16_t)0xdfff); - const uint16x4_t forbidden_bytemask = vand_u16( - vcle_u16(utf16_packed, v_dfff), vcge_u16(utf16_packed, v_d800)); - if (vmaxv_u16(forbidden_bytemask) != 0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), - reinterpret_cast(utf16_output)); - } +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::tuple ret = + icelake::convert_utf16_to_utf32(buf, len, utf32_output); + if (!std::get<2>(ret)) { + return 0; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - if (!match_system(big_endian)) { - utf16_packed = - vreinterpret_u16_u8(vrev16_u8(vreinterpret_u8_u16(utf16_packed))); - } - vst1_u16(utf16_output, utf16_packed); - utf16_output += 4; - buf += 4; +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::tuple ret = + icelake::convert_utf16_to_utf32(buf, len, + utf32_output); + if (!std::get<2>(ret)) { + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + scalar_res.count += (std::get<0>(ret) - buf); + return scalar_res; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_res.error) { + scalar_res.count += (std::get<0>(ret) - buf); + return scalar_res; } else { - size_t forward 
= 3; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint32_t word = buf[k]; - if ((word & 0xFFFF0000) == 0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair( - result(error_code::SURROGATE, buf - start + k), - reinterpret_cast(utf16_output)); - } - *utf16_output++ = !match_system(big_endian) - ? char16_t(word >> 8 | word << 8) - : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { - return std::make_pair( - result(error_code::TOO_LARGE, buf - start + k), - reinterpret_cast(utf16_output)); - } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (!match_system(big_endian)) { - high_surrogate = - uint16_t(high_surrogate >> 8 | high_surrogate << 8); - low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; + scalar_res.count += saved_bytes; + return scalar_res; } } - - return std::make_pair(result(error_code::SUCCESS, buf - start), - reinterpret_cast(utf16_output)); + return simdutf::result(simdutf::SUCCESS, saved_bytes); } -/* end file src/arm64/arm_convert_utf32_to_utf16.cpp */ -/* begin file src/arm64/arm_convert_utf32_to_utf8.cpp */ -std::pair -arm_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { - uint8_t *utf8_output = reinterpret_cast(utf8_out); - const char32_t *end = buf + len; - - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); - uint16x8_t forbidden_bytemask = vmovq_n_u16(0x0); - const size_t safety_margin = - 12; // to avoid overruns, see issue - // https://github.com/simdutf/simdutf/issues/92 +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::tuple ret = + icelake::convert_utf16_to_utf32(buf, len, utf32_output); + if (!std::get<2>(ret)) { + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + scalar_res.count += (std::get<0>(ret) - buf); + return scalar_res; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_res.error) { + scalar_res.count += (std::get<0>(ret) - buf); + return scalar_res; + } else { + scalar_res.count += saved_bytes; + return scalar_res; + } + } + return simdutf::result(simdutf::SUCCESS, saved_bytes); +} - while (buf + 16 + safety_margin < end) { - uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); - uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf + 4)); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::tuple ret = + icelake::convert_utf16_to_utf32(buf, len, + utf32_output); + if (!std::get<2>(ret)) { + return 0; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + 
return saved_bytes; +} - // Check if no bits set above 16th - if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { - // Pack UTF-32 to UTF-16 safely (without surrogate pairs) - // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) - uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); - if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! - // 1. pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(utf16_packed); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - continue; // we are done for this round! - } +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::tuple ret = + icelake::convert_utf16_to_utf32(buf, len, utf32_output); + if (!std::get<2>(ret)) { + return 0; + } + size_t saved_bytes = std::get<1>(ret) - utf32_output; + if (std::get<0>(ret) != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - if (vmaxvq_u16(utf16_packed) <= 0x7FF) { - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + size_t pos = 0; + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + while (pos + 32 <= length) { + __m512i utf16 = _mm512_loadu_si512((const __m512i *)(input + pos)); + utf16 = _mm512_shuffle_epi8(utf16, byteflip); + _mm512_storeu_si512(output + pos, utf16); + pos += 32; + } + if (pos < length) { + __mmask32 m((1U << (length - pos)) - 1); + __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i *)(input + pos)); + utf16 = _mm512_shuffle_epi8(utf16, byteflip); + _mm512_mask_storeu_epi16(output + pos, m, utf16); + } +} - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); - const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16( - vbslq_u16(one_byte_bytemask, utf16_packed, t4)); - // 3. prepare bitmask for 8-bit lookup -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = simdutf_make_uint16x8_t( - 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); -#else - const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, - 0x0002, 0x0008, 0x0020, 0x0080}; -#endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. 
pack the bytes - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { + const char16_t *ptr = input; + size_t count{0}; - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); + if (length >= 32) { + const char16_t *end = input + length - 32; - // 6. adjust pointers - buf += 8; - utf8_output += row[0]; - continue; - } else { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); - forbidden_bytemask = - vorrq_u16(vandq_u16(vcleq_u16(utf16_packed, v_dfff), - vcgeq_u16(utf16_packed, v_d800)), - forbidden_bytemask); + const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00); + const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = simdutf_make_uint16x8_t( - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); -#else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; -#endif - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - - three UTF-8 bytes + while (ptr <= end) { + __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr); + ptr += 32; + uint64_t not_high_surrogate = + static_cast(_mm512_cmpgt_epu16_mask(utf16, high) | + _mm512_cmplt_epu16_mask(utf16, low)); + count += count_ones(not_high_surrogate); + } + } - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + return count + scalar::utf16::count_code_points( + ptr, length - (ptr - input)); +} - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { + const char16_t *ptr = input; + size_t count{0}; + if (length >= 32) { - We precompute byte 1 for case #3 and -- **conditionally** -- - precompute either byte 1 for case #2 or byte 2 for case #3. Note that - they differ by exactly one bit. + const char16_t *end = input + length - 32; - Finally from these two code units we build proper UTF-8 sequence, - taking into account the case (i.e, the number of bytes to write). 
- */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = - vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), - vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000)); + const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00); + const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff); - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = - vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); - const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = - vcleq_u16(utf16_packed, v_07ff); - const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), - one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); -#undef simdutf_vec + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + while (ptr <= end) { + __m512i utf16 = + _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)ptr), byteflip); + ptr += 32; + uint64_t not_high_surrogate = + static_cast(_mm512_cmpgt_epu16_mask(utf16, high) | + _mm512_cmplt_epu16_mask(utf16, low)); + count += count_ones(not_high_surrogate); + } + } - // 4. expand code units 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + return count + scalar::utf16::count_code_points( + ptr, length - (ptr - input)); +} - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = simdutf_make_uint16x8_t( - 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000); - const uint16x8_t twomask = simdutf_make_uint16x8_t( - 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000); -#else - const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040, - 0x0100, 0x0400, 0x1000, 0x4000}; - const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080, - 0x0200, 0x0800, 0x2000, 0x8000}; -#endif - const uint16x8_t combined = - vorrq_u16(vandq_u16(one_byte_bytemask, onemask), - vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. 
- const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t *row0 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + const uint8_t *str = reinterpret_cast(input); + size_t answer = + length / sizeof(__m512i) * + sizeof(__m512i); // Number of 512-bit chunks that fits into the length. + size_t i = 0; + __m512i unrolled_popcount{0}; - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t *row1 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); + const __m512i continuation = _mm512_set1_epi8(char(0b10111111)); - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; + while (i + sizeof(__m512i) <= length) { + size_t iterations = (length - i) / sizeof(__m512i); - buf += 8; - } - // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> - // will produce four UTF-8 bytes. - } else { - // Let us do a scalar fallback. - // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint32_t word = buf[k]; - if ((word & 0xFFFFFF80) == 0) { - *utf8_output++ = char(word); - } else if ((word & 0xFFFFF800) == 0) { - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if ((word & 0xFFFF0000) == 0) { - if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair(nullptr, - reinterpret_cast(utf8_output)); - } - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - if (word > 0x10FFFF) { - return std::make_pair(nullptr, - reinterpret_cast(utf8_output)); - } - *utf8_output++ = char((word >> 18) | 0b11110000); - *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } - } - buf += k; + size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); + for (; i + 8 * sizeof(__m512i) <= max_i; i += 8 * sizeof(__m512i)) { + __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); + __m512i input2 = + _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); + __m512i input3 = + _mm512_loadu_si512((const __m512i *)(str + i + 2 * sizeof(__m512i))); + __m512i input4 = + _mm512_loadu_si512((const __m512i *)(str + i + 3 * sizeof(__m512i))); + __m512i input5 = + _mm512_loadu_si512((const __m512i *)(str + i + 4 * sizeof(__m512i))); + __m512i input6 = + _mm512_loadu_si512((const __m512i *)(str + i + 5 * sizeof(__m512i))); + __m512i input7 = + _mm512_loadu_si512((const __m512i *)(str + i + 6 * sizeof(__m512i))); + 
__m512i input8 = + _mm512_loadu_si512((const __m512i *)(str + i + 7 * sizeof(__m512i))); + + __mmask64 mask1 = _mm512_cmple_epi8_mask(input1, continuation); + __mmask64 mask2 = _mm512_cmple_epi8_mask(input2, continuation); + __mmask64 mask3 = _mm512_cmple_epi8_mask(input3, continuation); + __mmask64 mask4 = _mm512_cmple_epi8_mask(input4, continuation); + __mmask64 mask5 = _mm512_cmple_epi8_mask(input5, continuation); + __mmask64 mask6 = _mm512_cmple_epi8_mask(input6, continuation); + __mmask64 mask7 = _mm512_cmple_epi8_mask(input7, continuation); + __mmask64 mask8 = _mm512_cmple_epi8_mask(input8, continuation); + + __m512i mask_register = _mm512_set_epi64(mask8, mask7, mask6, mask5, + mask4, mask3, mask2, mask1); + + unrolled_popcount = _mm512_add_epi64(unrolled_popcount, + _mm512_popcnt_epi64(mask_register)); } - } // while - // check for invalid input - if (vmaxvq_u16(forbidden_bytemask) != 0) { - return std::make_pair(nullptr, reinterpret_cast(utf8_output)); + for (; i <= max_i; i += sizeof(__m512i)) { + __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); + uint64_t continuation_bitmask = static_cast( + _mm512_cmple_epi8_mask(more_input, continuation)); + answer -= count_ones(continuation_bitmask); + } } - return std::make_pair(buf, reinterpret_cast(utf8_output)); -} -std::pair -arm_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, - char *utf8_out) { - uint8_t *utf8_output = reinterpret_cast(utf8_out); - const char32_t *start = buf; - const char32_t *end = buf + len; + __m256i first_half = _mm512_extracti64x4_epi64(unrolled_popcount, 0); + __m256i second_half = _mm512_extracti64x4_epi64(unrolled_popcount, 1); + answer -= (size_t)_mm256_extract_epi64(first_half, 0) + + (size_t)_mm256_extract_epi64(first_half, 1) + + (size_t)_mm256_extract_epi64(first_half, 2) + + (size_t)_mm256_extract_epi64(first_half, 3) + + (size_t)_mm256_extract_epi64(second_half, 0) + + (size_t)_mm256_extract_epi64(second_half, 1) + + (size_t)_mm256_extract_epi64(second_half, 2) + + (size_t)_mm256_extract_epi64(second_half, 3); - const uint16x8_t v_c080 = vmovq_n_u16((uint16_t)0xc080); - const size_t safety_margin = - 12; // to avoid overruns, see issue - // https://github.com/simdutf/simdutf/issues/92 + return answer + scalar::utf8::count_code_points( + reinterpret_cast(str + i), length - i); +} - while (buf + 16 + safety_margin < end) { - uint32x4_t in = vld1q_u32(reinterpret_cast(buf)); - uint32x4_t nextin = vld1q_u32(reinterpret_cast(buf + 4)); +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return count_utf8(buf, len); +} - // Check if no bits set above 16th - if (vmaxvq_u32(vorrq_u32(in, nextin)) <= 0xFFFF) { - // Pack UTF-32 to UTF-16 safely (without surrogate pairs) - // Apply UTF-16 => UTF-8 routine (arm_convert_utf16_to_utf8.cpp) - uint16x8_t utf16_packed = vcombine_u16(vmovn_u32(in), vmovn_u32(nextin)); - if (vmaxvq_u16(utf16_packed) <= 0x7F) { // ASCII fast path!!!! - // 1. pack the bytes - // obviously suboptimal. - uint8x8_t utf8_packed = vmovn_u16(utf16_packed); - // 2. store (8 bytes) - vst1_u8(utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - continue; // we are done for this round! - } +simdutf_warn_unused size_t +implementation::latin1_length_from_utf16(size_t length) const noexcept { + return scalar::utf16::latin1_length_from_utf16(length); +} - if (vmaxvq_u16(utf16_packed) <= 0x7FF) { - // 1. 
prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const uint16x8_t v_1f00 = vmovq_n_u16((int16_t)0x1f00); - const uint16x8_t v_003f = vmovq_n_u16((int16_t)0x003f); +simdutf_warn_unused size_t +implementation::latin1_length_from_utf32(size_t length) const noexcept { + return scalar::utf32::latin1_length_from_utf32(length); +} - // t0 = [000a|aaaa|bbbb|bb00] - const uint16x8_t t0 = vshlq_n_u16(utf16_packed, 2); - // t1 = [000a|aaaa|0000|0000] - const uint16x8_t t1 = vandq_u16(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const uint16x8_t t2 = vandq_u16(utf16_packed, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const uint16x8_t t3 = vorrq_u16(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const uint16x8_t t4 = vorrq_u16(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); - const uint8x16_t utf8_unpacked = vreinterpretq_u8_u16( - vbslq_u16(one_byte_bytemask, utf16_packed, t4)); - // 3. prepare bitmask for 8-bit lookup -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t mask = simdutf_make_uint16x8_t( - 0x0001, 0x0004, 0x0010, 0x0040, 0x0002, 0x0008, 0x0020, 0x0080); -#else - const uint16x8_t mask = {0x0001, 0x0004, 0x0010, 0x0040, - 0x0002, 0x0008, 0x0020, 0x0080}; -#endif - uint16_t m2 = vaddvq_u16(vandq_u16(one_byte_bytemask, mask)); - // 4. pack the bytes - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const uint8x16_t shuffle = vld1q_u8(row + 1); - const uint8x16_t utf8_packed = vqtbl1q_u8(utf8_unpacked, shuffle); +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + const char16_t *ptr = input; + size_t count{0}; + if (length >= 32) { + const char16_t *end = input + length - 32; - // 5. store bytes - vst1q_u8(utf8_output, utf8_packed); + const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); + const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); + const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); + const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); - // 6. 
adjust pointers - buf += 8; - utf8_output += row[0]; - continue; - } else { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + while (ptr <= end) { + __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr); + ptr += 32; + __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); + __mmask32 two_bytes_bitmask = + _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); + __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); + __mmask32 surrogates_bitmask = + _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & + _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); - // check for invalid input - const uint16x8_t v_d800 = vmovq_n_u16((uint16_t)0xd800); - const uint16x8_t v_dfff = vmovq_n_u16((uint16_t)0xdfff); - const uint16x8_t forbidden_bytemask = vandq_u16( - vcleq_u16(utf16_packed, v_dfff), vcgeq_u16(utf16_packed, v_d800)); - if (vmaxvq_u16(forbidden_bytemask) != 0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), - reinterpret_cast(utf8_output)); - } + size_t ascii_count = count_ones(ascii_bitmask); + size_t two_bytes_count = count_ones(two_bytes_bitmask); + size_t surrogate_bytes_count = count_ones(surrogates_bitmask); + size_t three_bytes_count = + 32 - ascii_count - two_bytes_count - surrogate_bytes_count; -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t dup_even = simdutf_make_uint16x8_t( - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); -#else - const uint16x8_t dup_even = {0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e}; -#endif - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - - three UTF-8 bytes + count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + + 2 * surrogate_bytes_count; + } + } - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + return count + scalar::utf16::utf8_length_from_utf16( + ptr, length - (ptr - input)); +} - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + const char16_t *ptr = input; + size_t count{0}; - We precompute byte 1 for case #3 and -- **conditionally** -- - precompute either byte 1 for case #2 or byte 2 for case #3. Note that - they differ by exactly one bit. + if (length >= 32) { + const char16_t *end = input + length - 32; - Finally from these two code units we build proper UTF-8 sequence, - taking into account the case (i.e, the number of bytes to write). 
- */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) vmovq_n_u16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const uint16x8_t t0 = - vreinterpretq_u16_u8(vqtbl1q_u8(vreinterpretq_u8_u16(utf16_packed), - vreinterpretq_u8_u16(dup_even))); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const uint16x8_t t1 = vandq_u16(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const uint16x8_t t2 = vorrq_u16(t1, simdutf_vec(0b1000000000000000)); + const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); + const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); + const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); + const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); - // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] - const uint16x8_t s0 = vshrq_n_u16(utf16_packed, 12); - // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] - const uint16x8_t s1 = - vandq_u16(utf16_packed, simdutf_vec(0b0000111111000000)); - // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] - const uint16x8_t s1s = vshlq_n_u16(s1, 2); - // [00bb|bbbb|0000|aaaa] - const uint16x8_t s2 = vorrq_u16(s0, s1s); - // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const uint16x8_t s3 = vorrq_u16(s2, simdutf_vec(0b1100000011100000)); - const uint16x8_t v_07ff = vmovq_n_u16((uint16_t)0x07FF); - const uint16x8_t one_or_two_bytes_bytemask = - vcleq_u16(utf16_packed, v_07ff); - const uint16x8_t m0 = vbicq_u16(simdutf_vec(0b0100000000000000), - one_or_two_bytes_bytemask); - const uint16x8_t s4 = veorq_u16(s3, m0); -#undef simdutf_vec + const __m512i byteflip = _mm512_setr_epi64( + 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, + 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, + 0x0607040502030001, 0x0e0f0c0d0a0b0809); + while (ptr <= end) { + __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr); + utf16 = _mm512_shuffle_epi8(utf16, byteflip); + ptr += 32; + __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); + __mmask32 two_bytes_bitmask = + _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); + __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); + __mmask32 surrogates_bitmask = + _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & + _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); - // 4. expand code units 16-bit => 32-bit - const uint8x16_t out0 = vreinterpretq_u8_u16(vzip1q_u16(t2, s4)); - const uint8x16_t out1 = vreinterpretq_u8_u16(vzip2q_u16(t2, s4)); + size_t ascii_count = count_ones(ascii_bitmask); + size_t two_bytes_count = count_ones(two_bytes_bitmask); + size_t surrogate_bytes_count = count_ones(surrogates_bitmask); + size_t three_bytes_count = + 32 - ascii_count - two_bytes_count - surrogate_bytes_count; + count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + + 2 * surrogate_bytes_count; + } + } - // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16x8_t v_007f = vmovq_n_u16((uint16_t)0x007F); - const uint16x8_t one_byte_bytemask = vcleq_u16(utf16_packed, v_007f); -#ifdef SIMDUTF_REGULAR_VISUAL_STUDIO - const uint16x8_t onemask = simdutf_make_uint16x8_t( - 0x0001, 0x0004, 0x0010, 0x0040, 0x0100, 0x0400, 0x1000, 0x4000); - const uint16x8_t twomask = simdutf_make_uint16x8_t( - 0x0002, 0x0008, 0x0020, 0x0080, 0x0200, 0x0800, 0x2000, 0x8000); -#else - const uint16x8_t onemask = {0x0001, 0x0004, 0x0010, 0x0040, - 0x0100, 0x0400, 0x1000, 0x4000}; - const uint16x8_t twomask = {0x0002, 0x0008, 0x0020, 0x0080, - 0x0200, 0x0800, 0x2000, 0x8000}; -#endif - const uint16x8_t combined = - vorrq_u16(vandq_u16(one_byte_bytemask, onemask), - vandq_u16(one_or_two_bytes_bytemask, twomask)); - const uint16_t mask = vaddvq_u16(combined); - // The following fast path may or may not be beneficial. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. - const uint8x16_t shuffle = {2,3,1,6,7,5,10,11,9,14,15,13,0,0,0,0}; - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle); - vst1q_u8(utf8_output, utf8_0); - utf8_output += 12; - vst1q_u8(utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); + return count + scalar::utf16::utf8_length_from_utf16( + ptr, length - (ptr - input)); +} - const uint8_t *row0 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const uint8x16_t shuffle0 = vld1q_u8(row0 + 1); - const uint8x16_t utf8_0 = vqtbl1q_u8(out0, shuffle0); +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return implementation::count_utf16le(input, length); +} - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t *row1 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const uint8x16_t shuffle1 = vld1q_u8(row1 + 1); - const uint8x16_t utf8_1 = vqtbl1q_u8(out1, shuffle1); +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return implementation::count_utf16be(input, length); +} - vst1q_u8(utf8_output, utf8_0); - utf8_output += row0[0]; - vst1q_u8(utf8_output, utf8_1); - utf8_output += row1[0]; +simdutf_warn_unused size_t +implementation::utf16_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf16_length_from_latin1(length); +} - buf += 8; +simdutf_warn_unused size_t +implementation::utf32_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf32_length_from_latin1(length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t length) const noexcept { + const uint8_t *str = reinterpret_cast(input); + size_t answer = length / sizeof(__m512i) * sizeof(__m512i); + size_t i = 0; + if (answer >= 2048) { // long strings optimization + unsigned char v_0xFF = 0xff; + __m512i eight_64bits = _mm512_setzero_si512(); + while (i + sizeof(__m512i) <= length) { + __m512i runner = _mm512_setzero_si512(); + size_t iterations = (length - i) / sizeof(__m512i); + if (iterations > 255) { + iterations = 255; } - // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> - // will produce four UTF-8 bytes. - } else { - // Let us do a scalar fallback. 
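// The scalar fallback that follows encodes one UTF-32 code point at a time
// into 1-4 UTF-8 bytes, rejecting lone surrogates (U+D800..U+DFFF) and values
// above U+10FFFF. A condensed, standalone restatement of that per-code-point
// logic; encode_one_utf8 is an illustrative helper, not part of simdutf.
#include <cstdint>

// Returns the number of UTF-8 bytes written, or 0 for an invalid code point.
static int encode_one_utf8(uint32_t cp, char *out) {
  if (cp < 0x80) { out[0] = char(cp); return 1; }                  // ASCII
  if (cp < 0x800) {                                                 // 2 bytes
    out[0] = char(0xC0 | (cp >> 6));
    out[1] = char(0x80 | (cp & 0x3F));
    return 2;
  }
  if (cp < 0x10000) {                                               // 3 bytes
    if (cp >= 0xD800 && cp <= 0xDFFF) { return 0; }                 // SURROGATE error
    out[0] = char(0xE0 | (cp >> 12));
    out[1] = char(0x80 | ((cp >> 6) & 0x3F));
    out[2] = char(0x80 | (cp & 0x3F));
    return 3;
  }
  if (cp > 0x10FFFF) { return 0; }                                  // TOO_LARGE error
  out[0] = char(0xF0 | (cp >> 18));                                 // 4 bytes
  out[1] = char(0x80 | ((cp >> 12) & 0x3F));
  out[2] = char(0x80 | ((cp >> 6) & 0x3F));
  out[3] = char(0x80 | (cp & 0x3F));
  return 4;
}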
- // It may seem wasteful to use scalar code, but being efficient with SIMD - // in the presence of surrogate pairs may require non-trivial tables. - size_t forward = 15; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); + size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); + for (; i + 4 * sizeof(__m512i) <= max_i; i += 4 * sizeof(__m512i)) { + // Load four __m512i vectors + __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); + __m512i input2 = + _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); + __m512i input3 = _mm512_loadu_si512( + (const __m512i *)(str + i + 2 * sizeof(__m512i))); + __m512i input4 = _mm512_loadu_si512( + (const __m512i *)(str + i + 3 * sizeof(__m512i))); + + // Generate four masks + __mmask64 mask1 = + _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1); + __mmask64 mask2 = + _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2); + __mmask64 mask3 = + _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3); + __mmask64 mask4 = + _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4); + // Apply the masks and subtract from the runner + __m512i not_ascii1 = + _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF); + __m512i not_ascii2 = + _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF); + __m512i not_ascii3 = + _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF); + __m512i not_ascii4 = + _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF); + + runner = _mm512_sub_epi8(runner, not_ascii1); + runner = _mm512_sub_epi8(runner, not_ascii2); + runner = _mm512_sub_epi8(runner, not_ascii3); + runner = _mm512_sub_epi8(runner, not_ascii4); } - for (; k < forward; k++) { - uint32_t word = buf[k]; - if ((word & 0xFFFFFF80) == 0) { - *utf8_output++ = char(word); - } else if ((word & 0xFFFFF800) == 0) { - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if ((word & 0xFFFF0000) == 0) { - if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair( - result(error_code::SURROGATE, buf - start + k), - reinterpret_cast(utf8_output)); - } - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { - if (word > 0x10FFFF) { - return std::make_pair( - result(error_code::TOO_LARGE, buf - start + k), - reinterpret_cast(utf8_output)); - } - *utf8_output++ = char((word >> 18) | 0b11110000); - *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } + + for (; i <= max_i; i += sizeof(__m512i)) { + __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); + + __mmask64 mask = + _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input); + __m512i not_ascii = + _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF); + runner = _mm512_sub_epi8(runner, not_ascii); } - buf += k; + + eight_64bits = _mm512_add_epi64( + eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512())); + } + + __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0); + __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1); + answer += (size_t)_mm256_extract_epi64(first_half, 0) + + (size_t)_mm256_extract_epi64(first_half, 1) + + (size_t)_mm256_extract_epi64(first_half, 2) + + (size_t)_mm256_extract_epi64(first_half, 3) + + 
(size_t)_mm256_extract_epi64(second_half, 0) + + (size_t)_mm256_extract_epi64(second_half, 1) + + (size_t)_mm256_extract_epi64(second_half, 2) + + (size_t)_mm256_extract_epi64(second_half, 3); + } else if (answer > 0) { + for (; i + sizeof(__m512i) <= length; i += sizeof(__m512i)) { + __m512i latin = _mm512_loadu_si512((const __m512i *)(str + i)); + uint64_t non_ascii = _mm512_movepi8_mask(latin); + answer += count_ones(non_ascii); } - } // while + } + return answer + scalar::latin1::utf8_length_from_latin1( + reinterpret_cast(str + i), length - i); +} - return std::make_pair(result(error_code::SUCCESS, buf - start), - reinterpret_cast(utf8_output)); +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= length; pos += 64) { + __m512i utf8 = _mm512_loadu_si512((const __m512i *)(input + pos)); + uint64_t utf8_continuation_mask = + _mm512_cmplt_epi8_mask(utf8, _mm512_set1_epi8(-65 + 1)); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + uint64_t utf8_4byte = + _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240))); + count += count_ones(utf8_4byte); + } + return count + + scalar::utf8::utf16_length_from_utf8(input + pos, length - pos); } -/* end file src/arm64/arm_convert_utf32_to_utf8.cpp */ -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* begin file src/generic/buf_block_reader.h */ -namespace simdutf { -namespace arm64 { -namespace { +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const char32_t *ptr = input; + size_t count{0}; -// Walks through a buffer in block-sized increments, loading the last part with -// spaces -template struct buf_block_reader { -public: - simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); - simdutf_really_inline size_t block_index(); - simdutf_really_inline bool has_full_block() const; - simdutf_really_inline const uint8_t *full_block() const; - /** - * Get the last block, padded with spaces. - * - * There will always be a last block, with at least 1 byte, unless len == 0 - * (in which case this function fills the buffer with spaces and returns 0. In - * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder - * block with STEP_SIZE bytes and no spaces for padding. - * - * @return the number of effective characters in the last block. - */ - simdutf_really_inline size_t get_remainder(uint8_t *dst) const; - simdutf_really_inline void advance(); + if (length >= 16) { + const char32_t *end = input + length - 16; -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; + const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f); + const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff); + const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff); -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char *format_input_text_64(const uint8_t *text) { - static char *buf = - reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - for (size_t i = 0; i < sizeof(simd8x64); i++) { - buf[i] = int8_t(text[i]) < ' ' ? 
'_' : int8_t(text[i]); + while (ptr <= end) { + __m512i utf32 = _mm512_loadu_si512((const __m512i *)ptr); + ptr += 16; + __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f); + __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask( + _knot_mask16(ascii_bitmask), utf32, v_0000_07ff); + __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask( + _knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, + v_0000_ffff); + + size_t ascii_count = count_ones(ascii_bitmask); + size_t two_bytes_count = count_ones(two_bytes_bitmask); + size_t three_bytes_count = count_ones(three_bytes_bitmask); + size_t four_bytes_count = + 16 - ascii_count - two_bytes_count - three_bytes_count; + count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + + 4 * four_bytes_count; + } } - buf[sizeof(simd8x64)] = '\0'; - return buf; + + return count + + scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input)); } -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char *format_input_text(const simd8x64 &in) { - static char *buf = - reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - in.store(reinterpret_cast(buf)); - for (size_t i = 0; i < sizeof(simd8x64); i++) { - if (buf[i] < ' ') { - buf[i] = '_'; +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const char32_t *ptr = input; + size_t count{0}; + + if (length >= 16) { + const char32_t *end = input + length - 16; + + const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff); + + while (ptr <= end) { + __m512i utf32 = _mm512_loadu_si512((const __m512i *)ptr); + ptr += 16; + __mmask16 surrogates_bitmask = + _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); + + count += 16 + count_ones(surrogates_bitmask); } } - buf[sizeof(simd8x64)] = '\0'; - return buf; + + return count + + scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input)); } -simdutf_unused static char *format_mask(uint64_t mask) { - static char *buf = reinterpret_cast(malloc(64 + 1)); - for (size_t i = 0; i < 64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; - } - buf[64] = '\0'; - return buf; +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return implementation::count_utf8(input, length); } -template -simdutf_really_inline -buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) - : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, - idx{0} {} +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} -template -simdutf_really_inline size_t buf_block_reader::block_index() { - return idx; +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? 
compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); } -template -simdutf_really_inline bool buf_block_reader::has_full_block() const { - return idx < lenminusstep; +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); } -template -simdutf_really_inline const uint8_t * -buf_block_reader::full_block() const { - return &buf[idx]; +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char16_t *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); } -template -simdutf_really_inline size_t -buf_block_reader::get_remainder(uint8_t *dst) const { - if (len == idx) { - return 0; - } // memcpy(dst, null, 0) will trigger an error with some sanitizers - std::memset(dst, 0x20, - STEP_SIZE); // std::memset STEP_SIZE because it is more efficient - // to write out 8 or 16 bytes at once. - std::memcpy(dst, buf + idx, len - idx); - return len - idx; +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); } -template -simdutf_really_inline void buf_block_reader::advance() { - idx += STEP_SIZE; +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); } -} // unnamed namespace -} // namespace arm64 +simdutf_warn_unused size_t implementation::base64_length_from_binary( + size_t length, base64_options options) const noexcept { + return scalar::base64::base64_length_from_binary(length, options); +} + +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + if (options & base64_url) { + return encode_base64(output, input, length, options); + } else { + return encode_base64(output, input, length, options); + } +} + +} // namespace icelake } // namespace simdutf -/* end file src/generic/buf_block_reader.h */ -/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_validation { -using namespace simd; +/* begin file src/simdutf/icelake/end.h */ +#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE +// nothing needed. 
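// [editor's note] The sketch below is illustrative only and not part of the
// simdutf patch; the function names are hypothetical. It restates, in scalar
// form, the counting rules used by the AVX-512 length routines above:
// Latin-1 bytes >= 0x80 cost one extra UTF-8 byte, every UTF-8 byte that is
// not a continuation byte starts a UTF-16 code unit, leads >= 0xF0 add a
// second (surrogate) unit, and UTF-32 code points cost 1/2/3/4 UTF-8 bytes
// depending on the 0x7F / 0x7FF / 0xFFFF thresholds.
#include <cstddef>
#include <cstdint>

static size_t sketch_utf8_length_from_latin1(const uint8_t *s, size_t n) {
  size_t bytes = n;                   // every byte yields at least one UTF-8 byte
  for (size_t i = 0; i < n; i++) {
    bytes += (s[i] >= 0x80);          // non-ASCII bytes need a 2-byte sequence
  }
  return bytes;
}

static size_t sketch_utf16_length_from_utf8(const uint8_t *s, size_t n) {
  size_t units = 0;                   // assumes the input is valid UTF-8
  for (size_t i = 0; i < n; i++) {
    units += ((s[i] & 0xC0) != 0x80); // count leading (non-continuation) bytes
    units += (s[i] >= 0xF0);          // 4-byte leads produce a surrogate pair
  }
  return units;
}

static size_t sketch_utf8_length_from_utf32(const char32_t *s, size_t n) {
  size_t bytes = 0;
  for (size_t i = 0; i < n; i++) {
    uint32_t c = uint32_t(s[i]);
    bytes += (c <= 0x7F) ? 1 : (c <= 0x7FF) ? 2 : (c <= 0xFFFF) ? 3 : 4;
  }
  return bytes;
}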
+#else +SIMDUTF_UNTARGET_REGION +#endif -simdutf_really_inline simd8 -check_special_cases(const simd8 input, const simd8 prev1) { - // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) - // Bit 1 = Too Long (ASCII followed by continuation) - // Bit 2 = Overlong 3-byte - // Bit 4 = Surrogate - // Bit 5 = Overlong 2-byte - // Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); - constexpr const uint8_t CARRY = - TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = - (prev1 & 0x0F) - .lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, CARRY, +#if SIMDUTF_GCC11ORMORE // workaround for + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +SIMDUTF_POP_DISABLE_WARNINGS +#endif // end of workaround +/* end file src/simdutf/icelake/end.h */ +/* end file src/icelake/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_HASWELL +/* begin file src/haswell/implementation.cpp */ - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, +/* begin file src/simdutf/haswell/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "haswell" +// #define SIMDUTF_IMPLEMENTATION haswell - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, +#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL +// nothing needed. 
+#else +SIMDUTF_TARGET_HASWELL +#endif - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | - OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, +#if SIMDUTF_GCC11ORMORE // workaround for + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +// clang-format off +SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) +// clang-format on +#endif // end of workaround +/* end file src/simdutf/haswell/begin.h */ +namespace simdutf { +namespace haswell { +namespace { +#ifndef SIMDUTF_HASWELL_H + #error "haswell.h must be included" +#endif +using namespace simd; - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); - return (byte_1_high & byte_1_low & byte_2_high); +simdutf_really_inline bool is_ascii(const simd8x64 &input) { + return input.reduce_or().is_ascii(); } -simdutf_really_inline simd8 -check_multibyte_lengths(const simd8 input, - const simd8 prev_input, - const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = - simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; + +simdutf_unused simdutf_really_inline simd8 +must_be_continuation(const simd8 prev1, const simd8 prev2, + const simd8 prev3) { + simd8 is_second_byte = + prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0 + simd8 is_third_byte = + prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = + prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction + // will be <= 64, so signed comparison is fine. + return simd8(is_second_byte | is_third_byte | is_fourth_byte) > + int8_t(0); } -// -// Return nonzero if there are incomplete multibyte characters at the end of the -// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. -// -simdutf_really_inline simd8 is_incomplete(const simd8 input) { - // If the previous input's last 3 bytes match this, they're too short (they - // ended at EOF): - // ... 1111____ 111_____ 11______ - static const uint8_t max_array[32] = {255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 0b11110000u - 1, - 0b11100000u - 1, - 0b11000000u - 1}; - const simd8 max_value( - &max_array[sizeof(max_array) - sizeof(simd8)]); - return input.gt_bits(max_value); +simdutf_really_inline simd8 +must_be_2_3_continuation(const simd8 prev2, + const simd8 prev3) { + simd8 is_third_byte = + prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be > 0x80 + simd8 is_fourth_byte = + prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be > 0x80 + return simd8(is_third_byte | is_fourth_byte); } -struct utf8_checker { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; - // The last input we received - simd8 prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast - // path) - simd8 prev_incomplete; +/* begin file src/haswell/avx2_validate_utf16.cpp */ +/* + In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. 
+ + In a vectorized algorithm we want to examine the most significant + nibble in order to select a fast path. If none of highest nibbles + are 0xD (13), than we are sure that UTF-16 chunk in a vector + register is valid. + + Let us analyze what we need to check if the nibble is 0xD. The + value of the preceding nibble determines what we have: + + 0xd000 .. 0xd7ff - a valid word + 0xd800 .. 0xdbff - low surrogate + 0xdc00 .. 0xdfff - high surrogate + + Other constraints we have to consider: + - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) + - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) + - there must not be sole low surrogate nor high surrogate + + We're going to build three bitmasks based on the 3rd nibble: + - V = valid word, + - L = low surrogate (0xd800 .. 0xdbff) + - H = high surrogate (0xdc00 .. 0xdfff) + + 0 1 2 3 4 5 6 7 <--- word index + [ V | L | H | L | H | V | V | L ] + 1 0 0 0 0 1 1 0 - V = valid masks + 0 1 0 1 0 0 0 1 - L = low surrogate + 0 0 1 0 1 0 0 0 - H high surrogate + + + 1 0 0 0 0 1 1 0 V = valid masks + 0 1 0 1 0 0 0 0 a = L & (H >> 1) + 0 0 1 0 1 0 0 0 b = a << 1 + 1 1 1 1 1 1 1 0 c = V | a | b + ^ + the last bit can be zero, we just consume 7 + code units and recheck this word in the next iteration +*/ + +/* Returns: + - pointer to the last unprocessed character (a scalar fallback should check + the rest); + - nullptr if an error was detected. +*/ +template +const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) { + const char16_t *end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::ELEMENTS * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = simd16(input + simd16::ELEMENTS); + + if (big_endian) { + in0 = in0.swap_bytes(); + in1 = in1.swap_bytes(); + } + + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); + + const auto in = simd16::pack(t0, t1); + + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); + if (surrogates_bitmask == 0x0) { + input += simd16::ELEMENTS * 2; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint32_t V = ~surrogates_bitmask; - // - // Check whether the current bytes are valid UTF-8. 
- // - simdutf_really_inline void check_utf8_bytes(const simd8 input, - const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ - // lead bytes (2, 3, 4-byte leads become large positive numbers instead of - // small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint32_t H = vH.to_bitmask(); - // The only problem that can happen at EOF is that a multibyte character is - // too short or a byte value too large in the last bytes: check_special_cases - // only checks for bytes too large in the first of two bytes. - simdutf_really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an - // ASCII block can't possibly finish them. - this->error |= this->prev_incomplete; - } + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint32_t L = ~H & surrogates_bitmask; - simdutf_really_inline void check_next_input(const simd8x64 &input) { - if (simdutf_likely(is_ascii(input))) { - this->error |= this->prev_incomplete; - } else { - // you might think that a for-loop would work, but under Visual Studio, it - // is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + const uint32_t a = + L & (H >> 1); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint32_t b = + a << 1; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint32_t c = V | a | b; // Combine all the masks into the final one. + + if (c == 0xffffffff) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += simd16::ELEMENTS * 2; + } else if (c == 0x7fffffff) { + // The 31 lower code units of the input register contains valid UTF-16. + // The 31 word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += simd16::ELEMENTS * 2 - 1; + } else { + return nullptr; } - this->prev_incomplete = - is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); - this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; } } - // do not forget to call check_eof! 
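// [editor's note] Illustrative sketch only, not part of the simdutf patch; the
// function name is hypothetical. The masks V, L and H built above, together
// with a = L & (H >> 1), b = a << 1 and c = V | a | b, vectorize the ordinary
// pairing rule for surrogates: a unit in 0xD800..0xDBFF (called "low" in the
// comments above) must be immediately followed by one in 0xDC00..0xDFFF, and
// neither kind may appear alone.
#include <cstddef>
#include <cstdint>

static bool sketch_validate_utf16le(const uint16_t *in, size_t n) {
  for (size_t i = 0; i < n; i++) {
    uint16_t w = in[i];
    if ((w & 0xF800) != 0xD800) continue; // not a surrogate: always valid
    if (w >= 0xDC00) return false;        // trailing surrogate without a lead
    if (i + 1 == n) return false;         // leading surrogate at end of input
    uint16_t next = in[i + 1];
    if (next < 0xDC00 || next > 0xDFFF) return false; // lead not followed by trail
    i++;                                  // consume the well-formed pair
  }
  return true;
}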
- simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); + return input; +} + +template +const result avx2_validate_utf16_with_errors(const char16_t *input, + size_t size) { + if (simdutf_unlikely(size == 0)) { + return result(error_code::SUCCESS, 0); } + const char16_t *start = input; + const char16_t *end = input + size; -}; // struct utf8_checker -} // namespace utf8_validation + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); -using utf8_validation::utf8_checker; + while (input + simd16::ELEMENTS * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = simd16(input + simd16::ELEMENTS); -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -/* begin file src/generic/utf8_validation/utf8_validator.h */ -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_validation { + if (big_endian) { + in0 = in0.swap_bytes(); + in1 = in1.swap_bytes(); + } -/** - * Validates that the string is actual UTF-8. - */ -template -bool generic_validate_utf8(const uint8_t *input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - reader.advance(); - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - return !c.errors(); -} + const auto t0 = in0.shr<8>(); + const auto t1 = in1.shr<8>(); -bool generic_validate_utf8(const char *input, size_t length) { - return generic_validate_utf8( - reinterpret_cast(input), length); -} + const auto in = simd16::pack(t0, t1); -/** - * Validates that the string is actual UTF-8 and stops on errors. - */ -template -result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - if (c.errors()) { - if (count != 0) { - count--; - } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors( - reinterpret_cast(input), - reinterpret_cast(input + count), length - count); - res.count += count; - return res; + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); + if (surrogates_bitmask == 0x0) { + input += simd16::ELEMENTS * 2; + } else { + // 2. 
We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint32_t V = ~surrogates_bitmask; + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint32_t H = vH.to_bitmask(); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint32_t L = ~H & surrogates_bitmask; + + const uint32_t a = + L & (H >> 1); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint32_t b = + a << 1; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint32_t c = V | a | b; // Combine all the masks into the final one. + + if (c == 0xffffffff) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += simd16::ELEMENTS * 2; + } else if (c == 0x7fffffff) { + // The 31 lower code units of the input register contains valid UTF-16. + // The 31 word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += simd16::ELEMENTS * 2 - 1; + } else { + return result(error_code::SURROGATE, input - start); + } } - reader.advance(); - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - if (c.errors()) { - if (count != 0) { - count--; - } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors( - reinterpret_cast(input), - reinterpret_cast(input) + count, length - count); - res.count += count; - return res; - } else { - return result(error_code::SUCCESS, length); } -} -result generic_validate_utf8_with_errors(const char *input, size_t length) { - return generic_validate_utf8_with_errors( - reinterpret_cast(input), length); + return result(error_code::SUCCESS, input - start); } +/* end file src/haswell/avx2_validate_utf16.cpp */ +/* begin file src/haswell/avx2_validate_utf32le.cpp */ +/* Returns: + - pointer to the last unprocessed character (a scalar fallback should check + the rest); + - nullptr if an error was detected. 
+*/ +const char32_t *avx2_validate_utf32le(const char32_t *input, size_t size) { + const char32_t *end = input + size; -template -bool generic_validate_ascii(const uint8_t *input, size_t length) { - buf_block_reader<64> reader(input, length); - uint8_t blocks[64]{}; - simd::simd8x64 running_or(blocks); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - running_or |= in; - reader.advance(); + const __m256i standardmax = _mm256_set1_epi32(0x10ffff); + const __m256i offset = _mm256_set1_epi32(0xffff2000); + const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); + __m256i currentmax = _mm256_setzero_si256(); + __m256i currentoffsetmax = _mm256_setzero_si256(); + + while (input + 8 < end) { + const __m256i in = _mm256_loadu_si256((__m256i *)input); + currentmax = _mm256_max_epu32(in, currentmax); + currentoffsetmax = + _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax); + input += 8; + } + __m256i is_zero = + _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax); + if (_mm256_testz_si256(is_zero, is_zero) == 0) { + return nullptr; } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - running_or |= in; - return running_or.is_ascii(); -} -bool generic_validate_ascii(const char *input, size_t length) { - return generic_validate_ascii( - reinterpret_cast(input), length); + is_zero = _mm256_xor_si256( + _mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); + if (_mm256_testz_si256(is_zero, is_zero) == 0) { + return nullptr; + } + + return input; } -template -result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) { - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors( - reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); +const result avx2_validate_utf32le_with_errors(const char32_t *input, + size_t size) { + const char32_t *start = input; + const char32_t *end = input + size; + + const __m256i standardmax = _mm256_set1_epi32(0x10ffff); + const __m256i offset = _mm256_set1_epi32(0xffff2000); + const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); + __m256i currentmax = _mm256_setzero_si256(); + __m256i currentoffsetmax = _mm256_setzero_si256(); + + while (input + 8 < end) { + const __m256i in = _mm256_loadu_si256((__m256i *)input); + currentmax = _mm256_max_epu32(in, currentmax); + currentoffsetmax = + _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax); + + __m256i is_zero = _mm256_xor_si256( + _mm256_max_epu32(currentmax, standardmax), standardmax); + if (_mm256_testz_si256(is_zero, is_zero) == 0) { + return result(error_code::TOO_LARGE, input - start); } - reader.advance(); - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors( - reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); - } else { - return result(error_code::SUCCESS, length); + is_zero = + _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (_mm256_testz_si256(is_zero, is_zero) == 0) { + return result(error_code::SURROGATE, input - start); + } + input += 8; } + + return result(error_code::SUCCESS, input - start); } +/* end file 
src/haswell/avx2_validate_utf32le.cpp */ -result generic_validate_ascii_with_errors(const char *input, size_t length) { - return generic_validate_ascii_with_errors( - reinterpret_cast(input), length); +/* begin file src/haswell/avx2_convert_latin1_to_utf8.cpp */ +std::pair +avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len, + char *utf8_output) { + const char *end = latin1_input + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); + const size_t safety_margin = 12; + + while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) { + __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input); + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m128i v_80 = _mm_set1_epi8((char)0x80); + if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!! + // 1. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, in8); + // 2. adjust pointers + latin1_input += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // We proceed only with the first 16 bytes. + const __m256i in = _mm256_cvtepu8_epi16((in8)); + + // 1. prepare 2-byte values + // input 16-bit word : [0000|0000|aabb|bbbb] x 8 + // expected output : [1100|00aa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + + // t0 = [0000|00aa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in, 2); + // t1 = [0000|00aa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [1100|00aa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes + + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)] + [0]; + + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. 
adjust pointers + latin1_input += 16; + continue; + + } // while + return std::make_pair(latin1_input, utf8_output); } +/* end file src/haswell/avx2_convert_latin1_to_utf8.cpp */ +/* begin file src/haswell/avx2_convert_latin1_to_utf16.cpp */ +template +std::pair +avx2_convert_latin1_to_utf16(const char *latin1_input, size_t len, + char16_t *utf16_output) { + size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 32 -} // namespace utf8_validation -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_validator.h */ -// transcoding from UTF-8 to UTF-16 -/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + size_t i = 0; + for (; i < rounded_len; i += 16) { + // Load 16 bytes from the address (input + i) into a xmm register + __m128i xmm0 = + _mm_loadu_si128(reinterpret_cast(latin1_input + i)); -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_utf16 { -using namespace simd; + // Zero extend each byte in xmm0 to word and put it in another xmm register + __m128i xmm1 = _mm_cvtepu8_epi16(xmm0); -simdutf_really_inline simd8 -check_special_cases(const simd8 input, const simd8 prev1) { - // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) - // Bit 1 = Too Long (ASCII followed by continuation) - // Bit 2 = Overlong 3-byte - // Bit 4 = Surrogate - // Bit 5 = Overlong 2-byte - // Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + // Shift xmm0 to the right by 8 bytes + xmm0 = _mm_srli_si128(xmm0, 8); - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); - constexpr const uint8_t CARRY = - TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
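// [editor's note] Illustrative sketch only, not part of the simdutf patch; the
// function name is hypothetical. It is the scalar counterpart of the
// avx2_convert_latin1_to_utf8 transformation above, which turns a byte
// aabbbbbb >= 0x80 into the pair 110000aa 10bbbbbb and copies ASCII verbatim.
#include <cstddef>
#include <cstdint>

static size_t sketch_latin1_to_utf8(const uint8_t *in, size_t n, uint8_t *out) {
  uint8_t *start = out;
  for (size_t i = 0; i < n; i++) {
    uint8_t c = in[i];
    if (c < 0x80) {
      *out++ = c;                          // ASCII passes through unchanged
    } else {
      *out++ = uint8_t(0xC0 | (c >> 6));   // 110000aa
      *out++ = uint8_t(0x80 | (c & 0x3F)); // 10bbbbbb, e.g. 0xE9 -> 0xC3 0xA9
    }
  }
  return size_t(out - start);
}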
- const simd8 byte_1_low = - (prev1 & 0x0F) - .lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, CARRY, + // Zero extend each byte in the shifted xmm0 to word in xmm0 + xmm0 = _mm_cvtepu8_epi16(xmm0); - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + xmm0 = _mm_shuffle_epi8(xmm0, swap); + xmm1 = _mm_shuffle_epi8(xmm1, swap); + } - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, + // Store the contents of xmm1 into the address pointed by (output + i) + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i), xmm1); - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | - OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + // Store the contents of xmm0 into the address pointed by (output + i + 8) + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i + 8), xmm0); + } - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); - return (byte_1_high & byte_1_low & byte_2_high); + return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len); } -simdutf_really_inline simd8 -check_multibyte_lengths(const simd8 input, - const simd8 prev_input, - const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = - simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; +/* end file src/haswell/avx2_convert_latin1_to_utf16.cpp */ +/* begin file src/haswell/avx2_convert_latin1_to_utf32.cpp */ +std::pair +avx2_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8 + + for (size_t i = 0; i < rounded_len; i += 8) { + // Load 8 Latin1 characters into a 64-bit register + __m128i in = _mm_loadl_epi64((__m128i *)&buf[i]); + + // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using + // vpmovzxbd + __m256i out = _mm256_cvtepu8_epi32(in); + + // Store the results back to memory + _mm256_storeu_si256((__m256i *)&utf32_output[i], out); + } + + // return pointers pointing to where we left off + return std::make_pair(buf + rounded_len, utf32_output + rounded_len); } +/* end file src/haswell/avx2_convert_latin1_to_utf32.cpp */ -struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. 
- simd8 error; +/* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" - validating_transcoder() : error(uint8_t(0)) {} +// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +template +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. // - // Check whether the current bytes are valid UTF-8. // - simdutf_really_inline void check_utf8_bytes(const simd8 input, - const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ - // lead bytes (2, 3, 4-byte leads become large positive numbers instead of - // small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + __m256i ascii = _mm256_cvtepu8_epi16(in); + if (big_endian) { + const __m256i swap256 = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + ascii = _mm256_shuffle_epi8(ascii, swap256); + } + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii); + utf16_output += 12; // We wrote 12 16-bit characters. + return 12; // We consumed 12 bytes. + } + if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte + // UTF-16 code units. There is probably a more efficient sequence, but the + // following might do. + const __m128i sh = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + if (big_endian) + composed = _mm_shuffle_epi8(composed, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte + // UTF-16 code units. There is probably a more efficient sequence, but the + // following might do. 
+ const __m128i sh = + _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + __m128i composed_repacked = _mm_packus_epi32(composed, composed); + if (big_endian) + composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; + return 12; } - template - simdutf_really_inline size_t convert(const char *in, size_t size, - char16_t *utf16_output) { - size_t pos = 0; - char16_t *start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the - // last 16 bytes, and if the data is valid, then it is entirely safe because - // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot - // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 8 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > -65); + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-code units + // this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small + // lookup table. + const __m128i sh = _mm_loadu_si128( + (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + if (big_endian) + composed = _mm_shuffle_epi8(composed, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed); + utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential + // overflow of 4 bytes. 
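    // [editor's note] Worked example, illustration only (not part of the
    // patch): for the lead/continuation pair 0xC3 0xA9 the shuffled 16-bit
    // lane holds 0xC3A9, so
    //   ascii    = 0xC3A9 & 0x007F        = 0x0029
    //   highbyte = 0xC3A9 & 0x1F00        = 0x0300
    //   composed = 0x0029 | (0x0300 >> 2) = 0x00E9
    // which matches the scalar rule ((lead & 0x1F) << 6) | (cont & 0x3F).
    static_assert((((0xC3 & 0x1F) << 6) | (0xA9 & 0x3F)) == 0xE9,
                  "0xC3 0xA9 decodes to U+00E9 in this example");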
+ } else if (idx < 145) { + // FOUR (4) input code-code units + const __m128i sh = _mm_loadu_si128( + (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + __m128i composed_repacked = _mm_packus_epi32(composed, composed); + if (big_endian) + composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); + _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); + utf16_output += 4; // Here we overflow by 8 bytes. + } else if (idx < 209) { + // TWO (2) input code-code units + ////////////// + // There might be garbage inputs where a leading byte mascarades as a + // four-byte leading byte (by being followed by 3 continuation byte), but is + // not greater than 0xf0. This could trigger a buffer overflow if we only + // counted leading bytes of the form 0xf0 as generating surrogate pairs, + // without further UTF-8 validation. Thus we must be careful to ensure that + // only leading bytes at least as large as 0xf0 generate surrogate pairs. We + // do as at the cost of an extra mask. + ///////////// + const __m128i sh = _mm_loadu_si128( + (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + // We deliberately carry the leading four bits in highbyte if they are + // present, we remove them later when computing hightenbits. + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + // When we need to generate a surrogate pair (leading byte > 0xF0), then + // the corresponding 32-bit value in 'composed' will be greater than + // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the + // location of the surrogate pairs. 
+ const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + const __m128i composedminus = + _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); + const __m128i lowtenbits = + _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); + // Notice the 0x3ff mask: + const __m128i hightenbits = + _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff)); + const __m128i lowtenbitsadd = + _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); + const __m128i hightenbitsadd = + _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); + const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); + __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); + uint32_t basic_buffer[4]; + uint32_t basic_buffer_swap[4]; + if (big_endian) { + _mm_storeu_si128((__m128i *)basic_buffer_swap, + _mm_shuffle_epi8(composed, swap)); + surrogates = _mm_shuffle_epi8(surrogates, swap); } - // If the input is long enough, then we have that margin-1 is the eight last - // leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; + _mm_storeu_si128((__m128i *)basic_buffer, composed); + uint32_t surrogate_buffer[4]; + _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); + for (size_t i = 0; i < 3; i++) { + if (basic_buffer[i] > 0x3c00000) { + utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); + utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + utf16_output += 2; } else { - // you might think that a for-loop would work, but under Visual Studio, - // it is not good enough. - static_assert( - (simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - if (utf8_continuation_mask & 1) { - return 0; // error - } - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. 
- size_t consumed = convert_masked_utf8_to_utf16( - in + pos, utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. + utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) + : uint16_t(basic_buffer[i]); + utf16_output++; } } - if (errors()) { - return 0; + } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */ +/* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" + +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), + _mm256_cvtepu8_epi32(in)); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), + _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8))); + utf32_output += 12; // We wrote 12 32-bit characters. + return 12; // We consumed 12 bytes. + } + if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { + // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte + // UTF-32 code units. There is probably a more efficient sequence, but the + // following might do. + const __m128i sh = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm256_storeu_si256((__m256i *)utf32_output, + _mm256_cvtepu16_epi32(composed)); + utf32_output += 8; // We wrote 16 bytes, 8 code points. + return 16; + } + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte + // UTF-32 code units. There is probably a more efficient sequence, but the + // following might do. 
+ const __m128i sh = + _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 4; + return 12; + } + /// We do not have a fast path available, so we fallback. + + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + if (idx < 64) { + // SIX (6) input code-code units + // this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small + // lookup table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + _mm256_storeu_si256((__m256i *)utf32_output, + _mm256_cvtepu16_epi32(composed)); + utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential + // overflow of 32 - 24 = 8 bytes. 
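    // [editor's note] Worked example, illustration only (not part of the
    // patch): in the three-byte path above (mask 0x924), the sequence
    // 0xE2 0x82 0xAC occupies a 32-bit lane as 0x00E282AC, so
    //   ascii      =  0x00E282AC & 0x00007F       = 0x2C
    //   middlebyte = (0x00E282AC & 0x003F00) >> 2 = 0x80
    //   highbyte   = (0x00E282AC & 0x0F0000) >> 4 = 0x2000
    //   composed   =  0x2C | 0x80 | 0x2000        = 0x20AC
    // matching the scalar rule ((b0 & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F).
    static_assert((((0xE2 & 0x0F) << 12) | ((0x82 & 0x3F) << 6) | (0xAC & 0x3F)) ==
                      0x20AC,
                  "0xE2 0x82 0xAC decodes to U+20AC in this example");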
+ } else if (idx < 145) { + // FOUR (4) input code-code units + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = + _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits + const __m128i middlebyte = + _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + const __m128i highbyte = + _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += 4; + } else if (idx < 209) { + // TWO (2) input code-code units + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); + const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); + const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); + __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); + // correct for spurious high bit + const __m128i correct = + _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); + middlehighbyte = _mm_xor_si128(correct, middlehighbyte); + const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); + const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); + const __m128i composed = + _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), + _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); + _mm_storeu_si128((__m128i *)utf32_output, composed); + utf32_output += + 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes. 
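    // [editor's note] Worked example, illustration only (not part of the
    // patch): for sequences of up to four bytes, the shifted byte masks above
    // assemble the standard UTF-8 value
    //   ((b0 & 0x07) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F)
    // (the 0x400000 "spurious high bit" correction cancels the bit that a
    // three-byte lead sitting in the same lane position would otherwise feed
    // into the 0x3f0000 mask). For 0xF0 0x9F 0x98 0x80 this yields U+1F600.
    static_assert((((0xF0 & 0x07) << 18) | ((0x9F & 0x3F) << 12) |
                   ((0x98 & 0x3F) << 6) | (0x80 & 0x3F)) == 0x1F600,
                  "0xF0 0x9F 0x98 0x80 decodes to U+1F600 in this example");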
+ } else { + // here we know that there is an error but we do not handle errors + } + return consumed; +} +/* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */ + +/* begin file src/haswell/avx2_convert_utf16_to_latin1.cpp */ +template +std::pair +avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + while (end - buf >= 16) { + // Load 16 UTF-16 characters into 256-bit AVX2 register + __m256i in = _mm256_loadu_si256(reinterpret_cast(buf)); + + if (!match_system(big_endian)) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); } - if (pos < size) { - size_t howmany = scalar::utf8_to_utf16::convert( - in + pos, size - pos, utf16_output); - if (howmany == 0) { - return 0; - } - utf16_output += howmany; + + __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00); + if (_mm256_testz_si256(in, high_byte_mask)) { + // Pack 16-bit characters into 8-bit and store in latin1_output + __m128i lo = _mm256_extractf128_si256(in, 0); + __m128i hi = _mm256_extractf128_si256(in, 1); + __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); + __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), + latin1_packed_lo); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8), + latin1_packed_hi); + // Adjust pointers for next iteration + buf += 16; + latin1_output += 16; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); } - return utf16_output - start; - } + } // while + return std::make_pair(buf, latin1_output); +} - template - simdutf_really_inline result convert_with_errors(const char *in, size_t size, - char16_t *utf16_output) { - size_t pos = 0; - char16_t *start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the - // last 16 bytes, and if the data is valid, then it is entirely safe because - // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot - // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 8 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > -65); +template +std::pair +avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + while (end - buf >= 16) { + __m256i in = _mm256_loadu_si256(reinterpret_cast(buf)); + + if (!match_system(big_endian)) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); } - // If the input is long enough, then we have that margin-1 is the eight last - // leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, - // it is not good enough. 
- static_assert( - (simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - if (errors() || (utf8_continuation_mask & 1)) { - // rewind_and_convert_with_errors will seek a potential error from - // in+pos onward, with the ability to go back up to pos bytes, and - // read size-pos bytes forward. - result res = - scalar::utf8_to_utf16::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16( - in + pos, utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + + __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00); + if (_mm256_testz_si256(in, high_byte_mask)) { + __m128i lo = _mm256_extractf128_si256(in, 0); + __m128i hi = _mm256_extractf128_si256(in, 1); + __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); + __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), + latin1_packed_lo); + _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8), + latin1_packed_hi); + buf += 16; + latin1_output += 16; + } else { + // Fallback to scalar code for handling errors + for (int k = 0; k < 16; k++) { + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair( + result{error_code::TOO_LARGE, (size_t)(buf - start + k)}, + latin1_output); } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos - // onward, with the ability to go back up to pos bytes, and read size-pos - // bytes forward. 
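// Illustrative scalar sketch (not part of the simdutf sources; the helper name is
// made up for this note): the generic kernels in this hunk classify bytes with
// input.lt(-65 + 1). A byte is a UTF-8 continuation byte (10xxxxxx, i.e. 0x80..0xBF)
// exactly when its signed value is below -64, and shifting the complement ("leading")
// mask right by one marks the last byte of each code point.
#include <cstdint>
static inline uint64_t end_of_code_point_mask(const int8_t block[64]) {
  uint64_t continuation = 0;
  for (int i = 0; i < 64; i++) {
    if (block[i] < -64) {                  // signed view of 0x80..0xBF is -128..-65
      continuation |= (uint64_t(1) << i);
    }
  }
  const uint64_t leading = ~continuation;  // ASCII and lead bytes start code points
  return leading >> 1;                     // bit i set => byte i ends a code point
}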
- result res = - scalar::utf8_to_utf16::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - if (pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos - // onward, with the ability to go back up to pos bytes, and read size-pos - // bytes forward. - result res = - scalar::utf8_to_utf16::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf16_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf16_output += res.count; } + buf += 16; } - return result(error_code::SUCCESS, utf16_output - start); - } + } // while + return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)}, + latin1_output); +} +/* end file src/haswell/avx2_convert_utf16_to_latin1.cpp */ +/* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. -}; // struct utf8_checker -} // namespace utf8_to_utf16 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + Ad 1. -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_utf16 { + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it is an ASCII + char) or 2) two UTF8 bytes. -using namespace simd; + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. -template -simdutf_warn_unused size_t convert_valid(const char *input, size_t size, - char16_t *utf16_output) noexcept { - // The implementation is not specific to haswell and should be moved to the - // generic directory. - size_t pos = 0; - char16_t *start{utf16_output}; - const size_t safety_margin = 16; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - // this loop could be unrolled further. For example, we could process the - // mask far more than 64 bytes. - simd8x64 in(reinterpret_cast(input + pos)); - if (in.is_ascii()) { - in.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // Slow path. We hope that the compiler will recognize that this is a slow - // path. Anything that is not a continuation mask is a 'leading byte', - // that is, the start of a new code point. - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - // -65 is 0b10111111 in two-complement's, so largest possible continuation - // byte - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - // The *start* of code points is not so useful, rather, we want the *end* - // of code points. - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. 
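// Illustrative scalar sketch (not part of the simdutf sources; the helper name is
// made up for this note): per the removed comments in this hunk, the masked converters
// may write a few code units past the position they were asked to fill (up to 8 bytes
// in the worst case), so convert/convert_with_errors reserve a scalar tail by walking
// back from the end until 8 leading (non-continuation) bytes have been seen; the
// convert_valid variant simply uses a fixed 16-byte margin instead.
#include <cstddef>
#include <cstdint>
static inline size_t utf8_tail_safety_margin(const char *in, size_t size) {
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    leading_byte += (int8_t(in[margin - 1]) > -65); // > -65 means "not 10xxxxxx"
  }
  return size - margin + 1; // same expression as the removed safety_margin computation
}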
- size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times when using solely - // the slow/regular path, and at least four times if there are fast paths. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - // - // Thus we may allow convert_masked_utf8_to_utf16 to process - // more bytes at a time under a fast-path mode where 16 bytes - // are consumed at once (e.g., when encountering ASCII). - size_t consumed = convert_masked_utf8_to_utf16( - input + pos, utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ + +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. 
+*/ +template +std::pair +avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { + const char16_t *end = buf + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); } - } - utf16_output += scalar::utf8_to_utf16::convert_valid( - input + pos, size - pos, utf16_output); - return utf16_output - start; -} + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); + if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); -} // namespace utf8_to_utf16 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -// transcoding from UTF-8 to UTF-32 -/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_utf32 { -using namespace simd; + // 1. 
prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); -simdutf_really_inline simd8 -check_special_cases(const simd8 input, const simd8 prev1) { - // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) - // Bit 1 = Too Long (ASCII followed by continuation) - // Bit 2 = Overlong 3-byte - // Bit 4 = Surrogate - // Bit 5 = Overlong 2-byte - // Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); - constexpr const uint8_t CARRY = - TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = - (prev1 & 0x0F) - .lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, CARRY, + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in, one_byte_bytemask); - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. 
pack the bytes - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | - OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); - return (byte_1_high & byte_1_low & byte_2_high); -} -simdutf_really_inline simd8 -check_multibyte_lengths(const simd8 input, - const simd8 prev_input, - const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = - simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; -} + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; + + // 6. adjust pointers + buf += 16; + continue; + } + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = + static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes -struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. 
- validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, - const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ - // lead bytes (2, 3, 4-byte leads become large positive numbers instead of - // small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - simdutf_really_inline size_t convert(const char *in, size_t size, - char32_t *utf32_output) { - size_t pos = 0; - char32_t *start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the - // last 16 bytes, and if the data is valid, then it is entirely safe because - // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot - // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 16 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth - // last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, - // it is not good enough. - static_assert( - (simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - if (utf8_continuation_mask & 1) { - return 0; // we have an error - } - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. 
Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32( - in + pos, utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if (errors()) { - return 0; - } - if (pos < size) { - size_t howmany = - scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); - if (howmany == 0) { - return 0; - } - utf32_output += howmany; - } - return utf32_output - start; - } + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. - simdutf_really_inline result convert_with_errors(const char *in, size_t size, - char32_t *utf32_output) { - size_t pos = 0; - char32_t *start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the - // last 16 bytes, and if the data is valid, then it is entirely safe because - // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot - // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 8 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth - // last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, - // it is not good enough. - static_assert( - (simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - if (errors() || (utf8_continuation_mask & 1)) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. 
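// The "at least five times" claim in the comment above, spelled out (not part of the
// simdutf sources): the loop starts 64 - 12 = 52 bytes before max_starting_point and
// each masked conversion consumes at most 12 bytes, so at least ceil(52 / 12) = 5
// iterations are needed before pos can reach max_starting_point.
static_assert((52 + 12 - 1) / 12 == 5,
              "52 bytes at <= 12 bytes per step takes at least 5 steps");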
- while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32( - in + pos, utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if (errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - if (pos < size) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf32_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf32_output += res.count; - } - } - return result(error_code::SUCCESS, utf32_output - start); - } + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec -}; // struct utf8_checker -} // namespace utf8_to_utf32 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_utf32 { + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); -using namespace simd; + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); -simdutf_warn_unused size_t convert_valid(const char *input, size_t size, - char32_t *utf32_output) noexcept { - size_t pos = 0; - char32_t *start{utf32_output}; - const size_t safety_margin = 16; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - simd8x64 in(reinterpret_cast(input + pos)); - if (in.is_ascii()) { - in.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + // surrogate pair(s) in a register } else { - // -65 is 0b10111111 in two-complement's, so largest possible continuation - // byte - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - size_t max_starting_point = (pos + 64) - 12; - while (pos < max_starting_point) { - size_t consumed = convert_masked_utf8_to_utf32( - input + pos, utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + // Let us do a scalar fallback. 
+ // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, utf8_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } } + buf += k; } - } - utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, - utf32_output); - return utf32_output - start; + } // while + return std::make_pair(buf, utf8_output); } -} // namespace utf8_to_utf32 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -// other functions -/* begin file src/generic/utf16.h */ -namespace simdutf { -namespace arm64 { -namespace { -namespace utf16 { - +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ template -simdutf_really_inline size_t count_code_points(const char16_t *in, - size_t size) { - size_t pos = 0; - size_t count = 0; - for (; pos < size / 32 * 32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { - input.swap_bytes(); - } - uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); - count += count_ones(not_pair) / 2; - } - return count + - scalar::utf16::count_code_points(in + pos, size - pos); -} +std::pair +avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, + char *utf8_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; -template -simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, - size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! 
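// Illustrative scalar sketch (not part of the simdutf sources; the helper name is
// made up for this note): the fallback above decodes a surrogate pair by subtracting
// the high/low surrogate bases; both offsets must fit in 10 bits, and the pair maps
// to a code point in the supplementary planes. The surrounding kernels return nullptr
// or error_code::SURROGATE when the check fails.
#include <cstdint>
static inline uint32_t decode_surrogate_pair(uint16_t high, uint16_t low) {
  const uint16_t hi10 = uint16_t(high - 0xD800); // offset from the high-surrogate base
  const uint16_t lo10 = uint16_t(low - 0xDC00);  // offset from the low-surrogate base
  if ((hi10 | lo10) > 0x3FF) {
    return 0xFFFFFFFFu; // ill-formed pair
  }
  return (uint32_t(hi10) << 10) + lo10 + 0x10000u; // e.g. D83D DE00 -> U+1F600
}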
- for (; pos < size / 32 * 32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { - input.swap_bytes(); + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); } - uint64_t ascii_mask = input.lteq(0x7F); - uint64_t twobyte_mask = input.lteq(0x7FF); - uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes + const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); + if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); - size_t ascii_count = count_ones(ascii_mask) / 2; - size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2; - size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2; - size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; - count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + - ascii_count; - } - return count + scalar::utf16::utf8_length_from_utf16(in + pos, - size - pos); -} + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { -template -simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, - size_t size) { - return count_code_points(in, size); -} + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); -simdutf_really_inline void -change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { - size_t pos = 0; + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); - while (pos < size / 32 * 32) { - simd16x32 input(reinterpret_cast(in + pos)); - input.swap_bytes(); - input.store(reinterpret_cast(output)); - pos += 32; - output += 32; - } + // 2. 
merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in, one_byte_bytemask); + + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes - scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); -} + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; -} // namespace utf16 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf16.h */ -/* begin file src/generic/utf8.h */ + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8 { + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; -using namespace simd; + // 6. adjust pointers + buf += 16; + continue; + } + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); -simdutf_really_inline size_t count_code_points(const char *in, size_t size) { - size_t pos = 0; - size_t count = 0; - for (; pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.gt(-65); - count += count_ones(utf8_continuation_mask); - } - return count + scalar::utf8::count_code_points(in + pos, size - pos); -} + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = + static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); -simdutf_really_inline size_t utf16_length_from_utf8(const char *in, - size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for (; pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - // We count one word for anything that is not a continuation (so - // leading bytes). - count += 64 - count_ones(utf8_continuation_mask); - int64_t utf8_4byte = input.gteq_unsigned(240); - count += count_ones(utf8_4byte); - } - return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); -} -} // namespace utf8 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8.h */ -// transcoding from UTF-8 to Latin 1 -/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ + /* In this branch we handle three cases: + 1. 
[0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_latin1 { -using namespace simd; + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. -simdutf_really_inline simd8 -check_special_cases(const simd8 input, const simd8 prev1) { - // For UTF-8 to Latin 1, we can allow any ASCII character, and any - // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or - // 0b11000010 and nothing else. - // - // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) - // Bit 1 = Too Long (ASCII followed by continuation) - // Bit 2 = Overlong 3-byte - // Bit 4 = Surrogate - // Bit 5 = Overlong 2-byte - // Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ - constexpr const uint8_t FORBIDDEN = 0xff; + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - FORBIDDEN, - // 1110____ ________ - FORBIDDEN, - // 1111____ ________ - FORBIDDEN); - constexpr const uint8_t CARRY = - TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = - (prev1 & 0x0F) - .lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, CARRY, + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. - // ____0100 ________ - FORBIDDEN, - // ____0101 ________ - FORBIDDEN, - // ____011_ ________ - FORBIDDEN, FORBIDDEN, + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); - // ____1___ ________ - FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, - // ____1101 ________ - FORBIDDEN, FORBIDDEN, FORBIDDEN); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | - OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); - return (byte_1_high & byte_1_low & byte_2_high); -} + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); -struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. 
- simd8 error; + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, - const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ - // lead bytes (2, 3, 4-byte leads become large positive numbers instead of - // small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - this->error |= check_special_cases(input, prev1); - } + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); - simdutf_really_inline size_t convert(const char *in, size_t size, - char *latin1_output) { - size_t pos = 0; - char *start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the - // last 16 bytes, and if the data is valid, then it is entirely safe because - // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot - // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 16 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 16; margin--) { - leading_byte += (int8_t(in[margin - 1]) > - -65); // twos complement of -65 is 1011 1111 ... - } - // If the input is long enough, then we have that margin-1 is the eight last - // leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store((int8_t *)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, - // it is not good enough. - static_assert( - (simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = - input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in - // this case, we also have ASCII to account for. - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. 
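// Illustrative scalar sketch (not part of the simdutf sources; the helper name is
// made up for this note): the removed comment in this hunk observes that UTF-8 text
// restricted to Latin-1 may contain only ASCII bytes plus two-byte sequences whose
// lead byte is 0b11000010 or 0b11000011 (0xC2 or 0xC3), since those are the only lead
// bytes that can encode U+0080..U+00FF.
#include <cstdint>
static inline bool is_latin1_compatible_lead(uint8_t byte) {
  return byte < 0x80 || byte == 0xC2 || byte == 0xC3;
}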
- while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1( - in + pos, utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if (errors()) { - return 0; - } - if (pos < size) { - size_t howmany = - scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); - if (howmany == 0) { - return 0; - } - latin1_output += howmany; - } - return latin1_output - start; - } + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); - simdutf_really_inline result convert_with_errors(const char *in, size_t size, - char *latin1_output) { - size_t pos = 0; - char *start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the - // last 16 bytes, and if the data is valid, then it is entirely safe because - // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot - // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 8 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last - // leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store((int8_t *)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, - // it is not good enough. - static_assert( - (simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from - // in+pos onward, with the ability to go back up to pos bytes, and - // read size-pos bytes forward. 
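// The "80% efficiency (in the worst case)" figure quoted in the comments above,
// spelled out (not part of the simdutf sources): at most 12 of every 64 bytes are left
// over and reprocessed, so at least (64 - 12) / 64 = 81.25% of each block is consumed
// on the first pass; the comments round this down to 80%.
static_assert((64 - 12) * 100 / 64 == 81,
              "worst case: roughly 81% of a 64-byte block is consumed per pass");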
- result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( - pos, in + pos, size - pos, latin1_output); - res.count += pos; - return res; - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1( - in + pos, utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); } - } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos - // onward, with the ability to go back up to pos bytes, and read size-pos - // bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( - pos, in + pos, size - pos, latin1_output); - res.count += pos; - return res; - } - if (pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos - // onward, with the ability to go back up to pos bytes, and read size-pos - // bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( - pos, in + pos, size - pos, latin1_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - latin1_output += res.count; + for (; k < forward; k++) { + uint16_t word = big_endian ? 
scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + utf8_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } } - } - return result(error_code::SUCCESS, latin1_output - start); - } + buf += k; + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +} +/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */ +/* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. -}; // struct utf8_checker -} // namespace utf8_to_latin1 -} // unnamed namespace -} // namespace arm64 -} // namespace simdutf -/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ -/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ + Ad 1. -namespace simdutf { -namespace arm64 { -namespace { -namespace utf8_to_latin1 { -using namespace simd; + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it is an ASCII + char) or 2) two UTF8 bytes. -simdutf_really_inline size_t convert_valid(const char *in, size_t size, - char *latin1_output) { - size_t pos = 0; - char *start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last - // 16 bytes, and if the data is valid, then it is entirely safe because 16 - // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally - // assume that you have valid UTF-8 input, so we are going to go back from the - // end counting 8 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > - -65); // twos complement of -65 is 1011 1111 ... - } - // If the input is long enough, then we have that margin-1 is the eight last - // leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! 
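The scalar fallback above folds the surrogate-pair validity check and the decode into two subtractions: both offsets must fit in 10 bits, otherwise the pair is lone or mismatched. A standalone sketch of that step; the function name is mine:

```cpp
#include <cstdint>
#include <optional>

// Decode one UTF-16 surrogate pair into a code point, mirroring the fallback
// above. Returns std::nullopt where the SIMD routine reports
// error_code::SURROGATE.
std::optional<uint32_t> decode_surrogate_pair(uint16_t high, uint16_t low) {
  const uint16_t diff = uint16_t(high - 0xD800);  // offset into the high-surrogate range
  const uint16_t diff2 = uint16_t(low - 0xDC00);  // offset into the low-surrogate range
  if ((diff | diff2) > 0x3FF) {
    return std::nullopt;                          // lone or mismatched surrogate
  }
  return (uint32_t(diff) << 10) + diff2 + 0x10000;
}
```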
- while (pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store((int8_t *)latin1_output); - latin1_output += 64; - pos += 64; + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + Ad 2. + + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. + + We prepare data for all these three cases in two registers. + The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. + + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. + + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. + + + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ + +/* + Returns a pair: the first unprocessed byte from buf and utf32_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair +avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_output) { + const char16_t *end = buf + len; + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); + + while (end - buf >= 16) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } + + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = + static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: we extend all sixteen 16-bit code units to sixteen 32-bit code + // units + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(utf32_output + 8), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1))); + utf32_output += 16; + buf += 16; + // surrogate pair(s) in a register } else { - // you might think that a for-loop would work, but under Visual Studio, it - // is not good enough. - uint64_t utf8_continuation_mask = - input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in - // this case, we also have ASCII to account for. 
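The avx2_convert_utf16_to_utf32 kernel above classifies a whole register in two instructions: every UTF-16 surrogate, high or low, satisfies `(w & 0xF800) == 0xD800`, so a masked compare plus a byte movemask tells the loop whether the no-surrogate fast path applies. A minimal AVX2 sketch of just that test, using the same intrinsics as the kernel:

```cpp
#include <immintrin.h>
#include <cstdint>

// Returns 0 when the 16 code units contain no surrogates; otherwise the
// movemask carries two set bits per surrogate (the compare is 16-bit wide,
// _mm256_movemask_epi8 is byte-granular).
uint32_t surrogate_mask(const char16_t *buf /* at least 16 code units */) {
  const __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
  const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800);
  const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800);
  const __m256i is_surrogate =
      _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800);
  return static_cast<uint32_t>(_mm256_movemask_epi8(is_surrogate));
}
```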
- uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1( - in + pos, utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. + for (; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if ((word & 0xF800) != 0xD800) { + // No surrogate pair + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, utf32_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; } - } - if (pos < size) { - size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, - latin1_output); - latin1_output += howmany; - } - return latin1_output - start; + } // while + return std::make_pair(buf, utf32_output); } -} // namespace utf8_to_latin1 -} // namespace -} // namespace arm64 -} // namespace simdutf - // namespace simdutf -/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. 
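The comment above states the contract every `*_with_errors` kernel in this patch follows: on error, `result.count` is the absolute error position; on success, it is the index of the first unprocessed input unit and a scalar routine finishes the tail. A hedged sketch of the caller side of that contract; `result` here is a stand-in struct, and `simd_kernel` / `scalar_tail` are hypothetical placeholders for the kernel/scalar pairs used elsewhere in this patch:

```cpp
#include <cstddef>
#include <utility>

// Minimal stand-in for the simdutf result type: `count` is an error position
// when `error` is set, otherwise a unit count.
struct result { int error; size_t count; };

std::pair<result, char32_t *> simd_kernel(const char16_t *, size_t, char32_t *);
result scalar_tail(const char16_t *, size_t, char32_t *);

result convert_checked(const char16_t *buf, size_t len, char32_t *utf32_output) {
  std::pair<result, char32_t *> ret = simd_kernel(buf, len, utf32_output);
  if (ret.first.error) {
    return ret.first;                        // count is already absolute
  }
  char32_t *out = ret.second;
  if (ret.first.count != len) {              // kernel left a tail behind
    result scalar_res = scalar_tail(buf + ret.first.count,
                                    len - ret.first.count, out);
    if (scalar_res.error) {
      scalar_res.count += ret.first.count;   // rebase the error position
      return scalar_res;
    }
    out += scalar_res.count;                 // scalar count = units written
  }
  ret.first.count = size_t(out - utf32_output); // success: units written
  return ret.first;
}
```

The same shape appears in the arm64 wrapper functions being removed further down in this hunk.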
+*/ +template +std::pair +avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, + char32_t *utf32_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); -// placeholder scalars + while (end - buf >= 16) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + if (big_endian) { + const __m256i swap = _mm256_setr_epi8( + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, + 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); + in = _mm256_shuffle_epi8(in, swap); + } -// -// Implementation-specific overrides -// -namespace simdutf { -namespace arm64 { + // 1. Check if there are any surrogate word in the input chunk. + // We have also deal with situation when there is a surrogate word + // at the end of a chunk. + const __m256i surrogates_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); -simdutf_warn_unused int -implementation::detect_encodings(const char *input, - size_t length) const noexcept { - // If there is a BOM, then we trust it. - auto bom_encoding = simdutf::BOM::check_bom(input, length); - if (bom_encoding != encoding_type::unspecified) { - return bom_encoding; - } - // todo: reimplement as a one-pass algorithm. - int out = 0; - if (validate_utf8(input, length)) { - out |= encoding_type::UTF8; - } - if ((length % 2) == 0) { - if (validate_utf16le(reinterpret_cast(input), - length / 2)) { - out |= encoding_type::UTF16_LE; - } - } - if ((length % 4) == 0) { - if (validate_utf32(reinterpret_cast(input), length / 4)) { - out |= encoding_type::UTF32_LE; + // bitmask = 0x0000 if there are no surrogates + // = 0xc000 if the last word is a surrogate + const uint32_t surrogates_bitmask = + static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (surrogates_bitmask == 0x00000000) { + // case: we extend all sixteen 16-bit code units to sixteen 32-bit code + // units + _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); + _mm256_storeu_si256( + reinterpret_cast<__m256i *>(utf32_output + 8), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1))); + utf32_output += 16; + buf += 16; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + if ((word & 0xF800) != 0xD800) { + // No surrogate pair + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = + big_endian ? 
scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + utf32_output); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } + } + buf += k; } - } - return out; + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); } +/* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */ -simdutf_warn_unused bool -implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_utf8(buf, len); -} +/* begin file src/haswell/avx2_convert_utf32_to_latin1.cpp */ +std::pair +avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const size_t rounded_len = + len & ~0x1F; // Round down to nearest multiple of 32 -simdutf_warn_unused result implementation::validate_utf8_with_errors( - const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_utf8_with_errors(buf, len); -} + __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); -simdutf_warn_unused bool -implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_ascii(buf, len); -} + __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); -simdutf_warn_unused result implementation::validate_ascii_with_errors( - const char *buf, size_t len) const noexcept { - return arm64::utf8_validation::generic_validate_ascii_with_errors(buf, len); -} + for (size_t i = 0; i < rounded_len; i += 16) { + __m256i in1 = _mm256_loadu_si256((__m256i *)buf); + __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8)); -simdutf_warn_unused bool -implementation::validate_utf16le(const char16_t *buf, - size_t len) const noexcept { - if (simdutf_unlikely(len == 0)) { - // empty input is valid. protected the implementation from nullptr. - return true; - } - const char16_t *tail = arm_validate_utf16(buf, len); - if (tail) { - return scalar::utf16::validate(tail, - len - (tail - buf)); - } else { - return false; - } -} + __m256i check_combined = _mm256_or_si256(in1, in2); -simdutf_warn_unused bool -implementation::validate_utf16be(const char16_t *buf, - size_t len) const noexcept { - if (simdutf_unlikely(len == 0)) { - // empty input is valid. protected the implementation from nullptr. 
- return true; - } - const char16_t *tail = arm_validate_utf16(buf, len); - if (tail) { - return scalar::utf16::validate(tail, len - (tail - buf)); - } else { - return false; - } -} + if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { + return std::make_pair(nullptr, latin1_output); + } -simdutf_warn_unused result implementation::validate_utf16le_with_errors( - const char16_t *buf, size_t len) const noexcept { - if (simdutf_unlikely(len == 0)) { - return result(error_code::SUCCESS, 0); - } - result res = arm_validate_utf16_with_errors(buf, len); - if (res.count != len) { - result scalar_res = scalar::utf16::validate_with_errors( - buf + res.count, len - res.count); - return result(scalar_res.error, res.count + scalar_res.count); - } else { - return res; - } -} + // Turn UTF32 bytes into latin 1 bytes + __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask); + __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask); -simdutf_warn_unused result implementation::validate_utf16be_with_errors( - const char16_t *buf, size_t len) const noexcept { - if (simdutf_unlikely(len == 0)) { - return result(error_code::SUCCESS, 0); - } - result res = arm_validate_utf16_with_errors(buf, len); - if (res.count != len) { - result scalar_res = scalar::utf16::validate_with_errors( - buf + res.count, len - res.count); - return result(scalar_res.error, res.count + scalar_res.count); - } else { - return res; - } -} + // move Latin1 bytes to their correct spot + __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0); + __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1); + __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1); + __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2); -simdutf_warn_unused bool -implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { - if (simdutf_unlikely(len == 0)) { - // empty input is valid. protected the implementation from nullptr. 
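avx2_convert_utf32_to_latin1 above rests on a single invariant: a code point is representable in Latin-1 exactly when its upper 24 bits are clear, so OR-ing the two input registers and testing them against 0xFFFFFF00 vets sixteen code points at once, and the narrowing itself is an in-lane byte shuffle followed by a cross-lane permute. A self-contained sketch of one iteration; the function name is mine:

```cpp
#include <immintrin.h>

// Returns false if any of the 16 code points is above 0xFF; otherwise writes
// 16 Latin-1 bytes. The in-lane shuffle keeps byte 0 of every 32-bit element,
// the cross-lane permute packs both lanes' results into the low 128 bits.
bool pack_16_latin1(const char32_t *buf, char *latin1_output) {
  const __m256i high_bytes_mask = _mm256_set1_epi32((int)0xFFFFFF00);
  const __m256i shufmask = _mm256_set_epi8(
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0);
  const __m256i in1 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
  const __m256i in2 = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf + 8));
  if (!_mm256_testz_si256(_mm256_or_si256(in1, in2), high_bytes_mask)) {
    return false; // at least one code point needs more than one byte
  }
  const __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask);
  const __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask);
  const __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0);
  const __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1);
  const __m256i packed =
      _mm256_or_si256(_mm256_permutevar8x32_epi32(shuffled1, idx1),
                      _mm256_permutevar8x32_epi32(shuffled2, idx2));
  _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output),
                   _mm256_castsi256_si128(packed));
  return true;
}
```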
- return true; - } - const char32_t *tail = arm_validate_utf32le(buf, len); - if (tail) { - return scalar::utf32::validate(tail, len - (tail - buf)); - } else { - return false; - } -} + __m256i result = _mm256_or_si256(reshuffled1, reshuffled2); + _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(result)); -simdutf_warn_unused result implementation::validate_utf32_with_errors( - const char32_t *buf, size_t len) const noexcept { - if (simdutf_unlikely(len == 0)) { - return result(error_code::SUCCESS, 0); - } - result res = arm_validate_utf32le_with_errors(buf, len); - if (res.count != len) { - result scalar_res = - scalar::utf32::validate_with_errors(buf + res.count, len - res.count); - return result(scalar_res.error, res.count + scalar_res.count); - } else { - return res; + latin1_output += 16; + buf += 16; } + + return std::make_pair(buf, latin1_output); } +std::pair +avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const size_t rounded_len = + len & ~0x1F; // Round down to nearest multiple of 32 -simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( - const char *buf, size_t len, char *utf8_output) const noexcept { - std::pair ret = - arm_convert_latin1_to_utf8(buf, len, utf8_output); - size_t converted_chars = ret.second - utf8_output; + __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); + __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - converted_chars += scalar_converted_chars; - } - return converted_chars; -} + const char32_t *start = buf; -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - std::pair ret = - arm_convert_latin1_to_utf16(buf, len, utf16_output); - size_t converted_chars = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = - scalar::latin1_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - converted_chars += scalar_converted_chars; - } - return converted_chars; -} + for (size_t i = 0; i < rounded_len; i += 16) { + __m256i in1 = _mm256_loadu_si256((__m256i *)buf); + __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8)); -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - std::pair ret = - arm_convert_latin1_to_utf16(buf, len, utf16_output); - size_t converted_chars = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = - scalar::latin1_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - converted_chars += scalar_converted_chars; - } - return converted_chars; -} + __m256i check_combined = _mm256_or_si256(in1, in2); + + if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { + // Fallback to scalar code for handling errors + for (int k = 0; k < 8; k++) { + char32_t codepoint = buf[k]; + if (codepoint <= 0xFF) { + *latin1_output++ = static_cast(codepoint); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + buf += 8; + } else { + __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask); + __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask); 
-simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( - const char *buf, size_t len, char32_t *utf32_output) const noexcept { - std::pair ret = - arm_convert_latin1_to_utf32(buf, len, utf32_output); - size_t converted_chars = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - converted_chars += scalar_converted_chars; + __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0); + __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1); + __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1); + __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2); + + __m256i result = _mm256_or_si256(reshuffled1, reshuffled2); + _mm_storeu_si128((__m128i *)latin1_output, + _mm256_castsi256_si128(result)); + + latin1_output += 16; + buf += 16; + } } - return converted_chars; -} -simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( - const char *buf, size_t len, char *latin1_output) const noexcept { - utf8_to_latin1::validating_transcoder converter; - return converter.convert(buf, len, latin1_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); } +/* end file src/haswell/avx2_convert_utf32_to_latin1.cpp */ +/* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */ +std::pair +avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { + const char32_t *end = buf + len; + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + __m256i running_max = _mm256_setzero_si256(); + __m256i forbidden_bytemask = _mm256_setzero_si256(); -simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( - const char *buf, size_t len, char *latin1_output) const noexcept { - utf8_to_latin1::validating_transcoder converter; - return converter.convert_with_errors(buf, len, latin1_output); -} + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( - const char *buf, size_t len, char *latin1_output) const noexcept { - return arm64::utf8_to_latin1::convert_valid(buf, len, latin1_output); -} + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); + running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert(buf, len, utf16_output); -} + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation + __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), + _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder 
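The ASCII fast path above exploits saturating packs: UTF-32 values are clamped into 16-bit lanes, and if `_mm256_testz_si256(in_16, 0xff80)` confirms that no lane needs more than 7 bits, a second saturating pack narrows them straight to 16 output bytes. A minimal sketch of that path in isolation; the function name is mine:

```cpp
#include <immintrin.h>

// Returns true when all 16 code points are ASCII, in which case 16 bytes
// have been written. The & 0x7fffffff matches the kernel above: packus
// treats its inputs as signed, so the top bit is cleared first.
bool ascii_fast_path_16(const char32_t *buf, char *utf8_output) {
  const __m256i v_7fffffff = _mm256_set1_epi32((int)0x7fffffff);
  const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80);
  const __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
  const __m256i nextin = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf) + 1);
  __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff),
                                      _mm256_and_si256(nextin, v_7fffffff));
  in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); // undo the pack interleaving
  if (!_mm256_testz_si256(in_16, v_ff80)) {
    return false; // at least one unit needs two or more UTF-8 bytes
  }
  const __m128i bytes = _mm_packus_epi16(_mm256_castsi256_si128(in_16),
                                         _mm256_extractf128_si256(in_16, 1));
  _mm_storeu_si128(reinterpret_cast<__m128i *>(utf8_output), bytes);
  return true;
}
```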
converter; - return converter.convert(buf, len, utf16_output); -} + // Try to apply UTF-16 => UTF-8 routine on 256 bits + // (haswell/avx2_convert_utf16_to_utf8.cpp) -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert_with_errors(buf, len, - utf16_output); -} + if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - utf8_to_utf16::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf16_output); -} + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( - const char *input, size_t size, char16_t *utf16_output) const noexcept { - return utf8_to_utf16::convert_valid(input, size, - utf16_output); -} + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( - const char *input, size_t size, char16_t *utf16_output) const noexcept { - return utf8_to_utf16::convert_valid(input, size, - utf16_output); -} + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( - const char *buf, size_t len, char32_t *utf32_output) const noexcept { - utf8_to_utf32::validating_transcoder converter; - return converter.convert(buf, len, utf32_output); -} + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. 
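The t0..t4 steps above build, inside each 16-bit lane, the same two bytes a scalar encoder would emit for a code unit below 0x800; the pack_1_2_utf8_bytes shuffle afterwards drops the lanes that only need one byte and writes the bytes out in proper UTF-8 order. For reference, the scalar view of that 2-byte case; the helper name is mine:

```cpp
#include <cstdint>

// For a code unit in [0x80, 0x800): [0000|0aaa|aabb|bbbb] becomes the UTF-8
// pair [110a|aaaa] [10bb|bbbb]. The vector path computes the same two bytes
// per lane via t0 = in << 2, t1 = t0 & 0x1f00, t2 = in & 0x3f,
// t4 = t1 | t2 | 0xc080.
void encode_two_byte(uint16_t w, char *out) {
  out[0] = char((w >> 6) | 0b11000000);        // lead byte: top 5 payload bits
  out[1] = char((w & 0b111111) | 0b10000000);  // continuation: low 6 payload bits
}
```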
pack the bytes -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( - const char *buf, size_t len, char32_t *utf32_output) const noexcept { - utf8_to_utf32::validating_transcoder converter; - return converter.convert_with_errors(buf, len, utf32_output); -} + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( - const char *input, size_t size, char32_t *utf32_output) const noexcept { - return utf8_to_utf32::convert_valid(input, size, utf32_output); -} + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); -simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - std::pair ret = - arm_convert_utf16_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - latin1_output; + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf16_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; + // 6. adjust pointers + buf += 16; + continue; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = + static_cast(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm256_or_si256( + forbidden_bytemask, + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); -simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - std::pair ret = - arm_convert_utf16_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - latin1_output; + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf16_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes -simdutf_warn_unused result -implementation::convert_utf16le_to_latin1_with_errors( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - std::pair ret = - arm_convert_utf16_to_latin1_with_errors( - buf, len, latin1_output); - if (ret.first.error) { - return ret.first; - } // Can return directly since scalar fallback already found correct - // ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = - scalar::utf16_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = - ret.second - - latin1_output; // Set count to the number of 8-bit code units written - return ret.first; -} + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. -simdutf_warn_unused result -implementation::convert_utf16be_to_latin1_with_errors( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - std::pair ret = - arm_convert_utf16_to_latin1_with_errors(buf, len, - latin1_output); - if (ret.first.error) { - return ret.first; - } // Can return directly since scalar fallback already found correct - // ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = - scalar::utf16_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = - ret.second - - latin1_output; // Set count to the number of 8-bit code units written - return ret.first; -} + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - // optimization opportunity: implement a custom function. - return convert_utf16be_to_latin1(buf, len, latin1_output); -} + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - // optimization opportunity: implement a custom function. - return convert_utf16le_to_latin1(buf, len, latin1_output); -} + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - std::pair ret = - arm_convert_utf16_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf16_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - std::pair ret = - arm_convert_utf16_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf16_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of - // code units written even if finished - std::pair ret = - arm_convert_utf16_to_utf8_with_errors(buf, len, - utf8_output); - if (ret.first.error) { - return ret.first; - } // Can return directly since scalar fallback already found correct - // ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = - scalar::utf16_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = - ret.second - - utf8_output; // Set count to the number of 8-bit code units written - return ret.first; -} + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. + const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of - // code units written even if finished - std::pair ret = - arm_convert_utf16_to_utf8_with_errors(buf, len, - utf8_output); - if (ret.first.error) { - return ret.first; - } // Can return directly since scalar fallback already found correct - // ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = - scalar::utf16_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); + + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); + + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; } else { - ret.second += scalar_res.count; + // case: at least one 32-bit word is larger than 0xFFFF <=> it will + // produce four UTF-8 bytes. Let us do a scalar fallback. 
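The four mask0..mask3 lookups above follow one table convention, matching the "Each entry occupies 17 bytes" note earlier in this hunk: byte 0 of a row is the number of UTF-8 bytes produced for that 16-byte chunk, and bytes 1..16 are a pshufb pattern that compresses the chunk. A small sketch of how one row is consumed; the helper name is mine:

```cpp
#include <immintrin.h>
#include <cstdint>

// Consume one row of a 17-byte-per-entry pack table. Note that a full 16
// bytes are stored but the pointer only advances by row[0]; that over-write
// is what the safety margin in the surrounding loop accounts for.
inline char *emit_chunk(const uint8_t *row, __m128i expanded, char *utf8_output) {
  const __m128i shuffle = _mm_loadu_si128(reinterpret_cast<const __m128i *>(row + 1));
  _mm_storeu_si128(reinterpret_cast<__m128i *>(utf8_output),
                   _mm_shuffle_epi8(expanded, shuffle));
  return utf8_output + row[0];
}
```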
It may seem + // wasteful to use scalar code, but being efficient with SIMD may require + // large, non-trivial tables? + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { // 2-byte + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { + return std::make_pair(nullptr, utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; } + } // while + + // check for invalid input + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + if (static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32( + _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { + return std::make_pair(nullptr, utf8_output); } - ret.first.count = - ret.second - - utf8_output; // Set count to the number of 8-bit code units written - return ret.first; -} -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return convert_utf16le_to_utf8(buf, len, utf8_output); -} + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, utf8_output); + } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return convert_utf16be_to_utf8(buf, len, utf8_output); + return std::make_pair(buf, utf8_output); } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( - const char32_t *buf, size_t len, char *utf8_output) const noexcept { - if (simdutf_unlikely(len == 0)) { - return 0; - } - std::pair ret = - arm_convert_utf32_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} +std::pair +avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_output) { + const char32_t *end = buf + len; + const char32_t *start = buf; -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( - const char32_t *buf, size_t len, char *utf8_output) const noexcept { - if (simdutf_unlikely(len == 0)) { - return result(error_code::SUCCESS, 0); - } - // ret.first.count is always the position in the buffer, not the number of - // code units written even if finished - std::pair ret = - arm_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf8::convert_with_errors( - buf + ret.first.count, len - 
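The scalar fallback above is the full UTF-32 to UTF-8 case analysis, including the two rejections needed for 4-byte output: surrogate code points and values above 0x10FFFF. As a standalone sketch; the function name is mine, and the kernel itself signals failure by returning a null pointer instead:

```cpp
#include <cstddef>
#include <cstdint>

// Encode one UTF-32 code point as 1-4 UTF-8 bytes, mirroring the fallback
// above. Returns the number of bytes written, or 0 on invalid input.
size_t encode_utf8(uint32_t word, char *out) {
  if ((word & 0xFFFFFF80) == 0) {              // 1 byte (ASCII)
    out[0] = char(word);
    return 1;
  } else if ((word & 0xFFFFF800) == 0) {       // 2 bytes
    out[0] = char((word >> 6) | 0b11000000);
    out[1] = char((word & 0b111111) | 0b10000000);
    return 2;
  } else if ((word & 0xFFFF0000) == 0) {       // 3 bytes
    if (word >= 0xD800 && word <= 0xDFFF) {
      return 0;                                // surrogates are not scalar values
    }
    out[0] = char((word >> 12) | 0b11100000);
    out[1] = char(((word >> 6) & 0b111111) | 0b10000000);
    out[2] = char((word & 0b111111) | 0b10000000);
    return 3;
  } else {                                     // 4 bytes
    if (word > 0x10FFFF) {
      return 0;                                // beyond the Unicode range
    }
    out[0] = char((word >> 18) | 0b11110000);
    out[1] = char(((word >> 12) & 0b111111) | 0b10000000);
    out[2] = char(((word >> 6) & 0b111111) | 0b10000000);
    out[3] = char((word & 0b111111) | 0b10000000);
    return 4;
  }
}
```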
ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = - ret.second - - utf8_output; // Set count to the number of 8-bit code units written - return ret.first; -} + const __m256i v_0000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); + const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); + const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); + const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); + const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - std::pair ret = - arm_convert_utf16_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf16_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - std::pair ret = - arm_convert_utf16_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf16_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; + while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); + // Check for too large input + const __m256i max_input = + _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); + if (static_cast(_mm256_movemask_epi8( + _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + utf8_output); } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of - // code units written even if finished - std::pair ret = - arm_convert_utf16_to_utf32_with_errors(buf, len, - utf32_output); - if (ret.first.error) { - return ret.first; - } // Can return directly since scalar fallback already found correct - // ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = - scalar::utf16_to_utf32::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = - ret.second - - utf32_output; // Set count to the number of 8-bit code units written - return ret.first; -} + // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned + // saturation 
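The TOO_LARGE pre-check above is a compact use of unsigned max: taking the unsigned max of each element with 0x10FFFF and then comparing for equality only fails for elements that were strictly larger, so a full 0xffffffff movemask certifies all 16 code points at once. A minimal sketch of that test; the function name is mine:

```cpp
#include <immintrin.h>
#include <cstdint>

// True if any of the 16 code points exceeds 0x10FFFF.
bool any_too_large(const char32_t *buf /* at least 16 code points */) {
  const __m256i v_10ffff = _mm256_set1_epi32(0x10ffff);
  const __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf));
  const __m256i nextin = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(buf) + 1);
  // max(x, 0x10FFFF) == 0x10FFFF exactly when x <= 0x10FFFF.
  const __m256i max_input = _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff);
  const uint32_t ok = static_cast<uint32_t>(
      _mm256_movemask_epi8(_mm256_cmpeq_epi32(max_input, v_10ffff)));
  return ok != 0xffffffff;
}
```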
+ __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), + _mm256_and_si256(nextin, v_7fffffff)); + in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of - // code units written even if finished - std::pair ret = - arm_convert_utf16_to_utf32_with_errors(buf, len, - utf32_output); - if (ret.first.error) { - return ret.first; - } // Can return directly since scalar fallback already found correct - // ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = - scalar::utf16_to_utf32::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; + // Try to apply UTF-16 => UTF-8 routine on 256 bits + // (haswell/avx2_convert_utf16_to_utf8.cpp) + + if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16( + _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! } - } - ret.first.count = - ret.second - - utf32_output; // Set count to the number of 8-bit code units written - return ret.first; -} + // no bits set above 7th bit + const __m256i one_byte_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); + const uint32_t one_byte_bitmask = + static_cast(_mm256_movemask_epi8(one_byte_bytemask)); -simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - std::pair ret = - arm_convert_utf32_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - latin1_output; + // no bits set above 11th bit + const __m256i one_or_two_bytes_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); + const uint32_t one_or_two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); + if (one_or_two_bytes_bitmask == 0xffffffff) { + // 1. 
prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); + const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = _mm256_slli_epi16(in_16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = _mm256_and_si256(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = _mm256_and_si256(in_16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = _mm256_or_si256(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = _mm256_or_si256(t3, v_c080); -simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - std::pair ret = - arm_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); - if (ret.first.error) { - return ret.first; - } // Can return directly since scalar fallback already found correct - // ret.first.count - if (ret.first.count != len) { // All good so far, but not finished - result scalar_res = scalar::utf32_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = - ret.second - - latin1_output; // Set count to the number of 8-bit code units written - return ret.first; -} + // 2. merge ASCII and 2-byte codewords + const __m256i utf8_unpacked = + _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - std::pair ret = - arm_convert_utf32_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - latin1_output; + // 3. prepare bitmask for 8-bit lookup + const uint32_t M0 = one_byte_bitmask & 0x55555555; + const uint32_t M1 = M0 >> 7; + const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // 4. pack the bytes - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid( - ret.first, len - (ret.first - buf), ret.second); - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; + const uint8_t *row_2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> + 16)][0]; -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( - const char32_t *buf, size_t len, char *utf8_output) const noexcept { - // optimization opportunity: implement a custom function. 
- return convert_utf32_to_utf8(buf, len, utf8_output); -} + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - std::pair ret = - arm_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + const __m256i utf8_packed = _mm256_shuffle_epi8( + utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_castsi256_si128(utf8_packed)); + utf8_output += row[0]; + _mm_storeu_si128((__m128i *)utf8_output, + _mm256_extractf128_si256(utf8_packed, 1)); + utf8_output += row_2[0]; -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - std::pair ret = - arm_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; + // 6. adjust pointers + buf += 16; + continue; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + // Must check for overflow in packing + const __m256i saturation_bytemask = _mm256_cmpeq_epi32( + _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + const uint32_t saturation_bitmask = + static_cast(_mm256_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffffffff) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of - // code units written even if finished - std::pair ret = - arm_convert_utf32_to_utf16_with_errors(buf, len, - utf16_output); - if (ret.first.count != len) { - result scalar_res = - scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = - ret.second - - utf16_output; // Set count to the number of 8-bit code units written - return ret.first; -} + // Check for illegal surrogate code units + const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); + const __m256i forbidden_bytemask = + _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != + 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + utf8_output); + } -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of - // code units written 
even if finished - std::pair ret = - arm_convert_utf32_to_utf16_with_errors(buf, len, - utf16_output); - if (ret.first.count != len) { - result scalar_res = - scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = - ret.second - - utf16_output; // Set count to the number of 8-bit code units written - return ret.first; -} + const __m256i dup_even = _mm256_setr_epi16( + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, + 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return convert_utf32_to_utf16le(buf, len, utf16_output); -} + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return convert_utf32_to_utf16be(buf, len, utf16_output); -} + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return convert_utf16le_to_utf32(buf, len, utf32_output); -} + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return convert_utf16be_to_utf32(buf, len, utf32_output); -} + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. -void implementation::change_endianness_utf16(const char16_t *input, - size_t length, - char16_t *output) const noexcept { - utf16::change_endianness_utf16(input, length, output); -} + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). 
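As a companion to the case analysis above, a scalar sketch of case 3, the three-byte layout that the t2/s4 construction below ultimately assembles. Illustrative only; `encode_3_bytes` is a hypothetical name, and cases 1 and 2 are as in the earlier 1-/2-byte sketch.

```cpp
#include <cstdint>

// Hypothetical scalar counterpart of case 3 above: a non-surrogate 16-bit
// word [aaaa|bbbb|bbcc|cccc] in 0x0800..0xFFFF expands to three UTF-8 bytes.
// Surrogates are assumed to have been rejected by the 0xf800/0xd800 check.
static void encode_3_bytes(uint16_t w, char *out) {
  out[0] = static_cast<char>(0xE0 | (w >> 12));         // [1110|aaaa]
  out[1] = static_cast<char>(0x80 | ((w >> 6) & 0x3F)); // [10bb|bbbb]
  out[2] = static_cast<char>(0x80 | (w & 0x3F));        // [10cc|cccc]
}
```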
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ +#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); -simdutf_warn_unused size_t implementation::count_utf16le( - const char16_t *input, size_t length) const noexcept { - return utf16::count_code_points(input, length); -} + // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] + const __m256i s0 = _mm256_srli_epi16(in_16, 4); + // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] + const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] + const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); + const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m256i s4 = _mm256_xor_si256(s3, m0); +#undef simdutf_vec + + // 4. expand code units 16-bit => 32-bit + const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); + const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + const uint32_t mask = (one_byte_bitmask & 0x55555555) | + (one_or_two_bytes_bitmask & 0xaaaaaaaa); + // Due to the wider registers, the following path is less likely to be + // useful. + /*if(mask == 0) { + // We only have three-byte code units. Use fast path. 
+ const __m256i shuffle = + _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, + 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = + _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = + _mm256_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; + _mm_storeu_si128((__m128i*)utf8_output, + _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + continue; + }*/ + const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); + const __m128i utf8_0 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); -simdutf_warn_unused size_t implementation::count_utf16be( - const char16_t *input, size_t length) const noexcept { - return utf16::count_code_points(input, length); -} + const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); + const __m128i utf8_1 = + _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); -simdutf_warn_unused size_t -implementation::count_utf8(const char *input, size_t length) const noexcept { - return utf8::count_code_points(input, length); -} + const uint8_t mask2 = static_cast(mask >> 16); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; + const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); + const __m128i utf8_2 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); -simdutf_warn_unused size_t implementation::latin1_length_from_utf8( - const char *buf, size_t len) const noexcept { - return count_utf8(buf, len); -} + const uint8_t mask3 = static_cast(mask >> 24); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; + const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); + const __m128i utf8_3 = + _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); -simdutf_warn_unused size_t -implementation::latin1_length_from_utf16(size_t length) const noexcept { - return scalar::utf16::latin1_length_from_utf16(length); -} + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); + utf8_output += row0[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + utf8_output += row1[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_2); + utf8_output += row2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_3); + utf8_output += row3[0]; + buf += 16; + } else { + // case: at least one 32-bit word is larger than 0xFFFF <=> it will + // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem + // wasteful to use scalar code, but being efficient with SIMD may require + // large, non-trivial tables? 
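Backing up to the four row lookups above: a hedged sketch of what `row[0]` of `pack_1_2_3_utf8_bytes` stores, assuming the mask layout implied by the code (for each 16-bit word, the even bit of its 2-bit pair is set when the word is ASCII and the odd bit when it fits in at most two UTF-8 bytes). `utf8_length_for_mask` is a made-up helper, not simdutf code.

```cpp
#include <cstdint>

// Hedged sketch: total UTF-8 length of the four words covered by one 8-bit
// mask -- the value the vectorized code reads back as row[0] of the table.
static int utf8_length_for_mask(uint8_t mask) {
  int total = 0;
  for (int w = 0; w < 4; w++) {
    const bool one_byte = (mask >> (2 * w)) & 1;
    const bool one_or_two = (mask >> (2 * w + 1)) & 1;
    total += one_byte ? 1 : (one_or_two ? 2 : 3);
  }
  return total;
}
```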
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { // 2-byte + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { // 3-byte + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { // 4-byte + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), utf8_output); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while -simdutf_warn_unused size_t -implementation::latin1_length_from_utf32(size_t length) const noexcept { - return scalar::utf32::latin1_length_from_utf32(length); + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); } +/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */ +/* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */ +template +std::pair +avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const char32_t *end = buf + len; -simdutf_warn_unused size_t implementation::utf8_length_from_latin1( - const char *input, size_t length) const noexcept { - // See - // https://lemire.me/blog/2023/05/15/computing-the-utf-8-size-of-a-latin-1-string-quickly-arm-neon-edition/ - // credit to Pete Cawley - const uint8_t *data = reinterpret_cast(input); - uint64_t result = 0; - const int lanes = sizeof(uint8x16_t); - uint8_t rem = length % lanes; - const uint8_t *simd_end = data + (length / lanes) * lanes; - const uint8x16_t threshold = vdupq_n_u8(0x80); - for (; data < simd_end; data += lanes) { - // load 16 bytes - uint8x16_t input_vec = vld1q_u8(data); - // compare to threshold (0x80) - uint8x16_t withhighbit = vcgeq_u8(input_vec, threshold); - // vertical addition - result -= vaddvq_s8(vreinterpretq_s8_u8(withhighbit)); - } - return result + (length / lanes) * lanes + - scalar::latin1::utf8_length_from_latin1((const char *)simd_end, rem); -} + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + __m256i forbidden_bytemask = _mm256_setzero_si256(); -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( - const char16_t *input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); -} + while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( - const char16_t *input, size_t length) const noexcept { - return utf16::utf8_length_from_utf16(input, length); -} + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); -simdutf_warn_unused size_t -implementation::utf16_length_from_latin1(size_t length) const noexcept { - return 
scalar::latin1::utf16_length_from_latin1(length); -} + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = + static_cast(_mm256_movemask_epi8(saturation_bytemask)); -simdutf_warn_unused size_t -implementation::utf32_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf32_length_from_latin1(length); -} + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + forbidden_bytemask = _mm256_or_si256( + forbidden_bytemask, + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( - const char16_t *input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); -} + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), + _mm256_extractf128_si256(in, 1)); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, utf16_output); + } + *utf16_output++ = + big_endian + ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair(nullptr, utf16_output); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = + uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = + uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } + } -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( - const char16_t *input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); -} + // check for invalid input + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { + return std::make_pair(nullptr, utf16_output); + } -simdutf_warn_unused size_t implementation::utf16_length_from_utf8( - const char *input, size_t length) const noexcept { - return utf8::utf16_length_from_utf8(input, length); + return std::make_pair(buf, utf16_output); } -simdutf_warn_unused size_t implementation::utf8_length_from_utf32( - const char32_t *input, size_t length) const noexcept { - const uint32x4_t v_7f = vmovq_n_u32((uint32_t)0x7f); - const uint32x4_t v_7ff = vmovq_n_u32((uint32_t)0x7ff); - const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); - const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); - size_t pos = 0; - size_t count = 0; - for (; pos + 4 <= length; pos += 4) { - uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); - const uint32x4_t ascii_bytes_bytemask = vcleq_u32(in, v_7f); - const uint32x4_t one_two_bytes_bytemask = vcleq_u32(in, v_7ff); - const 
uint32x4_t two_bytes_bytemask = - veorq_u32(one_two_bytes_bytemask, ascii_bytes_bytemask); - const uint32x4_t three_bytes_bytemask = - veorq_u32(vcleq_u32(in, v_ffff), one_two_bytes_bytemask); +template +std::pair +avx2_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_output) { + const char32_t *start = buf; + const char32_t *end = buf + len; - const uint16x8_t reduced_ascii_bytes_bytemask = - vreinterpretq_u16_u32(vandq_u32(ascii_bytes_bytemask, v_1)); - const uint16x8_t reduced_two_bytes_bytemask = - vreinterpretq_u16_u32(vandq_u32(two_bytes_bytemask, v_1)); - const uint16x8_t reduced_three_bytes_bytemask = - vreinterpretq_u16_u32(vandq_u32(three_bytes_bytemask, v_1)); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 - const uint16x8_t compressed_bytemask0 = - vpaddq_u16(reduced_ascii_bytes_bytemask, reduced_two_bytes_bytemask); - const uint16x8_t compressed_bytemask1 = - vpaddq_u16(reduced_three_bytes_bytemask, reduced_three_bytes_bytemask); + while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { + __m256i in = _mm256_loadu_si256((__m256i *)buf); - size_t ascii_count = count_ones( - vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 0)); - size_t two_bytes_count = count_ones( - vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask0), 1)); - size_t three_bytes_count = count_ones( - vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask1), 0)); + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); - count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; - } - return count + - scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); -} + // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs + const __m256i saturation_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t saturation_bitmask = + static_cast(_mm256_movemask_epi8(saturation_bytemask)); -simdutf_warn_unused size_t implementation::utf16_length_from_utf32( - const char32_t *input, size_t length) const noexcept { - const uint32x4_t v_ffff = vmovq_n_u32((uint32_t)0xffff); - const uint32x4_t v_1 = vmovq_n_u32((uint32_t)0x1); - size_t pos = 0; - size_t count = 0; - for (; pos + 4 <= length; pos += 4) { - uint32x4_t in = vld1q_u32(reinterpret_cast(input + pos)); - const uint32x4_t surrogate_bytemask = vcgtq_u32(in, v_ffff); - const uint16x8_t reduced_bytemask = - vreinterpretq_u16_u32(vandq_u32(surrogate_bytemask, v_1)); - const uint16x8_t compressed_bytemask = - vpaddq_u16(reduced_bytemask, reduced_bytemask); - size_t surrogate_count = count_ones( - vgetq_lane_u64(vreinterpretq_u64_u16(compressed_bytemask), 0)); - count += 4 + surrogate_count; + if (saturation_bitmask == 0xffffffff) { + const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); + const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); + const __m256i forbidden_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); + if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != + 0x0) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + utf16_output); + } + + __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), + _mm256_extractf128_si256(in, 1)); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + } + 
_mm_storeu_si128((__m128i *)utf16_output, utf16_packed); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 7; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), utf16_output); + } + *utf16_output++ = + big_endian + ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), utf16_output); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (big_endian) { + high_surrogate = + uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); + low_surrogate = + uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } } - return count + - scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); -} -simdutf_warn_unused size_t implementation::utf32_length_from_utf8( - const char *input, size_t length) const noexcept { - return utf8::count_code_points(input, length); + return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); } +/* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */ -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( - const char *input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); -} +/* begin file src/haswell/avx2_convert_utf8_to_latin1.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" -simdutf_warn_unused result implementation::base64_to_binary( - const char *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); -} +// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + const __m128i in = _mm_loadu_si128((__m128i *)input); -simdutf_warn_unused full_result implementation::base64_to_binary_details( - const char *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? 
compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); -} + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & + 0xfff; // we are only processing 12 bytes in case it is not all ASCII -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( - const char16_t *input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); + latin1_output += 12; // We wrote 12 characters. + return 12; // We consumed 1 bytes. + } + /// We do not have a fast path available, so we fallback. + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if (idx >= 64) { + return consumed; + } + // Here we should have (idx < 64), if not, there is a bug in the validation or + // elsewhere. SIX (6) input code-code units this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small lookup + // table. + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + const __m128i latin1_packed = _mm_packus_epi16(composed, composed); + // writing 8 bytes even though we only care about the first 6 bytes. + // performance note: it would be faster to use _mm_storeu_si128, we should + // investigate. + _mm_storel_epi64((__m128i *)latin1_output, latin1_packed); + latin1_output += 6; // We wrote 6 bytes. + return consumed; } +/* end file src/haswell/avx2_convert_utf8_to_latin1.cpp */ -simdutf_warn_unused result implementation::base64_to_binary( - const char16_t *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); -} +/* begin file src/haswell/avx2_base64.cpp */ +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. + * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. 
Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + */ -simdutf_warn_unused full_result implementation::base64_to_binary_details( - const char16_t *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); -} +template +simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) { + // credit: Wojciech Muła + __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); + const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); + result = + _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); + __m256i shift_LUT; + if (base64_url) { + shift_LUT = _mm256_setr_epi8( + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0, -simdutf_warn_unused size_t implementation::base64_length_from_binary( - size_t length, base64_options options) const noexcept { - return scalar::base64::base64_length_from_binary(length, options); -} + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); + } else { + shift_LUT = _mm256_setr_epi8( + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0, -size_t implementation::binary_to_base64(const char *input, size_t length, - char *output, - base64_options options) const noexcept { - return encode_base64(output, input, length, options); + 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); + } + + result = _mm256_shuffle_epi8(shift_LUT, result); + return _mm256_add_epi8(result, input); } -} // namespace arm64 -} // namespace simdutf +template +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + // credit: Wojciech Muła + const uint8_t *input = (const uint8_t *)src; -/* begin file src/simdutf/arm64/end.h */ -/* end file src/simdutf/arm64/end.h */ -/* end file src/arm64/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_FALLBACK -/* begin file src/fallback/implementation.cpp */ -/* begin file src/simdutf/fallback/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "fallback" -// #define SIMDUTF_IMPLEMENTATION fallback -/* end file src/simdutf/fallback/begin.h */ + uint8_t *out = (uint8_t *)dst; + const __m256i shuf = + _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, + 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); + size_t i = 0; + for (; i + 100 <= srclen; i += 96) { + const __m128i lo0 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 0)); + const __m128i hi0 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 1)); + const __m128i lo1 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 2)); + const __m128i hi1 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 3)); + const __m128i 
lo2 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 4)); + const __m128i hi2 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 5)); + const __m128i lo3 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 6)); + const __m128i hi3 = _mm_loadu_si128( + reinterpret_cast(input + i + 4 * 3 * 7)); + __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); + __m256i in1 = _mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); + __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); + __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); + const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t1_0 = + _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); + const __m256i t1_1 = + _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); + const __m256i t1_2 = + _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); + const __m256i t1_3 = + _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); + const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); + const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); + const __m256i t3_0 = + _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); + const __m256i t3_1 = + _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); + const __m256i t3_2 = + _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); + const __m256i t3_3 = + _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); + const __m256i input0 = _mm256_or_si256(t1_0, t3_0); + const __m256i input1 = _mm256_or_si256(t1_1, t3_1); + const __m256i input2 = _mm256_or_si256(t1_2, t3_2); + const __m256i input3 = _mm256_or_si256(t1_3, t3_3); -#include -#include + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved(input0)); + out += 32; -namespace simdutf { -namespace fallback { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved(input1)); + out += 32; -simdutf_warn_unused int -implementation::detect_encodings(const char *input, - size_t length) const noexcept { - // If there is a BOM, then we trust it. - auto bom_encoding = simdutf::BOM::check_bom(input, length); - if (bom_encoding != encoding_type::unspecified) { - return bom_encoding; - } - // todo: reimplement as a one-pass algorithm. 
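The encoder above never consults a 64-entry table per character; following Muła's technique, it adds a class-dependent offset to each 6-bit value, which is exactly what `shift_LUT` encodes. A scalar sketch of the same mapping (illustrative; the helper name is made up):

```cpp
#include <cstdint>

// Illustrative scalar equivalent of lookup_pshufb_improved above: translate a
// 6-bit value to its base64 character by adding an offset chosen per class,
// i.e. the offsets stored in shift_LUT ('A', 'a' - 26, '0' - 52, ...).
static char base64_char(uint8_t v, bool url_alphabet) {
  if (v < 26) return static_cast<char>('A' + v);          // offset 'A'
  if (v < 52) return static_cast<char>('a' + (v - 26));   // offset 'a' - 26
  if (v < 62) return static_cast<char>('0' + (v - 52));   // offset '0' - 52
  if (v == 62) return url_alphabet ? '-' : '+';           // offset '-'/'+' - 62
  return url_alphabet ? '_' : '/';                        // offset '_'/'/' - 63
}
```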
- int out = 0; - if (validate_utf8(input, length)) { - out |= encoding_type::UTF8; - } - if ((length % 2) == 0) { - if (validate_utf16le(reinterpret_cast(input), - length / 2)) { - out |= encoding_type::UTF16_LE; - } + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved(input2)); + out += 32; + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved(input3)); + out += 32; } - if ((length % 4) == 0) { - if (validate_utf32(reinterpret_cast(input), length / 4)) { - out |= encoding_type::UTF32_LE; - } + for (; i + 28 <= srclen; i += 24) { + // lo = [xxxx|DDDC|CCBB|BAAA] + // hi = [xxxx|HHHG|GGFF|FEEE] + const __m128i lo = + _mm_loadu_si128(reinterpret_cast(input + i)); + const __m128i hi = + _mm_loadu_si128(reinterpret_cast(input + i + 4 * 3)); + + // bytes from groups A, B and C are needed in separate 32-bit lanes + // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA] + __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); + + // this part is well commented in encode.sse.cpp + + const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); + const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); + const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); + const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); + const __m256i indices = _mm256_or_si256(t1, t3); + + _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), + lookup_pshufb_improved(indices)); + out += 32; } - return out; + return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i, + srclen - i, options); } -simdutf_warn_unused bool -implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return scalar::utf8::validate(buf, len); -} +static inline void compress(__m128i data, uint16_t mask, char *output) { + if (mask == 0) { + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data); + return; + } + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. -simdutf_warn_unused result implementation::validate_utf8_with_errors( - const char *buf, size_t len) const noexcept { - return scalar::utf8::validate_with_errors(buf, len); -} + __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2], + tables::base64::thintable_epi8[mask1]); + // we increment by 0x08 the second half of the mask + shufmask = + _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); + // this is the version "nearly pruned" + __m128i pruned = _mm_shuffle_epi8(data, shufmask); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = tables::base64::BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. 
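The two-step pshufb compression above has a straightforward scalar meaning. A sketch, assuming set mask bits mark bytes to discard (which matches the popcount of the inverted mask used to advance the output); `compress_scalar` is a hypothetical helper, not part of the patch.

```cpp
#include <cstdint>

// Illustrative scalar model of compress() above: bytes whose mask bit is set
// are dropped, the remaining bytes are written out contiguously, and the
// caller advances the output by popcount of the inverted mask.
static int compress_scalar(const uint8_t *in, uint16_t drop_mask, uint8_t *out) {
  int written = 0;
  for (int i = 0; i < 16; i++) {
    if (((drop_mask >> i) & 1) == 0) {
      out[written++] = in[i];
    }
  }
  return written; // == popcount(~drop_mask & 0xFFFF)
}
```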
+ __m128i compactmask = _mm_loadu_si128(reinterpret_cast( + tables::base64::pshufb_combine_table + pop1 * 8)); + __m128i answer = _mm_shuffle_epi8(pruned, compactmask); -simdutf_warn_unused bool -implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return scalar::ascii::validate(buf, len); + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); } -simdutf_warn_unused result implementation::validate_ascii_with_errors( - const char *buf, size_t len) const noexcept { - return scalar::ascii::validate_with_errors(buf, len); +static inline void compress(__m256i data, uint32_t mask, char *output) { + if (mask == 0) { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data); + return; + } + compress(_mm256_castsi256_si128(data), uint16_t(mask), output); + compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16), + output + _mm_popcnt_u32(~mask & 0xFFFF)); } -simdutf_warn_unused bool -implementation::validate_utf16le(const char16_t *buf, - size_t len) const noexcept { - return scalar::utf16::validate(buf, len); -} +struct block64 { + __m256i chunks[2]; +}; -simdutf_warn_unused bool -implementation::validate_utf16be(const char16_t *buf, - size_t len) const noexcept { - return scalar::utf16::validate(buf, len); -} +template +static inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) { + const __m256i ascii_space_tbl = + _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, + 0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0); + // credit: aqrit + __m256i delta_asso; + if (base64_url) { + delta_asso = + _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, + 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, + 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF); + } else { + delta_asso = _mm256_setr_epi8( + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); + } -simdutf_warn_unused result implementation::validate_utf16le_with_errors( - const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate_with_errors(buf, len); -} + __m256i delta_values; + if (base64_url) { + delta_values = _mm256_setr_epi8( + 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), + uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0), + uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), + uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), + uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); + } else { + delta_values = _mm256_setr_epi8( + int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04), + int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00), + int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), + int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), + int8_t(0xB9), int8_t(0xB9)); + } + __m256i check_asso; -simdutf_warn_unused result implementation::validate_utf16be_with_errors( - const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate_with_errors(buf, len); -} + if (base64_url) { + check_asso = + _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3, + 0x7, 0xB, 0xE, 0xB, 
0x6, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1, + 0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6); + } else { -simdutf_warn_unused bool -implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { - return scalar::utf32::validate(buf, len); + check_asso = _mm256_setr_epi8( + 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, + 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); + } + __m256i check_values; + if (base64_url) { + check_values = _mm256_setr_epi8( + uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), + uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6), + uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80), + 0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), + uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), + uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, + uint8_t(0x80), 0x0, uint8_t(0x80)); + } else { + check_values = _mm256_setr_epi8( + int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF), + int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86), + int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91), + int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), + int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), + int8_t(0x91), int8_t(0x80)); + } + const __m256i shifted = _mm256_srli_epi32(*src, 3); + const __m256i delta_hash = + _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted); + const __m256i check_hash = + _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted); + const __m256i out = + _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src); + const __m256i chk = + _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src); + const int mask = _mm256_movemask_epi8(chk); + if (mask) { + __m256i ascii_space = + _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src); + *error = (mask ^ _mm256_movemask_epi8(ascii_space)); + } + *src = out; + return (uint32_t)mask; } -simdutf_warn_unused result implementation::validate_utf32_with_errors( - const char32_t *buf, size_t len) const noexcept { - return scalar::utf32::validate_with_errors(buf, len); +template +static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) { + uint32_t err0 = 0; + uint32_t err1 = 0; + uint64_t m0 = to_base64_mask(&b->chunks[0], &err0); + uint64_t m1 = to_base64_mask(&b->chunks[1], &err1); + *error = err0 | ((uint64_t)err1 << 32); + return m0 | (m1 << 32); } -simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( - const char *buf, size_t len, char *utf8_output) const noexcept { - return scalar::latin1_to_utf8::convert(buf, len, utf8_output); +static inline void copy_block(block64 *b, char *output) { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), b->chunks[0]); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), b->chunks[1]); } -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::latin1_to_utf16::convert(buf, len, - utf16_output); +static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { + uint64_t nmask = ~mask; + compress(b->chunks[0], uint32_t(mask), output); + compress(b->chunks[1], uint32_t(mask >> 32), + output + _mm_popcnt_u64(nmask & 
0xFFFFFFFF)); + return _mm_popcnt_u64(nmask); } -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::latin1_to_utf16::convert(buf, len, - utf16_output); +// The caller of this function is responsible to ensure that there are 64 bytes +// available from reading at src. The data is read into a block64 structure. +static inline void load_block(block64 *b, const char *src) { + b->chunks[0] = _mm256_loadu_si256(reinterpret_cast(src)); + b->chunks[1] = + _mm256_loadu_si256(reinterpret_cast(src + 32)); } -simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( - const char *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::latin1_to_utf32::convert(buf, len, utf32_output); +// The caller of this function is responsible to ensure that there are 128 bytes +// available from reading at src. The data is read into a block64 structure. +static inline void load_block(block64 *b, const char16_t *src) { + __m256i m1 = _mm256_loadu_si256(reinterpret_cast(src)); + __m256i m2 = _mm256_loadu_si256(reinterpret_cast(src + 16)); + __m256i m3 = _mm256_loadu_si256(reinterpret_cast(src + 32)); + __m256i m4 = _mm256_loadu_si256(reinterpret_cast(src + 48)); + __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20); + __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x31); + __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20); + __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x31); + b->chunks[0] = _mm256_packus_epi16(m1p, m2p); + b->chunks[1] = _mm256_packus_epi16(m3p, m4p); } -simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( - const char *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf8_to_latin1::convert(buf, len, latin1_output); -} +static inline void base64_decode(char *out, __m256i str) { + // credit: aqrit + const __m256i pack_shuffle = + _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, + 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1); + const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140)); + const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000)); + const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle); -simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( - const char *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf8_to_latin1::convert_with_errors(buf, len, latin1_output); + // Store the output: + _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2)); + _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1)); } - -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( - const char *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf8_to_latin1::convert_valid(buf, len, latin1_output); +// decode 64 bytes and output 48 bytes +static inline void base64_decode_block(char *out, const char *src) { + base64_decode(out, + _mm256_loadu_si256(reinterpret_cast(src))); + base64_decode(out + 24, _mm256_loadu_si256( + reinterpret_cast(src + 32))); } - -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert(buf, len, - utf16_output); +static inline void base64_decode_block_safe(char *out, const char *src) { + base64_decode(out, + _mm256_loadu_si256(reinterpret_cast(src))); + char buffer[32]; // We enforce safety with a 
buffer. + base64_decode( + buffer, _mm256_loadu_si256(reinterpret_cast(src + 32))); + std::memcpy(out + 24, buffer, 24); } - -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert(buf, len, - utf16_output); +static inline void base64_decode_block(char *out, block64 *b) { + base64_decode(out, b->chunks[0]); + base64_decode(out + 24, b->chunks[1]); } - -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert_with_errors( - buf, len, utf16_output); +static inline void base64_decode_block_safe(char *out, block64 *b) { + base64_decode(out, b->chunks[0]); + char buffer[32]; // We enforce safety with a buffer. + base64_decode(buffer, b->chunks[1]); + std::memcpy(out + 24, buffer, 24); } -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert_with_errors( - buf, len, utf16_output); -} +template +full_result +compress_decode_base64(char *dst, const chartype *src, size_t srclen, + base64_options options, + last_chunk_handling_options last_chunk_options) { + const uint8_t *to_base64 = base64_url ? tables::base64::to_base64_url_value + : tables::base64::to_base64_value; + size_t equallocation = + srclen; // location of the first padding character if any + // skip trailing spaces + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + to_base64[uint8_t(src[srclen - 1])] == 64) { + srclen--; + } + size_t equalsigns = 0; + if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; + srclen--; + equalsigns = 1; + // skip trailing spaces + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + to_base64[uint8_t(src[srclen - 1])] == 64) { + srclen--; + } + if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; + srclen--; + equalsigns = 2; + } + } + if (srclen == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, 0, 0}; + } + char *end_of_safe_64byte_zone = + (srclen + 3) / 4 * 3 >= 63 ? dst + (srclen + 3) / 4 * 3 - 63 : dst; + + const chartype *const srcinit = src; + const char *const dstinit = dst; + const chartype *const srcend = src + srclen; + + constexpr size_t block_size = 6; + static_assert(block_size >= 2, "block_size must be at least two"); + char buffer[block_size * 64]; + char *bufferptr = buffer; + if (srclen >= 64) { + const chartype *const srcend64 = src + srclen - 64; + while (src <= srcend64) { + block64 b; + load_block(&b, src); + src += 64; + uint64_t error = 0; + uint64_t badcharmask = to_base64_mask(&b, &error); + if (error) { + src -= 64; + size_t error_offset = _tzcnt_u64(error); + return {error_code::INVALID_BASE64_CHARACTER, + size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; + } + if (badcharmask != 0) { + // optimization opportunity: check for simple masks like those made of + // continuous 1s followed by continuous 0s. And masks containing a + // single bad character. 
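The prologue above strips trailing whitespace and records up to two '=' signs; the consistency rule the function enforces after decoding can be stated in scalar terms roughly as follows. Hedged sketch with a hypothetical helper name, not library code.

```cpp
#include <cstddef>

// Hedged restatement of the padding check performed after decoding: '=' signs
// are only consistent with a final quantum of one or two payload bytes, and
// the number of '=' must pad that quantum back up to four characters
// (2 bytes -> one '=', 1 byte -> two '=').
static bool padding_is_consistent(size_t decoded_bytes, size_t equalsigns) {
  if (equalsigns == 0) {
    return true;
  }
  if (decoded_bytes % 3 == 0) {
    return false; // a complete final quantum admits no padding
  }
  return (decoded_bytes % 3) + 1 + equalsigns == 4;
}
```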
+ bufferptr += compress_block(&b, badcharmask, bufferptr); + } else if (bufferptr != buffer) { + copy_block(&b, bufferptr); + bufferptr += 64; + } else { + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, &b); + } else { + base64_decode_block(dst, &b); + } + dst += 48; + } + if (bufferptr >= (block_size - 1) * 64 + buffer) { + for (size_t i = 0; i < (block_size - 2); i++) { + base64_decode_block(dst, buffer + i * 64); + dst += 48; + } + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, buffer + (block_size - 2) * 64); + } else { + base64_decode_block(dst, buffer + (block_size - 2) * 64); + } + dst += 48; + std::memcpy(buffer, buffer + (block_size - 1) * 64, + 64); // 64 might be too much + bufferptr -= (block_size - 1) * 64; + } + } + } + + char *buffer_start = buffer; + // Optimization note: if this is almost full, then it is worth our + // time, otherwise, we should just decode directly. + int last_block = (int)((bufferptr - buffer_start) % 64); + if (last_block != 0 && srcend - src + last_block >= 64) { -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert_valid(buf, len, - utf16_output); -} + while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { + uint8_t val = to_base64[uint8_t(*src)]; + *bufferptr = char(val); + if (!scalar::base64::is_eight_byte(*src) || val > 64) { + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + bufferptr += (val <= 63); + src++; + } + } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf8_to_utf16::convert_valid(buf, len, - utf16_output); -} + for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { + if (dst >= end_of_safe_64byte_zone) { + base64_decode_block_safe(dst, buffer_start); + } else { + base64_decode_block(dst, buffer_start); + } + dst += 48; + } + if ((bufferptr - buffer_start) % 64 != 0) { + while (buffer_start + 4 < bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; + triple = scalar::utf32::swap_bytes(triple); + std::memcpy(dst, &triple, 4); -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( - const char *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf8_to_utf32::convert(buf, len, utf32_output); -} + dst += 3; + buffer_start += 4; + } + if (buffer_start + 4 <= bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; + triple = scalar::utf32::swap_bytes(triple); + std::memcpy(dst, &triple, 3); -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( - const char *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf8_to_utf32::convert_with_errors(buf, len, utf32_output); + dst += 3; + buffer_start += 4; + } + // we may have 1, 2 or 3 bytes left and we need to decode them so let us + // backtrack + int leftover = int(bufferptr - buffer_start); + while (leftover > 0) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + src--; + 
leftover--; + } + } + if (src < srcend + equalsigns) { + full_result r = scalar::base64::base64_tail_decode( + dst, src, srcend - src, equalsigns, options, last_chunk_options); + r.input_count += size_t(src - srcinit); + if (r.error == error_code::INVALID_BASE64_CHARACTER || + r.error == error_code::BASE64_EXTRA_BITS) { + return r; + } else { + r.output_count += size_t(dst - dstinit); + } + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.output_count % 3 == 0) || + ((r.output_count % 3) + 1 + equalsigns != 4)) { + r.error = error_code::INVALID_BASE64_CHARACTER; + r.input_count = equallocation; + } + } + return r; + } + if (equalsigns > 0) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; + } + } + return {SUCCESS, srclen, size_t(dst - dstinit)}; } +/* end file src/haswell/avx2_base64.cpp */ -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( - const char *input, size_t size, char32_t *utf32_output) const noexcept { - return scalar::utf8_to_utf32::convert_valid(input, size, utf32_output); -} +} // unnamed namespace +} // namespace haswell +} // namespace simdutf -simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf16_to_latin1::convert(buf, len, - latin1_output); -} +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace haswell { +namespace { -simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf16_to_latin1::convert(buf, len, - latin1_output); -} +// Walks through a buffer in block-sized increments, loading the last part with +// spaces +template struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 + * (in which case this function fills the buffer with spaces and returns 0. In + * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder + * block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. 
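Jumping ahead of the member definitions that follow, a minimal usage sketch of `buf_block_reader`: iterate full STEP_SIZE blocks, then process the space-padded remainder reported by `get_remainder`. Illustrative only; `for_each_block` is a made-up wrapper and assumes it is instantiated where the template is visible.

```cpp
#include <cstddef>
#include <cstdint>

// Minimal usage sketch (not from the library): walk a buffer in STEP_SIZE
// blocks, then process the padded remainder block if there is one.
template <size_t STEP_SIZE, typename Callback>
void for_each_block(const uint8_t *data, size_t len, Callback &&process) {
  buf_block_reader<STEP_SIZE> reader(data, len);
  while (reader.has_full_block()) {
    process(reader.full_block()); // STEP_SIZE bytes, all meaningful
    reader.advance();
  }
  uint8_t last[STEP_SIZE];
  if (size_t n = reader.get_remainder(last); n != 0) {
    process(last); // only the first n bytes are meaningful, rest is 0x20
  }
}
```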
+ */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); -simdutf_warn_unused result -implementation::convert_utf16le_to_latin1_with_errors( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf16_to_latin1::convert_with_errors( - buf, len, latin1_output); -} +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; -simdutf_warn_unused result -implementation::convert_utf16be_to_latin1_with_errors( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf16_to_latin1::convert_with_errors( - buf, len, latin1_output); +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text_64(const uint8_t *text) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf16_to_latin1::convert_valid( - buf, len, latin1_output); +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text(const simd8x64 &in) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + if (buf[i] < ' ') { + buf[i] = '_'; + } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf16_to_latin1::convert_valid(buf, len, - latin1_output); +simdutf_unused static char *format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i = 0; i < 64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert(buf, len, - utf8_output); -} +template +simdutf_really_inline +buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) + : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 
0 : len - STEP_SIZE}, + idx{0} {} -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +template +simdutf_really_inline size_t buf_block_reader::block_index() { + return idx; } -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_with_errors( - buf, len, utf8_output); +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; } -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_with_errors( - buf, len, utf8_output); +template +simdutf_really_inline const uint8_t * +buf_block_reader::full_block() const { + return &buf[idx]; } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_valid(buf, len, - utf8_output); +template +simdutf_really_inline size_t +buf_block_reader::get_remainder(uint8_t *dst) const { + if (len == idx) { + return 0; + } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, + STEP_SIZE); // std::memset STEP_SIZE because it is more efficient + // to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_valid(buf, len, - utf8_output); +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; } -simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf32_to_latin1::convert(buf, len, latin1_output); -} +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_validation { -simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf32_to_latin1::convert_with_errors(buf, len, latin1_output); -} +using namespace simd; -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - return scalar::utf32_to_latin1::convert_valid(buf, len, latin1_output); -} +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t 
OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( - const char32_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert(buf, len, utf8_output); -} + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( - const char32_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output); -} + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( - const char32_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); -} + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert(buf, len, - utf16_output); -} + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert(buf, len, - utf16_output); + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & 
byte_1_low & byte_2_high); } - -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_with_errors( - buf, len, utf16_output); +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; } -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_with_errors( - buf, len, utf16_output); +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = {255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 0b11110000u - 1, + 0b11100000u - 1, + 0b11000000u - 1}; + const simd8 max_value( + &max_array[sizeof(max_array) - sizeof(simd8)]); + return input.gt_bits(max_value); } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_valid( - buf, len, utf16_output); -} +struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + simd8 prev_incomplete; -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_valid(buf, len, - utf16_output); -} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert(buf, len, - utf32_output); -} + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. 
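+    // For instance, a buffer whose final byte is a lone lead byte such as 0xE2
+    // leaves prev_incomplete set after the last block, so the OR below reports
+    // it as an error at end of input.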
+ this->error |= this->prev_incomplete; + } -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert(buf, len, - utf32_output); -} + simdutf_really_inline void check_next_input(const simd8x64 &input) { + if (simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + static_assert((simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = + is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; + } + } -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_with_errors( - buf, len, utf32_output); -} + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_with_errors( - buf, len, utf32_output); -} +}; // struct utf8_checker +} // namespace utf8_validation -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_valid( - buf, len, utf32_output); -} +using utf8_validation::utf8_checker; -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_valid(buf, len, - utf32_output); -} +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_validation { -void implementation::change_endianness_utf16(const char16_t *input, - size_t length, - char16_t *output) const noexcept { - scalar::utf16::change_endianness_utf16(input, length, output); +/** + * Validates that the string is actual UTF-8. 
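+ *
+ * For example, well-formed input returns true, while inputs containing the
+ * overlong two-byte encoding 0xC0 0x80 or starting with a stray continuation
+ * byte (0x80) are flagged by the checker above and return false. The char *
+ * overload below simply forwards to this template.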
+ */ +template +bool generic_validate_utf8(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); } -simdutf_warn_unused size_t implementation::count_utf16le( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::count_code_points(input, length); +bool generic_validate_utf8(const char *input, size_t length) { + return generic_validate_utf8( + reinterpret_cast(input), length); } -simdutf_warn_unused size_t implementation::count_utf16be( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::count_code_points(input, length); +/** + * Validates that the string is actual UTF-8 and stops on errors. + */ +template +result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input + count), length - count); + res.count += count; + return res; + } + reader.advance(); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); + } } -simdutf_warn_unused size_t -implementation::count_utf8(const char *input, size_t length) const noexcept { - return scalar::utf8::count_code_points(input, length); +result generic_validate_utf8_with_errors(const char *input, size_t length) { + return generic_validate_utf8_with_errors( + reinterpret_cast(input), length); } -simdutf_warn_unused size_t implementation::latin1_length_from_utf8( - const char *buf, size_t len) const noexcept { - return scalar::utf8::count_code_points(buf, len); +template +bool generic_validate_ascii(const uint8_t *input, size_t length) { + buf_block_reader<64> reader(input, length); + uint8_t blocks[64]{}; + simd::simd8x64 running_or(blocks); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + running_or |= in; + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + running_or |= in; + return running_or.is_ascii(); } -simdutf_warn_unused size_t -implementation::latin1_length_from_utf16(size_t length) const noexcept { - return scalar::utf16::latin1_length_from_utf16(length); +bool generic_validate_ascii(const char *input, size_t length) { + return generic_validate_ascii( + reinterpret_cast(input), length); } -simdutf_warn_unused size_t -implementation::latin1_length_from_utf32(size_t length) const noexcept { - return length; -} +template +result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) { + 
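+  // Same block-by-block structure as generic_validate_utf8_with_errors above,
+  // but each 64-byte block only needs a plain is_ascii() test; on failure the
+  // scalar validator pinpoints the exact error position.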
buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); -simdutf_warn_unused size_t implementation::utf8_length_from_latin1( - const char *input, size_t length) const noexcept { - size_t answer = length; - size_t i = 0; - auto pop = [](uint64_t v) { - return (size_t)(((v >> 7) & UINT64_C(0x0101010101010101)) * - UINT64_C(0x0101010101010101) >> - 56); - }; - for (; i + 32 <= length; i += 32) { - uint64_t v; - memcpy(&v, input + i, 8); - answer += pop(v); - memcpy(&v, input + i + 8, sizeof(v)); - answer += pop(v); - memcpy(&v, input + i + 16, sizeof(v)); - answer += pop(v); - memcpy(&v, input + i + 24, sizeof(v)); - answer += pop(v); - } - for (; i + 8 <= length; i += 8) { - uint64_t v; - memcpy(&v, input + i, sizeof(v)); - answer += pop(v); + count += 64; } - for (; i + 1 <= length; i += 1) { - answer += static_cast(input[i]) >> 7; + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); } - return answer; } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::utf8_length_from_utf16(input, - length); +result generic_validate_ascii_with_errors(const char *input, size_t length) { + return generic_validate_ascii_with_errors( + reinterpret_cast(input), length); } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::utf8_length_from_utf16(input, length); -} +} // namespace utf8_validation +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::utf32_length_from_utf16(input, - length); -} +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf16 { -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::utf32_length_from_utf16(input, length); -} +using namespace simd; -simdutf_warn_unused size_t -implementation::utf16_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf16_length_from_latin1(length); +template +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char16_t *utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the + // generic directory. + size_t pos = 0; + char16_t *start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the + // mask far more than 64 bytes. 
+ simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow + // path. Anything that is not a continuation mask is a 'leading byte', + // that is, the start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* + // of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16( + input + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
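+      // Concrete illustration of the mask logic above: for the byte sequence
+      // E2 82 AC 41 ("€" then "A"), the continuation mask has bits 1 and 2
+      // set, so the leading mask has bits 0 and 3 set and the
+      // end-of-code-point mask (leading mask >> 1) marks position 2, the
+      // final byte of the three-byte sequence.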
+ } + } + utf16_output += scalar::utf8_to_utf16::convert_valid( + input + pos, size - pos, utf16_output); + return utf16_output - start; } -simdutf_warn_unused size_t implementation::utf16_length_from_utf8( - const char *input, size_t length) const noexcept { - return scalar::utf8::utf16_length_from_utf8(input, length); -} +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -simdutf_warn_unused size_t implementation::utf8_length_from_utf32( - const char32_t *input, size_t length) const noexcept { - return scalar::utf32::utf8_length_from_utf32(input, length); -} +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf16 { +using namespace simd; -simdutf_warn_unused size_t implementation::utf16_length_from_utf32( - const char32_t *input, size_t length) const noexcept { - return scalar::utf32::utf16_length_from_utf32(input, length); -} +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ -simdutf_warn_unused size_t -implementation::utf32_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf32_length_from_latin1(length); -} + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, -simdutf_warn_unused size_t implementation::utf32_length_from_utf8( - const char *input, size_t length) const noexcept { - return scalar::utf8::count_code_points(input, length); -} + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( - const char *input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); -} + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, -simdutf_warn_unused result implementation::base64_to_binary( - const char *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept { - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = - length; // location of the first padding character if any - size_t equalsigns = 0; - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - length -= 1; - equalsigns++; - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - equalsigns++; - length -= 1; - } - } - if (length == 0) { - if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } - return {SUCCESS, 0}; - } - result r = scalar::base64::base64_tail_decode( - output, input, length, equalsigns, options, last_chunk_options); - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } - } - return r; + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; } -simdutf_warn_unused full_result implementation::base64_to_binary_details( - const char *input, size_t length, char *output, base64_options options, - 
last_chunk_handling_options last_chunk_options) const noexcept { - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = - length; // location of the first padding character if any - size_t equalsigns = 0; - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - length -= 1; - equalsigns++; - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - equalsigns++; - length -= 1; - } - } - if (length == 0) { - if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation, 0}; - } - return {SUCCESS, 0, 0}; - } - full_result r = scalar::base64::base64_tail_decode( - output, input, length, equalsigns, options, last_chunk_options); - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.output_count % 3 == 0) || - ((r.output_count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; - } +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); } - return r; -} - -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( - const char16_t *input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); -} -simdutf_warn_unused result implementation::base64_to_binary( - const char16_t *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept { - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = - length; // location of the first padding character if any - size_t equalsigns = 0; - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - length -= 1; - equalsigns++; - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; + template + simdutf_really_inline size_t convert(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. 
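+    // Concrete illustration: for a purely ASCII tail (and size >= 8), every
+    // byte is a leading byte, so the backwards scan below stops with
+    // margin == size - 8 and safety_margin == 9, keeping the SIMD loop safely
+    // short of the end of the buffer.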
+ size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); } - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - equalsigns++; - length -= 1; + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
+ } } - } - if (length == 0) { - if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation}; + if (errors()) { + return 0; } - return {SUCCESS, 0}; - } - result r = scalar::base64::base64_tail_decode( - output, input, length, equalsigns, options, last_chunk_options); - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; + if (pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert( + in + pos, size - pos, utf16_output); + if (howmany == 0) { + return 0; + } + utf16_output += howmany; } + return utf16_output - start; } - return r; -} -simdutf_warn_unused full_result implementation::base64_to_binary_details( - const char16_t *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept { - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = - length; // location of the first padding character if any - size_t equalsigns = 0; - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - length -= 1; - equalsigns++; - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; + template + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); } - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - equalsigns++; - length -= 1; + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. 
+ static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } } - } - if (length == 0) { - if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation, 0}; + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; } - return {SUCCESS, 0, 0}; - } - full_result r = scalar::base64::base64_tail_decode( - output, input, length, equalsigns, options, last_chunk_options); - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.output_count % 3 == 0) || - ((r.output_count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. 
+ result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } } + return result(error_code::SUCCESS, utf16_output - start); } - return r; -} -simdutf_warn_unused size_t implementation::base64_length_from_binary( - size_t length, base64_options options) const noexcept { - return scalar::base64::base64_length_from_binary(length, options); -} + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } -size_t implementation::binary_to_base64(const char *input, size_t length, - char *output, - base64_options options) const noexcept { - return scalar::base64::tail_encode_base64(output, input, length, options); -} -} // namespace fallback +}; // struct utf8_checker +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace haswell } // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// transcoding from UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -/* begin file src/simdutf/fallback/end.h */ -/* end file src/simdutf/fallback/end.h */ -/* end file src/fallback/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_ICELAKE -/* begin file src/icelake/implementation.cpp */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_utf32 { +using namespace simd; -/* begin file src/simdutf/icelake/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "icelake" -// #define SIMDUTF_IMPLEMENTATION icelake +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char32_t *utf32_output) noexcept { + size_t pos = 0; + char32_t *start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + size_t max_starting_point = (pos + 64) - 12; + while (pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32( + input + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, + utf32_output); + return utf32_output - start; +} -#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE -// nothing needed. 
-#else -SIMDUTF_TARGET_ICELAKE -#endif +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -#if SIMDUTF_GCC11ORMORE // workaround for - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 -// clang-format off -SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) -// clang-format on -#endif // end of workaround -/* end file src/simdutf/icelake/begin.h */ namespace simdutf { -namespace icelake { +namespace haswell { namespace { -#ifndef SIMDUTF_ICELAKE_H - #error "icelake.h must be included" -#endif -/* begin file src/icelake/icelake_utf8_common.inl.cpp */ -// Common procedures for both validating and non-validating conversions from -// UTF-8. -enum block_processing_mode { SIMDUTF_FULL, SIMDUTF_TAIL }; +namespace utf8_to_utf32 { +using namespace simd; -using utf8_to_utf16_result = std::pair; -using utf8_to_utf32_result = std::pair; +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ -/* - process_block_utf8_to_utf16 converts up to 64 bytes from 'in' from UTF-8 - to UTF-16. When tail = SIMDUTF_FULL, then the full input buffer (64 bytes) - might be used. When tail = SIMDUTF_TAIL, we take into account 'gap' which - indicates how many input bytes are relevant. + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, - Returns true when the result is correct, otherwise it returns false. 
+ // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, - The provided in and out pointers are advanced according to how many input - bytes have been processed, upon success. -*/ -template -simdutf_really_inline bool -process_block_utf8_to_utf16(const char *&in, char16_t *&out, size_t gap) { - // constants - __m512i mask_identity = _mm512_set_epi8( - 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, - 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, - 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, - 8, 7, 6, 5, 4, 3, 2, 1, 0); - __m512i mask_c0c0c0c0 = _mm512_set1_epi32(0xc0c0c0c0); - __m512i mask_80808080 = _mm512_set1_epi32(0x80808080); - __m512i mask_f0f0f0f0 = _mm512_set1_epi32(0xf0f0f0f0); - __m512i mask_dfdfdfdf_tail = _mm512_set_epi64( - 0xffffdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, - 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf, - 0xdfdfdfdfdfdfdfdf, 0xdfdfdfdfdfdfdfdf); - __m512i mask_c2c2c2c2 = _mm512_set1_epi32(0xc2c2c2c2); - __m512i mask_ffffffff = _mm512_set1_epi32(0xffffffff); - __m512i mask_d7c0d7c0 = _mm512_set1_epi32(0xd7c0d7c0); - __m512i mask_dc00dc00 = _mm512_set1_epi32(0xdc00dc00); - __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - // Note that 'tail' is a compile-time constant ! - __mmask64 b = - (tail == SIMDUTF_FULL) ? 0xFFFFFFFFFFFFFFFF : (uint64_t(1) << gap) - 1; - __m512i input = (tail == SIMDUTF_FULL) ? _mm512_loadu_si512(in) - : _mm512_maskz_loadu_epi8(b, in); - __mmask64 m1 = (tail == SIMDUTF_FULL) - ? _mm512_cmplt_epu8_mask(input, mask_80808080) - : _mm512_mask_cmplt_epu8_mask(b, input, mask_80808080); - if (_ktestc_mask64_u8(m1, - b)) { // NOT(m1) AND b -- if all zeroes, then all ASCII - // alternatively, we could do 'if (m1 == b) { ' - if (tail == SIMDUTF_FULL) { - in += 64; // consumed 64 bytes - // we convert a full 64-byte block, writing 128 bytes. 
- __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); - if (big_endian) { - input1 = _mm512_shuffle_epi8(input1, byteflip); - } - _mm512_storeu_si512(out, input1); - out += 32; - __m512i input2 = - _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); - if (big_endian) { - input2 = _mm512_shuffle_epi8(input2, byteflip); - } - _mm512_storeu_si512(out, input2); - out += 32; - return true; // we are done - } else { - in += gap; - if (gap <= 32) { - __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); - if (big_endian) { - input1 = _mm512_shuffle_epi8(input1, byteflip); - } - _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << (gap)) - 1), - input1); - out += gap; + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. 
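+    // As in the UTF-16 transcoder above, the scan below counts 8 leading bytes
+    // from the end (so for purely ASCII input, margin == size - 8 and
+    // safety_margin == 9), which keeps the SIMD loop clear of the buffer end.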
+ size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; } else { - __m512i input1 = _mm512_cvtepu8_epi16(_mm512_castsi512_si256(input)); - if (big_endian) { - input1 = _mm512_shuffle_epi8(input1, byteflip); + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - _mm512_storeu_si512(out, input1); - out += 32; - __m512i input2 = - _mm512_cvtepu8_epi16(_mm512_extracti64x4_epi64(input, 1)); - if (big_endian) { - input2 = _mm512_shuffle_epi8(input2, byteflip); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // we have an error } - _mm512_mask_storeu_epi16( - out, __mmask32((uint32_t(1) << (gap - 32)) - 1), input2); - out += gap - 32; + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
} - return true; // we are done } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if (howmany == 0) { + return 0; + } + utf32_output += howmany; + } + return utf32_output - start; } - // classify characters further - __mmask64 m234 = _mm512_cmp_epu8_mask( - mask_c0c0c0c0, input, - _MM_CMPINT_LE); // 0xc0 <= input, 2, 3, or 4 leading byte - __mmask64 m34 = - _mm512_cmp_epu8_mask(mask_dfdfdfdf_tail, input, - _MM_CMPINT_LT); // 0xdf < input, 3 or 4 leading byte - - __mmask64 milltwobytes = _mm512_mask_cmp_epu8_mask( - m234, input, mask_c2c2c2c2, - _MM_CMPINT_LT); // 0xc0 <= input < 0xc2 (illegal two byte sequence) - // Overlong 2-byte sequence - if (_ktestz_mask64_u8(milltwobytes, milltwobytes) == 0) { - // Overlong 2-byte sequence - return false; - } - if (_ktestz_mask64_u8(m34, m34) == 0) { - // We have a 3-byte sequence and/or a 2-byte sequence, or possibly even a - // 4-byte sequence! - __mmask64 m4 = _mm512_cmp_epu8_mask( - input, mask_f0f0f0f0, - _MM_CMPINT_NLT); // 0xf0 <= zmm0 (4 byte start bytes) - - __mmask64 mask_not_ascii = (tail == SIMDUTF_FULL) - ? _knot_mask64(m1) - : _kand_mask64(_knot_mask64(m1), b); - __mmask64 mp1 = _kshiftli_mask64(m234, 1); - __mmask64 mp2 = _kshiftli_mask64(m34, 2); - // We could do it as follows... - // if (_kortestz_mask64_u8(m4,m4)) { // compute the bitwise OR of the 64-bit - // masks a and b and return 1 if all zeroes but GCC generates better code - // when we do: - if (m4 == 0) { // compute the bitwise OR of the 64-bit masks a and b and - // return 1 if all zeroes - // Fast path with 1,2,3 bytes - __mmask64 mc = _kor_mask64(mp1, mp2); // expected continuation bytes - __mmask64 m1234 = _kor_mask64(m1, m234); - // mismatched continuation bytes: - if (tail == SIMDUTF_FULL) { - __mmask64 xnormcm1234 = _kxnor_mask64( - mc, - m1234); // XNOR of mc and m1234 should be all zero if they differ - // the presence of a 1 bit indicates that they overlap. - // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return - // 1 if all zeroes. - if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { - return false; - } + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; } else { - __mmask64 bxorm1234 = _kxor_mask64(b, m1234); - if (mc != bxorm1234) { - return false; + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. 
+ static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - } - // mend: identifying the last bytes of each sequence to be decoded - __mmask64 mend = _kshiftri_mask64(m1234, 1); - if (tail != SIMDUTF_FULL) { - mend = _kor_mask64(mend, (uint64_t(1) << (gap - 1))); - } - - __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); - __m512i last_and_thirdu16 = - _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); - - __m512i nonasciitags = _mm512_maskz_mov_epi8( - mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 - __m512i clearedbytes = _mm512_andnot_si512( - nonasciitags, input); // high two bits cleared where not ASCII - __m512i lastbytes = _mm512_maskz_permutexvar_epi8( - 0x5555555555555555, last_and_thirdu16, - clearedbytes); // the last byte of each character - - __mmask64 mask_before_non_ascii = _kshiftri_mask64( - mask_not_ascii, 1); // bytes that precede non-ASCII bytes - __m512i indexofsecondlastbytes = _mm512_add_epi16( - mask_ffffffff, last_and_thirdu16); // indices of the second last bytes - __m512i beforeasciibytes = - _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); - __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8( - 0x5555555555555555, indexofsecondlastbytes, - beforeasciibytes); // the second last bytes (of two, three byte seq, - // surrogates) - secondlastbytes = - _mm512_slli_epi16(secondlastbytes, 6); // shifted into position - - __m512i indexofthirdlastbytes = _mm512_add_epi16( - mask_ffffffff, - indexofsecondlastbytes); // indices of the second last bytes - __m512i thirdlastbyte = - _mm512_maskz_mov_epi8(m34, - clearedbytes); // only those that are the third - // last byte of a sequence - __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8( - 0x5555555555555555, indexofthirdlastbytes, - thirdlastbyte); // the third last bytes (of three byte sequences, hi - // surrogate) - thirdlastbytes = - _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position - __m512i Wout = _mm512_ternarylogic_epi32(lastbytes, secondlastbytes, - thirdlastbytes, 254); - // the elements of Wout excluding the last element if it happens to be a - // high surrogate: - - __mmask64 mprocessed = - (tail == SIMDUTF_FULL) - ? _pdep_u64(0xFFFFFFFF, mend) - : _pdep_u64( - 0xFFFFFFFF, - _kand_mask64( - mend, b)); // we adjust mend at the end of the output. - - // Encodings out of range... - { - // the location of 3-byte sequence start bytes in the input - __mmask64 m3 = m34 & (b ^ m4); - // code units in Wout corresponding to 3-byte sequences. 
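// Illustration (not part of the simdutf sources): the range checks that follow
// reject two classes of decoded 3-byte sequences, overlong encodings
// (value < 0x0800) and UTF-16 surrogates (0xD800..0xDFFF), the latter via the
// wrap-around comparison `uint16_t(W - 0xD800) < 0x0800`. Scalar equivalent:
#include <cstdint>

static bool acceptable_three_byte_value(uint16_t W) {
  bool overlong = W < 0x0800;                     // should have used 1 or 2 bytes
  bool surrogate = uint16_t(W - 0xD800) < 0x0800; // 0xD800 <= W <= 0xDFFF
  return !overlong && !surrogate;
}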
- __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); - __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); - __mmask32 Msmall800 = - _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); - __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); - __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); - __mmask32 M3s = - _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); - if (_kor_mask32(Msmall800, M3s)) { - return false; + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. } - int64_t nout = _mm_popcnt_u64(mprocessed); - in += 64 - _lzcnt_u64(mprocessed); - if (big_endian) { - Wout = _mm512_shuffle_epi8(Wout, byteflip); + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if (pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; } - _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); - out += nout; - return true; // ok } - // - // We have a 4-byte sequence, this is the general case. - // Slow! 
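// Illustration (not part of the simdutf sources): the scalar arithmetic behind
// the 4-byte (supplementary plane) path handled below. A code point
// cp >= 0x10000 is re-based to cp - 0x10000 (20 significant bits) and split
// into a high and a low surrogate; the hypothetical helper makes the bit
// layout explicit.
#include <cstdint>

static void encode_surrogate_pair(uint32_t cp, uint16_t *hi, uint16_t *lo) {
  uint32_t v = cp - 0x10000;            // 20 significant bits remain
  *hi = uint16_t(0xD800 | (v >> 10));   // 110110 + top 10 bits
  *lo = uint16_t(0xDC00 | (v & 0x3FF)); // 110111 + bottom 10 bits
}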
- __mmask64 mp3 = _kshiftli_mask64(m4, 3); - __mmask64 mc = - _kor_mask64(_kor_mask64(mp1, mp2), mp3); // expected continuation bytes - __mmask64 m1234 = _kor_mask64(m1, m234); + return result(error_code::SUCCESS, utf32_output - start); + } - // mend: identifying the last bytes of each sequence to be decoded - __mmask64 mend = - _kor_mask64(_kshiftri_mask64(_kor_mask64(mp3, m1234), 1), mp3); - if (tail != SIMDUTF_FULL) { - mend = _kor_mask64(mend, __mmask64(uint64_t(1) << (gap - 1))); - } - __m512i last_and_third = _mm512_maskz_compress_epi8(mend, mask_identity); - __m512i last_and_thirdu16 = - _mm512_cvtepu8_epi16(_mm512_castsi512_si256(last_and_third)); + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - __m512i nonasciitags = _mm512_maskz_mov_epi8( - mask_not_ascii, mask_c0c0c0c0); // ASCII: 00000000 other: 11000000 - __m512i clearedbytes = _mm512_andnot_si512( - nonasciitags, input); // high two bits cleared where not ASCII - __m512i lastbytes = _mm512_maskz_permutexvar_epi8( - 0x5555555555555555, last_and_thirdu16, - clearedbytes); // the last byte of each character +}; // struct utf8_checker +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +// other functions +/* begin file src/generic/utf8.h */ - __mmask64 mask_before_non_ascii = _kshiftri_mask64( - mask_not_ascii, 1); // bytes that precede non-ASCII bytes - __m512i indexofsecondlastbytes = _mm512_add_epi16( - mask_ffffffff, last_and_thirdu16); // indices of the second last bytes - __m512i beforeasciibytes = - _mm512_maskz_mov_epi8(mask_before_non_ascii, clearedbytes); - __m512i secondlastbytes = _mm512_maskz_permutexvar_epi8( - 0x5555555555555555, indexofsecondlastbytes, - beforeasciibytes); // the second last bytes (of two, three byte seq, - // surrogates) - secondlastbytes = - _mm512_slli_epi16(secondlastbytes, 6); // shifted into position +namespace simdutf { +namespace haswell { +namespace { +namespace utf8 { - __m512i indexofthirdlastbytes = _mm512_add_epi16( - mask_ffffffff, - indexofsecondlastbytes); // indices of the second last bytes - __m512i thirdlastbyte = _mm512_maskz_mov_epi8( - m34, - clearedbytes); // only those that are the third last byte of a sequence - __m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8( - 0x5555555555555555, indexofthirdlastbytes, - thirdlastbyte); // the third last bytes (of three byte sequences, hi - // surrogate) - thirdlastbytes = - _mm512_slli_epi16(thirdlastbytes, 12); // shifted into position - __m512i thirdsecondandlastbytes = _mm512_ternarylogic_epi32( - lastbytes, secondlastbytes, thirdlastbytes, 254); - uint64_t Mlo_uint64 = _pext_u64(mp3, mend); - __mmask32 Mlo = __mmask32(Mlo_uint64); - __mmask32 Mhi = __mmask32(Mlo_uint64 >> 1); - __m512i lo_surr_mask = _mm512_maskz_mov_epi16( - Mlo, - mask_dc00dc00); // lo surr: 1101110000000000, other: 0000000000000000 - __m512i shifted4_thirdsecondandlastbytes = - _mm512_srli_epi16(thirdsecondandlastbytes, - 4); // hi surr: 00000WVUTSRQPNML vuts = WVUTS - 1 - __m512i tagged_lo_surrogates = _mm512_or_si512( - thirdsecondandlastbytes, - lo_surr_mask); // lo surr: 110111KJHGFEDCBA, other: unchanged - __m512i Wout = _mm512_mask_add_epi16( - tagged_lo_surrogates, Mhi, shifted4_thirdsecondandlastbytes, - mask_d7c0d7c0); // hi sur: 110110vutsRQPNML, other: unchanged - // the elements of Wout excluding the last element if it happens to be a - // high surrogate: - __mmask32 Mout = ~(Mhi & 
0x80000000); - __mmask64 mprocessed = - (tail == SIMDUTF_FULL) - ? _pdep_u64(Mout, mend) - : _pdep_u64( - Mout, - _kand_mask64(mend, - b)); // we adjust mend at the end of the output. +using namespace simd; - // mismatched continuation bytes: - if (tail == SIMDUTF_FULL) { - __mmask64 xnormcm1234 = _kxnor_mask64( - mc, m1234); // XNOR of mc and m1234 should be all zero if they differ - // the presence of a 1 bit indicates that they overlap. - // _kortestz_mask64_u8: compute the bitwise OR of 64-bit masksand return 1 - // if all zeroes. - if (!_kortestz_mask64_u8(xnormcm1234, xnormcm1234)) { - return false; - } - } else { - __mmask64 bxorm1234 = _kxor_mask64(b, m1234); - if (mc != bxorm1234) { - return false; - } - } - // Encodings out of range... - { - // the location of 3-byte sequence start bytes in the input - __mmask64 m3 = m34 & (b ^ m4); - // code units in Wout corresponding to 3-byte sequences. - __mmask32 M3 = __mmask32(_pext_u64(m3 << 2, mend)); - __m512i mask_08000800 = _mm512_set1_epi32(0x08000800); - __mmask32 Msmall800 = - _mm512_mask_cmplt_epu16_mask(M3, Wout, mask_08000800); - __m512i mask_d800d800 = _mm512_set1_epi32(0xd800d800); - __m512i Moutminusd800 = _mm512_sub_epi16(Wout, mask_d800d800); - __mmask32 M3s = - _mm512_mask_cmplt_epu16_mask(M3, Moutminusd800, mask_08000800); - __m512i mask_04000400 = _mm512_set1_epi32(0x04000400); - __mmask32 M4s = - _mm512_mask_cmpge_epu16_mask(Mhi, Moutminusd800, mask_04000400); - if (!_kortestz_mask32_u8(M4s, _kor_mask32(Msmall800, M3s))) { - return false; - } - } - in += 64 - _lzcnt_u64(mprocessed); - int64_t nout = _mm_popcnt_u64(mprocessed); - if (big_endian) { - Wout = _mm512_shuffle_epi8(Wout, byteflip); +simdutf_really_inline size_t count_code_points(const char *in, size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + +simdutf_really_inline size_t utf16_length_from_utf8(const char *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} +} // namespace utf8 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8.h */ +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace haswell { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input.swap_bytes(); } - _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), Wout); - out += nout; - return true; // ok + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; } - // Fast path 2: all ASCII or 2 byte - __mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) - ? 
_knot_mask64(m234) - : _kand_mask64(_knot_mask64(m234), b); - // on top of -0xc0 we subtract -2 which we get back later of the - // continuation byte tags - __m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2); - __mmask64 leading = tail == (tail == SIMDUTF_FULL) - ? _kor_mask64(m1, m234) - : _kand_mask64(_kor_mask64(m1, m234), - b); // first bytes of each sequence - if (tail == SIMDUTF_FULL) { - __mmask64 xnor234leading = - _kxnor_mask64(_kshiftli_mask64(m234, 1), leading); - if (!_kortestz_mask64_u8(xnor234leading, xnor234leading)) { - return false; - } - } else { - __mmask64 bxorleading = _kxor_mask64(b, leading); - if (_kshiftli_mask64(m234, 1) != bxorleading) { - return false; + return count + + scalar::utf16::count_code_points(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input.swap_bytes(); } - } - // - if (tail == SIMDUTF_FULL) { - // In the two-byte/ASCII scenario, we are easily latency bound, so we want - // to increment the input buffer as quickly as possible. - // We process 32 bytes unless the byte at index 32 is a continuation byte, - // in which case we include it as well for a total of 33 bytes. - // Note that if x is an ASCII byte, then the following is false: - // int8_t(x) <= int8_t(0xc0) under two's complement. - in += 32; - if (int8_t(*in) <= int8_t(0xc0)) - in++; - // The alternative is to do - // in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); - // but it requires loading the input, doing the mask computation, and - // converting back the mask to a general register. It just takes too long, - // leaving the processor likely to be idle. - } else { - in += 64 - _lzcnt_u64(_pdep_u64(0xFFFFFFFF, continuation_or_ascii)); - } - __m512i lead = _mm512_maskz_compress_epi8( - leading, leading2byte); // will contain zero for ascii, and the data - lead = _mm512_cvtepu8_epi16( - _mm512_castsi512_si256(lead)); // ... zero extended into code units - __m512i follow = _mm512_maskz_compress_epi8( - continuation_or_ascii, input); // the last bytes of each sequence - follow = _mm512_cvtepu8_epi16( - _mm512_castsi512_si256(follow)); // ... zero extended into code units - lead = _mm512_slli_epi16(lead, 6); // shifted into position - __m512i final = _mm512_add_epi16(follow, lead); // combining lead and follow + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); - if (big_endian) { - final = _mm512_shuffle_epi8(final, byteflip); + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + + ascii_count; } - if (tail == SIMDUTF_FULL) { - // Next part is UTF-16 specific and can be generalized to UTF-32. - int nout = _mm_popcnt_u32(uint32_t(leading)); - _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final); - out += nout; // UTF-8 to UTF-16 is only expansionary in this case. 
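// Illustration (not part of the simdutf sources): a scalar version of the
// counting arithmetic in utf8_length_from_utf16 above, assuming native-endian,
// valid UTF-16. Code units <= 0x7F need 1 UTF-8 byte, <= 0x7FF need 2, other
// non-surrogate units need 3, and each unit of a surrogate pair contributes 2
// (4 bytes per pair). Hypothetical helper, for reference only.
#include <cstddef>
#include <cstdint>

static size_t utf8_length_from_utf16_scalar(const char16_t *in, size_t size) {
  size_t count = 0;
  for (size_t i = 0; i < size; i++) {
    uint16_t w = in[i];
    if (w <= 0x7F)                      count += 1; // ASCII
    else if (w <= 0x7FF)                count += 2; // 2-byte UTF-8
    else if (w < 0xD800 || w > 0xDFFF)  count += 3; // BMP, non-surrogate
    else                                count += 2; // half of a 4-byte pair
  }
  return count;
}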
- } else { - int nout = int(_mm_popcnt_u64(_pdep_u64(0xFFFFFFFF, leading))); - _mm512_mask_storeu_epi16(out, __mmask32((uint64_t(1) << nout) - 1), final); - out += nout; // UTF-8 to UTF-16 is only expansionary in this case. + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} + +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, + size_t size) { + return count_code_points(in, size); +} + +simdutf_really_inline void +change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { + size_t pos = 0; + + while (pos < size / 32 * 32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; } - return true; // we are fine. + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); } -/* - utf32_to_utf16_masked converts `count` lower UTF-32 code units - from input `utf32` into UTF-16. It differs from utf32_to_utf16 - in that it 'masks' the writes. +} // namespace utf16 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf16.h */ - Returns how many 16-bit code units were stored. +// transcoding from UTF-8 to Latin 1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ - byteflip is used for flipping 16-bit code units, and it should be - __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - We pass it to the (always inlined) function to encourage the compiler to - keep the value in a (constant) register. -*/ -template -simdutf_really_inline size_t utf32_to_utf16_masked(const __m512i byteflip, - __m512i utf32, - unsigned int count, - char16_t *output) { +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_latin1 { +using namespace simd; - const __mmask16 valid = uint16_t((1 << count) - 1); - // 1. check if we have any surrogate pairs - const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); - const __mmask16 sp_mask = - _mm512_mask_cmpgt_epu32_mask(valid, utf32, v_0000_ffff); +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // For UTF-8 to Latin 1, we can allow any ASCII character, and any + // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or + // 0b11000010 and nothing else. 
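// Illustration (not part of the simdutf sources): the rule stated in the
// comment above, written as a scalar predicate. UTF-8 that can be transcoded
// to Latin-1 may contain ASCII and continuation bytes, but the only acceptable
// non-ASCII lead bytes are 0xC2 (0b11000010) and 0xC3 (0b11000011), which
// cover code points 0x80..0xFF.
#include <cstdint>

static bool latin1_compatible_first_byte(uint8_t byte) {
  return byte < 0x80 || byte == 0xC2 || byte == 0xC3;
}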
+ // + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; - if (sp_mask == 0) { - if (big_endian) { - _mm256_mask_storeu_epi16( - (__m256i *)output, valid, - _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), - _mm512_castsi512_si256(byteflip))); + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + FORBIDDEN, + // 1110____ ________ + FORBIDDEN, + // 1111____ ________ + FORBIDDEN); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
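// Illustration (not part of the simdutf sources): a worked instance of the
// three-way lookup used by check_special_cases. For the byte pair 0xC0 0x80
// (an overlong 2-byte encoding of NUL), the entries are taken from the
// byte_1_high table above and the byte_1_low / byte_2_high tables just below;
// an error bit survives the AND only when all three lookups flag it.
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t TOO_SHORT = 1 << 0, TOO_LONG = 1 << 1, OVERLONG_3 = 1 << 2,
                OVERLONG_2 = 1 << 5, TOO_LARGE_1000 = 1 << 6, OVERLONG_4 = 1 << 6,
                TWO_CONTS = 1 << 7;
  const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
  uint8_t b1_high = TOO_SHORT | OVERLONG_2;                       // prev1 = 0xC0, high nibble 0xC
  uint8_t b1_low = CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4;  // prev1 = 0xC0, low nibble 0x0
  uint8_t b2_high = TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 |
                    TOO_LARGE_1000 | OVERLONG_4;                  // input = 0x80, high nibble 0x8
  assert((b1_high & b1_low & b2_high) == OVERLONG_2);             // nonzero => input rejected
  return 0;
}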
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, - } else { - _mm256_mask_storeu_epi16((__m256i *)output, valid, - _mm512_cvtepi32_epi16(utf32)); - } - return count; - } + // ____0100 ________ + FORBIDDEN, + // ____0101 ________ + FORBIDDEN, + // ____011_ ________ + FORBIDDEN, FORBIDDEN, - { - // build surrogate pair code units in 32-bit lanes + // ____1___ ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, + // ____1101 ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] - const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); - const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] - const __m512i t1 = _mm512_slli_epi32(t0, 6); + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} - // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 - // to t0 - // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) - const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); - const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; - // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 - // to t0 - // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 - const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); - const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); - const __m512i t3 = - _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); - const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); - __m512i t5 = _mm512_ror_epi32(t4, 16); - // Here we want to trim all of the upper 16-bit code units from the 2-byte - // characters represented as 4-byte values. We can compute it from - // sp_mask or the following... It can be more optimized! - const __mmask32 nonzero = _kor_mask32( - 0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); - const __mmask32 nonzero_masked = - _kand_mask32(nonzero, __mmask32((uint64_t(1) << (2 * count)) - 1)); - if (big_endian) { - t5 = _mm512_shuffle_epi8(t5, byteflip); + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); + } + + simdutf_really_inline size_t convert(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 16; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... } - // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability - // (zen4) - __m512i compressed = _mm512_maskz_compress_epi16(nonzero_masked, t5); - _mm512_mask_storeu_epi16( - output, - (1 << (count + static_cast(count_ones(sp_mask)))) - 1, - compressed); - //_mm512_mask_compressstoreu_epi16(output, nonzero_masked, t5); + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. 
+ // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if (howmany == 0) { + return 0; + } + latin1_output += howmany; + } + return latin1_output - start; } - return count + static_cast(count_ones(sp_mask)); -} + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. 
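// Illustration (not part of the simdutf sources): the arithmetic behind the
// "at least five times" claim that follows. Within a 64-byte block the slow
// path starts 12 bytes before the end (max_starting_point = pos + 52), and,
// ignoring the up-to-16-byte fast paths, each convert_masked_* call consumes
// at most 12 bytes, so the inner loop needs at least ceil(52 / 12) = 5
// iterations to reach the stopping point.
static_assert((52 + 12 - 1) / 12 == 5,
              "ceil(52 / 12) == 5 iterations at minimum");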
+ size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + latin1_output += res.count; + } + } + return result(error_code::SUCCESS, latin1_output - start); + } -/* - utf32_to_utf16 converts `count` lower UTF-32 code units - from input `utf32` into UTF-16. It may overflow. + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - Returns how many 16-bit code units were stored. +}; // struct utf8_checker +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace haswell +} // namespace simdutf +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ - byteflip is used for flipping 16-bit code units, and it should be - __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809, - 0x0607040502030001, - 0x0e0f0c0d0a0b0809 - ); - We pass it to the (always inlined) function to encourage the compiler to - keep the value in a (constant) register. 
-*/ -template -simdutf_really_inline size_t utf32_to_utf16(const __m512i byteflip, - __m512i utf32, unsigned int count, - char16_t *output) { - // check if we have any surrogate pairs - const __m512i v_0000_ffff = _mm512_set1_epi32(0x0000ffff); - const __mmask16 sp_mask = _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); +namespace simdutf { +namespace haswell { +namespace { +namespace utf8_to_latin1 { +using namespace simd; - if (sp_mask == 0) { - // technically, it should be _mm256_storeu_epi16 - if (big_endian) { - _mm256_storeu_si256( - (__m256i *)output, - _mm256_shuffle_epi8(_mm512_cvtepi32_epi16(utf32), - _mm512_castsi512_si256(byteflip))); +simdutf_really_inline size_t convert_valid(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last + // 16 bytes, and if the data is valid, then it is entirely safe because 16 + // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally + // assume that you have valid UTF-8 input, so we are going to go back from the + // end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; } else { - _mm256_storeu_si256((__m256i *)output, _mm512_cvtepi32_epi16(utf32)); + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
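// Illustration (not part of the simdutf sources): a scalar sketch of what the
// "valid UTF-8 to Latin-1" fast path above relies on. The input is assumed
// valid and restricted to code points <= 0xFF, so every sequence is either one
// ASCII byte or a 0xC2/0xC3 lead byte followed by one continuation byte. The
// helper name is hypothetical.
#include <cstddef>

static size_t utf8_to_latin1_valid_scalar(const char *in, size_t size, char *out) {
  const unsigned char *src = reinterpret_cast<const unsigned char *>(in);
  size_t pos = 0, written = 0;
  while (pos < size) {
    if (src[pos] < 0x80) {             // ASCII byte, copied as-is
      out[written++] = char(src[pos++]);
    } else {                           // 0xC2 or 0xC3 lead + one continuation byte
      out[written++] = char(((src[pos] & 0x1F) << 6) | (src[pos + 1] & 0x3F));
      pos += 2;
    }
  }
  return written;
}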
} - return count; } + if (pos < size) { + size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, + latin1_output); + latin1_output += howmany; + } + return latin1_output - start; +} - { - // build surrogate pair code units in 32-bit lanes - - // t0 = 8 x [000000000000aaaa|aaaaaabbbbbbbbbb] - const __m512i v_0001_0000 = _mm512_set1_epi32(0x00010000); - const __m512i t0 = _mm512_sub_epi32(utf32, v_0001_0000); - - // t1 = 8 x [000000aaaaaaaaaa|bbbbbbbbbb000000] - const __m512i t1 = _mm512_slli_epi32(t0, 6); +} // namespace utf8_to_latin1 +} // namespace +} // namespace haswell +} // namespace simdutf + // namespace simdutf +/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ - // t2 = 8 x [000000aaaaaaaaaa|aaaaaabbbbbbbbbb] -- copy hi word from t1 - // to t0 - // 0xe4 = (t1 and v_ffff_0000) or (t0 and not v_ffff_0000) - const __m512i v_ffff_0000 = _mm512_set1_epi32(0xffff0000); - const __m512i t2 = _mm512_ternarylogic_epi32(t1, t0, v_ffff_0000, 0xe4); +namespace simdutf { +namespace haswell { - // t2 = 8 x [110110aaaaaaaaaa|110111bbbbbbbbbb] -- copy hi word from t1 - // to t0 - // 0xba = (t2 and not v_fc00_fc000) or v_d800_dc00 - const __m512i v_fc00_fc00 = _mm512_set1_epi32(0xfc00fc00); - const __m512i v_d800_dc00 = _mm512_set1_epi32(0xd800dc00); - const __m512i t3 = - _mm512_ternarylogic_epi32(t2, v_fc00_fc00, v_d800_dc00, 0xba); - const __m512i t4 = _mm512_mask_blend_epi32(sp_mask, utf32, t3); - __m512i t5 = _mm512_ror_epi32(t4, 16); - const __mmask32 nonzero = _kor_mask32( - 0xaaaaaaaa, _mm512_cmpneq_epi16_mask(t5, _mm512_setzero_si512())); - if (big_endian) { - t5 = _mm512_shuffle_epi8(t5, byteflip); +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + int out = 0; + if (validate_utf8(input, length)) { + out |= encoding_type::UTF8; + } + if ((length % 2) == 0) { + if (validate_utf16le(reinterpret_cast(input), + length / 2)) { + out |= encoding_type::UTF16_LE; } - // we deliberately avoid _mm512_mask_compressstoreu_epi16 for portability - // (zen4) - __m512i compressed = _mm512_maskz_compress_epi16(nonzero, t5); - _mm512_mask_storeu_epi16( - output, - (1 << (count + static_cast(count_ones(sp_mask)))) - 1, - compressed); - //_mm512_mask_compressstoreu_epi16(output, nonzero, t5); } - - return count + static_cast(count_ones(sp_mask)); + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { + out |= encoding_type::UTF32_LE; + } + } + return out; } -/** - * Store the last N bytes of previous followed by 512-N bytes from input. 
- */ -template __m512i prev(__m512i input, __m512i previous) { - static_assert(N <= 32, "N must be no larger than 32"); - const __m512i movemask = - _mm512_setr_epi32(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); - const __m512i rotated = _mm512_permutex2var_epi32(input, movemask, previous); -#if SIMDUTF_GCC8 || SIMDUTF_GCC9 - constexpr int shift = 16 - N; // workaround for GCC8,9 - return _mm512_alignr_epi8(input, rotated, shift); -#else - return _mm512_alignr_epi8(input, rotated, 16 - N); -#endif // SIMDUTF_GCC8 || SIMDUTF_GCC9 +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_utf8(buf, len); } -template -__m512i shuffle_epi128(__m512i v) { - static_assert((idx0 >= 0 && idx0 <= 3), "idx0 must be in range 0..3"); - static_assert((idx1 >= 0 && idx1 <= 3), "idx1 must be in range 0..3"); - static_assert((idx2 >= 0 && idx2 <= 3), "idx2 must be in range 0..3"); - static_assert((idx3 >= 0 && idx3 <= 3), "idx3 must be in range 0..3"); - - constexpr unsigned shuffle = idx0 | (idx1 << 2) | (idx2 << 4) | (idx3 << 6); - return _mm512_shuffle_i32x4(v, v, shuffle); +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len); } -template constexpr __m512i broadcast_epi128(__m512i v) { - return shuffle_epi128(v); +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_ascii(buf, len); } -/** - * Current unused. - */ -template __m512i rotate_by_N_epi8(const __m512i input) { - - // lanes order: 1, 2, 3, 0 => 0b00_11_10_01 - const __m512i permuted = _mm512_shuffle_i32x4(input, input, 0x39); - - return _mm512_alignr_epi8(permuted, input, N); +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return haswell::utf8_validation::generic_validate_ascii_with_errors(buf, len); } -/* - expanded_utf8_to_utf32 converts expanded UTF-8 characters (`utf8`) - stored at separate 32-bit lanes. - - For each lane we have also a character class (`char_class), given in form - 0x8080800N, where N is 4 highest bits from the leading byte; 0x80 resets - corresponding bytes during pshufb. -*/ -simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, - __m512i utf8) { - /* - Input: - - utf8: bytes stored at separate 32-bit code units - - valid: which code units have valid UTF-8 characters - - Bit layout of single word. We show 4 cases for each possible - UTF-8 character encoding. The `?` denotes bits we must not - assume their value. - - |10dd.dddd|10cc.cccc|10bb.bbbb|1111.0aaa| 4-byte char - |????.????|10cc.cccc|10bb.bbbb|1110.aaaa| 3-byte char - |????.????|????.????|10bb.bbbb|110a.aaaa| 2-byte char - |????.????|????.????|????.????|0aaa.aaaa| ASCII char - byte 3 byte 2 byte 1 byte 0 - */ - - /* 1. Reset control bits of continuation bytes and the MSB - of the leading byte; this makes all bytes unsigned (and - does not alter ASCII char). 
- - |00dd.dddd|00cc.cccc|00bb.bbbb|0111.0aaa| 4-byte char - |00??.????|00cc.cccc|00bb.bbbb|0110.aaaa| 3-byte char - |00??.????|00??.????|00bb.bbbb|010a.aaaa| 2-byte char - |00??.????|00??.????|00??.????|0aaa.aaaa| ASCII char - ^^ ^^ ^^ ^ - */ - __m512i values; - const __m512i v_3f3f_3f7f = _mm512_set1_epi32(0x3f3f3f7f); - values = _mm512_and_si512(utf8, v_3f3f_3f7f); - - /* 2. Swap and join fields A-B and C-D - - |0000.cccc|ccdd.dddd|0001.110a|aabb.bbbb| 4-byte char - |0000.cccc|cc??.????|0001.10aa|aabb.bbbb| 3-byte char - |0000.????|????.????|0001.0aaa|aabb.bbbb| 2-byte char - |0000.????|????.????|000a.aaaa|aa??.????| ASCII char */ - const __m512i v_0140_0140 = _mm512_set1_epi32(0x01400140); - values = _mm512_maddubs_epi16(values, v_0140_0140); - - /* 3. Swap and join fields AB & CD +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. protect the implementation from + // handling nullptr + return true; + } + const char16_t *tail = avx2_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, + len - (tail - buf)); + } else { + return false; + } +} - |0000.0001|110a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char - |0000.0001|10aa.aabb|bbbb.cccc|cc??.????| 3-byte char - |0000.0001|0aaa.aabb|bbbb.????|????.????| 2-byte char - |0000.000a|aaaa.aa??|????.????|????.????| ASCII char */ - const __m512i v_0001_1000 = _mm512_set1_epi32(0x00011000); - values = _mm512_madd_epi16(values, v_0001_1000); +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-16. protect the implementation from + // handling nullptr + return true; + } + const char16_t *tail = avx2_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } +} - /* 4. 
Shift left the values by variable amounts to reset highest UTF-8 bits - |aaab.bbbb|bccc.cccd|dddd.d000|0000.0000| 4-byte char -- by 11 - |aaaa.bbbb|bbcc.cccc|????.??00|0000.0000| 3-byte char -- by 10 - |aaaa.abbb|bbb?.????|????.???0|0000.0000| 2-byte char -- by 9 - |aaaa.aaa?|????.????|????.????|?000.0000| ASCII char -- by 7 */ - { - /** pshufb +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + result res = avx2_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} - continuation = 0 - ascii = 7 - _2_bytes = 9 - _3_bytes = 10 - _4_bytes = 11 +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + result res = avx2_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} - shift_left_v3 = 4 * [ - ascii, # 0000 - ascii, # 0001 - ascii, # 0010 - ascii, # 0011 - ascii, # 0100 - ascii, # 0101 - ascii, # 0110 - ascii, # 0111 - continuation, # 1000 - continuation, # 1001 - continuation, # 1010 - continuation, # 1011 - _2_bytes, # 1100 - _2_bytes, # 1101 - _3_bytes, # 1110 - _4_bytes, # 1111 - ] */ - const __m512i shift_left_v3 = _mm512_setr_epi64( - 0x0707070707070707, 0x0b0a090900000000, 0x0707070707070707, - 0x0b0a090900000000, 0x0707070707070707, 0x0b0a090900000000, - 0x0707070707070707, 0x0b0a090900000000); +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-32. protect the implementation from + // handling nullptr + return true; + } + const char32_t *tail = avx2_validate_utf32le(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + return false; + } +} - const __m512i shift = _mm512_shuffle_epi8(shift_left_v3, char_class); - values = _mm512_sllv_epi32(values, shift); +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid UTF-32. protect the implementation from + // handling nullptr + return result(error_code::SUCCESS, 0); + } + result res = avx2_validate_utf32le_with_errors(buf, len); + if (res.count != len) { + result scalar_res = + scalar::utf32::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; } +} - /* 5. 
Shift right the values by variable amounts to reset lowest bits - |0000.0000|000a.aabb|bbbb.cccc|ccdd.dddd| 4-byte char -- by 11 - |0000.0000|0000.0000|aaaa.bbbb|bbcc.cccc| 3-byte char -- by 16 - |0000.0000|0000.0000|0000.0aaa|aabb.bbbb| 2-byte char -- by 21 - |0000.0000|0000.0000|0000.0000|0aaa.aaaa| ASCII char -- by 25 */ - { - // 4 * [25, 25, 25, 25, 25, 25, 25, 25, 0, 0, 0, 0, 21, 21, 16, 11] - const __m512i shift_right = _mm512_setr_epi64( - 0x1919191919191919, 0x0b10151500000000, 0x1919191919191919, - 0x0b10151500000000, 0x1919191919191919, 0x0b10151500000000, - 0x1919191919191919, 0x0b10151500000000); +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + avx2_convert_latin1_to_utf8(buf, len, utf8_output); + size_t converted_chars = ret.second - utf8_output; - const __m512i shift = _mm512_shuffle_epi8(shift_right, char_class); - values = _mm512_srlv_epi32(values, shift); + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; } - return values; + return converted_chars; } -simdutf_really_inline __m512i expand_and_identify(__m512i lane0, __m512i lane1, - int &count) { - const __m512i merged = _mm512_mask_mov_epi32(lane0, 0x1000, lane1); - const __m512i expand_ver2 = _mm512_setr_epi64( - 0x0403020103020100, 0x0605040305040302, 0x0807060507060504, - 0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a, - 0x000f0e0d0f0e0d0c, 0x0201000f01000f0e); - const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); - const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); - const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); - const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); - const __mmask16 leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); - count = static_cast(count_ones(leading_bytes)); - return _mm512_mask_compress_epi32(_mm512_setzero_si512(), leading_bytes, - input); +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + avx2_convert_latin1_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; + } + return converted_chars; } -simdutf_really_inline __m512i expand_utf8_to_utf32(__m512i input) { - __m512i char_class = _mm512_srli_epi32(input, 4); - /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ - const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); - const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); - char_class = - _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); - return expanded_utf8_to_utf32(char_class, input); +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + avx2_convert_latin1_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - 
(ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; + } + return converted_chars; } -/* end file src/icelake/icelake_utf8_common.inl.cpp */ -/* begin file src/icelake/icelake_macros.inl.cpp */ - -/* - This upcoming macro (SIMDUTF_ICELAKE_TRANSCODE16) takes 16 + 4 bytes (of a - UTF-8 string) and loads all possible 4-byte substring into an AVX512 - register. - - For example if we have bytes abcdefgh... we create following 32-bit lanes - - [abcd|bcde|cdef|defg|efgh|...] - ^ ^ - byte 0 of reg byte 63 of reg -*/ -/** pshufb - # lane{0,1,2} have got bytes: [ 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, - 11, 12, 13, 14, 15] # lane3 has got bytes: [ 16, 17, 18, 19, 4, 5, - 6, 8, 9, 10, 11, 12, 13, 14, 15] - expand_ver2 = [ - # lane 0: - 0, 1, 2, 3, - 1, 2, 3, 4, - 2, 3, 4, 5, - 3, 4, 5, 6, +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + avx2_convert_latin1_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_converted_chars == 0) { + return 0; + } + converted_chars += scalar_converted_chars; + } + return converted_chars; +} - # lane 1: - 4, 5, 6, 7, - 5, 6, 7, 8, - 6, 7, 8, 9, - 7, 8, 9, 10, +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert(buf, len, latin1_output); +} - # lane 2: - 8, 9, 10, 11, - 9, 10, 11, 12, - 10, 11, 12, 13, - 11, 12, 13, 14, +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert_with_errors(buf, len, latin1_output); +} - # lane 3 order: 13, 14, 15, 16 14, 15, 16, 17, 15, 16, 17, 18, 16, - 17, 18, 19 12, 13, 14, 15, 13, 14, 15, 0, 14, 15, 0, 1, 15, 0, 1, 2, - ] -*/ +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *input, size_t size, char *latin1_output) const noexcept { + return utf8_to_latin1::convert_valid(input, size, latin1_output); +} -#define SIMDUTF_ICELAKE_TRANSCODE16(LANE0, LANE1, MASKED) \ - { \ - const __m512i merged = _mm512_mask_mov_epi32(LANE0, 0x1000, LANE1); \ - const __m512i expand_ver2 = _mm512_setr_epi64( \ - 0x0403020103020100, 0x0605040305040302, 0x0807060507060504, \ - 0x0a09080709080706, 0x0c0b0a090b0a0908, 0x0e0d0c0b0d0c0b0a, \ - 0x000f0e0d0f0e0d0c, 0x0201000f01000f0e); \ - const __m512i input = _mm512_shuffle_epi8(merged, expand_ver2); \ - \ - __mmask16 leading_bytes; \ - const __m512i v_0000_00c0 = _mm512_set1_epi32(0xc0); \ - const __m512i t0 = _mm512_and_si512(input, v_0000_00c0); \ - const __m512i v_0000_0080 = _mm512_set1_epi32(0x80); \ - leading_bytes = _mm512_cmpneq_epu32_mask(t0, v_0000_0080); \ - \ - __m512i char_class; \ - char_class = _mm512_srli_epi32(input, 4); \ - /* char_class = ((input >> 4) & 0x0f) | 0x80808000 */ \ - const __m512i v_0000_000f = _mm512_set1_epi32(0x0f); \ - const __m512i v_8080_8000 = _mm512_set1_epi32(0x80808000); \ - char_class = \ - _mm512_ternarylogic_epi32(char_class, v_0000_000f, v_8080_8000, 0xea); \ - \ - const int 
valid_count = static_cast(count_ones(leading_bytes)); \ - const __m512i utf32 = expanded_utf8_to_utf32(char_class, input); \ - \ - const __m512i out = _mm512_mask_compress_epi32(_mm512_setzero_si512(), \ - leading_bytes, utf32); \ - \ - if (UTF32) { \ - if (MASKED) { \ - const __mmask16 valid = uint16_t((1 << valid_count) - 1); \ - _mm512_mask_storeu_epi32((__m512i *)output, valid, out); \ - } else { \ - _mm512_storeu_si512((__m512i *)output, out); \ - } \ - output += valid_count; \ - } else { \ - if (MASKED) { \ - output += utf32_to_utf16_masked( \ - byteflip, out, valid_count, reinterpret_cast(output)); \ - } else { \ - output += utf32_to_utf16( \ - byteflip, out, valid_count, reinterpret_cast(output)); \ - } \ - } \ - } +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} -#define SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(INPUT, VALID_COUNT, MASKED) \ - { \ - if (UTF32) { \ - if (MASKED) { \ - const __mmask16 valid_mask = uint16_t((1 << VALID_COUNT) - 1); \ - _mm512_mask_storeu_epi32((__m512i *)output, valid_mask, INPUT); \ - } else { \ - _mm512_storeu_si512((__m512i *)output, INPUT); \ - } \ - output += VALID_COUNT; \ - } else { \ - if (MASKED) { \ - output += utf32_to_utf16_masked( \ - byteflip, INPUT, VALID_COUNT, \ - reinterpret_cast(output)); \ - } else { \ - output += \ - utf32_to_utf16(byteflip, INPUT, VALID_COUNT, \ - reinterpret_cast(output)); \ - } \ - } \ - } +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); +} -#define SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) \ - if (UTF32) { \ - const __m128i t0 = _mm512_castsi512_si128(utf8); \ - const __m128i t1 = _mm512_extracti32x4_epi32(utf8, 1); \ - const __m128i t2 = _mm512_extracti32x4_epi32(utf8, 2); \ - const __m128i t3 = _mm512_extracti32x4_epi32(utf8, 3); \ - _mm512_storeu_si512((__m512i *)(output + 0 * 16), \ - _mm512_cvtepu8_epi32(t0)); \ - _mm512_storeu_si512((__m512i *)(output + 1 * 16), \ - _mm512_cvtepu8_epi32(t1)); \ - _mm512_storeu_si512((__m512i *)(output + 2 * 16), \ - _mm512_cvtepu8_epi32(t2)); \ - _mm512_storeu_si512((__m512i *)(output + 3 * 16), \ - _mm512_cvtepu8_epi32(t3)); \ - } else { \ - const __m256i h0 = _mm512_castsi512_si256(utf8); \ - const __m256i h1 = _mm512_extracti64x4_epi64(utf8, 1); \ - if (big_endian) { \ - _mm512_storeu_si512( \ - (__m512i *)(output + 0 * 16), \ - _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h0), byteflip)); \ - _mm512_storeu_si512( \ - (__m512i *)(output + 2 * 16), \ - _mm512_shuffle_epi8(_mm512_cvtepu8_epi16(h1), byteflip)); \ - } else { \ - _mm512_storeu_si512((__m512i *)(output + 0 * 16), \ - _mm512_cvtepu8_epi16(h0)); \ - _mm512_storeu_si512((__m512i *)(output + 2 * 16), \ - _mm512_cvtepu8_epi16(h1)); \ - } \ - } -/* end file src/icelake/icelake_macros.inl.cpp */ -/* begin file src/icelake/icelake_from_valid_utf8.inl.cpp */ -// file included directly +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, + utf16_output); +} -// File contains conversion procedure from VALID UTF-8 strings. 
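// Illustration (not part of the simdutf sources): the le/be wrapper pairs in
// this section differ only in their endianness template argument. What that
// argument controls, for a single UTF-16 code unit (hypothetical helper):
#include <cstdint>

static void store_code_unit_utf16be(uint16_t w, unsigned char *out) {
  out[0] = (unsigned char)(w >> 8);   // big endian: high byte first
  out[1] = (unsigned char)(w & 0xFF); // then the low byte
}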
+simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} -/* - valid_utf8_to_fixed_length converts a valid UTF-8 string into UTF-32. +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} - The `OUTPUT` template type decides what to do with UTF-32: store - it directly or convert into UTF-16 (with AVX512). +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} - Input: - - str - valid UTF-8 string - - len - string length - - out_buffer - output buffer +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} - Result: - - pair.first - the first unprocessed input byte - - pair.second - the first unprocessed output word -*/ -template -std::pair -valid_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) { - constexpr bool UTF32 = std::is_same::value; - constexpr bool UTF16 = std::is_same::value; - static_assert( - UTF32 or UTF16, - "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); - static_assert(!(UTF32 and big_endian), - "we do not currently support big-endian UTF-32"); +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} - __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - const char *ptr = str; - const char *end = ptr + len; +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} - OUTPUT *output = dwords; - /** - * In the main loop, we consume 64 bytes per iteration, - * but we access 64 + 4 bytes. - * We check for ptr + 64 + 64 <= end because - * we want to be do maskless writes without overruns. 
- */ - while (end - ptr >= 64 + 4) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); - const __m512i v_80 = _mm512_set1_epi8(char(0x80)); - const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80); - if (ascii == 0) { - SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) - output += 64; - ptr += 64; - continue; +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + haswell::avx2_convert_utf16_to_latin1(buf, len, + latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if (valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32( - vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); - valid_count0 += valid_count1; - vec0 = expand_utf8_to_utf32(vec0); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - } else { - vec0 = expand_utf8_to_utf32(vec0); - vec1 = expand_utf8_to_utf32(vec1); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + haswell::avx2_convert_utf16_to_latin1(buf, len, + latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } - const __m512i lane3 = broadcast_epi128<3>(utf8); - int valid_count2; - __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); - uint32_t tmp1; - ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); - const __m512i lane4 = _mm512_set1_epi32(tmp1); - int valid_count3; - __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); - if (valid_count2 + valid_count3 <= 16) { - vec2 = _mm512_mask_expand_epi32( - vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3); - valid_count2 += valid_count3; - vec2 = expand_utf8_to_utf32(vec2); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + avx2_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if 
(scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; } else { - vec2 = expand_utf8_to_utf32(vec2); - vec3 = expand_utf8_to_utf32(vec3); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true) + ret.second += scalar_res.count; } - ptr += 4 * 16; } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} - if (end - ptr >= 64) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); - const __m512i v_80 = _mm512_set1_epi8(char(0x80)); - const __mmask64 ascii = _mm512_test_epi8_mask(utf8, v_80); - if (ascii == 0) { - SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) - output += 64; - ptr += 64; +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + avx2_convert_utf16_to_latin1_with_errors(buf, len, + latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; } else { - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if (valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32( - vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); - valid_count0 += valid_count1; - vec0 = expand_utf8_to_utf32(vec0); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - } else { - vec0 = expand_utf8_to_utf32(vec0); - vec1 = expand_utf8_to_utf32(vec1); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) - } - - const __m512i lane3 = broadcast_epi128<3>(utf8); - SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) - - ptr += 3 * 16; + ret.second += scalar_res.count; } } - return {ptr, output}; + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; } -using utf8_to_utf16_result = std::pair; -/* end file src/icelake/icelake_from_valid_utf8.inl.cpp */ -/* begin file src/icelake/icelake_utf8_validation.inl.cpp */ -// file included directly - -simdutf_really_inline __m512i check_special_cases(__m512i input, - const __m512i prev1) { - __m512i mask1 = _mm512_setr_epi64(0x0202020202020202, 0x4915012180808080, - 0x0202020202020202, 0x4915012180808080, - 0x0202020202020202, 0x4915012180808080, - 0x0202020202020202, 0x4915012180808080); - const __m512i v_0f = _mm512_set1_epi8(0x0f); - __m512i index1 = _mm512_and_si512(_mm512_srli_epi16(prev1, 4), v_0f); - - __m512i byte_1_high = _mm512_shuffle_epi8(mask1, index1); - __m512i mask2 = _mm512_setr_epi64(0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb, - 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb, - 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb, - 0xcbcbcb8b8383a3e7, 0xcbcbdbcbcbcbcbcb); - __m512i index2 = _mm512_and_si512(prev1, v_0f); - - __m512i byte_1_low = 
_mm512_shuffle_epi8(mask2, index2); - __m512i mask3 = - _mm512_setr_epi64(0x101010101010101, 0x1010101babaaee6, 0x101010101010101, - 0x1010101babaaee6, 0x101010101010101, 0x1010101babaaee6, - 0x101010101010101, 0x1010101babaaee6); - __m512i index3 = _mm512_and_si512(_mm512_srli_epi16(input, 4), v_0f); - __m512i byte_2_high = _mm512_shuffle_epi8(mask3, index3); - return _mm512_ternarylogic_epi64(byte_1_high, byte_1_low, byte_2_high, 128); +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function + return convert_utf16be_to_latin1(buf, len, latin1_output); } -simdutf_really_inline __m512i check_multibyte_lengths(const __m512i input, - const __m512i prev_input, - const __m512i sc) { - __m512i prev2 = prev<2>(input, prev_input); - __m512i prev3 = prev<3>(input, prev_input); - __m512i is_third_byte = _mm512_subs_epu8( - prev2, _mm512_set1_epi8(0b11100000u - 1)); // Only 111_____ will be > 0 - __m512i is_fourth_byte = _mm512_subs_epu8( - prev3, _mm512_set1_epi8(0b11110000u - 1)); // Only 1111____ will be > 0 - __m512i is_third_or_fourth_byte = - _mm512_or_si512(is_third_byte, is_fourth_byte); - const __m512i v_7f = _mm512_set1_epi8(char(0x7f)); - is_third_or_fourth_byte = _mm512_adds_epu8(v_7f, is_third_or_fourth_byte); - // We want to compute (is_third_or_fourth_byte AND v80) XOR sc. - const __m512i v_80 = _mm512_set1_epi8(char(0x80)); - return _mm512_ternarylogic_epi32(is_third_or_fourth_byte, v_80, sc, - 0b1101010); - //__m512i is_third_or_fourth_byte_mask = - //_mm512_and_si512(is_third_or_fourth_byte, v_80); return - // _mm512_xor_si512(is_third_or_fourth_byte_mask, sc); -} -// -// Return nonzero if there are incomplete multibyte characters at the end of the -// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. -// -simdutf_really_inline __m512i is_incomplete(const __m512i input) { - // If the previous input's last 3 bytes match this, they're too short (they - // ended at EOF): - // ... 1111____ 111_____ 11______ - __m512i max_value = _mm512_setr_epi64(0xffffffffffffffff, 0xffffffffffffffff, - 0xffffffffffffffff, 0xffffffffffffffff, - 0xffffffffffffffff, 0xffffffffffffffff, - 0xffffffffffffffff, 0xbfdfefffffffffff); - return _mm512_subs_epu8(input, max_value); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function + return convert_utf16le_to_latin1(buf, len, latin1_output); } -struct avx512_utf8_checker { - // If this is nonzero, there has been a UTF-8 error. 
- __m512i error{}; - - // The last input we received - __m512i prev_input_block{}; - // Whether the last input we received was incomplete (used for ASCII fast - // path) - __m512i prev_incomplete{}; +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + haswell::avx2_convert_utf16_to_utf8(buf, len, + utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const __m512i input, - const __m512i prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ - // lead bytes (2, 3, 4-byte leads become large positive numbers instead of - // small negative numbers) - __m512i prev1 = prev<1>(input, prev_input); - __m512i sc = check_special_cases(input, prev1); - this->error = _mm512_or_si512( - check_multibyte_lengths(input, prev_input, sc), this->error); +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + haswell::avx2_convert_utf16_to_utf8(buf, len, + utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; } + return saved_bytes; +} - // The only problem that can happen at EOF is that a multibyte character is - // too short or a byte value too large in the last bytes: check_special_cases - // only checks for bytes too large in the first of two bytes. - simdutf_really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an - // ASCII block can't possibly finish them. - this->error = _mm512_or_si512(this->error, this->prev_incomplete); +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + haswell::avx2_convert_utf16_to_utf8_with_errors( + buf, len, utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} - // returns true if ASCII. 
- simdutf_really_inline bool check_next_input(const __m512i input) { - const __m512i v_80 = _mm512_set1_epi8(char(0x80)); - const __mmask64 ascii = _mm512_test_epi8_mask(input, v_80); - if (ascii == 0) { - this->error = _mm512_or_si512(this->error, this->prev_incomplete); - return true; +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + haswell::avx2_convert_utf16_to_utf8_with_errors( + buf, len, utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; } else { - this->check_utf8_bytes(input, this->prev_input_block); - this->prev_incomplete = is_incomplete(input); - this->prev_input_block = input; - return false; + ret.second += scalar_res.count; } } - // do not forget to call check_eof! - simdutf_really_inline bool errors() const { - return _mm512_test_epi8_mask(this->error, this->error) != 0; - } -}; // struct avx512_utf8_checker -/* end file src/icelake/icelake_utf8_validation.inl.cpp */ -/* begin file src/icelake/icelake_from_utf8.inl.cpp */ -// file included directly + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} -// File contains conversion procedure from possibly invalid UTF-8 strings. +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); +} -/** - * Attempts to convert up to len 1-byte code units from in (in UTF-8 format) to - * out. - * Returns the position of the input and output after the processing is - * completed. Upon error, the output is set to null. 
- */ +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); +} -template -utf8_to_utf16_result -fast_avx512_convert_utf8_to_utf16(const char *in, size_t len, char16_t *out) { - const char *const final_in = in + len; - bool result = true; - while (result) { - if (final_in - in >= 64) { - result = process_block_utf8_to_utf16( - in, out, final_in - in); - } else if (in < final_in) { - result = process_block_utf8_to_utf16( - in, out, final_in - in); - } else { - break; - } +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + avx2_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; } - if (!result) { - out = nullptr; + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; } - return std::make_pair(in, out); + return saved_bytes; } -template -simdutf::result fast_avx512_convert_utf8_to_utf16_with_errors(const char *in, - size_t len, - char16_t *out) { - const char *const init_in = in; - const char16_t *const init_out = out; - const char *const final_in = in + len; - bool result = true; - while (result) { - if (final_in - in >= 64) { - result = process_block_utf8_to_utf16( - in, out, final_in - in); - } else if (in < final_in) { - result = process_block_utf8_to_utf16( - in, out, final_in - in); - } else { - break; - } +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + avx2_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; } - if (!result) { - size_t pos = size_t(in - init_in); - if (pos < len && (init_in[pos] & 0xc0) == 0x80 && pos >= 64) { - // We must check whether we are the fourth continuation byte - bool c1 = (init_in[pos - 1] & 0xc0) == 0x80; - bool c2 = (init_in[pos - 2] & 0xc0) == 0x80; - bool c3 = (init_in[pos - 3] & 0xc0) == 0x80; - if (c1 && c2 && c3) { - return {simdutf::TOO_LONG, pos}; - } + size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } - // rewind_and_convert_with_errors will seek a potential error from in - // onward, with the ability to go back up to in - init_in bytes, and read - // final_in - in bytes forward. - simdutf::result res = - scalar::utf8_to_utf16::rewind_and_convert_with_errors( - in - init_in, in, final_in - in, out); - res.count += (in - init_in); - return res; - } else { - return simdutf::result(error_code::SUCCESS, out - init_out); + saved_bytes += scalar_saved_bytes; } + return saved_bytes; } -template -// todo: replace with the utf-8 to utf-16 routine adapted to utf-32. This code -// is legacy. 
-std::pair -validating_utf8_to_fixed_length(const char *str, size_t len, OUTPUT *dwords) { - constexpr bool UTF32 = std::is_same::value; - constexpr bool UTF16 = std::is_same::value; - static_assert( - UTF32 or UTF16, - "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); - static_assert(!(UTF32 and big_endian), - "we do not currently support big-endian UTF-32"); - - const char *ptr = str; - const char *end = ptr + len; - __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - OUTPUT *output = dwords; - avx512_utf8_checker checker{}; - /** - * In the main loop, we consume 64 bytes per iteration, - * but we access 64 + 4 bytes. - * We use masked writes to avoid overruns, see - * https://github.com/simdutf/simdutf/issues/471 - */ - while (end - ptr >= 64 + 4) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); - if (checker.check_next_input(utf8)) { - SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) - output += 64; - ptr += 64; - continue; - } - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if (valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32( - vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); - valid_count0 += valid_count1; - vec0 = expand_utf8_to_utf32(vec0); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - } else { - vec0 = expand_utf8_to_utf32(vec0); - vec1 = expand_utf8_to_utf32(vec1); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) - } - const __m512i lane3 = broadcast_epi128<3>(utf8); - int valid_count2; - __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); - uint32_t tmp1; - ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); - const __m512i lane4 = _mm512_set1_epi32(tmp1); - int valid_count3; - __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); - if (valid_count2 + valid_count3 <= 16) { - vec2 = _mm512_mask_expand_epi32( - vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3); - valid_count2 += valid_count3; - vec2 = expand_utf8_to_utf32(vec2); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; } else { - vec2 = expand_utf8_to_utf32(vec2); - vec3 = expand_utf8_to_utf32(vec3); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true) + ret.second += scalar_res.count; } - ptr += 4 * 16; } - const char *validatedptr = ptr; // validated up to ptr - - // For the final pass, we validate 64 
bytes, but we only transcode - // 3*16 bytes, so we may end up double-validating 16 bytes. - if (end - ptr >= 64) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); - if (checker.check_next_input(utf8)) { - SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) - output += 64; - ptr += 64; - } else { - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if (valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32( - vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); - valid_count0 += valid_count1; - vec0 = expand_utf8_to_utf32(vec0); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - } else { - vec0 = expand_utf8_to_utf32(vec0); - vec1 = expand_utf8_to_utf32(vec1); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) - } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} - const __m512i lane3 = broadcast_epi128<3>(utf8); - SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + return convert_utf32_to_latin1(buf, len, latin1_output); +} - ptr += 3 * 16; +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; } - validatedptr += 4 * 16; } - if (end != validatedptr) { - const __m512i utf8 = - _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)), - (const __m512i *)validatedptr); - checker.check_next_input(utf8); + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + haswell::avx2_convert_utf16_to_utf32(buf, len, + utf32_output); + if (ret.first == nullptr) { + return 0; } - checker.check_eof(); - if (checker.errors()) { - return {ptr, nullptr}; // We found an error. + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; } - return {ptr, output}; + return saved_bytes; } -// Like validating_utf8_to_fixed_length but returns as soon as an error is -// identified todo: replace with the utf-8 to utf-16 routine adapted to utf-32. -// This code is legacy. 
-template -std::tuple -validating_utf8_to_fixed_length_with_constant_checks(const char *str, - size_t len, - OUTPUT *dwords) { - constexpr bool UTF32 = std::is_same::value; - constexpr bool UTF16 = std::is_same::value; - static_assert( - UTF32 or UTF16, - "output type has to be uint32_t (for UTF-32) or char16_t (for UTF-16)"); - static_assert(!(UTF32 and big_endian), - "we do not currently support big-endian UTF-32"); - - const char *ptr = str; - const char *end = ptr + len; - __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - OUTPUT *output = dwords; - avx512_utf8_checker checker{}; - /** - * In the main loop, we consume 64 bytes per iteration, - * but we access 64 + 4 bytes. - */ - while (end - ptr >= 4 + 64) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); - bool ascii = checker.check_next_input(utf8); - if (checker.errors()) { - return {ptr, output, false}; // We found an error. - } - if (ascii) { - SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) - output += 64; - ptr += 64; - continue; - } - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if (valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32( - vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); - valid_count0 += valid_count1; - vec0 = expand_utf8_to_utf32(vec0); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - } else { - vec0 = expand_utf8_to_utf32(vec0); - vec1 = expand_utf8_to_utf32(vec1); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) - } - const __m512i lane3 = broadcast_epi128<3>(utf8); - int valid_count2; - __m512i vec2 = expand_and_identify(lane2, lane3, valid_count2); - uint32_t tmp1; - ::memcpy(&tmp1, ptr + 64, sizeof(tmp1)); - const __m512i lane4 = _mm512_set1_epi32(tmp1); - int valid_count3; - __m512i vec3 = expand_and_identify(lane3, lane4, valid_count3); - if (valid_count2 + valid_count3 <= 16) { - vec2 = _mm512_mask_expand_epi32( - vec2, __mmask16(((1 << valid_count3) - 1) << valid_count2), vec3); - valid_count2 += valid_count3; - vec2 = expand_utf8_to_utf32(vec2); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) - } else { - vec2 = expand_utf8_to_utf32(vec2); - vec3 = expand_utf8_to_utf32(vec3); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec2, valid_count2, true) - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec3, valid_count3, true) +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + haswell::avx2_convert_utf16_to_utf32(buf, len, + utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } - ptr += 4 * 16; + saved_bytes += scalar_saved_bytes; } - const char *validatedptr = ptr; // validated up to ptr + return saved_bytes; +} - // For the final pass, we validate 64 bytes, but 
we only transcode - // 3*16 bytes, so we may end up double-validating 16 bytes. - if (end - ptr >= 64) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); - bool ascii = checker.check_next_input(utf8); - if (checker.errors()) { - return {ptr, output, false}; // We found an error. - } - if (ascii) { - SIMDUTF_ICELAKE_STORE_ASCII(UTF32, utf8, output) - output += 64; - ptr += 64; +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + haswell::avx2_convert_utf16_to_utf32_with_errors( + buf, len, utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; } else { - const __m512i lane0 = broadcast_epi128<0>(utf8); - const __m512i lane1 = broadcast_epi128<1>(utf8); - int valid_count0; - __m512i vec0 = expand_and_identify(lane0, lane1, valid_count0); - const __m512i lane2 = broadcast_epi128<2>(utf8); - int valid_count1; - __m512i vec1 = expand_and_identify(lane1, lane2, valid_count1); - if (valid_count0 + valid_count1 <= 16) { - vec0 = _mm512_mask_expand_epi32( - vec0, __mmask16(((1 << valid_count1) - 1) << valid_count0), vec1); - valid_count0 += valid_count1; - vec0 = expand_utf8_to_utf32(vec0); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - } else { - vec0 = expand_utf8_to_utf32(vec0); - vec1 = expand_utf8_to_utf32(vec1); - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec0, valid_count0, true) - SIMDUTF_ICELAKE_WRITE_UTF16_OR_UTF32(vec1, valid_count1, true) - } - - const __m512i lane3 = broadcast_epi128<3>(utf8); - SIMDUTF_ICELAKE_TRANSCODE16(lane2, lane3, true) + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written + return ret.first; +} - ptr += 3 * 16; +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + haswell::avx2_convert_utf16_to_utf32_with_errors( + buf, len, utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; } - validatedptr += 4 * 16; - } - if (end != validatedptr) { - const __m512i utf8 = - _mm512_maskz_loadu_epi8(~UINT64_C(0) >> (64 - (end - validatedptr)), - (const __m512i *)validatedptr); - checker.check_next_input(utf8); - } - checker.check_eof(); - if (checker.errors()) { - return {ptr, output, false}; // We found an error. 
} - return {ptr, output, true}; + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written + return ret.first; } -/* end file src/icelake/icelake_from_utf8.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */ -// file included directly -// File contains conversion procedure from possibly invalid UTF-8 strings. +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf32_to_utf8(buf, len, utf8_output); +} -template -simdutf_really_inline size_t process_block_from_utf8_to_latin1( - const char *buf, size_t len, char *latin_output, __m512i minus64, - __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) { - __mmask64 load_mask = - is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; - __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); - __mmask64 nonascii = _mm512_movepi8_mask(input); - if (nonascii == 0) { - if (*next_leading_ptr) { // If we ended with a leading byte, it is an error. - return 0; // Indicates error +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + avx2_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } - is_remaining - ? _mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input) - : _mm512_storeu_si512((__m512i *)latin_output, input); - return len; + saved_bytes += scalar_saved_bytes; } + return saved_bytes; +} - const __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + avx2_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); - __mmask64 invalid_leading_bytes = - _mm512_mask_cmpgt_epu8_mask(leading, highbits, one); +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + haswell::avx2_convert_utf32_to_utf16_with_errors( + buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} - if (invalid_leading_bytes) { - return 0; // Indicates 
error +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + haswell::avx2_convert_utf32_to_utf16_with_errors( + buf, len, utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} - __mmask64 leading_shift = (leading << 1) | *next_leading_ptr; +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} - if ((nonascii ^ leading) != leading_shift) { - return 0; // Indicates error - } +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} - const __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); - input = - _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16le_to_utf32(buf, len, utf32_output); +} - __mmask64 retain = ~leading & load_mask; - __m512i output = _mm512_maskz_compress_epi8(retain, input); - int64_t written_out = count_ones(retain); - if (written_out == 0) { - return 0; // Indicates error - } - *next_bit6_ptr = bit6 >> 63; - *next_leading_ptr = leading >> 63; +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); +} - __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out); +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + utf16::change_endianness_utf16(input, length, output); +} - _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output); +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); +} - return written_out; +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); } -size_t utf8_to_latin1_avx512(const char *&inbuf, size_t len, - char *&inlatin_output) { - const char *buf = inbuf; - char *latin_output = inlatin_output; - char *start = latin_output; - size_t pos = 0; - __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 
1100 0000 - __m512i one = _mm512_set1_epi8(1); - __mmask64 next_leading = 0; - __mmask64 next_bit6 = 0; +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} - while (pos + 64 <= len) { - size_t written = process_block_from_utf8_to_latin1( - buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6); - if (written == 0) { - inlatin_output = latin_output; - inbuf = buf + pos - next_leading; - return 0; // Indicates error at pos or after, or just before pos (too - // short error) - } - latin_output += written; - pos += 64; - } +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return count_utf8(buf, len); +} - if (pos < len) { - size_t remaining = len - pos; - size_t written = process_block_from_utf8_to_latin1( - buf + pos, remaining, latin_output, minus64, one, &next_leading, - &next_bit6); - if (written == 0) { - inbuf = buf + pos - next_leading; - inlatin_output = latin_output; - return 0; // Indicates error at pos or after, or just before pos (too - // short error) - } - latin_output += written; - } - if (next_leading) { - inbuf = buf + len - next_leading; - inlatin_output = latin_output; - return 0; // Indicates error at end of buffer - } - inlatin_output = latin_output; - inbuf += len; - return size_t(latin_output - start); +simdutf_warn_unused size_t +implementation::latin1_length_from_utf16(size_t length) const noexcept { + return scalar::utf16::latin1_length_from_utf16(length); } -/* end file src/icelake/icelake_convert_utf8_to_latin1.inl.cpp */ -/* begin file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */ -// file included directly -// File contains conversion procedure from valid UTF-8 strings. +simdutf_warn_unused size_t +implementation::latin1_length_from_utf32(size_t length) const noexcept { + return scalar::utf32::latin1_length_from_utf32(length); +} -template -simdutf_really_inline size_t process_valid_block_from_utf8_to_latin1( - const char *buf, size_t len, char *latin_output, __m512i minus64, - __m512i one, __mmask64 *next_leading_ptr, __mmask64 *next_bit6_ptr) { - __mmask64 load_mask = - is_remaining ? _bzhi_u64(~0ULL, (unsigned int)len) : ~0ULL; - __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)buf); - __mmask64 nonascii = _mm512_movepi8_mask(input); +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} - if (nonascii == 0) { - is_remaining - ? 
_mm512_mask_storeu_epi8((__m512i *)latin_output, load_mask, input) - : _mm512_storeu_si512((__m512i *)latin_output, input); - return len; - } +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); +} - __mmask64 leading = _mm512_cmpge_epu8_mask(input, minus64); +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} - __m512i highbits = _mm512_xor_si512(input, _mm512_set1_epi8(-62)); +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} - *next_leading_ptr = leading >> 63; +simdutf_warn_unused size_t +implementation::utf16_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf16_length_from_latin1(length); +} - __mmask64 bit6 = _mm512_cmpeq_epi8_mask(highbits, one); - input = - _mm512_mask_sub_epi8(input, (bit6 << 1) | *next_bit6_ptr, input, minus64); - *next_bit6_ptr = bit6 >> 63; +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} - __mmask64 retain = ~leading & load_mask; - __m512i output = _mm512_maskz_compress_epi8(retain, input); - int64_t written_out = count_ones(retain); - if (written_out == 0) { - return 0; // Indicates error +simdutf_warn_unused size_t +implementation::utf32_length_from_latin1(size_t length) const noexcept { + return scalar::latin1::utf32_length_from_latin1(length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t len) const noexcept { + const uint8_t *data = reinterpret_cast(input); + size_t answer = len / sizeof(__m256i) * sizeof(__m256i); + size_t i = 0; + if (answer >= 2048) { // long strings optimization + __m256i four_64bits = _mm256_setzero_si256(); + while (i + sizeof(__m256i) <= len) { + __m256i runner = _mm256_setzero_si256(); + // We can do up to 255 loops without overflow. 
+ size_t iterations = (len - i) / sizeof(__m256i); + if (iterations > 255) { + iterations = 255; + } + size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i); + for (; i + 4 * sizeof(__m256i) <= max_i; i += 4 * sizeof(__m256i)) { + __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i)); + __m256i input2 = + _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i))); + __m256i input3 = _mm256_loadu_si256( + (const __m256i *)(data + i + 2 * sizeof(__m256i))); + __m256i input4 = _mm256_loadu_si256( + (const __m256i *)(data + i + 3 * sizeof(__m256i))); + __m256i input12 = + _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1), + _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2)); + __m256i input23 = + _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3), + _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4)); + __m256i input1234 = _mm256_add_epi8(input12, input23); + runner = _mm256_sub_epi8(runner, input1234); + } + for (; i <= max_i; i += sizeof(__m256i)) { + __m256i input_256_chunk = + _mm256_loadu_si256((const __m256i *)(data + i)); + runner = _mm256_sub_epi8( + runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk)); + } + four_64bits = _mm256_add_epi64( + four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256())); + } + answer += _mm256_extract_epi64(four_64bits, 0) + + _mm256_extract_epi64(four_64bits, 1) + + _mm256_extract_epi64(four_64bits, 2) + + _mm256_extract_epi64(four_64bits, 3); + } else if (answer > 0) { + for (; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) { + __m256i latin = _mm256_loadu_si256((const __m256i *)(data + i)); + uint32_t non_ascii = _mm256_movemask_epi8(latin); + answer += count_ones(non_ascii); + } } - __mmask64 store_mask = ~UINT64_C(0) >> (64 - written_out); - // Optimization opportunity: sometimes, masked writes are not needed. - _mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output); - return written_out; + return answer + scalar::latin1::utf8_length_from_latin1( + reinterpret_cast(data + i), len - i); } -size_t valid_utf8_to_latin1_avx512(const char *buf, size_t len, - char *latin_output) { - char *start = latin_output; +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80); + const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); size_t pos = 0; - __m512i minus64 = _mm512_set1_epi8(-64); // 11111111111 ... 
1100 0000 - __m512i one = _mm512_set1_epi8(1); - __mmask64 next_leading = 0; - __mmask64 next_bit6 = 0; + size_t count = 0; + for (; pos + 8 <= length; pos += 8) { + __m256i in = _mm256_loadu_si256((__m256i *)(input + pos)); + const __m256i ascii_bytes_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000); + const __m256i one_two_bytes_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000); + const __m256i two_bytes_bytemask = + _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask); + const __m256i one_two_three_bytes_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const __m256i three_bytes_bytemask = + _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask); + const uint32_t ascii_bytes_bitmask = + static_cast(_mm256_movemask_epi8(ascii_bytes_bytemask)); + const uint32_t two_bytes_bitmask = + static_cast(_mm256_movemask_epi8(two_bytes_bytemask)); + const uint32_t three_bytes_bitmask = + static_cast(_mm256_movemask_epi8(three_bytes_bytemask)); - while (pos + 64 <= len) { - size_t written = process_valid_block_from_utf8_to_latin1( - buf + pos, 64, latin_output, minus64, one, &next_leading, &next_bit6); - latin_output += written; - pos += 64; + size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; + size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4; + size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4; + count += 32 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; } + return count + + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); +} - if (pos < len) { - size_t remaining = len - pos; - size_t written = process_valid_block_from_utf8_to_latin1( - buf + pos, remaining, latin_output, minus64, one, &next_leading, - &next_bit6); - latin_output += written; +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const __m256i v_00000000 = _mm256_setzero_si256(); + const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + size_t pos = 0; + size_t count = 0; + for (; pos + 8 <= length; pos += 8) { + __m256i in = _mm256_loadu_si256((__m256i *)(input + pos)); + const __m256i surrogate_bytemask = + _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + const uint32_t surrogate_bitmask = + static_cast(_mm256_movemask_epi8(surrogate_bytemask)); + size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4; + count += 8 + surrogate_count; } + return count + + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); +} - return (size_t)(latin_output - start); +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); } -/* end file src/icelake/icelake_convert_valid_utf8_to_latin1.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */ -// file included directly -template -size_t icelake_convert_utf16_to_latin1(const char16_t *buf, size_t len, - char *latin1_output) { - const char16_t *end = buf + len; - __m512i v_0xFF = _mm512_set1_epi16(0xff); - __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - __m512i shufmask = _mm512_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 
50, 48, 46, 44, 42, 40, 38, - 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); - while (end - buf >= 32) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - if (big_endian) { - in = _mm512_shuffle_epi8(in, byteflip); - } - if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { - return 0; - } - _mm256_storeu_si256( - (__m256i *)latin1_output, - _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); - latin1_output += 32; - buf += 32; - } - if (buf < end) { - uint32_t mask(uint32_t(1 << (end - buf)) - 1); - __m512i in = _mm512_maskz_loadu_epi16(mask, buf); - if (big_endian) { - in = _mm512_shuffle_epi8(in, byteflip); - } - if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { - return 0; - } - _mm256_mask_storeu_epi8( - latin1_output, mask, - _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); - } - return len; + +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char16_t *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); } -template -std::pair -icelake_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, - char *latin1_output) { - const char16_t *end = buf + len; - const char16_t *start = buf; - __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - __m512i v_0xFF = _mm512_set1_epi16(0xff); - __m512i shufmask = _mm512_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, - 36, 34, 32, 30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); - while (end - buf >= 32) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - if (big_endian) { - in = _mm512_shuffle_epi8(in, byteflip); - } - if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { - uint16_t word; - while ((word = (big_endian ? 
scalar::utf16::swap_bytes(uint16_t(*buf)) - : uint16_t(*buf))) <= 0xff) { - *latin1_output++ = uint8_t(word); - buf++; - } - return std::make_pair(result(error_code::TOO_LARGE, buf - start), - latin1_output); - } - _mm256_storeu_si256( - (__m256i *)latin1_output, - _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); - latin1_output += 32; - buf += 32; - } - if (buf < end) { - uint32_t mask(uint32_t(1 << (end - buf)) - 1); - __m512i in = _mm512_maskz_loadu_epi16(mask, buf); - if (big_endian) { - in = _mm512_shuffle_epi8(in, byteflip); - } - if (_mm512_cmpgt_epu16_mask(in, v_0xFF)) { +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); +} - uint16_t word; - while ((word = (big_endian ? scalar::utf16::swap_bytes(uint16_t(*buf)) - : uint16_t(*buf))) <= 0xff) { - *latin1_output++ = uint8_t(word); - buf++; - } - return std::make_pair(result(error_code::TOO_LARGE, buf - start), - latin1_output); - } - _mm256_mask_storeu_epi8( - latin1_output, mask, - _mm512_castsi512_si256(_mm512_permutexvar_epi8(shufmask, in))); +simdutf_warn_unused size_t implementation::base64_length_from_binary( + size_t length, base64_options options) const noexcept { + return scalar::base64::base64_length_from_binary(length, options); +} + +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + if (options & base64_url) { + return encode_base64(output, input, length, options); + } else { + return encode_base64(output, input, length, options); } - return std::make_pair(result(error_code::SUCCESS, len), latin1_output); } -/* end file src/icelake/icelake_convert_utf16_to_latin1.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ -// file included directly +} // namespace haswell +} // namespace simdutf -/** - * This function converts the input (inbuf, inlen), assumed to be valid - * UTF16 (little endian) into UTF-8 (to outbuf). The number of code units - * written is written to 'outlen' and the function reports the number of input - * word consumed. - */ -template -size_t utf16_to_utf8_avx512i(const char16_t *inbuf, size_t inlen, - unsigned char *outbuf, size_t *outlen) { - __m512i in; - __mmask32 inmask = _cvtu32_mask32(0x7fffffff); - __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - const char16_t *const inbuf_orig = inbuf; - const unsigned char *const outbuf_orig = outbuf; - int adjust = 0; - int carry = 0; +/* begin file src/simdutf/haswell/end.h */ +#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL +// nothing needed. 
+#else +SIMDUTF_UNTARGET_REGION +#endif - while (inlen >= 32) { - in = _mm512_loadu_si512(inbuf); - if (big_endian) { - in = _mm512_shuffle_epi8(in, byteflip); - } - inlen -= 31; - lastiteration: - inbuf += 31; - failiteration: - const __mmask32 is234byte = _mm512_mask_cmp_epu16_mask( - inmask, in, _mm512_set1_epi16(0x0080), _MM_CMPINT_NLT); +#if SIMDUTF_GCC11ORMORE // workaround for + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 +SIMDUTF_POP_DISABLE_WARNINGS +#endif // end of workaround +/* end file src/simdutf/haswell/end.h */ +/* end file src/haswell/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_PPC64 +/* begin file src/ppc64/implementation.cpp */ - if (_ktestz_mask32_u8(inmask, is234byte)) { - // fast path for ASCII only - _mm512_mask_cvtepi16_storeu_epi8(outbuf, inmask, in); - outbuf += 31; - carry = 0; - if (inlen < 32) { - goto tail; - } else { - continue; - } - } - const __mmask32 is12byte = - _mm512_cmp_epu16_mask(in, _mm512_set1_epi16(0x0800), _MM_CMPINT_LT); - if (_ktestc_mask32_u8(is12byte, inmask)) { - // fast path for 1 and 2 byte only - const __m512i twobytes = _mm512_ternarylogic_epi32( - _mm512_slli_epi16(in, 8), _mm512_srli_epi16(in, 6), - _mm512_set1_epi16(0x3f3f), 0xa8); // (A|B)&C - in = _mm512_mask_add_epi16(in, is234byte, twobytes, - _mm512_set1_epi16(int16_t(0x80c0))); - const __m512i cmpmask = - _mm512_mask_blend_epi16(inmask, _mm512_set1_epi16(int16_t(0xffff)), - _mm512_set1_epi16(0x0800)); - const __mmask64 smoosh = - _mm512_cmp_epu8_mask(in, cmpmask, _MM_CMPINT_NLT); - const __m512i out = _mm512_maskz_compress_epi8(smoosh, in); - _mm512_mask_storeu_epi8(outbuf, - _cvtu64_mask64(_pext_u64(_cvtmask64_u64(smoosh), - _cvtmask64_u64(smoosh))), - out); - outbuf += 31 + _mm_popcnt_u32(_cvtmask32_u32(is234byte)); - carry = 0; +/* begin file src/simdutf/ppc64/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "ppc64" +// #define SIMDUTF_IMPLEMENTATION ppc64 +/* end file src/simdutf/ppc64/begin.h */ +namespace simdutf { +namespace ppc64 { +namespace { +#ifndef SIMDUTF_PPC64_H + #error "ppc64.h must be included" +#endif +using namespace simd; - if (inlen < 32) { - goto tail; - } else { - continue; - } - } - __m512i lo = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)); - __m512i hi = _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)); +simdutf_really_inline bool is_ascii(const simd8x64 &input) { + // careful: 0x80 is not ascii. + return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere(); +} - __m512i taglo = _mm512_set1_epi32(0x8080e000); - __m512i taghi = taglo; +simdutf_unused simdutf_really_inline simd8 +must_be_continuation(const simd8 prev1, const simd8 prev2, + const simd8 prev3) { + simd8 is_second_byte = + prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0 + simd8 is_third_byte = + prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0 + simd8 is_fourth_byte = + prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0 + // Caller requires a bool (all 1's). All values resulting from the subtraction + // will be <= 64, so signed comparison is fine. 
+ return simd8(is_second_byte | is_third_byte | is_fourth_byte) > + int8_t(0); +} - const __m512i fc00masked = - _mm512_and_epi32(in, _mm512_set1_epi16(int16_t(0xfc00))); - const __mmask32 hisurr = _mm512_mask_cmp_epu16_mask( - inmask, fc00masked, _mm512_set1_epi16(int16_t(0xd800)), _MM_CMPINT_EQ); - const __mmask32 losurr = _mm512_cmp_epu16_mask( - fc00masked, _mm512_set1_epi16(int16_t(0xdc00)), _MM_CMPINT_EQ); +simdutf_really_inline simd8 +must_be_2_3_continuation(const simd8 prev2, + const simd8 prev3) { + simd8 is_third_byte = + prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80 + simd8 is_fourth_byte = + prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80 + // Caller requires a bool (all 1's). All values resulting from the subtraction + // will be <= 64, so signed comparison is fine. + return simd8(is_third_byte | is_fourth_byte); +} - int carryout = 0; - if (!_kortestz_mask32_u8(hisurr, losurr)) { - // handle surrogates +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf - __m512i los = _mm512_alignr_epi32(hi, lo, 1); - __m512i his = _mm512_alignr_epi32(lo, hi, 1); +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace ppc64 { +namespace { - const __mmask32 hisurrhi = _kshiftri_mask32(hisurr, 16); - taglo = _mm512_mask_mov_epi32(taglo, __mmask16(hisurr), - _mm512_set1_epi32(0x808080f0)); - taghi = _mm512_mask_mov_epi32(taghi, __mmask16(hisurrhi), - _mm512_set1_epi32(0x808080f0)); +// Walks through a buffer in block-sized increments, loading the last part with +// spaces +template struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 + * (in which case this function fills the buffer with spaces and returns 0. In + * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder + * block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); - lo = _mm512_mask_slli_epi32(lo, __mmask16(hisurr), lo, 10); - hi = _mm512_mask_slli_epi32(hi, __mmask16(hisurrhi), hi, 10); - los = _mm512_add_epi32(los, _mm512_set1_epi32(0xfca02400)); - his = _mm512_add_epi32(his, _mm512_set1_epi32(0xfca02400)); - lo = _mm512_mask_add_epi32(lo, __mmask16(hisurr), lo, los); - hi = _mm512_mask_add_epi32(hi, __mmask16(hisurrhi), hi, his); +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; - carryout = _cvtu32_mask32(_kshiftri_mask32(hisurr, 30)); +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text_64(const uint8_t *text) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + buf[i] = int8_t(text[i]) < ' ' ? 
'_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} - const uint32_t h = _cvtmask32_u32(hisurr); - const uint32_t l = _cvtmask32_u32(losurr); - // check for mismatched surrogates - if ((h + h + carry) ^ l) { - const uint32_t lonohi = l & ~(h + h + carry); - const uint32_t hinolo = h & ~(l >> 1); - inlen = _tzcnt_u32(hinolo | lonohi); - inmask = __mmask32(0x7fffffff & ((1U << inlen) - 1)); - in = _mm512_maskz_mov_epi16(inmask, in); - adjust = (int)inlen - 31; - inlen = 0; - goto failiteration; - } +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text(const simd8x64 &in) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + if (buf[i] < ' ') { + buf[i] = '_'; } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; +} - hi = _mm512_maskz_mov_epi32(_cvtu32_mask16(0x7fff), hi); - carry = carryout; +simdutf_unused static char *format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i = 0; i < 64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; +} - __m512i mslo = - _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), lo); +template +simdutf_really_inline +buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) + : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, + idx{0} {} - __m512i mshi = - _mm512_multishift_epi64_epi8(_mm512_set1_epi64(0x20262c3200060c12), hi); +template +simdutf_really_inline size_t buf_block_reader::block_index() { + return idx; +} - const __mmask32 outmask = __mmask32(_kandn_mask64(losurr, inmask)); - const __mmask64 outmhi = _kshiftri_mask64(outmask, 16); +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; +} - const __mmask32 is1byte = __mmask32(_knot_mask64(is234byte)); - const __mmask64 is1bhi = _kshiftri_mask64(is1byte, 16); - const __mmask64 is12bhi = _kshiftri_mask64(is12byte, 16); +template +simdutf_really_inline const uint8_t * +buf_block_reader::full_block() const { + return &buf[idx]; +} - taglo = _mm512_mask_mov_epi32(taglo, __mmask16(is12byte), - _mm512_set1_epi32(0x80c00000)); - taghi = _mm512_mask_mov_epi32(taghi, __mmask16(is12bhi), - _mm512_set1_epi32(0x80c00000)); - __m512i magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), - _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); - __m512i magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), - _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); +template +simdutf_really_inline size_t +buf_block_reader::get_remainder(uint8_t *dst) const { + if (len == idx) { + return 0; + } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, + STEP_SIZE); // std::memset STEP_SIZE because it is more efficient + // to write out 8 or 16 bytes at once. 
+ std::memcpy(dst, buf + idx, len - idx); + return len - idx; +} - magiclo = _mm512_mask_blend_epi32(__mmask16(outmask), - _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); - magichi = _mm512_mask_blend_epi32(__mmask16(outmhi), - _mm512_set1_epi32(0xffffffff), - _mm512_set1_epi32(0x00010101)); +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; +} - mslo = _mm512_ternarylogic_epi32(mslo, _mm512_set1_epi32(0x3f3f3f3f), taglo, - 0xea); // A&B|C - mshi = _mm512_ternarylogic_epi32(mshi, _mm512_set1_epi32(0x3f3f3f3f), taghi, - 0xea); - mslo = _mm512_mask_slli_epi32(mslo, __mmask16(is1byte), lo, 24); +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_validation { - mshi = _mm512_mask_slli_epi32(mshi, __mmask16(is1bhi), hi, 24); +using namespace simd; - const __mmask64 wantlo = - _mm512_cmp_epu8_mask(mslo, magiclo, _MM_CMPINT_NLT); - const __mmask64 wanthi = - _mm512_cmp_epu8_mask(mshi, magichi, _MM_CMPINT_NLT); - const __m512i outlo = _mm512_maskz_compress_epi8(wantlo, mslo); - const __m512i outhi = _mm512_maskz_compress_epi8(wanthi, mshi); - const uint64_t wantlo_uint64 = _cvtmask64_u64(wantlo); - const uint64_t wanthi_uint64 = _cvtmask64_u64(wanthi); +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, + + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, + + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} + +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = {255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 0b11110000u - 1, + 0b11100000u - 1, + 0b11000000u - 1}; + const simd8 max_value( + &max_array[sizeof(max_array) - sizeof(simd8)]); + return input.gt_bits(max_value); +} - uint64_t advlo = _mm_popcnt_u64(wantlo_uint64); - uint64_t advhi = _mm_popcnt_u64(wanthi_uint64); +struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + simd8 prev_incomplete; - _mm512_mask_storeu_epi8( - outbuf, _cvtu64_mask64(_pext_u64(wantlo_uint64, wantlo_uint64)), outlo); - _mm512_mask_storeu_epi8( - outbuf + advlo, _cvtu64_mask64(_pext_u64(wanthi_uint64, wanthi_uint64)), - outhi); - outbuf += advlo + advhi; + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); } - outbuf += -adjust; -tail: - if (inlen != 0) { - // We must have inlen < 31. - inmask = _cvtu32_mask32((1U << inlen) - 1); - in = _mm512_maskz_loadu_epi16(inmask, inbuf); - if (big_endian) { - in = _mm512_shuffle_epi8(in, byteflip); - } - adjust = (int)inlen - 31; - inlen = 0; - goto lastiteration; + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. + this->error |= this->prev_incomplete; } - *outlen = (outbuf - outbuf_orig) + adjust; - return ((inbuf - inbuf_orig) + adjust); -} -/* end file src/icelake/icelake_convert_utf16_to_utf8.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */ -// file included directly -/* - Returns a pair: the first unprocessed byte from buf and utf32_output - A scalar routing should carry on the conversion of the tail. -*/ -template -std::tuple -convert_utf16_to_utf32(const char16_t *buf, size_t len, - char32_t *utf32_output) { - const char16_t *end = buf + len; - const __m512i v_fc00 = _mm512_set1_epi16((uint16_t)0xfc00); - const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); - const __m512i v_dc00 = _mm512_set1_epi16((uint16_t)0xdc00); - __mmask32 carry{0}; - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, - 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - while (std::distance(buf, end) >= 32) { - // Always safe because buf + 32 <= end so that end - buf >= 32 bytes: - __m512i in = _mm512_loadu_si512((__m512i *)buf); - if (big_endian) { - in = _mm512_shuffle_epi8(in, byteflip); + simdutf_really_inline void check_next_input(const simd8x64 &input) { + if (simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. 
+ static_assert((simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = + is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; } + } - // H - bitmask for high surrogates - const __mmask32 H = - _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_d800); - // H - bitmask for low surrogates - const __mmask32 L = - _mm512_cmpeq_epi16_mask(_mm512_and_si512(in, v_fc00), v_dc00); - - if ((H | L)) { - // surrogate pair(s) in a register - const __mmask32 V = - (L ^ - (carry | (H << 1))); // A high surrogate must be followed by low one - // and a low one must be preceded by a high one. - // If valid, V should be equal to 0 - - if (V == 0) { - // valid case - /* - Input surrogate pair: - |1101.11aa.aaaa.aaaa|1101.10bb.bbbb.bbbb| - low surrogate high surrogate - */ - /* 1. Expand all code units to 32-bit code units - in - |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb| - */ - const __m512i first = _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in)); - const __m512i second = - _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1)); - - /* 2. Shift by one 16-bit word to align low surrogates with high - surrogates in - |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0000.0000.0000.1101.10bb.bbbb.bbbb| - shifted - |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa| - */ - const __m512i shifted_first = _mm512_alignr_epi32(second, first, 1); - const __m512i shifted_second = - _mm512_alignr_epi32(_mm512_setzero_si512(), second, 1); + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - /* 3. Align all high surrogates in first and second by shifting to the - left by 10 bits - |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000| - */ - const __m512i aligned_first = - _mm512_mask_slli_epi32(first, (__mmask16)H, first, 10); - const __m512i aligned_second = - _mm512_mask_slli_epi32(second, (__mmask16)(H >> 16), second, 10); +}; // struct utf8_checker +} // namespace utf8_validation - /* 4. 
Remove surrogate prefixes and add offset 0x10000 by adding in, - shifted and constant in - |0000.0000.0000.0000.1101.11aa.aaaa.aaaa|0000.0011.0110.bbbb.bbbb.bb00.0000.0000| - shifted - |????.????.????.????.????.????.????.????|0000.0000.0000.0000.1101.11aa.aaaa.aaaa| - constant|1111.1100.1010.0000.0010.0100.0000.0000|1111.1100.1010.0000.0010.0100.0000.0000| - */ - const __m512i constant = _mm512_set1_epi32((uint32_t)0xfca02400); - const __m512i added_first = _mm512_mask_add_epi32( - aligned_first, (__mmask16)H, aligned_first, shifted_first); - const __m512i utf32_first = _mm512_mask_add_epi32( - added_first, (__mmask16)H, added_first, constant); +using utf8_validation::utf8_checker; - const __m512i added_second = - _mm512_mask_add_epi32(aligned_second, (__mmask16)(H >> 16), - aligned_second, shifted_second); - const __m512i utf32_second = _mm512_mask_add_epi32( - added_second, (__mmask16)(H >> 16), added_second, constant); +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_validation { - // 5. Store all valid UTF-32 code units (low surrogate positions and - // 32nd word are invalid) - const __mmask32 valid = ~L & 0x7fffffff; - // We deliberately do a _mm512_maskz_compress_epi32 followed by - // storeu_epi32 to ease performance portability to Zen 4. - const __m512i compressed_first = - _mm512_maskz_compress_epi32((__mmask16)(valid), utf32_first); - const size_t howmany1 = count_ones((uint16_t)(valid)); - _mm512_storeu_si512((__m512i *)utf32_output, compressed_first); - utf32_output += howmany1; - const __m512i compressed_second = - _mm512_maskz_compress_epi32((__mmask16)(valid >> 16), utf32_second); - const size_t howmany2 = count_ones((uint16_t)(valid >> 16)); - // The following could be unsafe in some cases? 
- //_mm512_storeu_epi32((__m512i *) utf32_output, compressed_second); - _mm512_mask_storeu_epi32((__m512i *)utf32_output, - __mmask16((1 << howmany2) - 1), - compressed_second); - utf32_output += howmany2; - // Only process 31 code units, but keep track if the 31st word is a high - // surrogate as a carry - buf += 31; - carry = (H >> 30) & 0x1; - } else { - // invalid case - return std::make_tuple(buf + carry, utf32_output, false); - } - } else { - // no surrogates - // extend all thirty-two 16-bit code units to thirty-two 32-bit code units - _mm512_storeu_si512((__m512i *)(utf32_output), - _mm512_cvtepu16_epi32(_mm512_castsi512_si256(in))); - _mm512_storeu_si512( - (__m512i *)(utf32_output) + 1, - _mm512_cvtepu16_epi32(_mm512_extracti32x8_epi32(in, 1))); - utf32_output += 32; - buf += 32; - carry = 0; - } - } // while - return std::make_tuple(buf + carry, utf32_output, true); -} -/* end file src/icelake/icelake_convert_utf16_to_utf32.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */ -// file included directly -size_t icelake_convert_utf32_to_latin1(const char32_t *buf, size_t len, - char *latin1_output) { - const char32_t *end = buf + len; - __m512i v_0xFF = _mm512_set1_epi32(0xff); - __m512i shufmask = _mm512_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, - 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0); - while (end - buf >= 16) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { - return 0; - } - _mm_storeu_si128( - (__m128i *)latin1_output, - _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); - latin1_output += 16; - buf += 16; - } - if (buf < end) { - uint16_t mask = uint16_t((1 << (end - buf)) - 1); - __m512i in = _mm512_maskz_loadu_epi32(mask, buf); - if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { - return 0; - } - _mm_mask_storeu_epi8( - latin1_output, mask, - _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); } - return len; + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); } -std::pair -icelake_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, - char *latin1_output) { - const char32_t *end = buf + len; - const char32_t *start = buf; - __m512i v_0xFF = _mm512_set1_epi32(0xff); - __m512i shufmask = _mm512_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, - 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0); - while (end - buf >= 16) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { - while (uint32_t(*buf) <= 0xff) { - *latin1_output++ = uint8_t(*buf++); - } - return std::make_pair(result(error_code::TOO_LARGE, buf - start), - latin1_output); +bool generic_validate_utf8(const char *input, size_t length) { + return generic_validate_utf8( + reinterpret_cast(input), length); +} + +/** + * Validates that the string is actual UTF-8 and stops on errors. 
+ */ +template +result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input + count), length - count); + res.count += count; + return res; } - _mm_storeu_si128( - (__m128i *)latin1_output, - _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); - latin1_output += 16; - buf += 16; + reader.advance(); + count += 64; } - if (buf < end) { - uint16_t mask = uint16_t((1 << (end - buf)) - 1); - __m512i in = _mm512_maskz_loadu_epi32(mask, buf); - if (_mm512_cmpgt_epu32_mask(in, v_0xFF)) { - while (uint32_t(*buf) <= 0xff) { - *latin1_output++ = uint8_t(*buf++); - } - return std::make_pair(result(error_code::TOO_LARGE, buf - start), - latin1_output); - } - _mm_mask_storeu_epi8( - latin1_output, mask, - _mm512_castsi512_si128(_mm512_permutexvar_epi8(shufmask, in))); + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); } - return std::make_pair(result(error_code::SUCCESS, len), latin1_output); } -/* end file src/icelake/icelake_convert_utf32_to_latin1.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ -// file included directly - -// Todo: currently, this is just the haswell code, optimize for icelake kernel. 
-std::pair -avx512_convert_utf32_to_utf8(const char32_t *buf, size_t len, - char *utf8_output) { - const char32_t *end = buf + len; - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); - const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); - const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); - const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); - const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); - __m256i running_max = _mm256_setzero_si256(); - __m256i forbidden_bytemask = _mm256_setzero_si256(); - - const size_t safety_margin = - 12; // to avoid overruns, see issue - // https://github.com/simdutf/simdutf/issues/92 - while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); - __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); - running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); +result generic_validate_utf8_with_errors(const char *input, size_t length) { + return generic_validate_utf8_with_errors( + reinterpret_cast(input), length); +} - // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned - // saturation - __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), - _mm256_and_si256(nextin, v_7fffffff)); - in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); +template +bool generic_validate_ascii(const uint8_t *input, size_t length) { + buf_block_reader<64> reader(input, length); + uint8_t blocks[64]{}; + simd::simd8x64 running_or(blocks); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + running_or |= in; + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + running_or |= in; + return running_or.is_ascii(); +} - // Try to apply UTF-16 => UTF-8 routine on 256 bits - // (haswell/avx2_convert_utf16_to_utf8.cpp) +bool generic_validate_ascii(const char *input, size_t length) { + return generic_validate_ascii( + reinterpret_cast(input), length); +} - if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16( - _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! +template +result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) { + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); } - // no bits set above 7th bit - const __m256i one_byte_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); - const uint32_t one_byte_bitmask = - static_cast(_mm256_movemask_epi8(one_byte_bytemask)); - - // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = - static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { - // 1. 
prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in_16, 2); - // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in_16, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); + reader.advance(); - // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = - _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } +} - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes +result generic_validate_ascii_with_errors(const char *input, size_t length) { + return generic_validate_ascii_with_errors( + reinterpret_cast(input), length); +} - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t *row_2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> - 16)][0]; +} // namespace utf8_validation +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ +// transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ - const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf16 { +using namespace simd; - const __m256i utf8_packed = _mm256_shuffle_epi8( - utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); - // 5. 
store bytes - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_extractf128_si256(utf8_packed, 1)); - utf8_output += row_2[0]; +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ - // 6. adjust pointers - buf += 16; - continue; - } - // Must check for overflow in packing - const __m256i saturation_bytemask = _mm256_cmpeq_epi32( - _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = - static_cast(_mm256_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffffffff) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); - forbidden_bytemask = _mm256_or_si256( - forbidden_bytemask, - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, - const __m256i dup_even = _mm256_setr_epi16( - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two - UTF-8 bytes - 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - - three UTF-8 bytes + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in_16, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, - simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); -#undef simdutf_vec + template + simdutf_really_inline size_t convert(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. 
+ while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert( + in + pos, size - pos, utf16_output); + if (howmany == 0) { + return 0; + } + utf16_output += howmany; + } + return utf16_output - start; + } - // 4. expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + template + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. 
+ result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be - // useful. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. 
- const __m256i shuffle = - _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, - 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = - _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = - _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t *row0 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); - const __m128i utf8_0 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t *row1 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); - const __m128i utf8_1 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); +}; // struct utf8_checker +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t *row2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); - const __m128i utf8_2 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf16 { - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t *row3 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); - const __m128i utf8_3 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); +using namespace simd; - _mm_storeu_si128((__m128i *)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_1); - utf8_output += row1[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; - } else { - // case: at least one 32-bit word is larger than 0xFFFF <=> it will - // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem - // wasteful to use scalar code, but being efficient with SIMD may require - // large, non-trivial tables? 
- size_t forward = 15; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint32_t word = buf[k]; - if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) - *utf8_output++ = char(word); - } else if ((word & 0xFFFFF800) == 0) { // 2-byte - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if ((word & 0xFFFF0000) == 0) { // 3-byte - if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair(nullptr, utf8_output); - } - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { // 4-byte - if (word > 0x10FFFF) { - return std::make_pair(nullptr, utf8_output); - } - *utf8_output++ = char((word >> 18) | 0b11110000); - *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } +template +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char16_t *utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the + // generic directory. + size_t pos = 0; + char16_t *start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the + // mask far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow + // path. Anything that is not a continuation mask is a 'leading byte', + // that is, the start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* + // of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). 
+ size_t consumed = convert_masked_utf8_to_utf16( + input + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; } - buf += k; + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. } - } // while - - // check for invalid input - const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); - if (static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32( - _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { - return std::make_pair(nullptr, utf8_output); } - - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { - return std::make_pair(nullptr, utf8_output); - } - - return std::make_pair(buf, utf8_output); + utf16_output += scalar::utf8_to_utf16::convert_valid( + input + pos, size - pos, utf16_output); + return utf16_output - start; } -// Todo: currently, this is just the haswell code, optimize for icelake kernel. -std::pair -avx512_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, - char *utf8_output) { - const char32_t *end = buf + len; - const char32_t *start = buf; +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +// transcoding from UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); - const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); - const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); - const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); - const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); - const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf32 { +using namespace simd; - const size_t safety_margin = - 12; // to avoid overruns, see issue - // https://github.com/simdutf/simdutf/issues/92 +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ - while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); - __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); - // Check for too large input 
- const __m256i max_input = - _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); - if (static_cast(_mm256_movemask_epi8( - _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { - return std::make_pair(result(error_code::TOO_LARGE, buf - start), - utf8_output); - } + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, - // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned - // saturation - __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), - _mm256_and_si256(nextin, v_7fffffff)); - in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, - // Try to apply UTF-16 => UTF-8 routine on 256 bits - // (haswell/avx2_convert_utf16_to_utf8.cpp) + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16( - _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! - } - // no bits set above 7th bit - const __m256i one_byte_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); - const uint32_t one_byte_bitmask = - static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = - static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { - // 1. 
prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} - // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in_16, 2); - // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in_16, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; - // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = - _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes + simdutf_really_inline size_t convert(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. 
+ static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // we have an error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if (howmany == 0) { + return 0; + } + utf32_output += howmany; + } + return utf32_output - start; + } - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t *row_2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> - 16)][0]; + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! 
+ while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if (pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } + } + return result(error_code::SUCCESS, utf32_output - start); + } - const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - const __m256i utf8_packed = _mm256_shuffle_epi8( - utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); - // 5. 
store bytes - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_extractf128_si256(utf8_packed, 1)); - utf8_output += row_2[0]; +}; // struct utf8_checker +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ - // 6. adjust pointers - buf += 16; - continue; - } - // Must check for overflow in packing - const __m256i saturation_bytemask = _mm256_cmpeq_epi32( - _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = - static_cast(_mm256_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffffffff) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8_to_utf32 { - // Check for illegal surrogate code units - const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); - const __m256i forbidden_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != - 0x0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), - utf8_output); +using namespace simd; + +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char32_t *utf32_output) noexcept { + size_t pos = 0; + char32_t *start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + size_t max_starting_point = (pos + 64) - 12; + while (pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32( + input + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, + utf32_output); + return utf32_output - start; +} - const __m256i dup_even = _mm256_setr_epi16( - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two - UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - - three UTF-8 bytes +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +// other functions +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf16 { - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. 
+template +simdutf_really_inline size_t count_code_points(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input.swap_bytes(); + } + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + + scalar::utf16::count_code_points(in + pos, size - pos); +} - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input.swap_bytes(); + } + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, + size_t size) { + return count_code_points(in, size); +} - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in_16, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, - simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); -#undef simdutf_vec +simdutf_really_inline void +change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { + size_t pos = 0; - // 4. 
expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + while (pos < size / 32 * 32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be - // useful. - /*if(mask == 0) { - // We only have three-byte code units. Use fast path. - const __m256i shuffle = - _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, - 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = - _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = - _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); - utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; - continue; - }*/ - const uint8_t mask0 = uint8_t(mask); - const uint8_t *row0 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); - const __m128i utf8_0 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} - const uint8_t mask1 = static_cast(mask >> 8); - const uint8_t *row1 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); - const __m128i utf8_1 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); +} // namespace utf16 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf16.h */ +/* begin file src/generic/utf8.h */ - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t *row2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); - const __m128i utf8_2 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); +namespace simdutf { +namespace ppc64 { +namespace { +namespace utf8 { - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t *row3 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); - const __m128i utf8_3 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); +using namespace simd; - _mm_storeu_si128((__m128i *)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_1); - utf8_output += row1[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; - } else { - // case: at least one 32-bit word is larger than 0xFFFF <=> it will - // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem - // wasteful to use scalar code, but being efficient with SIMD may require - // large, non-trivial tables? 
- size_t forward = 15; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint32_t word = buf[k]; - if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) - *utf8_output++ = char(word); - } else if ((word & 0xFFFFF800) == 0) { // 2-byte - *utf8_output++ = char((word >> 6) | 0b11000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if ((word & 0xFFFF0000) == 0) { // 3-byte - if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair( - result(error_code::SURROGATE, buf - start + k), utf8_output); - } - *utf8_output++ = char((word >> 12) | 0b11100000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { // 4-byte - if (word > 0x10FFFF) { - return std::make_pair( - result(error_code::TOO_LARGE, buf - start + k), utf8_output); - } - *utf8_output++ = char((word >> 18) | 0b11110000); - *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); - *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); - *utf8_output++ = char((word & 0b111111) | 0b10000000); - } - } - buf += k; - } - } // while +simdutf_really_inline size_t count_code_points(const char *in, size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); + } + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); +simdutf_really_inline size_t utf16_length_from_utf8(const char *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); } -/* end file src/icelake/icelake_convert_utf32_to_utf8.inl.cpp */ -/* begin file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ -// file included directly +} // namespace utf8 +} // unnamed namespace +} // namespace ppc64 +} // namespace simdutf +/* end file src/generic/utf8.h */ -// Todo: currently, this is just the haswell code, optimize for icelake kernel. -template -std::pair -avx512_convert_utf32_to_utf16(const char32_t *buf, size_t len, - char16_t *utf16_output) { - const char32_t *end = buf + len; +// +// Implementation-specific overrides +// +namespace simdutf { +namespace ppc64 { - const size_t safety_margin = - 12; // to avoid overruns, see issue - // https://github.com/simdutf/simdutf/issues/92 - __m256i forbidden_bytemask = _mm256_setzero_si256(); +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; + } + // todo: reimplement as a one-pass algorithm. 
+ int out = 0; + if (validate_utf8(input, length)) { + out |= encoding_type::UTF8; + } + if ((length % 2) == 0) { + if (validate_utf16(reinterpret_cast(input), length / 2)) { + out |= encoding_type::UTF16_LE; + } + } + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { + out |= encoding_type::UTF32_LE; + } + } - while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); + return out; +} - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_utf8(buf, len); +} - // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs - const __m256i saturation_bytemask = - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const uint32_t saturation_bitmask = - static_cast(_mm256_movemask_epi8(saturation_bytemask)); +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf, len); +} - if (saturation_bitmask == 0xffffffff) { - const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); - forbidden_bytemask = _mm256_or_si256( - forbidden_bytemask, - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_ascii(buf, len); +} - __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), - _mm256_extractf128_si256(in, 1)); - if (big_endian) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); - } - _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; - } else { - size_t forward = 7; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint32_t word = buf[k]; - if ((word & 0xFFFF0000) == 0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair(nullptr, utf16_output); - } - *utf16_output++ = - big_endian - ? 
char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) - : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { - return std::make_pair(nullptr, utf16_output); - } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = - uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = - uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; - } - } +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf, len); +} - // check for invalid input - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { - return std::make_pair(nullptr, utf16_output); - } +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + return scalar::utf16::validate(buf, len); +} - return std::make_pair(buf, utf16_output); +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + return scalar::utf16::validate(buf, len); } -// Todo: currently, this is just the haswell code, optimize for icelake kernel. -template -std::pair -avx512_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, - char16_t *utf16_output) { - const char32_t *start = buf; - const char32_t *end = buf + len; +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} - const size_t safety_margin = - 12; // to avoid overruns, see issue - // https://github.com/simdutf/simdutf/issues/92 +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + return scalar::utf16::validate_with_errors(buf, len); +} - while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + return scalar::utf32::validate_with_errors(buf, len); +} - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); +simdutf_warn_unused bool +implementation::validate_utf32(const char16_t *buf, size_t len) const noexcept { + return scalar::utf32::validate(buf, len); +} - // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs - const __m256i saturation_bytemask = - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const uint32_t saturation_bitmask = - static_cast(_mm256_movemask_epi8(saturation_bytemask)); +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char * /*buf*/, size_t /*len*/, + char16_t * /*utf16_output*/) const noexcept { + return 0; // stub +} - if (saturation_bitmask == 0xffffffff) { - const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); - const __m256i forbidden_bytemask = - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != - 0x0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), - 
utf16_output); - } +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char * /*buf*/, size_t /*len*/, + char16_t * /*utf16_output*/) const noexcept { + return 0; // stub +} - __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), - _mm256_extractf128_si256(in, 1)); - if (big_endian) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); - } - _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; - } else { - size_t forward = 7; - size_t k = 0; - if (size_t(end - buf) < forward + 1) { - forward = size_t(end - buf - 1); - } - for (; k < forward; k++) { - uint32_t word = buf[k]; - if ((word & 0xFFFF0000) == 0) { - // will not generate a surrogate pair - if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair( - result(error_code::SURROGATE, buf - start + k), utf16_output); - } - *utf16_output++ = - big_endian - ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) - : char16_t(word); - } else { - // will generate a surrogate pair - if (word > 0x10FFFF) { - return std::make_pair( - result(error_code::TOO_LARGE, buf - start + k), utf16_output); - } - word -= 0x10000; - uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); - uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { - high_surrogate = - uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = - uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); - } - *utf16_output++ = char16_t(high_surrogate); - *utf16_output++ = char16_t(low_surrogate); - } - } - buf += k; - } - } +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char * /*buf*/, size_t /*len*/, + char16_t * /*utf16_output*/) const noexcept { + return result(error_code::OTHER, 0); // stub +} - return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char * /*buf*/, size_t /*len*/, + char16_t * /*utf16_output*/) const noexcept { + return result(error_code::OTHER, 0); // stub } -/* end file src/icelake/icelake_convert_utf32_to_utf16.inl.cpp */ -/* begin file src/icelake/icelake_ascii_validation.inl.cpp */ -// file included directly -bool validate_ascii(const char *buf, size_t len) { - const char *end = buf + len; - const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); - __m512i running_or = _mm512_setzero_si512(); - for (; end - buf >= 64; buf += 64) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i *)buf); - running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, - 0xf8); // running_or | (utf8 & ascii) - } - if (buf < end) { - const __m512i utf8 = _mm512_maskz_loadu_epi8( - (uint64_t(1) << (end - buf)) - 1, (const __m512i *)buf); - running_or = _mm512_ternarylogic_epi32(running_or, utf8, ascii, - 0xf8); // running_or | (utf8 & ascii) - } - return (_mm512_test_epi8_mask(running_or, running_or) == 0); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char * /*buf*/, size_t /*len*/, + char16_t * /*utf16_output*/) const noexcept { + return 0; // stub } -/* end file src/icelake/icelake_ascii_validation.inl.cpp */ -/* begin file src/icelake/icelake_utf32_validation.inl.cpp */ -// file included directly -const char32_t *validate_utf32(const char32_t *buf, size_t len) { - if (len < 16) { - return buf; - } - const char32_t *end = buf + len - 16; 
+simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char * /*buf*/, size_t /*len*/, + char16_t * /*utf16_output*/) const noexcept { + return 0; // stub +} - const __m512i offset = _mm512_set1_epi32((uint32_t)0xffff2000); - __m512i currentmax = _mm512_setzero_si512(); - __m512i currentoffsetmax = _mm512_setzero_si512(); +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char * /*buf*/, size_t /*len*/, + char32_t * /*utf16_output*/) const noexcept { + return 0; // stub +} - while (buf <= end) { - __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf); - buf += 16; - currentoffsetmax = - _mm512_max_epu32(_mm512_add_epi32(utf32, offset), currentoffsetmax); - currentmax = _mm512_max_epu32(utf32, currentmax); - } +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char * /*buf*/, size_t /*len*/, + char32_t * /*utf16_output*/) const noexcept { + return result(error_code::OTHER, 0); // stub +} - const __m512i standardmax = _mm512_set1_epi32((uint32_t)0x10ffff); - const __m512i standardoffsetmax = _mm512_set1_epi32((uint32_t)0xfffff7ff); - __m512i is_zero = - _mm512_xor_si512(_mm512_max_epu32(currentmax, standardmax), standardmax); - if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) { - return nullptr; - } - is_zero = _mm512_xor_si512( - _mm512_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if (_mm512_test_epi8_mask(is_zero, is_zero) != 0) { - return nullptr; - } +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char * /*buf*/, size_t /*len*/, + char32_t * /*utf16_output*/) const noexcept { + return 0; // stub +} - return buf; +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, + utf8_output); } -/* end file src/icelake/icelake_utf32_validation.inl.cpp */ -/* begin file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */ -// file included directly -static inline size_t latin1_to_utf8_avx512_vec(__m512i input, size_t input_len, - char *utf8_output, - int mask_output) { - __mmask64 nonascii = _mm512_movepi8_mask(input); - size_t output_size = input_len + (size_t)count_ones(nonascii); +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +} - // Mask to denote whether the byte is a leading byte that is not ascii - __mmask64 sixth = _mm512_cmpge_epu8_mask( - input, _mm512_set1_epi8(-64)); // binary representation of -64: 1100 0000 +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors( + buf, len, utf8_output); +} - const uint64_t alternate_bits = UINT64_C(0x5555555555555555); - uint64_t ascii = ~nonascii; - // the bits in ascii are inverted and zeros are interspersed in between them - uint64_t maskA = ~_pdep_u64(ascii, alternate_bits); - uint64_t maskB = ~_pdep_u64(ascii >> 32, alternate_bits); +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_with_errors( + buf, len, utf8_output); +} - // interleave bytes from top and bottom halves (abcd...ABCD -> aAbBcCdD) - __m512i input_interleaved = 
_mm512_permutexvar_epi8( - _mm512_set_epi32(0x3f1f3e1e, 0x3d1d3c1c, 0x3b1b3a1a, 0x39193818, - 0x37173616, 0x35153414, 0x33133212, 0x31113010, - 0x2f0f2e0e, 0x2d0d2c0c, 0x2b0b2a0a, 0x29092808, - 0x27072606, 0x25052404, 0x23032202, 0x21012000), - input); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, + utf8_output); +} - // double size of each byte, and insert the leading byte 1100 0010 +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf16_to_utf8::convert_valid(buf, len, + utf8_output); +} - /* - upscale the bytes to 16-bit value, adding the 0b11000000 leading byte in the - process. We adjust for the bytes that have their two most significant bits. - This takes care of the first 32 bytes, assuming we interleaved the bytes. */ - __m512i outputA = - _mm512_shldi_epi16(input_interleaved, _mm512_set1_epi8(-62), 8); - outputA = _mm512_mask_add_epi16( - outputA, (__mmask32)sixth, outputA, - _mm512_set1_epi16(1 - 0x4000)); // 1- 0x4000 = 1100 0000 0000 0001???? +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert(buf, len, utf8_output); +} - // in the second 32-bit half, set first or second option based on whether - // original input is leading byte (second case) or not (first case) - __m512i leadingB = - _mm512_mask_blend_epi16((__mmask32)(sixth >> 32), - _mm512_set1_epi16(0x00c2), // 0000 0000 1101 0010 - _mm512_set1_epi16(0x40c3)); // 0100 0000 1100 0011 - __m512i outputB = _mm512_ternarylogic_epi32( - input_interleaved, leadingB, _mm512_set1_epi16((short)0xff00), - (240 & 170) ^ 204); // (input_interleaved & 0xff00) ^ leadingB +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output); +} - // prune redundant bytes - outputA = _mm512_maskz_compress_epi8(maskA, outputA); - outputB = _mm512_maskz_compress_epi8(maskB, outputB); +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output); +} - size_t output_sizeA = (size_t)count_ones((uint32_t)nonascii) + 32; +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, + utf16_output); +} - if (mask_output) { - if (input_len > 32) { // is the second half of the input vector used? 
- __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_sizeA); - _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA); - utf8_output += output_sizeA; - write_mask = _bzhi_u64(~0ULL, (unsigned int)(output_size - output_sizeA)); - _mm512_mask_storeu_epi8(utf8_output, write_mask, outputB); - } else { - __mmask64 write_mask = _bzhi_u64(~0ULL, (unsigned int)output_size); - _mm512_mask_storeu_epi8(utf8_output, write_mask, outputA); - } - } else { - _mm512_storeu_si512(utf8_output, outputA); - utf8_output += output_sizeA; - _mm512_storeu_si512(utf8_output, outputB); - } - return output_size; +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert(buf, len, + utf16_output); } -static inline size_t latin1_to_utf8_avx512_branch(__m512i input, - char *utf8_output) { - __mmask64 nonascii = _mm512_movepi8_mask(input); - if (nonascii) { - return latin1_to_utf8_avx512_vec(input, 64, utf8_output, 0); - } else { - _mm512_storeu_si512(utf8_output, input); - return 64; - } +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors( + buf, len, utf16_output); } -size_t latin1_to_utf8_avx512_start(const char *buf, size_t len, - char *utf8_output) { - char *start = utf8_output; - size_t pos = 0; - // if there's at least 128 bytes remaining, we don't need to mask the output - for (; pos + 128 <= len; pos += 64) { - __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos)); - utf8_output += latin1_to_utf8_avx512_branch(input, utf8_output); - } - // in the last 128 bytes, the first 64 may require masking the output - if (pos + 64 <= len) { - __m512i input = _mm512_loadu_si512((__m512i *)(buf + pos)); - utf8_output += latin1_to_utf8_avx512_vec(input, 64, utf8_output, 1); - pos += 64; - } - // with the last 64 bytes, the input also needs to be masked - if (pos < len) { - __mmask64 load_mask = _bzhi_u64(~0ULL, (unsigned int)(len - pos)); - __m512i input = _mm512_maskz_loadu_epi8(load_mask, (__m512i *)(buf + pos)); - utf8_output += latin1_to_utf8_avx512_vec(input, len - pos, utf8_output, 1); - } - return (size_t)(utf8_output - start); +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_with_errors( + buf, len, utf16_output); } -/* end file src/icelake/icelake_convert_latin1_to_utf8.inl.cpp */ -/* begin file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */ -// file included directly -template -size_t icelake_convert_latin1_to_utf16(const char *latin1_input, size_t len, - char16_t *utf16_output) { - size_t rounded_len = len & ~0x1F; // Round down to nearest multiple of 32 - __m512i byteflip = _mm512_setr_epi64(0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - for (size_t i = 0; i < rounded_len; i += 32) { - // Load 32 Latin1 characters into a 256-bit register - __m256i in = _mm256_loadu_si256((__m256i *)&latin1_input[i]); - // Zero extend each set of 8 Latin1 characters to 32 16-bit integers - __m512i out = _mm512_cvtepu8_epi16(in); - if (big_endian) { - out = _mm512_shuffle_epi8(out, byteflip); - } - // Store the results back to memory - 
_mm512_storeu_si512((__m512i *)&utf16_output[i], out); - } - if (rounded_len != len) { - uint32_t mask = uint32_t(1 << (len - rounded_len)) - 1; - __m256i in = _mm256_maskz_loadu_epi8(mask, latin1_input + rounded_len); +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid( + buf, len, utf16_output); +} - // Zero extend each set of 8 Latin1 characters to 32 16-bit integers - __m512i out = _mm512_cvtepu8_epi16(in); - if (big_endian) { - out = _mm512_shuffle_epi8(out, byteflip); - } - // Store the results back to memory - _mm512_mask_storeu_epi16(utf16_output + rounded_len, mask, out); - } +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return scalar::utf32_to_utf16::convert_valid(buf, len, + utf16_output); +} - return len; +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, + utf32_output); } -/* end file src/icelake/icelake_convert_latin1_to_utf16.inl.cpp */ -/* begin file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */ -std::pair -avx512_convert_latin1_to_utf32(const char *buf, size_t len, - char32_t *utf32_output) { - size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - for (size_t i = 0; i < rounded_len; i += 16) { - // Load 16 Latin1 characters into a 128-bit register - __m128i in = _mm_loadu_si128((__m128i *)&buf[i]); +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert(buf, len, + utf32_output); +} - // Zero extend each set of 8 Latin1 characters to 16 32-bit integers using - // vpmovzxbd - __m512i out = _mm512_cvtepu8_epi32(in); +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors( + buf, len, utf32_output); +} - // Store the results back to memory - _mm512_storeu_si512((__m512i *)&utf32_output[i], out); - } +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_with_errors( + buf, len, utf32_output); +} - // Return pointers pointing to where we left off - return std::make_pair(buf + rounded_len, utf32_output + rounded_len); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid( + buf, len, utf32_output); } -/* end file src/icelake/icelake_convert_latin1_to_utf32.inl.cpp */ -/* begin file src/icelake/icelake_base64.inl.cpp */ -// file included directly -/** - * References and further reading: - * - * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the - * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. - * https://arxiv.org/abs/1910.05109 - * - * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 - * Instructions, ACM Transactions on the Web 12 (3), 2018. - * https://arxiv.org/abs/1704.00605 - * - * Simon Josefsson. 2006. 
The Base16, Base32, and Base64 Data Encodings. - * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, - * Request for Comments: 4648. - * - * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. - * http://www.alfredklomp.com/programming/sse-base64/. (2014). - * - * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD - * acceleration. https://github.com/aklomp/base64. (2014). - * - * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). - * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ - * - * Nick Kopp. 2013. Base64 Encoding on a GPU. - * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). - */ -struct block64 { - __m512i chunks[1]; -}; +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return scalar::utf16_to_utf32::convert_valid(buf, len, + utf32_output); +} -template -size_t encode_base64(char *dst, const char *src, size_t srclen, - base64_options options) { - // credit: Wojciech Muła - const uint8_t *input = (const uint8_t *)src; +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + scalar::utf16::change_endianness_utf16(input, length, output); +} - uint8_t *out = (uint8_t *)dst; - static const char *lookup_tbl = - base64_url - ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" - : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +simdutf_warn_unused size_t implementation::count_utf16le( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); +} - const __m512i shuffle_input = _mm512_setr_epi32( - 0x01020001, 0x04050304, 0x07080607, 0x0a0b090a, 0x0d0e0c0d, 0x10110f10, - 0x13141213, 0x16171516, 0x191a1819, 0x1c1d1b1c, 0x1f201e1f, 0x22232122, - 0x25262425, 0x28292728, 0x2b2c2a2b, 0x2e2f2d2e); - const __m512i lookup = - _mm512_loadu_si512(reinterpret_cast(lookup_tbl)); - const __m512i multi_shifts = _mm512_set1_epi64(UINT64_C(0x3036242a1016040a)); - size_t size = srclen; - __mmask64 input_mask = 0xffffffffffff; // (1 << 48) - 1 - while (size >= 48) { - const __m512i v = _mm512_maskz_loadu_epi8( - input_mask, reinterpret_cast(input)); - const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v); - const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in); - const __m512i result = _mm512_permutexvar_epi8(indices, lookup); - _mm512_storeu_si512(reinterpret_cast<__m512i *>(out), result); - out += 64; - input += 48; - size -= 48; - } - input_mask = ((__mmask64)1 << size) - 1; - const __m512i v = _mm512_maskz_loadu_epi8( - input_mask, reinterpret_cast(input)); - const __m512i in = _mm512_permutexvar_epi8(shuffle_input, v); - const __m512i indices = _mm512_multishift_epi64_epi8(multi_shifts, in); - bool padding_needed = - (((options & base64_url) == 0) ^ - ((options & base64_reverse_padding) == base64_reverse_padding)); - size_t padding_amount = ((size % 3) > 0) ? (3 - (size % 3)) : 0; - size_t output_len = ((size + 2) / 3) * 4; - size_t non_padded_output_len = output_len - padding_amount; - if (!padding_needed) { - output_len = non_padded_output_len; - } - __mmask64 output_mask = output_len == 64 ? 
(__mmask64)UINT64_MAX - : ((__mmask64)1 << output_len) - 1; - __m512i result = _mm512_mask_permutexvar_epi8( - _mm512_set1_epi8('='), ((__mmask64)1 << non_padded_output_len) - 1, - indices, lookup); - _mm512_mask_storeu_epi8(reinterpret_cast<__m512i *>(out), output_mask, - result); - return (size_t)(out - (uint8_t *)dst) + output_len; +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::count_code_points(input, length); } -template -static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) { - __m512i input = b->chunks[0]; - const __m512i ascii_space_tbl = _mm512_set_epi8( - 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, - 9, 0, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, - 0, 0, 32, 0, 0, 13, 12, 0, 10, 9, 0, 0, 0, 0, 0, 0, 0, 0, 32); - __m512i lookup0; - if (base64_url) { - lookup0 = _mm512_set_epi8( - -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, - 52, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, - -128, -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -1); - } else { - lookup0 = _mm512_set_epi8( - -128, -128, -128, -128, -128, -128, 61, 60, 59, 58, 57, 56, 55, 54, 53, - 52, 63, -128, -128, -128, 62, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -1, -128, -128, -128, -128, -128, -128, -128, -128, - -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -1, -128, - -128, -1, -1, -128, -128, -128, -128, -128, -128, -128, -128, -128); - } - __m512i lookup1; - if (base64_url) { - lookup1 = _mm512_set_epi8( - -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, - 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, - 63, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, - 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); - } else { - lookup1 = _mm512_set_epi8( - -128, -128, -128, -128, -128, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, - 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, -128, - -128, -128, -128, -128, -128, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, - 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -128); - } +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); +} - const __m512i translated = _mm512_permutex2var_epi8(lookup0, input, lookup1); - const __m512i combined = _mm512_or_si512(translated, input); - const __mmask64 mask = _mm512_movepi8_mask(combined); - if (mask) { - const __mmask64 spaces = _mm512_cmpeq_epi8_mask( - _mm512_shuffle_epi8(ascii_space_tbl, input), input); - *error = (mask ^ spaces); - } - b->chunks[0] = translated; +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, + length); +} - return mask; +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf8_length_from_utf16(input, length); } -static inline void copy_block(block64 *b, char *output) { - _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), b->chunks[0]); +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const 
char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, + length); } -static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { - uint64_t nmask = ~mask; - __m512i c = _mm512_maskz_compress_epi8(nmask, b->chunks[0]); - _mm512_storeu_si512(reinterpret_cast<__m512i *>(output), c); - return _mm_popcnt_u64(nmask); +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return scalar::utf16::utf32_length_from_utf16(input, length); } -// The caller of this function is responsible to ensure that there are 64 bytes -// available from reading at src. The data is read into a block64 structure. -static inline void load_block(block64 *b, const char *src) { - b->chunks[0] = _mm512_loadu_si512(reinterpret_cast(src)); +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return scalar::utf8::utf16_length_from_utf8(input, length); } -// The caller of this function is responsible to ensure that there are 128 bytes -// available from reading at src. The data is read into a block64 structure. -static inline void load_block(block64 *b, const char16_t *src) { - __m512i m1 = _mm512_loadu_si512(reinterpret_cast(src)); - __m512i m2 = _mm512_loadu_si512(reinterpret_cast(src + 32)); - __m512i p = _mm512_packus_epi16(m1, m2); - b->chunks[0] = - _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7), p); +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return scalar::utf32::utf8_length_from_utf32(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + return scalar::utf32::utf16_length_from_utf32(input, length); } -static inline void base64_decode(char *out, __m512i str) { - const __m512i merge_ab_and_bc = - _mm512_maddubs_epi16(str, _mm512_set1_epi32(0x01400140)); - const __m512i merged = - _mm512_madd_epi16(merge_ab_and_bc, _mm512_set1_epi32(0x00011000)); - const __m512i pack = _mm512_set_epi8( - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 60, 61, 62, 56, 57, 58, - 52, 53, 54, 48, 49, 50, 44, 45, 46, 40, 41, 42, 36, 37, 38, 32, 33, 34, - 28, 29, 30, 24, 25, 26, 20, 21, 22, 16, 17, 18, 12, 13, 14, 8, 9, 10, 4, - 5, 6, 0, 1, 2); - const __m512i shuffled = _mm512_permutexvar_epi8(pack, merged); - _mm512_mask_storeu_epi8( - (__m512i *)out, 0xffffffffffff, - shuffled); // mask would be 0xffffffffffff since we write 48 bytes. -} -// decode 64 bytes and output 48 bytes -static inline void base64_decode_block(char *out, const char *src) { - base64_decode(out, - _mm512_loadu_si512(reinterpret_cast(src))); +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return scalar::utf8::count_code_points(input, length); } -static inline void base64_decode_block(char *out, block64 *b) { - base64_decode(out, b->chunks[0]); + +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); } -template -full_result -compress_decode_base64(char *dst, const chartype *src, size_t srclen, - base64_options options, - last_chunk_handling_options last_chunk_options) { - const uint8_t *to_base64 = base64_url ? 
tables::base64::to_base64_url_value - : tables::base64::to_base64_value; - size_t equallocation = - srclen; // location of the first padding character if any - size_t equalsigns = 0; +simdutf_warn_unused result implementation::base64_to_binary( + const char *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { // skip trailing spaces - while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && - to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; } - if (srclen > 0 && src[srclen - 1] == '=') { - equallocation = srclen - 1; - srclen--; - equalsigns = 1; - // skip trailing spaces - while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && - to_base64[uint8_t(src[srclen - 1])] == 64) { - srclen--; + size_t equallocation = + length; // location of the first padding character if any + size_t equalsigns = 0; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + length -= 1; + equalsigns++; + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; } - if (srclen > 0 && src[srclen - 1] == '=') { - equallocation = srclen - 1; - srclen--; - equalsigns = 2; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + equalsigns++; + length -= 1; } } - if (srclen == 0) { + if (length == 0) { if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation, 0}; + return {INVALID_BASE64_CHARACTER, equallocation}; } - return {SUCCESS, 0, 0}; + return {SUCCESS, 0}; } - const chartype *const srcinit = src; - const char *const dstinit = dst; - const chartype *const srcend = src + srclen; - - // figure out why block_size == 2 is sometimes best??? - constexpr size_t block_size = 6; - char buffer[block_size * 64]; - char *bufferptr = buffer; - if (srclen >= 64) { - const chartype *const srcend64 = src + srclen - 64; - while (src <= srcend64) { - block64 b; - load_block(&b, src); - src += 64; - uint64_t error = 0; - uint64_t badcharmask = to_base64_mask(&b, &error); - if (error) { - src -= 64; - size_t error_offset = _tzcnt_u64(error); - return {error_code::INVALID_BASE64_CHARACTER, - size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; - } - if (badcharmask != 0) { - // optimization opportunity: check for simple masks like those made of - // continuous 1s followed by continuous 0s. And masks containing a - // single bad character. 
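// For orientation, here is the scalar core that the vectorized kernel above
// reproduces, as a minimal sketch (illustrative only; the helper name is
// hypothetical and not part of simdutf): four 6-bit values, already mapped
// through the base64 table and with whitespace compressed away, are packed
// into a 24-bit word and emitted as three bytes.
static inline void scalar_decode_base64_quad(const uint8_t sextets[4],
                                             uint8_t out[3]) {
  const uint32_t triple = (uint32_t(sextets[0]) << 18) |
                          (uint32_t(sextets[1]) << 12) |
                          (uint32_t(sextets[2]) << 6) |
                          (uint32_t(sextets[3]) << 0);
  out[0] = uint8_t(triple >> 16); // first output byte: bits 23..16
  out[1] = uint8_t(triple >> 8);  // second output byte: bits 15..8
  out[2] = uint8_t(triple >> 0);  // third output byte: bits 7..0
}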
- bufferptr += compress_block(&b, badcharmask, bufferptr); - } else if (bufferptr != buffer) { - copy_block(&b, bufferptr); - bufferptr += 64; - } else { - base64_decode_block(dst, &b); - dst += 48; - } - if (bufferptr >= (block_size - 1) * 64 + buffer) { - for (size_t i = 0; i < (block_size - 1); i++) { - base64_decode_block(dst, buffer + i * 64); - dst += 48; - } - std::memcpy(buffer, buffer + (block_size - 1) * 64, - 64); // 64 might be too much - bufferptr -= (block_size - 1) * 64; - } + result r = scalar::base64::base64_tail_decode( + output, input, length, equalsigns, options, last_chunk_options); + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation}; } } + return r; +} - char *buffer_start = buffer; - // Optimization note: if this is almost full, then it is worth our - // time, otherwise, we should just decode directly. - int last_block = (int)((bufferptr - buffer_start) % 64); - if (last_block != 0 && srcend - src + last_block >= 64) { - - while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { - uint8_t val = to_base64[uint8_t(*src)]; - *bufferptr = char(val); - if (!scalar::base64::is_eight_byte(*src) || val > 64) { - return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), - size_t(dst - dstinit)}; - } - bufferptr += (val <= 63); - src++; - } - } +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char16_t *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} - for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { - base64_decode_block(dst, buffer_start); - dst += 48; +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + // skip trailing spaces + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; } - if ((bufferptr - buffer_start) % 64 != 0) { - while (buffer_start + 4 < bufferptr) { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 4); - dst += 3; - buffer_start += 4; - } - if (buffer_start + 4 <= bufferptr) { - uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + - (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + - (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + - (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) - << 8; - triple = scalar::utf32::swap_bytes(triple); - std::memcpy(dst, &triple, 3); - dst += 3; - buffer_start += 4; + size_t equallocation = + length; // location of the first padding character if any + size_t equalsigns = 0; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + length -= 1; + equalsigns++; + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; } - // we may have 1, 2 or 3 bytes left and we need to decode them so let us - // backtrack - int leftover = int(bufferptr - buffer_start); - while (leftover > 0) { - while (to_base64[uint8_t(*(src - 1))] == 64) { - src--; - } - src--; - leftover--; + if (length > 0 && 
input[length - 1] == '=') { + equallocation = length - 1; + equalsigns++; + length -= 1; } } - if (src < srcend + equalsigns) { - full_result r = scalar::base64::base64_tail_decode( - dst, src, srcend - src, equalsigns, options, last_chunk_options); - r.input_count += size_t(src - srcinit); - if (r.error == error_code::INVALID_BASE64_CHARACTER || - r.error == error_code::BASE64_EXTRA_BITS) { - return r; - } else { - r.output_count += size_t(dst - dstinit); - } - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.output_count % 3 == 0) || - ((r.output_count % 3) + 1 + equalsigns != 4)) { - r.error = error_code::INVALID_BASE64_CHARACTER; - r.input_count = equallocation; - } + if (length == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation}; } - return r; + return {SUCCESS, 0}; } - if (equalsigns > 0) { - if ((size_t(dst - dstinit) % 3 == 0) || - ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; + result r = scalar::base64::base64_tail_decode( + output, input, length, equalsigns, options, last_chunk_options); + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation}; } } - return {SUCCESS, srclen, size_t(dst - dstinit)}; + return r; } -/* end file src/icelake/icelake_base64.inl.cpp */ -#include +simdutf_warn_unused size_t implementation::base64_length_from_binary( + size_t length, base64_options options) const noexcept { + return scalar::base64::base64_length_from_binary(length, options); +} -} // namespace -} // namespace icelake +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + return scalar::base64::binary_to_base64(input, length, output, options); +} +} // namespace ppc64 } // namespace simdutf +/* begin file src/simdutf/ppc64/end.h */ +/* end file src/simdutf/ppc64/end.h */ +/* end file src/ppc64/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_RVV +/* begin file src/rvv/implementation.cpp */ + + + + + +/* begin file src/simdutf/rvv/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "rvv" +// #define SIMDUTF_IMPLEMENTATION rvv + +#if SIMDUTF_CAN_ALWAYS_RUN_RVV +// nothing needed. +#else +SIMDUTF_TARGET_RVV +#endif +/* end file src/simdutf/rvv/begin.h */ namespace simdutf { -namespace icelake { +namespace rvv { +namespace { +#ifndef SIMDUTF_RVV_H + #error "rvv.h must be included" +#endif -simdutf_warn_unused int -implementation::detect_encodings(const char *input, - size_t length) const noexcept { - // If there is a BOM, then we trust it. 
- auto bom_encoding = simdutf::BOM::check_bom(input, length); - // todo: convert to a one-pass algorithm - if (bom_encoding != encoding_type::unspecified) { - return bom_encoding; +} // unnamed namespace +} // namespace rvv +} // namespace simdutf + +// +// Implementation-specific overrides +// +namespace simdutf { +namespace rvv { +/* begin file src/rvv/rvv_helpers.inl.cpp */ +template +simdutf_really_inline static size_t +rvv_utf32_store_utf16_m4(uint16_t *dst, vuint32m4_t utf32, size_t vl, + vbool4_t m4even) { + /* convert [000000000000aaaa|aaaaaabbbbbbbbbb] + * to [110111bbbbbbbbbb|110110aaaaaaaaaa] */ + vuint32m4_t sur = __riscv_vsub_vx_u32m4(utf32, 0x10000, vl); + sur = __riscv_vor_vv_u32m4(__riscv_vsll_vx_u32m4(sur, 16, vl), + __riscv_vsrl_vx_u32m4(sur, 10, vl), vl); + sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vl); + sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vl); + /* merge 1 byte utf32 and 2 byte sur */ + vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(utf32, 0xFFFF, vl); + vuint16m4_t utf32_16 = __riscv_vreinterpret_v_u32m4_u16m4( + __riscv_vmerge_vvm_u32m4(utf32, sur, m4, vl)); + /* compress and store */ + vbool4_t mOut = __riscv_vmor_mm_b4( + __riscv_vmsne_vx_u16m4_b4(utf32_16, 0, vl * 2), m4even, vl * 2); + vuint16m4_t vout = __riscv_vcompress_vm_u16m4(utf32_16, mOut, vl * 2); + vl = __riscv_vcpop_m_b4(mOut, vl * 2); + __riscv_vse16_v_u16m4(dst, simdutf_byteflip(vout, vl), vl); + return vl; +}; +/* end file src/rvv/rvv_helpers.inl.cpp */ + +/* begin file src/rvv/rvv_length_from.inl.cpp */ + +simdutf_warn_unused size_t +implementation::count_utf16le(const char16_t *src, size_t len) const noexcept { + return utf32_length_from_utf16le(src, len); +} + +simdutf_warn_unused size_t +implementation::count_utf16be(const char16_t *src, size_t len) const noexcept { + return utf32_length_from_utf16be(src, len); +} + +simdutf_warn_unused size_t +implementation::count_utf8(const char *src, size_t len) const noexcept { + return utf32_length_from_utf8(src, len); +} + +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *src, size_t len) const noexcept { + return utf32_length_from_utf8(src, len); +} + +simdutf_warn_unused size_t +implementation::latin1_length_from_utf16(size_t len) const noexcept { + return len; +} + +simdutf_warn_unused size_t +implementation::latin1_length_from_utf32(size_t len) const noexcept { + return len; +} + +simdutf_warn_unused size_t +implementation::utf16_length_from_latin1(size_t len) const noexcept { + return len; +} + +simdutf_warn_unused size_t +implementation::utf32_length_from_latin1(size_t len) const noexcept { + return len; +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *src, size_t len) const noexcept { + size_t count = 0; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e8m8(len); + vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); + vbool1_t mask = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); + count += __riscv_vcpop_m_b1(mask, vl); } - int out = 0; - if (validate_utf8(input, length)) { - out |= encoding_type::UTF8; + return count; +} + +template +simdutf_really_inline static size_t +rvv_utf32_length_from_utf16(const char16_t *src, size_t len) { + size_t count = 0; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + v = simdutf_byteflip(v, vl); + vbool2_t notHigh = + __riscv_vmor_mm_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl), + __riscv_vmsltu_vx_u16m8_b2(v, 0xDC00, vl), vl); 
+ count += __riscv_vcpop_m_b2(notHigh, vl); } - if ((length % 2) == 0) { - if (validate_utf16le(reinterpret_cast(input), - length / 2)) { - out |= encoding_type::UTF16_LE; - } + return count; +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *src, size_t len) const noexcept { + return rvv_utf32_length_from_utf16(src, len); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *src, size_t len) const noexcept { + if (supports_zvbb()) + return rvv_utf32_length_from_utf16(src, len); + else + return rvv_utf32_length_from_utf16(src, len); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *src, size_t len) const noexcept { + size_t count = len; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e8m8(len); + vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); + count += __riscv_vcpop_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl); } - if ((length % 4) == 0) { - if (validate_utf32(reinterpret_cast(input), length / 4)) { - out |= encoding_type::UTF32_LE; - } + return count; +} + +template +simdutf_really_inline static size_t +rvv_utf8_length_from_utf16(const char16_t *src, size_t len) { + size_t count = 0; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + v = simdutf_byteflip(v, vl); + vbool2_t m234 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7F, vl); + vbool2_t m34 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7FF, vl); + vbool2_t notSur = + __riscv_vmor_mm_b2(__riscv_vmsltu_vx_u16m8_b2(v, 0xD800, vl), + __riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl), vl); + vbool2_t m3 = __riscv_vmand_mm_b2(m34, notSur, vl); + count += vl + __riscv_vcpop_m_b2(m234, vl) + __riscv_vcpop_m_b2(m3, vl); + } + return count; +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *src, size_t len) const noexcept { + return rvv_utf8_length_from_utf16(src, len); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *src, size_t len) const noexcept { + if (supports_zvbb()) + return rvv_utf8_length_from_utf16(src, len); + else + return rvv_utf8_length_from_utf16(src, len); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *src, size_t len) const noexcept { + size_t count = 0; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e32m8(len); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); + vbool4_t m234 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7F, vl); + vbool4_t m34 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7FF, vl); + vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl); + count += vl + __riscv_vcpop_m_b4(m234, vl) + __riscv_vcpop_m_b4(m34, vl) + + __riscv_vcpop_m_b4(m4, vl); + } + return count; +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *src, size_t len) const noexcept { + size_t count = 0; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e8m8(len); + vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); + vbool1_t m1234 = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); + vbool1_t m4 = __riscv_vmsgtu_vx_u8m8_b1(__riscv_vreinterpret_u8m8(v), + (uint8_t)0b11101111, vl); + count += __riscv_vcpop_m_b1(m1234, vl) + __riscv_vcpop_m_b1(m4, vl); + } + return count; +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *src, size_t len) const noexcept { + size_t count = 0; + for (size_t vl; len > 
0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e32m8(len); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); + vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl); + count += vl + __riscv_vcpop_m_b4(m4, vl); + } + return count; +} +/* end file src/rvv/rvv_length_from.inl.cpp */ +/* begin file src/rvv/rvv_validate.inl.cpp */ + + +simdutf_warn_unused bool +implementation::validate_ascii(const char *src, size_t len) const noexcept { + size_t vlmax = __riscv_vsetvlmax_e8m8(); + vint8m8_t mask = __riscv_vmv_v_x_i8m8(0, vlmax); + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e8m8(len); + vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); + mask = __riscv_vor_vv_i8m8_tu(mask, mask, v, vl); } - return out; + return __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(mask, 0, vlmax), vlmax) < + 0; } -simdutf_warn_unused bool -implementation::validate_utf8(const char *buf, size_t len) const noexcept { - if (simdutf_unlikely(len == 0)) { - return true; - } - avx512_utf8_checker checker{}; - const char *ptr = buf; - const char *end = ptr + len; - for (; end - ptr >= 64; ptr += 64) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); - checker.check_next_input(utf8); - } - if (end != ptr) { - const __m512i utf8 = _mm512_maskz_loadu_epi8( - ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr); - checker.check_next_input(utf8); +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *src, size_t len) const noexcept { + const char *beg = src; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e8m8(len); + vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); + long idx = __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl); + if (idx >= 0) + return result(error_code::TOO_LARGE, src - beg + idx); } - checker.check_eof(); - return !checker.errors(); + return result(error_code::SUCCESS, src - beg); } -simdutf_warn_unused result implementation::validate_utf8_with_errors( - const char *buf, size_t len) const noexcept { - if (simdutf_unlikely(len == 0)) { - return result(error_code::SUCCESS, len); - } - avx512_utf8_checker checker{}; - const char *ptr = buf; - const char *end = ptr + len; - size_t count{0}; - for (; end - ptr >= 64; ptr += 64) { - const __m512i utf8 = _mm512_loadu_si512((const __m512i *)ptr); - checker.check_next_input(utf8); - if (checker.errors()) { - if (count != 0) { - count--; - } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors( - reinterpret_cast(buf), - reinterpret_cast(buf + count), len - count); - res.count += count; - return res; - } - count += 64; - } - if (end != ptr) { - const __m512i utf8 = _mm512_maskz_loadu_epi8( - ~UINT64_C(0) >> (64 - (end - ptr)), (const __m512i *)ptr); - checker.check_next_input(utf8); +/* Returns a close estimation of the number of valid UTF-8 bytes up to the + * first invalid one, but never overestimating. 
*/ +simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src, + size_t len) { + const char *beg = src; + if (len < 32) + return 0; + + /* validate first three bytes */ + { + size_t idx = 3; + while (idx < len && (src[idx] >> 6) == 0b10) + ++idx; + if (idx > 3 + 3 || !scalar::utf8::validate(src, idx)) + return 0; } - checker.check_eof(); - if (checker.errors()) { - if (count != 0) { - count--; - } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors( - reinterpret_cast(buf), - reinterpret_cast(buf + count), len - count); - res.count += count; - return res; + + static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080}; + static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB}; + static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6}; + + const vuint8m1_t err1tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2)); + const vuint8m1_t err2tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2)); + const vuint8m1_t err3tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2)); + + size_t tail = 3; + size_t n = len - tail; + + for (size_t vl; n > 0; n -= vl, src += vl) { + vl = __riscv_vsetvl_e8m4(n); + vuint8m4_t v0 = __riscv_vle8_v_u8m4((uint8_t const *)src, vl); + + uint8_t next0 = src[vl + 0]; + uint8_t next1 = src[vl + 1]; + uint8_t next2 = src[vl + 2]; + + /* fast path: ASCII */ + if (__riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u8m4_b2(v0, 0b01111111, vl), vl) < + 0 && + (next0 | next1 | next2) < 0b10000000) + continue; + + /* see "Validating UTF-8 In Less Than One Instruction Per Byte" + * https://arxiv.org/abs/2010.03090 */ + vuint8m4_t v1 = __riscv_vslide1down_vx_u8m4(v0, next0, vl); + vuint8m4_t v2 = __riscv_vslide1down_vx_u8m4(v1, next1, vl); + vuint8m4_t v3 = __riscv_vslide1down_vx_u8m4(v2, next2, vl); + + vuint8m4_t s1 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4( + __riscv_vreinterpret_v_u8m4_u16m4(v2), 4, __riscv_vsetvlmax_e16m4())); + vuint8m4_t s3 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4( + __riscv_vreinterpret_v_u8m4_u16m4(v3), 4, __riscv_vsetvlmax_e16m4())); + + vuint8m4_t idx2 = __riscv_vand_vx_u8m4(v2, 0xF, vl); + vuint8m4_t idx1 = __riscv_vand_vx_u8m4(s1, 0xF, vl); + vuint8m4_t idx3 = __riscv_vand_vx_u8m4(s3, 0xF, vl); + + vuint8m4_t err1 = simdutf_vrgather_u8m1x4(err1tbl, idx1); + vuint8m4_t err2 = simdutf_vrgather_u8m1x4(err2tbl, idx2); + vuint8m4_t err3 = simdutf_vrgather_u8m1x4(err3tbl, idx3); + vint8m4_t errs = __riscv_vreinterpret_v_u8m4_i8m4( + __riscv_vand_vv_u8m4(__riscv_vand_vv_u8m4(err1, err2, vl), err3, vl)); + + vbool2_t is_3 = __riscv_vmsgtu_vx_u8m4_b2(v1, 0b11100000 - 1, vl); + vbool2_t is_4 = __riscv_vmsgtu_vx_u8m4_b2(v0, 0b11110000 - 1, vl); + vbool2_t is_34 = __riscv_vmor_mm_b2(is_3, is_4, vl); + vbool2_t err34 = + __riscv_vmxor_mm_b2(is_34, __riscv_vmslt_vx_i8m4_b2(errs, 0, vl), vl); + vbool2_t errm = + __riscv_vmor_mm_b2(__riscv_vmsgt_vx_i8m4_b2(errs, 0, vl), err34, vl); + if (__riscv_vfirst_m_b2(errm, vl) >= 0) + break; } - return result(error_code::SUCCESS, len); + + /* we need to validate the last character */ + while (tail < len && (src[0] >> 6) == 0b10) + --src, ++tail; + return src - beg; } simdutf_warn_unused bool -implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return icelake::validate_ascii(buf, len); +implementation::validate_utf8(const char *src, size_t len) const noexcept { + size_t count = 
rvv_count_valid_utf8(src, len); + return scalar::utf8::validate(src + count, len - count); } -simdutf_warn_unused result implementation::validate_ascii_with_errors( - const char *buf, size_t len) const noexcept { - const char *buf_orig = buf; - const char *end = buf + len; - const __m512i ascii = _mm512_set1_epi8((uint8_t)0x80); - for (; end - buf >= 64; buf += 64) { - const __m512i input = _mm512_loadu_si512((const __m512i *)buf); - __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); - if (notascii) { - return result(error_code::TOO_LARGE, - buf - buf_orig + _tzcnt_u64(notascii)); - } - } - if (end != buf) { - const __m512i input = _mm512_maskz_loadu_epi8( - ~UINT64_C(0) >> (64 - (end - buf)), (const __m512i *)buf); - __mmask64 notascii = _mm512_cmp_epu8_mask(input, ascii, _MM_CMPINT_NLT); - if (notascii) { - return result(error_code::TOO_LARGE, - buf - buf_orig + _tzcnt_u64(notascii)); - } - } - return result(error_code::SUCCESS, len); +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *src, size_t len) const noexcept { + size_t count = rvv_count_valid_utf8(src, len); + result res = scalar::utf8::validate_with_errors(src + count, len - count); + return result(res.error, count + res.count); } simdutf_warn_unused bool -implementation::validate_utf16le(const char16_t *buf, +implementation::validate_utf16le(const char16_t *src, size_t len) const noexcept { - const char16_t *end = buf + len; - - for (; end - buf >= 32;) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if (surrogates) { - __mmask32 highsurrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - return false; - } - bool ends_with_high = ((highsurrogates & 0x80000000) != 0); - if (ends_with_high) { - buf += 31; // advance only by 31 code units so that we start with the - // high surrogate on the next round. 
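// Scalar restatement of the invariant that the mask comparison
// (highsurrogates << 1) == lowsurrogates enforces (illustrative only; the
// helper name is hypothetical and not part of simdutf): every high surrogate
// must be immediately followed by a low surrogate, and a low surrogate may
// appear only in that position.
static inline bool scalar_utf16_pairing_is_valid(const char16_t *buf,
                                                 size_t len) {
  for (size_t i = 0; i < len; i++) {
    const uint16_t w = uint16_t(buf[i]);
    if ((w & 0xF800) != 0xD800) {
      continue; // not a surrogate code unit
    }
    if ((w & 0xFC00) == 0xDC00) {
      return false; // lone low surrogate
    }
    // high surrogate: it must be followed by a low surrogate
    if (i + 1 == len || (uint16_t(buf[i + 1]) & 0xFC00) != 0xDC00) {
      return false;
    }
    i++; // skip the paired low surrogate
  }
  return true;
}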
- } else { - buf += 32; - } - } else { - buf += 32; - } - } - if (buf < end) { - __m512i in = - _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if (surrogates) { - __mmask32 highsurrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - return false; - } - } - } - return true; + return validate_utf16le_with_errors(src, len).error == error_code::SUCCESS; } simdutf_warn_unused bool -implementation::validate_utf16be(const char16_t *buf, +implementation::validate_utf16be(const char16_t *src, size_t len) const noexcept { - const char16_t *end = buf + len; - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, - 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - for (; end - buf >= 32;) { - __m512i in = - _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if (surrogates) { - __mmask32 highsurrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - return false; - } - bool ends_with_high = ((highsurrogates & 0x80000000) != 0); - if (ends_with_high) { - buf += 31; // advance only by 31 code units so that we start with the - // high surrogate on the next round. - } else { - buf += 32; - } - } else { - buf += 32; + return validate_utf16be_with_errors(src, len).error == error_code::SUCCESS; +} + +template +simdutf_really_inline static result +rvv_validate_utf16_with_errors(const char16_t *src, size_t len) { + const char16_t *beg = src; + uint16_t last = 0; + for (size_t vl; len > 0; + len -= vl, src += vl, last = simdutf_byteflip(src[-1])) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v1 = __riscv_vle16_v_u16m8((const uint16_t *)src, vl); + v1 = simdutf_byteflip(v1, vl); + vuint16m8_t v0 = __riscv_vslide1up_vx_u16m8(v1, last, vl); + + vbool2_t surhi = __riscv_vmseq_vx_u16m8_b2( + __riscv_vand_vx_u16m8(v0, 0xFC00, vl), 0xD800, vl); + vbool2_t surlo = __riscv_vmseq_vx_u16m8_b2( + __riscv_vand_vx_u16m8(v1, 0xFC00, vl), 0xDC00, vl); + + long idx = __riscv_vfirst_m_b2(__riscv_vmxor_mm_b2(surhi, surlo, vl), vl); + if (idx >= 0) { + last = idx > 0 ? 
simdutf_byteflip(src[idx - 1]) : last; + return result(error_code::SURROGATE, + src - beg + idx - (last - 0xD800u < 0x400u)); + break; } } - if (buf < end) { - __m512i in = _mm512_shuffle_epi8( - _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf), - byteflip); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if (surrogates) { - __mmask32 highsurrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - return false; - } - } + if (last - 0xD800u < 0x400u) { + return result(error_code::SURROGATE, + src - beg - 1); /* end on high surrogate */ + } else { + return result(error_code::SUCCESS, src - beg); } - return true; } simdutf_warn_unused result implementation::validate_utf16le_with_errors( - const char16_t *buf, size_t len) const noexcept { - const char16_t *start_buf = buf; - const char16_t *end = buf + len; - for (; end - buf >= 32;) { - __m512i in = _mm512_loadu_si512((__m512i *)buf); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if (surrogates) { - __mmask32 highsurrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); - uint32_t extra_high = - _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); - return result(error_code::SURROGATE, - (buf - start_buf) + - (extra_low < extra_high ? extra_low : extra_high)); - } - bool ends_with_high = ((highsurrogates & 0x80000000) != 0); - if (ends_with_high) { - buf += 31; // advance only by 31 code units so that we start with the - // high surrogate on the next round. - } else { - buf += 32; - } - } else { - buf += 32; - } - } - if (buf < end) { - __m512i in = - _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if (surrogates) { - __mmask32 highsurrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); - uint32_t extra_high = - _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); - return result(error_code::SURROGATE, - (buf - start_buf) + - (extra_low < extra_high ? 
extra_low : extra_high)); - } - } - } - return result(error_code::SUCCESS, len); + const char16_t *src, size_t len) const noexcept { + return rvv_validate_utf16_with_errors(src, len); } simdutf_warn_unused result implementation::validate_utf16be_with_errors( - const char16_t *buf, size_t len) const noexcept { - const char16_t *start_buf = buf; - const char16_t *end = buf + len; - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, - 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - for (; end - buf >= 32;) { - __m512i in = - _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)buf), byteflip); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if (surrogates) { - __mmask32 highsurrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); - uint32_t extra_high = - _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); - return result(error_code::SURROGATE, - (buf - start_buf) + - (extra_low < extra_high ? extra_low : extra_high)); - } - bool ends_with_high = ((highsurrogates & 0x80000000) != 0); - if (ends_with_high) { - buf += 31; // advance only by 31 code units so that we start with the - // high surrogate on the next round. - } else { - buf += 32; - } - } else { - buf += 32; - } - } - if (buf < end) { - __m512i in = _mm512_shuffle_epi8( - _mm512_maskz_loadu_epi16((1U << (end - buf)) - 1, (__m512i *)buf), - byteflip); - __m512i diff = _mm512_sub_epi16(in, _mm512_set1_epi16(uint16_t(0xD800))); - __mmask32 surrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0800))); - if (surrogates) { - __mmask32 highsurrogates = - _mm512_cmplt_epu16_mask(diff, _mm512_set1_epi16(uint16_t(0x0400))); - __mmask32 lowsurrogates = surrogates ^ highsurrogates; - // high must be followed by low - if ((highsurrogates << 1) != lowsurrogates) { - uint32_t extra_low = _tzcnt_u32(lowsurrogates & ~(highsurrogates << 1)); - uint32_t extra_high = - _tzcnt_u32(highsurrogates & ~(lowsurrogates >> 1)); - return result(error_code::SURROGATE, - (buf - start_buf) + - (extra_low < extra_high ? extra_low : extra_high)); - } - } - } - return result(error_code::SUCCESS, len); + const char16_t *src, size_t len) const noexcept { + if (supports_zvbb()) + return rvv_validate_utf16_with_errors(src, len); + else + return rvv_validate_utf16_with_errors(src, len); } simdutf_warn_unused bool -implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { - const char32_t *tail = icelake::validate_utf32(buf, len); - if (tail) { - return scalar::utf32::validate(tail, len - (tail - buf)); - } else { - // we come here if there was an error, or buf was nullptr which may happen - // for empty input. 
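// The 0xFFFF2000 offset used on this path folds the surrogate-range test into
// a single unsigned comparison; spelled out in plain scalar form the rule is
// simply the following (illustrative only; the helper name is hypothetical
// and not part of simdutf): a UTF-32 code point is valid iff it is at most
// U+10FFFF and is not a surrogate (U+D800..U+DFFF).
static inline bool scalar_utf32_code_point_is_valid(uint32_t c) {
  const bool too_large = c > 0x10FFFF;
  const bool surrogate = (c - 0xD800u) < 0x800u; // wraps around for c < 0xD800
  return !too_large && !surrogate;
}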
- return len == 0; +implementation::validate_utf32(const char32_t *src, size_t len) const noexcept { + size_t vlmax = __riscv_vsetvlmax_e32m8(); + vuint32m8_t max = __riscv_vmv_v_x_u32m8(0x10FFFF, vlmax); + vuint32m8_t maxOff = __riscv_vmv_v_x_u32m8(0xFFFFF7FF, vlmax); + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e32m8(len); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); + vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl); + max = __riscv_vmaxu_vv_u32m8_tu(max, max, v, vl); + maxOff = __riscv_vmaxu_vv_u32m8_tu(maxOff, maxOff, off, vl); } + return __riscv_vfirst_m_b4( + __riscv_vmor_mm_b4( + __riscv_vmsne_vx_u32m8_b4(max, 0x10FFFF, vlmax), + __riscv_vmsne_vx_u32m8_b4(maxOff, 0xFFFFF7FF, vlmax), vlmax), + vlmax) < 0; } simdutf_warn_unused result implementation::validate_utf32_with_errors( - const char32_t *buf, size_t len) const noexcept { - const char32_t *buf_orig = buf; - if (len >= 16) { - const char32_t *end = buf + len - 16; - while (buf <= end) { - __m512i utf32 = _mm512_loadu_si512((const __m512i *)buf); - __mmask16 outside_range = _mm512_cmp_epu32_mask( - utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT); - - __m512i utf32_off = - _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); - - __mmask16 surrogate_range = _mm512_cmp_epu32_mask( - utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT); - if ((outside_range | surrogate_range)) { - auto outside_idx = _tzcnt_u32(outside_range); - auto surrogate_idx = _tzcnt_u32(surrogate_range); - - if (outside_idx < surrogate_idx) { - return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx); - } - - return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx); - } - - buf += 16; - } - } - if (len > 0) { - __m512i utf32 = _mm512_maskz_loadu_epi32( - __mmask16((1U << (buf_orig + len - buf)) - 1), (const __m512i *)buf); - __mmask16 outside_range = _mm512_cmp_epu32_mask( - utf32, _mm512_set1_epi32(0x10ffff), _MM_CMPINT_GT); - __m512i utf32_off = _mm512_add_epi32(utf32, _mm512_set1_epi32(0xffff2000)); - - __mmask16 surrogate_range = _mm512_cmp_epu32_mask( - utf32_off, _mm512_set1_epi32(0xfffff7ff), _MM_CMPINT_GT); - if ((outside_range | surrogate_range)) { - auto outside_idx = _tzcnt_u32(outside_range); - auto surrogate_idx = _tzcnt_u32(surrogate_range); - - if (outside_idx < surrogate_idx) { - return result(error_code::TOO_LARGE, buf - buf_orig + outside_idx); + const char32_t *src, size_t len) const noexcept { + const char32_t *beg = src; + for (size_t vl; len > 0; len -= vl, src += vl) { + vl = __riscv_vsetvl_e32m8(len); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); + vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl); + long idx1 = + __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl); + long idx2 = __riscv_vfirst_m_b4( + __riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl); + if (idx1 >= 0 && idx2 >= 0) { + if (idx1 <= idx2) { + return result(error_code::TOO_LARGE, src - beg + idx1); + } else { + return result(error_code::SURROGATE, src - beg + idx2); } - - return result(error_code::SURROGATE, buf - buf_orig + surrogate_idx); + } + if (idx1 >= 0) { + return result(error_code::TOO_LARGE, src - beg + idx1); + } + if (idx2 >= 0) { + return result(error_code::SURROGATE, src - beg + idx2); } } - - return result(error_code::SUCCESS, len); + return result(error_code::SUCCESS, src - beg); } +/* end file src/rvv/rvv_validate.inl.cpp */ + +/* begin file src/rvv/rvv_latin1_to.inl.cpp */ simdutf_warn_unused size_t 
implementation::convert_latin1_to_utf8( - const char *buf, size_t len, char *utf8_output) const noexcept { - return icelake::latin1_to_utf8_avx512_start(buf, len, utf8_output); + const char *src, size_t len, char *dst) const noexcept { + char *beg = dst; + for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { + vl = __riscv_vsetvl_e8m2(len); + vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl); + vbool4_t nascii = + __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), 0, vl); + size_t cnt = __riscv_vcpop_m_b4(nascii, vl); + vlOut = vl + cnt; + if (cnt == 0) { + __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut); + continue; + } + + vuint8m2_t v0 = + __riscv_vor_vx_u8m2(__riscv_vsrl_vx_u8m2(v1, 6, vl), 0b11000000, vl); + v1 = __riscv_vand_vx_u8m2_mu(nascii, v1, v1, 0b10111111, vl); + + vuint8m4_t wide = + __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vwmaccu_vx_u16m4( + __riscv_vwaddu_vv_u16m4(v0, v1, vl), 0xFF, v1, vl)); + vbool2_t mask = __riscv_vmsgtu_vx_u8m4_b2( + __riscv_vsub_vx_u8m4(wide, 0b11000000, vl * 2), 1, vl * 2); + vuint8m4_t comp = __riscv_vcompress_vm_u8m4(wide, mask, vl * 2); + + __riscv_vse8_v_u8m4((uint8_t *)dst, comp, vlOut); + } + return dst - beg; } simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return icelake_convert_latin1_to_utf16(buf, len, - utf16_output); + const char *src, size_t len, char16_t *dst) const noexcept { + char16_t *beg = dst; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e8m4(len); + vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl); + __riscv_vse16_v_u16m8((uint16_t *)dst, __riscv_vzext_vf2_u16m8(v, vl), vl); + } + return dst - beg; } simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return icelake_convert_latin1_to_utf16(buf, len, - utf16_output); + const char *src, size_t len, char16_t *dst) const noexcept { + char16_t *beg = dst; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e8m4(len); + vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl); + __riscv_vse16_v_u16m8( + (uint16_t *)dst, + __riscv_vsll_vx_u16m8(__riscv_vzext_vf2_u16m8(v, vl), 8, vl), vl); + } + return dst - beg; } simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( - const char *buf, size_t len, char32_t *utf32_output) const noexcept { - std::pair ret = - avx512_convert_latin1_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { - return 0; + const char *src, size_t len, char32_t *dst) const noexcept { + char32_t *beg = dst; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e8m2(len); + vuint8m2_t v = __riscv_vle8_v_u8m2((uint8_t *)src, vl); + __riscv_vse32_v_u32m8((uint32_t *)dst, __riscv_vzext_vf4_u32m8(v, vl), vl); } - size_t converted_chars = ret.second - utf32_output; - if (ret.first != buf + len) { - const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_converted_chars == 0) { - return 0; - } - converted_chars += scalar_converted_chars; + return dst - beg; +} +/* end file src/rvv/rvv_latin1_to.inl.cpp */ +/* begin file src/rvv/rvv_utf16_to.inl.cpp */ +#include + +template +simdutf_really_inline static result +rvv_utf16_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) { + const char16_t *const beg = src; + for (size_t vl; len > 0; 
len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + v = simdutf_byteflip(v, vl); + long idx = __riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 255, vl), vl); + if (idx >= 0) + return result(error_code::TOO_LARGE, src - beg + idx); + __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl); } - return converted_chars; + return result(error_code::SUCCESS, src - beg); } -simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( - const char *buf, size_t len, char *latin1_output) const noexcept { - return icelake::utf8_to_latin1_avx512(buf, len, latin1_output); +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf16le_to_latin1_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; } -simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( - const char *buf, size_t len, char *latin1_output) const noexcept { - // First, try to convert as much as possible using the SIMD implementation. - const char *obuf = buf; - char *olatin1_output = latin1_output; - size_t written = icelake::utf8_to_latin1_avx512(obuf, len, olatin1_output); +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf16be_to_latin1_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; +} - // If we have completely converted the string - if (obuf == buf + len) { - return {simdutf::SUCCESS, written}; - } - size_t pos = obuf - buf; - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( - pos, buf + pos, len - pos, latin1_output); - res.count += pos; - return res; +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *src, size_t len, char *dst) const noexcept { + return rvv_utf16_to_latin1_with_errors(src, len, dst); } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( - const char *buf, size_t len, char *latin1_output) const noexcept { - return icelake::valid_utf8_to_latin1_avx512(buf, len, latin1_output); +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *src, size_t len, char *dst) const noexcept { + if (supports_zvbb()) + return rvv_utf16_to_latin1_with_errors(src, len, + dst); + else + return rvv_utf16_to_latin1_with_errors(src, len, dst); } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - utf8_to_utf16_result ret = - fast_avx512_convert_utf8_to_utf16(buf, len, - utf16_output); - if (ret.second == nullptr) { - return 0; +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *src, size_t len, char *dst) const noexcept { + const char16_t *const beg = src; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl); } - return ret.second - utf16_output; + return src - beg; } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - utf8_to_utf16_result ret = fast_avx512_convert_utf8_to_utf16( - buf, len, 
utf16_output); - if (ret.second == nullptr) { - return 0; +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *src, size_t len, char *dst) const noexcept { + const char16_t *const beg = src; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vnsrl_wx_u8m4(v, 8, vl), vl); } - return ret.second - utf16_output; + return src - beg; } -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return fast_avx512_convert_utf8_to_utf16_with_errors( - buf, len, utf16_output); -} +template +simdutf_really_inline static result +rvv_utf16_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) { + size_t n = len; + const char16_t *srcBeg = src; + const char *dstBeg = dst; + size_t vl8m4 = __riscv_vsetvlmax_e8m4(); + vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2( + __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4); -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - return fast_avx512_convert_utf8_to_utf16_with_errors( - buf, len, utf16_output); -} + for (size_t vl, vlOut; n > 0;) { + vl = __riscv_vsetvl_e16m2(n); -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - utf8_to_utf16_result ret = - icelake::valid_utf8_to_fixed_length( - buf, len, utf16_output); - size_t saved_bytes = ret.second - utf16_output; - const char *end = buf + len; - if (ret.first == end) { - return saved_bytes; - } + vuint16m2_t v = __riscv_vle16_v_u16m2((uint16_t const *)src, vl); + v = simdutf_byteflip(v, vl); + vbool8_t m234 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x80 - 1, vl); - // Note: AVX512 procedure looks up 4 bytes forward, and - // correctly converts multi-byte chars even if their - // continuation bytes lie outsiede 16-byte window. - // It meas, we have to skip continuation bytes from - // the beginning ret.first, as they were already consumed. 
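// In scalar form, the continuation-byte skip described in the note above
// looks like this (illustrative only; the helper name is hypothetical and not
// part of simdutf): bytes of the form 10xxxxxx were already consumed by the
// wide kernel and must not be handed to the scalar fallback again.
static inline const char *scalar_skip_utf8_continuation_bytes(const char *p,
                                                              const char *end) {
  while (p != end && (uint8_t(*p) & 0xC0) == 0x80) {
    p++; // 10xxxxxx: a UTF-8 continuation byte
  }
  return p;
}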
- while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { - ret.first += 1; - } + if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */ + vlOut = vl; + __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(v, vlOut), + vlOut); + n -= vl, src += vl, dst += vlOut; + continue; + } - if (ret.first != end) { - const size_t scalar_saved_bytes = - scalar::utf8_to_utf16::convert_valid( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; + vbool8_t m34 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x800 - 1, vl); + + if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */ + /* 0: [ aaa|aabbbbbb] + * 1: [aabbbbbb| ] vsll 8 + * 2: [ | aaaaa] vsrl 6 + * 3: [00111111|00011111] + * 4: [ bbbbbb|000aaaaa] (1|2)&3 + * 5: [11000000|11000000] + * 6: [10bbbbbb|110aaaaa] 4|5 */ + vuint16m2_t twoByte = __riscv_vand_vx_u16m2( + __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(v, 8, vl), + __riscv_vsrl_vx_u16m2(v, 6, vl), vl), + 0b0011111100011111, vl); + vuint16m2_t vout16 = + __riscv_vor_vx_u16m2_mu(m234, v, twoByte, 0b1000000011000000, vl); + vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16); + + /* Every high byte that is zero should be compressed + * low bytes should never be compressed, so we set them + * to all ones, and then create a non-zero bytes mask */ + vbool4_t mcomp = + __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2( + __riscv_vor_vx_u16m2(vout16, 0xFF, vl)), + 0, vl * 2); + vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2); + + vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2); + __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut); + + n -= vl, src += vl, dst += vlOut; + continue; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} + vbool8_t sur = __riscv_vmseq_vx_u16m2_b8( + __riscv_vand_vx_u16m2(v, 0xF800, vl), 0xD800, vl); + long first = __riscv_vfirst_m_b8(sur, vl); + size_t tail = vl - first; + vl = first < 0 ? vl : first; -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( - const char *buf, size_t len, char16_t *utf16_output) const noexcept { - utf8_to_utf16_result ret = - icelake::valid_utf8_to_fixed_length( - buf, len, utf16_output); - size_t saved_bytes = ret.second - utf16_output; - const char *end = buf + len; - if (ret.first == end) { - return saved_bytes; - } + if (vl > 0) { /* 1/2/3 byte utf8 */ + /* in: [aaaabbbb|bbcccccc] + * v1: [0bcccccc| ] vsll 8 + * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000 + * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000 + * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000 + * v3: [ |1110aaaa] vsrl 12 | 0b11100000 + * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc] + * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc] + * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] + * [10cccccc] + */ + vuint16m2_t v1, v2, v3, v12; + v1 = __riscv_vor_vx_u16m2_mu( + m234, v, __riscv_vand_vx_u16m2(v, 0b00111111, vl), 0b10000000, vl); + v1 = __riscv_vsll_vx_u16m2(v1, 8, vl); - // Note: AVX512 procedure looks up 4 bytes forward, and - // correctly converts multi-byte chars even if their - // continuation bytes lie outsiede 16-byte window. - // It meas, we have to skip continuation bytes from - // the beginning ret.first, as they were already consumed. 
- while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { - ret.first += 1; - } + v2 = __riscv_vor_vx_u16m2( + __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 6, vl), 0b00111111, + vl), + 0b10000000, vl); + v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2, + 0b01000000, vl); + v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 12, vl), 0b11100000, + vl); + v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl); - if (ret.first != end) { - const size_t scalar_saved_bytes = - scalar::utf8_to_utf16::convert_valid( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; + vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl); + vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); + vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123); + + vbool2_t mcomp = __riscv_vmor_mm_b2( + m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4); + vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4); + + vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4); + __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut); + + n -= vl, src += vl, dst += vlOut; } - saved_bytes += scalar_saved_bytes; + + if (tail) + while (n) { + uint16_t word = simdutf_byteflip(src[0]); + if ((word & 0xFF80) == 0) { + break; + } else if ((word & 0xF800) == 0) { + break; + } else if ((word & 0xF800) != 0xD800) { + break; + } else { + // must be a surrogate pair + if (n <= 1) + return result(error_code::SURROGATE, src - srcBeg); + uint16_t diff = word - 0xD800; + if (diff > 0x3FF) + return result(error_code::SURROGATE, src - srcBeg); + uint16_t diff2 = simdutf_byteflip(src[1]) - 0xDC00; + if (diff2 > 0x3FF) + return result(error_code::SURROGATE, src - srcBeg); + + uint32_t value = ((diff + 0x40) << 10) + diff2; + + // will generate four UTF-8 bytes + // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX + *dst++ = (char)((value >> 18) | 0b11110000); + *dst++ = (char)(((value >> 12) & 0b111111) | 0b10000000); + *dst++ = (char)(((value >> 6) & 0b111111) | 0b10000000); + *dst++ = (char)((value & 0b111111) | 0b10000000); + src += 2; + n -= 2; + } + } } - return saved_bytes; + return result(error_code::SUCCESS, dst - dstBeg); } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( - const char *buf, size_t len, char32_t *utf32_out) const noexcept { - uint32_t *utf32_output = reinterpret_cast(utf32_out); - utf8_to_utf32_result ret = - icelake::validating_utf8_to_fixed_length( - buf, len, utf32_output); - if (ret.second == nullptr) - return 0; +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf16le_to_utf8_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; +} - size_t saved_bytes = ret.second - utf32_output; - const char *end = buf + len; - if (ret.first == end) { - return saved_bytes; - } +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf16be_to_utf8_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; +} - // Note: the AVX512 procedure looks up 4 bytes forward, and - // correctly converts multi-byte chars even if their - // continuation bytes lie outside 16-byte window. - // It means, we have to skip continuation bytes from - // the beginning ret.first, as they were already consumed. 
- while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { - ret.first += 1; - } - if (ret.first != end) { - const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert( - ret.first, len - (ret.first - buf), utf32_out + saved_bytes); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *src, size_t len, char *dst) const noexcept { + return rvv_utf16_to_utf8_with_errors(src, len, dst); +} - return saved_bytes; +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *src, size_t len, char *dst) const noexcept { + if (supports_zvbb()) + return rvv_utf16_to_utf8_with_errors(src, len, dst); + else + return rvv_utf16_to_utf8_with_errors(src, len, dst); } -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( - const char *buf, size_t len, char32_t *utf32) const noexcept { - if (simdutf_unlikely(len == 0)) { - return {error_code::SUCCESS, 0}; - } - uint32_t *utf32_output = reinterpret_cast(utf32); - auto ret = icelake::validating_utf8_to_fixed_length_with_constant_checks< - endianness::LITTLE, uint32_t>(buf, len, utf32_output); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *src, size_t len, char *dst) const noexcept { + return convert_utf16le_to_utf8(src, len, dst); +} - if (!std::get<2>(ret)) { - size_t pos = std::get<0>(ret) - buf; - // We might have an error that occurs right before pos. - // This is only a concern if buf[pos] is not a continuation byte. - if ((buf[pos] & 0xc0) != 0x80 && pos >= 64) { - pos -= 1; - } else if ((buf[pos] & 0xc0) == 0x80 && pos >= 64) { - // We must check whether we are the fourth continuation byte - bool c1 = (buf[pos - 1] & 0xc0) == 0x80; - bool c2 = (buf[pos - 2] & 0xc0) == 0x80; - bool c3 = (buf[pos - 3] & 0xc0) == 0x80; - if (c1 && c2 && c3) { - return {simdutf::TOO_LONG, pos}; +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *src, size_t len, char *dst) const noexcept { + return convert_utf16be_to_utf8(src, len, dst); +} + +template +simdutf_really_inline static result +rvv_utf16_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) { + const char16_t *const srcBeg = src; + char32_t *const dstBeg = dst; + + constexpr const uint16_t ANY_SURROGATE_MASK = 0xf800; + constexpr const uint16_t ANY_SURROGATE_VALUE = 0xd800; + constexpr const uint16_t LO_SURROGATE_MASK = 0xfc00; + constexpr const uint16_t LO_SURROGATE_VALUE = 0xdc00; + constexpr const uint16_t HI_SURROGATE_MASK = 0xfc00; + constexpr const uint16_t HI_SURROGATE_VALUE = 0xd800; + + uint16_t last = 0; + while (len > 0) { + size_t vl = __riscv_vsetvl_e16m2(len); + vuint16m2_t v0 = __riscv_vle16_v_u16m2((uint16_t const *)src, vl); + v0 = simdutf_byteflip(v0, vl); + + { // check fast-path + const vuint16m2_t v = __riscv_vand_vx_u16m2(v0, ANY_SURROGATE_MASK, vl); + const vbool8_t any_surrogate = + __riscv_vmseq_vx_u16m2_b8(v, ANY_SURROGATE_VALUE, vl); + if (__riscv_vfirst_m_b8(any_surrogate, vl) < 0) { + /* no surrogates */ + __riscv_vse32_v_u32m4((uint32_t *)dst, __riscv_vzext_vf2_u32m4(v0, vl), + vl); + len -= vl; + src += vl; + dst += vl; + continue; } } - // todo: we reset the output to utf32 instead of using std::get<2.(ret) as - // you'd expect. that is because - // validating_utf8_to_fixed_length_with_constant_checks may have processed - // data beyond the error. 
- result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( - pos, buf + pos, len - pos, utf32); - res.count += pos; - return res; - } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - const char *end = buf + len; - if (std::get<0>(ret) == end) { - return {simdutf::SUCCESS, saved_bytes}; - } - // Note: the AVX512 procedure looks up 4 bytes forward, and - // correctly converts multi-byte chars even if their - // continuation bytes lie outside 16-byte window. - // It means, we have to skip continuation bytes from - // the beginning ret.first, as they were already consumed. - while (std::get<0>(ret) != end and - ((uint8_t(*std::get<0>(ret)) & 0xc0) == 0x80)) { - std::get<0>(ret) += 1; - } + if ((simdutf_byteflip(src[0]) & LO_SURROGATE_MASK) == + LO_SURROGATE_VALUE) { + return result(error_code::SURROGATE, src - srcBeg); + } + + // decode surrogates + vuint16m2_t v1 = __riscv_vslide1down_vx_u16m2(v0, 0, vl); + vl = __riscv_vsetvl_e16m2(vl - 1); + if (vl == 0) { + return result(error_code::SURROGATE, src - srcBeg); + } + + const vbool8_t surhi = __riscv_vmseq_vx_u16m2_b8( + __riscv_vand_vx_u16m2(v0, HI_SURROGATE_MASK, vl), HI_SURROGATE_VALUE, + vl); + const vbool8_t surlo = __riscv_vmseq_vx_u16m2_b8( + __riscv_vand_vx_u16m2(v1, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE, + vl); + + // compress everything but lo surrogates + const vbool8_t compress = __riscv_vmsne_vx_u16m2_b8( + __riscv_vand_vx_u16m2(v0, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE, + vl); - if (std::get<0>(ret) != end) { - auto scalar_result = scalar::utf8_to_utf32::convert_with_errors( - std::get<0>(ret), len - (std::get<0>(ret) - buf), - reinterpret_cast(utf32_output) + saved_bytes); - if (scalar_result.error != simdutf::SUCCESS) { - scalar_result.count += (std::get<0>(ret) - buf); - } else { - scalar_result.count += saved_bytes; + { + const vbool8_t diff = __riscv_vmxor_mm_b8(surhi, surlo, vl); + const long idx = __riscv_vfirst_m_b8(diff, vl); + if (idx >= 0) { + uint16_t word = simdutf_byteflip(src[idx]); + if (word < 0xD800 || word > 0xDBFF) { + return result(error_code::SURROGATE, src - srcBeg + idx + 1); + } + return result(error_code::SURROGATE, src - srcBeg + idx); + } } - return scalar_result; - } - return {simdutf::SUCCESS, size_t(std::get<1>(ret) - utf32_output)}; -} + last = simdutf_byteflip(src[vl]); + vuint32m4_t utf32 = __riscv_vzext_vf2_u32m4(v0, vl); -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( - const char *buf, size_t len, char32_t *utf32_out) const noexcept { - uint32_t *utf32_output = reinterpret_cast(utf32_out); - utf8_to_utf32_result ret = - icelake::valid_utf8_to_fixed_length( - buf, len, utf32_output); - size_t saved_bytes = ret.second - utf32_output; - const char *end = buf + len; - if (ret.first == end) { - return saved_bytes; - } + // v0 = 110110yyyyyyyyyy (0xd800 + yyyyyyyyyy) --- hi surrogate + // v1 = 110111xxxxxxxxxx (0xdc00 + xxxxxxxxxx) --- lo surrogate - // Note: AVX512 procedure looks up 4 bytes forward, and - // correctly converts multi-byte chars even if their - // continuation bytes lie outsiede 16-byte window. - // It meas, we have to skip continuation bytes from - // the beginning ret.first, as they were already consumed. 
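Given the hi/lo surrogate bit patterns noted above (hi = 0xD800 + yyyyyyyyyy, lo = 0xDC00 + xxxxxxxxxx), the vector path that follows rebuilds the code point in masked steps (t0 through t4). A scalar equivalent of that combination, with an illustrative name:

```cpp
#include <cstdint>

// Combine a high/low surrogate pair into one UTF-32 code point.
// Assumes the pair has already been validated and byte-flipped.
static inline uint32_t surrogate_pair_to_utf32(uint16_t hi, uint16_t lo) {
  uint32_t y = hi & 0x03FF;          // ten bits carried by the high surrogate
  uint32_t x = lo & 0x03FF;          // ten bits carried by the low surrogate
  return ((y << 10) | x) + 0x10000;  // plus the supplementary-plane offset
}
```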
- while (ret.first != end && ((uint8_t(*ret.first) & 0xc0) == 0x80)) { - ret.first += 1; - } + // t0 = u16( 0000_00yy_yyyy_yyyy) + const vuint32m4_t t0 = + __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v0, 0x03ff, vl), vl); + // t1 = u32(0000_0000_0000_yyyy_yyyy_yy00_0000_0000) + const vuint32m4_t t1 = __riscv_vsll_vx_u32m4(t0, 10, vl); - if (ret.first != end) { - const size_t scalar_saved_bytes = scalar::utf8_to_utf32::convert_valid( - ret.first, len - (ret.first - buf), utf32_out + saved_bytes); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } + // t2 = u32(0000_0000_0000_0000_0000_00xx_xxxx_xxxx) + const vuint32m4_t t2 = + __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v1, 0x03ff, vl), vl); - return saved_bytes; -} + // t3 = u32(0000_0000_0000_yyyy_yyyy_yyxx_xxxx_xxxx) + const vuint32m4_t t3 = __riscv_vor_vv_u32m4(t1, t2, vl); -simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - return icelake_convert_utf16_to_latin1(buf, len, - latin1_output); -} + // t4 = utf32 from surrogate pairs + const vuint32m4_t t4 = __riscv_vadd_vx_u32m4(t3, 0x10000, vl); -simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - return icelake_convert_utf16_to_latin1(buf, len, - latin1_output); -} + const vuint32m4_t result = __riscv_vmerge_vvm_u32m4(utf32, t4, surhi, vl); -simdutf_warn_unused result -implementation::convert_utf16le_to_latin1_with_errors( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - return icelake_convert_utf16_to_latin1_with_errors( - buf, len, latin1_output) - .first; -} + const vuint32m4_t comp = __riscv_vcompress_vm_u32m4(result, compress, vl); + const size_t vlOut = __riscv_vcpop_m_b8(compress, vl); + __riscv_vse32_v_u32m4((uint32_t *)dst, comp, vlOut); -simdutf_warn_unused result -implementation::convert_utf16be_to_latin1_with_errors( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - return icelake_convert_utf16_to_latin1_with_errors( - buf, len, latin1_output) - .first; -} + len -= vl; + src += vl; + dst += vlOut; -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - // optimization opportunity: implement custom function - return convert_utf16be_to_latin1(buf, len, latin1_output); -} + if ((last & LO_SURROGATE_MASK) == LO_SURROGATE_VALUE) { + // last item is lo surrogate and got already consumed + len -= 1; + src += 1; + } + } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( - const char16_t *buf, size_t len, char *latin1_output) const noexcept { - // optimization opportunity: implement custom function - return convert_utf16le_to_latin1(buf, len, latin1_output); + return result(error_code::SUCCESS, dst - dstBeg); } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - size_t outlen; - size_t inlen = utf16_to_utf8_avx512i( - buf, len, (unsigned char *)utf8_output, &outlen); - if (inlen != len) { - return 0; - } - return outlen; +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + result res = convert_utf16le_to_utf32_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? 
res.count : 0; } -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - size_t outlen; - size_t inlen = utf16_to_utf8_avx512i( - buf, len, (unsigned char *)utf8_output, &outlen); - if (inlen != len) { - return 0; - } - return outlen; +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + result res = convert_utf16be_to_utf32_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; } -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - size_t outlen; - size_t inlen = utf16_to_utf8_avx512i( - buf, len, (unsigned char *)utf8_output, &outlen); - if (inlen != len) { - result res = scalar::utf16_to_utf8::convert_with_errors( - buf + inlen, len - inlen, utf8_output + outlen); - res.count += inlen; - return res; - } - return {simdutf::SUCCESS, outlen}; +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + return rvv_utf16_to_utf32_with_errors(src, len, dst); } -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - size_t outlen; - size_t inlen = utf16_to_utf8_avx512i( - buf, len, (unsigned char *)utf8_output, &outlen); - if (inlen != len) { - result res = scalar::utf16_to_utf8::convert_with_errors( - buf + inlen, len - inlen, utf8_output + outlen); - res.count += inlen; - return res; - } - return {simdutf::SUCCESS, outlen}; +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_utf16_to_utf32_with_errors(src, len, + dst); + else + return rvv_utf16_to_utf32_with_errors(src, len, dst); } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return convert_utf16le_to_utf8(buf, len, utf8_output); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + return convert_utf16le_to_utf32(src, len, dst); } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return convert_utf16be_to_utf8(buf, len, utf8_output); +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *src, size_t len, char32_t *dst) const noexcept { + return convert_utf16be_to_utf32(src, len, dst); } +/* end file src/rvv/rvv_utf16_to.inl.cpp */ +/* begin file src/rvv/rvv_utf32_to.inl.cpp */ simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - return icelake_convert_utf32_to_latin1(buf, len, latin1_output); + const char32_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf32_to_latin1_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? 
res.count : 0; } simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - return icelake_convert_utf32_to_latin1_with_errors(buf, len, latin1_output) - .first; + const char32_t *src, size_t len, char *dst) const noexcept { + const char32_t *const beg = src; + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e32m8(len); + vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); + long idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 255, vl), vl); + if (idx >= 0) + return result(error_code::TOO_LARGE, src - beg + idx); + /* We don't use vcompress here, because its performance varies widely on + * current platforms. This might be worth reconsidering once there is more + * hardware available. */ + __riscv_vse8_v_u8m2( + (uint8_t *)dst, + __riscv_vncvt_x_x_w_u8m2(__riscv_vncvt_x_x_w_u16m4(v, vl), vl), vl); + } + return result(error_code::SUCCESS, src - beg); } simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - return icelake_convert_utf32_to_latin1(buf, len, latin1_output); + const char32_t *src, size_t len, char *dst) const noexcept { + return convert_utf32_to_latin1(src, len, dst); } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( - const char32_t *buf, size_t len, char *utf8_output) const noexcept { - std::pair ret = - avx512_convert_utf32_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *src, size_t len, char *dst) const noexcept { + size_t n = len; + const char32_t *srcBeg = src; + const char *dstBeg = dst; + size_t vl8m4 = __riscv_vsetvlmax_e8m4(); + vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2( + __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4); + + for (size_t vl, vlOut; n > 0;) { + vl = __riscv_vsetvl_e32m4(n); + + vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t const *)src, vl); + vbool8_t m234 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x80 - 1, vl); + vuint16m2_t vn = __riscv_vncvt_x_x_w_u16m2(v, vl); + + if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */ + vlOut = vl; + __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(vn, vlOut), + vlOut); + n -= vl, src += vl, dst += vlOut; + continue; } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( - const char32_t *buf, size_t len, char *utf8_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of - // code units written even if finished - std::pair ret = - icelake::avx512_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_utf8::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; + vbool8_t m34 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x800 - 1, vl); + + if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */ + /* 0: [ 
aaa|aabbbbbb] + * 1: [aabbbbbb| ] vsll 8 + * 2: [ | aaaaa] vsrl 6 + * 3: [00111111|00111111] + * 4: [ bbbbbb|000aaaaa] (1|2)&3 + * 5: [10000000|11000000] + * 6: [10bbbbbb|110aaaaa] 4|5 */ + vuint16m2_t twoByte = __riscv_vand_vx_u16m2( + __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(vn, 8, vl), + __riscv_vsrl_vx_u16m2(vn, 6, vl), vl), + 0b0011111100111111, vl); + vuint16m2_t vout16 = + __riscv_vor_vx_u16m2_mu(m234, vn, twoByte, 0b1000000011000000, vl); + vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16); + + /* Every high byte that is zero should be compressed + * low bytes should never be compressed, so we set them + * to all ones, and then create a non-zero bytes mask */ + vbool4_t mcomp = + __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2( + __riscv_vor_vx_u16m2(vout16, 0xFF, vl)), + 0, vl * 2); + vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2); + + vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2); + __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut); + + n -= vl, src += vl, dst += vlOut; + continue; + } + long idx1 = + __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl); + vbool8_t sur = __riscv_vmseq_vx_u32m4_b8( + __riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 0xD800, vl); + long idx2 = __riscv_vfirst_m_b8(sur, vl); + if (idx1 >= 0 && idx2 >= 0) { + if (idx1 <= idx2) { + return result(error_code::TOO_LARGE, src - srcBeg + idx1); + } else { + return result(error_code::SURROGATE, src - srcBeg + idx2); + } + } + if (idx1 >= 0) { + return result(error_code::TOO_LARGE, src - srcBeg + idx1); + } + if (idx2 >= 0) { + return result(error_code::SURROGATE, src - srcBeg + idx2); } - } - ret.first.count = - ret.second - - utf8_output; // Set count to the number of 8-bit code units written - return ret.first; -} -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( - const char32_t *buf, size_t len, char *utf8_output) const noexcept { - return convert_utf32_to_utf8(buf, len, utf8_output); -} + vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000 - 1, vl); + long first = __riscv_vfirst_m_b8(m4, vl); + size_t tail = vl - first; + vl = first < 0 ? 
vl : first; -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - std::pair ret = - avx512_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; + if (vl > 0) { /* 1/2/3 byte utf8 */ + /* vn: [aaaabbbb|bbcccccc] + * v1: [0bcccccc| ] vsll 8 + * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000 + * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000 + * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000 + * v3: [ |1110aaaa] vsrl 12 | 0b11100000 + * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc] + * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc] + * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] + * [10cccccc] + */ + vuint16m2_t v1, v2, v3, v12; + v1 = __riscv_vor_vx_u16m2_mu( + m234, vn, __riscv_vand_vx_u16m2(vn, 0b00111111, vl), 0b10000000, vl); + v1 = __riscv_vsll_vx_u16m2(v1, 8, vl); + + v2 = __riscv_vor_vx_u16m2( + __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 6, vl), 0b00111111, + vl), + 0b10000000, vl); + v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2, + 0b01000000, vl); + v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 12, vl), 0b11100000, + vl); + v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl); + + vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl); + vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); + vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123); + + vbool2_t mcomp = __riscv_vmor_mm_b2( + m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4); + vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4); + + vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4); + __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut); + + n -= vl, src += vl, dst += vlOut; } - saved_bytes += scalar_saved_bytes; + + if (tail) + while (n) { + uint32_t word = src[0]; + if (word < 0x10000) + break; + if (word > 0x10FFFF) + return result(error_code::TOO_LARGE, src - srcBeg); + *dst++ = (uint8_t)((word >> 18) | 0b11110000); + *dst++ = (uint8_t)(((word >> 12) & 0b111111) | 0b10000000); + *dst++ = (uint8_t)(((word >> 6) & 0b111111) | 0b10000000); + *dst++ = (uint8_t)((word & 0b111111) | 0b10000000); + ++src; + --n; + } } - return saved_bytes; + + return result(error_code::SUCCESS, dst - dstBeg); } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - std::pair ret = - avx512_convert_utf32_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - utf16_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf32_to_utf16::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *src, size_t len, char *dst) const noexcept { + result res = convert_utf32_to_utf8_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? 
res.count : 0; } -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of - // code units written even if finished - std::pair ret = - avx512_convert_utf32_to_utf16_with_errors( - buf, len, utf16_output); - if (ret.first.count != len) { - result scalar_res = - scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = - ret.second - - utf16_output; // Set count to the number of 8-bit code units written - return ret.first; +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *src, size_t len, char *dst) const noexcept { + return convert_utf32_to_utf8(src, len, dst); } -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of - // code units written even if finished - std::pair ret = - avx512_convert_utf32_to_utf16_with_errors(buf, len, - utf16_output); - if (ret.first.count != len) { - result scalar_res = - scalar::utf32_to_utf16::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; +template +simdutf_really_inline static result +rvv_convert_utf32_to_utf16_with_errors(const char32_t *src, size_t len, + char16_t *dst) { + size_t vl8m2 = __riscv_vsetvlmax_e8m2(); + vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4( + __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); + const char16_t *dstBeg = dst; + const char32_t *srcBeg = src; + for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { + vl = __riscv_vsetvl_e32m4(len); + vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl); + vuint32m4_t off = __riscv_vadd_vx_u32m4(v, 0xFFFF2000, vl); + long idx1 = + __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl); + long idx2 = __riscv_vfirst_m_b8( + __riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl); + if (idx1 >= 0 && idx2 >= 0) { + if (idx1 <= idx2) + return result(error_code::TOO_LARGE, src - srcBeg + idx1); + return result(error_code::SURROGATE, src - srcBeg + idx2); + } + if (idx1 >= 0) + return result(error_code::TOO_LARGE, src - srcBeg + idx1); + if (idx2 >= 0) + return result(error_code::SURROGATE, src - srcBeg + idx2); + long idx = + __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl); + if (idx < 0) { + vlOut = vl; + vuint16m2_t n = + simdutf_byteflip(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut); + __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut); + continue; } + vlOut = rvv_utf32_store_utf16_m4((uint16_t *)dst, v, vl, m4even); } - ret.first.count = - ret.second - - utf16_output; // Set count to the number of 8-bit code units written - return ret.first; + return result(error_code::SUCCESS, dst - dstBeg); } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return convert_utf32_to_utf16le(buf, len, utf16_output); +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + 
const char32_t *src, size_t len, char16_t *dst) const noexcept { + result res = convert_utf32_to_utf16le_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return convert_utf32_to_utf16be(buf, len, utf16_output); +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *src, size_t len, char16_t *dst) const noexcept { + result res = convert_utf32_to_utf16be_with_errors(src, len, dst); + return res.error == error_code::SUCCESS ? res.count : 0; } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - std::tuple ret = - icelake::convert_utf16_to_utf32(buf, len, - utf32_output); - if (!std::get<2>(ret)) { - return 0; - } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf16_to_utf32::convert( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *src, size_t len, char16_t *dst) const noexcept { + return rvv_convert_utf32_to_utf16_with_errors( + src, len, dst); } -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - std::tuple ret = - icelake::convert_utf16_to_utf32(buf, len, utf32_output); - if (!std::get<2>(ret)) { - return 0; - } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf16_to_utf32::convert( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *src, size_t len, char16_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_convert_utf32_to_utf16_with_errors( + src, len, dst); + else + return rvv_convert_utf32_to_utf16_with_errors(src, len, + dst); } -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - std::tuple ret = - icelake::convert_utf16_to_utf32(buf, len, - utf32_output); - if (!std::get<2>(ret)) { - result scalar_res = - scalar::utf16_to_utf32::convert_with_errors( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - scalar_res.count += (std::get<0>(ret) - buf); - return scalar_res; - } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - result scalar_res = - scalar::utf16_to_utf32::convert_with_errors( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_res.error) { - scalar_res.count += (std::get<0>(ret) - buf); - return scalar_res; - } else { - scalar_res.count += saved_bytes; - return scalar_res; +template +simdutf_really_inline static size_t +rvv_convert_valid_utf32_to_utf16(const char32_t *src, size_t len, + char16_t *dst) { + size_t vl8m2 = __riscv_vsetvlmax_e8m2(); + vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4( + 
__riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); + char16_t *dstBeg = dst; + for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { + vl = __riscv_vsetvl_e32m4(len); + vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl); + if (__riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl) < + 0) { + vlOut = vl; + vuint16m2_t n = + simdutf_byteflip(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut); + __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut); + continue; } + vlOut = rvv_utf32_store_utf16_m4((uint16_t *)dst, v, vl, m4even); } - return simdutf::result(simdutf::SUCCESS, saved_bytes); + return dst - dstBeg; } -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - std::tuple ret = - icelake::convert_utf16_to_utf32(buf, len, utf32_output); - if (!std::get<2>(ret)) { - result scalar_res = - scalar::utf16_to_utf32::convert_with_errors( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - scalar_res.count += (std::get<0>(ret) - buf); - return scalar_res; - } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - result scalar_res = - scalar::utf16_to_utf32::convert_with_errors( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_res.error) { - scalar_res.count += (std::get<0>(ret) - buf); - return scalar_res; - } else { - scalar_res.count += saved_bytes; - return scalar_res; - } - } - return simdutf::result(simdutf::SUCCESS, saved_bytes); +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *src, size_t len, char16_t *dst) const noexcept { + return rvv_convert_valid_utf32_to_utf16(src, len, + dst); } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - std::tuple ret = - icelake::convert_utf16_to_utf32(buf, len, - utf32_output); - if (!std::get<2>(ret)) { - return 0; - } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf16_to_utf32::convert( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *src, size_t len, char16_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_convert_valid_utf32_to_utf16(src, len, + dst); + else + return rvv_convert_valid_utf32_to_utf16(src, len, dst); } +/* end file src/rvv/rvv_utf32_to.inl.cpp */ +/* begin file src/rvv/rvv_utf8_to.inl.cpp */ +template +simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, + size_t len, Tdst *dst) { + static_assert(std::is_same() || + std::is_same(), + "invalid type"); + constexpr bool is16 = std::is_same(); + constexpr endianness endian = + bflip == simdutf_ByteFlip::NONE ? endianness::LITTLE : endianness::BIG; + const auto scalar = [](char const *in, size_t count, Tdst *out) { + return is16 ? 
scalar::utf8_to_utf16::convert(in, count, + (char16_t *)out) + : scalar::utf8_to_utf32::convert(in, count, (char32_t *)out); + }; -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - std::tuple ret = - icelake::convert_utf16_to_utf32(buf, len, utf32_output); - if (!std::get<2>(ret)) { - return 0; - } - size_t saved_bytes = std::get<1>(ret) - utf32_output; - if (std::get<0>(ret) != buf + len) { - const size_t scalar_saved_bytes = - scalar::utf16_to_utf32::convert( - std::get<0>(ret), len - (std::get<0>(ret) - buf), std::get<1>(ret)); - if (scalar_saved_bytes == 0) { + if (len < 32) + return scalar(src, len, dst); + + /* validate first three bytes */ + if (validate) { + size_t idx = 3; + while (idx < len && (src[idx] >> 6) == 0b10) + ++idx; + if (idx > 3 + 3 || !scalar::utf8::validate(src, idx)) return 0; - } - saved_bytes += scalar_saved_bytes; } - return saved_bytes; -} -void implementation::change_endianness_utf16(const char16_t *input, - size_t length, - char16_t *output) const noexcept { - size_t pos = 0; - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, - 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - while (pos + 32 <= length) { - __m512i utf16 = _mm512_loadu_si512((const __m512i *)(input + pos)); - utf16 = _mm512_shuffle_epi8(utf16, byteflip); - _mm512_storeu_si512(output + pos, utf16); - pos += 32; - } - if (pos < length) { - __mmask32 m((1U << (length - pos)) - 1); - __m512i utf16 = _mm512_maskz_loadu_epi16(m, (const __m512i *)(input + pos)); - utf16 = _mm512_shuffle_epi8(utf16, byteflip); - _mm512_mask_storeu_epi16(output + pos, m, utf16); - } -} + size_t tail = 3; + size_t n = len - tail; + Tdst *beg = dst; -simdutf_warn_unused size_t implementation::count_utf16le( - const char16_t *input, size_t length) const noexcept { - const char16_t *ptr = input; - size_t count{0}; + static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080}; + static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB}; + static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6}; - if (length >= 32) { - const char16_t *end = input + length - 32; + const vuint8m1_t err1tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2)); + const vuint8m1_t err2tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2)); + const vuint8m1_t err3tbl = + __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2)); - const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00); - const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff); + size_t vl8m2 = __riscv_vsetvlmax_e8m2(); + vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4( + __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); - while (ptr <= end) { - __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr); - ptr += 32; - uint64_t not_high_surrogate = - static_cast(_mm512_cmpgt_epu16_mask(utf16, high) | - _mm512_cmplt_epu16_mask(utf16, low)); - count += count_ones(not_high_surrogate); + for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dst += vlOut) { + vl = __riscv_vsetvl_e8m2(n); + + vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const *)src, vl); + uint64_t max = __riscv_vmv_x_s_u8m1_u8( + __riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl)); + + uint8_t next0 = src[vl + 0]; + uint8_t next1 = src[vl + 1]; + uint8_t next2 = src[vl + 2]; + + /* fast 
path: ASCII */ + if ((max | next0 | next1 | next2) < 0b10000000) { + vlOut = vl; + if (is16) + __riscv_vse16_v_u16m4( + (uint16_t *)dst, + simdutf_byteflip(__riscv_vzext_vf2_u16m4(v0, vlOut), vlOut), + vlOut); + else + __riscv_vse32_v_u32m8((uint32_t *)dst, + __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut); + continue; } - } - return count + scalar::utf16::count_code_points( - ptr, length - (ptr - input)); -} + /* see "Validating UTF-8 In Less Than One Instruction Per Byte" + * https://arxiv.org/abs/2010.03090 */ + vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, next0, vl); + vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, next1, vl); + vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, next2, vl); -simdutf_warn_unused size_t implementation::count_utf16be( - const char16_t *input, size_t length) const noexcept { - const char16_t *ptr = input; - size_t count{0}; - if (length >= 32) { + if (validate) { + vuint8m2_t s1 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2( + __riscv_vreinterpret_v_u8m2_u16m2(v2), 4, __riscv_vsetvlmax_e16m2())); + vuint8m2_t s3 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2( + __riscv_vreinterpret_v_u8m2_u16m2(v3), 4, __riscv_vsetvlmax_e16m2())); - const char16_t *end = input + length - 32; + vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl); + vuint8m2_t idx1 = __riscv_vand_vx_u8m2(s1, 0xF, vl); + vuint8m2_t idx3 = __riscv_vand_vx_u8m2(s3, 0xF, vl); - const __m512i low = _mm512_set1_epi16((uint16_t)0xdc00); - const __m512i high = _mm512_set1_epi16((uint16_t)0xdfff); + vuint8m2_t err1 = simdutf_vrgather_u8m1x2(err1tbl, idx1); + vuint8m2_t err2 = simdutf_vrgather_u8m1x2(err2tbl, idx2); + vuint8m2_t err3 = simdutf_vrgather_u8m1x2(err3tbl, idx3); + vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2( + __riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl)); - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, - 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - while (ptr <= end) { - __m512i utf16 = - _mm512_shuffle_epi8(_mm512_loadu_si512((__m512i *)ptr), byteflip); - ptr += 32; - uint64_t not_high_surrogate = - static_cast(_mm512_cmpgt_epu16_mask(utf16, high) | - _mm512_cmplt_epu16_mask(utf16, low)); - count += count_ones(not_high_surrogate); + vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000 - 1, vl); + vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000 - 1, vl); + vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl); + vbool4_t err34 = + __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl); + vbool4_t errm = + __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl); + if (__riscv_vfirst_m_b4(errm, vl) >= 0) + return 0; } - } - return count + scalar::utf16::count_code_points( - ptr, length - (ptr - input)); -} + /* decoding */ -simdutf_warn_unused size_t -implementation::count_utf8(const char *input, size_t length) const noexcept { - const uint8_t *str = reinterpret_cast(input); - size_t answer = - length / sizeof(__m512i) * - sizeof(__m512i); // Number of 512-bit chunks that fits into the length. 
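For reference, the behaviour the table-driven check above (err1/err2/err3 gathered per nibble, following the cited paper) has to reproduce is ordinary UTF-8 validation. The sketch below is a plain scalar validator under the usual rules, not the vectorized algorithm itself:

```cpp
#include <cstddef>
#include <cstdint>

// Straightforward scalar UTF-8 validator: rejects stray continuation bytes,
// truncated or overlong sequences, surrogates, and code points above U+10FFFF.
static bool utf8_validate_scalar(const char *buf, size_t len) {
  const uint8_t *s = reinterpret_cast<const uint8_t *>(buf);
  size_t i = 0;
  while (i < len) {
    uint8_t b = s[i];
    if (b < 0x80) { ++i; continue; }                  // ASCII
    size_t n;                                         // expected sequence length
    uint32_t cp, min;
    if ((b & 0xE0) == 0xC0)      { n = 2; cp = b & 0x1F; min = 0x80; }
    else if ((b & 0xF0) == 0xE0) { n = 3; cp = b & 0x0F; min = 0x800; }
    else if ((b & 0xF8) == 0xF0) { n = 4; cp = b & 0x07; min = 0x10000; }
    else return false;                                // stray continuation or 0xF8..0xFF
    if (i + n > len) return false;                    // truncated sequence
    for (size_t k = 1; k < n; k++) {
      if ((s[i + k] & 0xC0) != 0x80) return false;    // not a continuation byte
      cp = (cp << 6) | (s[i + k] & 0x3F);
    }
    if (cp < min) return false;                       // overlong encoding
    if (cp > 0x10FFFF) return false;                  // beyond Unicode range
    if (cp >= 0xD800 && cp <= 0xDFFF) return false;   // surrogate code point
    i += n;
  }
  return true;
}
```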
- size_t i = 0; - __m512i unrolled_popcount{0}; + /* mask of non continuation bytes */ + vbool4_t m = + __riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl); + vlOut = __riscv_vcpop_m_b4(m, vl); - const __m512i continuation = _mm512_set1_epi8(char(0b10111111)); + /* extract first and second bytes */ + vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl); + vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl); - while (i + sizeof(__m512i) <= length) { - size_t iterations = (length - i) / sizeof(__m512i); + /* fast path: one and two byte */ + if (max < 0b11100000) { + b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut); - size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); - for (; i + 8 * sizeof(__m512i) <= max_i; i += 8 * sizeof(__m512i)) { - __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); - __m512i input2 = - _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); - __m512i input3 = - _mm512_loadu_si512((const __m512i *)(str + i + 2 * sizeof(__m512i))); - __m512i input4 = - _mm512_loadu_si512((const __m512i *)(str + i + 3 * sizeof(__m512i))); - __m512i input5 = - _mm512_loadu_si512((const __m512i *)(str + i + 4 * sizeof(__m512i))); - __m512i input6 = - _mm512_loadu_si512((const __m512i *)(str + i + 5 * sizeof(__m512i))); - __m512i input7 = - _mm512_loadu_si512((const __m512i *)(str + i + 6 * sizeof(__m512i))); - __m512i input8 = - _mm512_loadu_si512((const __m512i *)(str + i + 7 * sizeof(__m512i))); + vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); + b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); - __mmask64 mask1 = _mm512_cmple_epi8_mask(input1, continuation); - __mmask64 mask2 = _mm512_cmple_epi8_mask(input2, continuation); - __mmask64 mask3 = _mm512_cmple_epi8_mask(input3, continuation); - __mmask64 mask4 = _mm512_cmple_epi8_mask(input4, continuation); - __mmask64 mask5 = _mm512_cmple_epi8_mask(input5, continuation); - __mmask64 mask6 = _mm512_cmple_epi8_mask(input6, continuation); - __mmask64 mask7 = _mm512_cmple_epi8_mask(input7, continuation); - __mmask64 mask8 = _mm512_cmple_epi8_mask(input8, continuation); + vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4( + b1, + __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1, + vlOut), + vlOut); + b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); + if (is16) + __riscv_vse16_v_u16m4((uint16_t *)dst, + simdutf_byteflip(b12, vlOut), vlOut); + else + __riscv_vse32_v_u32m8((uint32_t *)dst, + __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut); + continue; + } - __m512i mask_register = _mm512_set_epi64(mask8, mask7, mask6, mask5, - mask4, mask3, mask2, mask1); + /* fast path: one, two and three byte */ + if (max < 0b11110000) { + vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); - unrolled_popcount = _mm512_add_epi64(unrolled_popcount, - _mm512_popcnt_epi64(mask_register)); - } + b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut); + b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut); - for (; i <= max_i; i += sizeof(__m512i)) { - __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); - uint64_t continuation_bitmask = static_cast( - _mm512_cmple_epi8_mask(more_input, continuation)); - answer -= count_ones(continuation_bitmask); + vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); + vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut); + + vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); + b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut); + + vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4( + b1, + 
__riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1, + vlOut), + vlOut); + b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); + vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu( + m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut); + if (is16) + __riscv_vse16_v_u16m4((uint16_t *)dst, + simdutf_byteflip(b123, vlOut), vlOut); + else + __riscv_vse32_v_u32m8((uint32_t *)dst, + __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut); + continue; } - } - __m256i first_half = _mm512_extracti64x4_epi64(unrolled_popcount, 0); - __m256i second_half = _mm512_extracti64x4_epi64(unrolled_popcount, 1); - answer -= (size_t)_mm256_extract_epi64(first_half, 0) + - (size_t)_mm256_extract_epi64(first_half, 1) + - (size_t)_mm256_extract_epi64(first_half, 2) + - (size_t)_mm256_extract_epi64(first_half, 3) + - (size_t)_mm256_extract_epi64(second_half, 0) + - (size_t)_mm256_extract_epi64(second_half, 1) + - (size_t)_mm256_extract_epi64(second_half, 2) + - (size_t)_mm256_extract_epi64(second_half, 3); + /* extract third and fourth bytes */ + vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); + vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl); + + /* remove prefix from leading bytes + * + * We could also use vrgather here, but it increases register pressure, + * and its performance varies widely on current platforms. It might be + * worth reconsidering, though, once there is more hardware available. + * Same goes for the __riscv_vsrl_vv_u32m4 correction step. + * + * We shift left and then right by the number of bytes in the prefix, + * which can be calculated as follows: + * x max(x-10, 0) + * 0xxx -> 0000-0111 -> sift by 0 or 1 -> 0 + * 10xx -> 1000-1011 -> don't care + * 110x -> 1100,1101 -> sift by 3 -> 2,3 + * 1110 -> 1110 -> sift by 4 -> 4 + * 1111 -> 1111 -> sift by 5 -> 5 + * + * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we + * just need to manually detect and handle the one special case: + */ +#define SIMDUTF_RVV_UTF8_TO_COMMON_M1(idx) \ + vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx); \ + vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx); \ + vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx); \ + vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx); \ + /* remove prefix from trailing bytes */ \ + c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut); \ + c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut); \ + c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut); \ + vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut); \ + shift = __riscv_vmerge_vxm_u8m1(__riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3, \ + __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), \ + vlOut); \ + c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut); \ + c1 = __riscv_vsrl_vv_u8m1(c1, shift, vlOut); \ + /* unconditionally widen and combine to c1234 */ \ + vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2( \ + __riscv_vwmulu_vx_u16m2(c3, 1 << 6, vlOut), c4, vlOut); \ + vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2( \ + __riscv_vwmulu_vx_u16m2(c1, 1 << 6, vlOut), c2, vlOut); \ + vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4( \ + __riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut); \ + /* derive required right-shift amount from `shift` to reduce \ + * c1234 to the required number of bytes */ \ + c1234 = __riscv_vsrl_vv_u32m4( \ + c1234, \ + __riscv_vzext_vf4_u32m4( \ + __riscv_vmul_vx_u8m1( \ + __riscv_vrsub_vx_u8m1(__riscv_vssubu_vx_u8m1(shift, 2, vlOut), \ + 3, vlOut), \ + 6, vlOut), \ + vlOut), \ + vlOut); \ + /* store result in desired format */ \ + if (is16) \ + vlDst = rvv_utf32_store_utf16_m4((uint16_t *)dst, 
c1234, vlOut, \ + m4even); \ + else \ + vlDst = vlOut, __riscv_vse32_v_u32m4((uint32_t *)dst, c1234, vlOut); - return answer + scalar::utf8::count_code_points( - reinterpret_cast(str + i), length - i); -} + /* Unrolling this manually reduces register pressure and allows + * us to terminate early. */ + { + size_t vlOutm2 = vlOut, vlDst; + vlOut = __riscv_vsetvl_e8m1(vlOut); + SIMDUTF_RVV_UTF8_TO_COMMON_M1(0) + if (vlOutm2 == vlOut) { + vlOut = vlDst; + continue; + } -simdutf_warn_unused size_t implementation::latin1_length_from_utf8( - const char *buf, size_t len) const noexcept { - return count_utf8(buf, len); -} + dst += vlDst; + vlOut = vlOutm2 - vlOut; + } + { + size_t vlDst; + SIMDUTF_RVV_UTF8_TO_COMMON_M1(1) + vlOut = vlDst; + } -simdutf_warn_unused size_t -implementation::latin1_length_from_utf16(size_t length) const noexcept { - return scalar::utf16::latin1_length_from_utf16(length); -} +#undef SIMDUTF_RVV_UTF8_TO_COMMON_M1 + } -simdutf_warn_unused size_t -implementation::latin1_length_from_utf32(size_t length) const noexcept { - return scalar::utf32::latin1_length_from_utf32(length); + /* validate the last character and reparse it + tail */ + if (len > tail) { + if ((src[0] >> 6) == 0b10) + --dst; + while ((src[0] >> 6) == 0b10 && tail < len) + --src, ++tail; + if (is16) { + /* go back one more, when on high surrogate */ + if (simdutf_byteflip((uint16_t)dst[-1]) >= 0xD800 && + simdutf_byteflip((uint16_t)dst[-1]) <= 0xDBFF) + --dst; + } + } + size_t ret = scalar(src, tail, dst); + if (ret == 0) + return 0; + return (size_t)(dst - beg) + ret; } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( - const char16_t *input, size_t length) const noexcept { - const char16_t *ptr = input; - size_t count{0}; - if (length >= 32) { - const char16_t *end = input + length - 32; - - const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); - const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); - const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); - const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *src, size_t len, char *dst) const noexcept { + const char *beg = dst; + uint8_t last = 0; + for (size_t vl, vlOut; len > 0; + len -= vl, src += vl, dst += vlOut, last = src[-1]) { + vl = __riscv_vsetvl_e8m2(len); + vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl); + // check which bytes are ASCII + vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl); + // count ASCII bytes + vlOut = __riscv_vcpop_m_b4(ascii, vl); + // The original code would only enter the next block after this check: + // vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); + // vlOut = __riscv_vcpop_m_b4(m, vl); + // if (vlOut != vl || last > 0b01111111) {...}q + // So that everything is ASCII or continuation bytes, we just proceeded + // without any processing, going straight to __riscv_vse8_v_u8m2. + // But you need the __riscv_vslide1up_vx_u8m2 whenever there is a non-ASCII + // byte. 
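The Latin-1 conversion described in the comments above relies on the fact that only ASCII bytes and two-byte sequences with lead 0xC2 or 0xC3 map into Latin-1: the continuation byte keeps its low six bits, and a 0xC3 lead contributes bit 6, which is what the masked OR with 0b01000000 does in the vector code. A scalar sketch of that rule, with an illustrative name:

```cpp
#include <cstddef>
#include <cstdint>

// Convert UTF-8 to Latin-1 one code point at a time.
// Returns the number of bytes written, or 0 if the input contains a
// character that Latin-1 cannot represent (or a malformed sequence).
static size_t utf8_to_latin1_scalar(const char *src, size_t len, char *dst) {
  const uint8_t *s = reinterpret_cast<const uint8_t *>(src);
  uint8_t *d = reinterpret_cast<uint8_t *>(dst);
  size_t out = 0;
  for (size_t i = 0; i < len;) {
    if (s[i] < 0x80) {                        // ASCII passes through unchanged
      d[out++] = s[i++];
    } else if ((s[i] == 0xC2 || s[i] == 0xC3) && i + 1 < len &&
               (s[i + 1] & 0xC0) == 0x80) {   // two-byte sequence, U+0080..U+00FF
      d[out++] = uint8_t(((s[i] & 0x03) << 6) | (s[i + 1] & 0x3F));
      i += 2;
    } else {
      return 0;                               // not representable in Latin-1
    }
  }
  return out;
}
```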
+ if (vlOut != vl) { // If not pure ASCII + // Non-ASCII characters + // We now want to mark the ascii and continuation bytes + vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); + // We count them, that's our new vlOut (output vector length) + vlOut = __riscv_vcpop_m_b4(m, vl); - while (ptr <= end) { - __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr); - ptr += 32; - __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); - __mmask32 two_bytes_bitmask = - _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); - __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); - __mmask32 surrogates_bitmask = - _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & - _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); + vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); - size_t ascii_count = count_ones(ascii_bitmask); - size_t two_bytes_count = count_ones(two_bytes_bitmask); - size_t surrogate_bytes_count = count_ones(surrogates_bitmask); - size_t three_bytes_count = - 32 - ascii_count - two_bytes_count - surrogate_bytes_count; + vbool4_t leading0 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b10111111, vl); + vbool4_t trailing1 = __riscv_vmslt_vx_i8m2_b4( + __riscv_vreinterpret_v_u8m2_i8m2(v1), (uint8_t)0b11000000, vl); + // -62 i 0b11000010, so we check whether any of v0 is too big + vbool4_t tobig = __riscv_vmand_mm_b4( + leading0, + __riscv_vmsgtu_vx_u8m2_b4(__riscv_vxor_vx_u8m2(v0, (uint8_t)-62, vl), + 1, vl), + vl); + if (__riscv_vfirst_m_b4( + __riscv_vmor_mm_b4( + tobig, __riscv_vmxor_mm_b4(leading0, trailing1, vl), vl), + vl) >= 0) + return 0; - count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + - 2 * surrogate_bytes_count; + v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), + v1, v1, 0b01000000, vl); + v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); + } else if (last >= 0b11000000) { // If last byte is a leading byte and we + // got only ASCII, error! 
+ return 0; } + __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut); } - - return count + scalar::utf16::utf8_length_from_utf16( - ptr, length - (ptr - input)); + if (last > 0b10111111) + return 0; + return dst - beg; } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( - const char16_t *input, size_t length) const noexcept { - const char16_t *ptr = input; - size_t count{0}; - - if (length >= 32) { - const char16_t *end = input + length - 32; - - const __m512i v_007f = _mm512_set1_epi16((uint16_t)0x007f); - const __m512i v_07ff = _mm512_set1_epi16((uint16_t)0x07ff); - const __m512i v_dfff = _mm512_set1_epi16((uint16_t)0xdfff); - const __m512i v_d800 = _mm512_set1_epi16((uint16_t)0xd800); - - const __m512i byteflip = _mm512_setr_epi64( - 0x0607040502030001, 0x0e0f0c0d0a0b0809, 0x0607040502030001, - 0x0e0f0c0d0a0b0809, 0x0607040502030001, 0x0e0f0c0d0a0b0809, - 0x0607040502030001, 0x0e0f0c0d0a0b0809); - while (ptr <= end) { - __m512i utf16 = _mm512_loadu_si512((const __m512i *)ptr); - utf16 = _mm512_shuffle_epi8(utf16, byteflip); - ptr += 32; - __mmask32 ascii_bitmask = _mm512_cmple_epu16_mask(utf16, v_007f); - __mmask32 two_bytes_bitmask = - _mm512_mask_cmple_epu16_mask(~ascii_bitmask, utf16, v_07ff); - __mmask32 not_one_two_bytes = ~(ascii_bitmask | two_bytes_bitmask); - __mmask32 surrogates_bitmask = - _mm512_mask_cmple_epu16_mask(not_one_two_bytes, utf16, v_dfff) & - _mm512_mask_cmpge_epu16_mask(not_one_two_bytes, utf16, v_d800); +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *src, size_t len, char *dst) const noexcept { + size_t res = convert_utf8_to_latin1(src, len, dst); + if (res) + return result(error_code::SUCCESS, res); + return scalar::utf8_to_latin1::convert_with_errors(src, len, dst); +} - size_t ascii_count = count_ones(ascii_bitmask); - size_t two_bytes_count = count_ones(two_bytes_bitmask); - size_t surrogate_bytes_count = count_ones(surrogates_bitmask); - size_t three_bytes_count = - 32 - ascii_count - two_bytes_count - surrogate_bytes_count; - count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + - 2 * surrogate_bytes_count; +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *src, size_t len, char *dst) const noexcept { + const char *beg = dst; + uint8_t last = 0; + for (size_t vl, vlOut; len > 0; + len -= vl, src += vl, dst += vlOut, last = src[-1]) { + vl = __riscv_vsetvl_e8m2(len); + vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl); + vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl); + vlOut = __riscv_vcpop_m_b4(ascii, vl); + if (vlOut != vl) { // If not pure ASCII + vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); + vlOut = __riscv_vcpop_m_b4(m, vl); + vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); + v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), + v1, v1, 0b01000000, vl); + v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); } + __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut); } - - return count + scalar::utf16::utf8_length_from_utf16( - ptr, length - (ptr - input)); + return dst - beg; } -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( - const char16_t *input, size_t length) const noexcept { - return implementation::count_utf16le(input, length); +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *src, size_t len, char16_t *dst) const noexcept { + return rvv_utf8_to_common(src, len, + (uint16_t *)dst); } -simdutf_warn_unused size_t 
implementation::utf32_length_from_utf16be( - const char16_t *input, size_t length) const noexcept { - return implementation::count_utf16be(input, length); +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *src, size_t len, char16_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_utf8_to_common( + src, len, (uint16_t *)dst); + else + return rvv_utf8_to_common(src, len, + (uint16_t *)dst); } -simdutf_warn_unused size_t -implementation::utf16_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf16_length_from_latin1(length); +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *src, size_t len, char16_t *dst) const noexcept { + size_t res = convert_utf8_to_utf16le(src, len, dst); + if (res) + return result(error_code::SUCCESS, res); + return scalar::utf8_to_utf16::convert_with_errors( + src, len, dst); } -simdutf_warn_unused size_t -implementation::utf32_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf32_length_from_latin1(length); +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *src, size_t len, char16_t *dst) const noexcept { + size_t res = convert_utf8_to_utf16be(src, len, dst); + if (res) + return result(error_code::SUCCESS, res); + return scalar::utf8_to_utf16::convert_with_errors(src, len, + dst); } -simdutf_warn_unused size_t implementation::utf8_length_from_latin1( - const char *input, size_t length) const noexcept { - const uint8_t *str = reinterpret_cast(input); - size_t answer = length / sizeof(__m512i) * sizeof(__m512i); - size_t i = 0; - if (answer >= 2048) { // long strings optimization - unsigned char v_0xFF = 0xff; - __m512i eight_64bits = _mm512_setzero_si512(); - while (i + sizeof(__m512i) <= length) { - __m512i runner = _mm512_setzero_si512(); - size_t iterations = (length - i) / sizeof(__m512i); - if (iterations > 255) { - iterations = 255; - } - size_t max_i = i + iterations * sizeof(__m512i) - sizeof(__m512i); - for (; i + 4 * sizeof(__m512i) <= max_i; i += 4 * sizeof(__m512i)) { - // Load four __m512i vectors - __m512i input1 = _mm512_loadu_si512((const __m512i *)(str + i)); - __m512i input2 = - _mm512_loadu_si512((const __m512i *)(str + i + sizeof(__m512i))); - __m512i input3 = _mm512_loadu_si512( - (const __m512i *)(str + i + 2 * sizeof(__m512i))); - __m512i input4 = _mm512_loadu_si512( - (const __m512i *)(str + i + 3 * sizeof(__m512i))); - - // Generate four masks - __mmask64 mask1 = - _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input1); - __mmask64 mask2 = - _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input2); - __mmask64 mask3 = - _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input3); - __mmask64 mask4 = - _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), input4); - // Apply the masks and subtract from the runner - __m512i not_ascii1 = - _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask1, v_0xFF); - __m512i not_ascii2 = - _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask2, v_0xFF); - __m512i not_ascii3 = - _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask3, v_0xFF); - __m512i not_ascii4 = - _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask4, v_0xFF); - - runner = _mm512_sub_epi8(runner, not_ascii1); - runner = _mm512_sub_epi8(runner, not_ascii2); - runner = _mm512_sub_epi8(runner, not_ascii3); - runner = _mm512_sub_epi8(runner, not_ascii4); - } - - for (; i <= max_i; i += sizeof(__m512i)) { - __m512i more_input = _mm512_loadu_si512((const __m512i *)(str + i)); - - __mmask64 
mask = - _mm512_cmpgt_epi8_mask(_mm512_setzero_si512(), more_input); - __m512i not_ascii = - _mm512_mask_set1_epi8(_mm512_setzero_si512(), mask, v_0xFF); - runner = _mm512_sub_epi8(runner, not_ascii); - } - - eight_64bits = _mm512_add_epi64( - eight_64bits, _mm512_sad_epu8(runner, _mm512_setzero_si512())); - } - - __m256i first_half = _mm512_extracti64x4_epi64(eight_64bits, 0); - __m256i second_half = _mm512_extracti64x4_epi64(eight_64bits, 1); - answer += (size_t)_mm256_extract_epi64(first_half, 0) + - (size_t)_mm256_extract_epi64(first_half, 1) + - (size_t)_mm256_extract_epi64(first_half, 2) + - (size_t)_mm256_extract_epi64(first_half, 3) + - (size_t)_mm256_extract_epi64(second_half, 0) + - (size_t)_mm256_extract_epi64(second_half, 1) + - (size_t)_mm256_extract_epi64(second_half, 2) + - (size_t)_mm256_extract_epi64(second_half, 3); - } else if (answer > 0) { - for (; i + sizeof(__m512i) <= length; i += sizeof(__m512i)) { - __m512i latin = _mm512_loadu_si512((const __m512i *)(str + i)); - uint64_t non_ascii = _mm512_movepi8_mask(latin); - answer += count_ones(non_ascii); - } - } - return answer + scalar::latin1::utf8_length_from_latin1( - reinterpret_cast(str + i), length - i); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *src, size_t len, char16_t *dst) const noexcept { + return rvv_utf8_to_common( + src, len, (uint16_t *)dst); } -simdutf_warn_unused size_t implementation::utf16_length_from_utf8( - const char *input, size_t length) const noexcept { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for (; pos + 64 <= length; pos += 64) { - __m512i utf8 = _mm512_loadu_si512((const __m512i *)(input + pos)); - uint64_t utf8_continuation_mask = - _mm512_cmplt_epi8_mask(utf8, _mm512_set1_epi8(-65 + 1)); - // We count one word for anything that is not a continuation (so - // leading bytes). 
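The counting rule in the comment above is independent of any one instruction set: in valid UTF-8, every non-continuation byte yields one UTF-16 code unit, and every 4-byte lead yields one extra unit because that code point becomes a surrogate pair. A scalar sketch, assuming valid input and using an illustrative name:

```cpp
#include <cstddef>
#include <cstdint>

// Count the UTF-16 code units that a valid UTF-8 buffer will decode to.
static size_t utf16_length_from_utf8_scalar(const char *in, size_t len) {
  const uint8_t *s = reinterpret_cast<const uint8_t *>(in);
  size_t count = 0;
  for (size_t i = 0; i < len; i++) {
    if ((s[i] & 0xC0) != 0x80) count++;  // leading byte (or ASCII)
    if (s[i] >= 0xF0) count++;           // 4-byte lead -> surrogate pair
  }
  return count;
}
```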
- count += 64 - count_ones(utf8_continuation_mask); - uint64_t utf8_4byte = - _mm512_cmpge_epu8_mask(utf8, _mm512_set1_epi8(int8_t(240))); - count += count_ones(utf8_4byte); - } - return count + - scalar::utf8::utf16_length_from_utf8(input + pos, length - pos); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *src, size_t len, char16_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_utf8_to_common( + src, len, (uint16_t *)dst); + else + return rvv_utf8_to_common( + src, len, (uint16_t *)dst); } -simdutf_warn_unused size_t implementation::utf8_length_from_utf32( - const char32_t *input, size_t length) const noexcept { - const char32_t *ptr = input; - size_t count{0}; - - if (length >= 16) { - const char32_t *end = input + length - 16; +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *src, size_t len, char32_t *dst) const noexcept { + return rvv_utf8_to_common(src, len, + (uint32_t *)dst); +} - const __m512i v_0000_007f = _mm512_set1_epi32((uint32_t)0x7f); - const __m512i v_0000_07ff = _mm512_set1_epi32((uint32_t)0x7ff); - const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff); +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *src, size_t len, char32_t *dst) const noexcept { + size_t res = convert_utf8_to_utf32(src, len, dst); + if (res) + return result(error_code::SUCCESS, res); + return scalar::utf8_to_utf32::convert_with_errors(src, len, dst); +} - while (ptr <= end) { - __m512i utf32 = _mm512_loadu_si512((const __m512i *)ptr); - ptr += 16; - __mmask16 ascii_bitmask = _mm512_cmple_epu32_mask(utf32, v_0000_007f); - __mmask16 two_bytes_bitmask = _mm512_mask_cmple_epu32_mask( - _knot_mask16(ascii_bitmask), utf32, v_0000_07ff); - __mmask16 three_bytes_bitmask = _mm512_mask_cmple_epu32_mask( - _knot_mask16(_mm512_kor(ascii_bitmask, two_bytes_bitmask)), utf32, - v_0000_ffff); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *src, size_t len, char32_t *dst) const noexcept { + return rvv_utf8_to_common( + src, len, (uint32_t *)dst); +} +/* end file src/rvv/rvv_utf8_to.inl.cpp */ - size_t ascii_count = count_ones(ascii_bitmask); - size_t two_bytes_count = count_ones(two_bytes_bitmask); - size_t three_bytes_count = count_ones(three_bytes_bitmask); - size_t four_bytes_count = - 16 - ascii_count - two_bytes_count - three_bytes_count; - count += ascii_count + 2 * two_bytes_count + 3 * three_bytes_count + - 4 * four_bytes_count; - } +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + if (bom_encoding != encoding_type::unspecified) + return bom_encoding; + // todo: reimplement as a one-pass algorithm. 
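The removed Icelake kernel a little further up buckets UTF-32 code points with the thresholds 0x7F, 0x7FF and 0xFFFF; those are simply the standard UTF-8 length classes. A scalar sketch of the same per-code-point cost, assuming the input holds valid Unicode scalar values:

```cpp
#include <cstddef>
#include <cstdint>

// Sketch only: assumes `input` holds `length` valid Unicode scalar values.
size_t utf8_length_from_utf32_scalar(const char32_t *input, size_t length) {
  size_t bytes = 0;
  for (size_t i = 0; i < length; i++) {
    uint32_t cp = static_cast<uint32_t>(input[i]);
    if (cp <= 0x7F)        bytes += 1; // ASCII
    else if (cp <= 0x7FF)  bytes += 2; // U+0080..U+07FF
    else if (cp <= 0xFFFF) bytes += 3; // rest of the Basic Multilingual Plane
    else                   bytes += 4; // supplementary planes
  }
  return bytes;
}
```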
+ int out = 0; + if (validate_utf8(input, length)) + out |= encoding_type::UTF8; + if (length % 2 == 0) { + if (validate_utf16(reinterpret_cast(input), length / 2)) + out |= encoding_type::UTF16_LE; + } + if (length % 4 == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) + out |= encoding_type::UTF32_LE; } - return count + - scalar::utf32::utf8_length_from_utf32(ptr, length - (ptr - input)); + return out; } -simdutf_warn_unused size_t implementation::utf16_length_from_utf32( - const char32_t *input, size_t length) const noexcept { - const char32_t *ptr = input; - size_t count{0}; - - if (length >= 16) { - const char32_t *end = input + length - 16; - - const __m512i v_0000_ffff = _mm512_set1_epi32((uint32_t)0x0000ffff); - - while (ptr <= end) { - __m512i utf32 = _mm512_loadu_si512((const __m512i *)ptr); - ptr += 16; - __mmask16 surrogates_bitmask = - _mm512_cmpgt_epu32_mask(utf32, v_0000_ffff); - - count += 16 + count_ones(surrogates_bitmask); - } +template +simdutf_really_inline static void +rvv_change_endianness_utf16(const char16_t *src, size_t len, char16_t *dst) { + for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { + vl = __riscv_vsetvl_e16m8(len); + vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); + __riscv_vse16_v_u16m8((uint16_t *)dst, simdutf_byteflip(v, vl), vl); } - - return count + - scalar::utf32::utf16_length_from_utf32(ptr, length - (ptr - input)); } -simdutf_warn_unused size_t implementation::utf32_length_from_utf8( - const char *input, size_t length) const noexcept { - return implementation::count_utf8(input, length); +void implementation::change_endianness_utf16(const char16_t *src, size_t len, + char16_t *dst) const noexcept { + if (supports_zvbb()) + return rvv_change_endianness_utf16(src, len, dst); + else + return rvv_change_endianness_utf16(src, len, dst); } simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( @@ -25315,21 +37828,86 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? 
compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + size_t equallocation = + length; // location of the first padding character if any + size_t equalsigns = 0; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + length -= 1; + equalsigns++; + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + equalsigns++; + length -= 1; + } + } + if (length == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation}; + } + return {SUCCESS, 0}; + } + result r = scalar::base64::base64_tail_decode( + output, input, length, equalsigns, options, last_chunk_options); + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation}; + } + } + return r; } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + size_t equallocation = + length; // location of the first padding character if any + size_t equalsigns = 0; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + length -= 1; + equalsigns++; + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + equalsigns++; + length -= 1; + } + } + if (length == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, 0, 0}; + } + full_result r = scalar::base64::base64_tail_decode( + output, input, length, equalsigns, options, last_chunk_options); + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.output_count % 3 == 0) || + ((r.output_count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; + } + } + return r; } simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( @@ -25340,21 +37918,86 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? 
compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + size_t equallocation = + length; // location of the first padding character if any + auto equalsigns = 0; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + length -= 1; + equalsigns++; + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + equalsigns++; + length -= 1; + } + } + if (length == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation}; + } + return {SUCCESS, 0}; + } + result r = scalar::base64::base64_tail_decode( + output, input, length, equalsigns, options, last_chunk_options); + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation}; + } + } + return r; } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + size_t equallocation = + length; // location of the first padding character if any + size_t equalsigns = 0; + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + length -= 1; + equalsigns++; + while (length > 0 && + scalar::base64::is_ascii_white_space(input[length - 1])) { + length--; + } + if (length > 0 && input[length - 1] == '=') { + equallocation = length - 1; + equalsigns++; + length -= 1; + } + } + if (length == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, 0, 0}; + } + full_result r = scalar::base64::base64_tail_decode( + output, input, length, equalsigns, options, last_chunk_options); + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.output_count % 3 == 0) || + ((r.output_count % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; + } + } + return r; } simdutf_warn_unused size_t implementation::base64_length_from_binary( @@ -25365,56 +38008,38 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary( size_t implementation::binary_to_base64(const char *input, size_t length, char *output, base64_options options) const noexcept { - if (options & base64_url) { - return encode_base64(output, input, length, options); - } else { - return encode_base64(output, input, length, options); - } + return scalar::base64::tail_encode_base64(output, input, length, options); } - -} // namespace icelake +} // namespace rvv } // namespace simdutf -/* begin file src/simdutf/icelake/end.h */ -#if SIMDUTF_CAN_ALWAYS_RUN_ICELAKE +/* begin file src/simdutf/rvv/end.h */ +#if SIMDUTF_CAN_ALWAYS_RUN_RVV // nothing needed. 
#else SIMDUTF_UNTARGET_REGION #endif - -#if SIMDUTF_GCC11ORMORE // workaround for - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 -SIMDUTF_POP_DISABLE_WARNINGS -#endif // end of workaround -/* end file src/simdutf/icelake/end.h */ -/* end file src/icelake/implementation.cpp */ +/* end file src/simdutf/rvv/end.h */ +/* end file src/rvv/implementation.cpp */ #endif -#if SIMDUTF_IMPLEMENTATION_HASWELL -/* begin file src/haswell/implementation.cpp */ - -/* begin file src/simdutf/haswell/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "haswell" -// #define SIMDUTF_IMPLEMENTATION haswell +#if SIMDUTF_IMPLEMENTATION_WESTMERE +/* begin file src/westmere/implementation.cpp */ +/* begin file src/simdutf/westmere/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "westmere" +// #define SIMDUTF_IMPLEMENTATION westmere -#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL +#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE // nothing needed. #else -SIMDUTF_TARGET_HASWELL +SIMDUTF_TARGET_WESTMERE #endif - -#if SIMDUTF_GCC11ORMORE // workaround for - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 -// clang-format off -SIMDUTF_DISABLE_GCC_WARNING(-Wmaybe-uninitialized) -// clang-format on -#endif // end of workaround -/* end file src/simdutf/haswell/begin.h */ +/* end file src/simdutf/westmere/begin.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { -#ifndef SIMDUTF_HASWELL_H - #error "haswell.h must be included" +#ifndef SIMDUTF_WESTMERE_H + #error "westmere.h must be included" #endif using namespace simd; @@ -25441,13 +38066,90 @@ simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { simd8 is_third_byte = - prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be > 0x80 + prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80 simd8 is_fourth_byte = - prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be > 0x80 + prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80 return simd8(is_third_byte | is_fourth_byte); } -/* begin file src/haswell/avx2_validate_utf16.cpp */ +/* begin file src/westmere/internal/loader.cpp */ +namespace internal { +namespace westmere { + +/* begin file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */ +/* + * reads a vector of uint16 values + * bits after 11th are ignored + * first 11 bits are encoded into utf8 + * !important! utf8_output must have at least 16 writable bytes + */ + +inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output, + const __m128i one_byte_bytemask, + const uint16_t one_byte_bitmask) { + // 0b1100_0000_1000_0000 + const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080); + // 0b0001_1111_0000_0000 + const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); + // 0b0000_0000_0011_1111 + const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); + + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = _mm_slli_epi16(v_u16, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = _mm_and_si128(t0, v_1f00); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = _mm_and_si128(v_u16, v_003f); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = _mm_or_si128(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = _mm_or_si128(t3, v_c080); + + // 2. merge ASCII and 2-byte codewords + const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask); + + // 3. 
prepare bitmask for 8-bit lookup + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a + // - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; + const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); + + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + + // 6. adjust pointers + utf8_output += row[0]; +} + +inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output, + const __m128i v_0000, + const __m128i v_ff80) { + // no bits set above 7th bit + const __m128i one_byte_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(v_u16, v_ff80), v_0000); + const uint16_t one_byte_bitmask = + static_cast(_mm_movemask_epi8(one_byte_bytemask)); + + write_v_u16_11bits_to_utf8(v_u16, utf8_output, one_byte_bytemask, + one_byte_bitmask); +} +/* end file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */ + +} // namespace westmere +} // namespace internal +/* end file src/westmere/internal/loader.cpp */ + +/* begin file src/westmere/sse_validate_utf16.cpp */ /* In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. @@ -25468,7 +38170,7 @@ must_be_2_3_continuation(const simd8 prev2, - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) - there must not be sole low surrogate nor high surrogate - We're going to build three bitmasks based on the 3rd nibble: + We are going to build three bitmasks based on the 3rd nibble: - V = valid word, - L = low surrogate (0xd800 .. 0xdbff) - H = high surrogate (0xdc00 .. 0xdfff) @@ -25495,7 +38197,7 @@ must_be_2_3_continuation(const simd8 prev2, - nullptr if an error was detected. */ template -const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) { +const char16_t *sse_validate_utf16(const char16_t *input, size_t size) { const char16_t *end = input + size; const auto v_d8 = simd8::splat(0xd8); @@ -25503,13 +38205,13 @@ const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) { const auto v_fc = simd8::splat(0xfc); const auto v_dc = simd8::splat(0xdc); - while (input + simd16::ELEMENTS * 2 < end) { + while (input + simd16::SIZE * 2 < end) { // 0. Load data: since the validation takes into account only higher // byte of each word, we compress the two vectors into one which // consists only the higher bytes. auto in0 = simd16(input); - auto in1 = simd16(input + simd16::ELEMENTS); - + auto in1 = + simd16(input + simd16::SIZE / sizeof(char16_t)); if (big_endian) { in0 = in0.swap_bytes(); in1 = in1.swap_bytes(); @@ -25522,9 +38224,10 @@ const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) { // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). const auto surrogates_wordmask = (in & v_f8) == v_d8; - const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); - if (surrogates_bitmask == 0x0) { - input += simd16::ELEMENTS * 2; + const uint16_t surrogates_bitmask = + static_cast(surrogates_wordmask.to_bitmask()); + if (surrogates_bitmask == 0x0000) { + input += 16; } else { // 2. 
We have some surrogates that have to be distinguished: // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) @@ -25534,35 +38237,36 @@ const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) { // V - non-surrogate code units // V = not surrogates_wordmask - const uint32_t V = ~surrogates_bitmask; + const uint16_t V = static_cast(~surrogates_bitmask); // H - word-mask for high surrogates: the six highest bits are 0b1101'11 const auto vH = (in & v_fc) == v_dc; - const uint32_t H = vH.to_bitmask(); + const uint16_t H = static_cast(vH.to_bitmask()); // L - word mask for low surrogates // L = not H and surrogates_wordmask - const uint32_t L = ~H & surrogates_bitmask; + const uint16_t L = static_cast(~H & surrogates_bitmask); - const uint32_t a = - L & (H >> 1); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint32_t b = - a << 1; // Just mark that the opposite fact is hold, - // thanks to that we have only two masks for valid case. - const uint32_t c = V | a | b; // Combine all the masks into the final one. + const uint16_t a = static_cast( + L & (H >> 1)); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint16_t b = static_cast( + a << 1); // Just mark that the opinput - startite fact is hold, + // thanks to that we have only two masks for valid case. + const uint16_t c = static_cast( + V | a | b); // Combine all the masks into the final one. - if (c == 0xffffffff) { + if (c == 0xffff) { // The whole input register contains valid UTF-16, i.e., // either single code units or proper surrogate pairs. - input += simd16::ELEMENTS * 2; - } else if (c == 0x7fffffff) { - // The 31 lower code units of the input register contains valid UTF-16. - // The 31 word may be either a low or high surrogate. It the next + input += 16; + } else if (c == 0x7fff) { + // The 15 lower code units of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next // iteration we 1) check if the low surrogate is followed by a high // one, 2) reject sole high surrogate. - input += simd16::ELEMENTS * 2 - 1; + input += 15; } else { return nullptr; } @@ -25573,8 +38277,8 @@ const char16_t *avx2_validate_utf16(const char16_t *input, size_t size) { } template -const result avx2_validate_utf16_with_errors(const char16_t *input, - size_t size) { +const result sse_validate_utf16_with_errors(const char16_t *input, + size_t size) { if (simdutf_unlikely(size == 0)) { return result(error_code::SUCCESS, 0); } @@ -25586,12 +38290,13 @@ const result avx2_validate_utf16_with_errors(const char16_t *input, const auto v_fc = simd8::splat(0xfc); const auto v_dc = simd8::splat(0xdc); - while (input + simd16::ELEMENTS * 2 < end) { + while (input + simd16::SIZE * 2 < end) { // 0. Load data: since the validation takes into account only higher // byte of each word, we compress the two vectors into one which // consists only the higher bytes. auto in0 = simd16(input); - auto in1 = simd16(input + simd16::ELEMENTS); + auto in1 = + simd16(input + simd16::SIZE / sizeof(char16_t)); if (big_endian) { in0 = in0.swap_bytes(); @@ -25605,9 +38310,10 @@ const result avx2_validate_utf16_with_errors(const char16_t *input, // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). 
const auto surrogates_wordmask = (in & v_f8) == v_d8; - const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); - if (surrogates_bitmask == 0x0) { - input += simd16::ELEMENTS * 2; + const uint16_t surrogates_bitmask = + static_cast(surrogates_wordmask.to_bitmask()); + if (surrogates_bitmask == 0x0000) { + input += 16; } else { // 2. We have some surrogates that have to be distinguished: // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) @@ -25617,35 +38323,36 @@ const result avx2_validate_utf16_with_errors(const char16_t *input, // V - non-surrogate code units // V = not surrogates_wordmask - const uint32_t V = ~surrogates_bitmask; + const uint16_t V = static_cast(~surrogates_bitmask); // H - word-mask for high surrogates: the six highest bits are 0b1101'11 const auto vH = (in & v_fc) == v_dc; - const uint32_t H = vH.to_bitmask(); + const uint16_t H = static_cast(vH.to_bitmask()); // L - word mask for low surrogates // L = not H and surrogates_wordmask - const uint32_t L = ~H & surrogates_bitmask; + const uint16_t L = static_cast(~H & surrogates_bitmask); - const uint32_t a = - L & (H >> 1); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint32_t b = - a << 1; // Just mark that the opposite fact is hold, - // thanks to that we have only two masks for valid case. - const uint32_t c = V | a | b; // Combine all the masks into the final one. + const uint16_t a = static_cast( + L & (H >> 1)); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint16_t b = static_cast( + a << 1); // Just mark that the opinput - startite fact is hold, + // thanks to that we have only two masks for valid case. + const uint16_t c = static_cast( + V | a | b); // Combine all the masks into the final one. - if (c == 0xffffffff) { + if (c == 0xffff) { // The whole input register contains valid UTF-16, i.e., // either single code units or proper surrogate pairs. - input += simd16::ELEMENTS * 2; - } else if (c == 0x7fffffff) { - // The 31 lower code units of the input register contains valid UTF-16. - // The 31 word may be either a low or high surrogate. It the next + input += 16; + } else if (c == 0x7fff) { + // The 15 lower code units of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next // iteration we 1) check if the low surrogate is followed by a high // one, 2) reject sole high surrogate. - input += simd16::ELEMENTS * 2 - 1; + input += 15; } else { return result(error_code::SURROGATE, input - start); } @@ -25654,228 +38361,210 @@ const result avx2_validate_utf16_with_errors(const char16_t *input, return result(error_code::SUCCESS, input - start); } -/* end file src/haswell/avx2_validate_utf16.cpp */ -/* begin file src/haswell/avx2_validate_utf32le.cpp */ +/* end file src/westmere/sse_validate_utf16.cpp */ +/* begin file src/westmere/sse_validate_utf32le.cpp */ /* Returns: - pointer to the last unprocessed character (a scalar fallback should check the rest); - nullptr if an error was detected. 
*/ -const char32_t *avx2_validate_utf32le(const char32_t *input, size_t size) { +const char32_t *sse_validate_utf32le(const char32_t *input, size_t size) { const char32_t *end = input + size; - const __m256i standardmax = _mm256_set1_epi32(0x10ffff); - const __m256i offset = _mm256_set1_epi32(0xffff2000); - const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); - __m256i currentmax = _mm256_setzero_si256(); - __m256i currentoffsetmax = _mm256_setzero_si256(); + const __m128i standardmax = _mm_set1_epi32(0x10ffff); + const __m128i offset = _mm_set1_epi32(0xffff2000); + const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); + __m128i currentmax = _mm_setzero_si128(); + __m128i currentoffsetmax = _mm_setzero_si128(); - while (input + 8 < end) { - const __m256i in = _mm256_loadu_si256((__m256i *)input); - currentmax = _mm256_max_epu32(in, currentmax); + while (input + 4 < end) { + const __m128i in = _mm_loadu_si128((__m128i *)input); + currentmax = _mm_max_epu32(in, currentmax); currentoffsetmax = - _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax); - input += 8; + _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax); + input += 4; } - __m256i is_zero = - _mm256_xor_si256(_mm256_max_epu32(currentmax, standardmax), standardmax); - if (_mm256_testz_si256(is_zero, is_zero) == 0) { + __m128i is_zero = + _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); + if (_mm_test_all_zeros(is_zero, is_zero) == 0) { return nullptr; } - is_zero = _mm256_xor_si256( - _mm256_max_epu32(currentoffsetmax, standardoffsetmax), standardoffsetmax); - if (_mm256_testz_si256(is_zero, is_zero) == 0) { + is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (_mm_test_all_zeros(is_zero, is_zero) == 0) { return nullptr; } return input; } -const result avx2_validate_utf32le_with_errors(const char32_t *input, - size_t size) { +const result sse_validate_utf32le_with_errors(const char32_t *input, + size_t size) { const char32_t *start = input; const char32_t *end = input + size; - const __m256i standardmax = _mm256_set1_epi32(0x10ffff); - const __m256i offset = _mm256_set1_epi32(0xffff2000); - const __m256i standardoffsetmax = _mm256_set1_epi32(0xfffff7ff); - __m256i currentmax = _mm256_setzero_si256(); - __m256i currentoffsetmax = _mm256_setzero_si256(); + const __m128i standardmax = _mm_set1_epi32(0x10ffff); + const __m128i offset = _mm_set1_epi32(0xffff2000); + const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); + __m128i currentmax = _mm_setzero_si128(); + __m128i currentoffsetmax = _mm_setzero_si128(); - while (input + 8 < end) { - const __m256i in = _mm256_loadu_si256((__m256i *)input); - currentmax = _mm256_max_epu32(in, currentmax); + while (input + 4 < end) { + const __m128i in = _mm_loadu_si128((__m128i *)input); + currentmax = _mm_max_epu32(in, currentmax); currentoffsetmax = - _mm256_max_epu32(_mm256_add_epi32(in, offset), currentoffsetmax); + _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax); - __m256i is_zero = _mm256_xor_si256( - _mm256_max_epu32(currentmax, standardmax), standardmax); - if (_mm256_testz_si256(is_zero, is_zero) == 0) { + __m128i is_zero = + _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); + if (_mm_test_all_zeros(is_zero, is_zero) == 0) { return result(error_code::TOO_LARGE, input - start); } - is_zero = - _mm256_xor_si256(_mm256_max_epu32(currentoffsetmax, standardoffsetmax), - standardoffsetmax); - if (_mm256_testz_si256(is_zero, is_zero) == 0) { - return 
result(error_code::SURROGATE, input - start); - } - input += 8; - } - - return result(error_code::SUCCESS, input - start); -} -/* end file src/haswell/avx2_validate_utf32le.cpp */ - -/* begin file src/haswell/avx2_convert_latin1_to_utf8.cpp */ -std::pair -avx2_convert_latin1_to_utf8(const char *latin1_input, size_t len, - char *utf8_output) { - const char *end = latin1_input + len; - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); - const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); - const size_t safety_margin = 12; - - while (end - latin1_input >= std::ptrdiff_t(16 + safety_margin)) { - __m128i in8 = _mm_loadu_si128((__m128i *)latin1_input); - // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes - const __m128i v_80 = _mm_set1_epi8((char)0x80); - if (_mm_testz_si128(in8, v_80)) { // ASCII fast path!!!! - // 1. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, in8); - // 2. adjust pointers - latin1_input += 16; - utf8_output += 16; - continue; // we are done for this round! - } - // We proceed only with the first 16 bytes. - const __m256i in = _mm256_cvtepu8_epi16((in8)); - - // 1. prepare 2-byte values - // input 16-bit word : [0000|0000|aabb|bbbb] x 8 - // expected output : [1100|00aa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); - - // t0 = [0000|00aa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in, 2); - // t1 = [0000|00aa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [1100|00aa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); - - // 2. merge ASCII and 2-byte codewords - - // no bits set above 7th bit - const __m256i one_byte_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); - const uint32_t one_byte_bitmask = - static_cast(_mm256_movemask_epi8(one_byte_bytemask)); - - const __m256i utf8_unpacked = _mm256_blendv_epi8(t4, in, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes - - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t *row_2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> 16)] - [0]; - - const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); - - const __m256i utf8_packed = _mm256_shuffle_epi8( - utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); - // 5. store bytes - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_extractf128_si256(utf8_packed, 1)); - utf8_output += row_2[0]; - - // 6. 
adjust pointers - latin1_input += 16; - continue; - - } // while - return std::make_pair(latin1_input, utf8_output); + is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (_mm_test_all_zeros(is_zero, is_zero) == 0) { + return result(error_code::SURROGATE, input - start); + } + input += 4; + } + + return result(error_code::SUCCESS, input - start); } -/* end file src/haswell/avx2_convert_latin1_to_utf8.cpp */ -/* begin file src/haswell/avx2_convert_latin1_to_utf16.cpp */ -template -std::pair -avx2_convert_latin1_to_utf16(const char *latin1_input, size_t len, - char16_t *utf16_output) { - size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 32 +/* end file src/westmere/sse_validate_utf32le.cpp */ - size_t i = 0; - for (; i < rounded_len; i += 16) { - // Load 16 bytes from the address (input + i) into a xmm register - __m128i xmm0 = - _mm_loadu_si128(reinterpret_cast(latin1_input + i)); +/* begin file src/westmere/sse_convert_latin1_to_utf8.cpp */ +std::pair +sse_convert_latin1_to_utf8(const char *latin_input, + const size_t latin_input_length, char *utf8_output) { + const char *end = latin_input + latin_input_length; - // Zero extend each byte in xmm0 to word and put it in another xmm register - __m128i xmm1 = _mm_cvtepu8_epi16(xmm0); + const __m128i v_0000 = _mm_setzero_si128(); + // 0b1000_0000 + const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80); + // 0b1111_1111_1000_0000 + const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); - // Shift xmm0 to the right by 8 bytes - xmm0 = _mm_srli_si128(xmm0, 8); + const __m128i latin_1_half_into_u16_byte_mask = + _mm_setr_epi8(0, '\x80', 1, '\x80', 2, '\x80', 3, '\x80', 4, '\x80', 5, + '\x80', 6, '\x80', 7, '\x80'); - // Zero extend each byte in the shifted xmm0 to word in xmm0 - xmm0 = _mm_cvtepu8_epi16(xmm0); + const __m128i latin_2_half_into_u16_byte_mask = + _mm_setr_epi8(8, '\x80', 9, '\x80', 10, '\x80', 11, '\x80', 12, '\x80', + 13, '\x80', 14, '\x80', 15, '\x80'); - if (big_endian) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - xmm0 = _mm_shuffle_epi8(xmm0, swap); - xmm1 = _mm_shuffle_epi8(xmm1, swap); + // each latin1 takes 1-2 utf8 bytes + // slow path writes useful 8-15 bytes twice (eagerly writes 16 bytes and then + // adjust the pointer) so the last write can exceed the utf8_output size by + // 8-1 bytes by reserving 8 extra input bytes, we expect the output to have + // 8-16 bytes free + while (end - latin_input >= 16 + 8) { + // Load 16 Latin1 characters (16 bytes) into a 128-bit register + __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input); + + if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!! 
+ _mm_storeu_si128((__m128i *)utf8_output, v_latin); + latin_input += 16; + utf8_output += 16; + continue; } - // Store the contents of xmm1 into the address pointed by (output + i) - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i), xmm1); + // assuming a/b are bytes and A/B are uint16 of the same value + // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA + __m128i v_u16_latin_1_half = + _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask); + // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB + __m128i v_u16_latin_2_half = + _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask); - // Store the contents of xmm0 into the address pointed by (output + i + 8) - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + i + 8), xmm0); + internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half, + utf8_output, v_0000, v_ff80); + internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half, + utf8_output, v_0000, v_ff80); + latin_input += 16; + } + + if (end - latin_input >= 16) { + // Load 16 Latin1 characters (16 bytes) into a 128-bit register + __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input); + + if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!! + _mm_storeu_si128((__m128i *)utf8_output, v_latin); + latin_input += 16; + utf8_output += 16; + } else { + // assuming a/b are bytes and A/B are uint16 of the same value + // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA + __m128i v_u16_latin_1_half = + _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask); + internal::westmere::write_v_u16_11bits_to_utf8( + v_u16_latin_1_half, utf8_output, v_0000, v_ff80); + latin_input += 8; + } } + return std::make_pair(latin_input, utf8_output); +} +/* end file src/westmere/sse_convert_latin1_to_utf8.cpp */ +/* begin file src/westmere/sse_convert_latin1_to_utf16.cpp */ +template +std::pair +sse_convert_latin1_to_utf16(const char *latin1_input, size_t len, + char16_t *utf16_output) { + size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 + for (size_t i = 0; i < rounded_len; i += 16) { + // Load 16 Latin1 characters into a 128-bit register + __m128i in = + _mm_loadu_si128(reinterpret_cast(&latin1_input[i])); + __m128i out1 = big_endian ? _mm_unpacklo_epi8(_mm_setzero_si128(), in) + : _mm_unpacklo_epi8(in, _mm_setzero_si128()); + __m128i out2 = big_endian ? 
_mm_unpackhi_epi8(_mm_setzero_si128(), in) + : _mm_unpackhi_epi8(in, _mm_setzero_si128()); + // Zero extend each Latin1 character to 16-bit integers and store the + // results back to memory + _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i]), out1); + _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i + 8]), out2); + } + // return pointers pointing to where we left off return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len); } -/* end file src/haswell/avx2_convert_latin1_to_utf16.cpp */ -/* begin file src/haswell/avx2_convert_latin1_to_utf32.cpp */ +/* end file src/westmere/sse_convert_latin1_to_utf16.cpp */ +/* begin file src/westmere/sse_convert_latin1_to_utf32.cpp */ std::pair -avx2_convert_latin1_to_utf32(const char *buf, size_t len, - char32_t *utf32_output) { - size_t rounded_len = ((len | 7) ^ 7); // Round down to nearest multiple of 8 +sse_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + const char *end = buf + len; - for (size_t i = 0; i < rounded_len; i += 8) { - // Load 8 Latin1 characters into a 64-bit register - __m128i in = _mm_loadl_epi64((__m128i *)&buf[i]); + while (end - buf >= 16) { + // Load 16 Latin1 characters (16 bytes) into a 128-bit register + __m128i in = _mm_loadu_si128((__m128i *)buf); - // Zero extend each set of 8 Latin1 characters to 8 32-bit integers using - // vpmovzxbd - __m256i out = _mm256_cvtepu8_epi32(in); + // Shift input to process next 4 bytes + __m128i in_shifted1 = _mm_srli_si128(in, 4); + __m128i in_shifted2 = _mm_srli_si128(in, 8); + __m128i in_shifted3 = _mm_srli_si128(in, 12); - // Store the results back to memory - _mm256_storeu_si256((__m256i *)&utf32_output[i], out); + // expand 8-bit to 32-bit unit + __m128i out1 = _mm_cvtepu8_epi32(in); + __m128i out2 = _mm_cvtepu8_epi32(in_shifted1); + __m128i out3 = _mm_cvtepu8_epi32(in_shifted2); + __m128i out4 = _mm_cvtepu8_epi32(in_shifted3); + + _mm_storeu_si128((__m128i *)utf32_output, out1); + _mm_storeu_si128((__m128i *)(utf32_output + 4), out2); + _mm_storeu_si128((__m128i *)(utf32_output + 8), out3); + _mm_storeu_si128((__m128i *)(utf32_output + 12), out4); + + utf32_output += 16; + buf += 16; } - // return pointers pointing to where we left off - return std::make_pair(buf + rounded_len, utf32_output + rounded_len); + return std::make_pair(buf, utf32_output); } -/* end file src/haswell/avx2_convert_latin1_to_utf32.cpp */ +/* end file src/westmere/sse_convert_latin1_to_utf32.cpp */ -/* begin file src/haswell/avx2_convert_utf8_to_utf16.cpp */ +/* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */ // depends on "tables/utf8_to_utf16_tables.h" // Convert up to 12 bytes from utf8 to utf16 using a mask indicating the @@ -25904,18 +38593,20 @@ size_t convert_masked_utf8_to_utf16(const char *input, utf8_end_of_code_point_mask & 0xfff; if (utf8_end_of_code_point_mask == 0xfff) { // We process the data in chunks of 12 bytes. 
- __m256i ascii = _mm256_cvtepu8_epi16(in); + // Note: using 16 bytes is unsafe, see issue_ossfuzz_71218 + __m128i ascii_first = _mm_cvtepu8_epi16(in); + __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in, 8)); if (big_endian) { - const __m256i swap256 = _mm256_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, - 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - ascii = _mm256_shuffle_epi8(ascii, swap256); + ascii_first = _mm_shuffle_epi8(ascii_first, swap); + ascii_second = _mm_shuffle_epi8(ascii_second, swap); } - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf16_output), ascii); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), + ascii_second); utf16_output += 12; // We wrote 12 16-bit characters. return 12; // We consumed 12 bytes. } - if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { + if (((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) { // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte // UTF-16 code units. There is probably a more efficient sequence, but the // following might do. @@ -25955,11 +38646,12 @@ size_t convert_masked_utf8_to_utf16(const char *input, utf16_output += 4; return 12; } + /// We do not have a fast path available, so we fallback. - const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex - [input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex - [input_utf8_end_of_code_point_mask][1]; + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; if (idx < 64) { // SIX (6) input code-code units // this is a relatively easy scenario @@ -25967,8 +38659,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, // code code units spanning between 1 and 2 bytes each is 12 bytes. On // processors where pdep/pext is fast, we might be able to use a small // lookup table. - const __m128i sh = _mm_loadu_si128( - (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); const __m128i perm = _mm_shuffle_epi8(in, sh); const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); @@ -25976,12 +38668,11 @@ size_t convert_masked_utf8_to_utf16(const char *input, if (big_endian) composed = _mm_shuffle_epi8(composed, swap); _mm_storeu_si128((__m128i *)utf16_output, composed); - utf16_output += 6; // We wrote 12 bytes, 6 code points. There is a potential - // overflow of 4 bytes. + utf16_output += 6; // We wrote 12 bytes, 6 code points. } else if (idx < 145) { // FOUR (4) input code-code units - const __m128i sh = _mm_loadu_si128( - (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); const __m128i perm = _mm_shuffle_epi8(in, sh); const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits @@ -25997,7 +38688,7 @@ size_t convert_masked_utf8_to_utf16(const char *input, if (big_endian) composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); - utf16_output += 4; // Here we overflow by 8 bytes. 
+ utf16_output += 4; } else if (idx < 209) { // TWO (2) input code-code units ////////////// @@ -26009,8 +38700,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, // only leading bytes at least as large as 0xf0 generate surrogate pairs. We // do as at the cost of an extra mask. ///////////// - const __m128i sh = _mm_loadu_si128( - (const __m128i *)simdutf::tables::utf8_to_utf16::shufutf8[idx]); + const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); const __m128i perm = _mm_shuffle_epi8(in, sh); const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); @@ -26071,8 +38762,8 @@ size_t convert_masked_utf8_to_utf16(const char *input, } return consumed; } -/* end file src/haswell/avx2_convert_utf8_to_utf16.cpp */ -/* begin file src/haswell/avx2_convert_utf8_to_utf32.cpp */ +/* end file src/westmere/sse_convert_utf8_to_utf16.cpp */ +/* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */ // depends on "tables/utf8_to_utf16_tables.h" // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the @@ -26098,10 +38789,14 @@ size_t convert_masked_utf8_to_utf32(const char *input, utf8_end_of_code_point_mask & 0xfff; if (utf8_end_of_code_point_mask == 0xfff) { // We process the data in chunks of 12 bytes. - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), - _mm256_cvtepu8_epi32(in)); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output + 8), - _mm256_cvtepu8_epi32(_mm_srli_si128(in, 8))); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), + _mm_cvtepu8_epi32(in)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), + _mm_cvtepu8_epi32(_mm_srli_si128(in, 4))); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 8), + _mm_cvtepu8_epi32(_mm_srli_si128(in, 8))); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 12), + _mm_cvtepu8_epi32(_mm_srli_si128(in, 12))); utf32_output += 12; // We wrote 12 32-bit characters. return 12; // We consumed 12 bytes. } @@ -26115,9 +38810,11 @@ size_t convert_masked_utf8_to_utf32(const char *input, const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - _mm256_storeu_si256((__m256i *)utf32_output, - _mm256_cvtepu16_epi32(composed)); - utf32_output += 8; // We wrote 16 bytes, 8 code points. + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), + _mm_cvtepu16_epi32(composed)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), + _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8))); + utf32_output += 8; // We wrote 32 bytes, 8 code points. return 16; } if (input_utf8_end_of_code_point_mask == 0x924) { @@ -26160,10 +38857,11 @@ size_t convert_masked_utf8_to_utf32(const char *input, const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - _mm256_storeu_si256((__m256i *)utf32_output, - _mm256_cvtepu16_epi32(composed)); - utf32_output += 6; // We wrote 24 bytes, 6 code points. There is a potential - // overflow of 32 - 24 = 8 bytes. 
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), + _mm_cvtepu16_epi32(composed)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), + _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8))); + utf32_output += 6; // We wrote 12 bytes, 6 code points. } else if (idx < 145) { // FOUR (4) input code-code units const __m128i sh = @@ -26201,46 +38899,99 @@ size_t convert_masked_utf8_to_utf32(const char *input, _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); _mm_storeu_si128((__m128i *)utf32_output, composed); - utf32_output += - 3; // We wrote 3 * 4 bytes, there is a potential overflow of 4 bytes. + utf32_output += 3; } else { // here we know that there is an error but we do not handle errors } return consumed; } -/* end file src/haswell/avx2_convert_utf8_to_utf32.cpp */ +/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */ +/* begin file src/westmere/sse_convert_utf8_to_latin1.cpp */ +// depends on "tables/utf8_to_utf16_tables.h" -/* begin file src/haswell/avx2_convert_utf16_to_latin1.cpp */ +// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + const __m128i in = _mm_loadu_si128((__m128i *)input); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & + 0xfff; // we are only processing 12 bytes in case it is not all ASCII + if (utf8_end_of_code_point_mask == 0xfff) { + // We process the data in chunks of 12 bytes. + _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); + latin1_output += 12; // We wrote 12 characters. + return 12; // We consumed 12 bytes. + } + /// We do not have a fast path available, so we fallback. + const uint8_t idx = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = + tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if (idx >= 64) { + return consumed; + } + // Here we should have (idx < 64), if not, there is a bug in the validation or + // elsewhere. SIX (6) input code-code units this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. On + // processors where pdep/pext is fast, we might be able to use a small lookup + // table. 
+ const __m128i sh = + _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); + const __m128i perm = _mm_shuffle_epi8(in, sh); + const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); + const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); + __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); + const __m128i latin1_packed = _mm_packus_epi16(composed, composed); + // writing 8 bytes even though we only care about the first 6 bytes. + // performance note: it would be faster to use _mm_storeu_si128, we should + // investigate. + _mm_storel_epi64((__m128i *)latin1_output, latin1_packed); + latin1_output += 6; // We wrote 6 bytes. + return consumed; +} +/* end file src/westmere/sse_convert_utf8_to_latin1.cpp */ + +/* begin file src/westmere/sse_convert_utf16_to_latin1.cpp */ template std::pair -avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len, - char *latin1_output) { +sse_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { const char16_t *end = buf + len; - while (end - buf >= 16) { - // Load 16 UTF-16 characters into 256-bit AVX2 register - __m256i in = _mm256_loadu_si256(reinterpret_cast(buf)); + while (end - buf >= 8) { + // Load 8 UTF-16 characters into 128-bit SSE register + __m128i in = _mm_loadu_si128(reinterpret_cast(buf)); if (!match_system(big_endian)) { - const __m256i swap = _mm256_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, - 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); } - __m256i high_byte_mask = _mm256_set1_epi16((int16_t)0xFF00); - if (_mm256_testz_si256(in, high_byte_mask)) { + __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00); + if (_mm_testz_si128(in, high_byte_mask)) { // Pack 16-bit characters into 8-bit and store in latin1_output - __m128i lo = _mm256_extractf128_si256(in, 0); - __m128i hi = _mm256_extractf128_si256(in, 1); - __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); - __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi); + __m128i latin1_packed = _mm_packus_epi16(in, in); _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), - latin1_packed_lo); - _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8), - latin1_packed_hi); + latin1_packed); // Adjust pointers for next iteration - buf += 16; - latin1_output += 16; + buf += 8; + latin1_output += 8; } else { return std::make_pair(nullptr, reinterpret_cast(latin1_output)); } @@ -26250,54 +39001,47 @@ avx2_convert_utf16_to_latin1(const char16_t *buf, size_t len, template std::pair -avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, - char *latin1_output) { +sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { const char16_t *start = buf; const char16_t *end = buf + len; - while (end - buf >= 16) { - __m256i in = _mm256_loadu_si256(reinterpret_cast(buf)); + while (end - buf >= 8) { + __m128i in = _mm_loadu_si128(reinterpret_cast(buf)); if (!match_system(big_endian)) { - const __m256i swap = _mm256_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, - 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); } - __m256i high_byte_mask = 
_mm256_set1_epi16((int16_t)0xFF00); - if (_mm256_testz_si256(in, high_byte_mask)) { - __m128i lo = _mm256_extractf128_si256(in, 0); - __m128i hi = _mm256_extractf128_si256(in, 1); - __m128i latin1_packed_lo = _mm_packus_epi16(lo, lo); - __m128i latin1_packed_hi = _mm_packus_epi16(hi, hi); + __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00); + if (_mm_testz_si128(in, high_byte_mask)) { + __m128i latin1_packed = _mm_packus_epi16(in, in); _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), - latin1_packed_lo); - _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output + 8), - latin1_packed_hi); - buf += 16; - latin1_output += 16; + latin1_packed); + buf += 8; + latin1_output += 8; } else { // Fallback to scalar code for handling errors - for (int k = 0; k < 16; k++) { + for (int k = 0; k < 8; k++) { uint16_t word = !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; if (word <= 0xff) { *latin1_output++ = char(word); } else { - return std::make_pair( - result{error_code::TOO_LARGE, (size_t)(buf - start + k)}, - latin1_output); + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); } } - buf += 16; + buf += 8; } } // while - return std::make_pair(result{error_code::SUCCESS, (size_t)(buf - start)}, + return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); } -/* end file src/haswell/avx2_convert_utf16_to_latin1.cpp */ -/* begin file src/haswell/avx2_convert_utf16_to_utf8.cpp */ +/* end file src/westmere/sse_convert_utf16_to_latin1.cpp */ +/* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */ /* The vectorized algorithm works on single SSE register i.e., it loads eight 16-bit code units. @@ -26353,117 +39097,91 @@ avx2_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, */ template std::pair -avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { +sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { + const char16_t *end = buf + len; - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); - const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); const size_t safety_margin = 12; // to avoid overruns, see issue // https://github.com/simdutf/simdutf/issues/92 while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m128i in = _mm_loadu_si128((__m128i *)buf); if (big_endian) { - const __m256i swap = _mm256_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, - 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); } // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes - const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); - if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16( - _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! 
+ const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80); + if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!! + __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + nextin = _mm_shuffle_epi8(nextin, swap); + } + if (!_mm_testz_si128(nextin, v_ff80)) { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in, in); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in, nextin); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } } - // no bits set above 7th bit - const __m256i one_byte_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); - const uint32_t one_byte_bitmask = - static_cast(_mm256_movemask_epi8(one_byte_bytemask)); - - // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = - static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { - - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); - // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = - _mm256_blendv_epi8(t4, in, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes - - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t *row_2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> - 16)][0]; - - const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + // no bits set above 7th bit + const __m128i one_byte_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000); + const uint16_t one_byte_bitmask = + static_cast(_mm_movemask_epi8(one_byte_bytemask)); - const __m256i utf8_packed = _mm256_shuffle_epi8( - utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); - // 5. 
store bytes - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_extractf128_si256(utf8_packed, 1)); - utf8_output += row_2[0]; + // no bits set above 11th bit + const __m128i one_or_two_bytes_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = + static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); - // 6. adjust pointers - buf += 16; + if (one_or_two_bytes_bitmask == 0xffff) { + internal::westmere::write_v_u16_11bits_to_utf8( + in, utf8_output, one_byte_bytemask, one_byte_bitmask); + buf += 8; continue; } + // 1. Check if there are any surrogate word in the input chunk. // We have also deal with situation when there is a surrogate word // at the end of a chunk. - const __m256i surrogates_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + const __m128i surrogates_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); // bitmask = 0x0000 if there are no surrogates // = 0xc000 if the last word is a surrogate - const uint32_t surrogates_bitmask = - static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + const uint16_t surrogates_bitmask = + static_cast(_mm_movemask_epi8(surrogates_bytemask)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x00000000) { + if (surrogates_bitmask == 0x0000) { // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m256i dup_even = _mm256_setr_epi16( - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); /* In this branch we handle three cases: 1. 
[0000|0000|0ccc|cccc] => [0ccc|cccc] - @@ -26492,90 +39210,67 @@ avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { * t2 => [0ccc|cccc] [10cc|cccc] * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) +#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); + const __m128i t0 = _mm_shuffle_epi8(in, dup_even); // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in, 4); + const __m128i s0 = _mm_srli_epi16(in, 4); // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, - simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); + const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); #undef simdutf_vec // 4. expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be - // useful. - /*if(mask == 0) { + const uint16_t mask = + (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); + if (mask == 0) { // We only have three-byte code units. Use fast path. 
- const __m256i shuffle = - _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, - 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = - _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = - _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, + 15, 13, -1, -1, -1, -1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + buf += 8; continue; - }*/ + } const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); - const __m128i utf8_0 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); - const __m128i utf8_1 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); - - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t *row2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); - const __m128i utf8_2 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); - - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t *row3 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); - const __m128i utf8_3 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); _mm_storeu_si128((__m128i *)utf8_output, utf8_0); utf8_output += row0[0]; _mm_storeu_si128((__m128i *)utf8_output, utf8_1); utf8_output += row1[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; + + buf += 8; // surrogate pair(s) in a register } else { // Let us do a scalar fallback. 
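For context on the "scalar fallback" these kernels branch into whenever a chunk contains surrogate code units: it is the standard per-code-unit UTF-16 to UTF-8 case analysis. The sketch below is an illustrative, hypothetical helper written under that assumption, not the vendored simdutf routine; it omits the big-endian byte swap and the error reporting that the `_with_errors` variants in the patch perform.

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical helper: convert up to `len` UTF-16 code units to UTF-8,
// handling surrogate pairs. Returns the number of code units consumed and
// advances `out` past the bytes written. Stops at a truncated or invalid
// pair so a caller could report the error position.
static size_t scalar_utf16_to_utf8(const char16_t *buf, size_t len, char *&out) {
  size_t k = 0;
  while (k < len) {
    uint32_t word = buf[k];
    if ((word & 0xFF80) == 0) { // 1 byte (ASCII)
      *out++ = char(word);
      k++;
    } else if ((word & 0xF800) == 0) { // 2 bytes
      *out++ = char((word >> 6) | 0b11000000);
      *out++ = char((word & 0b111111) | 0b10000000);
      k++;
    } else if ((word & 0xF800) != 0xD800) { // 3 bytes, not a surrogate
      *out++ = char((word >> 12) | 0b11100000);
      *out++ = char(((word >> 6) & 0b111111) | 0b10000000);
      *out++ = char((word & 0b111111) | 0b10000000);
      k++;
    } else { // surrogate pair => 4 bytes
      if (k + 1 >= len) {
        break; // truncated pair: stop and let the caller decide
      }
      const uint32_t high = word;
      const uint32_t low = buf[k + 1];
      if ((high & 0xFC00) != 0xD800 || (low & 0xFC00) != 0xDC00) {
        break; // mismatched surrogate: stop and let the caller report it
      }
      const uint32_t cp = 0x10000 + (((high - 0xD800) << 10) | (low - 0xDC00));
      *out++ = char((cp >> 18) | 0b11110000);
      *out++ = char(((cp >> 12) & 0b111111) | 0b10000000);
      *out++ = char(((cp >> 6) & 0b111111) | 0b10000000);
      *out++ = char((cp & 0b111111) | 0b10000000);
      k += 2;
    }
  }
  return k;
}
```

In the vendored code this fallback only runs over a small bounded window of code units (the `forward` counter visible in the UTF-16 to UTF-32 hunks below) before control returns to the SIMD loop, so the scalar cost stays limited to the chunks that actually contain surrogates.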
@@ -26617,6 +39312,7 @@ avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { buf += k; } } // while + return std::make_pair(buf, utf8_output); } @@ -26629,120 +39325,92 @@ avx2_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { */ template std::pair -avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, - char *utf8_output) { +sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, + char *utf8_output) { const char16_t *start = buf; const char16_t *end = buf + len; - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); - const __m256i v_c080 = _mm256_set1_epi16((int16_t)0xc080); + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); const size_t safety_margin = 12; // to avoid overruns, see issue // https://github.com/simdutf/simdutf/issues/92 while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); + __m128i in = _mm_loadu_si128((__m128i *)buf); if (big_endian) { - const __m256i swap = _mm256_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, - 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); } // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes - const __m256i v_ff80 = _mm256_set1_epi16((int16_t)0xff80); - if (_mm256_testz_si256(in, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16( - _mm256_castsi256_si128(in), _mm256_extractf128_si256(in, 1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! + const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80); + if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!! + __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); + if (big_endian) { + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + nextin = _mm_shuffle_epi8(nextin, swap); + } + if (!_mm_testz_si128(nextin, v_ff80)) { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in, in); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } else { + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in, nextin); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! 
+ } } + // no bits set above 7th bit - const __m256i one_byte_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in, v_ff80), v_0000); - const uint32_t one_byte_bitmask = - static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + const __m128i one_byte_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000); + const uint16_t one_byte_bitmask = + static_cast(_mm_movemask_epi8(one_byte_bytemask)); // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = - static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { - - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in, 2); - // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); - - // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = - _mm256_blendv_epi8(t4, in, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; - // 4. pack the bytes - - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t *row_2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> - 16)][0]; - - const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); - - const __m256i utf8_packed = _mm256_shuffle_epi8( - utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); - // 5. store bytes - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_extractf128_si256(utf8_packed, 1)); - utf8_output += row_2[0]; + const __m128i one_or_two_bytes_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = + static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); - // 6. adjust pointers - buf += 16; + if (one_or_two_bytes_bitmask == 0xffff) { + internal::westmere::write_v_u16_11bits_to_utf8( + in, utf8_output, one_byte_bytemask, one_byte_bitmask); + buf += 8; continue; } + // 1. Check if there are any surrogate word in the input chunk. // We have also deal with situation when there is a surrogate word // at the end of a chunk. - const __m256i surrogates_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + const __m128i surrogates_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); // bitmask = 0x0000 if there are no surrogates // = 0xc000 if the last word is a surrogate - const uint32_t surrogates_bitmask = - static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + const uint16_t surrogates_bitmask = + static_cast(_mm_movemask_epi8(surrogates_bytemask)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. 
- if (surrogates_bitmask == 0x00000000) { + if (surrogates_bitmask == 0x0000) { // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m256i dup_even = _mm256_setr_epi16( - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); /* In this branch we handle three cases: 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - @@ -26771,90 +39439,67 @@ avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, * t2 => [0ccc|cccc] [10cc|cccc] * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) +#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in, dup_even); + const __m128i t0 = _mm_shuffle_epi8(in, dup_even); // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in, 4); + const __m128i s0 = _mm_srli_epi16(in, 4); // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, - simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); + const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); #undef simdutf_vec // 4. expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be - // useful. - /*if(mask == 0) { + const uint16_t mask = + (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); + if (mask == 0) { // We only have three-byte code units. Use fast path. 
- const __m256i shuffle = - _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, - 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = - _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = - _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, + 15, 13, -1, -1, -1, -1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + buf += 8; continue; - }*/ + } const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); - const __m128i utf8_0 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); - const __m128i utf8_1 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); - - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t *row2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); - const __m128i utf8_2 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); - - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t *row3 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); - const __m128i utf8_3 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); _mm_storeu_si128((__m128i *)utf8_output, utf8_0); utf8_output += row0[0]; _mm_storeu_si128((__m128i *)utf8_output, utf8_1); utf8_output += row1[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; + + buf += 8; // surrogate pair(s) in a register } else { // Let us do a scalar fallback. @@ -26898,10 +39543,11 @@ avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, buf += k; } } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); } -/* end file src/haswell/avx2_convert_utf16_to_utf8.cpp */ -/* begin file src/haswell/avx2_convert_utf16_to_utf32.cpp */ +/* end file src/westmere/sse_convert_utf16_to_utf8.cpp */ +/* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */ /* The vectorized algorithm works on single SSE register i.e., it loads eight 16-bit code units. @@ -26910,14 +39556,14 @@ avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, 1. an input register contains no surrogates and each value is in range 0x0000 .. 0x07ff. 2. an input register contains no surrogates and values are - in range 0x0000 .. 0xffff. + is in range 0x0000 .. 0xffff. 3. 
an input register contains surrogates --- i.e. codepoints can have 16 or 32 bits. Ad 1. When values are less than 0x0800, it means that a 16-bit code unit - can be converted into: 1) single UTF8 byte (when it is an ASCII + can be converted into: 1) single UTF8 byte (when it's an ASCII char) or 2) two UTF8 bytes. For this case we do only some shuffle to obtain these 2-byte @@ -26952,48 +39598,47 @@ avx2_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, */ /* - Returns a pair: the first unprocessed byte from buf and utf32_output + Returns a pair: the first unprocessed byte from buf and utf8_output A scalar routing should carry on the conversion of the tail. */ template std::pair -avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len, - char32_t *utf32_output) { +sse_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_output) { const char16_t *end = buf + len; - const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); - while (end - buf >= 16) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); + + while (end - buf >= 8) { + __m128i in = _mm_loadu_si128((__m128i *)buf); + if (big_endian) { - const __m256i swap = _mm256_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, - 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); } // 1. Check if there are any surrogate word in the input chunk. // We have also deal with situation when there is a surrogate word // at the end of a chunk. - const __m256i surrogates_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + const __m128i surrogates_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); // bitmask = 0x0000 if there are no surrogates // = 0xc000 if the last word is a surrogate - const uint32_t surrogates_bitmask = - static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + const uint16_t surrogates_bitmask = + static_cast(_mm_movemask_epi8(surrogates_bytemask)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x00000000) { - // case: we extend all sixteen 16-bit code units to sixteen 32-bit code - // units - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); - _mm256_storeu_si256( - reinterpret_cast<__m256i *>(utf32_output + 8), - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1))); - utf32_output += 16; - buf += 16; + if (surrogates_bitmask == 0x0000) { + // case: no surrogate pair, extend 16-bit code units to 32-bit code units + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), + _mm_cvtepu16_epi32(in)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), + _mm_cvtepu16_epi32(_mm_srli_si128(in, 8))); + utf32_output += 8; + buf += 8; // surrogate pair(s) in a register } else { // Let us do a scalar fallback. @@ -27007,7 +39652,6 @@ avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len, for (; k < forward; k++) { uint16_t word = big_endian ? 
scalar::utf16::swap_bytes(buf[k]) : buf[k]; if ((word & 0xF800) != 0xD800) { - // No surrogate pair *utf32_output++ = char32_t(word); } else { // must be a surrogate pair @@ -27038,44 +39682,43 @@ avx2_convert_utf16_to_utf32(const char16_t *buf, size_t len, */ template std::pair -avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, - char32_t *utf32_output) { +sse_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, + char32_t *utf32_output) { const char16_t *start = buf; const char16_t *end = buf + len; - const __m256i v_f800 = _mm256_set1_epi16((int16_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi16((int16_t)0xd800); - while (end - buf >= 16) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); + const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); + + while (end - buf >= 8) { + __m128i in = _mm_loadu_si128((__m128i *)buf); + if (big_endian) { - const __m256i swap = _mm256_setr_epi8( - 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17, 16, 19, 18, - 21, 20, 23, 22, 25, 24, 27, 26, 29, 28, 31, 30); - in = _mm256_shuffle_epi8(in, swap); + const __m128i swap = + _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + in = _mm_shuffle_epi8(in, swap); } // 1. Check if there are any surrogate word in the input chunk. // We have also deal with situation when there is a surrogate word // at the end of a chunk. - const __m256i surrogates_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in, v_f800), v_d800); + const __m128i surrogates_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); // bitmask = 0x0000 if there are no surrogates // = 0xc000 if the last word is a surrogate - const uint32_t surrogates_bitmask = - static_cast(_mm256_movemask_epi8(surrogates_bytemask)); + const uint16_t surrogates_bitmask = + static_cast(_mm_movemask_epi8(surrogates_bytemask)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x00000000) { - // case: we extend all sixteen 16-bit code units to sixteen 32-bit code - // units - _mm256_storeu_si256(reinterpret_cast<__m256i *>(utf32_output), - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(in))); - _mm256_storeu_si256( - reinterpret_cast<__m256i *>(utf32_output + 8), - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(in, 1))); - utf32_output += 16; - buf += 16; + if (surrogates_bitmask == 0x0000) { + // case: no surrogate pair, extend 16-bit code units to 32-bit code units + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), + _mm_cvtepu16_epi32(in)); + _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), + _mm_cvtepu16_epi32(_mm_srli_si128(in, 8))); + utf32_output += 8; + buf += 8; // surrogate pair(s) in a register } else { // Let us do a scalar fallback. @@ -27089,7 +39732,6 @@ avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, for (; k < forward; k++) { uint16_t word = big_endian ? 
scalar::utf16::swap_bytes(buf[k]) : buf[k]; if ((word & 0xF800) != 0xD800) { - // No surrogate pair *utf32_output++ = char32_t(word); } else { // must be a surrogate pair @@ -27112,229 +39754,290 @@ avx2_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, } // while return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); } -/* end file src/haswell/avx2_convert_utf16_to_utf32.cpp */ +/* end file src/westmere/sse_convert_utf16_to_utf32.cpp */ -/* begin file src/haswell/avx2_convert_utf32_to_latin1.cpp */ +/* begin file src/westmere/sse_convert_utf32_to_latin1.cpp */ std::pair -avx2_convert_utf32_to_latin1(const char32_t *buf, size_t len, - char *latin1_output) { - const size_t rounded_len = - len & ~0x1F; // Round down to nearest multiple of 32 - - __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); +sse_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); + __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); + __m128i shufmask = + _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); for (size_t i = 0; i < rounded_len; i += 16) { - __m256i in1 = _mm256_loadu_si256((__m256i *)buf); - __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8)); + __m128i in1 = _mm_loadu_si128((__m128i *)buf); + __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4)); + __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8)); + __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12)); - __m256i check_combined = _mm256_or_si256(in1, in2); + __m128i check_combined = _mm_or_si128(in1, in2); + check_combined = _mm_or_si128(check_combined, in3); + check_combined = _mm_or_si128(check_combined, in4); - if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { + if (!_mm_testz_si128(check_combined, high_bytes_mask)) { return std::make_pair(nullptr, latin1_output); } - - // Turn UTF32 bytes into latin 1 bytes - __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask); - __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask); - - // move Latin1 bytes to their correct spot - __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0); - __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1); - __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1); - __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2); - - __m256i result = _mm256_or_si256(reshuffled1, reshuffled2); - _mm_storeu_si128((__m128i *)latin1_output, _mm256_castsi256_si128(result)); - + __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask), + _mm_shuffle_epi8(in2, shufmask)); + __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask), + _mm_shuffle_epi8(in4, shufmask)); + __m128i pack = _mm_unpacklo_epi64(pack1, pack2); + _mm_storeu_si128((__m128i *)latin1_output, pack); latin1_output += 16; buf += 16; } return std::make_pair(buf, latin1_output); } -std::pair -avx2_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, - char *latin1_output) { - const size_t rounded_len = - len & ~0x1F; // Round down to nearest multiple of 32 - __m256i high_bytes_mask = _mm256_set1_epi32(0xFFFFFF00); - __m256i shufmask = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); +std::pair 
+sse_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *start = buf; + const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - const char32_t *start = buf; + __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); + __m128i shufmask = + _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); for (size_t i = 0; i < rounded_len; i += 16) { - __m256i in1 = _mm256_loadu_si256((__m256i *)buf); - __m256i in2 = _mm256_loadu_si256((__m256i *)(buf + 8)); + __m128i in1 = _mm_loadu_si128((__m128i *)buf); + __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4)); + __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8)); + __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12)); - __m256i check_combined = _mm256_or_si256(in1, in2); + __m128i check_combined = _mm_or_si128(in1, in2); + check_combined = _mm_or_si128(check_combined, in3); + check_combined = _mm_or_si128(check_combined, in4); - if (!_mm256_testz_si256(check_combined, high_bytes_mask)) { + if (!_mm_testz_si128(check_combined, high_bytes_mask)) { // Fallback to scalar code for handling errors - for (int k = 0; k < 8; k++) { + for (int k = 0; k < 16; k++) { char32_t codepoint = buf[k]; - if (codepoint <= 0xFF) { - *latin1_output++ = static_cast(codepoint); + if (codepoint <= 0xff) { + *latin1_output++ = char(codepoint); } else { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), latin1_output); } } - buf += 8; - } else { - __m256i shuffled1 = _mm256_shuffle_epi8(in1, shufmask); - __m256i shuffled2 = _mm256_shuffle_epi8(in2, shufmask); - - __m256i idx1 = _mm256_set_epi32(-1, -1, -1, -1, -1, -1, 4, 0); - __m256i idx2 = _mm256_set_epi32(-1, -1, -1, -1, 4, 0, -1, -1); - __m256i reshuffled1 = _mm256_permutevar8x32_epi32(shuffled1, idx1); - __m256i reshuffled2 = _mm256_permutevar8x32_epi32(shuffled2, idx2); - - __m256i result = _mm256_or_si256(reshuffled1, reshuffled2); - _mm_storeu_si128((__m128i *)latin1_output, - _mm256_castsi256_si128(result)); - - latin1_output += 16; buf += 16; + continue; } + __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask), + _mm_shuffle_epi8(in2, shufmask)); + __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask), + _mm_shuffle_epi8(in4, shufmask)); + __m128i pack = _mm_unpacklo_epi64(pack1, pack2); + _mm_storeu_si128((__m128i *)latin1_output, pack); + latin1_output += 16; + buf += 16; } return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); } -/* end file src/haswell/avx2_convert_utf32_to_latin1.cpp */ -/* begin file src/haswell/avx2_convert_utf32_to_utf8.cpp */ +/* end file src/westmere/sse_convert_utf32_to_latin1.cpp */ +/* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */ std::pair -avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { +sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { const char32_t *end = buf + len; - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); - const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); - const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); - const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); - const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); - __m256i running_max = _mm256_setzero_si256(); - __m256i forbidden_bytemask = _mm256_setzero_si256(); + const __m128i v_0000 = _mm_setzero_si128(); //__m128 = 128 bits + const __m128i v_f800 = 
_mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000 + // 0000 + const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000 + // 0000 + const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000 + // 0000 + const __m128i v_ffff0000 = _mm_set1_epi32( + (uint32_t)0xffff0000); // 1111 1111 1111 1111 0000 0000 0000 0000 + const __m128i v_7fffffff = _mm_set1_epi32( + (uint32_t)0x7fffffff); // 0111 1111 1111 1111 1111 1111 1111 1111 + __m128i running_max = _mm_setzero_si128(); + __m128i forbidden_bytemask = _mm_setzero_si128(); const size_t safety_margin = 12; // to avoid overruns, see issue // https://github.com/simdutf/simdutf/issues/92 - while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); - __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); - running_max = _mm256_max_epu32(_mm256_max_epu32(in, running_max), nextin); + while (end - buf >= + std::ptrdiff_t( + 16 + safety_margin)) { // buf is a char32_t pointer, each char32_t + // has 4 bytes or 32 bits, thus buf + 16 * + // char_32t = 512 bits = 64 bytes + // We load two 16 bytes registers for a total of 32 bytes or 16 characters. + __m128i in = _mm_loadu_si128((__m128i *)buf); + __m128i nextin = _mm_loadu_si128( + (__m128i *)buf + 1); // These two values can hold only 8 UTF32 chars + running_max = _mm_max_epu32( + _mm_max_epu32(in, running_max), // take element-wise max char32_t from + // in and running_max vector + nextin); // and take element-wise max element from nextin and + // running_max vector // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned // saturation - __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), - _mm256_and_si256(nextin, v_7fffffff)); - in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + __m128i in_16 = _mm_packus_epi32( + _mm_and_si128(in, v_7fffffff), + _mm_and_si128( + nextin, + v_7fffffff)); // in this context pack the two __m128 into a single + // By ensuring the highest bit is set to 0(&v_7fffffff), we are making sure + // all values are interpreted as non-negative, or specifically, the values + // are within the range of valid Unicode code points. remember : having + // leading byte 0 means a positive number by the two complements system. + // Unicode is well beneath the range where you'll start getting issues so + // that's OK. - // Try to apply UTF-16 => UTF-8 routine on 256 bits - // (haswell/avx2_convert_utf16_to_utf8.cpp) + // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp - if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16( - _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! + // Check for ASCII fast path + + // ASCII fast path!!!! + // We eagerly load another 32 bytes, hoping that they will be ASCII too. + // The intuition is that we try to collect 16 ASCII characters which + // requires a total of 64 bytes of input. If we fail, we just pass thirdin + // and fourthin as our new inputs. 
+ if (_mm_testz_si128(in_16, v_ff80)) { // if the first two blocks are ASCII + __m128i thirdin = _mm_loadu_si128((__m128i *)buf + 2); + __m128i fourthin = _mm_loadu_si128((__m128i *)buf + 3); + running_max = _mm_max_epu32( + _mm_max_epu32(thirdin, running_max), + fourthin); // take the running max of all 4 vectors thus far + __m128i nextin_16 = _mm_packus_epi32( + _mm_and_si128(thirdin, v_7fffffff), + _mm_and_si128(fourthin, + v_7fffffff)); // pack into 1 vector, now you have two + if (!_mm_testz_si128( + nextin_16, + v_ff80)) { // checks if the second packed vector is ASCII, if not: + // 1. pack the bytes + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16( + in_16, in_16); // creates two copy of in_16 in 1 vector + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, + utf8_packed); // put them into the output + // 3. adjust pointers + buf += 8; // the char32_t buffer pointer goes up 8 char32_t chars* 32 + // bits = 256 bits + utf8_output += + 8; // same with output, e.g. lift the first two blocks alone. + // Proceed with next input + in_16 = nextin_16; + // We need to update in and nextin because they are used later. + in = thirdin; + nextin = fourthin; + } else { + // 1. pack the bytes + const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16); + // 2. store (16 bytes) + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } } - // no bits set above 7th bit - const __m256i one_byte_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); - const uint32_t one_byte_bitmask = - static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + + // no bits set above 7th bit -- find out all the ASCII characters + const __m128i one_byte_bytemask = + _mm_cmpeq_epi16( // this takes four bytes at a time and compares: + _mm_and_si128(in_16, v_ff80), // the vector that get only the first + // 9 bits of each 16-bit/2-byte units + v_0000 // + ); // they should be all zero if they are ASCII. E.g. ASCII in UTF32 is + // of format 0000 0000 0000 0XXX XXXX + // _mm_cmpeq_epi16 should now return a 1111 1111 1111 1111 for equals, and + // 0000 0000 0000 0000 if not for each 16-bit/2-byte units + const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8( + one_byte_bytemask)); // collect the MSB from previous vector and put + // them into uint16_t mas // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = - static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { + const __m128i one_or_two_bytes_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = + static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + + if (one_or_two_bytes_bitmask == 0xffff) { + // case: all code units either produce 1 or 2 UTF-8 bytes (at least one + // produces 2 bytes) // 1. 
prepare 2-byte values // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + const __m128i v_1f00 = + _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000 + const __m128i v_003f = + _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111 // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in_16, 2); + const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); + const __m128i t1 = + _mm_and_si128(t0, v_1f00); // potentital first utf8 byte // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in_16, v_003f); + const __m128i t2 = + _mm_and_si128(in_16, v_003f); // potential second utf8 byte // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); + const __m128i t3 = + _mm_or_si128(t1, t2); // first and second potential utf8 byte together // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); + const __m128i t4 = _mm_or_si128( + t3, + v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = - _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + const __m128i utf8_unpacked = + _mm_blendv_epi8(t4, in_16, one_byte_bytemask); // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - + // MSB, a - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = + static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = + static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea // 4. pack the bytes - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t *row_2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> - 16)][0]; - + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); - const __m256i utf8_packed = _mm256_shuffle_epi8( - utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); // 5. store bytes - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_extractf128_si256(utf8_packed, 1)); - utf8_output += row_2[0]; + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); // 6. 
adjust pointers - buf += 16; + buf += 8; + utf8_output += row[0]; continue; } - // Must check for overflow in packing - const __m256i saturation_bytemask = _mm256_cmpeq_epi32( - _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + + // Check for overflow in packing + + const __m128i saturation_bytemask = _mm_cmpeq_epi32( + _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); const uint32_t saturation_bitmask = - static_cast(_mm256_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffffffff) { + static_cast(_mm_movemask_epi8(saturation_bytemask)); + if (saturation_bitmask == 0xffff) { // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); - forbidden_bytemask = _mm256_or_si256( - forbidden_bytemask, - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800)); + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = + _mm_or_si128(forbidden_bytemask, + _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800)); - const __m256i dup_even = _mm256_setr_epi16( - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two - UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes We expand the input word (16-bit) into two code units (32-bit), thus @@ -27356,95 +40059,72 @@ avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { * t2 => [0ccc|cccc] [10cc|cccc] * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) +#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even); // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in_16, 4); + const __m128i s0 = _mm_srli_epi16(in_16, 4); // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, - simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); + 
const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); #undef simdutf_vec // 4. expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be - // useful. - /*if(mask == 0) { + const uint16_t mask = + (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); + if (mask == 0) { // We only have three-byte code units. Use fast path. - const __m256i shuffle = - _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, - 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = - _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = - _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, + 15, 13, -1, -1, -1, -1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + buf += 8; continue; - }*/ + } const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); - const __m128i utf8_0 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); - const __m128i utf8_1 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); - - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t *row2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); - const __m128i utf8_2 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); - - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t *row3 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); - const __m128i utf8_3 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); _mm_storeu_si128((__m128i *)utf8_output, utf8_0); utf8_output += row0[0]; _mm_storeu_si128((__m128i *)utf8_output, utf8_1); utf8_output += row1[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; + + buf += 8; } else { - 
// case: at least one 32-bit word is larger than 0xFFFF <=> it will - // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem - // wasteful to use scalar code, but being efficient with SIMD may require - // large, non-trivial tables? + // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes Let us do a scalar fallback. It may seem + // wasteful to use scalar code, but being efficient with SIMD in the + // presence of surrogate pairs may require non-trivial tables. size_t forward = 15; size_t k = 0; if (size_t(end - buf) < forward + 1) { @@ -27452,19 +40132,19 @@ avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { } for (; k < forward; k++) { uint32_t word = buf[k]; - if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + if ((word & 0xFFFFFF80) == 0) { *utf8_output++ = char(word); - } else if ((word & 0xFFFFF800) == 0) { // 2-byte + } else if ((word & 0xFFFFF800) == 0) { *utf8_output++ = char((word >> 6) | 0b11000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if ((word & 0xFFFF0000) == 0) { // 3-byte + } else if ((word & 0xFFFF0000) == 0) { if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair(nullptr, utf8_output); } *utf8_output++ = char((word >> 12) | 0b11100000); *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { // 4-byte + } else { if (word > 0x10FFFF) { return std::make_pair(nullptr, utf8_output); } @@ -27479,13 +40159,13 @@ avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { } // while // check for invalid input - const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); - if (static_cast(_mm256_movemask_epi8(_mm256_cmpeq_epi32( - _mm256_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffffffff) { + const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff); + if (static_cast(_mm_movemask_epi8(_mm_cmpeq_epi32( + _mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) { return std::make_pair(nullptr, utf8_output); } - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf8_output); } @@ -27493,145 +40173,141 @@ avx2_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { } std::pair -avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, - char *utf8_output) { +sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_output) { const char32_t *end = buf + len; const char32_t *start = buf; - const __m256i v_0000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); - const __m256i v_ff80 = _mm256_set1_epi16((uint16_t)0xff80); - const __m256i v_f800 = _mm256_set1_epi16((uint16_t)0xf800); - const __m256i v_c080 = _mm256_set1_epi16((uint16_t)0xc080); - const __m256i v_7fffffff = _mm256_set1_epi32((uint32_t)0x7fffffff); - const __m256i v_10ffff = _mm256_set1_epi32((uint32_t)0x10ffff); + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); + const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); + const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); + const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); + const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff); const size_t safety_margin = 12; // to 
avoid overruns, see issue // https://github.com/simdutf/simdutf/issues/92 while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); - __m256i nextin = _mm256_loadu_si256((__m256i *)buf + 1); + // We load two 16 bytes registers for a total of 32 bytes or 8 characters. + __m128i in = _mm_loadu_si128((__m128i *)buf); + __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); // Check for too large input - const __m256i max_input = - _mm256_max_epu32(_mm256_max_epu32(in, nextin), v_10ffff); - if (static_cast(_mm256_movemask_epi8( - _mm256_cmpeq_epi32(max_input, v_10ffff))) != 0xffffffff) { + __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff); + if (static_cast(_mm_movemask_epi8( + _mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) { return std::make_pair(result(error_code::TOO_LARGE, buf - start), utf8_output); } // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned // saturation - __m256i in_16 = _mm256_packus_epi32(_mm256_and_si256(in, v_7fffffff), - _mm256_and_si256(nextin, v_7fffffff)); - in_16 = _mm256_permute4x64_epi64(in_16, 0b11011000); + __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), + _mm_and_si128(nextin, v_7fffffff)); - // Try to apply UTF-16 => UTF-8 routine on 256 bits - // (haswell/avx2_convert_utf16_to_utf8.cpp) + // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp - if (_mm256_testz_si256(in_16, v_ff80)) { // ASCII fast path!!!! + // Check for ASCII fast path + if (_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!! // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16( - _mm256_castsi256_si128(in_16), _mm256_extractf128_si256(in_16, 1)); + // obviously suboptimal. + const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16); // 2. store (16 bytes) _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! + buf += 8; + utf8_output += 8; + continue; } + // no bits set above 7th bit - const __m256i one_byte_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_ff80), v_0000); - const uint32_t one_byte_bitmask = - static_cast(_mm256_movemask_epi8(one_byte_bytemask)); + const __m128i one_byte_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000); + const uint16_t one_byte_bitmask = + static_cast(_mm_movemask_epi8(one_byte_bytemask)); // no bits set above 11th bit - const __m256i one_or_two_bytes_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_0000); - const uint32_t one_or_two_bytes_bitmask = - static_cast(_mm256_movemask_epi8(one_or_two_bytes_bytemask)); - if (one_or_two_bytes_bitmask == 0xffffffff) { + const __m128i one_or_two_bytes_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000); + const uint16_t one_or_two_bytes_bitmask = + static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + + if (one_or_two_bytes_bitmask == 0xffff) { + // case: all code units either produce 1 or 2 UTF-8 bytes (at least one + // produces 2 bytes) // 1. 
prepare 2-byte values // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m256i v_1f00 = _mm256_set1_epi16((int16_t)0x1f00); - const __m256i v_003f = _mm256_set1_epi16((int16_t)0x003f); + const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); + const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); // t0 = [000a|aaaa|bbbb|bb00] - const __m256i t0 = _mm256_slli_epi16(in_16, 2); + const __m128i t0 = _mm_slli_epi16(in_16, 2); // t1 = [000a|aaaa|0000|0000] - const __m256i t1 = _mm256_and_si256(t0, v_1f00); + const __m128i t1 = _mm_and_si128(t0, v_1f00); // t2 = [0000|0000|00bb|bbbb] - const __m256i t2 = _mm256_and_si256(in_16, v_003f); + const __m128i t2 = _mm_and_si128(in_16, v_003f); // t3 = [000a|aaaa|00bb|bbbb] - const __m256i t3 = _mm256_or_si256(t1, t2); + const __m128i t3 = _mm_or_si128(t1, t2); // t4 = [110a|aaaa|10bb|bbbb] - const __m256i t4 = _mm256_or_si256(t3, v_c080); + const __m128i t4 = _mm_or_si128(t3, v_c080); // 2. merge ASCII and 2-byte codewords - const __m256i utf8_unpacked = - _mm256_blendv_epi8(t4, in_16, one_byte_bytemask); + const __m128i utf8_unpacked = + _mm_blendv_epi8(t4, in_16, one_byte_bytemask); // 3. prepare bitmask for 8-bit lookup - const uint32_t M0 = one_byte_bitmask & 0x55555555; - const uint32_t M1 = M0 >> 7; - const uint32_t M2 = (M1 | M0) & 0x00ff00ff; + // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - + // MSB, a - LSB) + const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a + const uint16_t m1 = + static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 + const uint8_t m2 = + static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea // 4. pack the bytes - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2)][0]; - const uint8_t *row_2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[uint8_t(M2 >> - 16)][0]; - + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); - const __m128i shuffle_2 = _mm_loadu_si128((__m128i *)(row_2 + 1)); - - const __m256i utf8_packed = _mm256_shuffle_epi8( - utf8_unpacked, _mm256_setr_m128i(shuffle, shuffle_2)); - // 5. store bytes - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_castsi256_si128(utf8_packed)); - utf8_output += row[0]; - _mm_storeu_si128((__m128i *)utf8_output, - _mm256_extractf128_si256(utf8_packed, 1)); - utf8_output += row_2[0]; + const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); + + // 5. store bytes + _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); // 6. 
adjust pointers - buf += 16; + buf += 8; + utf8_output += row[0]; continue; } - // Must check for overflow in packing - const __m256i saturation_bytemask = _mm256_cmpeq_epi32( - _mm256_and_si256(_mm256_or_si256(in, nextin), v_ffff0000), v_0000); + + // Check for overflow in packing + const __m128i saturation_bytemask = _mm_cmpeq_epi32( + _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); const uint32_t saturation_bitmask = - static_cast(_mm256_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffffffff) { + static_cast(_mm_movemask_epi8(saturation_bytemask)); + + if (saturation_bitmask == 0xffff) { // case: code units from register produce either 1, 2 or 3 UTF-8 bytes // Check for illegal surrogate code units - const __m256i v_d800 = _mm256_set1_epi16((uint16_t)0xd800); - const __m256i forbidden_bytemask = - _mm256_cmpeq_epi16(_mm256_and_si256(in_16, v_f800), v_d800); - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != - 0x0) { + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + const __m128i forbidden_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800); + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(result(error_code::SURROGATE, buf - start), utf8_output); } - const __m256i dup_even = _mm256_setr_epi16( - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e, - 0x0000, 0x0202, 0x0404, 0x0606, 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); + const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, + 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two - UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three UTF-8 bytes We expand the input word (16-bit) into two code units (32-bit), thus @@ -27653,95 +40329,72 @@ avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, * t2 => [0ccc|cccc] [10cc|cccc] * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) */ -#define simdutf_vec(x) _mm256_set1_epi16(static_cast(x)) +#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m256i t0 = _mm256_shuffle_epi8(in_16, dup_even); + const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even); // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m256i t1 = _mm256_and_si256(t0, simdutf_vec(0b0011111101111111)); + const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m256i t2 = _mm256_or_si256(t1, simdutf_vec(0b1000000000000000)); + const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m256i s0 = _mm256_srli_epi16(in_16, 4); + const __m128i s0 = _mm_srli_epi16(in_16, 4); // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m256i s1 = _mm256_and_si256(s0, simdutf_vec(0b0000111111111100)); + const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m256i s2 = _mm256_maddubs_epi16(s1, simdutf_vec(0x0140)); + const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m256i s3 = _mm256_or_si256(s2, simdutf_vec(0b1100000011100000)); - const __m256i m0 = _mm256_andnot_si256(one_or_two_bytes_bytemask, - simdutf_vec(0b0100000000000000)); - const __m256i s4 = _mm256_xor_si256(s3, m0); + const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); + const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, + simdutf_vec(0b0100000000000000)); + const __m128i s4 = _mm_xor_si128(s3, m0); #undef simdutf_vec // 4. expand code units 16-bit => 32-bit - const __m256i out0 = _mm256_unpacklo_epi16(t2, s4); - const __m256i out1 = _mm256_unpackhi_epi16(t2, s4); + const __m128i out0 = _mm_unpacklo_epi16(t2, s4); + const __m128i out1 = _mm_unpackhi_epi16(t2, s4); // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint32_t mask = (one_byte_bitmask & 0x55555555) | - (one_or_two_bytes_bitmask & 0xaaaaaaaa); - // Due to the wider registers, the following path is less likely to be - // useful. - /*if(mask == 0) { + const uint16_t mask = + (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); + if (mask == 0) { // We only have three-byte code units. Use fast path. 
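      // For reference only, a scalar sketch (not part of the vendored simdutf
      // sources) of what the interleaved 16-bit `mask` used below encodes.
      // Each 16-bit code unit contributes two bits: bit 2*i is set when unit i
      // fits in one UTF-8 byte (< 0x80), bit 2*i+1 when it fits in at most two
      // bytes (< 0x800). A zero mask therefore means "all eight units need
      // three bytes", which is the fast path this branch takes.
      //
      // #include <cstddef>
      // #include <cstdint>
      //
      // static inline uint16_t classify_utf16_block(const uint16_t units[8]) {
      //   uint16_t mask = 0;
      //   for (size_t i = 0; i < 8; i++) {
      //     if (units[i] < 0x80) {
      //       mask |= uint16_t(1u << (2 * i));     // one UTF-8 byte
      //     }
      //     if (units[i] < 0x800) {
      //       mask |= uint16_t(1u << (2 * i + 1)); // at most two UTF-8 bytes
      //     }
      //   }
      //   // low byte indexes pack_1_2_3_utf8_bytes for the first four units,
      //   // the high byte for the remaining four
      //   return mask;
      // }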
- const __m256i shuffle = - _mm256_setr_epi8(2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1, - 2,3,1,6,7,5,10,11,9,14,15,13,-1,-1,-1,-1); const __m256i utf8_0 = - _mm256_shuffle_epi8(out0, shuffle); const __m256i utf8_1 = - _mm256_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_0)); + const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, + 15, 13, -1, -1, -1, -1); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); + _mm_storeu_si128((__m128i *)utf8_output, utf8_0); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, _mm256_castsi256_si128(utf8_1)); + _mm_storeu_si128((__m128i *)utf8_output, utf8_1); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_0,1)); utf8_output += 12; - _mm_storeu_si128((__m128i*)utf8_output, - _mm256_extractf128_si256(utf8_1,1)); utf8_output += 12; buf += 16; + buf += 8; continue; - }*/ + } const uint8_t mask0 = uint8_t(mask); + const uint8_t *row0 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); - const __m128i utf8_0 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out0), shuffle0); + const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); const uint8_t mask1 = static_cast(mask >> 8); + const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); - const __m128i utf8_1 = - _mm_shuffle_epi8(_mm256_castsi256_si128(out1), shuffle1); - - const uint8_t mask2 = static_cast(mask >> 16); - const uint8_t *row2 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask2][0]; - const __m128i shuffle2 = _mm_loadu_si128((__m128i *)(row2 + 1)); - const __m128i utf8_2 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out0, 1), shuffle2); - - const uint8_t mask3 = static_cast(mask >> 24); - const uint8_t *row3 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask3][0]; - const __m128i shuffle3 = _mm_loadu_si128((__m128i *)(row3 + 1)); - const __m128i utf8_3 = - _mm_shuffle_epi8(_mm256_extractf128_si256(out1, 1), shuffle3); + const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); _mm_storeu_si128((__m128i *)utf8_output, utf8_0); utf8_output += row0[0]; _mm_storeu_si128((__m128i *)utf8_output, utf8_1); utf8_output += row1[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_2); - utf8_output += row2[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_3); - utf8_output += row3[0]; - buf += 16; + + buf += 8; } else { - // case: at least one 32-bit word is larger than 0xFFFF <=> it will - // produce four UTF-8 bytes. Let us do a scalar fallback. It may seem - // wasteful to use scalar code, but being efficient with SIMD may require - // large, non-trivial tables? + // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes Let us do a scalar fallback. It may seem + // wasteful to use scalar code, but being efficient with SIMD in the + // presence of surrogate pairs may require non-trivial tables. 
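      // For reference, a minimal scalar sketch (illustrative, not taken from
      // the simdutf sources) of the 4-byte case this fallback handles: a code
      // point in [0x10000, 0x10FFFF] is split into 3+6+6+6 payload bits under
      // the 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx pattern.
      //
      // #include <cstdint>
      //
      // static inline int encode_utf8_4byte(uint32_t cp, char out[4]) {
      //   if (cp < 0x10000 || cp > 0x10FFFF) {
      //     return 0; // not a 4-byte code point
      //   }
      //   out[0] = char((cp >> 18) | 0b11110000);
      //   out[1] = char(((cp >> 12) & 0b111111) | 0b10000000);
      //   out[2] = char(((cp >> 6) & 0b111111) | 0b10000000);
      //   out[3] = char((cp & 0b111111) | 0b10000000);
      //   return 4;
      // }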
size_t forward = 15; size_t k = 0; if (size_t(end - buf) < forward + 1) { @@ -27749,12 +40402,12 @@ avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, } for (; k < forward; k++) { uint32_t word = buf[k]; - if ((word & 0xFFFFFF80) == 0) { // 1-byte (ASCII) + if ((word & 0xFFFFFF80) == 0) { *utf8_output++ = char(word); - } else if ((word & 0xFFFFF800) == 0) { // 2-byte + } else if ((word & 0xFFFFF800) == 0) { *utf8_output++ = char((word >> 6) | 0b11000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else if ((word & 0xFFFF0000) == 0) { // 3-byte + } else if ((word & 0xFFFF0000) == 0) { if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair( result(error_code::SURROGATE, buf - start + k), utf8_output); @@ -27762,7 +40415,7 @@ avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, *utf8_output++ = char((word >> 12) | 0b11100000); *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); - } else { // 4-byte + } else { if (word > 0x10FFFF) { return std::make_pair( result(error_code::TOO_LARGE, buf - start + k), utf8_output); @@ -27776,48 +40429,46 @@ avx2_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, buf += k; } } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); } -/* end file src/haswell/avx2_convert_utf32_to_utf8.cpp */ -/* begin file src/haswell/avx2_convert_utf32_to_utf16.cpp */ +/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */ +/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */ template std::pair -avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len, - char16_t *utf16_output) { - const char32_t *end = buf + len; - - const size_t safety_margin = - 12; // to avoid overruns, see issue - // https://github.com/simdutf/simdutf/issues/92 - __m256i forbidden_bytemask = _mm256_setzero_si256(); +sse_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_output) { - while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); + const char32_t *end = buf + len; - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); + __m128i forbidden_bytemask = _mm_setzero_si128(); - // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs - const __m256i saturation_bytemask = - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + while (end - buf >= 8) { + __m128i in = _mm_loadu_si128((__m128i *)buf); + __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); + const __m128i saturation_bytemask = _mm_cmpeq_epi32( + _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); const uint32_t saturation_bitmask = - static_cast(_mm256_movemask_epi8(saturation_bytemask)); + static_cast(_mm_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffffffff) { - const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); - forbidden_bytemask = _mm256_or_si256( + // Check if no bits set above 16th + if (saturation_bitmask == 0xffff) { + // Pack UTF-32 to UTF-16 + __m128i utf16_packed = _mm_packus_epi32(in, nextin); + + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + forbidden_bytemask = _mm_or_si128( 
forbidden_bytemask, - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800)); + _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800)); - __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), - _mm256_extractf128_si256(in, 1)); if (big_endian) { const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); } + _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); utf16_output += 8; buf += 8; @@ -27861,7 +40512,7 @@ avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len, } // check for invalid input - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != 0) { + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(nullptr, utf16_output); } @@ -27870,45 +40521,42 @@ avx2_convert_utf32_to_utf16(const char32_t *buf, size_t len, template std::pair -avx2_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, - char16_t *utf16_output) { +sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_output) { const char32_t *start = buf; const char32_t *end = buf + len; - const size_t safety_margin = - 12; // to avoid overruns, see issue - // https://github.com/simdutf/simdutf/issues/92 - - while (end - buf >= std::ptrdiff_t(8 + safety_margin)) { - __m256i in = _mm256_loadu_si256((__m256i *)buf); - - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((int32_t)0xffff0000); + const __m128i v_0000 = _mm_setzero_si128(); + const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); - // no bits set above 16th bit <=> can pack to UTF16 without surrogate pairs - const __m256i saturation_bytemask = - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); + while (end - buf >= 8) { + __m128i in = _mm_loadu_si128((__m128i *)buf); + __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); + const __m128i saturation_bytemask = _mm_cmpeq_epi32( + _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); const uint32_t saturation_bitmask = - static_cast(_mm256_movemask_epi8(saturation_bytemask)); + static_cast(_mm_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffffffff) { - const __m256i v_f800 = _mm256_set1_epi32((uint32_t)0xf800); - const __m256i v_d800 = _mm256_set1_epi32((uint32_t)0xd800); - const __m256i forbidden_bytemask = - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_f800), v_d800); - if (static_cast(_mm256_movemask_epi8(forbidden_bytemask)) != - 0x0) { + // Check if no bits set above 16th + if (saturation_bitmask == 0xffff) { + // Pack UTF-32 to UTF-16 + __m128i utf16_packed = _mm_packus_epi32(in, nextin); + + const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); + const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); + const __m128i forbidden_bytemask = + _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800); + if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { return std::make_pair(result(error_code::SURROGATE, buf - start), utf16_output); } - __m128i utf16_packed = _mm_packus_epi32(_mm256_castsi256_si128(in), - _mm256_extractf128_si256(in, 1)); if (big_endian) { const __m128i swap = _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); } + _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); utf16_output += 8; buf += 8; @@ -27955,72 +40603,8 @@ avx2_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, return 
std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); } -/* end file src/haswell/avx2_convert_utf32_to_utf16.cpp */ - -/* begin file src/haswell/avx2_convert_utf8_to_latin1.cpp */ -// depends on "tables/utf8_to_utf16_tables.h" - -// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 12). -size_t convert_masked_utf8_to_latin1(const char *input, - uint64_t utf8_end_of_code_point_mask, - char *&latin1_output) { - // we use an approach where we try to process up to 12 input bytes. - // Why 12 input bytes and not 16? Because we are concerned with the size of - // the lookup tables. Also 12 is nicely divisible by two and three. - // - // - // Optimization note: our main path below is load-latency dependent. Thus it - // is maybe beneficial to have fast paths that depend on branch prediction but - // have less latency. This results in more instructions but, potentially, also - // higher speeds. - // - const __m128i in = _mm_loadu_si128((__m128i *)input); - - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & - 0xfff; // we are only processing 12 bytes in case it is not all ASCII - - if (utf8_end_of_code_point_mask == 0xfff) { - // We process the data in chunks of 12 bytes. - _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); - latin1_output += 12; // We wrote 12 characters. - return 12; // We consumed 1 bytes. - } - /// We do not have a fast path available, so we fallback. - const uint8_t idx = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; - // this indicates an invalid input: - if (idx >= 64) { - return consumed; - } - // Here we should have (idx < 64), if not, there is a bug in the validation or - // elsewhere. SIX (6) input code-code units this is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six - // code code units spanning between 1 and 2 bytes each is 12 bytes. On - // processors where pdep/pext is fast, we might be able to use a small lookup - // table. - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - const __m128i latin1_packed = _mm_packus_epi16(composed, composed); - // writing 8 bytes even though we only care about the first 6 bytes. - // performance note: it would be faster to use _mm_storeu_si128, we should - // investigate. - _mm_storel_epi64((__m128i *)latin1_output, latin1_packed); - latin1_output += 6; // We wrote 6 bytes. - return consumed; -} -/* end file src/haswell/avx2_convert_utf8_to_latin1.cpp */ - -/* begin file src/haswell/avx2_base64.cpp */ +/* end file src/westmere/sse_convert_utf32_to_utf16.cpp */ +/* begin file src/westmere/sse_base64.cpp */ /** * References and further reading: * @@ -28048,151 +40632,155 @@ size_t convert_masked_utf8_to_latin1(const char *input, * Nick Kopp. 2013. Base64 Encoding on a GPU. * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). 
*/ - -template -simdutf_really_inline __m256i lookup_pshufb_improved(const __m256i input) { +template __m128i lookup_pshufb_improved(const __m128i input) { // credit: Wojciech Muła - __m256i result = _mm256_subs_epu8(input, _mm256_set1_epi8(51)); - const __m256i less = _mm256_cmpgt_epi8(_mm256_set1_epi8(26), input); - result = - _mm256_or_si256(result, _mm256_and_si256(less, _mm256_set1_epi8(13))); - __m256i shift_LUT; - if (base64_url) { - shift_LUT = _mm256_setr_epi8( - 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0, + // reduce 0..51 -> 0 + // 52..61 -> 1 .. 10 + // 62 -> 11 + // 63 -> 12 + __m128i result = _mm_subs_epu8(input, _mm_set1_epi8(51)); - 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); - } else { - shift_LUT = _mm256_setr_epi8( - 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0, + // distinguish between ranges 0..25 and 26..51: + // 0 .. 25 -> remains 0 + // 26 .. 51 -> becomes 13 + const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input); + result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13))); - 'a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); + __m128i shift_LUT; + if (base64_url) { + shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); + } else { + shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, + '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); } - result = _mm256_shuffle_epi8(shift_LUT, result); - return _mm256_add_epi8(result, input); + // read shift + result = _mm_shuffle_epi8(shift_LUT, result); + + return _mm_add_epi8(result, input); } template size_t encode_base64(char *dst, const char *src, size_t srclen, base64_options options) { // credit: Wojciech Muła + // SSE (lookup: pshufb improved unrolled) const uint8_t *input = (const uint8_t *)src; uint8_t *out = (uint8_t *)dst; - const __m256i shuf = - _mm256_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1, + const __m128i shuf = + _mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); - 10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); size_t i = 0; - for (; i + 100 <= srclen; i += 96) { - const __m128i lo0 = _mm_loadu_si128( + for (; i + 52 <= srclen; i += 48) { + __m128i in0 = _mm_loadu_si128( reinterpret_cast(input + i + 4 * 3 * 0)); - const __m128i hi0 = _mm_loadu_si128( + __m128i in1 = _mm_loadu_si128( reinterpret_cast(input + i + 4 * 3 * 1)); - const __m128i lo1 = _mm_loadu_si128( + __m128i in2 = _mm_loadu_si128( reinterpret_cast(input + i + 4 * 3 * 2)); - const __m128i hi1 = _mm_loadu_si128( + __m128i in3 = _mm_loadu_si128( reinterpret_cast(input + i + 4 * 3 * 3)); - const __m128i lo2 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 4)); - const __m128i hi2 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 5)); - const __m128i lo3 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 6)); - const __m128i hi3 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 7)); - __m256i in0 = _mm256_shuffle_epi8(_mm256_set_m128i(hi0, lo0), shuf); - __m256i in1 = 
_mm256_shuffle_epi8(_mm256_set_m128i(hi1, lo1), shuf); - __m256i in2 = _mm256_shuffle_epi8(_mm256_set_m128i(hi2, lo2), shuf); - __m256i in3 = _mm256_shuffle_epi8(_mm256_set_m128i(hi3, lo3), shuf); + in0 = _mm_shuffle_epi8(in0, shuf); + in1 = _mm_shuffle_epi8(in1, shuf); + in2 = _mm_shuffle_epi8(in2, shuf); + in3 = _mm_shuffle_epi8(in3, shuf); - const __m256i t0_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x0fc0fc00)); - const __m256i t0_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x0fc0fc00)); - const __m256i t0_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x0fc0fc00)); - const __m256i t0_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x0fc0fc00)); + const __m128i t0_0 = _mm_and_si128(in0, _mm_set1_epi32(0x0fc0fc00)); + const __m128i t0_1 = _mm_and_si128(in1, _mm_set1_epi32(0x0fc0fc00)); + const __m128i t0_2 = _mm_and_si128(in2, _mm_set1_epi32(0x0fc0fc00)); + const __m128i t0_3 = _mm_and_si128(in3, _mm_set1_epi32(0x0fc0fc00)); - const __m256i t1_0 = - _mm256_mulhi_epu16(t0_0, _mm256_set1_epi32(0x04000040)); - const __m256i t1_1 = - _mm256_mulhi_epu16(t0_1, _mm256_set1_epi32(0x04000040)); - const __m256i t1_2 = - _mm256_mulhi_epu16(t0_2, _mm256_set1_epi32(0x04000040)); - const __m256i t1_3 = - _mm256_mulhi_epu16(t0_3, _mm256_set1_epi32(0x04000040)); + const __m128i t1_0 = _mm_mulhi_epu16(t0_0, _mm_set1_epi32(0x04000040)); + const __m128i t1_1 = _mm_mulhi_epu16(t0_1, _mm_set1_epi32(0x04000040)); + const __m128i t1_2 = _mm_mulhi_epu16(t0_2, _mm_set1_epi32(0x04000040)); + const __m128i t1_3 = _mm_mulhi_epu16(t0_3, _mm_set1_epi32(0x04000040)); - const __m256i t2_0 = _mm256_and_si256(in0, _mm256_set1_epi32(0x003f03f0)); - const __m256i t2_1 = _mm256_and_si256(in1, _mm256_set1_epi32(0x003f03f0)); - const __m256i t2_2 = _mm256_and_si256(in2, _mm256_set1_epi32(0x003f03f0)); - const __m256i t2_3 = _mm256_and_si256(in3, _mm256_set1_epi32(0x003f03f0)); + const __m128i t2_0 = _mm_and_si128(in0, _mm_set1_epi32(0x003f03f0)); + const __m128i t2_1 = _mm_and_si128(in1, _mm_set1_epi32(0x003f03f0)); + const __m128i t2_2 = _mm_and_si128(in2, _mm_set1_epi32(0x003f03f0)); + const __m128i t2_3 = _mm_and_si128(in3, _mm_set1_epi32(0x003f03f0)); - const __m256i t3_0 = - _mm256_mullo_epi16(t2_0, _mm256_set1_epi32(0x01000010)); - const __m256i t3_1 = - _mm256_mullo_epi16(t2_1, _mm256_set1_epi32(0x01000010)); - const __m256i t3_2 = - _mm256_mullo_epi16(t2_2, _mm256_set1_epi32(0x01000010)); - const __m256i t3_3 = - _mm256_mullo_epi16(t2_3, _mm256_set1_epi32(0x01000010)); + const __m128i t3_0 = _mm_mullo_epi16(t2_0, _mm_set1_epi32(0x01000010)); + const __m128i t3_1 = _mm_mullo_epi16(t2_1, _mm_set1_epi32(0x01000010)); + const __m128i t3_2 = _mm_mullo_epi16(t2_2, _mm_set1_epi32(0x01000010)); + const __m128i t3_3 = _mm_mullo_epi16(t2_3, _mm_set1_epi32(0x01000010)); - const __m256i input0 = _mm256_or_si256(t1_0, t3_0); - const __m256i input1 = _mm256_or_si256(t1_1, t3_1); - const __m256i input2 = _mm256_or_si256(t1_2, t3_2); - const __m256i input3 = _mm256_or_si256(t1_3, t3_3); + const __m128i input0 = _mm_or_si128(t1_0, t3_0); + const __m128i input1 = _mm_or_si128(t1_1, t3_1); + const __m128i input2 = _mm_or_si128(t1_2, t3_2); + const __m128i input3 = _mm_or_si128(t1_3, t3_3); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), - lookup_pshufb_improved(input0)); - out += 32; + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), + lookup_pshufb_improved(input0)); + out += 16; - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), - lookup_pshufb_improved(input1)); - out += 32; + _mm_storeu_si128(reinterpret_cast<__m128i 
*>(out), + lookup_pshufb_improved(input1)); + out += 16; - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), - lookup_pshufb_improved(input2)); - out += 32; - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), - lookup_pshufb_improved(input3)); - out += 32; + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), + lookup_pshufb_improved(input2)); + out += 16; + + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), + lookup_pshufb_improved(input3)); + out += 16; } - for (; i + 28 <= srclen; i += 24) { - // lo = [xxxx|DDDC|CCBB|BAAA] - // hi = [xxxx|HHHG|GGFF|FEEE] - const __m128i lo = - _mm_loadu_si128(reinterpret_cast(input + i)); - const __m128i hi = - _mm_loadu_si128(reinterpret_cast(input + i + 4 * 3)); + for (; i + 16 <= srclen; i += 12) { + + __m128i in = _mm_loadu_si128(reinterpret_cast(input + i)); // bytes from groups A, B and C are needed in separate 32-bit lanes - // in = [0HHH|0GGG|0FFF|0EEE[0DDD|0CCC|0BBB|0AAA] - __m256i in = _mm256_shuffle_epi8(_mm256_set_m128i(hi, lo), shuf); + // in = [DDDD|CCCC|BBBB|AAAA] + // + // an input triplet has layout + // [????????|ccdddddd|bbbbcccc|aaaaaabb] + // byte 3 byte 2 byte 1 byte 0 -- byte 3 comes from the next + // triplet + // + // shuffling changes the order of bytes: 1, 0, 2, 1 + // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] + // ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^ + // processed bits + in = _mm_shuffle_epi8(in, shuf); - // this part is well commented in encode.sse.cpp + // unpacking - const __m256i t0 = _mm256_and_si256(in, _mm256_set1_epi32(0x0fc0fc00)); - const __m256i t1 = _mm256_mulhi_epu16(t0, _mm256_set1_epi32(0x04000040)); - const __m256i t2 = _mm256_and_si256(in, _mm256_set1_epi32(0x003f03f0)); - const __m256i t3 = _mm256_mullo_epi16(t2, _mm256_set1_epi32(0x01000010)); - const __m256i indices = _mm256_or_si256(t1, t3); + // t0 = [0000cccc|cc000000|aaaaaa00|00000000] + const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00)); + // t1 = [00000000|00cccccc|00000000|00aaaaaa] + // (c * (1 << 10), a * (1 << 6)) >> 16 (note: an unsigned + // multiplication) + const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040)); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(out), - lookup_pshufb_improved(indices)); - out += 32; + // t2 = [00000000|00dddddd|000000bb|bbbb0000] + const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0)); + // t3 = [00dddddd|00000000|00bbbbbb|00000000]( + // (d * (1 << 8), b * (1 << 4)) + const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010)); + + // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 + const __m128i indices = _mm_or_si128(t1, t3); + + _mm_storeu_si128(reinterpret_cast<__m128i *>(out), + lookup_pshufb_improved(indices)); + out += 16; } + return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options); } - static inline void compress(__m128i data, uint16_t mask, char *output) { if (mask == 0) { _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data); return; } + // this particular implementation was inspired by work done by @animetosho // we do it in two steps, first 8 bytes and then second 8 bytes uint8_t mask1 = uint8_t(mask); // least significant 8 bits @@ -28218,198 +40806,209 @@ static inline void compress(__m128i data, uint16_t mask, char *output) { __m128i compactmask = _mm_loadu_si128(reinterpret_cast( tables::base64::pshufb_combine_table + pop1 * 8)); __m128i answer = _mm_shuffle_epi8(pruned, compactmask); - _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); } -static inline void compress(__m256i data, 
uint32_t mask, char *output) { - if (mask == 0) { - _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), data); - return; - } - compress(_mm256_castsi256_si128(data), uint16_t(mask), output); - compress(_mm256_extracti128_si256(data, 1), uint16_t(mask >> 16), - output + _mm_popcnt_u32(~mask & 0xFFFF)); -} - struct block64 { - __m256i chunks[2]; + __m128i chunks[4]; }; template -static inline uint32_t to_base64_mask(__m256i *src, uint32_t *error) { - const __m256i ascii_space_tbl = - _mm256_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, - 0x0, 0xc, 0xd, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x0, 0x0, - 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0); +static inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) { + const __m128i ascii_space_tbl = + _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, + 0xc, 0xd, 0x0, 0x0); // credit: aqrit - __m256i delta_asso; + __m128i delta_asso; if (base64_url) { - delta_asso = - _mm256_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, - 0x0, 0x0, 0xF, 0x0, 0xF, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, - 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF); + delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, + 0x0, 0x0, 0x0, 0xF, 0x0, 0xF); } else { - delta_asso = _mm256_setr_epi8( - 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x0F, 0x00, 0x0F, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); - } - __m256i delta_values; + delta_asso = _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); + } + __m128i delta_values; if (base64_url) { - delta_values = _mm256_setr_epi8( - 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), uint8_t(0xBF), uint8_t(0xB9), - uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), uint8_t(0xE0), - uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), - uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), 0x0, 0x11, uint8_t(0xC3), - uint8_t(0xBF), uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); + delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), + uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), + 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), + uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); } else { - delta_values = _mm256_setr_epi8( - int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), int8_t(0x04), - int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), int8_t(0x00), - int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), - int8_t(0xB9), int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), - int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9), - int8_t(0x00), int8_t(0x10), int8_t(0xC3), int8_t(0xBF), int8_t(0xBF), - int8_t(0xB9), int8_t(0xB9)); - } - __m256i check_asso; + delta_values = + _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3), + int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)); + } + __m128i check_asso; if (base64_url) { - check_asso = - _mm256_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x3, - 0x7, 0xB, 0xE, 0xB, 0x6, 0xD, 0x1, 0x1, 0x1, 0x1, 0x1, - 0x1, 0x1, 0x1, 0x1, 0x3, 0x7, 0xB, 0xE, 0xB, 0x6); + check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, + 0x3, 0x7, 0xB, 0xE, 0xB, 0x6); } else { - check_asso = _mm256_setr_epi8( - 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 
0x01, 0x01, 0x01, 0x03, 0x07, - 0x0B, 0x0B, 0x0B, 0x0F, 0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); + check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); } - __m256i check_values; + __m128i check_values; if (base64_url) { - check_values = _mm256_setr_epi8( - uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), - uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), uint8_t(0xA6), - uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, uint8_t(0x80), - 0x0, uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), - uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), uint8_t(0xB6), - uint8_t(0xA6), uint8_t(0xB5), uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, - uint8_t(0x80), 0x0, uint8_t(0x80)); + check_values = _mm_setr_epi8(uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), + uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), + uint8_t(0xB6), uint8_t(0xA6), uint8_t(0xB5), + uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, + uint8_t(0x80), 0x0, uint8_t(0x80)); } else { - check_values = _mm256_setr_epi8( - int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0xCF), - int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), int8_t(0x86), - int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), int8_t(0x91), - int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), - int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), int8_t(0xB5), - int8_t(0x86), int8_t(0xD1), int8_t(0x80), int8_t(0xB1), int8_t(0x80), - int8_t(0x91), int8_t(0x80)); + + check_values = + _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), + int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80), + int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)); } - const __m256i shifted = _mm256_srli_epi32(*src, 3); - const __m256i delta_hash = - _mm256_avg_epu8(_mm256_shuffle_epi8(delta_asso, *src), shifted); - const __m256i check_hash = - _mm256_avg_epu8(_mm256_shuffle_epi8(check_asso, *src), shifted); - const __m256i out = - _mm256_adds_epi8(_mm256_shuffle_epi8(delta_values, delta_hash), *src); - const __m256i chk = - _mm256_adds_epi8(_mm256_shuffle_epi8(check_values, check_hash), *src); - const int mask = _mm256_movemask_epi8(chk); + const __m128i shifted = _mm_srli_epi32(*src, 3); + + const __m128i delta_hash = + _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted); + const __m128i check_hash = + _mm_avg_epu8(_mm_shuffle_epi8(check_asso, *src), shifted); + + const __m128i out = + _mm_adds_epi8(_mm_shuffle_epi8(delta_values, delta_hash), *src); + const __m128i chk = + _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src); + const int mask = _mm_movemask_epi8(chk); if (mask) { - __m256i ascii_space = - _mm256_cmpeq_epi8(_mm256_shuffle_epi8(ascii_space_tbl, *src), *src); - *error = (mask ^ _mm256_movemask_epi8(ascii_space)); + __m128i ascii_space = + _mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src); + *error = (mask ^ _mm_movemask_epi8(ascii_space)); } *src = out; - return (uint32_t)mask; + return (uint16_t)mask; } template static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) { uint32_t err0 = 0; uint32_t err1 = 0; + uint32_t err2 = 0; + uint32_t err3 = 0; uint64_t m0 = to_base64_mask(&b->chunks[0], &err0); uint64_t m1 = to_base64_mask(&b->chunks[1], &err1); - *error = err0 | ((uint64_t)err1 << 32); - return m0 | (m1 << 32); + uint64_t m2 = to_base64_mask(&b->chunks[2], &err2); + uint64_t m3 = 
to_base64_mask(&b->chunks[3], &err3); + *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) | + ((uint64_t)err3 << 48); + return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48); +} + +#if defined(_MSC_VER) && !defined(__clang__) +static inline size_t simdutf_tzcnt_u64(uint64_t num) { + unsigned long ret; + if (num == 0) { + return 64; + } + _BitScanForward64(&ret, num); + return ret; +} +#else // GCC or Clang +static inline size_t simdutf_tzcnt_u64(uint64_t num) { + return num ? __builtin_ctzll(num) : 64; } +#endif static inline void copy_block(block64 *b, char *output) { - _mm256_storeu_si256(reinterpret_cast<__m256i *>(output), b->chunks[0]); - _mm256_storeu_si256(reinterpret_cast<__m256i *>(output + 32), b->chunks[1]); + _mm_storeu_si128(reinterpret_cast<__m128i *>(output), b->chunks[0]); + _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 16), b->chunks[1]); + _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 32), b->chunks[2]); + _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 48), b->chunks[3]); } static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { uint64_t nmask = ~mask; - compress(b->chunks[0], uint32_t(mask), output); - compress(b->chunks[1], uint32_t(mask >> 32), + compress(b->chunks[0], uint16_t(mask), output); + compress(b->chunks[1], uint16_t(mask >> 16), + output + _mm_popcnt_u64(nmask & 0xFFFF)); + compress(b->chunks[2], uint16_t(mask >> 32), output + _mm_popcnt_u64(nmask & 0xFFFFFFFF)); + compress(b->chunks[3], uint16_t(mask >> 48), + output + _mm_popcnt_u64(nmask & 0xFFFFFFFFFFFFULL)); return _mm_popcnt_u64(nmask); } // The caller of this function is responsible to ensure that there are 64 bytes // available from reading at src. The data is read into a block64 structure. static inline void load_block(block64 *b, const char *src) { - b->chunks[0] = _mm256_loadu_si256(reinterpret_cast(src)); - b->chunks[1] = - _mm256_loadu_si256(reinterpret_cast(src + 32)); + b->chunks[0] = _mm_loadu_si128(reinterpret_cast(src)); + b->chunks[1] = _mm_loadu_si128(reinterpret_cast(src + 16)); + b->chunks[2] = _mm_loadu_si128(reinterpret_cast(src + 32)); + b->chunks[3] = _mm_loadu_si128(reinterpret_cast(src + 48)); } // The caller of this function is responsible to ensure that there are 128 bytes // available from reading at src. The data is read into a block64 structure. 
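// Illustrative sketch (an assumption about usage, not vendored code): the
// 64-bit `error` mask assembled above carries one bit per input byte of the
// 64-byte block, so the offset of the first rejected base64 character is the
// number of trailing zero bits, which simdutf_tzcnt_u64 computes portably.
//
// #include <cstddef>
// #include <cstdint>
//
// static inline size_t first_error_index(uint64_t error_mask) {
//   size_t i = 0;
//   while (i < 64 && ((error_mask >> i) & 1) == 0) {
//     i++; // scalar equivalent of a trailing-zero count
//   }
//   return i; // 64 when no bit is set, matching simdutf_tzcnt_u64(0)
// }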
static inline void load_block(block64 *b, const char16_t *src) { - __m256i m1 = _mm256_loadu_si256(reinterpret_cast(src)); - __m256i m2 = _mm256_loadu_si256(reinterpret_cast(src + 16)); - __m256i m3 = _mm256_loadu_si256(reinterpret_cast(src + 32)); - __m256i m4 = _mm256_loadu_si256(reinterpret_cast(src + 48)); - __m256i m1p = _mm256_permute2x128_si256(m1, m2, 0x20); - __m256i m2p = _mm256_permute2x128_si256(m1, m2, 0x31); - __m256i m3p = _mm256_permute2x128_si256(m3, m4, 0x20); - __m256i m4p = _mm256_permute2x128_si256(m3, m4, 0x31); - b->chunks[0] = _mm256_packus_epi16(m1p, m2p); - b->chunks[1] = _mm256_packus_epi16(m3p, m4p); + __m128i m1 = _mm_loadu_si128(reinterpret_cast(src)); + __m128i m2 = _mm_loadu_si128(reinterpret_cast(src + 8)); + __m128i m3 = _mm_loadu_si128(reinterpret_cast(src + 16)); + __m128i m4 = _mm_loadu_si128(reinterpret_cast(src + 24)); + __m128i m5 = _mm_loadu_si128(reinterpret_cast(src + 32)); + __m128i m6 = _mm_loadu_si128(reinterpret_cast(src + 40)); + __m128i m7 = _mm_loadu_si128(reinterpret_cast(src + 48)); + __m128i m8 = _mm_loadu_si128(reinterpret_cast(src + 56)); + b->chunks[0] = _mm_packus_epi16(m1, m2); + b->chunks[1] = _mm_packus_epi16(m3, m4); + b->chunks[2] = _mm_packus_epi16(m5, m6); + b->chunks[3] = _mm_packus_epi16(m7, m8); } -static inline void base64_decode(char *out, __m256i str) { +static inline void base64_decode(char *out, __m128i str) { // credit: aqrit - const __m256i pack_shuffle = - _mm256_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1, - 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1); - const __m256i t0 = _mm256_maddubs_epi16(str, _mm256_set1_epi32(0x01400140)); - const __m256i t1 = _mm256_madd_epi16(t0, _mm256_set1_epi32(0x00011000)); - const __m256i t2 = _mm256_shuffle_epi8(t1, pack_shuffle); + const __m128i pack_shuffle = + _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1); + + const __m128i t0 = _mm_maddubs_epi16(str, _mm_set1_epi32(0x01400140)); + const __m128i t1 = _mm_madd_epi16(t0, _mm_set1_epi32(0x00011000)); + const __m128i t2 = _mm_shuffle_epi8(t1, pack_shuffle); // Store the output: - _mm_storeu_si128((__m128i *)out, _mm256_castsi256_si128(t2)); - _mm_storeu_si128((__m128i *)(out + 12), _mm256_extracti128_si256(t2, 1)); + // this writes 16 bytes, but we only need 12. + _mm_storeu_si128((__m128i *)out, t2); } // decode 64 bytes and output 48 bytes static inline void base64_decode_block(char *out, const char *src) { - base64_decode(out, - _mm256_loadu_si256(reinterpret_cast(src))); - base64_decode(out + 24, _mm256_loadu_si256( - reinterpret_cast(src + 32))); + base64_decode(out, _mm_loadu_si128(reinterpret_cast(src))); + base64_decode(out + 12, + _mm_loadu_si128(reinterpret_cast(src + 16))); + base64_decode(out + 24, + _mm_loadu_si128(reinterpret_cast(src + 32))); + base64_decode(out + 36, + _mm_loadu_si128(reinterpret_cast(src + 48))); } static inline void base64_decode_block_safe(char *out, const char *src) { - base64_decode(out, - _mm256_loadu_si256(reinterpret_cast(src))); - char buffer[32]; // We enforce safety with a buffer. 
- base64_decode( - buffer, _mm256_loadu_si256(reinterpret_cast(src + 32))); - std::memcpy(out + 24, buffer, 24); + base64_decode(out, _mm_loadu_si128(reinterpret_cast(src))); + base64_decode(out + 12, + _mm_loadu_si128(reinterpret_cast(src + 16))); + base64_decode(out + 24, + _mm_loadu_si128(reinterpret_cast(src + 32))); + char buffer[16]; + base64_decode(buffer, + _mm_loadu_si128(reinterpret_cast(src + 48))); + std::memcpy(out + 36, buffer, 12); } static inline void base64_decode_block(char *out, block64 *b) { base64_decode(out, b->chunks[0]); - base64_decode(out + 24, b->chunks[1]); + base64_decode(out + 12, b->chunks[1]); + base64_decode(out + 24, b->chunks[2]); + base64_decode(out + 36, b->chunks[3]); } static inline void base64_decode_block_safe(char *out, block64 *b) { base64_decode(out, b->chunks[0]); - char buffer[32]; // We enforce safety with a buffer. - base64_decode(buffer, b->chunks[1]); - std::memcpy(out + 24, buffer, 24); + base64_decode(out + 12, b->chunks[1]); + base64_decode(out + 24, b->chunks[2]); + char buffer[16]; + base64_decode(buffer, b->chunks[3]); + std::memcpy(out + 36, buffer, 12); } template @@ -28456,7 +41055,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, const chartype *const srcend = src + srclen; constexpr size_t block_size = 6; - static_assert(block_size >= 2, "block_size must be at least two"); + static_assert(block_size >= 2, "block should of size 2 or more"); char buffer[block_size * 64]; char *bufferptr = buffer; if (srclen >= 64) { @@ -28469,7 +41068,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, uint64_t badcharmask = to_base64_mask(&b, &error); if (error) { src -= 64; - size_t error_offset = _tzcnt_u64(error); + size_t error_offset = simdutf_tzcnt_u64(error); return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; } @@ -28512,7 +41111,6 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, // time, otherwise, we should just decode directly. 
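  // For reference, a scalar sketch (illustrative only) of the per-quad packing
  // that base64_decode performs: four 6-bit values [a|b|c|d] collapse into
  // three bytes aaaaaabb bbbbcccc ccdddddd, which the maddubs/madd/pshufb
  // sequence above computes four groups at a time.
  //
  // #include <cstdint>
  //
  // static inline void pack_base64_quad(const uint8_t sextet[4], char out[3]) {
  //   const uint32_t triple = (uint32_t(sextet[0]) << 18) |
  //                           (uint32_t(sextet[1]) << 12) |
  //                           (uint32_t(sextet[2]) << 6) |
  //                           uint32_t(sextet[3]);
  //   out[0] = char((triple >> 16) & 0xFF);
  //   out[1] = char((triple >> 8) & 0xFF);
  //   out[2] = char(triple & 0xFF);
  // }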
int last_block = (int)((bufferptr - buffer_start) % 64); if (last_block != 0 && srcend - src + last_block >= 64) { - while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); @@ -28598,15 +41196,15 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, } return {SUCCESS, srclen, size_t(dst - dstinit)}; } -/* end file src/haswell/avx2_base64.cpp */ +/* end file src/westmere/sse_base64.cpp */ } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf /* begin file src/generic/buf_block_reader.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { // Walks through a buffer in block-sized increments, loading the last part with @@ -28712,12 +41310,12 @@ simdutf_really_inline void buf_block_reader::advance() { } } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf /* end file src/generic/buf_block_reader.h */ /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { namespace utf8_validation { @@ -28937,12 +41535,12 @@ struct utf8_checker { using utf8_validation::utf8_checker; } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ /* begin file src/generic/utf8_validation/utf8_validator.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { namespace utf8_validation { @@ -29077,14 +41675,14 @@ result generic_validate_ascii_with_errors(const char *input, size_t length) { } // namespace utf8_validation } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf /* end file src/generic/utf8_validation/utf8_validator.h */ // transcoding from UTF-8 to UTF-16 /* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { namespace utf8_to_utf16 { @@ -29155,13 +41753,13 @@ simdutf_warn_unused size_t convert_valid(const char *input, size_t size, } // namespace utf8_to_utf16 } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf /* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ /* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { namespace utf8_to_utf16 { using namespace simd; @@ -29490,14 +42088,14 @@ struct validating_transcoder { }; // struct utf8_checker } // namespace utf8_to_utf16 } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf /* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ // transcoding from UTF-8 to UTF-32 /* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { namespace utf8_to_utf32 { @@ -29536,13 +42134,13 @@ simdutf_warn_unused size_t convert_valid(const char *input, size_t size, } // namespace utf8_to_utf32 } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf /* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ /* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { namespace utf8_to_utf32 { using namespace simd; @@ -29857,14 +42455,14 @@ struct validating_transcoder { }; // struct 
utf8_checker } // namespace utf8_to_utf32 } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf /* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ // other functions /* begin file src/generic/utf8.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { namespace utf8 { @@ -29899,12 +42497,12 @@ simdutf_really_inline size_t utf16_length_from_utf8(const char *in, } } // namespace utf8 } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf /* end file src/generic/utf8.h */ /* begin file src/generic/utf16.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { namespace utf16 { @@ -29974,15 +42572,14 @@ change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { } // namespace utf16 } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf /* end file src/generic/utf16.h */ - // transcoding from UTF-8 to Latin 1 /* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { namespace utf8_to_latin1 { using namespace simd; @@ -30292,13 +42889,13 @@ struct validating_transcoder { }; // struct utf8_checker } // namespace utf8_to_latin1 } // unnamed namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf /* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ /* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ namespace simdutf { -namespace haswell { +namespace westmere { namespace { namespace utf8_to_latin1 { using namespace simd; @@ -30372,19 +42969,24 @@ simdutf_really_inline size_t convert_valid(const char *in, size_t size, } // namespace utf8_to_latin1 } // namespace -} // namespace haswell +} // namespace westmere } // namespace simdutf // namespace simdutf /* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +// +// Implementation-specific overrides +// + namespace simdutf { -namespace haswell { +namespace westmere { simdutf_warn_unused int implementation::detect_encodings(const char *input, size_t length) const noexcept { // If there is a BOM, then we trust it. auto bom_encoding = simdutf::BOM::check_bom(input, length); + // todo: reimplement as a one-pass algorithm. 
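  // Illustrative pattern sketch (hypothetical helper names, not part of the
  // diff): most overrides below run the vectorized kernel over whole registers
  // and let the scalar routines finish whatever tail the kernel could not
  // consume, e.g. validate_utf16le() checks the returned tail pointer and
  // falls back to scalar::utf16::validate.
  //
  // #include <cstddef>
  //
  // template <class Kernel, class ScalarTail>
  // static bool validate_with_tail(const char16_t *buf, size_t len,
  //                                Kernel kernel, ScalarTail scalar_tail) {
  //   // kernel() returns a pointer to the first unprocessed code unit, or
  //   // nullptr when it has already detected invalid input.
  //   const char16_t *tail = kernel(buf, len);
  //   if (tail == nullptr) {
  //     return false;
  //   }
  //   return scalar_tail(tail, len - size_t(tail - buf));
  // }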
if (bom_encoding != encoding_type::unspecified) { return bom_encoding; } @@ -30408,22 +43010,23 @@ implementation::detect_encodings(const char *input, simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return haswell::utf8_validation::generic_validate_utf8(buf, len); + return westmere::utf8_validation::generic_validate_utf8(buf, len); } simdutf_warn_unused result implementation::validate_utf8_with_errors( const char *buf, size_t len) const noexcept { - return haswell::utf8_validation::generic_validate_utf8_with_errors(buf, len); + return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len); } simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return haswell::utf8_validation::generic_validate_ascii(buf, len); + return westmere::utf8_validation::generic_validate_ascii(buf, len); } simdutf_warn_unused result implementation::validate_ascii_with_errors( const char *buf, size_t len) const noexcept { - return haswell::utf8_validation::generic_validate_ascii_with_errors(buf, len); + return westmere::utf8_validation::generic_validate_ascii_with_errors(buf, + len); } simdutf_warn_unused bool @@ -30434,7 +43037,7 @@ implementation::validate_utf16le(const char16_t *buf, // handling nullptr return true; } - const char16_t *tail = avx2_validate_utf16(buf, len); + const char16_t *tail = sse_validate_utf16(buf, len); if (tail) { return scalar::utf16::validate(tail, len - (tail - buf)); @@ -30451,7 +43054,7 @@ implementation::validate_utf16be(const char16_t *buf, // handling nullptr return true; } - const char16_t *tail = avx2_validate_utf16(buf, len); + const char16_t *tail = sse_validate_utf16(buf, len); if (tail) { return scalar::utf16::validate(tail, len - (tail - buf)); } else { @@ -30461,7 +43064,7 @@ implementation::validate_utf16be(const char16_t *buf, simdutf_warn_unused result implementation::validate_utf16le_with_errors( const char16_t *buf, size_t len) const noexcept { - result res = avx2_validate_utf16_with_errors(buf, len); + result res = sse_validate_utf16_with_errors(buf, len); if (res.count != len) { result scalar_res = scalar::utf16::validate_with_errors( buf + res.count, len - res.count); @@ -30473,7 +43076,7 @@ simdutf_warn_unused result implementation::validate_utf16le_with_errors( simdutf_warn_unused result implementation::validate_utf16be_with_errors( const char16_t *buf, size_t len) const noexcept { - result res = avx2_validate_utf16_with_errors(buf, len); + result res = sse_validate_utf16_with_errors(buf, len); if (res.count != len) { result scalar_res = scalar::utf16::validate_with_errors( buf + res.count, len - res.count); @@ -30490,7 +43093,7 @@ implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { // handling nullptr return true; } - const char32_t *tail = avx2_validate_utf32le(buf, len); + const char32_t *tail = sse_validate_utf32le(buf, len); if (tail) { return scalar::utf32::validate(tail, len - (tail - buf)); } else { @@ -30500,12 +43103,12 @@ implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { simdutf_warn_unused result implementation::validate_utf32_with_errors( const char32_t *buf, size_t len) const noexcept { - if (simdutf_unlikely(len == 0)) { + if (len == 0) { // empty input is valid UTF-32. 
protect the implementation from // handling nullptr return result(error_code::SUCCESS, 0); } - result res = avx2_validate_utf32le_with_errors(buf, len); + result res = sse_validate_utf32le_with_errors(buf, len); if (res.count != len) { result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); @@ -30517,8 +43120,9 @@ simdutf_warn_unused result implementation::validate_utf32_with_errors( simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( const char *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = - avx2_convert_latin1_to_utf8(buf, len, utf8_output); + sse_convert_latin1_to_utf8(buf, len, utf8_output); size_t converted_chars = ret.second - utf8_output; if (ret.first != buf + len) { @@ -30533,7 +43137,7 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( const char *buf, size_t len, char16_t *utf16_output) const noexcept { std::pair ret = - avx2_convert_latin1_to_utf16(buf, len, utf16_output); + sse_convert_latin1_to_utf16(buf, len, utf16_output); if (ret.first == nullptr) { return 0; } @@ -30553,7 +43157,7 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( const char *buf, size_t len, char16_t *utf16_output) const noexcept { std::pair ret = - avx2_convert_latin1_to_utf16(buf, len, utf16_output); + sse_convert_latin1_to_utf16(buf, len, utf16_output); if (ret.first == nullptr) { return 0; } @@ -30573,7 +43177,7 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( const char *buf, size_t len, char32_t *utf32_output) const noexcept { std::pair ret = - avx2_convert_latin1_to_utf32(buf, len, utf32_output); + sse_convert_latin1_to_utf32(buf, len, utf32_output); if (ret.first == nullptr) { return 0; } @@ -30602,8 +43206,8 @@ simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( } simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( - const char *input, size_t size, char *latin1_output) const noexcept { - return utf8_to_latin1::convert_valid(input, size, latin1_output); + const char *buf, size_t len, char *latin1_output) const noexcept { + return westmere::utf8_to_latin1::convert_valid(buf, len, latin1_output); } simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( @@ -30663,12 +43267,12 @@ simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( const char16_t *buf, size_t len, char *latin1_output) const noexcept { std::pair ret = - haswell::avx2_convert_utf16_to_latin1(buf, len, - latin1_output); + sse_convert_utf16_to_latin1(buf, len, latin1_output); if (ret.first == nullptr) { return 0; } size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) { const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( @@ -30684,12 +43288,12 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( const char16_t *buf, size_t len, char *latin1_output) const noexcept { std::pair ret = - haswell::avx2_convert_utf16_to_latin1(buf, len, - latin1_output); + sse_convert_utf16_to_latin1(buf, len, latin1_output); if (ret.first == nullptr) { return 0; } size_t saved_bytes = ret.second - latin1_output; + if (ret.first != buf + len) 
{ const size_t scalar_saved_bytes = scalar::utf16_to_latin1::convert( @@ -30706,7 +43310,7 @@ simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors( const char16_t *buf, size_t len, char *latin1_output) const noexcept { std::pair ret = - avx2_convert_utf16_to_latin1_with_errors( + sse_convert_utf16_to_latin1_with_errors( buf, len, latin1_output); if (ret.first.error) { return ret.first; @@ -30733,8 +43337,8 @@ simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors( const char16_t *buf, size_t len, char *latin1_output) const noexcept { std::pair ret = - avx2_convert_utf16_to_latin1_with_errors(buf, len, - latin1_output); + sse_convert_utf16_to_latin1_with_errors(buf, len, + latin1_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct @@ -30758,21 +43362,20 @@ implementation::convert_utf16be_to_latin1_with_errors( simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( const char16_t *buf, size_t len, char *latin1_output) const noexcept { - // optimization opportunity: implement a custom function + // optimization opportunity: we could provide an optimized function. return convert_utf16be_to_latin1(buf, len, latin1_output); } simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( const char16_t *buf, size_t len, char *latin1_output) const noexcept { - // optimization opportunity: implement a custom function + // optimization opportunity: we could provide an optimized function. return convert_utf16le_to_latin1(buf, len, latin1_output); } simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( const char16_t *buf, size_t len, char *utf8_output) const noexcept { std::pair ret = - haswell::avx2_convert_utf16_to_utf8(buf, len, - utf8_output); + sse_convert_utf16_to_utf8(buf, len, utf8_output); if (ret.first == nullptr) { return 0; } @@ -30792,8 +43395,7 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( const char16_t *buf, size_t len, char *utf8_output) const noexcept { std::pair ret = - haswell::avx2_convert_utf16_to_utf8(buf, len, - utf8_output); + sse_convert_utf16_to_utf8(buf, len, utf8_output); if (ret.first == nullptr) { return 0; } @@ -30815,7 +43417,7 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - haswell::avx2_convert_utf16_to_utf8_with_errors( + westmere::sse_convert_utf16_to_utf8_with_errors( buf, len, utf8_output); if (ret.first.error) { return ret.first; @@ -30843,7 +43445,7 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - haswell::avx2_convert_utf16_to_utf8_with_errors( + westmere::sse_convert_utf16_to_utf8_with_errors( buf, len, utf8_output); if (ret.first.error) { return ret.first; @@ -30876,34 +43478,16 @@ simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( return convert_utf16be_to_utf8(buf, len, utf8_output); } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( - const char32_t *buf, size_t len, char *utf8_output) const noexcept { - std::pair ret = - avx2_convert_utf32_to_utf8(buf, len, utf8_output); - if (ret.first == nullptr) { - return 0; - } - size_t 
saved_bytes = ret.second - utf8_output; - if (ret.first != buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} - simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( const char32_t *buf, size_t len, char *latin1_output) const noexcept { std::pair ret = - avx2_convert_utf32_to_latin1(buf, len, latin1_output); + sse_convert_utf32_to_latin1(buf, len, latin1_output); if (ret.first == nullptr) { return 0; } size_t saved_bytes = ret.second - latin1_output; - if (ret.first != buf + len) { + // if (ret.first != buf + len) { + if (ret.first < buf + len) { const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( ret.first, len - (ret.first - buf), ret.second); if (scalar_saved_bytes == 0) { @@ -30919,7 +43503,8 @@ simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - avx2_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + westmere::sse_convert_utf32_to_latin1_with_errors(buf, len, + latin1_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_latin1::convert_with_errors( buf + ret.first.count, len - ret.first.count, ret.second); @@ -30938,15 +43523,35 @@ simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( const char32_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: we could provide an optimized function. return convert_utf32_to_latin1(buf, len, latin1_output); } +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + sse_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( const char32_t *buf, size_t len, char *utf8_output) const noexcept { // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - haswell::avx2_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf8::convert_with_errors( buf + ret.first.count, len - ret.first.count, ret.second); @@ -30966,8 +43571,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { std::pair ret = - haswell::avx2_convert_utf16_to_utf32(buf, len, - utf32_output); + sse_convert_utf16_to_utf32(buf, len, utf32_output); if (ret.first == nullptr) { return 0; } @@ -30987,8 +43591,7 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( simdutf_warn_unused size_t 
implementation::convert_utf16be_to_utf32( const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { std::pair ret = - haswell::avx2_convert_utf16_to_utf32(buf, len, - utf32_output); + sse_convert_utf16_to_utf32(buf, len, utf32_output); if (ret.first == nullptr) { return 0; } @@ -31010,7 +43613,7 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - haswell::avx2_convert_utf16_to_utf32_with_errors( + westmere::sse_convert_utf16_to_utf32_with_errors( buf, len, utf32_output); if (ret.first.error) { return ret.first; @@ -31038,7 +43641,7 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - haswell::avx2_convert_utf16_to_utf32_with_errors( + westmere::sse_convert_utf16_to_utf32_with_errors( buf, len, utf32_output); if (ret.first.error) { return ret.first; @@ -31069,7 +43672,7 @@ simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { std::pair ret = - avx2_convert_utf32_to_utf16(buf, len, utf16_output); + sse_convert_utf32_to_utf16(buf, len, utf16_output); if (ret.first == nullptr) { return 0; } @@ -31089,7 +43692,7 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { std::pair ret = - avx2_convert_utf32_to_utf16(buf, len, utf16_output); + sse_convert_utf32_to_utf16(buf, len, utf16_output); if (ret.first == nullptr) { return 0; } @@ -31111,7 +43714,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - haswell::avx2_convert_utf32_to_utf16_with_errors( + westmere::sse_convert_utf32_to_utf16_with_errors( buf, len, utf16_output); if (ret.first.count != len) { result scalar_res = @@ -31135,7 +43738,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - haswell::avx2_convert_utf32_to_utf16_with_errors( + westmere::sse_convert_utf32_to_utf16_with_errors( buf, len, utf16_output); if (ret.first.count != len) { result scalar_res = @@ -31220,26 +43823,11 @@ simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( return utf16::utf8_length_from_utf16(input, length); } -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( - const char16_t *input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); -} - -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( - const char16_t *input, size_t length) const noexcept { - return utf16::utf32_length_from_utf16(input, length); -} - simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { return scalar::latin1::utf16_length_from_latin1(length); } -simdutf_warn_unused size_t implementation::utf16_length_from_utf8( - const char *input, size_t length) const noexcept { - return 
utf8::utf16_length_from_utf8(input, length); -} - simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { return scalar::latin1::utf32_length_from_latin1(length); @@ -31247,91 +43835,110 @@ implementation::utf32_length_from_latin1(size_t length) const noexcept { simdutf_warn_unused size_t implementation::utf8_length_from_latin1( const char *input, size_t len) const noexcept { - const uint8_t *data = reinterpret_cast(input); - size_t answer = len / sizeof(__m256i) * sizeof(__m256i); + const uint8_t *str = reinterpret_cast(input); + size_t answer = len / sizeof(__m128i) * sizeof(__m128i); size_t i = 0; if (answer >= 2048) { // long strings optimization - __m256i four_64bits = _mm256_setzero_si256(); - while (i + sizeof(__m256i) <= len) { - __m256i runner = _mm256_setzero_si256(); - // We can do up to 255 loops without overflow. - size_t iterations = (len - i) / sizeof(__m256i); + __m128i two_64bits = _mm_setzero_si128(); + while (i + sizeof(__m128i) <= len) { + __m128i runner = _mm_setzero_si128(); + size_t iterations = (len - i) / sizeof(__m128i); if (iterations > 255) { iterations = 255; } - size_t max_i = i + iterations * sizeof(__m256i) - sizeof(__m256i); - for (; i + 4 * sizeof(__m256i) <= max_i; i += 4 * sizeof(__m256i)) { - __m256i input1 = _mm256_loadu_si256((const __m256i *)(data + i)); - __m256i input2 = - _mm256_loadu_si256((const __m256i *)(data + i + sizeof(__m256i))); - __m256i input3 = _mm256_loadu_si256( - (const __m256i *)(data + i + 2 * sizeof(__m256i))); - __m256i input4 = _mm256_loadu_si256( - (const __m256i *)(data + i + 3 * sizeof(__m256i))); - __m256i input12 = - _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input1), - _mm256_cmpgt_epi8(_mm256_setzero_si256(), input2)); - __m256i input23 = - _mm256_add_epi8(_mm256_cmpgt_epi8(_mm256_setzero_si256(), input3), - _mm256_cmpgt_epi8(_mm256_setzero_si256(), input4)); - __m256i input1234 = _mm256_add_epi8(input12, input23); - runner = _mm256_sub_epi8(runner, input1234); + size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i); + for (; i + 4 * sizeof(__m128i) <= max_i; i += 4 * sizeof(__m128i)) { + __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i)); + __m128i input2 = + _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i))); + __m128i input3 = + _mm_loadu_si128((const __m128i *)(str + i + 2 * sizeof(__m128i))); + __m128i input4 = + _mm_loadu_si128((const __m128i *)(str + i + 3 * sizeof(__m128i))); + __m128i input12 = + _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input1), + _mm_cmpgt_epi8(_mm_setzero_si128(), input2)); + __m128i input34 = + _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input3), + _mm_cmpgt_epi8(_mm_setzero_si128(), input4)); + __m128i input1234 = _mm_add_epi8(input12, input34); + runner = _mm_sub_epi8(runner, input1234); } - for (; i <= max_i; i += sizeof(__m256i)) { - __m256i input_256_chunk = - _mm256_loadu_si256((const __m256i *)(data + i)); - runner = _mm256_sub_epi8( - runner, _mm256_cmpgt_epi8(_mm256_setzero_si256(), input_256_chunk)); + for (; i <= max_i; i += sizeof(__m128i)) { + __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i)); + runner = _mm_sub_epi8(runner, + _mm_cmpgt_epi8(_mm_setzero_si128(), more_input)); } - four_64bits = _mm256_add_epi64( - four_64bits, _mm256_sad_epu8(runner, _mm256_setzero_si256())); + two_64bits = + _mm_add_epi64(two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128())); } - answer += _mm256_extract_epi64(four_64bits, 0) + - _mm256_extract_epi64(four_64bits, 1) + - 
_mm256_extract_epi64(four_64bits, 2) + - _mm256_extract_epi64(four_64bits, 3); - } else if (answer > 0) { - for (; i + sizeof(__m256i) <= len; i += sizeof(__m256i)) { - __m256i latin = _mm256_loadu_si256((const __m256i *)(data + i)); - uint32_t non_ascii = _mm256_movemask_epi8(latin); + answer += + _mm_extract_epi64(two_64bits, 0) + _mm_extract_epi64(two_64bits, 1); + } else if (answer > 0) { // short string optimization + for (; i + 2 * sizeof(__m128i) <= len; i += 2 * sizeof(__m128i)) { + __m128i latin = _mm_loadu_si128((const __m128i *)(input + i)); + uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin); + answer += count_ones(non_ascii); + latin = _mm_loadu_si128((const __m128i *)(input + i) + 1); + non_ascii = (uint16_t)_mm_movemask_epi8(latin); + answer += count_ones(non_ascii); + } + for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i)) { + __m128i latin = _mm_loadu_si128((const __m128i *)(input + i)); + uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin); answer += count_ones(non_ascii); } } return answer + scalar::latin1::utf8_length_from_latin1( - reinterpret_cast(data + i), len - i); + reinterpret_cast(str + i), len - i); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); } simdutf_warn_unused size_t implementation::utf8_length_from_utf32( const char32_t *input, size_t length) const noexcept { - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffffff80 = _mm256_set1_epi32((uint32_t)0xffffff80); - const __m256i v_fffff800 = _mm256_set1_epi32((uint32_t)0xfffff800); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m128i v_00000000 = _mm_setzero_si128(); + const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80); + const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800); + const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); size_t pos = 0; size_t count = 0; - for (; pos + 8 <= length; pos += 8) { - __m256i in = _mm256_loadu_si256((__m256i *)(input + pos)); - const __m256i ascii_bytes_bytemask = - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffffff80), v_00000000); - const __m256i one_two_bytes_bytemask = - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_fffff800), v_00000000); - const __m256i two_bytes_bytemask = - _mm256_xor_si256(one_two_bytes_bytemask, ascii_bytes_bytemask); - const __m256i one_two_three_bytes_bytemask = - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const __m256i three_bytes_bytemask = - _mm256_xor_si256(one_two_three_bytes_bytemask, one_two_bytes_bytemask); - const uint32_t ascii_bytes_bitmask = - static_cast(_mm256_movemask_epi8(ascii_bytes_bytemask)); - const uint32_t two_bytes_bitmask = - static_cast(_mm256_movemask_epi8(two_bytes_bytemask)); - const uint32_t three_bytes_bitmask = - static_cast(_mm256_movemask_epi8(three_bytes_bytemask)); + for (; pos + 4 <= length; pos += 4) { + __m128i in = _mm_loadu_si128((__m128i *)(input + pos)); + const __m128i ascii_bytes_bytemask = + _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000); + const __m128i 
one_two_bytes_bytemask = + _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000); + const __m128i two_bytes_bytemask = + _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask); + const __m128i one_two_three_bytes_bytemask = + _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000); + const __m128i three_bytes_bytemask = + _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask); + const uint16_t ascii_bytes_bitmask = + static_cast(_mm_movemask_epi8(ascii_bytes_bytemask)); + const uint16_t two_bytes_bitmask = + static_cast(_mm_movemask_epi8(two_bytes_bytemask)); + const uint16_t three_bytes_bitmask = + static_cast(_mm_movemask_epi8(three_bytes_bytemask)); size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4; size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4; - count += 32 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; + count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; } return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); @@ -31339,18 +43946,18 @@ simdutf_warn_unused size_t implementation::utf8_length_from_utf32( simdutf_warn_unused size_t implementation::utf16_length_from_utf32( const char32_t *input, size_t length) const noexcept { - const __m256i v_00000000 = _mm256_setzero_si256(); - const __m256i v_ffff0000 = _mm256_set1_epi32((uint32_t)0xffff0000); + const __m128i v_00000000 = _mm_setzero_si128(); + const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); size_t pos = 0; size_t count = 0; - for (; pos + 8 <= length; pos += 8) { - __m256i in = _mm256_loadu_si256((__m256i *)(input + pos)); - const __m256i surrogate_bytemask = - _mm256_cmpeq_epi32(_mm256_and_si256(in, v_ffff0000), v_00000000); - const uint32_t surrogate_bitmask = - static_cast(_mm256_movemask_epi8(surrogate_bytemask)); - size_t surrogate_count = (32 - count_ones(surrogate_bitmask)) / 4; - count += 8 + surrogate_count; + for (; pos + 4 <= length; pos += 4) { + __m128i in = _mm_loadu_si128((__m128i *)(input + pos)); + const __m128i surrogate_bytemask = + _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000); + const uint16_t surrogate_bitmask = + static_cast(_mm_movemask_epi8(surrogate_bytemask)); + size_t surrogate_count = (16 - count_ones(surrogate_bitmask)) / 4; + count += 4 + surrogate_count; } return count + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); @@ -31386,3612 +43993,5867 @@ simdutf_warn_unused full_result implementation::base64_to_binary_details( last_chunk_options); } -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( - const char16_t *input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); +simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( + const char16_t *input, size_t length) const noexcept { + return scalar::base64::maximal_binary_length_from_base64(input, length); +} + +simdutf_warn_unused result implementation::base64_to_binary( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? 
compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused full_result implementation::base64_to_binary_details( + const char16_t *input, size_t length, char *output, base64_options options, + last_chunk_handling_options last_chunk_options) const noexcept { + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); +} + +simdutf_warn_unused size_t implementation::base64_length_from_binary( + size_t length, base64_options options) const noexcept { + return scalar::base64::base64_length_from_binary(length, options); +} + +size_t implementation::binary_to_base64(const char *input, size_t length, + char *output, + base64_options options) const noexcept { + if (options & base64_url) { + return encode_base64(output, input, length, options); + } else { + return encode_base64(output, input, length, options); + } +} +} // namespace westmere +} // namespace simdutf + +/* begin file src/simdutf/westmere/end.h */ +#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE +// nothing needed. +#else +SIMDUTF_UNTARGET_REGION +#endif + +/* end file src/simdutf/westmere/end.h */ +/* end file src/westmere/implementation.cpp */ +#endif +#if SIMDUTF_IMPLEMENTATION_LSX +/* begin file src/lsx/implementation.cpp */ +/* begin file src/simdutf/lsx/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "lsx" +// #define SIMDUTF_IMPLEMENTATION lsx +/* end file src/simdutf/lsx/begin.h */ +namespace simdutf { +namespace lsx { +namespace { +#ifndef SIMDUTF_LSX_H + #error "lsx.h must be included" +#endif +using namespace simd; + +// convert vmskltz/vmskgez/vmsknz to +// simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes index +const uint8_t lsx_1_2_utf8_bytes_mask[] = { + 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, + 85, 2, 3, 6, 7, 18, 19, 22, 23, 66, 67, 70, 71, 82, 83, + 86, 87, 8, 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88, + 89, 92, 93, 10, 11, 14, 15, 26, 27, 30, 31, 74, 75, 78, 79, + 90, 91, 94, 95, 32, 33, 36, 37, 48, 49, 52, 53, 96, 97, 100, + 101, 112, 113, 116, 117, 34, 35, 38, 39, 50, 51, 54, 55, 98, 99, + 102, 103, 114, 115, 118, 119, 40, 41, 44, 45, 56, 57, 60, 61, 104, + 105, 108, 109, 120, 121, 124, 125, 42, 43, 46, 47, 58, 59, 62, 63, + 106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148, + 149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147, + 150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152, + 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143, + 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164, + 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163, + 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168, + 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253, + 170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254, + 255}; + +simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) { + // const v16u8 shuf = {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; + // return __lsx_vshuf_b(__lsx_vldi(0), vec, shuf); + return __lsx_vshuf4i_b(vec, 0b10110001); + // return __lsx_vor_v(__lsx_vslli_h(vec, 8), __lsx_vsrli_h(vec, 8)); +} + +simdutf_really_inline bool is_ascii(const simd8x64 &input) { + return input.is_ascii(); +} + +simdutf_unused 
simdutf_really_inline simd8 +must_be_continuation(const simd8 prev1, const simd8 prev2, + const simd8 prev3) { + simd8 is_second_byte = prev1 >= uint8_t(0b11000000u); + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller + // is using ^ as well. This will work fine because we only have to report + // errors for cases with 0-1 lead bytes. Multiple lead bytes implies 2 + // overlapping multibyte characters, and if that happens, there is guaranteed + // to be at least *one* lead byte that is part of only 1 other multibyte + // character. The error will be detected there. + return is_second_byte ^ is_third_byte ^ is_fourth_byte; +} + +simdutf_really_inline simd8 +must_be_2_3_continuation(const simd8 prev2, + const simd8 prev3) { + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + return is_third_byte ^ is_fourth_byte; +} + +// common functions for utf8 conversions +simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) { + // Low half contains 10bbbbbb|10cccccc + // High half contains 1110aaaa|1110aaaa + const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9}; + const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff}; + + __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh); + // 1110aaaa => aaaa0000 + __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4); + // 10bbbbbb 10cccccc => 0010bbbb bbcccccc + __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 2*/ + perm, __lsx_vrepli_h(0x3f) /* 0x003f */); + // 0010bbbb bbcccccc => aaaabbbb bbcccccc + composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff); + + return composed; +} + +simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) { + // 10bbbbb 110aaaaa => 00bbbbb 000aaaaa + __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f)); + // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb + composed = __lsx_vbitsel_v( + __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */ + __lsx_vsrli_h(composed, 8), /* bbbbbb >> 8 */ + __lsx_vrepli_h(0x3f)); /* 0x003f */ + return composed; +} + +simdutf_really_inline __m128i +convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) { + // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters. + // This is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. + __m128i sh = + __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]), + 0); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh); + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000000 00bbbbbb + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits + // 1 byte: 00000000 00000000 + // 2 byte: 00000aaa aa000000 + const __m128i v1f00 = __lsx_vldi(-2785); // -2785(13bit) => 151f + __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits + // Combine with a shift right accumulate + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000aaa aabbbbbb + composed = __lsx_vadd_h(ascii, composed); + return composed; +} + +/* begin file src/lsx/lsx_validate_utf16.cpp */ +/* + In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. 
+ + In a vectorized algorithm we want to examine the most significant + nibble in order to select a fast path. If none of highest nibbles + are 0xD (13), than we are sure that UTF-16 chunk in a vector + register is valid. + + Let us analyze what we need to check if the nibble is 0xD. The + value of the preceding nibble determines what we have: + + 0xd000 .. 0xd7ff - a valid word + 0xd800 .. 0xdbff - low surrogate + 0xdc00 .. 0xdfff - high surrogate + + Other constraints we have to consider: + - there must not be two consecutive low surrogates (0xd800 .. 0xdbff) + - there must not be two consecutive high surrogates (0xdc00 .. 0xdfff) + - there must not be sole low surrogate nor high surrogate + + We're going to build three bitmasks based on the 3rd nibble: + - V = valid word, + - L = low surrogate (0xd800 .. 0xdbff) + - H = high surrogate (0xdc00 .. 0xdfff) + + 0 1 2 3 4 5 6 7 <--- word index + [ V | L | H | L | H | V | V | L ] + 1 0 0 0 0 1 1 0 - V = valid masks + 0 1 0 1 0 0 0 1 - L = low surrogate + 0 0 1 0 1 0 0 0 - H high surrogate + + + 1 0 0 0 0 1 1 0 V = valid masks + 0 1 0 1 0 0 0 0 a = L & (H >> 1) + 0 0 1 0 1 0 0 0 b = a << 1 + 1 1 1 1 1 1 1 0 c = V | a | b + ^ + the last bit can be zero, we just consume 7 + code units and recheck this word in the next iteration +*/ + +/* Returns: + - pointer to the last unprocessed character (a scalar fallback should check + the rest); + - nullptr if an error was detected. +*/ +template +const char16_t *lsx_validate_utf16(const char16_t *input, size_t size) { + const char16_t *end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::SIZE * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = + simd16(input + simd16::SIZE / sizeof(char16_t)); + if (big_endian) { + in0 = in0.swap_bytes(); + in1 = in1.swap_bytes(); + } + const auto in = simd8(__lsx_vssrlni_bu_h(in1.value, in0.value, 8)); + + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint16_t surrogates_bitmask = + static_cast(surrogates_wordmask.to_bitmask()); + if (surrogates_bitmask == 0x0000) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint16_t V = static_cast(~surrogates_bitmask); + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint16_t H = static_cast(vH.to_bitmask()); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint16_t L = static_cast(~H & surrogates_bitmask); + + const uint16_t a = static_cast( + L & (H >> 1)); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint16_t b = static_cast( + a << 1); // Just mark that the opinput - startite fact is hold, + // thanks to that we have only two masks for valid case. 
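// [Editor's sketch, not part of the patch.] A scalar model of the V/L/H
// bitmask combination used above. Bit i of each mask corresponds to code
// unit i; following this file's comments, "low" means a leading surrogate
// (0xD800..0xDBFF) and "high" a trailing one (0xDC00..0xDFFF).
#include <cstddef>
#include <cstdint>

// Returns how many of the 16 code units can be accepted: 16 when the whole
// block is valid, 15 when the last unit is a so-far unpaired surrogate that
// must be re-examined at the start of the next block, and 0 on error.
inline size_t accept_utf16_block(const uint16_t words[16]) {
  uint16_t L = 0, H = 0;
  for (int i = 0; i < 16; i++) {
    if ((words[i] & 0xFC00) == 0xD800) L = uint16_t(L | (1u << i)); // low surrogate
    if ((words[i] & 0xFC00) == 0xDC00) H = uint16_t(H | (1u << i)); // high surrogate
  }
  const uint16_t V = uint16_t(~(L | H));     // ordinary code units
  const uint16_t a = uint16_t(L & (H >> 1)); // low surrogate followed by a high one
  const uint16_t b = uint16_t(a << 1);       // the high surrogates completing those pairs
  const uint16_t c = uint16_t(V | a | b);    // is every position accounted for?
  if (c == 0xFFFF) return 16;
  if (c == 0x7FFF) return 15; // only the last unit is unresolved
  return 0;                   // a lone or misplaced surrogate was found
}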
+ const uint16_t c = static_cast( + V | a | b); // Combine all the masks into the final one. + + if (c == 0xffff) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += 16; + } else if (c == 0x7fff) { + // The 15 lower code units of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. + input += 15; + } else { + return nullptr; + } + } + } + + return input; +} + +template +const result lsx_validate_utf16_with_errors(const char16_t *input, + size_t size) { + const char16_t *start = input; + const char16_t *end = input + size; + + const auto v_d8 = simd8::splat(0xd8); + const auto v_f8 = simd8::splat(0xf8); + const auto v_fc = simd8::splat(0xfc); + const auto v_dc = simd8::splat(0xdc); + + while (input + simd16::SIZE * 2 < end) { + // 0. Load data: since the validation takes into account only higher + // byte of each word, we compress the two vectors into one which + // consists only the higher bytes. + auto in0 = simd16(input); + auto in1 = + simd16(input + simd16::SIZE / sizeof(char16_t)); + + if (big_endian) { + in0 = in0.swap_bytes(); + in1 = in1.swap_bytes(); + } + + const auto in = simd8(__lsx_vssrlni_bu_h(in1.value, in0.value, 8)); + + // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). + const auto surrogates_wordmask = (in & v_f8) == v_d8; + const uint16_t surrogates_bitmask = + static_cast(surrogates_wordmask.to_bitmask()); + if (surrogates_bitmask == 0x0000) { + input += 16; + } else { + // 2. We have some surrogates that have to be distinguished: + // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) + // - high surrogates: 0b1101'11xx'yyyy'yyyy (0xDC00..0xDFFF) + // + // Fact: high surrogate has 11th bit set (3rd bit in the higher word) + + // V - non-surrogate code units + // V = not surrogates_wordmask + const uint16_t V = static_cast(~surrogates_bitmask); + + // H - word-mask for high surrogates: the six highest bits are 0b1101'11 + const auto vH = (in & v_fc) == v_dc; + const uint16_t H = static_cast(vH.to_bitmask()); + + // L - word mask for low surrogates + // L = not H and surrogates_wordmask + const uint16_t L = static_cast(~H & surrogates_bitmask); + + const uint16_t a = static_cast( + L & (H >> 1)); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint16_t b = static_cast( + a << 1); // Just mark that the opinput - startite fact is hold, + // thanks to that we have only two masks for valid case. + const uint16_t c = static_cast( + V | a | b); // Combine all the masks into the final one. + + if (c == 0xffff) { + // The whole input register contains valid UTF-16, i.e., + // either single code units or proper surrogate pairs. + input += 16; + } else if (c == 0x7fff) { + // The 15 lower code units of the input register contains valid UTF-16. + // The 15th word may be either a low or high surrogate. It the next + // iteration we 1) check if the low surrogate is followed by a high + // one, 2) reject sole high surrogate. 
+ input += 15; + } else { + return result(error_code::SURROGATE, input - start); + } + } + } + + return result(error_code::SUCCESS, input - start); +} +/* end file src/lsx/lsx_validate_utf16.cpp */ +/* begin file src/lsx/lsx_validate_utf32le.cpp */ + +const char32_t *lsx_validate_utf32le(const char32_t *input, size_t size) { + const char32_t *end = input + size; + + __m128i offset = __lsx_vreplgr2vr_w(uint32_t(0xffff2000)); + __m128i standardoffsetmax = __lsx_vreplgr2vr_w(uint32_t(0xfffff7ff)); + __m128i standardmax = __lsx_vldi(-2288); /*0x10ffff*/ + __m128i currentmax = __lsx_vldi(0x0); + __m128i currentoffsetmax = __lsx_vldi(0x0); + + while (input + 4 < end) { + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + currentmax = __lsx_vmax_wu(in, currentmax); + // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF + currentoffsetmax = + __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax); + + input += 4; + } + + __m128i is_zero = + __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax); + if (__lsx_bnz_v(is_zero)) { + return nullptr; + } + + is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (__lsx_bnz_v(is_zero)) { + return nullptr; + } + + return input; +} + +const result lsx_validate_utf32le_with_errors(const char32_t *input, + size_t size) { + const char32_t *start = input; + const char32_t *end = input + size; + + __m128i offset = __lsx_vreplgr2vr_w(uint32_t(0xffff2000)); + __m128i standardoffsetmax = __lsx_vreplgr2vr_w(uint32_t(0xfffff7ff)); + __m128i standardmax = __lsx_vldi(-2288); /*0x10ffff*/ + __m128i currentmax = __lsx_vldi(0x0); + __m128i currentoffsetmax = __lsx_vldi(0x0); + + while (input + 4 < end) { + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + currentmax = __lsx_vmax_wu(in, currentmax); + currentoffsetmax = + __lsx_vmax_wu(__lsx_vadd_w(in, offset), currentoffsetmax); + + __m128i is_zero = + __lsx_vxor_v(__lsx_vmax_wu(currentmax, standardmax), standardmax); + if (__lsx_bnz_v(is_zero)) { + return result(error_code::TOO_LARGE, input - start); + } + + is_zero = __lsx_vxor_v(__lsx_vmax_wu(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (__lsx_bnz_v(is_zero)) { + return result(error_code::SURROGATE, input - start); + } + + input += 4; + } + + return result(error_code::SUCCESS, input - start); +} +/* end file src/lsx/lsx_validate_utf32le.cpp */ + +/* begin file src/lsx/lsx_convert_latin1_to_utf8.cpp */ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ + +std::pair +lsx_convert_latin1_to_utf8(const char *latin1_input, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char *end = latin1_input + len; + + __m128i zero = __lsx_vldi(0); + // We always write 16 bytes, of which more than the first 8 bytes + // are valid. A safety margin of 8 is more than sufficient. + while (latin1_input + 16 <= end) { + __m128i in8 = __lsx_vld(reinterpret_cast(latin1_input), 0); + uint32_t ascii = __lsx_vpickve2gr_hu(__lsx_vmskgez_b(in8), 0); + if (ascii == 0xffff) { // ASCII fast path!!!! + __lsx_vst(in8, utf8_output, 0); + utf8_output += 16; + latin1_input += 16; + continue; + } + // We just fallback on UTF-16 code. This could be optimized/simplified + // further. + __m128i in16 = __lsx_vilvl_b(zero, in8); + // 1. 
prepare 2-byte values + // input 8-bit word : [aabb|bbbb] x 8 + // expected output : [1100|00aa|10bb|bbbb] x 8 + // t0 = [0000|00aa|bbbb|bb00] + __m128i t0 = __lsx_vslli_h(in16, 2); + // t1 = [0000|00aa|0000|0000] + __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785)); + // t3 = [0000|00aa|00bb|bbbb] + __m128i t2 = __lsx_vbitsel_v(t1, in16, __lsx_vrepli_h(0x3f)); + // t4 = [1100|00aa|10bb|bbbb] + __m128i t3 = __lsx_vor_v(t2, __lsx_vreplgr2vr_h(uint16_t(0xc080))); + // merge ASCII and 2-byte codewords + __m128i one_byte_bytemask = __lsx_vsle_hu(in16, __lsx_vrepli_h(0x7F)); + __m128i utf8_unpacked = __lsx_vbitsel_v(t3, in16, one_byte_bytemask); + + const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lsx_1_2_utf8_bytes_mask[(ascii & 0xff)]][0]; + __m128i shuffle = __lsx_vld(row + 1, 0); + __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); + + // store bytes + __lsx_vst(utf8_packed, utf8_output, 0); + // adjust pointers + latin1_input += 8; + utf8_output += row[0]; + + } // while + + return std::make_pair(latin1_input, reinterpret_cast(utf8_output)); +} +/* end file src/lsx/lsx_convert_latin1_to_utf8.cpp */ +/* begin file src/lsx/lsx_convert_latin1_to_utf16.cpp */ +std::pair +lsx_convert_latin1_to_utf16le(const char *buf, size_t len, + char16_t *utf16_output) { + const char *end = buf + len; + + __m128i zero = __lsx_vldi(0); + while (buf + 16 <= end) { + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); + + __m128i inlow = __lsx_vilvl_b(zero, in8); + __m128i inhigh = __lsx_vilvh_b(zero, in8); + __lsx_vst(inlow, reinterpret_cast(utf16_output), 0); + __lsx_vst(inhigh, reinterpret_cast(utf16_output), 16); + + utf16_output += 16; + buf += 16; + } + + return std::make_pair(buf, utf16_output); } -simdutf_warn_unused result implementation::base64_to_binary( - const char16_t *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); -} +std::pair +lsx_convert_latin1_to_utf16be(const char *buf, size_t len, + char16_t *utf16_output) { + const char *end = buf + len; + __m128i zero = __lsx_vldi(0); + while (buf + 16 <= end) { + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); + + __m128i inlow = __lsx_vilvl_b(in8, zero); + __m128i inhigh = __lsx_vilvh_b(in8, zero); + __lsx_vst(inlow, reinterpret_cast(utf16_output), 0); + __lsx_vst(inhigh, reinterpret_cast(utf16_output), 16); + utf16_output += 16; + buf += 16; + } -simdutf_warn_unused full_result implementation::base64_to_binary_details( - const char16_t *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept { - return (options & base64_url) - ? 
compress_decode_base64(output, input, length, options, - last_chunk_options) - : compress_decode_base64(output, input, length, options, - last_chunk_options); + return std::make_pair(buf, utf16_output); } +/* end file src/lsx/lsx_convert_latin1_to_utf16.cpp */ +/* begin file src/lsx/lsx_convert_latin1_to_utf32.cpp */ +std::pair +lsx_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { + const char *end = buf + len; -simdutf_warn_unused size_t implementation::base64_length_from_binary( - size_t length, base64_options options) const noexcept { - return scalar::base64::base64_length_from_binary(length, options); -} + while (buf + 16 <= end) { + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); -size_t implementation::binary_to_base64(const char *input, size_t length, - char *output, - base64_options options) const noexcept { - if (options & base64_url) { - return encode_base64(output, input, length, options); - } else { - return encode_base64(output, input, length, options); + __m128i zero = __lsx_vldi(0); + __m128i in16low = __lsx_vilvl_b(zero, in8); + __m128i in16high = __lsx_vilvh_b(zero, in8); + __m128i in32_0 = __lsx_vilvl_h(zero, in16low); + __m128i in32_1 = __lsx_vilvh_h(zero, in16low); + __m128i in32_2 = __lsx_vilvl_h(zero, in16high); + __m128i in32_3 = __lsx_vilvh_h(zero, in16high); + + __lsx_vst(in32_0, reinterpret_cast(utf32_output), 0); + __lsx_vst(in32_1, reinterpret_cast(utf32_output + 4), 0); + __lsx_vst(in32_2, reinterpret_cast(utf32_output + 8), 0); + __lsx_vst(in32_3, reinterpret_cast(utf32_output + 12), 0); + + utf32_output += 16; + buf += 16; } -} -} // namespace haswell -} // namespace simdutf -/* begin file src/simdutf/haswell/end.h */ -#if SIMDUTF_CAN_ALWAYS_RUN_HASWELL -// nothing needed. -#else -SIMDUTF_UNTARGET_REGION -#endif + return std::make_pair(buf, utf32_output); +} +/* end file src/lsx/lsx_convert_latin1_to_utf32.cpp */ +/* begin file src/lsx/lsx_convert_utf8_to_utf16.cpp */ +// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 16, usually 12). +template +size_t convert_masked_utf8_to_utf16(const char *input, + uint64_t utf8_end_of_code_point_mask, + char16_t *&utf16_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. -#if SIMDUTF_GCC11ORMORE // workaround for - // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105593 -SIMDUTF_POP_DISABLE_WARNINGS -#endif // end of workaround -/* end file src/simdutf/haswell/end.h */ -/* end file src/haswell/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_PPC64 -/* begin file src/ppc64/implementation.cpp */ + // We first try a few fast paths. + // The obvious first test is ASCII, which actually consumes the full 16. + if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) { + // We process in chunks of 16 bytes + // The routine in simd.h is reused. 
+ simd8 temp{in}; + temp.store_ascii_as_utf16(utf16_output); + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. + } + uint64_t buffer[2]; + // 3 byte sequences are the next most common, as seen in CJK, which has long + // sequences of these. + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte + // UTF-16 code units. + __m128i composed = convert_utf8_3_byte_to_utf16(in); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 4; // We wrote 4 16-bit characters. + return 12; // We consumed 12 bytes. + } + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xAAAA) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte + // UTF-16 code units. + __m128i composed = convert_utf8_2_byte_to_utf16(in); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 6; // We wrote 6 16-bit characters. + return 12; // We consumed 12 bytes. + } -/* begin file src/simdutf/ppc64/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "ppc64" -// #define SIMDUTF_IMPLEMENTATION ppc64 -/* end file src/simdutf/ppc64/begin.h */ -namespace simdutf { -namespace ppc64 { -namespace { -#ifndef SIMDUTF_PPC64_H - #error "ppc64.h must be included" -#endif -using namespace simd; + /// We do not have a fast path available, or the fast path is unimportant, so + /// we fallback. + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; -simdutf_really_inline bool is_ascii(const simd8x64 &input) { - // careful: 0x80 is not ascii. - return input.reduce_or().saturating_sub(0b01111111u).bits_not_set_anywhere(); -} + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + const __m128i zero = __lsx_vldi(0); + if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + __m128i composed = convert_utf8_1_to_2_byte_to_utf16(in, idx); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + // Store + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 6; // We wrote 6 16-bit characters. + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // XXX: depending on the system scalar instructions might be faster. 
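// [Editor's sketch, not part of the patch.] The fast-path constants above come
// from the "end of code point" mask: bit i is set when input byte i is the
// last byte of its code point, which for valid UTF-8 means the next byte is
// not a continuation byte (0b10xxxxxx). Hence the values the fast paths test:
// all-ASCII input gives 0xFFFF, a run of 2-byte sequences gives 0b1010...
// (0xAAA over 12 bits, 0xAAAA over 16), and a run of 3-byte sequences gives
// 0b100100... (0x924 over 12 bits). A scalar model:
#include <cstddef>
#include <cstdint>

inline bool is_utf8_continuation(unsigned char b) { return (b & 0xC0) == 0x80; }

// Computes the mask for up to 16 bytes; bytes past `len` are treated as
// absent, so the final byte of the buffer always ends a code point.
inline uint16_t end_of_code_point_mask(const unsigned char *bytes, size_t len) {
  uint16_t mask = 0;
  for (size_t i = 0; i < len && i < 16; i++) {
    const bool ends_code_point =
        (i + 1 >= len) || !is_utf8_continuation(bytes[i + 1]);
    if (ends_code_point) mask = uint16_t(mask | (1u << i));
  }
  return mask;
}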
+ // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // 1 byte: 00000000 0ccccccc + // 2 byte: xx0bbbbb x0cccccc + // 3 byte: xxbbbbbb x0cccccc + __m128i lowperm = __lsx_vpickev_h(perm, perm); + // 1 byte: 00000000 00000000 + // 2 byte: 00000000 00000000 + // 3 byte: 00000000 1110aaaa + __m128i highperm = __lsx_vpickod_h(perm, perm); + // 3 byte: aaaa0000 00000000 + highperm = __lsx_vslli_h(highperm, 12); + // ASCII + // 1 byte: 00000000 0ccccccc + // 2+byte: 00000000 00cccccc + __m128i ascii = __lsx_vand_v(lowperm, __lsx_vrepli_h(0x7f)); + // 1 byte: 00000000 00000000 + // 2 byte: xx0bbbbb 00000000 + // 3 byte: xxbbbbbb 00000000 + __m128i middlebyte = __lsx_vand_v(lowperm, __lsx_vldi(-2561) /*0xFF00*/); + // 1 byte: 00000000 0ccccccc + // 2 byte: 0010bbbb bbcccccc + // 3 byte: 0010bbbb bbcccccc + __m128i composed = __lsx_vor_v(__lsx_vsrli_h(middlebyte, 2), ascii); -simdutf_unused simdutf_really_inline simd8 -must_be_continuation(const simd8 prev1, const simd8 prev2, - const simd8 prev3) { - simd8 is_second_byte = - prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0 - simd8 is_third_byte = - prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0 - simd8 is_fourth_byte = - prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0 - // Caller requires a bool (all 1's). All values resulting from the subtraction - // will be <= 64, so signed comparison is fine. - return simd8(is_second_byte | is_third_byte | is_fourth_byte) > - int8_t(0); -} + __m128i v0fff = __lsx_vreplgr2vr_h(uint16_t(0xfff)); + // aaaabbbb bbcccccc + composed = __lsx_vbitsel_v(highperm, composed, v0fff); -simdutf_really_inline simd8 -must_be_2_3_continuation(const simd8 prev2, - const simd8 prev3) { - simd8 is_third_byte = - prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80 - simd8 is_fourth_byte = - prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80 - // Caller requires a bool (all 1's). All values resulting from the subtraction - // will be <= 64, so signed comparison is fine. - return simd8(is_third_byte | is_fourth_byte); -} + if (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 4; // We wrote 4 16-bit codepoints + return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but + // it is easier when we can assume they are all pairs. This version does + // not use the LUT, but 4 byte sequences are less common and the overhead + // of the extra memory access is less important than the early branch + // overhead in shorter sequences. -/* begin file src/generic/buf_block_reader.h */ -namespace simdutf { -namespace ppc64 { -namespace { + // Swap byte pairs + // 10dddddd 10cccccc|10bbbbbb 11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + __m128i swap = lsx_swap_bytes(in); + // Shift left 2 bits + // cccccc00 dddddd00 xxxxxxxx bbbbbb00 + __m128i shift = __lsx_vslli_b(swap, 2); + // Create a magic number containing the low 2 bits of the trail surrogate + // and all the corrections needed to create the pair. 
UTF-8 4b prefix = + // -0x0000|0xF000 surrogate offset = -0x0000|0x0040 (0x10000 << 6) + // surrogate high = +0x0000|0xD800 + // surrogate low = +0xDC00|0x0000 + // ------------------------------- + // = +0xDC00|0xE7C0 + __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0)); + // Generate unadjusted trail surrogate minus lowest 2 bits + // vec(0000FF00) = __lsx_vldi(-1758) + // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00 + __m128i trail = + __lsx_vbitsel_v(shift, swap, __lsx_vldi(-1758 /*0000FF00*/)); + // Insert low 2 bits of trail surrogate to magic number for later + // 11011100 00000000 11100111 110000cc + __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic); -// Walks through a buffer in block-sized increments, loading the last part with -// spaces -template struct buf_block_reader { -public: - simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); - simdutf_really_inline size_t block_index(); - simdutf_really_inline bool has_full_block() const; - simdutf_really_inline const uint8_t *full_block() const; - /** - * Get the last block, padded with spaces. - * - * There will always be a last block, with at least 1 byte, unless len == 0 - * (in which case this function fills the buffer with spaces and returns 0. In - * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder - * block with STEP_SIZE bytes and no spaces for padding. - * - * @return the number of effective characters in the last block. - */ - simdutf_really_inline size_t get_remainder(uint8_t *dst) const; - simdutf_really_inline void advance(); + // Generate lead surrogate + // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx + // 000000cc ccdddddd|xxxxxxxx xxxxxxxx + __m128i lead = __lsx_vbitsel_v( + __lsx_vsrli_h(__lsx_vand_v(shift, __lsx_vldi(0x3F)), 4), swap, + __lsx_vrepli_h(0x3f /* 0x003f*/)); -private: - const uint8_t *buf; - const size_t len; - const size_t lenminusstep; - size_t idx; -}; + // Blend pairs + // __lsx_vldi(-1741) => vec(0x0000FFFF) + // 000000cc ccdddddd|11110aaa bbbbbb00 + __m128i blend = + __lsx_vbitsel_v(lead, trail, __lsx_vldi(-1741) /* (0x0000FFFF)*4 */); -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char *format_input_text_64(const uint8_t *text) { - static char *buf = - reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - for (size_t i = 0; i < sizeof(simd8x64); i++) { - buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + // Add magic number to finish the result + // 110111CC CCDDDDDD|110110AA BBBBBBCC + __m128i composed = __lsx_vadd_h(blend, magic_with_low_2); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + // __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + __lsx_vst(composed, reinterpret_cast(buffer), 0); + std::memcpy(utf16_output, buffer, 12); + utf16_output += 6; // We 3 32-bit surrogate pairs. + return 12; // We consumed 12 bytes. + } + // 3 1-4 byte sequences + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 3 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // added to fix issue https://github.com/simdutf/simdutf/issues/514 + // We only want to write 2 * 16-bit code units when that is actually what we + // have. 
Unfortunately, we cannot trust the input. So it is possible to get + // 0xff as an input byte and it should not result in a surrogate pair. We + // need to check for that. + uint32_t permbuffer[4]; + __lsx_vst(perm, permbuffer, 0); + // Mask the low and middle bytes + // 00000000 00000000 00000000 0ddddddd + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7f)); + // Because the surrogates need more work, the high surrogate is computed + // first. + __m128i middlehigh = __lsx_vslli_w(perm, 2); + // 00000000 00000000 00cccccc 00000000 + __m128i middlebyte = __lsx_vand_v(perm, __lsx_vldi(-3777) /* 0x00003F00 */); + // Start assembling the sequence. Since the 4th byte is in the same position + // as it would be in a surrogate and there is no dependency, shift left + // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte: + // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx + __m128i ab = + __lsx_vbitsel_v(middlehigh, perm, __lsx_vldi(-1656) /*0xFF000000*/); + // Top 16 bits contains the high ten bits of the surrogate pair before + // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa + // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction + __m128i v_fffc0000 = __lsx_vreplgr2vr_w(uint32_t(0xFFFC0000)); + __m128i abc = __lsx_vbitsel_v(__lsx_vslli_w(middlebyte, 4), ab, v_fffc0000); + // Combine the low 6 or 7 bits by a shift right accumulate + // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct + // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o + // correction + __m128i composed = __lsx_vor_v(ascii, __lsx_vsrli_w(abc, 6)); + // After this is for surrogates + // Blend the low and high surrogates + // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd + __m128i mixed = + __lsx_vbitsel_v(abc, composed, __lsx_vldi(-1741) /*0x0000FFFF*/); + // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits + // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte: + // 11110aaa bbbbbbcc|000000cc ccdddddd + __m128i v_ffff03ff = __lsx_vreplgr2vr_w(uint32_t(0xFFFF03FF)); + __m128i masked_pair = __lsx_vand_v(mixed, v_ffff03ff); + // Correct the remaining UTF-8 prefix, surrogate offset, and add the + // surrogate prefixes in one magic 16-bit addition. similar magic number but + // without the continue byte adjust and halfword swapped UTF-8 4b prefix = + // -0xF000|0x0000 surrogate offset = -0x0040|0x0000 (0x10000 << 6) + // surrogate high = +0xD800|0x0000 + // surrogate low = +0x0000|0xDC00 + // ----------------------------------- + // = +0xE7C0|0xDC00 + __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xE7C0DC00)); + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete + __m128i surrogates = __lsx_vadd_w(masked_pair, magic); + // If the high bit is 1 (s32 less than zero), this needs a surrogate pair + __m128i is_pair = __lsx_vslt_w(perm, zero); + // Select either the 4 byte surrogate pair or the 2 byte solo codepoint + // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD + __m128i selected = __lsx_vbitsel_v(composed, surrogates, is_pair); + // Byte swap if necessary + if (!match_system(big_endian)) { + selected = lsx_swap_bytes(selected); + } + // Attempting to shuffle and store would be complex, just scalarize. + uint32_t buffer_tmp[4]; + __lsx_vst(selected, buffer_tmp, 0); + // Test for the top bit of the surrogate mask. Remove due to issue 514 + // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 
0x80000000 : + // 0x00800000; + for (size_t i = 0; i < 3; i++) { + // Surrogate + // Used to be if (buffer[i] & SURROGATE_MASK) { + // See discussion above. + // patch for issue https://github.com/simdutf/simdutf/issues/514 + if ((permbuffer[i] & 0xf8000000) == 0xf0000000) { + utf16_output[0] = uint16_t(buffer_tmp[i] >> 16); + utf16_output[1] = uint16_t(buffer_tmp[i] & 0xFFFF); + utf16_output += 2; + } else { + utf16_output[0] = uint16_t(buffer_tmp[i] & 0xFFFF); + utf16_output++; + } + } + return consumed; + } else { + // here we know that there is an error but we do not handle errors + return 12; } - buf[sizeof(simd8x64)] = '\0'; - return buf; } +/* end file src/lsx/lsx_convert_utf8_to_utf16.cpp */ +/* begin file src/lsx/lsx_convert_utf8_to_utf32.cpp */ +// Convert up to 12 bytes from utf8 to utf32 using a mask indicating the +// end of the code points. Only the least significant 12 bits of the mask +// are accessed. +// It returns how many bytes were consumed (up to 12). +size_t convert_masked_utf8_to_utf32(const char *input, + uint64_t utf8_end_of_code_point_mask, + char32_t *&utf32_out) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + uint32_t *&utf32_output = reinterpret_cast(utf32_out); + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; + // + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + // + // We first try a few fast paths. + if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) { + // We process in chunks of 16 bytes. + // use fast implementation in src/simdutf/arm64/simd.h + // Ideally the compiler can keep the tables in registers. + simd8 temp{in}; + temp.store_ascii_as_utf32_tbl(utf32_out); + utf32_output += 16; // We wrote 16 32-bit characters. + return 16; // We consumed 16 bytes. + } + __m128i zero = __lsx_vldi(0); + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte + // UTF-32 code units. Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in); + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); -// Routines to print masks and text for debugging bitmask operations -simdutf_unused static char *format_input_text(const simd8x64 &in) { - static char *buf = - reinterpret_cast(malloc(sizeof(simd8x64) + 1)); - in.store(reinterpret_cast(buf)); - for (size_t i = 0; i < sizeof(simd8x64); i++) { - if (buf[i] < ' ') { - buf[i] = '_'; - } + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + utf32_output += 4; // We wrote 4 32-bit characters. + return 12; // We consumed 12 bytes. } - buf[sizeof(simd8x64)] = '\0'; - return buf; -} + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if (input_utf8_end_of_code_point_mask == 0xaaa) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte + // UTF-32 code units. 
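For reference, a minimal scalar sketch of the surrogate-pair emission that the write-out loop above performs, and that the 0xDC00E7C0 / 0xE7C0DC00 magic-number additions fold into a single vector add. append_utf16 is a hypothetical helper for illustration only, not part of simdutf.

#include <cstdint>
#include <vector>

// Hypothetical scalar reference (not part of simdutf): append one code
// point as UTF-16. Code points above 0xFFFF lose the 0x10000 offset and
// gain the 0xD800/0xDC00 surrogate prefixes, which is what the vector
// paths above fold into one 16-bit magic-number addition.
static void append_utf16(uint32_t cp, std::vector<uint16_t> &out) {
  if (cp < 0x10000) {
    out.push_back(uint16_t(cp)); // one code unit (BMP)
  } else {
    uint32_t v = cp - 0x10000;                     // 20 significant bits
    out.push_back(uint16_t(0xD800 | (v >> 10)));   // high surrogate
    out.push_back(uint16_t(0xDC00 | (v & 0x3FF))); // low surrogate
  }
}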
Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in); -simdutf_unused static char *format_mask(uint64_t mask) { - static char *buf = reinterpret_cast(malloc(64 + 1)); - for (size_t i = 0; i < 64; i++) { - buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); + __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16); + + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + __lsx_vst(utf32_high, reinterpret_cast(utf32_output), 16); + utf32_output += 6; + return 12; // We consumed 12 bytes. } - buf[64] = '\0'; - return buf; -} + /// Either no fast path or an unimportant fast path. -template -simdutf_really_inline -buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) - : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 0 : len - STEP_SIZE}, - idx{0} {} + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; -template -simdutf_really_inline size_t buf_block_reader::block_index() { - return idx; + if (idx < 64) { + // SIX (6) input code-code units + // Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx); + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); + __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16); + + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + __lsx_vst(utf32_high, reinterpret_cast(utf32_output), 16); + utf32_output += 6; + return consumed; + } else if (idx < 145) { + // FOUR (4) input code-code units + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // Shuffle + // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // Split + // 00000000 00000000 0ccccccc + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits + // Note: unmasked + // xxxxxxxx aaaaxxxx xxxxxxxx + __m128i high = + __lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits + // Use 16 bit bic instead of and. + // The top bits will be corrected later in the bsl + // 00000000 10bbbbbb 00000000 + __m128i middle = + __lsx_vand_v(perm, __lsx_vldi(-1758 /*0x0000FF00*/)); // 5 or 6 bits + // Combine low and middle with shift right accumulate + // 00000000 00xxbbbb bbcccccc + __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2)); + // Insert top 4 bits from high byte with bitwise select + // 00000000 aaaabbbb bbcccccc + __m128i composed = + __lsx_vbitsel_v(lowmid, high, __lsx_vldi(-3600 /*0x0000F000*/)); + __lsx_vst(composed, utf32_output, 0); + utf32_output += 4; // We wrote 4 32-bit characters. + return consumed; + } else if (idx < 209) { + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-32 code units. This uses the same method as the fixed 3 byte + // version, reversing and shift left insert. However, there is no need for + // a shuffle mask now, just rev16 and rev32. 
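As a scalar reference for the 4-byte case handled in this branch, the sketch below (decode_utf8_4byte is a hypothetical helper, not part of simdutf) shows how the 3 + 6 + 6 + 6 payload bits assemble into the 21-bit result that the vector code builds with byte reversal and shift-insert steps.

#include <cstdint>

// Hypothetical scalar reference (not part of simdutf). Assumes validated
// input. Gathers the payload bits of 11110aaa 10bbbbbb 10cccccc 10dddddd
// into the 21-bit value 00000000 000aaabb bbbbcccc ccdddddd.
static uint32_t decode_utf8_4byte(const unsigned char b[4]) {
  return (uint32_t(b[0] & 0x07) << 18) | (uint32_t(b[1] & 0x3F) << 12) |
         (uint32_t(b[2] & 0x3F) << 6) | uint32_t(b[3] & 0x3F);
}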
+ // + // This version does not use the LUT, but 4 byte sequences are less common + // and the overhead of the extra memory access is less important than the + // early branch overhead in shorter sequences, so it comes last. + + // Swap pairs of bytes + // 10dddddd|10cccccc|10bbbbbb|11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + __m128i swap = lsx_swap_bytes(in); + // Shift left and insert + // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb + __m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap, + __lsx_vrepli_h(0x3f /*0x003F*/)); + // Shift insert again + // xxxxxxxx xxxaaabb bbbbcccc ccdddddd + __m128i merge2 = + __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */ + __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */ + __lsx_vldi(-2545)); /*0x00000FFF*/ + // Clear the garbage + // 00000000 000aaabb bbbbcccc ccdddddd + __m128i composed = __lsx_vand_v(merge2, __lsx_vldi(-2273 /*0x1FFFFF*/)); + // Store + __lsx_vst(composed, utf32_output, 0); + utf32_output += 3; // We wrote 3 32-bit characters. + return 12; // We consumed 12 bytes. + } + // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit + // due to surrogates no longer being involved. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 2 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + + // Ascii + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); + __m128i middle = __lsx_vand_v(perm, __lsx_vldi(-3777 /*0x00003f00*/)); + // 00000000 00000000 0000cccc ccdddddd + __m128i cd = + __lsx_vbitsel_v(__lsx_vsrli_w(middle, 2), ascii, __lsx_vrepli_w(0x3f)); + + __m128i correction = __lsx_vand_v(perm, __lsx_vldi(-3520 /*0x00400000*/)); + __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1)); + // Insert twice + // 00000000 000aaabb bbbbxxxx xxxxxxxx + __m128i corrected_srli2 = + __lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2); + __m128i ab = + __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f)); + ab = __lsx_vsrli_w(ab, 4); + // 00000000 000aaabb bbbbcccc ccdddddd + __m128i composed = + __lsx_vbitsel_v(ab, cd, __lsx_vldi(-2545 /*0x00000FFF*/)); + // Store + __lsx_vst(composed, utf32_output, 0); + utf32_output += 3; // We wrote 3 32-bit characters. + return consumed; + } else { + // here we know that there is an error but we do not handle errors + return 12; + } } +/* end file src/lsx/lsx_convert_utf8_to_utf32.cpp */ +/* begin file src/lsx/lsx_convert_utf8_to_latin1.cpp */ +size_t convert_masked_utf8_to_latin1(const char *input, + uint64_t utf8_end_of_code_point_mask, + char *&latin1_output) { + // we use an approach where we try to process up to 12 input bytes. + // Why 12 input bytes and not 16? Because we are concerned with the size of + // the lookup tables. Also 12 is nicely divisible by two and three. + // + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; + // Optimization note: our main path below is load-latency dependent. Thus it + // is maybe beneficial to have fast paths that depend on branch prediction but + // have less latency. This results in more instructions but, potentially, also + // higher speeds. + + // We first try a few fast paths. 
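As a scalar reference for this Latin-1 routine (decode_utf8_to_latin1 is a hypothetical helper, not part of simdutf): every Latin-1 character arrives either as one ASCII byte or as a two-byte sequence whose lead byte is 0xC2 or 0xC3, so the decoded value always fits in a single output byte.

#include <cstdint>

// Hypothetical scalar reference (not part of simdutf). Assumes validated
// input restricted to U+0000..U+00FF, where the only multi-byte form is
// the two-byte sequence 1100001a 10bbbbbb.
static unsigned char decode_utf8_to_latin1(const unsigned char *p,
                                            int *consumed) {
  if (p[0] < 0x80) { // ASCII
    *consumed = 1;
    return p[0];
  }
  *consumed = 2;
  return (unsigned char)(((p[0] & 0x1F) << 6) | (p[1] & 0x3F));
}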
+ // The obvious first test is ASCII, which actually consumes the full 16. + if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) { + // We process in chunks of 16 bytes + __lsx_vst(in, reinterpret_cast(latin1_output), 0); + latin1_output += 16; // We wrote 16 18-bit characters. + return 16; // We consumed 16 bytes. + } + /// We do not have a fast path available, or the fast path is unimportant, so + /// we fallback. + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + // this indicates an invalid input: + if (idx >= 64) { + return consumed; + } + // Here we should have (idx < 64), if not, there is a bug in the validation or + // elsewhere. SIX (6) input code-code units this is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. Converts 6 + // 1-2 byte UTF-8 characters to 6 UTF-16 characters. This is a relatively easy + // scenario we process SIX (6) input code-code units. The max length in bytes + // of six code code units spanning between 1 and 2 bytes each is 12 bytes. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh); + // ascii mask + // 1 byte: 11111111 11111111 + // 2 byte: 00000000 00000000 + __m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80)); + // utf8 mask + // 1 byte: 00000000 00000000 + // 2 byte: 00111111 00111111 + __m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm), + __lsx_vldi(0b00111111)); + // mask + // 1 byte: 11111111 11111111 + // 2 byte: 00111111 00111111 + __m128i mask = __lsx_vor_v(utf8_mask, ascii_mask); + + __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask); + // writing 8 bytes even though we only care about the first 6 bytes. + __m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed); -template -simdutf_really_inline bool buf_block_reader::has_full_block() const { - return idx < lenminusstep; + uint64_t buffer[2]; + // __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + __lsx_vst(latin1_packed, reinterpret_cast(buffer), 0); + std::memcpy(latin1_output, buffer, 6); + latin1_output += 6; // We wrote 6 bytes. + return consumed; } +/* end file src/lsx/lsx_convert_utf8_to_latin1.cpp */ -template -simdutf_really_inline const uint8_t * -buf_block_reader::full_block() const { - return &buf[idx]; +/* begin file src/lsx/lsx_convert_utf16_to_latin1.cpp */ +template +std::pair +lsx_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *end = buf + len; + while (buf + 16 <= end) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); + if (!match_system(big_endian)) { + in = lsx_swap_bytes(in); + in1 = lsx_swap_bytes(in1); + } + if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) { + // 1. pack the bytes + __m128i latin1_packed = __lsx_vpickev_b(in1, in); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. 
adjust pointers + buf += 16; + latin1_output += 16; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); } -template -simdutf_really_inline size_t -buf_block_reader::get_remainder(uint8_t *dst) const { - if (len == idx) { - return 0; - } // memcpy(dst, null, 0) will trigger an error with some sanitizers - std::memset(dst, 0x20, - STEP_SIZE); // std::memset STEP_SIZE because it is more efficient - // to write out 8 or 16 bytes at once. - std::memcpy(dst, buf + idx, len - idx); - return len - idx; +template +std::pair +lsx_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { + const char16_t *start = buf; + const char16_t *end = buf + len; + while (buf + 16 <= end) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); + if (!match_system(big_endian)) { + in = lsx_swap_bytes(in); + in1 = lsx_swap_bytes(in1); + } + if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) { + // 1. pack the bytes + __m128i latin1_packed = __lsx_vpickev_b(in1, in); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 16; + latin1_output += 16; + } else { + // Let us do a scalar fallback. + for (int k = 0; k < 16; k++) { + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } + } + } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); } +/* end file src/lsx/lsx_convert_utf16_to_latin1.cpp */ +/* begin file src/lsx/lsx_convert_utf16_to_utf8.cpp */ +/* + The vectorized algorithm works on single SSE register i.e., it + loads eight 16-bit code units. -template -simdutf_really_inline void buf_block_reader::advance() { - idx += STEP_SIZE; -} + We consider three cases: + 1. an input register contains no surrogates and each value + is in range 0x0000 .. 0x07ff. + 2. an input register contains no surrogates and values are + is in range 0x0000 .. 0xffff. + 3. an input register contains surrogates --- i.e. codepoints + can have 16 or 32 bits. -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/buf_block_reader.h */ -/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8_validation { + Ad 1. -using namespace simd; + When values are less than 0x0800, it means that a 16-bit code unit + can be converted into: 1) single UTF8 byte (when it's an ASCII + char) or 2) two UTF8 bytes. 
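For the three cases listed above, a minimal scalar sketch of the byte layouts that a non-surrogate 16-bit code unit must produce (encode_bmp_utf8 is a hypothetical helper, not part of simdutf); the vector code below builds the same patterns with masks and a table-driven compaction.

#include <cstdint>

// Hypothetical scalar reference (not part of simdutf) for cases 1 and 2:
// a non-surrogate 16-bit code unit becomes one, two or three UTF-8 bytes.
static int encode_bmp_utf8(uint16_t w, unsigned char out[3]) {
  if (w < 0x80) {  // one ASCII byte
    out[0] = (unsigned char)w;
    return 1;
  }
  if (w < 0x800) { // [110a|aaaa] [10bb|bbbb]
    out[0] = (unsigned char)(0xC0 | (w >> 6));
    out[1] = (unsigned char)(0x80 | (w & 0x3F));
    return 2;
  }
  out[0] = (unsigned char)(0xE0 | (w >> 12)); // [1110|aaaa] [10bb|bbbb] [10cc|cccc]
  out[1] = (unsigned char)(0x80 | ((w >> 6) & 0x3F));
  out[2] = (unsigned char)(0x80 | (w & 0x3F));
  return 3;
}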
-simdutf_really_inline simd8 -check_special_cases(const simd8 input, const simd8 prev1) { - // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) - // Bit 1 = Too Long (ASCII followed by continuation) - // Bit 2 = Overlong 3-byte - // Bit 4 = Surrogate - // Bit 5 = Overlong 2-byte - // Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + For this case we do only some shuffle to obtain these 2-byte + codes and finally compress the whole SSE register with a single + shuffle. - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); - constexpr const uint8_t CARRY = - TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . - const simd8 byte_1_low = - (prev1 & 0x0F) - .lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, CARRY, + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + Ad 2. - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, + When values fit in 16-bit code units, but are above 0x07ff, then + a single word may produce one, two or three UTF8 bytes. - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | - OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + We prepare data for all these three cases in two registers. 
+ The first register contains lower two UTF8 bytes (used in all + cases), while the second one contains just the third byte for + the three-UTF8-bytes case. - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); - return (byte_1_high & byte_1_low & byte_2_high); -} -simdutf_really_inline simd8 -check_multibyte_lengths(const simd8 input, - const simd8 prev_input, - const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = - simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; -} + Finally these two registers are interleaved forming eight-element + array of 32-bit values. The array spans two SSE registers. + The bytes from the registers are compressed using two shuffles. -// -// Return nonzero if there are incomplete multibyte characters at the end of the -// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. -// -simdutf_really_inline simd8 is_incomplete(const simd8 input) { - // If the previous input's last 3 bytes match this, they're too short (they - // ended at EOF): - // ... 1111____ 111_____ 11______ - static const uint8_t max_array[32] = {255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 255, - 0b11110000u - 1, - 0b11100000u - 1, - 0b11000000u - 1}; - const simd8 max_value( - &max_array[sizeof(max_array) - sizeof(simd8)]); - return input.gt_bits(max_value); -} + We need 256-entry lookup table to get a compression pattern + and the number of output bytes in the compressed vector register. + Each entry occupies 17 bytes. -struct utf8_checker { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; - // The last input we received - simd8 prev_input_block; - // Whether the last input we received was incomplete (used for ASCII fast - // path) - simd8 prev_incomplete; - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, - const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ - // lead bytes (2, 3, 4-byte leads become large positive numbers instead of - // small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } + To summarize: + - We need two 256-entry tables that have 8704 bytes in total. +*/ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry on the conversion of the tail. +*/ +template +std::pair +lsx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char16_t *end = buf + len; - // The only problem that can happen at EOF is that a multibyte character is - // too short or a byte value too large in the last bytes: check_special_cases - // only checks for bytes too large in the first of two bytes. - simdutf_really_inline void check_eof() { - // If the previous block had incomplete UTF-8 characters at the end, an - // ASCII block can't possibly finish them. 
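A small, hypothetical consistency check (not part of simdutf) for the two-byte packing used in case 1: combining 0xC080 with ((w << 2) & 0x1F00) and (w & 0x3F), as the vector code does with its masks and constants, places 110aaaaa in the high byte and 10bbbbbb in the low byte, matching the straightforward per-byte encoding.

#include <cassert>
#include <cstdint>

// Hypothetical check (not part of simdutf): one 16-bit lane holds
// [110a|aaaa|10bb|bbbb] built from w = [0000|0aaa|aabb|bbbb].
static uint16_t two_byte_lane(uint16_t w) {
  return uint16_t(0xC080 | ((w << 2) & 0x1F00) | (w & 0x3F));
}

int main() {
  for (uint16_t w = 0x80; w < 0x800; w++) {
    uint16_t lane = two_byte_lane(w);
    assert((lane >> 8) == (0xC0 | (w >> 6)));     // lead byte 110aaaaa
    assert((lane & 0xFF) == (0x80 | (w & 0x3F))); // continuation 10bbbbbb
  }
  return 0;
}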
- this->error |= this->prev_incomplete; - } + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 - simdutf_really_inline void check_next_input(const simd8x64 &input) { - if (simdutf_likely(is_ascii(input))) { - this->error |= this->prev_incomplete; - } else { - // you might think that a for-loop would work, but under Visual Studio, it - // is not good enough. - static_assert((simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], this->prev_input_block); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7ff)); + while (buf + 16 + safety_margin <= end) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + if (!match_system(big_endian)) { + in = lsx_swap_bytes(in); + } + if (__lsx_bz_v( + __lsx_vslt_hu(__lsx_vrepli_h(0x7F), in))) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII + // characters. + __m128i nextin = __lsx_vld(reinterpret_cast(buf), 16); + if (!match_system(big_endian)) { + nextin = lsx_swap_bytes(nextin); + } + if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), nextin))) { + // 1. pack the bytes + // obviously suboptimal. + __m128i utf8_packed = __lsx_vpickev_b(nextin, in); + // 2. store (16 bytes) + __lsx_vst(utf8_packed, utf8_output, 0); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } else { + // 1. pack the bytes + // obviously suboptimal. + __m128i utf8_packed = __lsx_vpickev_b(in, in); + // 2. store (8 bytes) + __lsx_vst(utf8_packed, utf8_output, 0); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; } - this->prev_incomplete = - is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); - this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; } - } - // do not forget to call check_eof! - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } + __m128i zero = __lsx_vldi(0); + if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, in))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + // t0 = [000a|aaaa|bbbb|bb00] + __m128i t0 = __lsx_vslli_h(in, 2); + // t1 = [000a|aaaa|0000|0000] + __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + // t2 = [0000|0000|00bb|bbbb] + __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + __m128i t3 = __lsx_vor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xc080)); + __m128i t4 = __lsx_vor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m128i one_byte_bytemask = + __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F /*0x007F*/)); + __m128i utf8_unpacked = __lsx_vbitsel_v(t4, in, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0); + // 4. 
pack the bytes + const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lsx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle = __lsx_vld(row, 1); + __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); + // 5. store bytes + __lsx_vst(utf8_packed, utf8_output, 0); + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } + __m128i surrogates_bytemask = + __lsx_vseq_h(__lsx_vand_v(in, __lsx_vldi(-2568 /*0xF800*/)), + __lsx_vldi(-2600 /*0xD800*/)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (__lsx_bz_v(surrogates_bytemask)) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes -}; // struct utf8_checker -} // namespace utf8_validation + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. -using utf8_validation::utf8_checker; + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ -/* begin file src/generic/utf8_validation/utf8_validator.h */ -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8_validation { + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. -/** - * Validates that the string is actual UTF-8. - */ -template -bool generic_validate_utf8(const uint8_t *input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - reader.advance(); - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - return !c.errors(); -} + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m128i t0 = __lsx_vpickev_b(in, in); + t0 = __lsx_vilvl_b(t0, t0); -bool generic_validate_utf8(const char *input, size_t length) { - return generic_validate_utf8( - reinterpret_cast(input), length); -} + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|00cc|cccc] + __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); + __m128i t1 = __lsx_vand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/)); -/** - * Validates that the string is actual UTF-8 and stops on errors. 
- */ -template -result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { - checker c{}; - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - c.check_next_input(in); - if (c.errors()) { - if (count != 0) { - count--; - } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors( - reinterpret_cast(input), - reinterpret_cast(input + count), length - count); - res.count += count; - return res; - } - reader.advance(); - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - c.check_next_input(in); - reader.advance(); - c.check_eof(); - if (c.errors()) { - if (count != 0) { - count--; - } // Sometimes the error is only detected in the next chunk - result res = scalar::utf8::rewind_and_validate_with_errors( - reinterpret_cast(input), - reinterpret_cast(input) + count, length - count); - res.count += count; - return res; - } else { - return result(error_code::SUCCESS, length); - } -} + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m128i s0 = __lsx_vsrli_h(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m128i s1 = __lsx_vslli_h(in, 2); + // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] + s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); -result generic_validate_utf8_with_errors(const char *input, size_t length) { - return generic_validate_utf8_with_errors( - reinterpret_cast(input), length); + // [00bb|bbbb|0000|aaaa] + __m128i s2 = __lsx_vor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); + __m128i s3 = __lsx_vor_v(s2, v_c0e0); + __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff); + __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, + __lsx_vldi(-2752 /*0x4000*/)); + __m128i s4 = __lsx_vxor_v(s3, m0); + + // 4. expand code units 16-bit => 32-bit + __m128i out0 = __lsx_vilvl_h(s4, t2); + __m128i out1 = __lsx_vilvh_h(s4, t2); + + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m128i one_byte_bytemask = __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F)); + + __m128i one_or_two_bytes_bytemask_low = + __lsx_vilvl_h(one_or_two_bytes_bytemask, zero); + __m128i one_or_two_bytes_bytemask_high = + __lsx_vilvh_h(one_or_two_bytes_bytemask, zero); + + __m128i one_byte_bytemask_low = + __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask); + __m128i one_byte_bytemask_high = + __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask); + + const uint32_t mask0 = __lsx_vpickve2gr_bu( + __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_low, + one_byte_bytemask_low)), + 0); + const uint32_t mask1 = __lsx_vpickve2gr_bu( + __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_high, + one_byte_bytemask_high)), + 0); + + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0); + + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1); + + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; + + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. 
+ // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } + } + buf += k; + } + } // while + return std::make_pair(buf, reinterpret_cast(utf8_output)); } -template -bool generic_validate_ascii(const uint8_t *input, size_t length) { - buf_block_reader<64> reader(input, length); - uint8_t blocks[64]{}; - simd::simd8x64 running_or(blocks); - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - running_or |= in; - reader.advance(); - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - running_or |= in; - return running_or.is_ascii(); -} +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ +template +std::pair +lsx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char16_t *start = buf; + const char16_t *end = buf + len; -bool generic_validate_ascii(const char *input, size_t length) { - return generic_validate_ascii( - reinterpret_cast(input), length); -} + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + while (buf + 16 + safety_margin <= end) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + if (!match_system(big_endian)) { + in = lsx_swap_bytes(in); + } + if (__lsx_bz_v( + __lsx_vslt_hu(__lsx_vrepli_h(0x7F), in))) { // ASCII fast path!!!! + // It is common enough that we have sequences of 16 consecutive ASCII + // characters. + __m128i nextin = __lsx_vld(reinterpret_cast(buf), 16); + if (!match_system(big_endian)) { + nextin = lsx_swap_bytes(nextin); + } + if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), nextin))) { + // 1. pack the bytes + // obviously suboptimal. + __m128i utf8_packed = __lsx_vpickev_b(nextin, in); + // 2. store (16 bytes) + __lsx_vst(utf8_packed, utf8_output, 0); + // 3. 
adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! + } else { + // 1. pack the bytes + // obviously suboptimal. + __m128i utf8_packed = __lsx_vpickev_b(in, in); + // 2. store (8 bytes) + __lsx_vst(utf8_packed, utf8_output, 0); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + in = nextin; + } + } -template -result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) { - buf_block_reader<64> reader(input, length); - size_t count{0}; - while (reader.has_full_block()) { - simd::simd8x64 in(reader.full_block()); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors( - reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); + __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7ff)); + __m128i zero = __lsx_vldi(0); + if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, in))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + // t0 = [000a|aaaa|bbbb|bb00] + __m128i t0 = __lsx_vslli_h(in, 2); + // t1 = [000a|aaaa|0000|0000] + __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + // t2 = [0000|0000|00bb|bbbb] + __m128i t2 = __lsx_vand_v(in, __lsx_vrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + __m128i t3 = __lsx_vor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xc080)); + __m128i t4 = __lsx_vor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m128i one_byte_bytemask = + __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F /*0x007F*/)); + __m128i utf8_unpacked = __lsx_vbitsel_v(t4, in, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + uint32_t m2 = __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0); + // 4. pack the bytes + const uint8_t *row = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lsx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle = __lsx_vld(row, 1); + __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); + // 5. store bytes + __lsx_vst(utf8_packed, utf8_output, 0); + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; } - reader.advance(); + __m128i surrogates_bytemask = + __lsx_vseq_h(__lsx_vand_v(in, __lsx_vldi(-2568 /*0xF800*/)), + __lsx_vldi(-2600 /*0xD800*/)); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (__lsx_bz_v(surrogates_bytemask)) { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes - count += 64; - } - uint8_t block[64]{}; - reader.get_remainder(block); - simd::simd8x64 in(block); - if (!in.is_ascii()) { - result res = scalar::ascii::validate_with_errors( - reinterpret_cast(input + count), length - count); - return result(res.error, count + res.count); - } else { - return result(error_code::SUCCESS, length); - } -} + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. 
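For the surrogate branch handled by the scalar fallbacks in these routines, a minimal sketch (hypothetical helpers, not part of simdutf) of the pair validation that the (diff | diff2) > 0x3FF test performs, and of the four-byte UTF-8 output a valid pair produces.

#include <cstdint>

// Hypothetical helper (not part of simdutf): both offsets fit in 10 bits
// exactly when hi is a high surrogate (0xD800..0xDBFF) and lo is a low
// surrogate (0xDC00..0xDFFF), so one OR-and-compare validates the pair.
static bool is_valid_surrogate_pair(uint16_t hi, uint16_t lo) {
  uint16_t diff = uint16_t(hi - 0xD800);
  uint16_t diff2 = uint16_t(lo - 0xDC00);
  return (diff | diff2) <= 0x3FF;
}

// Hypothetical helper: decode the pair and emit the 4-byte UTF-8 form,
// mirroring the scalar fallback above.
static int encode_surrogate_pair(uint16_t hi, uint16_t lo,
                                 unsigned char out[4]) {
  uint32_t value = (uint32_t(uint16_t(hi - 0xD800)) << 10) +
                   uint16_t(lo - 0xDC00) + 0x10000;
  out[0] = (unsigned char)(0xF0 | (value >> 18));
  out[1] = (unsigned char)(0x80 | ((value >> 12) & 0x3F));
  out[2] = (unsigned char)(0x80 | ((value >> 6) & 0x3F));
  out[3] = (unsigned char)(0x80 | (value & 0x3F));
  return 4;
}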
-result generic_validate_ascii_with_errors(const char *input, size_t length) { - return generic_validate_ascii_with_errors( - reinterpret_cast(input), length); -} + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. -} // namespace utf8_validation -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_validator.h */ -// transcoding from UTF-8 to UTF-16 -/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8_to_utf16 { -using namespace simd; + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m128i t0 = __lsx_vpickev_b(in, in); + t0 = __lsx_vilvl_b(t0, t0); -simdutf_really_inline simd8 -check_special_cases(const simd8 input, const simd8 prev1) { - // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) - // Bit 1 = Too Long (ASCII followed by continuation) - // Bit 2 = Overlong 3-byte - // Bit 4 = Surrogate - // Bit 5 = Overlong 2-byte - // Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|00cc|cccc] + __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); + __m128i t1 = __lsx_vand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688)); - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); - constexpr const uint8_t CARRY = - TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
- const simd8 byte_1_low = - (prev1 & 0x0F) - .lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, CARRY, + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m128i s0 = __lsx_vsrli_h(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m128i s1 = __lsx_vslli_h(in, 2); + // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] + s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + // [00bb|bbbb|0000|aaaa] + __m128i s2 = __lsx_vor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); + __m128i s3 = __lsx_vor_v(s2, v_c0e0); + __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(in, v_07ff); + __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, + __lsx_vldi(-2752 /*0x4000*/)); + __m128i s4 = __lsx_vxor_v(s3, m0); - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, + // 4. expand code units 16-bit => 32-bit + __m128i out0 = __lsx_vilvl_h(s4, t2); + __m128i out1 = __lsx_vilvh_h(s4, t2); - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | - OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m128i one_byte_bytemask = __lsx_vsle_hu(in, __lsx_vrepli_h(0x7F)); + + __m128i one_or_two_bytes_bytemask_low = + __lsx_vilvl_h(one_or_two_bytes_bytemask, zero); + __m128i one_or_two_bytes_bytemask_high = + __lsx_vilvh_h(one_or_two_bytes_bytemask, zero); + + __m128i one_byte_bytemask_low = + __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask); + __m128i one_byte_bytemask_high = + __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask); + + const uint32_t mask0 = __lsx_vpickve2gr_bu( + __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_low, + one_byte_bytemask_low)), + 0); + const uint32_t mask1 = __lsx_vpickve2gr_bu( + __lsx_vmskltz_h(__lsx_vor_v(one_or_two_bytes_bytemask_high, + one_byte_bytemask_high)), + 0); - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); - return (byte_1_high & byte_1_low & byte_2_high); -} -simdutf_really_inline simd8 -check_multibyte_lengths(const simd8 input, - const simd8 prev_input, - const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = - simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; -} + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0); -struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1); - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. - // - simdutf_really_inline void check_utf8_bytes(const simd8 input, - const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ - // lead bytes (2, 3, 4-byte leads become large positive numbers instead of - // small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; - template - simdutf_really_inline size_t convert(const char *in, size_t size, - char16_t *utf16_output) { - size_t pos = 0; - char16_t *start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the - // last 16 bytes, and if the data is valid, then it is entirely safe because - // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot - // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 8 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the eight last - // leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! 
- while (pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, - // it is not good enough. - static_assert( - (simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - if (utf8_continuation_mask & 1) { - return 0; // error - } - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16( - in + pos, utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); } - } - if (errors()) { - return 0; - } - if (pos < size) { - size_t howmany = scalar::utf8_to_utf16::convert( - in + pos, size - pos, utf16_output); - if (howmany == 0) { - return 0; + for (; k < forward; k++) { + uint16_t word = !match_system(big_endian) + ? 
scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if ((word & 0xFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xF800) != 0xD800) { + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + reinterpret_cast(utf8_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf8_output++ = char((value >> 18) | 0b11110000); + *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((value & 0b111111) | 0b10000000); + } } - utf16_output += howmany; + buf += k; } - return utf16_output - start; - } + } // while - template - simdutf_really_inline result convert_with_errors(const char *in, size_t size, - char16_t *utf16_output) { - size_t pos = 0; - char16_t *start{utf16_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the - // last 16 bytes, and if the data is valid, then it is entirely safe because - // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot - // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 8 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > -65); + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); +} +/* end file src/lsx/lsx_convert_utf16_to_utf8.cpp */ +/* begin file src/lsx/lsx_convert_utf16_to_utf32.cpp */ +template +std::pair +lsx_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + const char16_t *end = buf + len; + + __m128i zero = __lsx_vldi(0); + __m128i v_f800 = __lsx_vldi(-2568); /*0xF800*/ + __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + + while (buf + 8 <= end) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + if (!match_system(big_endian)) { + in = lsx_swap_bytes(in); } - // If the input is long enough, then we have that margin-1 is the eight last - // leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, - // it is not good enough. 
- static_assert( - (simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - if (errors() || (utf8_continuation_mask & 1)) { - // rewind_and_convert_with_errors will seek a potential error from - // in+pos onward, with the ability to go back up to pos bytes, and - // read size-pos bytes forward. - result res = - scalar::utf8_to_utf16::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; - } - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16( - in + pos, utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + + __m128i surrogates_bytemask = + __lsx_vseq_h(__lsx_vand_v(in, v_f800), v_d800); + // It might seem like checking for surrogates_bitmask == 0xc000 could help. + // However, it is likely an uncommon occurrence. + if (__lsx_bz_v(surrogates_bytemask)) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code + // units + __lsx_vst(__lsx_vilvl_h(zero, in), utf32_output, 0); + __lsx_vst(__lsx_vilvh_h(zero, in), utf32_output, 16); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) + ? 
scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. } + buf += k; } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos - // onward, with the ability to go back up to pos bytes, and read size-pos - // bytes forward. - result res = - scalar::utf8_to_utf16::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf16_output); - res.count += pos; - return res; + } // while + return std::make_pair(buf, reinterpret_cast(utf32_output)); +} + +/* + Returns a pair: a result struct and utf8_output. + If there is an error, the count field of the result is the position of the + error. Otherwise, it is the position of the first unprocessed byte in buf + (even if finished). A scalar routing should carry on the conversion of the + tail if needed. +*/ +template +std::pair +lsx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); + const char16_t *start = buf; + const char16_t *end = buf + len; + + __m128i zero = __lsx_vldi(0); + __m128i v_f800 = __lsx_vldi(-2568); /*0xF800*/ + __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + + while (buf + 8 <= end) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + if (!match_system(big_endian)) { + in = lsx_swap_bytes(in); } - if (pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos - // onward, with the ability to go back up to pos bytes, and read size-pos - // bytes forward. - result res = - scalar::utf8_to_utf16::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf16_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf16_output += res.count; + + __m128i surrogates_bytemask = + __lsx_vseq_h(__lsx_vand_v(in, v_f800), v_d800); + if (__lsx_bz_v(surrogates_bytemask)) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code + // units + __lsx_vst(__lsx_vilvl_h(zero, in), utf32_output, 0); + __lsx_vst(__lsx_vilvh_h(zero, in), utf32_output, 16); + utf32_output += 8; + buf += 8; + // surrogate pair(s) in a register + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. + size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + } else { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) + ? 
scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; + k++; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k - 1), + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + } } + buf += k; } - return result(error_code::SUCCESS, utf16_output - start); - } + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf32_output)); +} +/* end file src/lsx/lsx_convert_utf16_to_utf32.cpp */ - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } +/* begin file src/lsx/lsx_convert_utf32_to_latin1.cpp */ +std::pair +lsx_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; + __m128i v_ff = __lsx_vrepli_w(0xFF); -}; // struct utf8_checker -} // namespace utf8_to_utf16 -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + while (buf + 16 <= end) { + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in2 = __lsx_vld(reinterpret_cast(buf), 16); -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8_to_utf16 { + __m128i in12 = __lsx_vor_v(in1, in2); + if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) { + // 1. pack the bytes + __m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 8; + latin1_output += 8; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); + } + } // while + return std::make_pair(buf, latin1_output); +} -using namespace simd; +std::pair +lsx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *start = buf; + const char32_t *end = buf + len; -template -simdutf_warn_unused size_t convert_valid(const char *input, size_t size, - char16_t *utf16_output) noexcept { - // The implementation is not specific to haswell and should be moved to the - // generic directory. - size_t pos = 0; - char16_t *start{utf16_output}; - const size_t safety_margin = 16; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - // this loop could be unrolled further. For example, we could process the - // mask far more than 64 bytes. - simd8x64 in(reinterpret_cast(input + pos)); - if (in.is_ascii()) { - in.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; + const v16u8 shuf_mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}; + __m128i v_ff = __lsx_vrepli_w(0xFF); + + while (buf + 16 <= end) { + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in2 = __lsx_vld(reinterpret_cast(buf), 16); + + __m128i in12 = __lsx_vor_v(in1, in2); + + if (__lsx_bz_v(__lsx_vslt_wu(v_ff, in12))) { + // 1. pack the bytes + __m128i latin1_packed = __lsx_vshuf_b(in2, in1, (__m128i)shuf_mask); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 8; + latin1_output += 8; } else { - // Slow path. We hope that the compiler will recognize that this is a slow - // path. 
Anything that is not a continuation mask is a 'leading byte', - // that is, the start of a new code point. - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - // -65 is 0b10111111 in two-complement's, so largest possible continuation - // byte - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - // The *start* of code points is not so useful, rather, we want the *end* - // of code points. - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times when using solely - // the slow/regular path, and at least four times if there are fast paths. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - // - // Thus we may allow convert_masked_utf8_to_utf16 to process - // more bytes at a time under a fast-path mode where 16 bytes - // are consumed at once (e.g., when encountering ASCII). - size_t consumed = convert_masked_utf8_to_utf16( - input + pos, utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + // Let us do a scalar fallback. + for (int k = 0; k < 8; k++) { + uint32_t word = buf[k]; + if (word <= 0xff) { + *latin1_output++ = char(word); + } else { + return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), + latin1_output); + } } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. 
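// Illustrative sketch (not part of the simdutf patch above): the scalar
// fallback of lsx_convert_utf32_to_latin1_with_errors narrows every code
// point that fits in one byte and reports the index of the first one that
// does not (error_code::TOO_LARGE in the diff). A minimal standalone
// equivalent of that per-word check (hypothetical helper, not simdutf API):
#include <cstddef>
#include <cstdint>
static inline bool utf32_to_latin1_scalar(const char32_t *src, size_t len,
                                          char *dst, size_t *err_pos) {
  for (size_t i = 0; i < len; i++) {
    uint32_t word = uint32_t(src[i]);
    if (word > 0xFF) { // same condition as the vector path's v_ff comparison
      *err_pos = i;    // position reported via result::count in the diff
      return false;
    }
    dst[i] = char(word);
  }
  return true;
}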
} - } - utf16_output += scalar::utf8_to_utf16::convert_valid( - input + pos, size - pos, utf16_output); - return utf16_output - start; + } // while + return std::make_pair(result(error_code::SUCCESS, buf - start), + latin1_output); } +/* end file src/lsx/lsx_convert_utf32_to_latin1.cpp */ +/* begin file src/lsx/lsx_convert_utf32_to_utf8.cpp */ +std::pair +lsx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char32_t *end = buf + len; -} // namespace utf8_to_utf16 -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -// transcoding from UTF-8 to UTF-32 -/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ + __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080)); + __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF)); + __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF)); + __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i forbidden_bytemask = __lsx_vldi(0x0); -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8_to_utf32 { -using namespace simd; + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 -simdutf_really_inline simd8 -check_special_cases(const simd8 input, const simd8 prev1) { - // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) - // Bit 1 = Too Long (ASCII followed by continuation) - // Bit 2 = Overlong 3-byte - // Bit 4 = Surrogate - // Bit 5 = Overlong 2-byte - // Bit 7 = Two Continuations - constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ - // 11______ 11______ - constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ - constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ - constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ - constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ - constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ - constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ - // 11110100 101_____ - // 11110101 1001____ - // 11110101 101_____ - // 1111011_ 1001____ - // 1111011_ 101_____ - // 11111___ 1001____ - // 11111___ 101_____ - constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; - // 11110101 1000____ - // 1111011_ 1000____ - // 11111___ 1000____ - constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + while (buf + 16 + safety_margin < end) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + __m128i nextin = __lsx_vld(reinterpret_cast(buf), 16); - const simd8 byte_1_high = prev1.shr<4>().lookup_16( - // 0_______ ________ - TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, - TOO_LONG, - // 10______ ________ - TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, - // 1100____ ________ - TOO_SHORT | OVERLONG_2, - // 1101____ ________ - TOO_SHORT, - // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, - // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); - constexpr const uint8_t CARRY = - TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
- const simd8 byte_1_low = - (prev1 & 0x0F) - .lookup_16( - // ____0000 ________ - CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, - // ____0001 ________ - CARRY | OVERLONG_2, - // ____001_ ________ - CARRY, CARRY, + // Check if no bits set above 16th + if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp) + __m128i utf16_packed = __lsx_vpickev_h(nextin, in); - // ____0100 ________ - CARRY | TOO_LARGE, - // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), + utf16_packed))) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed); + // 2. store (8 bytes) + __lsx_vst(utf8_packed, utf8_output, 0); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } + __m128i zero = __lsx_vldi(0); + if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 - // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000); - const simd8 byte_2_high = input.shr<4>().lookup_16( - // ________ 0_______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, - TOO_SHORT, TOO_SHORT, + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = __lsx_vslli_h(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = __lsx_vor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = __lsx_vor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m128i one_byte_bytemask = + __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/)); + __m128i utf8_unpacked = + __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + uint32_t m2 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lsx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle = __lsx_vld(row, 1); + __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); + // 5. store bytes + __lsx_vst(utf8_packed, utf8_output, 0); - // ________ 1000____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | - OVERLONG_4, - // ________ 1001____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, - // ________ 101_____ - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + // 6. 
adjust pointers + buf += 8; + utf8_output += row[0]; + continue; + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + forbidden_bytemask = __lsx_vor_v( + __lsx_vand_v( + __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single + UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three + UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed); + t0 = __lsx_vilvl_b(t0, t0); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); + __m128i t1 = __lsx_vand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/)); - // ________ 11______ - TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); - return (byte_1_high & byte_1_low & byte_2_high); -} -simdutf_really_inline simd8 -check_multibyte_lengths(const simd8 input, - const simd8 prev_input, - const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = - simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; -} + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m128i s0 = __lsx_vsrli_h(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m128i s1 = __lsx_vslli_h(utf16_packed, 2); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); + // [00bb|bbbb|0000|aaaa] + __m128i s2 = __lsx_vor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); + __m128i s3 = __lsx_vor_v(s2, v_c0e0); + // __m128i v_07ff = vmovq_n_u16((uint16_t)0x07FF); + __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff); + __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, + __lsx_vldi(-2752 /*0x4000*/)); + __m128i s4 = __lsx_vxor_v(s3, m0); -struct validating_transcoder { - // If this is nonzero, there has been a UTF-8 error. - simd8 error; + // 4. expand code units 16-bit => 32-bit + __m128i out0 = __lsx_vilvl_h(s4, t2); + __m128i out1 = __lsx_vilvh_h(s4, t2); - validating_transcoder() : error(uint8_t(0)) {} - // - // Check whether the current bytes are valid UTF-8. 
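// Illustrative sketch (not part of the simdutf patch above): the two-byte
// path above builds [110a|aaaa|10bb|bbbb] inside each 16-bit lane from
// [0000|0aaa|aabb|bbbb] with one shift, two masks and an OR with 0xC080; a
// byte shuffle then emits the lead byte before the continuation byte. The
// same arithmetic on a single value (hypothetical helper, not simdutf API),
// checked with U+00E9 as a worked example:
#include <cstdint>
static constexpr uint16_t two_byte_utf8_lane(uint16_t w) { // requires w <= 0x07FF
  return uint16_t((((w << 2) & 0x1F00)   // [000a|aaaa|0000|0000]
                   | (w & 0x003F))       // [0000|0000|00bb|bbbb]
                  | 0xC080);             // [110a|aaaa|10bb|bbbb]
}
static_assert(two_byte_utf8_lane(0x00E9) == 0xC3A9,
              "U+00E9 (e with acute accent) encodes as the bytes C3 A9");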
- // - simdutf_really_inline void check_utf8_bytes(const simd8 input, - const simd8 prev_input) { - // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ - // lead bytes (2, 3, 4-byte leads become large positive numbers instead of - // small negative numbers) - simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); - } + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m128i one_byte_bytemask = + __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F)); + + __m128i one_or_two_bytes_bytemask_u16_to_u32_low = + __lsx_vilvl_h(one_or_two_bytes_bytemask, zero); + __m128i one_or_two_bytes_bytemask_u16_to_u32_high = + __lsx_vilvh_h(one_or_two_bytes_bytemask, zero); + + __m128i one_byte_bytemask_u16_to_u32_low = + __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask); + __m128i one_byte_bytemask_u16_to_u32_high = + __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask); + + const uint32_t mask0 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( + one_or_two_bytes_bytemask_u16_to_u32_low, + one_byte_bytemask_u16_to_u32_low)), + 0); + const uint32_t mask1 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( + one_or_two_bytes_bytemask_u16_to_u32_high, + one_byte_bytemask_u16_to_u32_high)), + 0); - simdutf_really_inline size_t convert(const char *in, size_t size, - char32_t *utf32_output) { - size_t pos = 0; - char32_t *start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the - // last 16 bytes, and if the data is valid, then it is entirely safe because - // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot - // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 16 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth - // last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, - // it is not good enough. - static_assert( - (simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - if (utf8_continuation_mask & 1) { - return 0; // we have an error - } - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. 
For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32( - in + pos, utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0); + + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1); + + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; + + buf += 8; } - } - if (errors()) { - return 0; - } - if (pos < size) { - size_t howmany = - scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); - if (howmany == 0) { - return 0; + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. + } else { + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } } - utf32_output += howmany; + buf += k; } - return utf32_output - start; + } // while + + // check for invalid input + if (__lsx_bnz_v(forbidden_bytemask)) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } + return std::make_pair(buf, reinterpret_cast(utf8_output)); +} - simdutf_really_inline result convert_with_errors(const char *in, size_t size, - char32_t *utf32_output) { - size_t pos = 0; - char32_t *start{utf32_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the - // last 16 bytes, and if the data is valid, then it is entirely safe because - // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot - // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 8 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > -65); - } - // If the input is long enough, then we have that margin-1 is the fourth - // last leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! 
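// Illustrative sketch (not part of the simdutf patch above): the scalar
// fallback of lsx_convert_utf32_to_utf8 rejects exactly two kinds of invalid
// UTF-32 input, surrogates (U+D800..U+DFFF) and values above U+10FFFF; the
// _with_errors variant reports them as error_code::SURROGATE and
// error_code::TOO_LARGE. The predicate applied per word, written out as a
// hypothetical standalone helper:
#include <cstdint>
static constexpr bool is_valid_scalar_value(uint32_t word) {
  return word <= 0x10FFFF && !(word >= 0xD800 && word <= 0xDFFF);
}
static_assert(is_valid_scalar_value(0x10FFFF), "last valid code point");
static_assert(!is_valid_scalar_value(0xD800), "surrogates are rejected");
static_assert(!is_valid_scalar_value(0x110000), "beyond the Unicode range");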
- while (pos + 64 + safety_margin <= size) { - simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; +std::pair +lsx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const char32_t *start = buf; + const char32_t *end = buf + len; + + __m128i v_c080 = __lsx_vreplgr2vr_h(uint16_t(0xC080)); + __m128i v_07ff = __lsx_vreplgr2vr_h(uint16_t(0x7FF)); + __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xDFFF)); + __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i forbidden_bytemask = __lsx_vldi(0x0); + const size_t safety_margin = + 12; // to avoid overruns, see issue + // https://github.com/simdutf/simdutf/issues/92 + + while (buf + 16 + safety_margin < end) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + __m128i nextin = __lsx_vld(reinterpret_cast(buf), 16); + + // Check if no bits set above 16th + if (__lsx_bz_v(__lsx_vpickod_h(in, nextin))) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (lsx_convert_utf16_to_utf8.cpp) + __m128i utf16_packed = __lsx_vpickev_h(nextin, in); + + if (__lsx_bz_v(__lsx_vslt_hu(__lsx_vrepli_h(0x7F), + utf16_packed))) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + __m128i utf8_packed = __lsx_vpickev_b(utf16_packed, utf16_packed); + // 2. store (8 bytes) + __lsx_vst(utf8_packed, utf8_output, 0); + // 3. adjust pointers + buf += 8; + utf8_output += 8; + continue; // we are done for this round! + } + __m128i zero = __lsx_vldi(0); + if (__lsx_bz_v(__lsx_vslt_hu(v_07ff, utf16_packed))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 + + // t0 = [000a|aaaa|bbbb|bb00] + const __m128i t0 = __lsx_vslli_h(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const __m128i t1 = __lsx_vand_v(t0, __lsx_vldi(-2785 /*0x1f00*/)); + // t2 = [0000|0000|00bb|bbbb] + const __m128i t2 = __lsx_vand_v(utf16_packed, __lsx_vrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + const __m128i t3 = __lsx_vor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m128i t4 = __lsx_vor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m128i one_byte_bytemask = + __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F /*0x007F*/)); + __m128i utf8_unpacked = + __lsx_vbitsel_v(t4, utf16_packed, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + uint32_t m2 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(one_byte_bytemask), 0); + // 4. pack the bytes + const uint8_t *row = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lsx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle = __lsx_vld(row, 1); + __m128i utf8_packed = __lsx_vshuf_b(zero, utf8_unpacked, shuffle); + // 5. store bytes + __lsx_vst(utf8_packed, utf8_output, 0); + + // 6. adjust pointers + buf += 8; + utf8_output += row[0]; + continue; } else { - // you might think that a for-loop would work, but under Visual Studio, - // it is not good enough. 
- static_assert( - (simd8x64::NUM_CHUNKS == 2) || - (simd8x64::NUM_CHUNKS == 4), - "We support either two or four chunks per 64-byte block."); - auto zero = simd8{uint8_t(0)}; - if (simd8x64::NUM_CHUNKS == 2) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - } else if (simd8x64::NUM_CHUNKS == 4) { - this->check_utf8_bytes(input.chunks[0], zero); - this->check_utf8_bytes(input.chunks[1], input.chunks[0]); - this->check_utf8_bytes(input.chunks[2], input.chunks[1]); - this->check_utf8_bytes(input.chunks[3], input.chunks[2]); - } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - if (errors() || (utf8_continuation_mask & 1)) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf32( - in + pos, utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + forbidden_bytemask = __lsx_vor_v( + __lsx_vand_v( + __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + if (__lsx_bnz_v(forbidden_bytemask)) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf8_output)); } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - if (errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf32_output); - res.count += pos; - return res; - } - if (pos < size) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf32_output); - if (res.error) { // In case of error, we want the error position - res.count += pos; - return res; - } else { // In case of success, we want the number of word written - utf32_output += res.count; - } - } - return result(error_code::SUCCESS, utf32_output - start); - } + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - single + UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two + UTF-8 bytes + 3. 
[aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - three + UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- precompute + either byte 1 for case #2 or byte 2 for case #3. Note that they + differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, taking + into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m128i t0 = __lsx_vpickev_b(utf16_packed, utf16_packed); + t0 = __lsx_vilvl_b(t0, t0); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + __m128i v_3f7f = __lsx_vreplgr2vr_h(uint16_t(0x3F7F)); + __m128i t1 = __lsx_vand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m128i t2 = __lsx_vor_v(t1, __lsx_vldi(-2688 /*0x8000*/)); - simdutf_really_inline bool errors() const { - return this->error.any_bits_set_anywhere(); - } + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m128i s0 = __lsx_vsrli_h(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m128i s1 = __lsx_vslli_h(utf16_packed, 2); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + s1 = __lsx_vand_v(s1, __lsx_vldi(-2753 /*0x3F00*/)); + // [00bb|bbbb|0000|aaaa] + __m128i s2 = __lsx_vor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m128i v_c0e0 = __lsx_vreplgr2vr_h(uint16_t(0xC0E0)); + __m128i s3 = __lsx_vor_v(s2, v_c0e0); + // __m128i v_07ff = vmovq_n_u16((uint16_t)0x07FF); + __m128i one_or_two_bytes_bytemask = __lsx_vsle_hu(utf16_packed, v_07ff); + __m128i m0 = __lsx_vandn_v(one_or_two_bytes_bytemask, + __lsx_vldi(-2752 /*0x4000*/)); + __m128i s4 = __lsx_vxor_v(s3, m0); -}; // struct utf8_checker -} // namespace utf8_to_utf32 -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ + // 4. expand code units 16-bit => 32-bit + __m128i out0 = __lsx_vilvl_h(s4, t2); + __m128i out1 = __lsx_vilvh_h(s4, t2); -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8_to_utf32 { + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m128i one_byte_bytemask = + __lsx_vsle_hu(utf16_packed, __lsx_vrepli_h(0x7F)); + + __m128i one_or_two_bytes_bytemask_u16_to_u32_low = + __lsx_vilvl_h(one_or_two_bytes_bytemask, zero); + __m128i one_or_two_bytes_bytemask_u16_to_u32_high = + __lsx_vilvh_h(one_or_two_bytes_bytemask, zero); + + __m128i one_byte_bytemask_u16_to_u32_low = + __lsx_vilvl_h(one_byte_bytemask, one_byte_bytemask); + __m128i one_byte_bytemask_u16_to_u32_high = + __lsx_vilvh_h(one_byte_bytemask, one_byte_bytemask); + + const uint32_t mask0 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( + one_or_two_bytes_bytemask_u16_to_u32_low, + one_byte_bytemask_u16_to_u32_low)), + 0); + const uint32_t mask1 = + __lsx_vpickve2gr_bu(__lsx_vmskltz_h(__lsx_vor_v( + one_or_two_bytes_bytemask_u16_to_u32_high, + one_byte_bytemask_u16_to_u32_high)), + 0); -using namespace simd; + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = __lsx_vshuf_b(zero, out0, shuffle0); -simdutf_warn_unused size_t convert_valid(const char *input, size_t size, - char32_t *utf32_output) noexcept { - size_t pos = 0; - char32_t *start{utf32_output}; - const size_t safety_margin = 16; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - simd8x64 in(reinterpret_cast(input + pos)); - if (in.is_ascii()) { - in.store_ascii_as_utf32(utf32_output); - utf32_output += 64; - pos += 64; + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = __lsx_vshuf_b(zero, out1, shuffle1); + + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; + + buf += 8; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. } else { - // -65 is 0b10111111 in two-complement's, so largest possible continuation - // byte - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - size_t max_starting_point = (pos + 64) - 12; - while (pos < max_starting_point) { - size_t consumed = convert_masked_utf8_to_utf32( - input + pos, utf8_end_of_code_point_mask, utf32_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
+ size_t forward = 15; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + } + buf += k; } - } - utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, - utf32_output); - return utf32_output - start; + } // while + + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); } +/* end file src/lsx/lsx_convert_utf32_to_utf8.cpp */ +/* begin file src/lsx/lsx_convert_utf32_to_utf16.cpp */ +template +std::pair +lsx_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); + const char32_t *end = buf + len; -} // namespace utf8_to_utf32 -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -// other functions -/* begin file src/generic/utf16.h */ -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf16 { + __m128i forbidden_bytemask = __lsx_vrepli_h(0); + __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xdfff)); + while (buf + 8 <= end) { + __m128i in0 = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); -template -simdutf_really_inline size_t count_code_points(const char16_t *in, - size_t size) { - size_t pos = 0; - size_t count = 0; - for (; pos < size / 32 * 32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { - input.swap_bytes(); - } - uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); - count += count_ones(not_pair) / 2; - } - return count + - scalar::utf16::count_code_points(in + pos, size - pos); -} + // Check if no bits set above 16th + if (__lsx_bz_v(__lsx_vpickod_h(in1, in0))) { + __m128i utf16_packed = __lsx_vpickev_h(in1, in0); + forbidden_bytemask = __lsx_vor_v( + __lsx_vand_v( + __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); -template -simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, - size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! 
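// Illustrative sketch (not part of the simdutf patch above): for code points
// above U+FFFF, the scalar path of lsx_convert_utf32_to_utf16 (continued just
// below in the diff) subtracts 0x10000 and splits the remaining 20 bits into
// a high and a low surrogate. A minimal standalone equivalent (hypothetical
// helper, not simdutf API), assuming the caller already rejected surrogates
// and values above U+10FFFF as the diff does:
#include <cstdint>
static inline void split_surrogate_pair(uint32_t word, uint16_t *high,
                                        uint16_t *low) {
  word -= 0x10000;                          // 20 payload bits remain
  *high = uint16_t(0xD800 + (word >> 10));  // top 10 bits
  *low = uint16_t(0xDC00 + (word & 0x3FF)); // bottom 10 bits
}
// Example: U+1F600 yields the pair 0xD83D, 0xDE00; the pair is byte-swapped
// afterwards when big-endian output is requested, as in the
// !match_system(big_endian) branches of the diff.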
- for (; pos < size / 32 * 32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { - input.swap_bytes(); + if (!match_system(big_endian)) { + utf16_packed = lsx_swap_bytes(utf16_packed); + } + __lsx_vst(utf16_packed, utf16_output, 0); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 3; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? char16_t(word >> 8 | word << 8) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = + uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; } - uint64_t ascii_mask = input.lteq(0x7F); - uint64_t twobyte_mask = input.lteq(0x7FF); - uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + } - size_t ascii_count = count_ones(ascii_mask) / 2; - size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2; - size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2; - size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; - count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + - ascii_count; + // check for invalid input + if (__lsx_bnz_v(forbidden_bytemask)) { + return std::make_pair(nullptr, reinterpret_cast(utf16_output)); } - return count + scalar::utf16::utf8_length_from_utf16(in + pos, - size - pos); + return std::make_pair(buf, reinterpret_cast(utf16_output)); } template -simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, - size_t size) { - return count_code_points(in, size); -} - -simdutf_really_inline void -change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { - size_t pos = 0; - - while (pos < size / 32 * 32) { - simd16x32 input(reinterpret_cast(in + pos)); - input.swap_bytes(); - input.store(reinterpret_cast(output)); - pos += 32; - output += 32; - } - - scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); -} +std::pair +lsx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); + const char32_t *start = buf; + const char32_t *end = buf + len; -} // namespace utf16 -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf16.h */ -/* begin file src/generic/utf8.h */ + __m128i forbidden_bytemask = __lsx_vrepli_h(0); + __m128i v_d800 = __lsx_vldi(-2600); /*0xD800*/ + __m128i v_dfff = __lsx_vreplgr2vr_h(uint16_t(0xdfff)); -namespace simdutf { -namespace ppc64 { -namespace { -namespace utf8 { + while (buf + 8 <= end) { + __m128i in0 = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); + // Check if no bits set above 16th + if (__lsx_bz_v(__lsx_vpickod_h(in1, in0))) { + __m128i utf16_packed = 
__lsx_vpickev_h(in1, in0); + + forbidden_bytemask = __lsx_vor_v( + __lsx_vand_v( + __lsx_vsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lsx_vsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + if (__lsx_bnz_v(forbidden_bytemask)) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf16_output)); + } -using namespace simd; + if (!match_system(big_endian)) { + utf16_packed = lsx_swap_bytes(utf16_packed); + } -simdutf_really_inline size_t count_code_points(const char *in, size_t size) { - size_t pos = 0; - size_t count = 0; - for (; pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.gt(-65); - count += count_ones(utf8_continuation_mask); + __lsx_vst(utf16_packed, utf16_output, 0); + utf16_output += 8; + buf += 8; + } else { + size_t forward = 3; + size_t k = 0; + if (size_t(end - buf) < forward + 1) { + forward = size_t(end - buf - 1); + } + for (; k < forward; k++) { + uint32_t word = buf[k]; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair( + result(error_code::SURROGATE, buf - start + k), + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? char16_t(word >> 8 | word << 8) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair( + result(error_code::TOO_LARGE, buf - start + k), + reinterpret_cast(utf16_output)); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = + uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } + buf += k; + } } - return count + scalar::utf8::count_code_points(in + pos, size - pos); -} -simdutf_really_inline size_t utf16_length_from_utf8(const char *in, - size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for (; pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - // We count one word for anything that is not a continuation (so - // leading bytes). - count += 64 - count_ones(utf8_continuation_mask); - int64_t utf8_4byte = input.gteq_unsigned(240); - count += count_ones(utf8_4byte); - } - return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf16_output)); } -} // namespace utf8 -} // unnamed namespace -} // namespace ppc64 -} // namespace simdutf -/* end file src/generic/utf8.h */ +/* end file src/lsx/lsx_convert_utf32_to_utf16.cpp */ +/* begin file src/lsx/lsx_base64.cpp */ +/** + * References and further reading: + * + * Wojciech Muła, Daniel Lemire, Base64 encoding and decoding at almost the + * speed of a memory copy, Software: Practice and Experience 50 (2), 2020. + * https://arxiv.org/abs/1910.05109 + * + * Wojciech Muła, Daniel Lemire, Faster Base64 Encoding and Decoding using AVX2 + * Instructions, ACM Transactions on the Web 12 (3), 2018. + * https://arxiv.org/abs/1704.00605 + * + * Simon Josefsson. 2006. The Base16, Base32, and Base64 Data Encodings. 
+ * https://tools.ietf.org/html/rfc4648. (2006). Internet Engineering Task Force, + * Request for Comments: 4648. + * + * Alfred Klomp. 2014a. Fast Base64 encoding/decoding with SSE vectorization. + * http://www.alfredklomp.com/programming/sse-base64/. (2014). + * + * Alfred Klomp. 2014b. Fast Base64 stream encoder/decoder in C99, with SIMD + * acceleration. https://github.com/aklomp/base64. (2014). + * + * Hanson Char. 2014. A Fast and Correct Base 64 Codec. (2014). + * https://aws.amazon.com/blogs/developer/a-fast-and-correct-base-64-codec/ + * + * Nick Kopp. 2013. Base64 Encoding on a GPU. + * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). + */ -// -// Implementation-specific overrides -// -namespace simdutf { -namespace ppc64 { +template +size_t encode_base64(char *dst, const char *src, size_t srclen, + base64_options options) { + // credit: Wojciech Muła + // SSE (lookup: pshufb improved unrolled) + const uint8_t *input = (const uint8_t *)src; + static const char *lookup_tbl = + isbase64url + ? "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + uint8_t *out = (uint8_t *)dst; -simdutf_warn_unused int -implementation::detect_encodings(const char *input, - size_t length) const noexcept { - // If there is a BOM, then we trust it. - auto bom_encoding = simdutf::BOM::check_bom(input, length); - if (bom_encoding != encoding_type::unspecified) { - return bom_encoding; - } - // todo: reimplement as a one-pass algorithm. - int out = 0; - if (validate_utf8(input, length)) { - out |= encoding_type::UTF8; - } - if ((length % 2) == 0) { - if (validate_utf16(reinterpret_cast(input), length / 2)) { - out |= encoding_type::UTF16_LE; - } + v16u8 shuf; + __m128i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1, + base64_tbl2, base64_tbl3; + if (srclen >= 16) { + shuf = v16u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10}; + v_fc0fc00 = __lsx_vreplgr2vr_w(uint32_t(0x0fc0fc00)); + v_3f03f0 = __lsx_vreplgr2vr_w(uint32_t(0x003f03f0)); + shift_r = __lsx_vreplgr2vr_w(uint32_t(0x0006000a)); + shift_l = __lsx_vreplgr2vr_w(uint32_t(0x00080004)); + base64_tbl0 = __lsx_vld(lookup_tbl, 0); + base64_tbl1 = __lsx_vld(lookup_tbl, 16); + base64_tbl2 = __lsx_vld(lookup_tbl, 32); + base64_tbl3 = __lsx_vld(lookup_tbl, 48); } - if ((length % 4) == 0) { - if (validate_utf32(reinterpret_cast(input), length / 4)) { - out |= encoding_type::UTF32_LE; - } + + size_t i = 0; + for (; i + 52 <= srclen; i += 48) { + __m128i in0 = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 0); + __m128i in1 = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 1); + __m128i in2 = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 2); + __m128i in3 = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 3); + + in0 = __lsx_vshuf_b(in0, in0, (__m128i)shuf); + in1 = __lsx_vshuf_b(in1, in1, (__m128i)shuf); + in2 = __lsx_vshuf_b(in2, in2, (__m128i)shuf); + in3 = __lsx_vshuf_b(in3, in3, (__m128i)shuf); + + __m128i t0_0 = __lsx_vand_v(in0, v_fc0fc00); + __m128i t0_1 = __lsx_vand_v(in1, v_fc0fc00); + __m128i t0_2 = __lsx_vand_v(in2, v_fc0fc00); + __m128i t0_3 = __lsx_vand_v(in3, v_fc0fc00); + + __m128i t1_0 = __lsx_vsrl_h(t0_0, shift_r); + __m128i t1_1 = __lsx_vsrl_h(t0_1, shift_r); + __m128i t1_2 = __lsx_vsrl_h(t0_2, shift_r); + __m128i t1_3 = __lsx_vsrl_h(t0_3, shift_r); + + __m128i t2_0 = __lsx_vand_v(in0, v_3f03f0); + __m128i t2_1 = __lsx_vand_v(in1, v_3f03f0); + __m128i t2_2 = __lsx_vand_v(in2, v_3f03f0); + __m128i t2_3 = 
__lsx_vand_v(in3, v_3f03f0); + + __m128i t3_0 = __lsx_vsll_h(t2_0, shift_l); + __m128i t3_1 = __lsx_vsll_h(t2_1, shift_l); + __m128i t3_2 = __lsx_vsll_h(t2_2, shift_l); + __m128i t3_3 = __lsx_vsll_h(t2_3, shift_l); + + __m128i input0 = __lsx_vor_v(t1_0, t3_0); + __m128i input0_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input0); + __m128i input0_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2, + __lsx_vsub_b(input0, __lsx_vldi(32))); + __m128i input0_mask = __lsx_vslei_bu(input0, 31); + __m128i input0_result = + __lsx_vbitsel_v(input0_shuf1, input0_shuf0, input0_mask); + __lsx_vst(input0_result, reinterpret_cast<__m128i *>(out), 0); + out += 16; + + __m128i input1 = __lsx_vor_v(t1_1, t3_1); + __m128i input1_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input1); + __m128i input1_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2, + __lsx_vsub_b(input1, __lsx_vldi(32))); + __m128i input1_mask = __lsx_vslei_bu(input1, 31); + __m128i input1_result = + __lsx_vbitsel_v(input1_shuf1, input1_shuf0, input1_mask); + __lsx_vst(input1_result, reinterpret_cast<__m128i *>(out), 0); + out += 16; + + __m128i input2 = __lsx_vor_v(t1_2, t3_2); + __m128i input2_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input2); + __m128i input2_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2, + __lsx_vsub_b(input2, __lsx_vldi(32))); + __m128i input2_mask = __lsx_vslei_bu(input2, 31); + __m128i input2_result = + __lsx_vbitsel_v(input2_shuf1, input2_shuf0, input2_mask); + __lsx_vst(input2_result, reinterpret_cast<__m128i *>(out), 0); + out += 16; + + __m128i input3 = __lsx_vor_v(t1_3, t3_3); + __m128i input3_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, input3); + __m128i input3_shuf1 = __lsx_vshuf_b(base64_tbl3, base64_tbl2, + __lsx_vsub_b(input3, __lsx_vldi(32))); + __m128i input3_mask = __lsx_vslei_bu(input3, 31); + __m128i input3_result = + __lsx_vbitsel_v(input3_shuf1, input3_shuf0, input3_mask); + __lsx_vst(input3_result, reinterpret_cast<__m128i *>(out), 0); + out += 16; } + for (; i + 16 <= srclen; i += 12) { - return out; -} + __m128i in = __lsx_vld(reinterpret_cast(input + i), 0); -simdutf_warn_unused bool -implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return ppc64::utf8_validation::generic_validate_utf8(buf, len); -} + // bytes from groups A, B and C are needed in separate 32-bit lanes + // in = [DDDD|CCCC|BBBB|AAAA] + // + // an input triplet has layout + // [????????|ccdddddd|bbbbcccc|aaaaaabb] + // byte 3 byte 2 byte 1 byte 0 -- byte 3 comes from the next + // triplet + // + // shuffling changes the order of bytes: 1, 0, 2, 1 + // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] + // ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^ + // processed bits + in = __lsx_vshuf_b(in, in, (__m128i)shuf); -simdutf_warn_unused result implementation::validate_utf8_with_errors( - const char *buf, size_t len) const noexcept { - return ppc64::utf8_validation::generic_validate_utf8_with_errors(buf, len); -} + // unpacking + // t0 = [0000cccc|cc000000|aaaaaa00|00000000] + __m128i t0 = __lsx_vand_v(in, v_fc0fc00); + // t1 = [00000000|00cccccc|00000000|00aaaaaa] + // ((c >> 6), (a >> 10)) + __m128i t1 = __lsx_vsrl_h(t0, shift_r); -simdutf_warn_unused bool -implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return ppc64::utf8_validation::generic_validate_ascii(buf, len); -} + // t2 = [00000000|00dddddd|000000bb|bbbb0000] + __m128i t2 = __lsx_vand_v(in, v_3f03f0); + // t3 = [00dddddd|00000000|00bbbbbb|00000000] + // ((d << 8), (b << 4)) + __m128i t3 = __lsx_vsll_h(t2, shift_l); -simdutf_warn_unused 
result implementation::validate_ascii_with_errors( - const char *buf, size_t len) const noexcept { - return ppc64::utf8_validation::generic_validate_ascii_with_errors(buf, len); -} + // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 + __m128i indices = __lsx_vor_v(t1, t3); -simdutf_warn_unused bool -implementation::validate_utf16le(const char16_t *buf, - size_t len) const noexcept { - return scalar::utf16::validate(buf, len); -} + __m128i indices_shuf0 = __lsx_vshuf_b(base64_tbl1, base64_tbl0, indices); + __m128i indices_shuf1 = __lsx_vshuf_b( + base64_tbl3, base64_tbl2, __lsx_vsub_b(indices, __lsx_vldi(32))); + __m128i indices_mask = __lsx_vslei_bu(indices, 31); + __m128i indices_result = + __lsx_vbitsel_v(indices_shuf1, indices_shuf0, indices_mask); -simdutf_warn_unused bool -implementation::validate_utf16be(const char16_t *buf, - size_t len) const noexcept { - return scalar::utf16::validate(buf, len); -} + __lsx_vst(indices_result, reinterpret_cast<__m128i *>(out), 0); + out += 16; + } -simdutf_warn_unused result implementation::validate_utf16le_with_errors( - const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate_with_errors(buf, len); + return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i, + srclen - i, options); } -simdutf_warn_unused result implementation::validate_utf16be_with_errors( - const char16_t *buf, size_t len) const noexcept { - return scalar::utf16::validate_with_errors(buf, len); -} +static inline void compress(__m128i data, uint16_t mask, char *output) { + if (mask == 0) { + __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0); + return; + } + // this particular implementation was inspired by work done by @animetosho + // we do it in two steps, first 8 bytes and then second 8 bytes + uint8_t mask1 = uint8_t(mask); // least significant 8 bits + uint8_t mask2 = uint8_t(mask >> 8); // most significant 8 bits + // next line just loads the 64-bit values thintable_epi8[mask1] and + // thintable_epi8[mask2] into a 128-bit register, using only + // two instructions on most compilers. -simdutf_warn_unused result implementation::validate_utf32_with_errors( - const char32_t *buf, size_t len) const noexcept { - return scalar::utf32::validate_with_errors(buf, len); -} + v2u64 shufmask = {tables::base64::thintable_epi8[mask1], + tables::base64::thintable_epi8[mask2]}; -simdutf_warn_unused bool -implementation::validate_utf32(const char16_t *buf, size_t len) const noexcept { - return scalar::utf32::validate(buf, len); -} + // we increment by 0x08 the second half of the mask + v4u32 hi = {0, 0, 0x08080808, 0x08080808}; + __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi); -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( - const char * /*buf*/, size_t /*len*/, - char16_t * /*utf16_output*/) const noexcept { - return 0; // stub -} + // this is the version "nearly pruned" + __m128i pruned = __lsx_vshuf_b(data, data, shufmask1); + // we still need to put the two halves together. + // we compute the popcount of the first half: + int pop1 = tables::base64::BitsSetTable256mul2[mask1]; + // then load the corresponding mask, what it does is to write + // only the first pop1 bytes from the first 8 bytes, and then + // it fills in with the bytes from the second 8 bytes + some filling + // at the end. 
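// A scalar sketch of what the surrounding compress() routine computes: `mask`
// carries one bit per input byte, and bytes whose bit is set are dropped while
// the remaining bytes are packed to the front of `output`. The vector code
// reaches the same result with two table-driven shuffles (thintable_epi8 and
// pshufb_combine_table). Illustrative only; this helper is not part of the
// simdutf sources.
static inline size_t scalar_compress(const uint8_t *data, uint16_t mask,
                                     uint8_t *output) {
  size_t written = 0;
  for (size_t i = 0; i < 16; i++) {
    if (((mask >> i) & 1) == 0) { // bit clear: keep this byte
      output[written++] = data[i];
    }
  }
  return written; // equals 16 - popcount(mask)
}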
+ __m128i compactmask = + __lsx_vld(reinterpret_cast( + tables::base64::pshufb_combine_table + pop1 * 8), + 0); + __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask); -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( - const char * /*buf*/, size_t /*len*/, - char16_t * /*utf16_output*/) const noexcept { - return 0; // stub + __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0); } -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( - const char * /*buf*/, size_t /*len*/, - char16_t * /*utf16_output*/) const noexcept { - return result(error_code::OTHER, 0); // stub -} +struct block64 { + __m128i chunks[4]; +}; -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( - const char * /*buf*/, size_t /*len*/, - char16_t * /*utf16_output*/) const noexcept { - return result(error_code::OTHER, 0); // stub -} +template +static inline uint16_t to_base64_mask(__m128i *src, bool *error) { + const v16u8 ascii_space_tbl = {0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0}; + // credit: aqrit + /* + '0'(0x30)-'9'(0x39) => delta_values_index = 4 + 'A'(0x41)-'Z'(0x5a) => delta_values_index = 4/5/12(4+8) + 'a'(0x61)-'z'(0x7a) => delta_values_index = 6/7/14(6+8) + '+'(0x2b) => delta_values_index = 3 + '/'(0x2f) => delta_values_index = 2+8 = 10 + '-'(0x2d) => delta_values_index = 2+8 = 10 + '_'(0x5f) => delta_values_index = 5+8 = 13 + */ + v16u8 delta_asso = {0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, + 0x0, 0x0, 0x0, 0x0, 0x0, 0xF, 0x0, 0xF}; + v16i8 delta_values; + if (base64_url) { + delta_values = + v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3), + int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)}; + } else { + delta_values = + v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3), + int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)}; + } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( - const char * /*buf*/, size_t /*len*/, - char16_t * /*utf16_output*/) const noexcept { - return 0; // stub -} + v16u8 check_asso; + if (base64_url) { + check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x03, 0x07, 0x0B, 0x06, 0x0B, 0x12}; + } else { + check_asso = v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F}; + } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( - const char * /*buf*/, size_t /*len*/, - char16_t * /*utf16_output*/) const noexcept { - return 0; // stub -} + v16i8 check_values; + if (base64_url) { + check_values = v16i8{int8_t(0x0), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6), + int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80), + int8_t(0xB0), int8_t(0x80), int8_t(0x0), int8_t(0x0)}; + } else { + check_values = + v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), + int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80), + int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)}; + } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( - const char * /*buf*/, size_t /*len*/, - char32_t * /*utf16_output*/) const noexcept { - return 0; // stub -} + const 
__m128i shifted = __lsx_vsrli_b(*src, 3); + __m128i asso_index = __lsx_vand_v(*src, __lsx_vldi(0xF)); + const __m128i delta_hash = + __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)delta_asso, (__m128i)delta_asso, + (__m128i)asso_index), + shifted); + const __m128i check_hash = + __lsx_vavgr_bu(__lsx_vshuf_b((__m128i)check_asso, (__m128i)check_asso, + (__m128i)asso_index), + shifted); -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( - const char * /*buf*/, size_t /*len*/, - char32_t * /*utf16_output*/) const noexcept { - return result(error_code::OTHER, 0); // stub -} + const __m128i out = + __lsx_vsadd_b(__lsx_vshuf_b((__m128i)delta_values, (__m128i)delta_values, + (__m128i)delta_hash), + *src); + const __m128i chk = + __lsx_vsadd_b(__lsx_vshuf_b((__m128i)check_values, (__m128i)check_values, + (__m128i)check_hash), + *src); + unsigned int mask = __lsx_vpickve2gr_hu(__lsx_vmskltz_b(chk), 0); + if (mask) { + __m128i ascii_space = __lsx_vseq_b(__lsx_vshuf_b((__m128i)ascii_space_tbl, + (__m128i)ascii_space_tbl, + (__m128i)asso_index), + *src); + *error |= + (mask != __lsx_vpickve2gr_hu(__lsx_vmskltz_b((__m128i)ascii_space), 0)); + } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( - const char * /*buf*/, size_t /*len*/, - char32_t * /*utf16_output*/) const noexcept { - return 0; // stub + *src = out; + return (uint16_t)mask; } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert(buf, len, - utf8_output); +template +static inline uint64_t to_base64_mask(block64 *b, bool *error) { + *error = 0; + uint64_t m0 = to_base64_mask(&b->chunks[0], error); + uint64_t m1 = to_base64_mask(&b->chunks[1], error); + uint64_t m2 = to_base64_mask(&b->chunks[2], error); + uint64_t m3 = to_base64_mask(&b->chunks[3], error); + return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48); } -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert(buf, len, utf8_output); +static inline void copy_block(block64 *b, char *output) { + __lsx_vst(b->chunks[0], reinterpret_cast<__m128i *>(output), 0); + __lsx_vst(b->chunks[1], reinterpret_cast<__m128i *>(output), 16); + __lsx_vst(b->chunks[2], reinterpret_cast<__m128i *>(output), 32); + __lsx_vst(b->chunks[3], reinterpret_cast<__m128i *>(output), 48); } -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_with_errors( - buf, len, utf8_output); +static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { + uint64_t nmask = ~mask; + uint64_t count = + __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0); + uint16_t *count_ptr = (uint16_t *)&count; + compress(b->chunks[0], uint16_t(mask), output); + compress(b->chunks[1], uint16_t(mask >> 16), output + count_ptr[0]); + compress(b->chunks[2], uint16_t(mask >> 32), + output + count_ptr[0] + count_ptr[1]); + compress(b->chunks[3], uint16_t(mask >> 48), + output + count_ptr[0] + count_ptr[1] + count_ptr[2]); + return count_ones(nmask); } -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_with_errors( - buf, len, utf8_output); +// The caller of 
this function is responsible to ensure that there are 64 bytes +// available from reading at src. The data is read into a block64 structure. +static inline void load_block(block64 *b, const char *src) { + b->chunks[0] = __lsx_vld(reinterpret_cast(src), 0); + b->chunks[1] = __lsx_vld(reinterpret_cast(src), 16); + b->chunks[2] = __lsx_vld(reinterpret_cast(src), 32); + b->chunks[3] = __lsx_vld(reinterpret_cast(src), 48); } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_valid(buf, len, - utf8_output); +// The caller of this function is responsible to ensure that there are 128 bytes +// available from reading at src. The data is read into a block64 structure. +static inline void load_block(block64 *b, const char16_t *src) { + __m128i m1 = __lsx_vld(reinterpret_cast(src), 0); + __m128i m2 = __lsx_vld(reinterpret_cast(src), 16); + __m128i m3 = __lsx_vld(reinterpret_cast(src), 32); + __m128i m4 = __lsx_vld(reinterpret_cast(src), 48); + __m128i m5 = __lsx_vld(reinterpret_cast(src), 64); + __m128i m6 = __lsx_vld(reinterpret_cast(src), 80); + __m128i m7 = __lsx_vld(reinterpret_cast(src), 96); + __m128i m8 = __lsx_vld(reinterpret_cast(src), 112); + b->chunks[0] = __lsx_vssrlni_bu_h(m2, m1, 0); + b->chunks[1] = __lsx_vssrlni_bu_h(m4, m3, 0); + b->chunks[2] = __lsx_vssrlni_bu_h(m6, m5, 0); + b->chunks[3] = __lsx_vssrlni_bu_h(m8, m7, 0); } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( - const char16_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf16_to_utf8::convert_valid(buf, len, - utf8_output); -} +static inline void base64_decode(char *out, __m128i str) { + __m128i t0 = __lsx_vor_v( + __lsx_vslli_w(str, 26), + __lsx_vslli_w(__lsx_vand_v(str, __lsx_vldi(-1758 /*0x0000FF00*/)), 12)); + __m128i t1 = + __lsx_vsrli_w(__lsx_vand_v(str, __lsx_vldi(-3521 /*0x003F0000*/)), 2); + __m128i t2 = __lsx_vor_v(t0, t1); + __m128i t3 = __lsx_vor_v(t2, __lsx_vsrli_w(str, 16)); + const v16u8 pack_shuffle = {3, 2, 1, 7, 6, 5, 11, 10, + 9, 15, 14, 13, 0, 0, 0, 0}; + t3 = __lsx_vshuf_b(t3, t3, (__m128i)pack_shuffle); -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( - const char32_t *buf, size_t len, char *utf8_output) const noexcept { - return scalar::utf32_to_utf8::convert(buf, len, utf8_output); + // Store the output: + // we only need 12. 
+  __lsx_vstelm_d(t3, out, 0, 0);
+  __lsx_vstelm_w(t3, out + 8, 0, 2);
 }
-
-simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_with_errors(buf, len, utf8_output);
+// decode 64 bytes and output 48 bytes
+static inline void base64_decode_block(char *out, const char *src) {
+  base64_decode(out, __lsx_vld(reinterpret_cast<const __m128i *>(src), 0));
+  base64_decode(out + 12,
+                __lsx_vld(reinterpret_cast<const __m128i *>(src), 16));
+  base64_decode(out + 24,
+                __lsx_vld(reinterpret_cast<const __m128i *>(src), 32));
+  base64_decode(out + 36,
+                __lsx_vld(reinterpret_cast<const __m128i *>(src), 48));
 }
-
-simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8(
-    const char32_t *buf, size_t len, char *utf8_output) const noexcept {
-  return scalar::utf32_to_utf8::convert_valid(buf, len, utf8_output);
+static inline void base64_decode_block_safe(char *out, const char *src) {
+  base64_decode_block(out, src);
 }
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::LITTLE>(buf, len,
-                                                             utf16_output);
+static inline void base64_decode_block(char *out, block64 *b) {
+  base64_decode(out, b->chunks[0]);
+  base64_decode(out + 12, b->chunks[1]);
+  base64_decode(out + 24, b->chunks[2]);
+  base64_decode(out + 36, b->chunks[3]);
 }
-
-simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert<endianness::BIG>(buf, len,
-                                                          utf16_output);
+static inline void base64_decode_block_safe(char *out, block64 *b) {
+  base64_decode_block(out, b);
 }
-simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors(
-    const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept {
-  return scalar::utf32_to_utf16::convert_with_errors<endianness::LITTLE>(
-      buf, len, utf16_output);
-}
+template <bool base64_url, typename char_type>
+full_result
+compress_decode_base64(char *dst, const char_type *src, size_t srclen,
+                       base64_options options,
+                       last_chunk_handling_options last_chunk_options) {
+  const uint8_t *to_base64 = base64_url ?
tables::base64::to_base64_url_value + : tables::base64::to_base64_value; + size_t equallocation = + srclen; // location of the first padding character if any + // skip trailing spaces + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + to_base64[uint8_t(src[srclen - 1])] == 64) { + srclen--; + } + size_t equalsigns = 0; + if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; + srclen--; + equalsigns = 1; + // skip trailing spaces + while (srclen > 0 && scalar::base64::is_eight_byte(src[srclen - 1]) && + to_base64[uint8_t(src[srclen - 1])] == 64) { + srclen--; + } + if (srclen > 0 && src[srclen - 1] == '=') { + equallocation = srclen - 1; + srclen--; + equalsigns = 2; + } + } + if (srclen == 0) { + if (equalsigns > 0) { + return {INVALID_BASE64_CHARACTER, equallocation, 0}; + } + return {SUCCESS, 0, 0}; + } + const char_type *const srcinit = src; + const char *const dstinit = dst; + const char_type *const srcend = src + srclen; -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_with_errors( - buf, len, utf16_output); -} + constexpr size_t block_size = 10; + char buffer[block_size * 64]; + char *bufferptr = buffer; + if (srclen >= 64) { + const char_type *const srcend64 = src + srclen - 64; + while (src <= srcend64) { + block64 b; + load_block(&b, src); + src += 64; + bool error = false; + uint64_t badcharmask = to_base64_mask(&b, &error); + if (badcharmask) { + if (error) { + src -= 64; + while (src < srcend && scalar::base64::is_eight_byte(*src) && + to_base64[uint8_t(*src)] <= 64) { + src++; + } + if (src < srcend) { + // should never happen + } + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_valid( - buf, len, utf16_output); -} + if (badcharmask != 0) { + // optimization opportunity: check for simple masks like those made of + // continuous 1s followed by continuous 0s. And masks containing a + // single bad character. + bufferptr += compress_block(&b, badcharmask, bufferptr); + } else { + // optimization opportunity: if bufferptr == buffer and mask == 0, we + // can avoid the call to compress_block and decode directly. + copy_block(&b, bufferptr); + bufferptr += 64; + } + if (bufferptr >= (block_size - 1) * 64 + buffer) { + for (size_t i = 0; i < (block_size - 1); i++) { + base64_decode_block(dst, buffer + i * 64); + dst += 48; + } + std::memcpy(buffer, buffer + (block_size - 1) * 64, + 64); // 64 might be too much + bufferptr -= (block_size - 1) * 64; + } + } + } + char *buffer_start = buffer; + // Optimization note: if this is almost full, then it is worth our + // time, otherwise, we should just decode directly. 
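// For reference, the flush loop above decodes the cleaned characters queued in
// `buffer` 64 at a time, each block of 64 base64 characters producing 48 bytes
// (4 characters -> 3 bytes). A scalar sketch of that 4-to-3 arithmetic for a
// run of valid characters without padding; the helper name is illustrative
// only and not part of the simdutf sources.
static inline size_t decoded_bytes_for(size_t valid_base64_chars) {
  size_t full_quads = valid_base64_chars / 4; // complete 4-character groups
  size_t leftover = valid_base64_chars % 4;   // 2 -> 1 byte, 3 -> 2 bytes;
                                              // a leftover of 1 is invalid and
                                              // is reported by the tail decoder
  return full_quads * 3 + (leftover ? leftover - 1 : 0);
}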
+ int last_block = (int)((bufferptr - buffer_start) % 64); + if (last_block != 0 && srcend - src + last_block >= 64) { + while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { + uint8_t val = to_base64[uint8_t(*src)]; + *bufferptr = char(val); + if (!scalar::base64::is_eight_byte(*src) || val > 64) { + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; + } + bufferptr += (val <= 63); + src++; + } + } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( - const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { - return scalar::utf32_to_utf16::convert_valid(buf, len, - utf16_output); + for (; buffer_start + 64 <= bufferptr; buffer_start += 64) { + base64_decode_block(dst, buffer_start); + dst += 48; + } + if ((bufferptr - buffer_start) % 64 != 0) { + while (buffer_start + 4 < bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; + triple = scalar::utf32::swap_bytes(triple); + std::memcpy(dst, &triple, 4); + + dst += 3; + buffer_start += 4; + } + if (buffer_start + 4 <= bufferptr) { + uint32_t triple = ((uint32_t(uint8_t(buffer_start[0])) << 3 * 6) + + (uint32_t(uint8_t(buffer_start[1])) << 2 * 6) + + (uint32_t(uint8_t(buffer_start[2])) << 1 * 6) + + (uint32_t(uint8_t(buffer_start[3])) << 0 * 6)) + << 8; + triple = scalar::utf32::swap_bytes(triple); + std::memcpy(dst, &triple, 3); + + dst += 3; + buffer_start += 4; + } + // we may have 1, 2 or 3 bytes left and we need to decode them so let us + // backtrack + int leftover = int(bufferptr - buffer_start); + while (leftover > 0) { + while (to_base64[uint8_t(*(src - 1))] == 64) { + src--; + } + src--; + leftover--; + } + } + if (src < srcend + equalsigns) { + full_result r = scalar::base64::base64_tail_decode( + dst, src, srcend - src, equalsigns, options, last_chunk_options); + r.input_count += size_t(src - srcinit); + if (r.error == error_code::INVALID_BASE64_CHARACTER || + r.error == error_code::BASE64_EXTRA_BITS) { + return r; + } else { + r.output_count += size_t(dst - dstinit); + } + if (last_chunk_options != stop_before_partial && + r.error == error_code::SUCCESS && equalsigns > 0) { + // additional checks + if ((r.output_count % 3 == 0) || + ((r.output_count % 3) + 1 + equalsigns != 4)) { + r.error = error_code::INVALID_BASE64_CHARACTER; + r.input_count = equallocation; + } + } + return r; + } + if (equalsigns > 0) { + if ((size_t(dst - dstinit) % 3 == 0) || + ((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) { + return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)}; + } + } + return {SUCCESS, srclen, size_t(dst - dstinit)}; } +/* end file src/lsx/lsx_base64.cpp */ -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert(buf, len, - utf32_output); +} // namespace +} // namespace lsx +} // namespace simdutf + +/* begin file src/generic/buf_block_reader.h */ +namespace simdutf { +namespace lsx { +namespace { + +// Walks through a buffer in block-sized increments, loading the last part with +// spaces +template struct buf_block_reader { +public: + simdutf_really_inline buf_block_reader(const uint8_t *_buf, size_t _len); + simdutf_really_inline size_t block_index(); + simdutf_really_inline bool has_full_block() const; + 
simdutf_really_inline const uint8_t *full_block() const; + /** + * Get the last block, padded with spaces. + * + * There will always be a last block, with at least 1 byte, unless len == 0 + * (in which case this function fills the buffer with spaces and returns 0. In + * particular, if len == STEP_SIZE there will be 0 full_blocks and 1 remainder + * block with STEP_SIZE bytes and no spaces for padding. + * + * @return the number of effective characters in the last block. + */ + simdutf_really_inline size_t get_remainder(uint8_t *dst) const; + simdutf_really_inline void advance(); + +private: + const uint8_t *buf; + const size_t len; + const size_t lenminusstep; + size_t idx; +}; + +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text_64(const uint8_t *text) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + buf[i] = int8_t(text[i]) < ' ' ? '_' : int8_t(text[i]); + } + buf[sizeof(simd8x64)] = '\0'; + return buf; } -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert(buf, len, - utf32_output); +// Routines to print masks and text for debugging bitmask operations +simdutf_unused static char *format_input_text(const simd8x64 &in) { + static char *buf = + reinterpret_cast(malloc(sizeof(simd8x64) + 1)); + in.store(reinterpret_cast(buf)); + for (size_t i = 0; i < sizeof(simd8x64); i++) { + if (buf[i] < ' ') { + buf[i] = '_'; + } + } + buf[sizeof(simd8x64)] = '\0'; + return buf; } -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_with_errors( - buf, len, utf32_output); +simdutf_unused static char *format_mask(uint64_t mask) { + static char *buf = reinterpret_cast(malloc(64 + 1)); + for (size_t i = 0; i < 64; i++) { + buf[i] = (mask & (size_t(1) << i)) ? 'X' : ' '; + } + buf[64] = '\0'; + return buf; } -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_with_errors( - buf, len, utf32_output); -} +template +simdutf_really_inline +buf_block_reader::buf_block_reader(const uint8_t *_buf, size_t _len) + : buf{_buf}, len{_len}, lenminusstep{len < STEP_SIZE ? 
0 : len - STEP_SIZE}, + idx{0} {} -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_valid( - buf, len, utf32_output); +template +simdutf_really_inline size_t buf_block_reader::block_index() { + return idx; } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( - const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { - return scalar::utf16_to_utf32::convert_valid(buf, len, - utf32_output); +template +simdutf_really_inline bool buf_block_reader::has_full_block() const { + return idx < lenminusstep; } -void implementation::change_endianness_utf16(const char16_t *input, - size_t length, - char16_t *output) const noexcept { - scalar::utf16::change_endianness_utf16(input, length, output); +template +simdutf_really_inline const uint8_t * +buf_block_reader::full_block() const { + return &buf[idx]; } -simdutf_warn_unused size_t implementation::count_utf16le( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::count_code_points(input, length); +template +simdutf_really_inline size_t +buf_block_reader::get_remainder(uint8_t *dst) const { + if (len == idx) { + return 0; + } // memcpy(dst, null, 0) will trigger an error with some sanitizers + std::memset(dst, 0x20, + STEP_SIZE); // std::memset STEP_SIZE because it is more efficient + // to write out 8 or 16 bytes at once. + std::memcpy(dst, buf + idx, len - idx); + return len - idx; } -simdutf_warn_unused size_t implementation::count_utf16be( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::count_code_points(input, length); +template +simdutf_really_inline void buf_block_reader::advance() { + idx += STEP_SIZE; } -simdutf_warn_unused size_t -implementation::count_utf8(const char *input, size_t length) const noexcept { - return utf8::count_code_points(input, length); -} +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/buf_block_reader.h */ +/* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +namespace simdutf { +namespace lsx { +namespace { +namespace utf8_validation { -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::utf8_length_from_utf16(input, - length); -} +using namespace simd; -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::utf8_length_from_utf16(input, length); -} +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 
1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::utf32_length_from_utf16(input, - length); -} + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( - const char16_t *input, size_t length) const noexcept { - return scalar::utf16::utf32_length_from_utf16(input, length); -} + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, -simdutf_warn_unused size_t implementation::utf16_length_from_utf8( - const char *input, size_t length) const noexcept { - return scalar::utf8::utf16_length_from_utf8(input, length); -} + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, -simdutf_warn_unused size_t implementation::utf8_length_from_utf32( - const char32_t *input, size_t length) const noexcept { - return scalar::utf32::utf8_length_from_utf32(input, length); -} + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, -simdutf_warn_unused size_t implementation::utf16_length_from_utf32( - const char32_t *input, size_t length) const noexcept { - return scalar::utf32::utf16_length_from_utf32(input, length); + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); } - -simdutf_warn_unused size_t implementation::utf32_length_from_utf8( - const char *input, size_t length) const noexcept { - return scalar::utf8::count_code_points(input, length); +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 
prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; } -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( - const char *input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); +// +// Return nonzero if there are incomplete multibyte characters at the end of the +// block: e.g. if there is a 4-byte character, but it is 3 bytes from the end. +// +simdutf_really_inline simd8 is_incomplete(const simd8 input) { + // If the previous input's last 3 bytes match this, they're too short (they + // ended at EOF): + // ... 1111____ 111_____ 11______ + static const uint8_t max_array[32] = {255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 0b11110000u - 1, + 0b11100000u - 1, + 0b11000000u - 1}; + const simd8 max_value( + &max_array[sizeof(max_array) - sizeof(simd8)]); + return input.gt_bits(max_value); } -simdutf_warn_unused result implementation::base64_to_binary( - const char *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept { - // skip trailing spaces - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; +struct utf8_checker { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + // The last input we received + simd8 prev_input_block; + // Whether the last input we received was incomplete (used for ASCII fast + // path) + simd8 prev_incomplete; + + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); } - size_t equallocation = - length; // location of the first padding character if any - size_t equalsigns = 0; - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - length -= 1; - equalsigns++; - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - equalsigns++; - length -= 1; - } + + // The only problem that can happen at EOF is that a multibyte character is + // too short or a byte value too large in the last bytes: check_special_cases + // only checks for bytes too large in the first of two bytes. + simdutf_really_inline void check_eof() { + // If the previous block had incomplete UTF-8 characters at the end, an + // ASCII block can't possibly finish them. + this->error |= this->prev_incomplete; } - if (length == 0) { - if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation}; + + simdutf_really_inline void check_next_input(const simd8x64 &input) { + if (simdutf_likely(is_ascii(input))) { + this->error |= this->prev_incomplete; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. 
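// The is_incomplete() check above flags a block whose trailing bytes begin a
// multi-byte sequence that cannot finish inside the block: a 2-byte lead
// (>= 0xC0) in the last position, a 3-byte lead (>= 0xE0) in the second-to-last,
// or a 4-byte lead (>= 0xF0) in the third-to-last. A scalar sketch of the same
// test; illustrative only, not part of the simdutf sources.
static inline bool scalar_is_incomplete(const uint8_t *block, size_t len) {
  return (len >= 1 && block[len - 1] >= 0xC0) ||
         (len >= 2 && block[len - 2] >= 0xE0) ||
         (len >= 3 && block[len - 3] >= 0xF0);
}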
+ static_assert((simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], this->prev_input_block); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + this->prev_incomplete = + is_incomplete(input.chunks[simd8x64::NUM_CHUNKS - 1]); + this->prev_input_block = input.chunks[simd8x64::NUM_CHUNKS - 1]; } - return {SUCCESS, 0}; } - result r = scalar::base64::base64_tail_decode( - output, input, length, equalsigns, options, last_chunk_options); - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } + + // do not forget to call check_eof! + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); } - return r; + +}; // struct utf8_checker +} // namespace utf8_validation + +using utf8_validation::utf8_checker; + +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ +/* begin file src/generic/utf8_validation/utf8_validator.h */ +namespace simdutf { +namespace lsx { +namespace { +namespace utf8_validation { + +/** + * Validates that the string is actual UTF-8. + */ +template +bool generic_validate_utf8(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + return !c.errors(); } -simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( - const char16_t *input, size_t length) const noexcept { - return scalar::base64::maximal_binary_length_from_base64(input, length); +bool generic_validate_utf8(const char *input, size_t length) { + return generic_validate_utf8( + reinterpret_cast(input), length); } -simdutf_warn_unused result implementation::base64_to_binary( - const char16_t *input, size_t length, char *output, base64_options options, - last_chunk_handling_options last_chunk_options) const noexcept { - // skip trailing spaces - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = - length; // location of the first padding character if any - size_t equalsigns = 0; - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - length -= 1; - equalsigns++; - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - equalsigns++; - length -= 1; - } - } - if (length == 0) { - if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation}; +/** + * Validates that the string is actual UTF-8 and stops on errors. 
+ */ +template +result generic_validate_utf8_with_errors(const uint8_t *input, size_t length) { + checker c{}; + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + c.check_next_input(in); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input + count), length - count); + res.count += count; + return res; } - return {SUCCESS, 0}; + reader.advance(); + count += 64; } - result r = scalar::base64::base64_tail_decode( - output, input, length, equalsigns, options, last_chunk_options); - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + c.check_next_input(in); + reader.advance(); + c.check_eof(); + if (c.errors()) { + if (count != 0) { + count--; + } // Sometimes the error is only detected in the next chunk + result res = scalar::utf8::rewind_and_validate_with_errors( + reinterpret_cast(input), + reinterpret_cast(input) + count, length - count); + res.count += count; + return res; + } else { + return result(error_code::SUCCESS, length); } - return r; } -simdutf_warn_unused size_t implementation::base64_length_from_binary( - size_t length, base64_options options) const noexcept { - return scalar::base64::base64_length_from_binary(length, options); +result generic_validate_utf8_with_errors(const char *input, size_t length) { + return generic_validate_utf8_with_errors( + reinterpret_cast(input), length); } -size_t implementation::binary_to_base64(const char *input, size_t length, - char *output, - base64_options options) const noexcept { - return scalar::base64::binary_to_base64(input, length, output, options); +template +bool generic_validate_ascii(const uint8_t *input, size_t length) { + buf_block_reader<64> reader(input, length); + uint8_t blocks[64]{}; + simd::simd8x64 running_or(blocks); + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + running_or |= in; + reader.advance(); + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + running_or |= in; + return running_or.is_ascii(); } -} // namespace ppc64 -} // namespace simdutf - -/* begin file src/simdutf/ppc64/end.h */ -/* end file src/simdutf/ppc64/end.h */ -/* end file src/ppc64/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_RVV -/* begin file src/rvv/implementation.cpp */ - - - +bool generic_validate_ascii(const char *input, size_t length) { + return generic_validate_ascii( + reinterpret_cast(input), length); +} -/* begin file src/simdutf/rvv/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "rvv" -// #define SIMDUTF_IMPLEMENTATION rvv +template +result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) { + buf_block_reader<64> reader(input, length); + size_t count{0}; + while (reader.has_full_block()) { + simd::simd8x64 in(reader.full_block()); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } + reader.advance(); -#if SIMDUTF_CAN_ALWAYS_RUN_RVV -// nothing needed. 
-#else -SIMDUTF_TARGET_RVV -#endif -/* end file src/simdutf/rvv/begin.h */ -namespace simdutf { -namespace rvv { -namespace { -#ifndef SIMDUTF_RVV_H - #error "rvv.h must be included" -#endif + count += 64; + } + uint8_t block[64]{}; + reader.get_remainder(block); + simd::simd8x64 in(block); + if (!in.is_ascii()) { + result res = scalar::ascii::validate_with_errors( + reinterpret_cast(input + count), length - count); + return result(res.error, count + res.count); + } else { + return result(error_code::SUCCESS, length); + } +} + +result generic_validate_ascii_with_errors(const char *input, size_t length) { + return generic_validate_ascii_with_errors( + reinterpret_cast(input), length); +} +} // namespace utf8_validation } // unnamed namespace -} // namespace rvv +} // namespace lsx } // namespace simdutf +/* end file src/generic/utf8_validation/utf8_validator.h */ + +// transcoding from UTF-8 to Latin 1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ -// -// Implementation-specific overrides -// namespace simdutf { -namespace rvv { -/* begin file src/rvv/rvv_helpers.inl.cpp */ -template -simdutf_really_inline static size_t -rvv_utf32_store_utf16_m4(uint16_t *dst, vuint32m4_t utf32, size_t vl, - vbool4_t m4even) { - /* convert [000000000000aaaa|aaaaaabbbbbbbbbb] - * to [110111bbbbbbbbbb|110110aaaaaaaaaa] */ - vuint32m4_t sur = __riscv_vsub_vx_u32m4(utf32, 0x10000, vl); - sur = __riscv_vor_vv_u32m4(__riscv_vsll_vx_u32m4(sur, 16, vl), - __riscv_vsrl_vx_u32m4(sur, 10, vl), vl); - sur = __riscv_vand_vx_u32m4(sur, 0x3FF03FF, vl); - sur = __riscv_vor_vx_u32m4(sur, 0xDC00D800, vl); - /* merge 1 byte utf32 and 2 byte sur */ - vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(utf32, 0xFFFF, vl); - vuint16m4_t utf32_16 = __riscv_vreinterpret_v_u32m4_u16m4( - __riscv_vmerge_vvm_u32m4(utf32, sur, m4, vl)); - /* compress and store */ - vbool4_t mOut = __riscv_vmor_mm_b4( - __riscv_vmsne_vx_u16m4_b4(utf32_16, 0, vl * 2), m4even, vl * 2); - vuint16m4_t vout = __riscv_vcompress_vm_u16m4(utf32_16, mOut, vl * 2); - vl = __riscv_vcpop_m_b4(mOut, vl * 2); - __riscv_vse16_v_u16m4(dst, simdutf_byteflip(vout, vl), vl); - return vl; -}; -/* end file src/rvv/rvv_helpers.inl.cpp */ +namespace lsx { +namespace { +namespace utf8_to_latin1 { +using namespace simd; -/* begin file src/rvv/rvv_length_from.inl.cpp */ +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // For UTF-8 to Latin 1, we can allow any ASCII character, and any + // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or + // 0b11000010 and nothing else. 
+ // + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; -simdutf_warn_unused size_t -implementation::count_utf16le(const char16_t *src, size_t len) const noexcept { - return utf32_length_from_utf16le(src, len); -} + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + FORBIDDEN, + // 1110____ ________ + FORBIDDEN, + // 1111____ ________ + FORBIDDEN); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, -simdutf_warn_unused size_t -implementation::count_utf16be(const char16_t *src, size_t len) const noexcept { - return utf32_length_from_utf16be(src, len); -} + // ____0100 ________ + FORBIDDEN, + // ____0101 ________ + FORBIDDEN, + // ____011_ ________ + FORBIDDEN, FORBIDDEN, -simdutf_warn_unused size_t -implementation::count_utf8(const char *src, size_t len) const noexcept { - return utf32_length_from_utf8(src, len); -} + // ____1___ ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, + // ____1101 ________ + FORBIDDEN, FORBIDDEN, FORBIDDEN); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, -simdutf_warn_unused size_t implementation::latin1_length_from_utf8( - const char *src, size_t len) const noexcept { - return utf32_length_from_utf8(src, len); -} + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, -simdutf_warn_unused size_t -implementation::latin1_length_from_utf16(size_t len) const noexcept { - return len; + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); } -simdutf_warn_unused size_t -implementation::latin1_length_from_utf32(size_t len) const noexcept 
{ - return len; -} +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; -simdutf_warn_unused size_t -implementation::utf16_length_from_latin1(size_t len) const noexcept { - return len; -} + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. + // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + this->error |= check_special_cases(input, prev1); + } -simdutf_warn_unused size_t -implementation::utf32_length_from_latin1(size_t len) const noexcept { - return len; -} + simdutf_really_inline size_t convert(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 16; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. 
If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = + scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + if (howmany == 0) { + return 0; + } + latin1_output += howmany; + } + return latin1_output - start; + } -simdutf_warn_unused size_t implementation::utf32_length_from_utf8( - const char *src, size_t len) const noexcept { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e8m8(len); - vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); - vbool1_t mask = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); - count += __riscv_vcpop_m_b1(mask, vl); + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. 
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. 
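// The bit tricks used above classify bytes: a UTF-8 continuation byte has the
// form 10xxxxxx, i.e. its value as a signed 8-bit integer is less than -64,
// which is what input.lt(-65 + 1) tests; complementing and shifting right by
// one marks the positions where a code point ends. A 64-bit scalar sketch of
// the same mask computation; illustrative only, not part of the simdutf
// sources.
static inline uint64_t scalar_end_of_code_point_mask(const uint8_t block[64]) {
  uint64_t continuation = 0;
  for (size_t i = 0; i < 64; i++) {
    if ((block[i] & 0xC0) == 0x80) { // 10xxxxxx: continuation byte
      continuation |= uint64_t(1) << i;
    }
  }
  uint64_t leading = ~continuation; // ASCII or lead byte
  return leading >> 1; // bit i set: byte i is the last byte of a code point
                       // (relative to this 64-byte block)
}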
+ result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + latin1_output += res.count; + } + } + return result(error_code::SUCCESS, latin1_output - start); } - return count; -} -template -simdutf_really_inline static size_t -rvv_utf32_length_from_utf16(const char16_t *src, size_t len) { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); - v = simdutf_byteflip(v, vl); - vbool2_t notHigh = - __riscv_vmor_mm_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl), - __riscv_vmsltu_vx_u16m8_b2(v, 0xDC00, vl), vl); - count += __riscv_vcpop_m_b2(notHigh, vl); + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); } - return count; -} -simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( - const char16_t *src, size_t len) const noexcept { - return rvv_utf32_length_from_utf16(src, len); -} +}; // struct utf8_checker +} // namespace utf8_to_latin1 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ -simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( - const char16_t *src, size_t len) const noexcept { - if (supports_zvbb()) - return rvv_utf32_length_from_utf16(src, len); - else - return rvv_utf32_length_from_utf16(src, len); -} +namespace simdutf { +namespace lsx { +namespace { +namespace utf8_to_latin1 { +using namespace simd; -simdutf_warn_unused size_t implementation::utf8_length_from_latin1( - const char *src, size_t len) const noexcept { - size_t count = len; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e8m8(len); - vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); - count += __riscv_vcpop_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl); +simdutf_really_inline size_t convert_valid(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last + // 16 bytes, and if the data is valid, then it is entirely safe because 16 + // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally + // assume that you have valid UTF-8 input, so we are going to go back from the + // end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... 
} - return count; -} - -template -simdutf_really_inline static size_t -rvv_utf8_length_from_utf16(const char16_t *src, size_t len) { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); - v = simdutf_byteflip(v, vl); - vbool2_t m234 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7F, vl); - vbool2_t m34 = __riscv_vmsgtu_vx_u16m8_b2(v, 0x7FF, vl); - vbool2_t notSur = - __riscv_vmor_mm_b2(__riscv_vmsltu_vx_u16m8_b2(v, 0xD800, vl), - __riscv_vmsgtu_vx_u16m8_b2(v, 0xDFFF, vl), vl); - vbool2_t m3 = __riscv_vmand_mm_b2(m34, notSur, vl); - count += vl + __riscv_vcpop_m_b2(m234, vl) + __riscv_vcpop_m_b2(m3, vl); + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
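The 80% figure quoted in the comments follows from the block geometry: of each 64-byte block, at most 12 trailing bytes are left for the next iteration to re-read, so at least 52 of the 64 bytes loaded are converted exactly once. A one-line check of that arithmetic (illustrative only):

constexpr double worst_case_block_efficiency = (64.0 - 12.0) / 64.0; // 0.8125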
+ } } - return count; -} - -simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( - const char16_t *src, size_t len) const noexcept { - return rvv_utf8_length_from_utf16(src, len); + if (pos < size) { + size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, + latin1_output); + latin1_output += howmany; + } + return latin1_output - start; } -simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( - const char16_t *src, size_t len) const noexcept { - if (supports_zvbb()) - return rvv_utf8_length_from_utf16(src, len); - else - return rvv_utf8_length_from_utf16(src, len); -} +} // namespace utf8_to_latin1 +} // namespace +} // namespace lsx +} // namespace simdutf + // namespace simdutf +/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +// transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -simdutf_warn_unused size_t implementation::utf8_length_from_utf32( - const char32_t *src, size_t len) const noexcept { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e32m8(len); - vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); - vbool4_t m234 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7F, vl); - vbool4_t m34 = __riscv_vmsgtu_vx_u32m8_b4(v, 0x7FF, vl); - vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl); - count += vl + __riscv_vcpop_m_b4(m234, vl) + __riscv_vcpop_m_b4(m34, vl) + - __riscv_vcpop_m_b4(m4, vl); - } - return count; -} +namespace simdutf { +namespace lsx { +namespace { +namespace utf8_to_utf16 { -simdutf_warn_unused size_t implementation::utf16_length_from_utf8( - const char *src, size_t len) const noexcept { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e8m8(len); - vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); - vbool1_t m1234 = __riscv_vmsgt_vx_i8m8_b1(v, -65, vl); - vbool1_t m4 = __riscv_vmsgtu_vx_u8m8_b1(__riscv_vreinterpret_u8m8(v), - (uint8_t)0b11101111, vl); - count += __riscv_vcpop_m_b1(m1234, vl) + __riscv_vcpop_m_b1(m4, vl); - } - return count; -} +using namespace simd; -simdutf_warn_unused size_t implementation::utf16_length_from_utf32( - const char32_t *src, size_t len) const noexcept { - size_t count = 0; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e32m8(len); - vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); - vbool4_t m4 = __riscv_vmsgtu_vx_u32m8_b4(v, 0xFFFF, vl); - count += vl + __riscv_vcpop_m_b4(m4, vl); +template +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char16_t *utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the + // generic directory. + size_t pos = 0; + char16_t *start{utf16_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the + // mask far more than 64 bytes. + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // Slow path. We hope that the compiler will recognize that this is a slow + // path. Anything that is not a continuation mask is a 'leading byte', + // that is, the start of a new code point. 
+ uint64_t utf8_continuation_mask = in.lt(-65 + 1); + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* + // of code points. + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16( + input + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
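The iteration-count claims in the comments above can be checked arithmetically: the inner loop must advance `pos` past max_starting_point = pos + 52, consuming at most 12 bytes per slow-path step, or up to 16 bytes per step on the ASCII fast path (illustrative check, not part of the vendored source):

static_assert((52 + 12 - 1) / 12 == 5, "slow path: at least five iterations");
static_assert((52 + 16 - 1) / 16 == 4, "16-byte fast path: at least four iterations");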
+ } } - return count; + utf16_output += scalar::utf8_to_utf16::convert_valid( + input + pos, size - pos, utf16_output); + return utf16_output - start; } -/* end file src/rvv/rvv_length_from.inl.cpp */ -/* begin file src/rvv/rvv_validate.inl.cpp */ +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -simdutf_warn_unused bool -implementation::validate_ascii(const char *src, size_t len) const noexcept { - size_t vlmax = __riscv_vsetvlmax_e8m8(); - vint8m8_t mask = __riscv_vmv_v_x_i8m8(0, vlmax); - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e8m8(len); - vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); - mask = __riscv_vor_vv_i8m8_tu(mask, mask, v, vl); - } - return __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(mask, 0, vlmax), vlmax) < - 0; -} +namespace simdutf { +namespace lsx { +namespace { +namespace utf8_to_utf16 { +using namespace simd; -simdutf_warn_unused result implementation::validate_ascii_with_errors( - const char *src, size_t len) const noexcept { - const char *beg = src; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e8m8(len); - vint8m8_t v = __riscv_vle8_v_i8m8((int8_t *)src, vl); - long idx = __riscv_vfirst_m_b1(__riscv_vmslt_vx_i8m8_b1(v, 0, vl), vl); - if (idx >= 0) - return result(error_code::TOO_LARGE, src - beg + idx); - } - return result(error_code::SUCCESS, src - beg); -} +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ -/* Returns a close estimation of the number of valid UTF-8 bytes up to the - * first invalid one, but never overestimating. */ -simdutf_really_inline static size_t rvv_count_valid_utf8(const char *src, - size_t len) { - const char *beg = src; - if (len < 32) - return 0; + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
+ const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, - /* validate first three bytes */ - { - size_t idx = 3; - while (idx < len && (src[idx] >> 6) == 0b10) - ++idx; - if (idx > 3 + 3 || !scalar::utf8::validate(src, idx)) - return 0; - } + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, - static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080}; - static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB}; - static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6}; + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - const vuint8m1_t err1tbl = - __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2)); - const vuint8m1_t err2tbl = - __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2)); - const vuint8m1_t err3tbl = - __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2)); + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, - size_t tail = 3; - size_t n = len - tail; + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); +} +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} - for (size_t vl; n > 0; n -= vl, src += vl) { - vl = __riscv_vsetvl_e8m4(n); - vuint8m4_t v0 = __riscv_vle8_v_u8m4((uint8_t const *)src, vl); +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; - uint8_t next0 = src[vl + 0]; - uint8_t next1 = src[vl + 1]; - uint8_t next2 = src[vl + 2]; + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. 
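A worked example of the three-way lookup in check_special_cases, using the table entries above for two concrete byte pairs (a minimal standalone sketch, not the vendored tables themselves): a pair is rejected exactly when at least one error bit survives the AND of all three lookups.

#include <cstdint>

namespace lookup_sketch {
constexpr uint8_t TOO_SHORT = 1 << 0, TOO_LONG = 1 << 1, OVERLONG_3 = 1 << 2,
                  TOO_LARGE = 1 << 3, SURROGATE = 1 << 4, OVERLONG_2 = 1 << 5,
                  TOO_LARGE_1000 = 1 << 6, OVERLONG_4 = 1 << 6,
                  TWO_CONTS = 1 << 7;
constexpr uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS;
// Entries copied from byte_1_high, byte_1_low and byte_2_high above,
// for the nibble indices exercised by the two examples below.
constexpr uint8_t b1_high_0xC = TOO_SHORT | OVERLONG_2;                  // 1100____
constexpr uint8_t b1_low_0x0 = CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4;
constexpr uint8_t b1_low_0x3 = CARRY;
constexpr uint8_t b2_high_0x8 = TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 |
                                TOO_LARGE_1000 | OVERLONG_4;             // 1000____
constexpr uint8_t b2_high_0xA = TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE |
                                TOO_LARGE;                               // 101_____
// Overlong two-byte sequence C0 80: OVERLONG_2 survives all three lookups.
static_assert((b1_high_0xC & b1_low_0x0 & b2_high_0x8) != 0, "C0 80 rejected");
// Valid two-byte sequence C3 A9 ("é"): no error bit survives.
static_assert((b1_high_0xC & b1_low_0x3 & b2_high_0xA) == 0, "C3 A9 accepted");
} // namespace lookup_sketch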
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); + } - /* fast path: ASCII */ - if (__riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u8m4_b2(v0, 0b01111111, vl), vl) < - 0 && - (next0 | next1 | next2) < 0b10000000) - continue; + template + simdutf_really_inline size_t convert(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. 
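The manual two- or four-chunk unrolling above is the logical equivalent of a loop that threads each chunk's predecessor into the check (with a zero vector standing in for the chunk before the first one); a scalar sketch of that chaining, with a hypothetical check callback:

#include <cstdint>

// Hypothetical scalar outline of the chunk chaining used by check_utf8_bytes:
// prev<1>/prev<2>/prev<3> need the tail of the previous 16-byte chunk, so each
// chunk is validated together with its predecessor.
template <typename CheckFn>
static void check_chunks_sketch(const uint8_t chunks[][16], int num_chunks,
                                CheckFn check /* (current, previous) */) {
  static const uint8_t zero_chunk[16] = {0};
  const uint8_t *prev = zero_chunk;      // no predecessor before the block
  for (int i = 0; i < num_chunks; i++) { // the real code unrolls this for MSVC
    check(chunks[i], prev);
    prev = chunks[i];
  }
}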
+ size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + return 0; + } + if (pos < size) { + size_t howmany = scalar::utf8_to_utf16::convert( + in + pos, size - pos, utf16_output); + if (howmany == 0) { + return 0; + } + utf16_output += howmany; + } + return utf16_output - start; + } - /* see "Validating UTF-8 In Less Than One Instruction Per Byte" - * https://arxiv.org/abs/2010.03090 */ - vuint8m4_t v1 = __riscv_vslide1down_vx_u8m4(v0, next0, vl); - vuint8m4_t v2 = __riscv_vslide1down_vx_u8m4(v1, next1, vl); - vuint8m4_t v3 = __riscv_vslide1down_vx_u8m4(v2, next2, vl); + template + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char16_t *utf16_output) { + size_t pos = 0; + char16_t *start{utf16_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. 
+ size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + res.count += pos; + return res; + } + if (pos < size) { + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf16_output += res.count; + } + } + return result(error_code::SUCCESS, utf16_output - start); + } - vuint8m4_t s1 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4( - __riscv_vreinterpret_v_u8m4_u16m4(v2), 4, __riscv_vsetvlmax_e16m4())); - vuint8m4_t s3 = __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vsrl_vx_u16m4( - __riscv_vreinterpret_v_u8m4_u16m4(v3), 4, __riscv_vsetvlmax_e16m4())); + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); + } - vuint8m4_t idx2 = __riscv_vand_vx_u8m4(v2, 0xF, vl); - vuint8m4_t idx1 = __riscv_vand_vx_u8m4(s1, 0xF, vl); - vuint8m4_t idx3 = __riscv_vand_vx_u8m4(s3, 0xF, vl); +}; // struct utf8_checker +} // namespace utf8_to_utf16 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// transcoding from UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ - vuint8m4_t err1 = simdutf_vrgather_u8m1x4(err1tbl, idx1); - vuint8m4_t err2 = simdutf_vrgather_u8m1x4(err2tbl, idx2); - vuint8m4_t err3 = simdutf_vrgather_u8m1x4(err3tbl, idx3); - vint8m4_t errs = __riscv_vreinterpret_v_u8m4_i8m4( - __riscv_vand_vv_u8m4(__riscv_vand_vv_u8m4(err1, err2, vl), err3, vl)); +namespace simdutf { +namespace lsx { +namespace { +namespace utf8_to_utf32 { - vbool2_t is_3 = __riscv_vmsgtu_vx_u8m4_b2(v1, 0b11100000 - 1, vl); - vbool2_t is_4 = __riscv_vmsgtu_vx_u8m4_b2(v0, 0b11110000 - 1, vl); - vbool2_t is_34 = __riscv_vmor_mm_b2(is_3, is_4, vl); - vbool2_t err34 = - __riscv_vmxor_mm_b2(is_34, __riscv_vmslt_vx_i8m4_b2(errs, 0, vl), vl); - vbool2_t errm = - 
__riscv_vmor_mm_b2(__riscv_vmsgt_vx_i8m4_b2(errs, 0, vl), err34, vl); - if (__riscv_vfirst_m_b2(errm, vl) >= 0) - break; - } +using namespace simd; - /* we need to validate the last character */ - while (tail < len && (src[0] >> 6) == 0b10) - --src, ++tail; - return src - beg; +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char32_t *utf32_output) noexcept { + size_t pos = 0; + char32_t *start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + size_t max_starting_point = (pos + 64) - 12; + while (pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32( + input + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + } + } + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, + utf32_output); + return utf32_output - start; } -simdutf_warn_unused bool -implementation::validate_utf8(const char *src, size_t len) const noexcept { - size_t count = rvv_count_valid_utf8(src, len); - return scalar::utf8::validate(src + count, len - count); -} +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -simdutf_warn_unused result implementation::validate_utf8_with_errors( - const char *src, size_t len) const noexcept { - size_t count = rvv_count_valid_utf8(src, len); - result res = scalar::utf8::validate_with_errors(src + count, len - count); - return result(res.error, count + res.count); -} +namespace simdutf { +namespace lsx { +namespace { +namespace utf8_to_utf32 { +using namespace simd; -simdutf_warn_unused bool -implementation::validate_utf16le(const char16_t *src, - size_t len) const noexcept { - return validate_utf16le_with_errors(src, len).error == error_code::SUCCESS; -} +simdutf_really_inline simd8 +check_special_cases(const simd8 input, const simd8 prev1) { + // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) + // Bit 1 = Too Long (ASCII followed by continuation) + // Bit 2 = Overlong 3-byte + // Bit 4 = Surrogate + // Bit 5 = Overlong 2-byte + // Bit 7 = Two Continuations + constexpr const uint8_t TOO_SHORT = 1 << 0; // 11______ 0_______ + // 11______ 11______ + constexpr const uint8_t TOO_LONG = 1 << 1; // 0_______ 10______ + constexpr const uint8_t OVERLONG_3 = 1 << 2; // 11100000 100_____ + constexpr const uint8_t SURROGATE = 1 << 4; // 11101101 101_____ + constexpr const uint8_t OVERLONG_2 = 1 << 5; // 1100000_ 10______ + constexpr const uint8_t TWO_CONTS = 1 << 7; // 10______ 10______ + constexpr const uint8_t TOO_LARGE = 1 << 3; // 11110100 1001____ + // 11110100 101_____ + // 11110101 1001____ + // 11110101 101_____ + // 1111011_ 1001____ + // 1111011_ 101_____ + // 11111___ 1001____ + // 11111___ 101_____ + constexpr const uint8_t TOO_LARGE_1000 = 1 << 6; + // 11110101 1000____ + // 1111011_ 1000____ + // 11111___ 1000____ + constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ 
-simdutf_warn_unused bool -implementation::validate_utf16be(const char16_t *src, - size_t len) const noexcept { - return validate_utf16be_with_errors(src, len).error == error_code::SUCCESS; -} + const simd8 byte_1_high = prev1.shr<4>().lookup_16( + // 0_______ ________ + TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, + TOO_LONG, + // 10______ ________ + TWO_CONTS, TWO_CONTS, TWO_CONTS, TWO_CONTS, + // 1100____ ________ + TOO_SHORT | OVERLONG_2, + // 1101____ ________ + TOO_SHORT, + // 1110____ ________ + TOO_SHORT | OVERLONG_3 | SURROGATE, + // 1111____ ________ + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + constexpr const uint8_t CARRY = + TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . + const simd8 byte_1_low = + (prev1 & 0x0F) + .lookup_16( + // ____0000 ________ + CARRY | OVERLONG_3 | OVERLONG_2 | OVERLONG_4, + // ____0001 ________ + CARRY | OVERLONG_2, + // ____001_ ________ + CARRY, CARRY, -template -simdutf_really_inline static result -rvv_validate_utf16_with_errors(const char16_t *src, size_t len) { - const char16_t *beg = src; - uint16_t last = 0; - for (size_t vl; len > 0; - len -= vl, src += vl, last = simdutf_byteflip(src[-1])) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v1 = __riscv_vle16_v_u16m8((const uint16_t *)src, vl); - v1 = simdutf_byteflip(v1, vl); - vuint16m8_t v0 = __riscv_vslide1up_vx_u16m8(v1, last, vl); + // ____0100 ________ + CARRY | TOO_LARGE, + // ____0101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____011_ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, - vbool2_t surhi = __riscv_vmseq_vx_u16m8_b2( - __riscv_vand_vx_u16m8(v0, 0xFC00, vl), 0xD800, vl); - vbool2_t surlo = __riscv_vmseq_vx_u16m8_b2( - __riscv_vand_vx_u16m8(v1, 0xFC00, vl), 0xDC00, vl); + // ____1___ ________ + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + // ____1101 ________ + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); + const simd8 byte_2_high = input.shr<4>().lookup_16( + // ________ 0_______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, + TOO_SHORT, TOO_SHORT, - long idx = __riscv_vfirst_m_b2(__riscv_vmxor_mm_b2(surhi, surlo, vl), vl); - if (idx >= 0) { - last = idx > 0 ? 
simdutf_byteflip(src[idx - 1]) : last; - return result(error_code::SURROGATE, - src - beg + idx - (last - 0xD800u < 0x400u)); - break; - } - } - if (last - 0xD800u < 0x400u) { - return result(error_code::SURROGATE, - src - beg - 1); /* end on high surrogate */ - } else { - return result(error_code::SUCCESS, src - beg); - } -} + // ________ 1000____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE_1000 | + OVERLONG_4, + // ________ 1001____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | OVERLONG_3 | TOO_LARGE, + // ________ 101_____ + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, + TOO_LONG | OVERLONG_2 | TWO_CONTS | SURROGATE | TOO_LARGE, -simdutf_warn_unused result implementation::validate_utf16le_with_errors( - const char16_t *src, size_t len) const noexcept { - return rvv_validate_utf16_with_errors(src, len); + // ________ 11______ + TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); + return (byte_1_high & byte_1_low & byte_2_high); } - -simdutf_warn_unused result implementation::validate_utf16be_with_errors( - const char16_t *src, size_t len) const noexcept { - if (supports_zvbb()) - return rvv_validate_utf16_with_errors(src, len); - else - return rvv_validate_utf16_with_errors(src, len); +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; } -simdutf_warn_unused bool -implementation::validate_utf32(const char32_t *src, size_t len) const noexcept { - size_t vlmax = __riscv_vsetvlmax_e32m8(); - vuint32m8_t max = __riscv_vmv_v_x_u32m8(0x10FFFF, vlmax); - vuint32m8_t maxOff = __riscv_vmv_v_x_u32m8(0xFFFFF7FF, vlmax); - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e32m8(len); - vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); - vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl); - max = __riscv_vmaxu_vv_u32m8_tu(max, max, v, vl); - maxOff = __riscv_vmaxu_vv_u32m8_tu(maxOff, maxOff, off, vl); +struct validating_transcoder { + // If this is nonzero, there has been a UTF-8 error. + simd8 error; + + validating_transcoder() : error(uint8_t(0)) {} + // + // Check whether the current bytes are valid UTF-8. 
+ // + simdutf_really_inline void check_utf8_bytes(const simd8 input, + const simd8 prev_input) { + // Flip prev1...prev3 so we can easily determine if they are 2+, 3+ or 4+ + // lead bytes (2, 3, 4-byte leads become large positive numbers instead of + // small negative numbers) + simd8 prev1 = input.prev<1>(prev_input); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); } - return __riscv_vfirst_m_b4( - __riscv_vmor_mm_b4( - __riscv_vmsne_vx_u32m8_b4(max, 0x10FFFF, vlmax), - __riscv_vmsne_vx_u32m8_b4(maxOff, 0xFFFFF7FF, vlmax), vlmax), - vlmax) < 0; -} -simdutf_warn_unused result implementation::validate_utf32_with_errors( - const char32_t *src, size_t len) const noexcept { - const char32_t *beg = src; - for (size_t vl; len > 0; len -= vl, src += vl) { - vl = __riscv_vsetvl_e32m8(len); - vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); - vuint32m8_t off = __riscv_vadd_vx_u32m8(v, 0xFFFF2000, vl); - long idx1 = - __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 0x10FFFF, vl), vl); - long idx2 = __riscv_vfirst_m_b4( - __riscv_vmsgtu_vx_u32m8_b4(off, 0xFFFFF7FF, vl), vl); - if (idx1 >= 0 && idx2 >= 0) { - if (idx1 <= idx2) { - return result(error_code::TOO_LARGE, src - beg + idx1); + simdutf_really_inline size_t convert(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 16 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); + } + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; } else { - return result(error_code::SURROGATE, src - beg + idx2); + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // we have an error + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. 
For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. } } - if (idx1 >= 0) { - return result(error_code::TOO_LARGE, src - beg + idx1); + if (errors()) { + return 0; } - if (idx2 >= 0) { - return result(error_code::SURROGATE, src - beg + idx2); + if (pos < size) { + size_t howmany = + scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + if (howmany == 0) { + return 0; + } + utf32_output += howmany; } + return utf32_output - start; } - return result(error_code::SUCCESS, src - beg); -} -/* end file src/rvv/rvv_validate.inl.cpp */ - -/* begin file src/rvv/rvv_latin1_to.inl.cpp */ -simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( - const char *src, size_t len, char *dst) const noexcept { - char *beg = dst; - for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { - vl = __riscv_vsetvl_e8m2(len); - vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl); - vbool4_t nascii = - __riscv_vmslt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v1), 0, vl); - size_t cnt = __riscv_vcpop_m_b4(nascii, vl); - vlOut = vl + cnt; - if (cnt == 0) { - __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut); - continue; + simdutf_really_inline result convert_with_errors(const char *in, size_t size, + char32_t *utf32_output) { + size_t pos = 0; + char32_t *start{utf32_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the + // last 16 bytes, and if the data is valid, then it is entirely safe because + // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot + // generally assume that you have valid UTF-8 input, so we are going to go + // back from the end counting 8 leading bytes, to give us a good margin. 
+ size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); } - - vuint8m2_t v0 = - __riscv_vor_vx_u8m2(__riscv_vsrl_vx_u8m2(v1, 6, vl), 0b11000000, vl); - v1 = __riscv_vand_vx_u8m2_mu(nascii, v1, v1, 0b10111111, vl); - - vuint8m4_t wide = - __riscv_vreinterpret_v_u16m4_u8m4(__riscv_vwmaccu_vx_u16m4( - __riscv_vwaddu_vv_u16m4(v0, v1, vl), 0xFF, v1, vl)); - vbool2_t mask = __riscv_vmsgtu_vx_u8m4_b2( - __riscv_vsub_vx_u8m4(wide, 0b11000000, vl * 2), 1, vl * 2); - vuint8m4_t comp = __riscv_vcompress_vm_u8m4(wide, mask, vl * 2); - - __riscv_vse8_v_u8m4((uint8_t *)dst, comp, vlOut); - } - return dst - beg; -} - -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( - const char *src, size_t len, char16_t *dst) const noexcept { - char16_t *beg = dst; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e8m4(len); - vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl); - __riscv_vse16_v_u16m8((uint16_t *)dst, __riscv_vzext_vf2_u16m8(v, vl), vl); - } - return dst - beg; -} - -simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( - const char *src, size_t len, char16_t *dst) const noexcept { - char16_t *beg = dst; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e8m4(len); - vuint8m4_t v = __riscv_vle8_v_u8m4((uint8_t *)src, vl); - __riscv_vse16_v_u16m8( - (uint16_t *)dst, - __riscv_vsll_vx_u16m8(__riscv_vzext_vf2_u16m8(v, vl), 8, vl), vl); + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, + // it is not good enough. + static_assert( + (simd8x64::NUM_CHUNKS == 2) || + (simd8x64::NUM_CHUNKS == 4), + "We support either two or four chunks per 64-byte block."); + auto zero = simd8{uint8_t(0)}; + if (simd8x64::NUM_CHUNKS == 2) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + } else if (simd8x64::NUM_CHUNKS == 4) { + this->check_utf8_bytes(input.chunks[0], zero); + this->check_utf8_bytes(input.chunks[1], input.chunks[0]); + this->check_utf8_bytes(input.chunks[2], input.chunks[1]); + this->check_utf8_bytes(input.chunks[3], input.chunks[2]); + } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. 
If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (errors()) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + res.count += pos; + return res; + } + if (pos < size) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); + if (res.error) { // In case of error, we want the error position + res.count += pos; + return res; + } else { // In case of success, we want the number of word written + utf32_output += res.count; + } + } + return result(error_code::SUCCESS, utf32_output - start); } - return dst - beg; -} -simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( - const char *src, size_t len, char32_t *dst) const noexcept { - char32_t *beg = dst; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e8m2(len); - vuint8m2_t v = __riscv_vle8_v_u8m2((uint8_t *)src, vl); - __riscv_vse32_v_u32m8((uint32_t *)dst, __riscv_vzext_vf4_u32m8(v, vl), vl); + simdutf_really_inline bool errors() const { + return this->error.any_bits_set_anywhere(); } - return dst - beg; -} -/* end file src/rvv/rvv_latin1_to.inl.cpp */ -/* begin file src/rvv/rvv_utf16_to.inl.cpp */ -#include -template -simdutf_really_inline static result -rvv_utf16_to_latin1_with_errors(const char16_t *src, size_t len, char *dst) { - const char16_t *const beg = src; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); - v = simdutf_byteflip(v, vl); - long idx = __riscv_vfirst_m_b2(__riscv_vmsgtu_vx_u16m8_b2(v, 255, vl), vl); - if (idx >= 0) - return result(error_code::TOO_LARGE, src - beg + idx); - __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl); - } - return result(error_code::SUCCESS, src - beg); -} +}; // struct utf8_checker +} // namespace utf8_to_utf32 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( - const char16_t *src, size_t len, char *dst) const noexcept { - result res = convert_utf16le_to_latin1_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; -} -simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( - const char16_t *src, size_t len, char *dst) const noexcept { - result res = convert_utf16be_to_latin1_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? 
res.count : 0; -} +// other functions +/* begin file src/generic/utf8.h */ -simdutf_warn_unused result -implementation::convert_utf16le_to_latin1_with_errors( - const char16_t *src, size_t len, char *dst) const noexcept { - return rvv_utf16_to_latin1_with_errors(src, len, dst); -} +namespace simdutf { +namespace lsx { +namespace { +namespace utf8 { -simdutf_warn_unused result -implementation::convert_utf16be_to_latin1_with_errors( - const char16_t *src, size_t len, char *dst) const noexcept { - if (supports_zvbb()) - return rvv_utf16_to_latin1_with_errors(src, len, - dst); - else - return rvv_utf16_to_latin1_with_errors(src, len, dst); -} +using namespace simd; -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( - const char16_t *src, size_t len, char *dst) const noexcept { - const char16_t *const beg = src; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); - __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m4(v, vl), vl); +simdutf_really_inline size_t count_code_points(const char *in, size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); } - return src - beg; + return count + scalar::utf8::count_code_points(in + pos, size - pos); } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( - const char16_t *src, size_t len, char *dst) const noexcept { - const char16_t *const beg = src; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); - __riscv_vse8_v_u8m4((uint8_t *)dst, __riscv_vnsrl_wx_u8m4(v, 8, vl), vl); +simdutf_really_inline size_t utf16_length_from_utf8(const char *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). 
+ count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); } - return src - beg; + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); } +} // namespace utf8 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf8.h */ +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace lsx { +namespace { +namespace utf16 { -template -simdutf_really_inline static result -rvv_utf16_to_utf8_with_errors(const char16_t *src, size_t len, char *dst) { - size_t n = len; - const char16_t *srcBeg = src; - const char *dstBeg = dst; - size_t vl8m4 = __riscv_vsetvlmax_e8m4(); - vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2( - __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4); - - for (size_t vl, vlOut; n > 0;) { - vl = __riscv_vsetvl_e16m2(n); - - vuint16m2_t v = __riscv_vle16_v_u16m2((uint16_t const *)src, vl); - v = simdutf_byteflip(v, vl); - vbool8_t m234 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x80 - 1, vl); - - if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */ - vlOut = vl; - __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(v, vlOut), - vlOut); - n -= vl, src += vl, dst += vlOut; - continue; +template +simdutf_really_inline size_t count_code_points(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input.swap_bytes(); } + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; + } + return count + + scalar::utf16::count_code_points(in + pos, size - pos); +} - vbool8_t m34 = __riscv_vmsgtu_vx_u16m2_b8(v, 0x800 - 1, vl); - - if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */ - /* 0: [ aaa|aabbbbbb] - * 1: [aabbbbbb| ] vsll 8 - * 2: [ | aaaaa] vsrl 6 - * 3: [00111111|00011111] - * 4: [ bbbbbb|000aaaaa] (1|2)&3 - * 5: [11000000|11000000] - * 6: [10bbbbbb|110aaaaa] 4|5 */ - vuint16m2_t twoByte = __riscv_vand_vx_u16m2( - __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(v, 8, vl), - __riscv_vsrl_vx_u16m2(v, 6, vl), vl), - 0b0011111100011111, vl); - vuint16m2_t vout16 = - __riscv_vor_vx_u16m2_mu(m234, v, twoByte, 0b1000000011000000, vl); - vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16); - - /* Every high byte that is zero should be compressed - * low bytes should never be compressed, so we set them - * to all ones, and then create a non-zero bytes mask */ - vbool4_t mcomp = - __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2( - __riscv_vor_vx_u16m2(vout16, 0xFF, vl)), - 0, vl * 2); - vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2); - - vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2); - __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut); - - n -= vl, src += vl, dst += vlOut; - continue; +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
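A scalar sketch of the UTF-16 length computation above (illustrative only): every non-continuation byte starts a code point and contributes one UTF-16 code unit, and every 4-byte lead (>= 0xF0) contributes a second unit for the surrogate pair.

#include <cstddef>
#include <cstdint>

static size_t utf16_length_from_utf8_sketch(const unsigned char *in,
                                            size_t size) {
  size_t count = 0;
  for (size_t i = 0; i < size; i++) {
    count += (in[i] < 0x80 || in[i] >= 0xC0); // lead byte or ASCII: one unit
    count += (in[i] >= 0xF0);                 // 4-byte lead: surrogate pair
  }
  return count;
}
// e.g. "a\xC3\xA9\xE2\x82\xAC\xF0\x9F\x98\x80" ("aé€😀"):
// 4 non-continuation bytes + 1 four-byte lead = 5 UTF-16 code units.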
+ for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input.swap_bytes(); } + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); - vbool8_t sur = __riscv_vmseq_vx_u16m2_b8( - __riscv_vand_vx_u16m2(v, 0xF800, vl), 0xD800, vl); - long first = __riscv_vfirst_m_b8(sur, vl); - size_t tail = vl - first; - vl = first < 0 ? vl : first; - - if (vl > 0) { /* 1/2/3 byte utf8 */ - /* in: [aaaabbbb|bbcccccc] - * v1: [0bcccccc| ] vsll 8 - * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000 - * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000 - * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000 - * v3: [ |1110aaaa] vsrl 12 | 0b11100000 - * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc] - * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc] - * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] - * [10cccccc] - */ - vuint16m2_t v1, v2, v3, v12; - v1 = __riscv_vor_vx_u16m2_mu( - m234, v, __riscv_vand_vx_u16m2(v, 0b00111111, vl), 0b10000000, vl); - v1 = __riscv_vsll_vx_u16m2(v1, 8, vl); - - v2 = __riscv_vor_vx_u16m2( - __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 6, vl), 0b00111111, - vl), - 0b10000000, vl); - v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2, - 0b01000000, vl); - v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(v, 12, vl), 0b11100000, - vl); - v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl); + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + + ascii_count; + } + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); +} - vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl); - vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); - vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123); +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, + size_t size) { + return count_code_points(in, size); +} - vbool2_t mcomp = __riscv_vmor_mm_b2( - m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4); - vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4); +simdutf_really_inline void +change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { + size_t pos = 0; - vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4); - __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut); + while (pos < size / 32 * 32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } - n -= vl, src += vl, dst += vlOut; - } + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} - if (tail) - while (n) { - uint16_t word = simdutf_byteflip(src[0]); - if ((word & 0xFF80) == 0) { - break; - } else if ((word & 0xF800) == 0) { - break; - } else if ((word & 0xF800) != 0xD800) { - break; - } else { - // must be a surrogate pair - if (n <= 1) - return result(error_code::SURROGATE, src - srcBeg); - uint16_t diff = word - 0xD800; - if (diff > 0x3FF) - return result(error_code::SURROGATE, src - srcBeg); - uint16_t diff2 = simdutf_byteflip(src[1]) - 0xDC00; - if (diff2 > 0x3FF) - return result(error_code::SURROGATE, src - srcBeg); +} // namespace 
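The per-block mask arithmetic above amounts to a fixed per-code-unit contribution to the UTF-8 length; a scalar sketch of the same accounting (illustrative only):

#include <cstddef>
#include <cstdint>

static size_t utf8_length_from_utf16_sketch(const uint16_t *in, size_t size) {
  size_t count = 0;
  for (size_t i = 0; i < size; i++) {
    const uint16_t w = in[i];
    if (w <= 0x7F) count += 1;                     // ASCII
    else if (w <= 0x7FF) count += 2;               // two-byte UTF-8
    else if (w < 0xD800 || w > 0xDFFF) count += 3; // other BMP
    else count += 2; // surrogate half: the pair encodes one 4-byte character
  }
  return count;
}
// e.g. {0x0061, 0x00E9, 0x20AC, 0xD83D, 0xDE00} ("aé€😀") -> 1+2+3+2+2 = 10 bytes.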
utf16 +} // unnamed namespace +} // namespace lsx +} // namespace simdutf +/* end file src/generic/utf16.h */ - uint32_t value = ((diff + 0x40) << 10) + diff2; +// +// Implementation-specific overrides +// +namespace simdutf { +namespace lsx { - // will generate four UTF-8 bytes - // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX - *dst++ = (char)((value >> 18) | 0b11110000); - *dst++ = (char)(((value >> 12) & 0b111111) | 0b10000000); - *dst++ = (char)(((value >> 6) & 0b111111) | 0b10000000); - *dst++ = (char)((value & 0b111111) | 0b10000000); - src += 2; - n -= 2; - } - } +simdutf_warn_unused int +implementation::detect_encodings(const char *input, + size_t length) const noexcept { + // If there is a BOM, then we trust it. + auto bom_encoding = simdutf::BOM::check_bom(input, length); + // todo: reimplement as a one-pass algorithm. + if (bom_encoding != encoding_type::unspecified) { + return bom_encoding; } - - return result(error_code::SUCCESS, dst - dstBeg); + int out = 0; + if (validate_utf8(input, length)) { + out |= encoding_type::UTF8; + } + if ((length % 2) == 0) { + if (validate_utf16le(reinterpret_cast(input), + length / 2)) { + out |= encoding_type::UTF16_LE; + } + } + if ((length % 4) == 0) { + if (validate_utf32(reinterpret_cast(input), length / 4)) { + out |= encoding_type::UTF32_LE; + } + } + return out; } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( - const char16_t *src, size_t len, char *dst) const noexcept { - result res = convert_utf16le_to_utf8_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; +simdutf_warn_unused bool +implementation::validate_utf8(const char *buf, size_t len) const noexcept { + return lsx::utf8_validation::generic_validate_utf8(buf, len); } -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( - const char16_t *src, size_t len, char *dst) const noexcept { - result res = convert_utf16be_to_utf8_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; +simdutf_warn_unused result implementation::validate_utf8_with_errors( + const char *buf, size_t len) const noexcept { + return lsx::utf8_validation::generic_validate_utf8_with_errors(buf, len); } -simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( - const char16_t *src, size_t len, char *dst) const noexcept { - return rvv_utf16_to_utf8_with_errors(src, len, dst); +simdutf_warn_unused bool +implementation::validate_ascii(const char *buf, size_t len) const noexcept { + return lsx::utf8_validation::generic_validate_ascii(buf, len); } -simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( - const char16_t *src, size_t len, char *dst) const noexcept { - if (supports_zvbb()) - return rvv_utf16_to_utf8_with_errors(src, len, dst); - else - return rvv_utf16_to_utf8_with_errors(src, len, dst); +simdutf_warn_unused result implementation::validate_ascii_with_errors( + const char *buf, size_t len) const noexcept { + return lsx::utf8_validation::generic_validate_ascii_with_errors(buf, len); } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( - const char16_t *src, size_t len, char *dst) const noexcept { - return convert_utf16le_to_utf8(src, len, dst); +simdutf_warn_unused bool +implementation::validate_utf16le(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. 
+ return true; + } + const char16_t *tail = lsx_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, + len - (tail - buf)); + } else { + return false; + } } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( - const char16_t *src, size_t len, char *dst) const noexcept { - return convert_utf16be_to_utf8(src, len, dst); +simdutf_warn_unused bool +implementation::validate_utf16be(const char16_t *buf, + size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. + return true; + } + const char16_t *tail = lsx_validate_utf16(buf, len); + if (tail) { + return scalar::utf16::validate(tail, len - (tail - buf)); + } else { + return false; + } } -template -simdutf_really_inline static result -rvv_utf16_to_utf32_with_errors(const char16_t *src, size_t len, char32_t *dst) { - const char16_t *const srcBeg = src; - char32_t *const dstBeg = dst; - - constexpr const uint16_t ANY_SURROGATE_MASK = 0xf800; - constexpr const uint16_t ANY_SURROGATE_VALUE = 0xd800; - constexpr const uint16_t LO_SURROGATE_MASK = 0xfc00; - constexpr const uint16_t LO_SURROGATE_VALUE = 0xdc00; - constexpr const uint16_t HI_SURROGATE_MASK = 0xfc00; - constexpr const uint16_t HI_SURROGATE_VALUE = 0xd800; - - uint16_t last = 0; - while (len > 0) { - size_t vl = __riscv_vsetvl_e16m2(len); - vuint16m2_t v0 = __riscv_vle16_v_u16m2((uint16_t const *)src, vl); - v0 = simdutf_byteflip(v0, vl); - - { // check fast-path - const vuint16m2_t v = __riscv_vand_vx_u16m2(v0, ANY_SURROGATE_MASK, vl); - const vbool8_t any_surrogate = - __riscv_vmseq_vx_u16m2_b8(v, ANY_SURROGATE_VALUE, vl); - if (__riscv_vfirst_m_b8(any_surrogate, vl) < 0) { - /* no surrogates */ - __riscv_vse32_v_u32m4((uint32_t *)dst, __riscv_vzext_vf2_u32m4(v0, vl), - vl); - len -= vl; - src += vl; - dst += vl; - continue; - } - } - - if ((simdutf_byteflip(src[0]) & LO_SURROGATE_MASK) == - LO_SURROGATE_VALUE) { - return result(error_code::SURROGATE, src - srcBeg); - } - - // decode surrogates - vuint16m2_t v1 = __riscv_vslide1down_vx_u16m2(v0, 0, vl); - vl = __riscv_vsetvl_e16m2(vl - 1); - if (vl == 0) { - return result(error_code::SURROGATE, src - srcBeg); - } - - const vbool8_t surhi = __riscv_vmseq_vx_u16m2_b8( - __riscv_vand_vx_u16m2(v0, HI_SURROGATE_MASK, vl), HI_SURROGATE_VALUE, - vl); - const vbool8_t surlo = __riscv_vmseq_vx_u16m2_b8( - __riscv_vand_vx_u16m2(v1, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE, - vl); - - // compress everything but lo surrogates - const vbool8_t compress = __riscv_vmsne_vx_u16m2_b8( - __riscv_vand_vx_u16m2(v0, LO_SURROGATE_MASK, vl), LO_SURROGATE_VALUE, - vl); - - { - const vbool8_t diff = __riscv_vmxor_mm_b8(surhi, surlo, vl); - const long idx = __riscv_vfirst_m_b8(diff, vl); - if (idx >= 0) { - uint16_t word = simdutf_byteflip(src[idx]); - if (word < 0xD800 || word > 0xDBFF) { - return result(error_code::SURROGATE, src - srcBeg + idx + 1); - } - return result(error_code::SURROGATE, src - srcBeg + idx); - } - } - - last = simdutf_byteflip(src[vl]); - vuint32m4_t utf32 = __riscv_vzext_vf2_u32m4(v0, vl); - - // v0 = 110110yyyyyyyyyy (0xd800 + yyyyyyyyyy) --- hi surrogate - // v1 = 110111xxxxxxxxxx (0xdc00 + xxxxxxxxxx) --- lo surrogate - - // t0 = u16( 0000_00yy_yyyy_yyyy) - const vuint32m4_t t0 = - __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v0, 0x03ff, vl), vl); - // t1 = u32(0000_0000_0000_yyyy_yyyy_yy00_0000_0000) - const vuint32m4_t t1 = __riscv_vsll_vx_u32m4(t0, 10, vl); - - // t2 = 
u32(0000_0000_0000_0000_0000_00xx_xxxx_xxxx) - const vuint32m4_t t2 = - __riscv_vzext_vf2_u32m4(__riscv_vand_vx_u16m2(v1, 0x03ff, vl), vl); - - // t3 = u32(0000_0000_0000_yyyy_yyyy_yyxx_xxxx_xxxx) - const vuint32m4_t t3 = __riscv_vor_vv_u32m4(t1, t2, vl); +simdutf_warn_unused result implementation::validate_utf16le_with_errors( + const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = lsx_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} - // t4 = utf32 from surrogate pairs - const vuint32m4_t t4 = __riscv_vadd_vx_u32m4(t3, 0x10000, vl); +simdutf_warn_unused result implementation::validate_utf16be_with_errors( + const char16_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = lsx_validate_utf16_with_errors(buf, len); + if (res.count != len) { + result scalar_res = scalar::utf16::validate_with_errors( + buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} - const vuint32m4_t result = __riscv_vmerge_vvm_u32m4(utf32, t4, surhi, vl); +simdutf_warn_unused bool +implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + // empty input is valid. protected the implementation from nullptr. + return true; + } + const char32_t *tail = lsx_validate_utf32le(buf, len); + if (tail) { + return scalar::utf32::validate(tail, len - (tail - buf)); + } else { + return false; + } +} - const vuint32m4_t comp = __riscv_vcompress_vm_u32m4(result, compress, vl); - const size_t vlOut = __riscv_vcpop_m_b8(compress, vl); - __riscv_vse32_v_u32m4((uint32_t *)dst, comp, vlOut); +simdutf_warn_unused result implementation::validate_utf32_with_errors( + const char32_t *buf, size_t len) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = lsx_validate_utf32le_with_errors(buf, len); + if (res.count != len) { + result scalar_res = + scalar::utf32::validate_with_errors(buf + res.count, len - res.count); + return result(scalar_res.error, res.count + scalar_res.count); + } else { + return res; + } +} - len -= vl; - src += vl; - dst += vlOut; +simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( + const char *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + lsx_convert_latin1_to_utf8(buf, len, utf8_output); + size_t converted_chars = ret.second - utf8_output; - if ((last & LO_SURROGATE_MASK) == LO_SURROGATE_VALUE) { - // last item is lo surrogate and got already consumed - len -= 1; - src += 1; - } + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; } - - return result(error_code::SUCCESS, dst - dstBeg); + return converted_chars; } -simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( - const char16_t *src, size_t len, char32_t *dst) const noexcept { - result res = convert_utf16le_to_utf32_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? 
res.count : 0; +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lsx_convert_latin1_to_utf16le(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; } -simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( - const char16_t *src, size_t len, char32_t *dst) const noexcept { - result res = convert_utf16be_to_utf32_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; +simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lsx_convert_latin1_to_utf16be(buf, len, utf16_output); + size_t converted_chars = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = + scalar::latin1_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; } -simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( - const char16_t *src, size_t len, char32_t *dst) const noexcept { - return rvv_utf16_to_utf32_with_errors(src, len, dst); +simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + lsx_convert_latin1_to_utf32(buf, len, utf32_output); + size_t converted_chars = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + converted_chars += scalar_converted_chars; + } + return converted_chars; } -simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( - const char16_t *src, size_t len, char32_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_utf16_to_utf32_with_errors(src, len, - dst); - else - return rvv_utf16_to_utf32_with_errors(src, len, dst); +simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert(buf, len, latin1_output); } -simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( - const char16_t *src, size_t len, char32_t *dst) const noexcept { - return convert_utf16le_to_utf32(src, len, dst); +simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( + const char *buf, size_t len, char *latin1_output) const noexcept { + utf8_to_latin1::validating_transcoder converter; + return converter.convert_with_errors(buf, len, latin1_output); } -simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( - const char16_t *src, size_t len, char32_t *dst) const noexcept { - return convert_utf16be_to_utf32(src, len, dst); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( + const char *buf, size_t len, char *latin1_output) const noexcept { + return lsx::utf8_to_latin1::convert_valid(buf, len, latin1_output); } -/* end file src/rvv/rvv_utf16_to.inl.cpp */ -/* begin file src/rvv/rvv_utf32_to.inl.cpp */ -simdutf_warn_unused size_t 
implementation::convert_utf32_to_latin1( - const char32_t *src, size_t len, char *dst) const noexcept { - result res = convert_utf32_to_latin1_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); } -simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( - const char32_t *src, size_t len, char *dst) const noexcept { - const char32_t *const beg = src; - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e32m8(len); - vuint32m8_t v = __riscv_vle32_v_u32m8((uint32_t *)src, vl); - long idx = __riscv_vfirst_m_b4(__riscv_vmsgtu_vx_u32m8_b4(v, 255, vl), vl); - if (idx >= 0) - return result(error_code::TOO_LARGE, src - beg + idx); - /* We don't use vcompress here, because its performance varies widely on - * current platforms. This might be worth reconsidering once there is more - * hardware available. */ - __riscv_vse8_v_u8m2( - (uint8_t *)dst, - __riscv_vncvt_x_x_w_u8m2(__riscv_vncvt_x_x_w_u16m4(v, vl), vl), vl); - } - return result(error_code::SUCCESS, src - beg); +simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert(buf, len, utf16_output); } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( - const char32_t *src, size_t len, char *dst) const noexcept { - return convert_utf32_to_latin1(src, len, dst); +simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, + utf16_output); } -simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( - const char32_t *src, size_t len, char *dst) const noexcept { - size_t n = len; - const char32_t *srcBeg = src; - const char *dstBeg = dst; - size_t vl8m4 = __riscv_vsetvlmax_e8m4(); - vbool2_t m4mulp2 = __riscv_vmseq_vx_u8m4_b2( - __riscv_vand_vx_u8m4(__riscv_vid_v_u8m4(vl8m4), 3, vl8m4), 2, vl8m4); - - for (size_t vl, vlOut; n > 0;) { - vl = __riscv_vsetvl_e32m4(n); +simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( + const char *buf, size_t len, char16_t *utf16_output) const noexcept { + utf8_to_utf16::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf16_output); +} - vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t const *)src, vl); - vbool8_t m234 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x80 - 1, vl); - vuint16m2_t vn = __riscv_vncvt_x_x_w_u16m2(v, vl); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + utf16_output); +} - if (__riscv_vfirst_m_b8(m234, vl) < 0) { /* 1 byte utf8 */ - vlOut = vl; - __riscv_vse8_v_u8m1((uint8_t *)dst, __riscv_vncvt_x_x_w_u8m1(vn, vlOut), - vlOut); - n -= vl, src += vl, dst += vlOut; - continue; - } +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( + const char *input, size_t size, char16_t *utf16_output) const noexcept { + return utf8_to_utf16::convert_valid(input, size, + 
utf16_output); +} - vbool8_t m34 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x800 - 1, vl); +simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert(buf, len, utf32_output); +} - if (__riscv_vfirst_m_b8(m34, vl) < 0) { /* 1/2 byte utf8 */ - /* 0: [ aaa|aabbbbbb] - * 1: [aabbbbbb| ] vsll 8 - * 2: [ | aaaaa] vsrl 6 - * 3: [00111111|00111111] - * 4: [ bbbbbb|000aaaaa] (1|2)&3 - * 5: [10000000|11000000] - * 6: [10bbbbbb|110aaaaa] 4|5 */ - vuint16m2_t twoByte = __riscv_vand_vx_u16m2( - __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(vn, 8, vl), - __riscv_vsrl_vx_u16m2(vn, 6, vl), vl), - 0b0011111100111111, vl); - vuint16m2_t vout16 = - __riscv_vor_vx_u16m2_mu(m234, vn, twoByte, 0b1000000011000000, vl); - vuint8m2_t vout = __riscv_vreinterpret_v_u16m2_u8m2(vout16); +simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( + const char *buf, size_t len, char32_t *utf32_output) const noexcept { + utf8_to_utf32::validating_transcoder converter; + return converter.convert_with_errors(buf, len, utf32_output); +} - /* Every high byte that is zero should be compressed - * low bytes should never be compressed, so we set them - * to all ones, and then create a non-zero bytes mask */ - vbool4_t mcomp = - __riscv_vmsne_vx_u8m2_b4(__riscv_vreinterpret_v_u16m2_u8m2( - __riscv_vor_vx_u16m2(vout16, 0xFF, vl)), - 0, vl * 2); - vlOut = __riscv_vcpop_m_b4(mcomp, vl * 2); +simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( + const char *input, size_t size, char32_t *utf32_output) const noexcept { + return utf8_to_utf32::convert_valid(input, size, utf32_output); +} - vout = __riscv_vcompress_vm_u8m2(vout, mcomp, vl * 2); - __riscv_vse8_v_u8m2((uint8_t *)dst, vout, vlOut); +simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; - n -= vl, src += vl, dst += vlOut; - continue; - } - long idx1 = - __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl); - vbool8_t sur = __riscv_vmseq_vx_u32m4_b8( - __riscv_vand_vx_u32m4(v, 0xFFFFF800, vl), 0xD800, vl); - long idx2 = __riscv_vfirst_m_b8(sur, vl); - if (idx1 >= 0 && idx2 >= 0) { - if (idx1 <= idx2) { - return result(error_code::TOO_LARGE, src - srcBeg + idx1); - } else { - return result(error_code::SURROGATE, src - srcBeg + idx2); - } - } - if (idx1 >= 0) { - return result(error_code::TOO_LARGE, src - srcBeg + idx1); - } - if (idx2 >= 0) { - return result(error_code::SURROGATE, src - srcBeg + idx2); + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - vbool8_t m4 = __riscv_vmsgtu_vx_u32m4_b8(v, 0x10000 - 1, vl); - long first = __riscv_vfirst_m_b8(m4, vl); - size_t tail = vl - first; - vl = first < 0 ? 
vl : first; +simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; - if (vl > 0) { /* 1/2/3 byte utf8 */ - /* vn: [aaaabbbb|bbcccccc] - * v1: [0bcccccc| ] vsll 8 - * v1: [10cccccc| ] vsll 8 & 0b00111111 | 0b10000000 - * v2: [ |110bbbbb] vsrl 6 & 0b00111111 | 0b11000000 - * v2: [ |10bbbbbb] vsrl 6 & 0b00111111 | 0b10000000 - * v3: [ |1110aaaa] vsrl 12 | 0b11100000 - * 1: [00000000|0bcccccc|00000000|00000000] => [0bcccccc] - * 2: [00000000|10cccccc|110bbbbb|00000000] => [110bbbbb] [10cccccc] - * 3: [00000000|10cccccc|10bbbbbb|1110aaaa] => [1110aaaa] [10bbbbbb] - * [10cccccc] - */ - vuint16m2_t v1, v2, v3, v12; - v1 = __riscv_vor_vx_u16m2_mu( - m234, vn, __riscv_vand_vx_u16m2(vn, 0b00111111, vl), 0b10000000, vl); - v1 = __riscv_vsll_vx_u16m2(v1, 8, vl); + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - v2 = __riscv_vor_vx_u16m2( - __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 6, vl), 0b00111111, - vl), - 0b10000000, vl); - v2 = __riscv_vor_vx_u16m2_mu(__riscv_vmnot_m_b8(m34, vl), v2, v2, - 0b01000000, vl); - v3 = __riscv_vor_vx_u16m2(__riscv_vsrl_vx_u16m2(vn, 12, vl), 0b11100000, - vl); - v12 = __riscv_vor_vv_u16m2_mu(m234, v1, v1, v2, vl); +simdutf_warn_unused result +implementation::convert_utf16le_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_latin1_with_errors( + buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} - vuint32m4_t w12 = __riscv_vwmulu_vx_u32m4(v12, 1 << 8, vl); - vuint32m4_t w123 = __riscv_vwaddu_wv_u32m4_mu(m34, w12, w12, v3, vl); - vuint8m4_t vout = __riscv_vreinterpret_v_u32m4_u8m4(w123); +simdutf_warn_unused result +implementation::convert_utf16be_to_latin1_with_errors( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_latin1_with_errors(buf, len, + latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} - vbool2_t mcomp 
= __riscv_vmor_mm_b2( - m4mulp2, __riscv_vmsne_vx_u8m4_b2(vout, 0, vl * 4), vl * 4); - vlOut = __riscv_vcpop_m_b2(mcomp, vl * 4); +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function. + return convert_utf16be_to_latin1(buf, len, latin1_output); +} - vout = __riscv_vcompress_vm_u8m4(vout, mcomp, vl * 4); - __riscv_vse8_v_u8m4((uint8_t *)dst, vout, vlOut); +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( + const char16_t *buf, size_t len, char *latin1_output) const noexcept { + // optimization opportunity: implement a custom function. + return convert_utf16le_to_latin1(buf, len, latin1_output); +} - n -= vl, src += vl, dst += vlOut; +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } - - if (tail) - while (n) { - uint32_t word = src[0]; - if (word < 0x10000) - break; - if (word > 0x10FFFF) - return result(error_code::TOO_LARGE, src - srcBeg); - *dst++ = (uint8_t)((word >> 18) | 0b11110000); - *dst++ = (uint8_t)(((word >> 12) & 0b111111) | 0b10000000); - *dst++ = (uint8_t)(((word >> 6) & 0b111111) | 0b10000000); - *dst++ = (uint8_t)((word & 0b111111) | 0b10000000); - ++src; - --n; - } + saved_bytes += scalar_saved_bytes; } - - return result(error_code::SUCCESS, dst - dstBeg); + return saved_bytes; } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( - const char32_t *src, size_t len, char *dst) const noexcept { - result res = convert_utf32_to_utf8_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? 
res.count : 0; +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( - const char32_t *src, size_t len, char *dst) const noexcept { - return convert_utf32_to_utf8(src, len, dst); +simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; } -template -simdutf_really_inline static result -rvv_convert_utf32_to_utf16_with_errors(const char32_t *src, size_t len, - char16_t *dst) { - size_t vl8m2 = __riscv_vsetvlmax_e8m2(); - vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4( - __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); - const char16_t *dstBeg = dst; - const char32_t *srcBeg = src; - for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { - vl = __riscv_vsetvl_e32m4(len); - vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl); - vuint32m4_t off = __riscv_vadd_vx_u32m4(v, 0xFFFF2000, vl); - long idx1 = - __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0x10FFFF, vl), vl); - long idx2 = __riscv_vfirst_m_b8( - __riscv_vmsgtu_vx_u32m4_b8(off, 0xFFFFF7FF, vl), vl); - if (idx1 >= 0 && idx2 >= 0) { - if (idx1 <= idx2) - return result(error_code::TOO_LARGE, src - srcBeg + idx1); - return result(error_code::SURROGATE, src - srcBeg + idx2); - } - if (idx1 >= 0) - return result(error_code::TOO_LARGE, src - srcBeg + idx1); - if (idx2 >= 0) - return result(error_code::SURROGATE, src - srcBeg + idx2); - long idx = - __riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl); - if (idx < 0) { - vlOut = vl; - vuint16m2_t n = - simdutf_byteflip(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut); - __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut); - continue; +simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found 
correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; } - vlOut = rvv_utf32_store_utf16_m4((uint16_t *)dst, v, vl, m4even); } - return result(error_code::SUCCESS, dst - dstBeg); + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( - const char32_t *src, size_t len, char16_t *dst) const noexcept { - result res = convert_utf32_to_utf16le_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16le_to_utf8(buf, len, utf8_output); } -simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( - const char32_t *src, size_t len, char16_t *dst) const noexcept { - result res = convert_utf32_to_utf16be_with_errors(src, len, dst); - return res.error == error_code::SUCCESS ? res.count : 0; +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( + const char16_t *buf, size_t len, char *utf8_output) const noexcept { + return convert_utf16be_to_utf8(buf, len, utf8_output); } -simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( - const char32_t *src, size_t len, char16_t *dst) const noexcept { - return rvv_convert_utf32_to_utf16_with_errors( - src, len, dst); +simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + if (simdutf_unlikely(len == 0)) { + return 0; + } + std::pair ret = + lsx_convert_utf32_to_utf8(buf, len, utf8_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf8_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_utf8::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; } -simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( - const char32_t *src, size_t len, char16_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_convert_utf32_to_utf16_with_errors( - src, len, dst); - else - return rvv_convert_utf32_to_utf16_with_errors(src, len, - dst); +simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + if (ret.first.count != len) { + result scalar_res = scalar::utf32_to_utf8::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf8_output; // Set count to the number of 8-bit code units written + return ret.first; } 
-template -simdutf_really_inline static size_t -rvv_convert_valid_utf32_to_utf16(const char32_t *src, size_t len, - char16_t *dst) { - size_t vl8m2 = __riscv_vsetvlmax_e8m2(); - vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4( - __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); - char16_t *dstBeg = dst; - for (size_t vl, vlOut; len > 0; len -= vl, src += vl, dst += vlOut) { - vl = __riscv_vsetvl_e32m4(len); - vuint32m4_t v = __riscv_vle32_v_u32m4((uint32_t *)src, vl); - if (__riscv_vfirst_m_b8(__riscv_vmsgtu_vx_u32m4_b8(v, 0xFFFF, vl), vl) < - 0) { - vlOut = vl; - vuint16m2_t n = - simdutf_byteflip(__riscv_vncvt_x_x_w_u16m2(v, vlOut), vlOut); - __riscv_vse16_v_u16m2((uint16_t *)dst, n, vlOut); - continue; +simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } - vlOut = rvv_utf32_store_utf16_m4((uint16_t *)dst, v, vl, m4even); + saved_bytes += scalar_saved_bytes; } - return dst - dstBeg; + return saved_bytes; } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( - const char32_t *src, size_t len, char16_t *dst) const noexcept { - return rvv_convert_valid_utf32_to_utf16(src, len, - dst); +simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + std::pair ret = + lsx_convert_utf16_to_utf32(buf, len, utf32_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf32_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf16_to_utf32::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; } -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( - const char32_t *src, size_t len, char16_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_convert_valid_utf32_to_utf16(src, len, - dst); - else - return rvv_convert_valid_utf32_to_utf16(src, len, dst); +simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written + return ret.first; } -/* end file src/rvv/rvv_utf32_to.inl.cpp */ -/* begin file src/rvv/rvv_utf8_to.inl.cpp */ -template 
-simdutf_really_inline static size_t rvv_utf8_to_common(char const *src, - size_t len, Tdst *dst) { - static_assert(std::is_same() || - std::is_same(), - "invalid type"); - constexpr bool is16 = std::is_same(); - constexpr endianness endian = - bflip == simdutf_ByteFlip::NONE ? endianness::LITTLE : endianness::BIG; - const auto scalar = [](char const *in, size_t count, Tdst *out) { - return is16 ? scalar::utf8_to_utf16::convert(in, count, - (char16_t *)out) - : scalar::utf8_to_utf32::convert(in, count, (char32_t *)out); - }; - if (len < 32) - return scalar(src, len, dst); - - /* validate first three bytes */ - if (validate) { - size_t idx = 3; - while (idx < len && (src[idx] >> 6) == 0b10) - ++idx; - if (idx > 3 + 3 || !scalar::utf8::validate(src, idx)) - return 0; +simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = + scalar::utf16_to_utf32::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } } + ret.first.count = + ret.second - + utf32_output; // Set count to the number of 8-bit code units written + return ret.first; +} - size_t tail = 3; - size_t n = len - tail; - Tdst *beg = dst; - - static const uint64_t err1m[] = {0x0202020202020202, 0x4915012180808080}; - static const uint64_t err2m[] = {0xCBCBCB8B8383A3E7, 0xCBCBDBCBCBCBCBCB}; - static const uint64_t err3m[] = {0x0101010101010101, 0X01010101BABAAEE6}; - - const vuint8m1_t err1tbl = - __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err1m, 2)); - const vuint8m1_t err2tbl = - __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err2m, 2)); - const vuint8m1_t err3tbl = - __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vle64_v_u64m1(err3m, 2)); - - size_t vl8m2 = __riscv_vsetvlmax_e8m2(); - vbool4_t m4even = __riscv_vmseq_vx_u8m2_b4( - __riscv_vand_vx_u8m2(__riscv_vid_v_u8m2(vl8m2), 1, vl8m2), 0, vl8m2); - - for (size_t vl, vlOut; n > 0; n -= vl, src += vl, dst += vlOut) { - vl = __riscv_vsetvl_e8m2(n); - - vuint8m2_t v0 = __riscv_vle8_v_u8m2((uint8_t const *)src, vl); - uint64_t max = __riscv_vmv_x_s_u8m1_u8( - __riscv_vredmaxu_vs_u8m2_u8m1(v0, __riscv_vmv_s_x_u8m1(0, vl), vl)); - - uint8_t next0 = src[vl + 0]; - uint8_t next1 = src[vl + 1]; - uint8_t next2 = src[vl + 2]; +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; - /* fast path: ASCII */ - if ((max | next0 | next1 | next2) < 0b10000000) { - vlOut = vl; - if (is16) - __riscv_vse16_v_u16m4( - (uint16_t *)dst, - simdutf_byteflip(__riscv_vzext_vf2_u16m4(v0, vlOut), vlOut), - vlOut); - else - __riscv_vse32_v_u32m8((uint32_t *)dst, - __riscv_vzext_vf4_u32m8(v0, vlOut), vlOut); - continue; + if (ret.first != buf + len) { + const 
size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - /* see "Validating UTF-8 In Less Than One Instruction Per Byte" - * https://arxiv.org/abs/2010.03090 */ - vuint8m2_t v1 = __riscv_vslide1down_vx_u8m2(v0, next0, vl); - vuint8m2_t v2 = __riscv_vslide1down_vx_u8m2(v1, next1, vl); - vuint8m2_t v3 = __riscv_vslide1down_vx_u8m2(v2, next2, vl); - - if (validate) { - vuint8m2_t s1 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2( - __riscv_vreinterpret_v_u8m2_u16m2(v2), 4, __riscv_vsetvlmax_e16m2())); - vuint8m2_t s3 = __riscv_vreinterpret_v_u16m2_u8m2(__riscv_vsrl_vx_u16m2( - __riscv_vreinterpret_v_u8m2_u16m2(v3), 4, __riscv_vsetvlmax_e16m2())); - - vuint8m2_t idx2 = __riscv_vand_vx_u8m2(v2, 0xF, vl); - vuint8m2_t idx1 = __riscv_vand_vx_u8m2(s1, 0xF, vl); - vuint8m2_t idx3 = __riscv_vand_vx_u8m2(s3, 0xF, vl); - - vuint8m2_t err1 = simdutf_vrgather_u8m1x2(err1tbl, idx1); - vuint8m2_t err2 = simdutf_vrgather_u8m1x2(err2tbl, idx2); - vuint8m2_t err3 = simdutf_vrgather_u8m1x2(err3tbl, idx3); - vint8m2_t errs = __riscv_vreinterpret_v_u8m2_i8m2( - __riscv_vand_vv_u8m2(__riscv_vand_vv_u8m2(err1, err2, vl), err3, vl)); - - vbool4_t is_3 = __riscv_vmsgtu_vx_u8m2_b4(v1, 0b11100000 - 1, vl); - vbool4_t is_4 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b11110000 - 1, vl); - vbool4_t is_34 = __riscv_vmor_mm_b4(is_3, is_4, vl); - vbool4_t err34 = - __riscv_vmxor_mm_b4(is_34, __riscv_vmslt_vx_i8m2_b4(errs, 0, vl), vl); - vbool4_t errm = - __riscv_vmor_mm_b4(__riscv_vmsgt_vx_i8m2_b4(errs, 0, vl), err34, vl); - if (__riscv_vfirst_m_b4(errm, vl) >= 0) - return 0; +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} - /* decoding */ - - /* mask of non continuation bytes */ - vbool4_t m = - __riscv_vmsgt_vx_i8m2_b4(__riscv_vreinterpret_v_u8m2_i8m2(v0), -65, vl); - vlOut = __riscv_vcpop_m_b4(m, vl); - - /* extract first and second bytes */ - vuint8m2_t b1 = __riscv_vcompress_vm_u8m2(v0, m, vl); - vuint8m2_t b2 = __riscv_vcompress_vm_u8m2(v1, m, vl); +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lsx_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; - /* fast path: one and two byte */ - if (max < 0b11100000) { - b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut); + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid( + ret.first, len - (ret.first - buf), ret.second); + saved_bytes += scalar_saved_bytes; + } + 
return saved_bytes; +} - vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); - b1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( + const char32_t *buf, size_t len, char *utf8_output) const noexcept { + // optimization opportunity: implement a custom function. + return convert_utf32_to_utf8(buf, len, utf8_output); +} - vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4( - b1, - __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1, - vlOut), - vlOut); - b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); - if (is16) - __riscv_vse16_v_u16m4((uint16_t *)dst, - simdutf_byteflip(b12, vlOut), vlOut); - else - __riscv_vse32_v_u32m8((uint32_t *)dst, - __riscv_vzext_vf2_u32m8(b12, vlOut), vlOut); - continue; +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lsx_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } + saved_bytes += scalar_saved_bytes; + } - /* fast path: one, two and three byte */ - if (max < 0b11110000) { - vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); - - b2 = __riscv_vand_vx_u8m2(b2, 0b00111111, vlOut); - b3 = __riscv_vand_vx_u8m2(b3, 0b00111111, vlOut); - - vbool4_t m1 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b10111111, vlOut); - vbool4_t m3 = __riscv_vmsgtu_vx_u8m2_b4(b1, 0b11011111, vlOut); - - vuint8m2_t t1 = __riscv_vand_vx_u8m2_mu(m1, b1, b1, 63, vlOut); - b1 = __riscv_vand_vx_u8m2_mu(m3, t1, b1, 15, vlOut); + return saved_bytes; +} - vuint16m4_t b12 = __riscv_vwmulu_vv_u16m4( - b1, - __riscv_vmerge_vxm_u8m2(__riscv_vmv_v_x_u8m2(1, vlOut), 1 << 6, m1, - vlOut), - vlOut); - b12 = __riscv_vwaddu_wv_u16m4_mu(m1, b12, b12, b2, vlOut); - vuint16m4_t b123 = __riscv_vwaddu_wv_u16m4_mu( - m3, b12, __riscv_vsll_vx_u16m4_mu(m3, b12, b12, 6, vlOut), b3, vlOut); - if (is16) - __riscv_vse16_v_u16m4((uint16_t *)dst, - simdutf_byteflip(b123, vlOut), vlOut); - else - __riscv_vse32_v_u32m8((uint32_t *)dst, - __riscv_vzext_vf2_u32m8(b123, vlOut), vlOut); - continue; +simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + std::pair ret = + lsx_convert_utf32_to_utf16(buf, len, utf16_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - utf16_output; + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = + scalar::utf32_to_utf16::convert( + ret.first, len - (ret.first - buf), ret.second); + if (scalar_saved_bytes == 0) { + return 0; } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} - /* extract third and fourth bytes */ - vuint8m2_t b3 = __riscv_vcompress_vm_u8m2(v2, m, vl); - vuint8m2_t b4 = __riscv_vcompress_vm_u8m2(v3, m, vl); - - /* remove prefix from leading bytes - * - * We could also use vrgather here, but it increases register pressure, - * and its performance varies widely on current platforms. It might be - * worth reconsidering, though, once there is more hardware available. - * Same goes for the __riscv_vsrl_vv_u32m4 correction step. 
- * - * We shift left and then right by the number of bytes in the prefix, - * which can be calculated as follows: - * x max(x-10, 0) - * 0xxx -> 0000-0111 -> sift by 0 or 1 -> 0 - * 10xx -> 1000-1011 -> don't care - * 110x -> 1100,1101 -> sift by 3 -> 2,3 - * 1110 -> 1110 -> sift by 4 -> 4 - * 1111 -> 1111 -> sift by 5 -> 5 - * - * vssubu.vx v, 10, (max(x-10, 0)) almost gives us what we want, we - * just need to manually detect and handle the one special case: - */ -#define SIMDUTF_RVV_UTF8_TO_COMMON_M1(idx) \ - vuint8m1_t c1 = __riscv_vget_v_u8m2_u8m1(b1, idx); \ - vuint8m1_t c2 = __riscv_vget_v_u8m2_u8m1(b2, idx); \ - vuint8m1_t c3 = __riscv_vget_v_u8m2_u8m1(b3, idx); \ - vuint8m1_t c4 = __riscv_vget_v_u8m2_u8m1(b4, idx); \ - /* remove prefix from trailing bytes */ \ - c2 = __riscv_vand_vx_u8m1(c2, 0b00111111, vlOut); \ - c3 = __riscv_vand_vx_u8m1(c3, 0b00111111, vlOut); \ - c4 = __riscv_vand_vx_u8m1(c4, 0b00111111, vlOut); \ - vuint8m1_t shift = __riscv_vsrl_vx_u8m1(c1, 4, vlOut); \ - shift = __riscv_vmerge_vxm_u8m1(__riscv_vssubu_vx_u8m1(shift, 10, vlOut), 3, \ - __riscv_vmseq_vx_u8m1_b8(shift, 12, vlOut), \ - vlOut); \ - c1 = __riscv_vsll_vv_u8m1(c1, shift, vlOut); \ - c1 = __riscv_vsrl_vv_u8m1(c1, shift, vlOut); \ - /* unconditionally widen and combine to c1234 */ \ - vuint16m2_t c34 = __riscv_vwaddu_wv_u16m2( \ - __riscv_vwmulu_vx_u16m2(c3, 1 << 6, vlOut), c4, vlOut); \ - vuint16m2_t c12 = __riscv_vwaddu_wv_u16m2( \ - __riscv_vwmulu_vx_u16m2(c1, 1 << 6, vlOut), c2, vlOut); \ - vuint32m4_t c1234 = __riscv_vwaddu_wv_u32m4( \ - __riscv_vwmulu_vx_u32m4(c12, 1 << 12, vlOut), c34, vlOut); \ - /* derive required right-shift amount from `shift` to reduce \ - * c1234 to the required number of bytes */ \ - c1234 = __riscv_vsrl_vv_u32m4( \ - c1234, \ - __riscv_vzext_vf4_u32m4( \ - __riscv_vmul_vx_u8m1( \ - __riscv_vrsub_vx_u8m1(__riscv_vssubu_vx_u8m1(shift, 2, vlOut), \ - 3, vlOut), \ - 6, vlOut), \ - vlOut), \ - vlOut); \ - /* store result in desired format */ \ - if (is16) \ - vlDst = rvv_utf32_store_utf16_m4((uint16_t *)dst, c1234, vlOut, \ - m4even); \ - else \ - vlDst = vlOut, __riscv_vse32_v_u32m4((uint32_t *)dst, c1234, vlOut); - - /* Unrolling this manually reduces register pressure and allows - * us to terminate early. 
*/ - { - size_t vlOutm2 = vlOut, vlDst; - vlOut = __riscv_vsetvl_e8m1(vlOut); - SIMDUTF_RVV_UTF8_TO_COMMON_M1(0) - if (vlOutm2 == vlOut) { - vlOut = vlDst; - continue; - } - - dst += vlDst; - vlOut = vlOutm2 - vlOut; - } - { - size_t vlDst; - SIMDUTF_RVV_UTF8_TO_COMMON_M1(1) - vlOut = vlDst; +simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; } - -#undef SIMDUTF_RVV_UTF8_TO_COMMON_M1 } + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; +} - /* validate the last character and reparse it + tail */ - if (len > tail) { - if ((src[0] >> 6) == 0b10) - --dst; - while ((src[0] >> 6) == 0b10 && tail < len) - --src, ++tail; - if (is16) { - /* go back one more, when on high surrogate */ - if (simdutf_byteflip((uint16_t)dst[-1]) >= 0xD800 && - simdutf_byteflip((uint16_t)dst[-1]) <= 0xDBFF) - --dst; +simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + // ret.first.count is always the position in the buffer, not the number of + // code units written even if finished + std::pair ret = + lsx_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); + if (ret.first.count != len) { + result scalar_res = + scalar::utf32_to_utf16::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; } } - size_t ret = scalar(src, tail, dst); - if (ret == 0) - return 0; - return (size_t)(dst - beg) + ret; + ret.first.count = + ret.second - + utf16_output; // Set count to the number of 8-bit code units written + return ret.first; } -simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( - const char *src, size_t len, char *dst) const noexcept { - const char *beg = dst; - uint8_t last = 0; - for (size_t vl, vlOut; len > 0; - len -= vl, src += vl, dst += vlOut, last = src[-1]) { - vl = __riscv_vsetvl_e8m2(len); - vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl); - // check which bytes are ASCII - vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl); - // count ASCII bytes - vlOut = __riscv_vcpop_m_b4(ascii, vl); - // The original code would only enter the next block after this check: - // vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); - // vlOut = __riscv_vcpop_m_b4(m, vl); - // if (vlOut != vl || last > 0b01111111) {...}q - // So that everything is ASCII or continuation bytes, we just proceeded - // without any processing, going straight to __riscv_vse8_v_u8m2. - // But you need the __riscv_vslide1up_vx_u8m2 whenever there is a non-ASCII - // byte. 
- if (vlOut != vl) { // If not pure ASCII - // Non-ASCII characters - // We now want to mark the ascii and continuation bytes - vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); - // We count them, that's our new vlOut (output vector length) - vlOut = __riscv_vcpop_m_b4(m, vl); +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16le( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16le(buf, len, utf16_output); +} - vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf16be( + const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { + return convert_utf32_to_utf16be(buf, len, utf16_output); +} - vbool4_t leading0 = __riscv_vmsgtu_vx_u8m2_b4(v0, 0b10111111, vl); - vbool4_t trailing1 = __riscv_vmslt_vx_i8m2_b4( - __riscv_vreinterpret_v_u8m2_i8m2(v1), (uint8_t)0b11000000, vl); - // -62 i 0b11000010, so we check whether any of v0 is too big - vbool4_t tobig = __riscv_vmand_mm_b4( - leading0, - __riscv_vmsgtu_vx_u8m2_b4(__riscv_vxor_vx_u8m2(v0, (uint8_t)-62, vl), - 1, vl), - vl); - if (__riscv_vfirst_m_b4( - __riscv_vmor_mm_b4( - tobig, __riscv_vmxor_mm_b4(leading0, trailing1, vl), vl), - vl) >= 0) - return 0; +simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16le_to_utf32(buf, len, utf32_output); +} - v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), - v1, v1, 0b01000000, vl); - v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); - } else if (last >= 0b11000000) { // If last byte is a leading byte and we - // got only ASCII, error! - return 0; - } - __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut); - } - if (last > 0b10111111) - return 0; - return dst - beg; +simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf32( + const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { + return convert_utf16be_to_utf32(buf, len, utf32_output); } -simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( - const char *src, size_t len, char *dst) const noexcept { - size_t res = convert_utf8_to_latin1(src, len, dst); - if (res) - return result(error_code::SUCCESS, res); - return scalar::utf8_to_latin1::convert_with_errors(src, len, dst); +void implementation::change_endianness_utf16(const char16_t *input, + size_t length, + char16_t *output) const noexcept { + utf16::change_endianness_utf16(input, length, output); } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( - const char *src, size_t len, char *dst) const noexcept { - const char *beg = dst; - uint8_t last = 0; - for (size_t vl, vlOut; len > 0; - len -= vl, src += vl, dst += vlOut, last = src[-1]) { - vl = __riscv_vsetvl_e8m2(len); - vuint8m2_t v1 = __riscv_vle8_v_u8m2((uint8_t *)src, vl); - vbool4_t ascii = __riscv_vmsltu_vx_u8m2_b4(v1, 0b10000000, vl); - vlOut = __riscv_vcpop_m_b4(ascii, vl); - if (vlOut != vl) { // If not pure ASCII - vbool4_t m = __riscv_vmsltu_vx_u8m2_b4(v1, 0b11000000, vl); - vlOut = __riscv_vcpop_m_b4(m, vl); - vuint8m2_t v0 = __riscv_vslide1up_vx_u8m2(v1, last, vl); - v1 = __riscv_vor_vx_u8m2_mu(__riscv_vmseq_vx_u8m2_b4(v0, 0b11000011, vl), - v1, v1, 0b01000000, vl); - v1 = __riscv_vcompress_vm_u8m2(v1, m, vl); - } - __riscv_vse8_v_u8m2((uint8_t *)dst, v1, vlOut); - } - return dst - beg; +simdutf_warn_unused size_t implementation::count_utf16le( 
+ const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( - const char *src, size_t len, char16_t *dst) const noexcept { - return rvv_utf8_to_common(src, len, - (uint16_t *)dst); +simdutf_warn_unused size_t implementation::count_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::count_code_points(input, length); } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf16be( - const char *src, size_t len, char16_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_utf8_to_common( - src, len, (uint16_t *)dst); - else - return rvv_utf8_to_common(src, len, - (uint16_t *)dst); +simdutf_warn_unused size_t +implementation::count_utf8(const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); } -simdutf_warn_unused result implementation::convert_utf8_to_utf16le_with_errors( - const char *src, size_t len, char16_t *dst) const noexcept { - size_t res = convert_utf8_to_utf16le(src, len, dst); - if (res) - return result(error_code::SUCCESS, res); - return scalar::utf8_to_utf16::convert_with_errors( - src, len, dst); +simdutf_warn_unused size_t implementation::latin1_length_from_utf8( + const char *buf, size_t len) const noexcept { + return count_utf8(buf, len); } -simdutf_warn_unused result implementation::convert_utf8_to_utf16be_with_errors( - const char *src, size_t len, char16_t *dst) const noexcept { - size_t res = convert_utf8_to_utf16be(src, len, dst); - if (res) - return result(error_code::SUCCESS, res); - return scalar::utf8_to_utf16::convert_with_errors(src, len, - dst); +simdutf_warn_unused size_t +implementation::latin1_length_from_utf16(size_t length) const noexcept { + return length; } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16le( - const char *src, size_t len, char16_t *dst) const noexcept { - return rvv_utf8_to_common( - src, len, (uint16_t *)dst); +simdutf_warn_unused size_t +implementation::latin1_length_from_utf32(size_t length) const noexcept { + return length; } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf16be( - const char *src, size_t len, char16_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_utf8_to_common( - src, len, (uint16_t *)dst); - else - return rvv_utf8_to_common( - src, len, (uint16_t *)dst); +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t length) const noexcept { + const uint8_t *data = reinterpret_cast(input); + const uint8_t *data_end = data + length; + uint64_t result = 0; + while (data + 16 < data_end) { + uint64_t two_bytes = 0; + __m128i input_vec = __lsx_vld(data, 0); + two_bytes = + __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0); + result += 16 + two_bytes; + data += 16; + } + return result + scalar::latin1::utf8_length_from_latin1((const char *)data, + data_end - data); } -simdutf_warn_unused size_t implementation::convert_utf8_to_utf32( - const char *src, size_t len, char32_t *dst) const noexcept { - return rvv_utf8_to_common(src, len, - (uint32_t *)dst); +simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); } -simdutf_warn_unused result implementation::convert_utf8_to_utf32_with_errors( - const char *src, size_t len, char32_t *dst) const noexcept { - size_t res = convert_utf8_to_utf32(src, len, 
dst); - if (res) - return result(error_code::SUCCESS, res); - return scalar::utf8_to_utf32::convert_with_errors(src, len, dst); +simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf8_length_from_utf16(input, length); } -simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( - const char *src, size_t len, char32_t *dst) const noexcept { - return rvv_utf8_to_common( - src, len, (uint32_t *)dst); +simdutf_warn_unused size_t +implementation::utf16_length_from_latin1(size_t length) const noexcept { + return length; } -/* end file src/rvv/rvv_utf8_to.inl.cpp */ -simdutf_warn_unused int -implementation::detect_encodings(const char *input, - size_t length) const noexcept { - // If there is a BOM, then we trust it. - auto bom_encoding = simdutf::BOM::check_bom(input, length); - if (bom_encoding != encoding_type::unspecified) - return bom_encoding; - // todo: reimplement as a one-pass algorithm. - int out = 0; - if (validate_utf8(input, length)) - out |= encoding_type::UTF8; - if (length % 2 == 0) { - if (validate_utf16(reinterpret_cast(input), length / 2)) - out |= encoding_type::UTF16_LE; - } - if (length % 4 == 0) { - if (validate_utf32(reinterpret_cast(input), length / 4)) - out |= encoding_type::UTF32_LE; - } +simdutf_warn_unused size_t +implementation::utf32_length_from_latin1(size_t length) const noexcept { + return length; +} - return out; +simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); } -template -simdutf_really_inline static void -rvv_change_endianness_utf16(const char16_t *src, size_t len, char16_t *dst) { - for (size_t vl; len > 0; len -= vl, src += vl, dst += vl) { - vl = __riscv_vsetvl_e16m8(len); - vuint16m8_t v = __riscv_vle16_v_u16m8((uint16_t *)src, vl); - __riscv_vse16_v_u16m8((uint16_t *)dst, simdutf_byteflip(v, vl), vl); +simdutf_warn_unused size_t implementation::utf32_length_from_utf16be( + const char16_t *input, size_t length) const noexcept { + return utf16::utf32_length_from_utf16(input, length); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::utf16_length_from_utf8(input, length); +} + +simdutf_warn_unused size_t implementation::utf8_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const __m128i v_80 = __lsx_vrepli_w(0x80); /*0x00000080*/ + const __m128i v_800 = __lsx_vldi(-3832); /*0x00000800*/ + const __m128i v_10000 = __lsx_vldi(-3583); /*0x00010000*/ + size_t pos = 0; + size_t count = 0; + for (; pos + 4 <= length; pos += 4) { + __m128i in = __lsx_vld(reinterpret_cast(input + pos), 0); + const __m128i ascii_bytes_bytemask = __lsx_vslt_w(in, v_80); + const __m128i one_two_bytes_bytemask = __lsx_vslt_w(in, v_800); + const __m128i two_bytes_bytemask = + __lsx_vxor_v(one_two_bytes_bytemask, ascii_bytes_bytemask); + const __m128i three_bytes_bytemask = + __lsx_vxor_v(__lsx_vslt_w(in, v_10000), one_two_bytes_bytemask); + + const uint32_t ascii_bytes_count = __lsx_vpickve2gr_bu( + __lsx_vpcnt_b(__lsx_vmskltz_w(ascii_bytes_bytemask)), 0); + const uint32_t two_bytes_count = __lsx_vpickve2gr_bu( + __lsx_vpcnt_b(__lsx_vmskltz_w(two_bytes_bytemask)), 0); + const uint32_t three_bytes_count = __lsx_vpickve2gr_bu( + __lsx_vpcnt_b(__lsx_vmskltz_w(three_bytes_bytemask)), 0); + + count += + 16 - 3 * ascii_bytes_count - 2 * 
two_bytes_count - three_bytes_count; + } + return count + + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); +} + +simdutf_warn_unused size_t implementation::utf16_length_from_utf32( + const char32_t *input, size_t length) const noexcept { + const __m128i v_ffff = __lsx_vldi(-2304); /*0x0000ffff*/ + size_t pos = 0; + size_t count = 0; + for (; pos + 4 <= length; pos += 4) { + __m128i in = __lsx_vld(reinterpret_cast(input + pos), 0); + const __m128i surrogate_bytemask = __lsx_vslt_wu(v_ffff, in); + size_t surrogate_count = __lsx_vpickve2gr_bu( + __lsx_vpcnt_b(__lsx_vmskltz_w(surrogate_bytemask)), 0); + count += 4 + surrogate_count; } + return count + + scalar::utf32::utf16_length_from_utf32(input + pos, length - pos); } -void implementation::change_endianness_utf16(const char16_t *src, size_t len, - char16_t *dst) const noexcept { - if (supports_zvbb()) - return rvv_change_endianness_utf16(src, len, dst); - else - return rvv_change_endianness_utf16(src, len, dst); +simdutf_warn_unused size_t implementation::utf32_length_from_utf8( + const char *input, size_t length) const noexcept { + return utf8::count_code_points(input, length); } simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( @@ -35002,86 +49864,21 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = - length; // location of the first padding character if any - size_t equalsigns = 0; - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - length -= 1; - equalsigns++; - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - equalsigns++; - length -= 1; - } - } - if (length == 0) { - if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } - return {SUCCESS, 0}; - } - result r = scalar::base64::base64_tail_decode( - output, input, length, equalsigns, options, last_chunk_options); - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } - } - return r; + return (options & base64_url) + ? 
compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = - length; // location of the first padding character if any - size_t equalsigns = 0; - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - length -= 1; - equalsigns++; - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - equalsigns++; - length -= 1; - } - } - if (length == 0) { - if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation, 0}; - } - return {SUCCESS, 0, 0}; - } - full_result r = scalar::base64::base64_tail_decode( - output, input, length, equalsigns, options, last_chunk_options); - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.output_count % 3 == 0) || - ((r.output_count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; - } - } - return r; + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); } simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( @@ -35092,86 +49889,21 @@ simdutf_warn_unused size_t implementation::maximal_binary_length_from_base64( simdutf_warn_unused result implementation::base64_to_binary( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = - length; // location of the first padding character if any - auto equalsigns = 0; - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - length -= 1; - equalsigns++; - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - equalsigns++; - length -= 1; - } - } - if (length == 0) { - if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } - return {SUCCESS, 0}; - } - result r = scalar::base64::base64_tail_decode( - output, input, length, equalsigns, options, last_chunk_options); - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.count % 3 == 0) || ((r.count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation}; - } - } - return r; + return (options & base64_url) + ? 
compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); } simdutf_warn_unused full_result implementation::base64_to_binary_details( const char16_t *input, size_t length, char *output, base64_options options, last_chunk_handling_options last_chunk_options) const noexcept { - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - size_t equallocation = - length; // location of the first padding character if any - size_t equalsigns = 0; - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - length -= 1; - equalsigns++; - while (length > 0 && - scalar::base64::is_ascii_white_space(input[length - 1])) { - length--; - } - if (length > 0 && input[length - 1] == '=') { - equallocation = length - 1; - equalsigns++; - length -= 1; - } - } - if (length == 0) { - if (equalsigns > 0) { - return {INVALID_BASE64_CHARACTER, equallocation, 0}; - } - return {SUCCESS, 0, 0}; - } - full_result r = scalar::base64::base64_tail_decode( - output, input, length, equalsigns, options, last_chunk_options); - if (last_chunk_options != stop_before_partial && - r.error == error_code::SUCCESS && equalsigns > 0) { - // additional checks - if ((r.output_count % 3 == 0) || - ((r.output_count % 3) + 1 + equalsigns != 4)) { - return {INVALID_BASE64_CHARACTER, equallocation, r.output_count}; - } - } - return r; + return (options & base64_url) + ? compress_decode_base64(output, input, length, options, + last_chunk_options) + : compress_decode_base64(output, input, length, options, + last_chunk_options); } simdutf_warn_unused size_t implementation::base64_length_from_binary( @@ -35182,148 +49914,148 @@ simdutf_warn_unused size_t implementation::base64_length_from_binary( size_t implementation::binary_to_base64(const char *input, size_t length, char *output, base64_options options) const noexcept { - return scalar::base64::tail_encode_base64(output, input, length, options); + if (options & base64_url) { + return encode_base64(output, input, length, options); + } else { + return encode_base64(output, input, length, options); + } } -} // namespace rvv +} // namespace lsx } // namespace simdutf -/* begin file src/simdutf/rvv/end.h */ -#if SIMDUTF_CAN_ALWAYS_RUN_RVV -// nothing needed. -#else -SIMDUTF_UNTARGET_REGION -#endif - -/* end file src/simdutf/rvv/end.h */ -/* end file src/rvv/implementation.cpp */ -#endif -#if SIMDUTF_IMPLEMENTATION_WESTMERE -/* begin file src/westmere/implementation.cpp */ -/* begin file src/simdutf/westmere/begin.h */ -// redefining SIMDUTF_IMPLEMENTATION to "westmere" -// #define SIMDUTF_IMPLEMENTATION westmere - -#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE -// nothing needed. 
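// ----------------------------------------------------------------------------
// [Editorial sketch, not part of the vendored simdutf sources.] The base64
// length helpers above (maximal_binary_length_from_base64 /
// base64_length_from_binary) reduce to fixed arithmetic: every 3 input bytes
// map to 4 base64 characters. A hedged scalar illustration of both directions,
// using hypothetical helper names, might look like this:
#include <cstddef>

// Encoded size: 4 * ceil(n / 3) when '=' padding is emitted,
// ceil(4n / 3) when padding is omitted.
static size_t base64_size_from_binary_sketch(size_t n, bool with_padding) {
  return with_padding ? ((n + 2) / 3) * 4 : (4 * n + 2) / 3;
}

// Upper bound on the decoded size: each full group of 4 characters yields at
// most 3 bytes; a trailing group of 2 or 3 characters yields 1 or 2 bytes.
static size_t max_binary_size_from_base64_sketch(size_t chars) {
  const size_t rem = chars % 4;
  return (chars / 4) * 3 + (rem > 1 ? rem - 1 : 0);
}
// ----------------------------------------------------------------------------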
-#else -SIMDUTF_TARGET_WESTMERE +/* begin file src/simdutf/lsx/end.h */ +/* end file src/simdutf/lsx/end.h */ +/* end file src/lsx/implementation.cpp */ #endif -/* end file src/simdutf/westmere/begin.h */ +#if SIMDUTF_IMPLEMENTATION_LASX +/* begin file src/lasx/implementation.cpp */ +/* begin file src/simdutf/lasx/begin.h */ +// redefining SIMDUTF_IMPLEMENTATION to "lasx" +// #define SIMDUTF_IMPLEMENTATION lasx +/* end file src/simdutf/lasx/begin.h */ namespace simdutf { -namespace westmere { +namespace lasx { namespace { -#ifndef SIMDUTF_WESTMERE_H - #error "westmere.h must be included" +#ifndef SIMDUTF_LASX_H + #error "lasx.h must be included" #endif using namespace simd; +// convert vmskltz/vmskgez/vmsknz to +// simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes index +const uint8_t lasx_1_2_utf8_bytes_mask[] = { + 0, 1, 4, 5, 16, 17, 20, 21, 64, 65, 68, 69, 80, 81, 84, + 85, 2, 3, 6, 7, 18, 19, 22, 23, 66, 67, 70, 71, 82, 83, + 86, 87, 8, 9, 12, 13, 24, 25, 28, 29, 72, 73, 76, 77, 88, + 89, 92, 93, 10, 11, 14, 15, 26, 27, 30, 31, 74, 75, 78, 79, + 90, 91, 94, 95, 32, 33, 36, 37, 48, 49, 52, 53, 96, 97, 100, + 101, 112, 113, 116, 117, 34, 35, 38, 39, 50, 51, 54, 55, 98, 99, + 102, 103, 114, 115, 118, 119, 40, 41, 44, 45, 56, 57, 60, 61, 104, + 105, 108, 109, 120, 121, 124, 125, 42, 43, 46, 47, 58, 59, 62, 63, + 106, 107, 110, 111, 122, 123, 126, 127, 128, 129, 132, 133, 144, 145, 148, + 149, 192, 193, 196, 197, 208, 209, 212, 213, 130, 131, 134, 135, 146, 147, + 150, 151, 194, 195, 198, 199, 210, 211, 214, 215, 136, 137, 140, 141, 152, + 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221, 138, 139, 142, 143, + 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223, 160, 161, 164, + 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245, 162, 163, + 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247, 168, + 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253, + 170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254, + 255}; + +simdutf_really_inline __m128i lsx_swap_bytes(__m128i vec) { + return __lsx_vshuf4i_b(vec, 0b10110001); +} +simdutf_really_inline __m256i lasx_swap_bytes(__m256i vec) { + return __lasx_xvshuf4i_b(vec, 0b10110001); +} + simdutf_really_inline bool is_ascii(const simd8x64 &input) { - return input.reduce_or().is_ascii(); + return input.is_ascii(); } simdutf_unused simdutf_really_inline simd8 must_be_continuation(const simd8 prev1, const simd8 prev2, const simd8 prev3) { - simd8 is_second_byte = - prev1.saturating_sub(0b11000000u - 1); // Only 11______ will be > 0 - simd8 is_third_byte = - prev2.saturating_sub(0b11100000u - 1); // Only 111_____ will be > 0 - simd8 is_fourth_byte = - prev3.saturating_sub(0b11110000u - 1); // Only 1111____ will be > 0 - // Caller requires a bool (all 1's). All values resulting from the subtraction - // will be <= 64, so signed comparison is fine. - return simd8(is_second_byte | is_third_byte | is_fourth_byte) > - int8_t(0); + simd8 is_second_byte = prev1 >= uint8_t(0b11000000u); + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + // Use ^ instead of | for is_*_byte, because ^ is commutative, and the caller + // is using ^ as well. This will work fine because we only have to report + // errors for cases with 0-1 lead bytes. 
Multiple lead bytes implies 2 + // overlapping multibyte characters, and if that happens, there is guaranteed + // to be at least *one* lead byte that is part of only 1 other multibyte + // character. The error will be detected there. + return is_second_byte ^ is_third_byte ^ is_fourth_byte; } simdutf_really_inline simd8 must_be_2_3_continuation(const simd8 prev2, const simd8 prev3) { - simd8 is_third_byte = - prev2.saturating_sub(0xe0u - 0x80); // Only 111_____ will be >= 0x80 - simd8 is_fourth_byte = - prev3.saturating_sub(0xf0u - 0x80); // Only 1111____ will be >= 0x80 - return simd8(is_third_byte | is_fourth_byte); + simd8 is_third_byte = prev2 >= uint8_t(0b11100000u); + simd8 is_fourth_byte = prev3 >= uint8_t(0b11110000u); + return is_third_byte ^ is_fourth_byte; } -/* begin file src/westmere/internal/loader.cpp */ -namespace internal { -namespace westmere { - -/* begin file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */ -/* - * reads a vector of uint16 values - * bits after 11th are ignored - * first 11 bits are encoded into utf8 - * !important! utf8_output must have at least 16 writable bytes - */ - -inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output, - const __m128i one_byte_bytemask, - const uint16_t one_byte_bitmask) { - // 0b1100_0000_1000_0000 - const __m128i v_c080 = _mm_set1_epi16((int16_t)0xc080); - // 0b0001_1111_0000_0000 - const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); - // 0b0000_0000_0011_1111 - const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); - - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - - // t0 = [000a|aaaa|bbbb|bb00] - const __m128i t0 = _mm_slli_epi16(v_u16, 2); - // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = _mm_and_si128(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m128i t2 = _mm_and_si128(v_u16, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m128i t3 = _mm_or_si128(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m128i t4 = _mm_or_si128(t3, v_c080); - - // 2. merge ASCII and 2-byte codewords - const __m128i utf8_unpacked = _mm_blendv_epi8(t4, v_u16, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - MSB, a - // - LSB) - const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a - const uint16_t m1 = static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 - const uint8_t m2 = static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea - // 4. pack the bytes - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); - const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); - - // 5. 
store bytes - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); +// common functions for utf8 conversions +simdutf_really_inline __m128i convert_utf8_3_byte_to_utf16(__m128i in) { + // Low half contains 10bbbbbb|10cccccc + // High half contains 1110aaaa|1110aaaa + const v16u8 sh = {2, 1, 5, 4, 8, 7, 11, 10, 0, 0, 3, 3, 6, 6, 9, 9}; + const v8u16 v0fff = {0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff, 0xfff}; + + __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, (__m128i)sh); + // 1110aaaa => aaaa0000 + __m128i perm_high = __lsx_vslli_b(__lsx_vbsrl_v(perm, 8), 4); + // 10bbbbbb 10cccccc => 0010bbbb bbcccccc + __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), /* perm >> 2*/ + perm, __lsx_vrepli_h(0x3f) /* 0x003f */); + // 0010bbbb bbcccccc => aaaabbbb bbcccccc + composed = __lsx_vbitsel_v(perm_high, composed, (__m128i)v0fff); - // 6. adjust pointers - utf8_output += row[0]; + return composed; } -inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output, - const __m128i v_0000, - const __m128i v_ff80) { - // no bits set above 7th bit - const __m128i one_byte_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(v_u16, v_ff80), v_0000); - const uint16_t one_byte_bitmask = - static_cast(_mm_movemask_epi8(one_byte_bytemask)); - - write_v_u16_11bits_to_utf8(v_u16, utf8_output, one_byte_bytemask, - one_byte_bitmask); +simdutf_really_inline __m128i convert_utf8_2_byte_to_utf16(__m128i in) { + // 10bbbbb 110aaaaa => 00bbbbb 000aaaaa + __m128i composed = __lsx_vand_v(in, __lsx_vldi(0x3f)); + // 00bbbbbb 000aaaaa => 00000aaa aabbbbbb + composed = __lsx_vbitsel_v( + __lsx_vsrli_h(__lsx_vslli_h(composed, 8), 2), /* (aaaaa << 8) >> 2 */ + __lsx_vsrli_h(composed, 8), /* bbbbbb >> 8 */ + __lsx_vrepli_h(0x3f)); /* 0x003f */ + return composed; } -/* end file src/westmere/internal/write_v_u16_11bits_to_utf8.cpp */ -} // namespace westmere -} // namespace internal -/* end file src/westmere/internal/loader.cpp */ +simdutf_really_inline __m128i +convert_utf8_1_to_2_byte_to_utf16(__m128i in, size_t shufutf8_idx) { + // Converts 6 1-2 byte UTF-8 characters to 6 UTF-16 characters. + // This is a relatively easy scenario + // we process SIX (6) input code-code units. The max length in bytes of six + // code code units spanning between 1 and 2 bytes each is 12 bytes. + __m128i sh = + __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[shufutf8_idx]), + 0); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh); + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000000 00bbbbbb + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_h(0x7f)); // 6 or 7 bits + // 1 byte: 00000000 00000000 + // 2 byte: 00000aaa aa000000 + __m128i v1f00 = __lsx_vldi(-2785); // -2785(13bit) => 151f + __m128i composed = __lsx_vsrli_h(__lsx_vand_v(perm, v1f00), 2); // 5 bits + // Combine with a shift right accumulate + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 00000aaa aabbbbbb + composed = __lsx_vadd_h(ascii, composed); + return composed; +} -/* begin file src/westmere/sse_validate_utf16.cpp */ +/* begin file src/lasx/lasx_validate_utf16.cpp */ /* In UTF-16 code units in range 0xD800 to 0xDFFF have special meaning. @@ -35344,7 +50076,7 @@ inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output, - there must not be two consecutive high surrogates (0xdc00 .. 
0xdfff) - there must not be sole low surrogate nor high surrogate - We are going to build three bitmasks based on the 3rd nibble: + We're going to build three bitmasks based on the 3rd nibble: - V = valid word, - L = low surrogate (0xd800 .. 0xdbff) - H = high surrogate (0xdc00 .. 0xdfff) @@ -35371,7 +50103,7 @@ inline void write_v_u16_11bits_to_utf8(const __m128i v_u16, char *&utf8_output, - nullptr if an error was detected. */ template -const char16_t *sse_validate_utf16(const char16_t *input, size_t size) { +const char16_t *lasx_validate_utf16(const char16_t *input, size_t size) { const char16_t *end = input + size; const auto v_d8 = simd8::splat(0xd8); @@ -35379,29 +50111,26 @@ const char16_t *sse_validate_utf16(const char16_t *input, size_t size) { const auto v_fc = simd8::splat(0xfc); const auto v_dc = simd8::splat(0xdc); - while (input + simd16::SIZE * 2 < end) { + while (input + simd16::ELEMENTS * 2 < end) { // 0. Load data: since the validation takes into account only higher // byte of each word, we compress the two vectors into one which // consists only the higher bytes. auto in0 = simd16(input); - auto in1 = - simd16(input + simd16::SIZE / sizeof(char16_t)); + auto in1 = simd16(input + simd16::ELEMENTS); + if (big_endian) { in0 = in0.swap_bytes(); in1 = in1.swap_bytes(); } - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - - const auto in = simd16::pack(t0, t1); + const auto in = simd8(__lasx_xvpermi_d( + __lasx_xvssrlni_bu_h(in1.value, in0.value, 8), 0b11011000)); // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). const auto surrogates_wordmask = (in & v_f8) == v_d8; - const uint16_t surrogates_bitmask = - static_cast(surrogates_wordmask.to_bitmask()); - if (surrogates_bitmask == 0x0000) { - input += 16; + const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); + if (surrogates_bitmask == 0x0) { + input += simd16::ELEMENTS * 2; } else { // 2. We have some surrogates that have to be distinguished: // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) @@ -35411,36 +50140,35 @@ const char16_t *sse_validate_utf16(const char16_t *input, size_t size) { // V - non-surrogate code units // V = not surrogates_wordmask - const uint16_t V = static_cast(~surrogates_bitmask); + const uint32_t V = ~surrogates_bitmask; // H - word-mask for high surrogates: the six highest bits are 0b1101'11 const auto vH = (in & v_fc) == v_dc; - const uint16_t H = static_cast(vH.to_bitmask()); + const uint32_t H = vH.to_bitmask(); // L - word mask for low surrogates // L = not H and surrogates_wordmask - const uint16_t L = static_cast(~H & surrogates_bitmask); + const uint32_t L = ~H & surrogates_bitmask; - const uint16_t a = static_cast( - L & (H >> 1)); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint16_t b = static_cast( - a << 1); // Just mark that the opinput - startite fact is hold, - // thanks to that we have only two masks for valid case. - const uint16_t c = static_cast( - V | a | b); // Combine all the masks into the final one. + const uint32_t a = + L & (H >> 1); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint32_t b = + a << 1; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint32_t c = V | a | b; // Combine all the masks into the final one. 
- if (c == 0xffff) { + if (c == 0xffffffff) { // The whole input register contains valid UTF-16, i.e., // either single code units or proper surrogate pairs. - input += 16; - } else if (c == 0x7fff) { - // The 15 lower code units of the input register contains valid UTF-16. - // The 15th word may be either a low or high surrogate. It the next + input += simd16::ELEMENTS * 2; + } else if (c == 0x7fffffff) { + // The 31 lower code units of the input register contains valid UTF-16. + // The 31 word may be either a low or high surrogate. It the next // iteration we 1) check if the low surrogate is followed by a high // one, 2) reject sole high surrogate. - input += 15; + input += simd16::ELEMENTS * 2 - 1; } else { return nullptr; } @@ -35451,8 +50179,8 @@ const char16_t *sse_validate_utf16(const char16_t *input, size_t size) { } template -const result sse_validate_utf16_with_errors(const char16_t *input, - size_t size) { +const result lasx_validate_utf16_with_errors(const char16_t *input, + size_t size) { if (simdutf_unlikely(size == 0)) { return result(error_code::SUCCESS, 0); } @@ -35464,30 +50192,25 @@ const result sse_validate_utf16_with_errors(const char16_t *input, const auto v_fc = simd8::splat(0xfc); const auto v_dc = simd8::splat(0xdc); - while (input + simd16::SIZE * 2 < end) { + while (input + simd16::ELEMENTS * 2 < end) { // 0. Load data: since the validation takes into account only higher // byte of each word, we compress the two vectors into one which // consists only the higher bytes. auto in0 = simd16(input); - auto in1 = - simd16(input + simd16::SIZE / sizeof(char16_t)); + auto in1 = simd16(input + simd16::ELEMENTS); if (big_endian) { in0 = in0.swap_bytes(); in1 = in1.swap_bytes(); } - - const auto t0 = in0.shr<8>(); - const auto t1 = in1.shr<8>(); - - const auto in = simd16::pack(t0, t1); + const auto in = simd8(__lasx_xvpermi_d( + __lasx_xvssrlni_bu_h(in1.value, in0.value, 8), 0b11011000)); // 1. Check whether we have any 0xD800..DFFF word (0b1101'1xxx'yyyy'yyyy). const auto surrogates_wordmask = (in & v_f8) == v_d8; - const uint16_t surrogates_bitmask = - static_cast(surrogates_wordmask.to_bitmask()); - if (surrogates_bitmask == 0x0000) { - input += 16; + const uint32_t surrogates_bitmask = surrogates_wordmask.to_bitmask(); + if (surrogates_bitmask == 0x0) { + input += simd16::ELEMENTS * 2; } else { // 2. We have some surrogates that have to be distinguished: // - low surrogates: 0b1101'10xx'yyyy'yyyy (0xD800..0xDBFF) @@ -35497,36 +50220,35 @@ const result sse_validate_utf16_with_errors(const char16_t *input, // V - non-surrogate code units // V = not surrogates_wordmask - const uint16_t V = static_cast(~surrogates_bitmask); + const uint32_t V = ~surrogates_bitmask; // H - word-mask for high surrogates: the six highest bits are 0b1101'11 const auto vH = (in & v_fc) == v_dc; - const uint16_t H = static_cast(vH.to_bitmask()); + const uint32_t H = vH.to_bitmask(); // L - word mask for low surrogates // L = not H and surrogates_wordmask - const uint16_t L = static_cast(~H & surrogates_bitmask); + const uint32_t L = ~H & surrogates_bitmask; - const uint16_t a = static_cast( - L & (H >> 1)); // A low surrogate must be followed by high one. - // (A low surrogate placed in the 7th register's word - // is an exception we handle.) - const uint16_t b = static_cast( - a << 1); // Just mark that the opinput - startite fact is hold, - // thanks to that we have only two masks for valid case. - const uint16_t c = static_cast( - V | a | b); // Combine all the masks into the final one. 
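// ----------------------------------------------------------------------------
// [Editorial sketch, not part of the vendored simdutf sources.] The V/L/H
// bitmask scheme described in the block comment above works identically in
// scalar form. A minimal rendering over blocks of 16 native-endian UTF-16 code
// units, keeping the comment's naming (L = 0xD800..0xDBFF, H = 0xDC00..0xDFFF):
#include <cstddef>
#include <cstdint>

static bool validate_utf16_vlh_sketch(const char16_t *input, size_t size) {
  size_t i = 0;
  while (i + 16 <= size) {
    uint16_t surrogates = 0, H = 0;
    for (int k = 0; k < 16; k++) {
      const uint16_t w = static_cast<uint16_t>(input[i + k]);
      if ((w & 0xF800) == 0xD800) surrogates |= static_cast<uint16_t>(1u << k);
      if ((w & 0xFC00) == 0xDC00) H |= static_cast<uint16_t>(1u << k);
    }
    const uint16_t V = static_cast<uint16_t>(~surrogates);  // non-surrogates
    const uint16_t L = static_cast<uint16_t>(surrogates & ~H);
    const uint16_t a = static_cast<uint16_t>(L & (H >> 1)); // L followed by H
    const uint16_t b = static_cast<uint16_t>(a << 1);       // the matching H
    const uint16_t c = static_cast<uint16_t>(V | a | b);
    if (c == 0xFFFF) {
      i += 16;          // whole block valid
    } else if (c == 0x7FFF) {
      i += 15;          // last unit may pair with the next block
    } else {
      return false;     // lone or misordered surrogate
    }
  }
  while (i < size) {    // scalar tail, same rules
    const uint16_t w = static_cast<uint16_t>(input[i]);
    if ((w & 0xF800) != 0xD800) { i++; continue; }
    if ((w & 0xFC00) == 0xDC00 || i + 1 == size) return false;
    const uint16_t next = static_cast<uint16_t>(input[i + 1]);
    if ((next & 0xFC00) != 0xDC00) return false;
    i += 2;
  }
  return true;
}
// ----------------------------------------------------------------------------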
+ const uint32_t a = + L & (H >> 1); // A low surrogate must be followed by high one. + // (A low surrogate placed in the 7th register's word + // is an exception we handle.) + const uint32_t b = + a << 1; // Just mark that the opposite fact is hold, + // thanks to that we have only two masks for valid case. + const uint32_t c = V | a | b; // Combine all the masks into the final one. - if (c == 0xffff) { + if (c == 0xffffffff) { // The whole input register contains valid UTF-16, i.e., // either single code units or proper surrogate pairs. - input += 16; - } else if (c == 0x7fff) { - // The 15 lower code units of the input register contains valid UTF-16. - // The 15th word may be either a low or high surrogate. It the next + input += simd16::ELEMENTS * 2; + } else if (c == 0x7fffffff) { + // The 31 lower code units of the input register contains valid UTF-16. + // The 31 word may be either a low or high surrogate. It the next // iteration we 1) check if the low surrogate is followed by a high // one, 2) reject sole high surrogate. - input += 15; + input += simd16::ELEMENTS * 2 - 1; } else { return result(error_code::SURROGATE, input - start); } @@ -35535,200 +50257,289 @@ const result sse_validate_utf16_with_errors(const char16_t *input, return result(error_code::SUCCESS, input - start); } -/* end file src/westmere/sse_validate_utf16.cpp */ -/* begin file src/westmere/sse_validate_utf32le.cpp */ -/* Returns: - - pointer to the last unprocessed character (a scalar fallback should check - the rest); - - nullptr if an error was detected. -*/ -const char32_t *sse_validate_utf32le(const char32_t *input, size_t size) { +/* end file src/lasx/lasx_validate_utf16.cpp */ +/* begin file src/lasx/lasx_validate_utf32le.cpp */ + +const char32_t *lasx_validate_utf32le(const char32_t *input, size_t size) { const char32_t *end = input + size; - const __m128i standardmax = _mm_set1_epi32(0x10ffff); - const __m128i offset = _mm_set1_epi32(0xffff2000); - const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); - __m128i currentmax = _mm_setzero_si128(); - __m128i currentoffsetmax = _mm_setzero_si128(); + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)input & 0x1F) && input < end) { + uint32_t word = *input++; + if (word > 0x10FFFF || (word >= 0xD800 && word <= 0xDFFF)) { + return nullptr; + } + } - while (input + 4 < end) { - const __m128i in = _mm_loadu_si128((__m128i *)input); - currentmax = _mm_max_epu32(in, currentmax); + __m256i offset = __lasx_xvreplgr2vr_w(uint32_t(0xffff2000)); + __m256i standardoffsetmax = __lasx_xvreplgr2vr_w(uint32_t(0xfffff7ff)); + __m256i standardmax = __lasx_xvldi(-2288); /*0x10ffff*/ + __m256i currentmax = __lasx_xvldi(0x0); + __m256i currentoffsetmax = __lasx_xvldi(0x0); + + while (input + 8 < end) { + __m256i in = __lasx_xvld(reinterpret_cast(input), 0); + currentmax = __lasx_xvmax_wu(in, currentmax); + // 0xD8__ + 0x2000 = 0xF8__ => 0xF8__ > 0xF7FF currentoffsetmax = - _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax); - input += 4; + __lasx_xvmax_wu(__lasx_xvadd_w(in, offset), currentoffsetmax); + input += 8; } - __m128i is_zero = - _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); - if (_mm_test_all_zeros(is_zero, is_zero) == 0) { + __m256i is_zero = + __lasx_xvxor_v(__lasx_xvmax_wu(currentmax, standardmax), standardmax); + if (__lasx_xbnz_v(is_zero)) { return nullptr; } - is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), - standardoffsetmax); - if 
(_mm_test_all_zeros(is_zero, is_zero) == 0) { + is_zero = __lasx_xvxor_v(__lasx_xvmax_wu(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (__lasx_xbnz_v(is_zero)) { return nullptr; } - return input; } -const result sse_validate_utf32le_with_errors(const char32_t *input, - size_t size) { +const result lasx_validate_utf32le_with_errors(const char32_t *input, + size_t size) { const char32_t *start = input; const char32_t *end = input + size; - const __m128i standardmax = _mm_set1_epi32(0x10ffff); - const __m128i offset = _mm_set1_epi32(0xffff2000); - const __m128i standardoffsetmax = _mm_set1_epi32(0xfffff7ff); - __m128i currentmax = _mm_setzero_si128(); - __m128i currentoffsetmax = _mm_setzero_si128(); + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)input & 0x1F) && input < end) { + uint32_t word = *input; + if (word > 0x10FFFF) { + return result(error_code::TOO_LARGE, input - start); + } + if (word >= 0xD800 && word <= 0xDFFF) { + return result(error_code::SURROGATE, input - start); + } + input++; + } - while (input + 4 < end) { - const __m128i in = _mm_loadu_si128((__m128i *)input); - currentmax = _mm_max_epu32(in, currentmax); + __m256i offset = __lasx_xvreplgr2vr_w(uint32_t(0xffff2000)); + __m256i standardoffsetmax = __lasx_xvreplgr2vr_w(uint32_t(0xfffff7ff)); + __m256i standardmax = __lasx_xvldi(-2288); /*0x10ffff*/ + __m256i currentmax = __lasx_xvldi(0x0); + __m256i currentoffsetmax = __lasx_xvldi(0x0); + + while (input + 8 < end) { + __m256i in = __lasx_xvld(reinterpret_cast(input), 0); + currentmax = __lasx_xvmax_wu(in, currentmax); currentoffsetmax = - _mm_max_epu32(_mm_add_epi32(in, offset), currentoffsetmax); + __lasx_xvmax_wu(__lasx_xvadd_w(in, offset), currentoffsetmax); - __m128i is_zero = - _mm_xor_si128(_mm_max_epu32(currentmax, standardmax), standardmax); - if (_mm_test_all_zeros(is_zero, is_zero) == 0) { + __m256i is_zero = + __lasx_xvxor_v(__lasx_xvmax_wu(currentmax, standardmax), standardmax); + if (__lasx_xbnz_v(is_zero)) { return result(error_code::TOO_LARGE, input - start); } - - is_zero = _mm_xor_si128(_mm_max_epu32(currentoffsetmax, standardoffsetmax), - standardoffsetmax); - if (_mm_test_all_zeros(is_zero, is_zero) == 0) { + is_zero = + __lasx_xvxor_v(__lasx_xvmax_wu(currentoffsetmax, standardoffsetmax), + standardoffsetmax); + if (__lasx_xbnz_v(is_zero)) { return result(error_code::SURROGATE, input - start); } - input += 4; + input += 8; } return result(error_code::SUCCESS, input - start); } -/* end file src/westmere/sse_validate_utf32le.cpp */ - -/* begin file src/westmere/sse_convert_latin1_to_utf8.cpp */ -std::pair -sse_convert_latin1_to_utf8(const char *latin_input, - const size_t latin_input_length, char *utf8_output) { - const char *end = latin_input + latin_input_length; - - const __m128i v_0000 = _mm_setzero_si128(); - // 0b1000_0000 - const __m128i v_80 = _mm_set1_epi8((uint8_t)0x80); - // 0b1111_1111_1000_0000 - const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); - - const __m128i latin_1_half_into_u16_byte_mask = - _mm_setr_epi8(0, '\x80', 1, '\x80', 2, '\x80', 3, '\x80', 4, '\x80', 5, - '\x80', 6, '\x80', 7, '\x80'); +/* end file src/lasx/lasx_validate_utf32le.cpp */ - const __m128i latin_2_half_into_u16_byte_mask = - _mm_setr_epi8(8, '\x80', 9, '\x80', 10, '\x80', 11, '\x80', 12, '\x80', - 13, '\x80', 14, '\x80', 15, '\x80'); +/* begin file src/lasx/lasx_convert_latin1_to_utf8.cpp */ +/* + Returns a pair: the first unprocessed byte from buf and utf8_output + A scalar routing should carry 
on the conversion of the tail. +*/ - // each latin1 takes 1-2 utf8 bytes - // slow path writes useful 8-15 bytes twice (eagerly writes 16 bytes and then - // adjust the pointer) so the last write can exceed the utf8_output size by - // 8-1 bytes by reserving 8 extra input bytes, we expect the output to have - // 8-16 bytes free - while (end - latin_input >= 16 + 8) { - // Load 16 Latin1 characters (16 bytes) into a 128-bit register - __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input); +std::pair +lasx_convert_latin1_to_utf8(const char *latin1_input, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); + const size_t safety_margin = 12; + const char *end = latin1_input + len - safety_margin; - if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!! - _mm_storeu_si128((__m128i *)utf8_output, v_latin); - latin_input += 16; + // We always write 16 bytes, of which more than the first 8 bytes + // are valid. A safety margin of 8 is more than sufficient. + while (latin1_input + 16 <= end) { + __m128i in8 = __lsx_vld(reinterpret_cast(latin1_input), 0); + uint32_t ascii_mask = __lsx_vpickve2gr_wu(__lsx_vmskgez_b(in8), 0); + if (ascii_mask == 0xFFFF) { + __lsx_vst(in8, utf8_output, 0); utf8_output += 16; + latin1_input += 16; continue; } + // We just fallback on UTF-16 code. This could be optimized/simplified + // further. + __m256i in16 = __lasx_vext2xv_hu_bu(____m256i(in8)); + // 1. prepare 2-byte values + // input 8-bit word : [aabb|bbbb] x 16 + // expected output : [1100|00aa|10bb|bbbb] x 16 + // t0 = [0000|00aa|bbbb|bb00] + __m256i t0 = __lasx_xvslli_h(in16, 2); + // t1 = [0000|00aa|0000|0000] + __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785)); + // t3 = [0000|00aa|00bb|bbbb] + __m256i t2 = __lasx_xvbitsel_v(t1, in16, __lasx_xvrepli_h(0x3f)); + // t4 = [1100|00aa|10bb|bbbb] + __m256i t3 = __lasx_xvor_v(t2, __lasx_xvreplgr2vr_h(uint16_t(0xc080))); + // merge ASCII and 2-byte codewords + __m256i one_byte_bytemask = __lasx_xvsle_hu(in16, __lasx_xvrepli_h(0x7F)); + __m256i utf8_unpacked = __lasx_xvbitsel_v(t3, in16, one_byte_bytemask); + + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[(ascii_mask & 0xFF)]][0]; + __m128i shuffle0 = __lsx_vld(row0 + 1, 0); + __m128i utf8_unpacked_lo = lasx_extracti128_lo(utf8_unpacked); + __m128i utf8_packed0 = + __lsx_vshuf_b(utf8_unpacked_lo, utf8_unpacked_lo, shuffle0); + __lsx_vst(utf8_packed0, utf8_output, 0); + utf8_output += row0[0]; + + const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[(ascii_mask >> 8)]][0]; + __m128i shuffle1 = __lsx_vld(row1 + 1, 0); + __m128i utf8_unpacked_hi = lasx_extracti128_hi(utf8_unpacked); + __m128i utf8_packed1 = + __lsx_vshuf_b(utf8_unpacked_hi, utf8_unpacked_hi, shuffle1); + __lsx_vst(utf8_packed1, utf8_output, 0); + utf8_output += row1[0]; - // assuming a/b are bytes and A/B are uint16 of the same value - // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA - __m128i v_u16_latin_1_half = - _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask); - // aaaa_aaaa_bbbb_bbbb -> BBBB_BBBB - __m128i v_u16_latin_2_half = - _mm_shuffle_epi8(v_latin, latin_2_half_into_u16_byte_mask); + latin1_input += 16; + } // while - internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_1_half, - utf8_output, v_0000, v_ff80); - internal::westmere::write_v_u16_11bits_to_utf8(v_u16_latin_2_half, - utf8_output, v_0000, v_ff80); - latin_input += 16; + return std::make_pair(latin1_input, 
reinterpret_cast(utf8_output)); +} +/* end file src/lasx/lasx_convert_latin1_to_utf8.cpp */ +/* begin file src/lasx/lasx_convert_latin1_to_utf16.cpp */ +std::pair +lasx_convert_latin1_to_utf16le(const char *buf, size_t len, + char16_t *utf16_output) { + const char *end = buf + len; + + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)utf16_output & 0x1F) && buf < end) { + *utf16_output++ = uint8_t(*buf) & 0xFF; + buf++; } - if (end - latin_input >= 16) { - // Load 16 Latin1 characters (16 bytes) into a 128-bit register - __m128i v_latin = _mm_loadu_si128((__m128i *)latin_input); + while (buf + 32 <= end) { + __m256i in8 = __lasx_xvld(reinterpret_cast(buf), 0); - if (_mm_testz_si128(v_latin, v_80)) { // ASCII fast path!!!! - _mm_storeu_si128((__m128i *)utf8_output, v_latin); - latin_input += 16; - utf8_output += 16; - } else { - // assuming a/b are bytes and A/B are uint16 of the same value - // aaaa_aaaa_bbbb_bbbb -> AAAA_AAAA - __m128i v_u16_latin_1_half = - _mm_shuffle_epi8(v_latin, latin_1_half_into_u16_byte_mask); - internal::westmere::write_v_u16_11bits_to_utf8( - v_u16_latin_1_half, utf8_output, v_0000, v_ff80); - latin_input += 8; - } + __m256i inlow = __lasx_vext2xv_hu_bu(in8); + __m256i in8_high = __lasx_xvpermi_q(in8, in8, 0b00000001); + __m256i inhigh = __lasx_vext2xv_hu_bu(in8_high); + __lasx_xvst(inlow, reinterpret_cast(utf16_output), 0); + __lasx_xvst(inhigh, reinterpret_cast(utf16_output), 32); + + utf16_output += 32; + buf += 32; } - return std::make_pair(latin_input, utf8_output); + if (buf + 16 <= end) { + __m128i zero = __lsx_vldi(0); + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); + + __m128i inlow = __lsx_vilvl_b(zero, in8); + __m128i inhigh = __lsx_vilvh_b(zero, in8); + __lsx_vst(inlow, reinterpret_cast(utf16_output), 0); + __lsx_vst(inhigh, reinterpret_cast(utf16_output), 16); + + utf16_output += 16; + buf += 16; + } + return std::make_pair(buf, utf16_output); } -/* end file src/westmere/sse_convert_latin1_to_utf8.cpp */ -/* begin file src/westmere/sse_convert_latin1_to_utf16.cpp */ -template + std::pair -sse_convert_latin1_to_utf16(const char *latin1_input, size_t len, - char16_t *utf16_output) { - size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - for (size_t i = 0; i < rounded_len; i += 16) { - // Load 16 Latin1 characters into a 128-bit register - __m128i in = - _mm_loadu_si128(reinterpret_cast(&latin1_input[i])); - __m128i out1 = big_endian ? _mm_unpacklo_epi8(_mm_setzero_si128(), in) - : _mm_unpacklo_epi8(in, _mm_setzero_si128()); - __m128i out2 = big_endian ? 
_mm_unpackhi_epi8(_mm_setzero_si128(), in) - : _mm_unpackhi_epi8(in, _mm_setzero_si128()); - // Zero extend each Latin1 character to 16-bit integers and store the - // results back to memory - _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i]), out1); - _mm_storeu_si128(reinterpret_cast<__m128i *>(&utf16_output[i + 8]), out2); +lasx_convert_latin1_to_utf16be(const char *buf, size_t len, + char16_t *utf16_output) { + const char *end = buf + len; + + while (((uint64_t)utf16_output & 0x1F) && buf < end) { + *utf16_output++ = (uint16_t(*buf++) << 8); } - // return pointers pointing to where we left off - return std::make_pair(latin1_input + rounded_len, utf16_output + rounded_len); + + __m256i zero = __lasx_xvldi(0); + while (buf + 32 <= end) { + __m256i in8 = __lasx_xvld(reinterpret_cast(buf), 0); + + __m256i in8_shuf = __lasx_xvpermi_d(in8, 0b11011000); + + __m256i inlow = __lasx_xvilvl_b(in8_shuf, zero); + __m256i inhigh = __lasx_xvilvh_b(in8_shuf, zero); + __lasx_xvst(inlow, reinterpret_cast(utf16_output), 0); + __lasx_xvst(inhigh, reinterpret_cast(utf16_output), 32); + utf16_output += 32; + buf += 32; + } + + if (buf + 16 <= end) { + __m128i zero_128 = __lsx_vldi(0); + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); + + __m128i inlow = __lsx_vilvl_b(in8, zero_128); + __m128i inhigh = __lsx_vilvh_b(in8, zero_128); + __lsx_vst(inlow, reinterpret_cast(utf16_output), 0); + __lsx_vst(inhigh, reinterpret_cast(utf16_output), 16); + utf16_output += 16; + buf += 16; + } + + return std::make_pair(buf, utf16_output); } -/* end file src/westmere/sse_convert_latin1_to_utf16.cpp */ -/* begin file src/westmere/sse_convert_latin1_to_utf32.cpp */ +/* end file src/lasx/lasx_convert_latin1_to_utf16.cpp */ +/* begin file src/lasx/lasx_convert_latin1_to_utf32.cpp */ std::pair -sse_convert_latin1_to_utf32(const char *buf, size_t len, - char32_t *utf32_output) { +lasx_convert_latin1_to_utf32(const char *buf, size_t len, + char32_t *utf32_output) { const char *end = buf + len; - while (end - buf >= 16) { - // Load 16 Latin1 characters (16 bytes) into a 128-bit register - __m128i in = _mm_loadu_si128((__m128i *)buf); + // LASX requires 32-byte alignment, otherwise performance will be degraded + while (((uint64_t)utf32_output & 0x1F) && buf < end) { + *utf32_output++ = ((uint32_t)*buf) & 0xFF; + buf++; + } - // Shift input to process next 4 bytes - __m128i in_shifted1 = _mm_srli_si128(in, 4); - __m128i in_shifted2 = _mm_srli_si128(in, 8); - __m128i in_shifted3 = _mm_srli_si128(in, 12); + while (buf + 32 <= end) { + __m256i in8 = __lasx_xvld(reinterpret_cast(buf), 0); - // expand 8-bit to 32-bit unit - __m128i out1 = _mm_cvtepu8_epi32(in); - __m128i out2 = _mm_cvtepu8_epi32(in_shifted1); - __m128i out3 = _mm_cvtepu8_epi32(in_shifted2); - __m128i out4 = _mm_cvtepu8_epi32(in_shifted3); + __m256i in32_0 = __lasx_vext2xv_wu_bu(in8); + __lasx_xvst(in32_0, reinterpret_cast(utf32_output), 0); - _mm_storeu_si128((__m128i *)utf32_output, out1); - _mm_storeu_si128((__m128i *)(utf32_output + 4), out2); - _mm_storeu_si128((__m128i *)(utf32_output + 8), out3); - _mm_storeu_si128((__m128i *)(utf32_output + 12), out4); + __m256i in8_1 = __lasx_xvpermi_d(in8, 0b00000001); + __m256i in32_1 = __lasx_vext2xv_wu_bu(in8_1); + __lasx_xvst(in32_1, reinterpret_cast(utf32_output), 32); + + __m256i in8_2 = __lasx_xvpermi_d(in8, 0b00000010); + __m256i in32_2 = __lasx_vext2xv_wu_bu(in8_2); + __lasx_xvst(in32_2, reinterpret_cast(utf32_output), 64); + + __m256i in8_3 = __lasx_xvpermi_d(in8, 0b00000011); + __m256i in32_3 = 
__lasx_vext2xv_wu_bu(in8_3); + __lasx_xvst(in32_3, reinterpret_cast(utf32_output), 96); + + utf32_output += 32; + buf += 32; + } + + if (buf + 16 <= end) { + __m128i in8 = __lsx_vld(reinterpret_cast(buf), 0); + + __m128i zero = __lsx_vldi(0); + __m128i in16low = __lsx_vilvl_b(zero, in8); + __m128i in16high = __lsx_vilvh_b(zero, in8); + __m128i in32_0 = __lsx_vilvl_h(zero, in16low); + __m128i in32_1 = __lsx_vilvh_h(zero, in16low); + __m128i in32_2 = __lsx_vilvl_h(zero, in16high); + __m128i in32_3 = __lsx_vilvh_h(zero, in16high); + + __lsx_vst(in32_0, reinterpret_cast(utf32_output), 0); + __lsx_vst(in32_1, reinterpret_cast(utf32_output), 16); + __lsx_vst(in32_2, reinterpret_cast(utf32_output), 32); + __lsx_vst(in32_3, reinterpret_cast(utf32_output), 48); utf32_output += 16; buf += 16; @@ -35736,15 +50547,13 @@ sse_convert_latin1_to_utf32(const char *buf, size_t len, return std::make_pair(buf, utf32_output); } -/* end file src/westmere/sse_convert_latin1_to_utf32.cpp */ - -/* begin file src/westmere/sse_convert_utf8_to_utf16.cpp */ -// depends on "tables/utf8_to_utf16_tables.h" +/* end file src/lasx/lasx_convert_latin1_to_utf32.cpp */ -// Convert up to 12 bytes from utf8 to utf16 using a mask indicating the +/* begin file src/lasx/lasx_convert_utf8_to_utf16.cpp */ +// Convert up to 16 bytes from utf8 to utf16 using a mask indicating the // end of the code points. Only the least significant 12 bits of the mask // are accessed. -// It returns how many bytes were consumed (up to 12). +// It returns how many bytes were consumed (up to 16, usually 12). template size_t convert_masked_utf8_to_utf16(const char *input, uint64_t utf8_end_of_code_point_mask, @@ -35753,204 +50562,304 @@ size_t convert_masked_utf8_to_utf16(const char *input, // Why 12 input bytes and not 16? Because we are concerned with the size of // the lookup tables. Also 12 is nicely divisible by two and three. // + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; // // Optimization note: our main path below is load-latency dependent. Thus it // is maybe beneficial to have fast paths that depend on branch prediction but // have less latency. This results in more instructions but, potentially, also // higher speeds. - // + // We first try a few fast paths. - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - const __m128i in = _mm_loadu_si128((__m128i *)input); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; - if (utf8_end_of_code_point_mask == 0xfff) { - // We process the data in chunks of 12 bytes. - // Note: using 16 bytes is unsafe, see issue_ossfuzz_71218 - __m128i ascii_first = _mm_cvtepu8_epi16(in); - __m128i ascii_second = _mm_cvtepu8_epi16(_mm_srli_si128(in, 8)); - if (big_endian) { - ascii_first = _mm_shuffle_epi8(ascii_first, swap); - ascii_second = _mm_shuffle_epi8(ascii_second, swap); + // The obvious first test is ASCII, which actually consumes the full 16. 
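// ----------------------------------------------------------------------------
// [Editorial sketch, not part of the vendored simdutf sources.] For reference,
// the "end of code point" mask consumed by convert_masked_utf8_to_utf16 can be
// described in scalar form. Assuming bit i is set when input[i] is the final
// byte of its code point (which is what the 0xFFFF / 0xAAAA / 0x924 fast-path
// constants below imply), a hypothetical helper over a 16-byte window is:
#include <cstddef>
#include <cstdint>

static uint16_t end_of_code_point_mask_sketch(const unsigned char *p, size_t n) {
  // A byte ends a code point when the byte after it is not a 10xxxxxx
  // continuation byte (bytes at the window edge are treated as ends here).
  uint16_t mask = 0;
  for (size_t i = 0; i < n && i < 16; i++) {
    const bool next_is_continuation = (i + 1 < n) && ((p[i + 1] & 0xC0) == 0x80);
    if (!next_is_continuation) mask |= static_cast<uint16_t>(1u << i);
  }
  return mask;
}
// e.g. 16 ASCII bytes -> 0xFFFF, eight 2-byte sequences -> 0xAAAA,
// four 3-byte sequences in bytes 0..11 -> low 12 bits equal to 0x924.
// ----------------------------------------------------------------------------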
+ if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) { + __m128i zero = __lsx_vldi(0); + if (match_system(big_endian)) { + __lsx_vst(__lsx_vilvl_b(zero, in), + reinterpret_cast(utf16_output), 0); + __lsx_vst(__lsx_vilvh_b(zero, in), + reinterpret_cast(utf16_output), 16); + } else { + __lsx_vst(__lsx_vilvl_b(in, zero), + reinterpret_cast(utf16_output), 0); + __lsx_vst(__lsx_vilvh_b(in, zero), + reinterpret_cast(utf16_output), 16); } - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output), ascii_first); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf16_output + 8), - ascii_second); - utf16_output += 12; // We wrote 12 16-bit characters. - return 12; // We consumed 12 bytes. + utf16_output += 16; // We wrote 16 16-bit characters. + return 16; // We consumed 16 bytes. } - if (((utf8_end_of_code_point_mask & 0xFFFF) == 0xaaaa)) { - // We want to take 8 2-byte UTF-8 code units and turn them into 8 2-byte - // UTF-16 code units. There is probably a more efficient sequence, but the - // following might do. - const __m128i sh = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - if (big_endian) - composed = _mm_shuffle_epi8(composed, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed); - utf16_output += 8; // We wrote 16 bytes, 8 code points. - return 16; + + // 3 byte sequences are the next most common, as seen in CJK, which has long + // sequences of these. + if (input_utf8_end_of_code_point_mask == 0x924) { + // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte + // UTF-16 code units. + __m128i composed = convert_utf8_3_byte_to_utf16(in); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 4; // We wrote 4 16-bit characters. + return 12; // We consumed 12 bytes. } - if (input_utf8_end_of_code_point_mask == 0x924) { - // We want to take 4 3-byte UTF-8 code units and turn them into 4 2-byte - // UTF-16 code units. There is probably a more efficient sequence, but the - // following might do. - const __m128i sh = - _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = - _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits - const __m128i middlebyte = - _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - const __m128i highbyte = - _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); - __m128i composed_repacked = _mm_packus_epi32(composed, composed); - if (big_endian) - composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); - utf16_output += 4; - return 12; + + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xAAAA) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 2-byte + // UTF-16 code units. 
+ __m128i composed = convert_utf8_2_byte_to_utf16(in); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 8; // We wrote 6 16-bit characters. + return 16; // We consumed 12 bytes. } - /// We do not have a fast path available, so we fallback. - const uint8_t idx = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + /// We do not have a fast path available, or the fast path is unimportant, so + /// we fallback. + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; + const __m128i zero = __lsx_vldi(0); if (idx < 64) { // SIX (6) input code-code units - // this is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six - // code code units spanning between 1 and 2 bytes each is 12 bytes. On - // processors where pdep/pext is fast, we might be able to use a small - // lookup table. - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - if (big_endian) - composed = _mm_shuffle_epi8(composed, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed); - utf16_output += 6; // We wrote 12 bytes, 6 code points. + // Convert to UTF-16 + __m128i composed = convert_utf8_1_to_2_byte_to_utf16(in, idx); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + // Store + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 6; // We wrote 6 16-bit characters. + return consumed; } else if (idx < 145) { // FOUR (4) input code-code units - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = - _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits - const __m128i middlebyte = - _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - const __m128i highbyte = - _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); - __m128i composed_repacked = _mm_packus_epi32(composed, composed); - if (big_endian) - composed_repacked = _mm_shuffle_epi8(composed_repacked, swap); - _mm_storeu_si128((__m128i *)utf16_output, composed_repacked); - utf16_output += 4; + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // XXX: depending on the system scalar instructions might be faster. 
+ // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // 1 byte: 00000000 0ccccccc + // 2 byte: xx0bbbbb x0cccccc + // 3 byte: xxbbbbbb x0cccccc + __m128i lowperm = __lsx_vpickev_h(perm, perm); + // 1 byte: 00000000 00000000 + // 2 byte: 00000000 00000000 + // 3 byte: 00000000 1110aaaa + __m128i highperm = __lsx_vpickod_h(perm, perm); + // 3 byte: aaaa0000 00000000 + highperm = __lsx_vslli_h(highperm, 12); + // ASCII + // 1 byte: 00000000 0ccccccc + // 2+byte: 00000000 00cccccc + __m128i ascii = __lsx_vand_v(lowperm, __lsx_vrepli_h(0x7f)); + // 1 byte: 00000000 00000000 + // 2 byte: xx0bbbbb 00000000 + // 3 byte: xxbbbbbb 00000000 + __m128i middlebyte = __lsx_vand_v(lowperm, __lsx_vldi(-2561) /*0xFF00*/); + // 1 byte: 00000000 0ccccccc + // 2 byte: 0010bbbb bbcccccc + // 3 byte: 0010bbbb bbcccccc + __m128i composed = __lsx_vor_v(__lsx_vsrli_h(middlebyte, 2), ascii); + + __m128i v0fff = __lsx_vreplgr2vr_h(uint16_t(0xfff)); + // aaaabbbb bbcccccc + composed = __lsx_vbitsel_v(highperm, composed, v0fff); + + if (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 4; // We wrote 4 16-bit codepoints + return consumed; } else if (idx < 209) { - // TWO (2) input code-code units - ////////////// - // There might be garbage inputs where a leading byte mascarades as a - // four-byte leading byte (by being followed by 3 continuation byte), but is - // not greater than 0xf0. This could trigger a buffer overflow if we only - // counted leading bytes of the form 0xf0 as generating surrogate pairs, - // without further UTF-8 validation. Thus we must be careful to ensure that - // only leading bytes at least as large as 0xf0 generate surrogate pairs. We - // do as at the cost of an extra mask. - ///////////// - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); - const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); - // correct for spurious high bit - const __m128i correct = - _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); - middlehighbyte = _mm_xor_si128(correct, middlehighbyte); - const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); - // We deliberately carry the leading four bits in highbyte if they are - // present, we remove them later when computing hightenbits. - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0xff000000)); - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); - // When we need to generate a surrogate pair (leading byte > 0xF0), then - // the corresponding 32-bit value in 'composed' will be greater than - // > (0xff00000>>6) or > 0x3c00000. This can be used later to identify the - // location of the surrogate pairs. 
- const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), - _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); - const __m128i composedminus = - _mm_sub_epi32(composed, _mm_set1_epi32(0x10000)); - const __m128i lowtenbits = - _mm_and_si128(composedminus, _mm_set1_epi32(0x3ff)); - // Notice the 0x3ff mask: - const __m128i hightenbits = - _mm_and_si128(_mm_srli_epi32(composedminus, 10), _mm_set1_epi32(0x3ff)); - const __m128i lowtenbitsadd = - _mm_add_epi32(lowtenbits, _mm_set1_epi32(0xDC00)); - const __m128i hightenbitsadd = - _mm_add_epi32(hightenbits, _mm_set1_epi32(0xD800)); - const __m128i lowtenbitsaddshifted = _mm_slli_epi32(lowtenbitsadd, 16); - __m128i surrogates = _mm_or_si128(hightenbitsadd, lowtenbitsaddshifted); - uint32_t basic_buffer[4]; - uint32_t basic_buffer_swap[4]; - if (big_endian) { - _mm_storeu_si128((__m128i *)basic_buffer_swap, - _mm_shuffle_epi8(composed, swap)); - surrogates = _mm_shuffle_epi8(surrogates, swap); + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-16 pairs. Generating surrogate pairs is a little tricky though, but + // it is easier when we can assume they are all pairs. This version does + // not use the LUT, but 4 byte sequences are less common and the overhead + // of the extra memory access is less important than the early branch + // overhead in shorter sequences. + + // Swap byte pairs + // 10dddddd 10cccccc|10bbbbbb 11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + __m128i swap = lsx_swap_bytes(in); + // Shift left 2 bits + // cccccc00 dddddd00 xxxxxxxx bbbbbb00 + __m128i shift = __lsx_vslli_b(swap, 2); + // Create a magic number containing the low 2 bits of the trail surrogate + // and all the corrections needed to create the pair. UTF-8 4b prefix = + // -0x0000|0xF000 surrogate offset = -0x0000|0x0040 (0x10000 << 6) + // surrogate high = +0x0000|0xD800 + // surrogate low = +0xDC00|0x0000 + // ------------------------------- + // = +0xDC00|0xE7C0 + __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xDC00E7C0)); + // Generate unadjusted trail surrogate minus lowest 2 bits + // vec(0000FF00) = __lsx_vldi(-1758) + // xxxxxxxx xxxxxxxx|11110aaa bbbbbb00 + __m128i trail = + __lsx_vbitsel_v(shift, swap, __lsx_vldi(-1758 /*0000FF00*/)); + // Insert low 2 bits of trail surrogate to magic number for later + // 11011100 00000000 11100111 110000cc + __m128i magic_with_low_2 = __lsx_vor_v(__lsx_vsrli_w(shift, 30), magic); + + // Generate lead surrogate + // xxxxcccc ccdddddd|xxxxxxxx xxxxxxxx + // 000000cc ccdddddd|xxxxxxxx xxxxxxxx + __m128i lead = __lsx_vbitsel_v( + __lsx_vsrli_h(__lsx_vand_v(shift, __lsx_vldi(0x3F)), 4), swap, + __lsx_vrepli_h(0x3f /* 0x003f*/)); + + // Blend pairs + // __lsx_vldi(-1741) => vec(0x0000FFFF) + // 000000cc ccdddddd|11110aaa bbbbbb00 + __m128i blend = + __lsx_vbitsel_v(lead, trail, __lsx_vldi(-1741) /* (0x0000FFFF)*4 */); + + // Add magic number to finish the result + // 110111CC CCDDDDDD|110110AA BBBBBBCC + __m128i composed = __lsx_vadd_h(blend, magic_with_low_2); + // Byte swap if necessary + if (!match_system(big_endian)) { + composed = lsx_swap_bytes(composed); + } + __lsx_vst(composed, reinterpret_cast(utf16_output), 0); + utf16_output += 6; // We 3 32-bit surrogate pairs. + return 12; // We consumed 12 bytes. 
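The `0xDC00E7C0` "magic" addition above folds the UTF-8 prefix removal, the `0x10000` surrogate offset and both surrogate bases into one step. Spelled out in scalar form, the same result for a single 4-byte sequence is (illustrative helper, assuming a well-formed sequence):

```cpp
#include <cstdint>

// Scalar form of the surrogate-pair construction performed by the 0x888 branch:
// 11110aaa 10bbbbbb 10cccccc 10dddddd -> high/low surrogate pair.
inline void utf8_4byte_to_surrogates(const uint8_t b[4], char16_t out[2]) {
  uint32_t cp = ((b[0] & 0x07u) << 18) | ((b[1] & 0x3Fu) << 12) |
                ((b[2] & 0x3Fu) << 6) | (b[3] & 0x3Fu);
  cp -= 0x10000u;                             // offset folded into the vector "magic" constant
  out[0] = char16_t(0xD800u + (cp >> 10));    // lead (high) surrogate
  out[1] = char16_t(0xDC00u + (cp & 0x3FFu)); // trail (low) surrogate
}
```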
} - _mm_storeu_si128((__m128i *)basic_buffer, composed); - uint32_t surrogate_buffer[4]; - _mm_storeu_si128((__m128i *)surrogate_buffer, surrogates); + // 3 1-4 byte sequences + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 3 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // added to fix issue https://github.com/simdutf/simdutf/issues/514 + // We only want to write 2 * 16-bit code units when that is actually what we + // have. Unfortunately, we cannot trust the input. So it is possible to get + // 0xff as an input byte and it should not result in a surrogate pair. We + // need to check for that. + uint32_t permbuffer[4]; + __lsx_vst(perm, permbuffer, 0); + // Mask the low and middle bytes + // 00000000 00000000 00000000 0ddddddd + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7f)); + // Because the surrogates need more work, the high surrogate is computed + // first. + __m128i middlehigh = __lsx_vslli_w(perm, 2); + // 00000000 00000000 00cccccc 00000000 + __m128i middlebyte = __lsx_vand_v(perm, __lsx_vldi(-3777) /* 0x00003F00 */); + // Start assembling the sequence. Since the 4th byte is in the same position + // as it would be in a surrogate and there is no dependency, shift left + // instead of right. 3 byte: 00000000 10bbbbxx xxxxxxxx xxxxxxxx 4 byte: + // 11110aaa bbbbbbxx xxxxxxxx xxxxxxxx + __m128i ab = + __lsx_vbitsel_v(middlehigh, perm, __lsx_vldi(-1656) /*0xFF000000*/); + // Top 16 bits contains the high ten bits of the surrogate pair before + // correction 3 byte: 00000000 10bbbbcc|cccc0000 00000000 4 byte: 11110aaa + // bbbbbbcc|cccc0000 00000000 - high 10 bits correct w/o correction + __m128i v_fffc0000 = __lsx_vreplgr2vr_w(uint32_t(0xFFFC0000)); + __m128i abc = __lsx_vbitsel_v(__lsx_vslli_w(middlebyte, 4), ab, v_fffc0000); + // Combine the low 6 or 7 bits by a shift right accumulate + // 3 byte: 00000000 00000010|bbbbcccc ccdddddd - low 16 bits correct + // 4 byte: 00000011 110aaabb|bbbbcccc ccdddddd - low 10 bits correct w/o + // correction + __m128i composed = __lsx_vor_v(ascii, __lsx_vsrli_w(abc, 6)); + // After this is for surrogates + // Blend the low and high surrogates + // 4 byte: 11110aaa bbbbbbcc|bbbbcccc ccdddddd + __m128i mixed = + __lsx_vbitsel_v(abc, composed, __lsx_vldi(-1741) /*0x0000FFFF*/); + // Clear the upper 6 bits of the low surrogate. Don't clear the upper bits + // yet as 0x10000 was not subtracted from the codepoint yet. 4 byte: + // 11110aaa bbbbbbcc|000000cc ccdddddd + __m128i v_ffff03ff = __lsx_vreplgr2vr_w(uint32_t(0xFFFF03FF)); + __m128i masked_pair = __lsx_vand_v(mixed, v_ffff03ff); + // Correct the remaining UTF-8 prefix, surrogate offset, and add the + // surrogate prefixes in one magic 16-bit addition. 
similar magic number but + // without the continue byte adjust and halfword swapped UTF-8 4b prefix = + // -0xF000|0x0000 surrogate offset = -0x0040|0x0000 (0x10000 << 6) + // surrogate high = +0xD800|0x0000 + // surrogate low = +0x0000|0xDC00 + // ----------------------------------- + // = +0xE7C0|0xDC00 + __m128i magic = __lsx_vreplgr2vr_w(uint32_t(0xE7C0DC00)); + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD - surrogate pair complete + __m128i surrogates = __lsx_vadd_w(masked_pair, magic); + // If the high bit is 1 (s32 less than zero), this needs a surrogate pair + __m128i is_pair = __lsx_vslt_w(perm, zero); + // Select either the 4 byte surrogate pair or the 2 byte solo codepoint + // 3 byte: 0xxxxxxx xxxxxxxx|bbbbcccc ccdddddd + // 4 byte: 110110AA BBBBBBCC|110111CC CCDDDDDD + __m128i selected = __lsx_vbitsel_v(composed, surrogates, is_pair); + // Byte swap if necessary + if (!match_system(big_endian)) { + selected = lsx_swap_bytes(selected); + } + // Attempting to shuffle and store would be complex, just scalarize. + uint32_t buffer_tmp[4]; + __lsx_vst(selected, buffer_tmp, 0); + // Test for the top bit of the surrogate mask. Remove due to issue 514 + // const uint32_t SURROGATE_MASK = match_system(big_endian) ? 0x80000000 : + // 0x00800000; for (size_t i = 0; i < 3; i++) { - if (basic_buffer[i] > 0x3c00000) { - utf16_output[0] = uint16_t(surrogate_buffer[i] & 0xffff); - utf16_output[1] = uint16_t(surrogate_buffer[i] >> 16); + // Surrogate + // Used to be if (buffer[i] & SURROGATE_MASK) { + // See discussion above. + // patch for issue https://github.com/simdutf/simdutf/issues/514 + if ((permbuffer[i] & 0xf8000000) == 0xf0000000) { + utf16_output[0] = uint16_t(buffer_tmp[i] >> 16); + utf16_output[1] = uint16_t(buffer_tmp[i] & 0xFFFF); utf16_output += 2; } else { - utf16_output[0] = big_endian ? uint16_t(basic_buffer_swap[i]) - : uint16_t(basic_buffer[i]); + utf16_output[0] = uint16_t(buffer_tmp[i] & 0xFFFF); utf16_output++; } } + return consumed; } else { // here we know that there is an error but we do not handle errors + return 12; } - return consumed; } -/* end file src/westmere/sse_convert_utf8_to_utf16.cpp */ -/* begin file src/westmere/sse_convert_utf8_to_utf32.cpp */ -// depends on "tables/utf8_to_utf16_tables.h" - +/* end file src/lasx/lasx_convert_utf8_to_utf16.cpp */ +/* begin file src/lasx/lasx_convert_utf8_to_utf32.cpp */ // Convert up to 12 bytes from utf8 to utf32 using a mask indicating the // end of the code points. Only the least significant 12 bits of the mask // are accessed. // It returns how many bytes were consumed (up to 12). size_t convert_masked_utf8_to_utf32(const char *input, uint64_t utf8_end_of_code_point_mask, - char32_t *&utf32_output) { + char32_t *&utf32_out) { // we use an approach where we try to process up to 12 input bytes. // Why 12 input bytes and not 16? Because we are concerned with the size of // the lookup tables. Also 12 is nicely divisible by two and three. // + uint32_t *&utf32_output = reinterpret_cast(utf32_out); + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xFFF; // // Optimization note: our main path below is load-latency dependent. Thus it // is maybe beneficial to have fast paths that depend on branch prediction but @@ -35958,135 +50867,179 @@ size_t convert_masked_utf8_to_utf32(const char *input, // higher speeds. // // We first try a few fast paths. 
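The `utf8_end_of_code_point_mask` consulted by these branches has bit i set when byte i ends a code point, which is why twelve ASCII bytes yield 0xFFF (sixteen yield 0xFFFF), a run of 2-byte sequences yields 0xAAA, 3-byte sequences yield 0x924 and 4-byte sequences yield 0x888. A scalar sketch of that definition (illustrative only; the kernels derive the mask with SIMD byte classification elsewhere in this file):

```cpp
#include <cstddef>
#include <cstdint>

// Illustrative: bit i is set when input[i] is the last byte of a code point,
// i.e. when the next byte is not a continuation byte (0b10xxxxxx).
inline uint64_t end_of_code_point_mask(const uint8_t *input, size_t len) {
  uint64_t mask = 0;
  for (size_t i = 0; i < len && i < 64; i++) {
    bool next_is_continuation =
        (i + 1 < len) && ((input[i + 1] & 0xC0u) == 0x80u);
    if (!next_is_continuation) {
      mask |= uint64_t(1) << i;
    }
  }
  return mask;
}
```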
- const __m128i in = _mm_loadu_si128((__m128i *)input); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & 0xfff; - if (utf8_end_of_code_point_mask == 0xfff) { - // We process the data in chunks of 12 bytes. - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), - _mm_cvtepu8_epi32(in)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), - _mm_cvtepu8_epi32(_mm_srli_si128(in, 4))); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 8), - _mm_cvtepu8_epi32(_mm_srli_si128(in, 8))); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 12), - _mm_cvtepu8_epi32(_mm_srli_si128(in, 12))); - utf32_output += 12; // We wrote 12 32-bit characters. - return 12; // We consumed 12 bytes. - } - if (((utf8_end_of_code_point_mask & 0xffff) == 0xaaaa)) { - // We want to take 8 2-byte UTF-8 code units and turn them into 8 4-byte - // UTF-32 code units. There is probably a more efficient sequence, but the - // following might do. - const __m128i sh = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), - _mm_cvtepu16_epi32(composed)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), - _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8))); - utf32_output += 8; // We wrote 32 bytes, 8 code points. - return 16; + if ((utf8_end_of_code_point_mask & 0xffff) == 0xffff) { + // We process in chunks of 16 bytes. + // use fast implementation in src/simdutf/arm64/simd.h + // Ideally the compiler can keep the tables in registers. + __m128i zero = __lsx_vldi(0); + __m128i in16low = __lsx_vilvl_b(zero, in); + __m128i in16high = __lsx_vilvh_b(zero, in); + __m128i in32_0 = __lsx_vilvl_h(zero, in16low); + __m128i in32_1 = __lsx_vilvh_h(zero, in16low); + __m128i in32_2 = __lsx_vilvl_h(zero, in16high); + __m128i in32_3 = __lsx_vilvh_h(zero, in16high); + + __lsx_vst(in32_0, reinterpret_cast(utf32_output), 0); + __lsx_vst(in32_1, reinterpret_cast(utf32_output), 16); + __lsx_vst(in32_2, reinterpret_cast(utf32_output), 32); + __lsx_vst(in32_3, reinterpret_cast(utf32_output), 48); + + utf32_output += 16; // We wrote 16 32-bit characters. + return 16; // We consumed 16 bytes. } + __m128i zero = __lsx_vldi(0); if (input_utf8_end_of_code_point_mask == 0x924) { // We want to take 4 3-byte UTF-8 code units and turn them into 4 4-byte - // UTF-32 code units. There is probably a more efficient sequence, but the - // following might do. - const __m128i sh = - _mm_setr_epi8(2, 1, 0, -1, 5, 4, 3, -1, 8, 7, 6, -1, 11, 10, 9, -1); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = - _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits - const __m128i middlebyte = - _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - const __m128i highbyte = - _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); - _mm_storeu_si128((__m128i *)utf32_output, composed); - utf32_output += 4; - return 12; + // UTF-32 code units. 
Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_3_byte_to_utf16(in); + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); + + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + utf32_output += 4; // We wrote 4 32-bit characters. + return 12; // We consumed 12 bytes. } - /// We do not have a fast path available, so we fallback. + // 2 byte sequences occur in short bursts in languages like Greek and Russian. + if (input_utf8_end_of_code_point_mask == 0xaaa) { + // We want to take 6 2-byte UTF-8 code units and turn them into 6 4-byte + // UTF-32 code units. Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_2_byte_to_utf16(in); + + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); + __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16); + + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + __lsx_vst(utf32_high, reinterpret_cast(utf32_output), 16); + utf32_output += 6; + return 12; // We consumed 12 bytes. + } + // Either no fast path or an unimportant fast path. + + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; - const uint8_t idx = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; if (idx < 64) { // SIX (6) input code-code units - // this is a relatively easy scenario - // we process SIX (6) input code-code units. The max length in bytes of six - // code code units spanning between 1 and 2 bytes each is 12 bytes. On - // processors where pdep/pext is fast, we might be able to use a small - // lookup table. - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - const __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), - _mm_cvtepu16_epi32(composed)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), - _mm_cvtepu16_epi32(_mm_srli_si128(composed, 8))); - utf32_output += 6; // We wrote 12 bytes, 6 code points. 
+ // Convert to UTF-16 + __m128i composed_utf16 = convert_utf8_1_to_2_byte_to_utf16(in, idx); + __m128i utf32_low = __lsx_vilvl_h(zero, composed_utf16); + __m128i utf32_high = __lsx_vilvh_h(zero, composed_utf16); + + __lsx_vst(utf32_low, reinterpret_cast(utf32_output), 0); + __lsx_vst(utf32_high, reinterpret_cast(utf32_output), 16); + utf32_output += 6; + return consumed; } else if (idx < 145) { // FOUR (4) input code-code units - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = - _mm_and_si128(perm, _mm_set1_epi32(0x7f)); // 7 or 6 bits - const __m128i middlebyte = - _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); // 5 or 6 bits - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - const __m128i highbyte = - _mm_and_si128(perm, _mm_set1_epi32(0x0f0000)); // 4 bits - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 4); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), highbyte_shifted); - _mm_storeu_si128((__m128i *)utf32_output, composed); - utf32_output += 4; + // UTF-16 and UTF-32 use similar algorithms, but UTF-32 skips the narrowing. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // Shuffle + // 1 byte: 00000000 00000000 0ccccccc + // 2 byte: 00000000 110bbbbb 10cccccc + // 3 byte: 1110aaaa 10bbbbbb 10cccccc + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + // Split + // 00000000 00000000 0ccccccc + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); // 6 or 7 bits + // Note: unmasked + // xxxxxxxx aaaaxxxx xxxxxxxx + __m128i high = + __lsx_vsrli_w(__lsx_vand_v(perm, __lsx_vldi(0xf)), 4); // 4 bits + // Use 16 bit bic instead of and. + // The top bits will be corrected later in the bsl + // 00000000 10bbbbbb 00000000 + __m128i middle = + __lsx_vand_v(perm, __lsx_vldi(-1758 /*0x0000FF00*/)); // 5 or 6 bits + // Combine low and middle with shift right accumulate + // 00000000 00xxbbbb bbcccccc + __m128i lowmid = __lsx_vor_v(ascii, __lsx_vsrli_w(middle, 2)); + // Insert top 4 bits from high byte with bitwise select + // 00000000 aaaabbbb bbcccccc + __m128i composed = + __lsx_vbitsel_v(lowmid, high, __lsx_vldi(-3600 /*0x0000F000*/)); + __lsx_vst(composed, utf32_output, 0); + utf32_output += 4; // We wrote 4 32-bit characters. 
+ return consumed; } else if (idx < 209) { - // TWO (2) input code-code units - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi32(0x7f)); - const __m128i middlebyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f00)); - const __m128i middlebyte_shifted = _mm_srli_epi32(middlebyte, 2); - __m128i middlehighbyte = _mm_and_si128(perm, _mm_set1_epi32(0x3f0000)); - // correct for spurious high bit - const __m128i correct = - _mm_srli_epi32(_mm_and_si128(perm, _mm_set1_epi32(0x400000)), 1); - middlehighbyte = _mm_xor_si128(correct, middlehighbyte); - const __m128i middlehighbyte_shifted = _mm_srli_epi32(middlehighbyte, 4); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi32(0x07000000)); - const __m128i highbyte_shifted = _mm_srli_epi32(highbyte, 6); - const __m128i composed = - _mm_or_si128(_mm_or_si128(ascii, middlebyte_shifted), - _mm_or_si128(highbyte_shifted, middlehighbyte_shifted)); - _mm_storeu_si128((__m128i *)utf32_output, composed); - utf32_output += 3; + // THREE (3) input code-code units + if (input_utf8_end_of_code_point_mask == 0x888) { + // We want to take 3 4-byte UTF-8 code units and turn them into 3 4-byte + // UTF-32 code units. This uses the same method as the fixed 3 byte + // version, reversing and shift left insert. However, there is no need for + // a shuffle mask now, just rev16 and rev32. + // + // This version does not use the LUT, but 4 byte sequences are less common + // and the overhead of the extra memory access is less important than the + // early branch overhead in shorter sequences, so it comes last. + + // Swap pairs of bytes + // 10dddddd|10cccccc|10bbbbbb|11110aaa + // 10cccccc 10dddddd|11110aaa 10bbbbbb + __m128i swap = lsx_swap_bytes(in); + // Shift left and insert + // xxxxcccc ccdddddd|xxxxxxxa aabbbbbb + __m128i merge1 = __lsx_vbitsel_v(__lsx_vsrli_h(swap, 2), swap, + __lsx_vrepli_h(0x3f /*0x003F*/)); + // Shift insert again + // xxxxxxxx xxxaaabb bbbbcccc ccdddddd + __m128i merge2 = + __lsx_vbitsel_v(__lsx_vslli_w(merge1, 12), /* merge1 << 12 */ + __lsx_vsrli_w(merge1, 16), /* merge1 >> 16 */ + __lsx_vldi(-2545)); /*0x00000FFF*/ + // Clear the garbage + // 00000000 000aaabb bbbbcccc ccdddddd + __m128i composed = __lsx_vand_v(merge2, __lsx_vldi(-2273 /*0x1FFFFF*/)); + // Store + __lsx_vst(composed, utf32_output, 0); + utf32_output += 3; // We wrote 3 32-bit characters. + return 12; // We consumed 12 bytes. + } + // Unlike UTF-16, doing a fast codepath doesn't have nearly as much benefit + // due to surrogates no longer being involved. 
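Because UTF-32 needs no surrogates, the 0x888 branch above amounts to the plain scalar assembly of the 21-bit code point; a sketch under the same assumptions (illustrative helper name):

```cpp
#include <cstdint>

// Scalar form of the 0x888 UTF-32 branch:
// 11110aaa 10bbbbbb 10cccccc 10dddddd -> 00000000 000aaabb bbbbcccc ccdddddd.
inline char32_t decode_utf8_4byte(const uint8_t b[4]) {
  return char32_t(((b[0] & 0x07u) << 18) | ((b[1] & 0x3Fu) << 12) |
                  ((b[2] & 0x3Fu) << 6) | (b[3] & 0x3Fu));
}
```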
+ __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // 1 byte: 00000000 00000000 00000000 0ddddddd + // 2 byte: 00000000 00000000 110ccccc 10dddddd + // 3 byte: 00000000 1110bbbb 10cccccc 10dddddd + // 4 byte: 11110aaa 10bbbbbb 10cccccc 10dddddd + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(zero, in, sh); + + // Ascii + __m128i ascii = __lsx_vand_v(perm, __lsx_vrepli_w(0x7F)); + __m128i middle = __lsx_vand_v(perm, __lsx_vldi(-3777 /*0x00003f00*/)); + // 00000000 00000000 0000cccc ccdddddd + __m128i cd = + __lsx_vbitsel_v(__lsx_vsrli_w(middle, 2), ascii, __lsx_vrepli_w(0x3f)); + + __m128i correction = __lsx_vand_v(perm, __lsx_vldi(-3520 /*0x00400000*/)); + __m128i corrected = __lsx_vadd_b(perm, __lsx_vsrli_w(correction, 1)); + // Insert twice + // 00000000 000aaabb bbbbxxxx xxxxxxxx + __m128i corrected_srli2 = + __lsx_vsrli_w(__lsx_vand_v(corrected, __lsx_vrepli_b(0x7)), 2); + __m128i ab = + __lsx_vbitsel_v(corrected_srli2, corrected, __lsx_vrepli_h(0x3f)); + ab = __lsx_vsrli_w(ab, 4); + // 00000000 000aaabb bbbbcccc ccdddddd + __m128i composed = + __lsx_vbitsel_v(ab, cd, __lsx_vldi(-2545 /*0x00000FFF*/)); + // Store + __lsx_vst(composed, utf32_output, 0); + utf32_output += 3; // We wrote 3 32-bit characters. + return consumed; } else { // here we know that there is an error but we do not handle errors + return 12; } - return consumed; } -/* end file src/westmere/sse_convert_utf8_to_utf32.cpp */ -/* begin file src/westmere/sse_convert_utf8_to_latin1.cpp */ -// depends on "tables/utf8_to_utf16_tables.h" - -// Convert up to 12 bytes from utf8 to latin1 using a mask indicating the -// end of the code points. Only the least significant 12 bits of the mask -// are accessed. -// It returns how many bytes were consumed (up to 12). +/* end file src/lasx/lasx_convert_utf8_to_utf32.cpp */ +/* begin file src/lasx/lasx_convert_utf8_to_latin1.cpp */ size_t convert_masked_utf8_to_latin1(const char *input, uint64_t utf8_end_of_code_point_mask, char *&latin1_output) { @@ -36094,27 +51047,30 @@ size_t convert_masked_utf8_to_latin1(const char *input, // Why 12 input bytes and not 16? Because we are concerned with the size of // the lookup tables. Also 12 is nicely divisible by two and three. // - // + __m128i in = __lsx_vld(reinterpret_cast(input), 0); + + const uint16_t input_utf8_end_of_code_point_mask = + utf8_end_of_code_point_mask & 0xfff; // Optimization note: our main path below is load-latency dependent. Thus it // is maybe beneficial to have fast paths that depend on branch prediction but // have less latency. This results in more instructions but, potentially, also // higher speeds. - // - const __m128i in = _mm_loadu_si128((__m128i *)input); - const uint16_t input_utf8_end_of_code_point_mask = - utf8_end_of_code_point_mask & - 0xfff; // we are only processing 12 bytes in case it is not all ASCII - if (utf8_end_of_code_point_mask == 0xfff) { - // We process the data in chunks of 12 bytes. - _mm_storeu_si128(reinterpret_cast<__m128i *>(latin1_output), in); - latin1_output += 12; // We wrote 12 characters. - return 12; // We consumed 12 bytes. + + // We first try a few fast paths. + // The obvious first test is ASCII, which actually consumes the full 16. + if ((utf8_end_of_code_point_mask & 0xFFFF) == 0xFFFF) { + // We process in chunks of 16 bytes + __lsx_vst(in, reinterpret_cast(latin1_output), 0); + latin1_output += 16; // We wrote 16 18-bit characters. + return 16; // We consumed 16 bytes. 
} - /// We do not have a fast path available, so we fallback. - const uint8_t idx = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][0]; - const uint8_t consumed = - tables::utf8_to_utf16::utf8bigindex[input_utf8_end_of_code_point_mask][1]; + /// We do not have a fast path available, or the fast path is unimportant, so + /// we fallback. + const uint8_t idx = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][0]; + + const uint8_t consumed = simdutf::tables::utf8_to_utf16::utf8bigindex + [input_utf8_end_of_code_point_mask][1]; // this indicates an invalid input: if (idx >= 64) { return consumed; @@ -36122,50 +51078,63 @@ size_t convert_masked_utf8_to_latin1(const char *input, // Here we should have (idx < 64), if not, there is a bug in the validation or // elsewhere. SIX (6) input code-code units this is a relatively easy scenario // we process SIX (6) input code-code units. The max length in bytes of six - // code code units spanning between 1 and 2 bytes each is 12 bytes. On - // processors where pdep/pext is fast, we might be able to use a small lookup - // table. - const __m128i sh = - _mm_loadu_si128((const __m128i *)tables::utf8_to_utf16::shufutf8[idx]); - const __m128i perm = _mm_shuffle_epi8(in, sh); - const __m128i ascii = _mm_and_si128(perm, _mm_set1_epi16(0x7f)); - const __m128i highbyte = _mm_and_si128(perm, _mm_set1_epi16(0x1f00)); - __m128i composed = _mm_or_si128(ascii, _mm_srli_epi16(highbyte, 2)); - const __m128i latin1_packed = _mm_packus_epi16(composed, composed); + // code code units spanning between 1 and 2 bytes each is 12 bytes. Converts 6 + // 1-2 byte UTF-8 characters to 6 UTF-16 characters. This is a relatively easy + // scenario we process SIX (6) input code-code units. The max length in bytes + // of six code code units spanning between 1 and 2 bytes each is 12 bytes. + __m128i sh = __lsx_vld(reinterpret_cast( + simdutf::tables::utf8_to_utf16::shufutf8[idx]), + 0); + // Shuffle + // 1 byte: 00000000 0bbbbbbb + // 2 byte: 110aaaaa 10bbbbbb + sh = __lsx_vand_v(sh, __lsx_vldi(0x1f)); + __m128i perm = __lsx_vshuf_b(__lsx_vldi(0), in, sh); + // ascii mask + // 1 byte: 11111111 11111111 + // 2 byte: 00000000 00000000 + __m128i ascii_mask = __lsx_vslt_bu(perm, __lsx_vldi(0x80)); + // utf8 mask + // 1 byte: 00000000 00000000 + // 2 byte: 00111111 00111111 + __m128i utf8_mask = __lsx_vand_v(__lsx_vsle_bu(__lsx_vldi(0x80), perm), + __lsx_vldi(0b00111111)); + // mask + // 1 byte: 11111111 11111111 + // 2 byte: 00111111 00111111 + __m128i mask = __lsx_vor_v(utf8_mask, ascii_mask); + + __m128i composed = __lsx_vbitsel_v(__lsx_vsrli_h(perm, 2), perm, mask); // writing 8 bytes even though we only care about the first 6 bytes. - // performance note: it would be faster to use _mm_storeu_si128, we should - // investigate. - _mm_storel_epi64((__m128i *)latin1_output, latin1_packed); + __m128i latin1_packed = __lsx_vpickev_b(__lsx_vldi(0), composed); + + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); latin1_output += 6; // We wrote 6 bytes. 
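The 6-lane pack above is safe because, for a Latin-1 target, prior validation guarantees every 2-byte sequence decodes to a value that fits in 8 bits (lead byte 0xC2 or 0xC3). Per character, the scalar equivalent is (illustrative helper):

```cpp
#include <cstdint>

// Scalar view of the Latin-1 path: ASCII passes through; a 2-byte sequence
// 1100001a 10bbbbbb decodes to 1abbbbbb, i.e. 0x80..0xFF.
inline uint8_t decode_utf8_to_latin1(uint8_t lead, uint8_t trail) {
  if (lead < 0x80u) {
    return lead; // 1-byte (ASCII) code unit; trail is unused
  }
  return uint8_t(((lead & 0x1Fu) << 6) | (trail & 0x3Fu));
}
```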
return consumed; } -/* end file src/westmere/sse_convert_utf8_to_latin1.cpp */ +/* end file src/lasx/lasx_convert_utf8_to_latin1.cpp */ -/* begin file src/westmere/sse_convert_utf16_to_latin1.cpp */ +/* begin file src/lasx/lasx_convert_utf16_to_latin1.cpp */ template std::pair -sse_convert_utf16_to_latin1(const char16_t *buf, size_t len, - char *latin1_output) { +lasx_convert_utf16_to_latin1(const char16_t *buf, size_t len, + char *latin1_output) { const char16_t *end = buf + len; - while (end - buf >= 8) { - // Load 8 UTF-16 characters into 128-bit SSE register - __m128i in = _mm_loadu_si128(reinterpret_cast(buf)); - + while (buf + 16 <= end) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); if (!match_system(big_endian)) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); + in = lsx_swap_bytes(in); + in1 = lsx_swap_bytes(in1); } - - __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00); - if (_mm_testz_si128(in, high_byte_mask)) { - // Pack 16-bit characters into 8-bit and store in latin1_output - __m128i latin1_packed = _mm_packus_epi16(in, in); - _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), - latin1_packed); - // Adjust pointers for next iteration - buf += 8; - latin1_output += 8; + if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) { + // 1. pack the bytes + __m128i latin1_packed = __lsx_vpickev_b(in1, in); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 16; + latin1_output += 16; } else { return std::make_pair(nullptr, reinterpret_cast(latin1_output)); } @@ -36175,29 +51144,28 @@ sse_convert_utf16_to_latin1(const char16_t *buf, size_t len, template std::pair -sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, - char *latin1_output) { +lasx_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, + char *latin1_output) { const char16_t *start = buf; const char16_t *end = buf + len; - while (end - buf >= 8) { - __m128i in = _mm_loadu_si128(reinterpret_cast(buf)); - + while (buf + 16 <= end) { + __m128i in = __lsx_vld(reinterpret_cast(buf), 0); + __m128i in1 = __lsx_vld(reinterpret_cast(buf), 16); if (!match_system(big_endian)) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); + in = lsx_swap_bytes(in); + in1 = lsx_swap_bytes(in1); } - - __m128i high_byte_mask = _mm_set1_epi16((int16_t)0xFF00); - if (_mm_testz_si128(in, high_byte_mask)) { - __m128i latin1_packed = _mm_packus_epi16(in, in); - _mm_storel_epi64(reinterpret_cast<__m128i *>(latin1_output), - latin1_packed); - buf += 8; - latin1_output += 8; + if (__lsx_bz_v(__lsx_vpickod_b(in1, in))) { + // 1. pack the bytes + __m128i latin1_packed = __lsx_vpickev_b(in1, in); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 16; + latin1_output += 16; } else { - // Fallback to scalar code for handling errors - for (int k = 0; k < 8; k++) { + // Let us do a scalar fallback. + for (int k = 0; k < 16; k++) { uint16_t word = !match_system(big_endian) ? 
scalar::utf16::swap_bytes(buf[k]) : buf[k]; @@ -36208,16 +51176,15 @@ sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, latin1_output); } } - buf += 8; } } // while return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); } -/* end file src/westmere/sse_convert_utf16_to_latin1.cpp */ -/* begin file src/westmere/sse_convert_utf16_to_utf8.cpp */ +/* end file src/lasx/lasx_convert_utf16_to_latin1.cpp */ +/* begin file src/lasx/lasx_convert_utf16_to_utf8.cpp */ /* - The vectorized algorithm works on single SSE register i.e., it + The vectorized algorithm works on single LASX register i.e., it loads eight 16-bit code units. We consider three cases: @@ -36231,11 +51198,11 @@ sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, Ad 1. When values are less than 0x0800, it means that a 16-bit code unit - can be converted into: 1) single UTF8 byte (when it is an ASCII + can be converted into: 1) single UTF8 byte (when it's an ASCII char) or 2) two UTF8 bytes. For this case we do only some shuffle to obtain these 2-byte - codes and finally compress the whole SSE register with a single + codes and finally compress the whole LASX register with a single shuffle. We need 256-entry lookup table to get a compression pattern @@ -36253,7 +51220,7 @@ sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, the three-UTF8-bytes case. Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. + array of 32-bit values. The array spans two LASX registers. The bytes from the registers are compressed using two shuffles. We need 256-entry lookup table to get a compression pattern @@ -36264,187 +51231,210 @@ sse_convert_utf16_to_latin1_with_errors(const char16_t *buf, size_t len, To summarize: - We need two 256-entry tables that have 8704 bytes in total. */ - /* Returns a pair: the first unprocessed byte from buf and utf8_output A scalar routing should carry on the conversion of the tail. */ + template std::pair -sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { - +lasx_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); const char16_t *end = buf + len; - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); const size_t safety_margin = 12; // to avoid overruns, see issue // https://github.com/simdutf/simdutf/issues/92 - while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { - __m128i in = _mm_loadu_si128((__m128i *)buf); - if (big_endian) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); + __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7ff)); + __m256i zero = __lasx_xvldi(0); + __m128i zero_128 = __lsx_vldi(0); + while (buf + 16 + safety_margin <= end) { + __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); + if (!match_system(big_endian)) { + in = lasx_swap_bytes(in); } - // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes - const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80); - if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!! 
- __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); - if (big_endian) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - nextin = _mm_shuffle_epi8(nextin, swap); - } - if (!_mm_testz_si128(nextin, v_ff80)) { - // 1. pack the bytes - // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16(in, in); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - in = nextin; - } else { - // 1. pack the bytes - // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16(in, nextin); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! - } + if (__lasx_xbnz_h(__lasx_xvslt_hu( + in, __lasx_xvrepli_h(0x7F)))) { // ASCII fast path!!!! + // 1. pack the bytes + __m256i utf8_packed = + __lasx_xvpermi_d(__lasx_xvpickev_b(in, in), 0b00001000); + // 2. store (16 bytes) + __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! } - // no bits set above 7th bit - const __m128i one_byte_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000); - const uint16_t one_byte_bitmask = - static_cast(_mm_movemask_epi8(one_byte_bytemask)); + if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, in))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16 + // expected output : [110a|aaaa|10bb|bbbb] x 16 + // t0 = [000a|aaaa|bbbb|bb00] + __m256i t0 = __lasx_xvslli_h(in, 2); + // t1 = [000a|aaaa|0000|0000] + __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + // t2 = [0000|0000|00bb|bbbb] + __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + __m256i t3 = __lasx_xvor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xc080)); + __m256i t4 = __lasx_xvor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m256i one_byte_bytemask = + __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F /*0x007F*/)); + __m256i utf8_unpacked = __lasx_xvbitsel_v(t4, in, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask); + uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0); + uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4); + // 4. pack the bytes + const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m1]][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_packed1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1); + + const uint8_t *row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_packed2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2); + // 5. 
store bytes + __lsx_vst(utf8_packed1, utf8_output, 0); + utf8_output += row1[0]; - // no bits set above 11th bit - const __m128i one_or_two_bytes_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000); - const uint16_t one_or_two_bytes_bitmask = - static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + __lsx_vst(utf8_packed2, utf8_output, 0); + utf8_output += row2[0]; - if (one_or_two_bytes_bitmask == 0xffff) { - internal::westmere::write_v_u16_11bits_to_utf8( - in, utf8_output, one_byte_bytemask, one_byte_bitmask); - buf += 8; + buf += 16; continue; } - - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. - const __m128i surrogates_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); - - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint16_t surrogates_bitmask = - static_cast(_mm_movemask_epi8(surrogates_bytemask)); + __m256i surrogates_bytemask = + __lasx_xvseq_h(__lasx_xvand_v(in, __lasx_xvldi(-2568 /*0xF800*/)), + __lasx_xvldi(-2600 /*0xD800*/)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x0000) { + if (__lasx_xbz_v(surrogates_bytemask)) { // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two - UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - - three UTF-8 bytes + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). 
+ */ /** * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: * t2 => [0ccc|cccc] [10cc|cccc] * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) */ -#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m128i t0 = _mm_shuffle_epi8(in, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); + __m256i t0 = __lasx_xvpickev_b(in, in); + t0 = __lasx_xvilvl_b(t0, t0); + + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|00cc|cccc] + __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); + __m256i t1 = __lasx_xvand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); + __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688)); - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m128i s0 = _mm_srli_epi16(in, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); - const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, - simdutf_vec(0b0100000000000000)); - const __m128i s4 = _mm_xor_si128(s3, m0); -#undef simdutf_vec + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m256i s0 = __lasx_xvsrli_h(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m256i s1 = __lasx_xvslli_h(in, 2); + // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] + s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + + // [00bb|bbbb|0000|aaaa] + __m256i s2 = __lasx_xvor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0)); + __m256i s3 = __lasx_xvor_v(s2, v_c0e0); + __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff); + __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, + __lasx_xvldi(-2752 /*0x4000*/)); + __m256i s4 = __lasx_xvxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit - const __m128i out0 = _mm_unpacklo_epi16(t2, s4); - const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + __m256i out0 = __lasx_xvilvl_h(s4, t2); + __m256i out1 = __lasx_xvilvh_h(s4, t2); // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16_t mask = - (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); - if (mask == 0) { - // We only have three-byte code units. Use fast path. 
- const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, - 15, 13, -1, -1, -1, -1); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i *)utf8_output, utf8_0); - utf8_output += 12; - _mm_storeu_si128((__m128i *)utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - } - const uint8_t mask0 = uint8_t(mask); - + __m256i one_byte_bytemask = __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F)); + __m256i one_byte_bytemask_low = + __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask); + __m256i one_byte_bytemask_high = + __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask); + + __m256i one_or_two_bytes_bytemask_low = + __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero); + __m256i one_or_two_bytes_bytemask_high = + __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero); + + __m256i mask0 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_low, one_byte_bytemask_low)); + __m256i mask1 = __lasx_xvmskltz_h(__lasx_xvor_v( + one_or_two_bytes_bytemask_high, one_byte_bytemask_high)); + + uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0); const uint8_t *row0 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); - - const uint8_t mask1 = static_cast(mask >> 8); + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0); + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; + mask = __lasx_xvpickve2gr_wu(mask1, 0); const uint8_t *row1 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); - - _mm_storeu_si128((__m128i *)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1); + __lsx_vst(utf8_1, utf8_output, 0); utf8_output += row1[0]; - buf += 8; + mask = __lasx_xvpickve2gr_wu(mask0, 4); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2); + __lsx_vst(utf8_2, utf8_output, 0); + utf8_output += row2[0]; + + mask = __lasx_xvpickve2gr_wu(mask1, 4); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle3 = __lsx_vld(row3, 1); + __m128i utf8_3 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3); + __lsx_vst(utf8_3, utf8_output, 0); + utf8_output += row3[0]; + + buf += 16; // surrogate pair(s) in a register } else { // Let us do a scalar fallback. @@ -36456,7 +51446,9 @@ sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { forward = size_t(end - buf - 1); } for (; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + uint16_t word = !match_system(big_endian) + ? 
scalar::utf16::swap_bytes(buf[k]) + : buf[k]; if ((word & 0xFF80) == 0) { *utf8_output++ = char(word); } else if ((word & 0xF800) == 0) { @@ -36469,12 +51461,14 @@ sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { } else { // must be a surrogate pair uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = - big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + uint16_t next_word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; k++; uint16_t diff2 = uint16_t(next_word - 0xDC00); if ((diff | diff2) > 0x3FF) { - return std::make_pair(nullptr, utf8_output); + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); } uint32_t value = (diff << 10) + diff2 + 0x10000; *utf8_output++ = char((value >> 18) | 0b11110000); @@ -36486,8 +51480,7 @@ sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { buf += k; } } // while - - return std::make_pair(buf, utf8_output); + return std::make_pair(buf, reinterpret_cast(utf8_output)); } /* @@ -36499,181 +51492,205 @@ sse_convert_utf16_to_utf8(const char16_t *buf, size_t len, char *utf8_output) { */ template std::pair -sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, - char *utf8_output) { +lasx_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); const char16_t *start = buf; const char16_t *end = buf + len; - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); const size_t safety_margin = 12; // to avoid overruns, see issue // https://github.com/simdutf/simdutf/issues/92 - while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { - __m128i in = _mm_loadu_si128((__m128i *)buf); - if (big_endian) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); + __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7ff)); + __m256i zero = __lasx_xvldi(0); + __m128i zero_128 = __lsx_vldi(0); + while (buf + 16 + safety_margin <= end) { + __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); + if (!match_system(big_endian)) { + in = lasx_swap_bytes(in); } - // a single 16-bit UTF-16 word can yield 1, 2 or 3 UTF-8 bytes - const __m128i v_ff80 = _mm_set1_epi16((int16_t)0xff80); - if (_mm_testz_si128(in, v_ff80)) { // ASCII fast path!!!! - __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); - if (big_endian) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - nextin = _mm_shuffle_epi8(nextin, swap); - } - if (!_mm_testz_si128(nextin, v_ff80)) { - // 1. pack the bytes - // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16(in, in); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - in = nextin; - } else { - // 1. pack the bytes - // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16(in, nextin); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 16; - utf8_output += 16; - continue; // we are done for this round! - } + if (__lasx_xbnz_h(__lasx_xvslt_hu( + in, __lasx_xvrepli_h(0x7F)))) { // ASCII fast path!!!! + // 1. pack the bytes + __m256i utf8_packed = + __lasx_xvpermi_d(__lasx_xvpickev_b(in, in), 0b00001000); + // 2. 
store (16 bytes) + __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! } - // no bits set above 7th bit - const __m128i one_byte_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in, v_ff80), v_0000); - const uint16_t one_byte_bitmask = - static_cast(_mm_movemask_epi8(one_byte_bytemask)); + if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, in))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 16 + // expected output : [110a|aaaa|10bb|bbbb] x 16 + // t0 = [000a|aaaa|bbbb|bb00] + __m256i t0 = __lasx_xvslli_h(in, 2); + // t1 = [000a|aaaa|0000|0000] + __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + // t2 = [0000|0000|00bb|bbbb] + __m256i t2 = __lasx_xvand_v(in, __lasx_xvrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + __m256i t3 = __lasx_xvor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xc080)); + __m256i t4 = __lasx_xvor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m256i one_byte_bytemask = + __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F /*0x007F*/)); + __m256i utf8_unpacked = __lasx_xvbitsel_v(t4, in, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask); + uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0); + uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4); + // 4. pack the bytes + const uint8_t *row1 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m1]][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_packed1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1); + + const uint8_t *row2 = &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_packed2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2); + // 5. store bytes + __lsx_vst(utf8_packed1, utf8_output, 0); + utf8_output += row1[0]; - // no bits set above 11th bit - const __m128i one_or_two_bytes_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_0000); - const uint16_t one_or_two_bytes_bitmask = - static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); + __lsx_vst(utf8_packed2, utf8_output, 0); + utf8_output += row2[0]; - if (one_or_two_bytes_bitmask == 0xffff) { - internal::westmere::write_v_u16_11bits_to_utf8( - in, utf8_output, one_byte_bytemask, one_byte_bitmask); - buf += 8; + buf += 16; continue; } - - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. - const __m128i surrogates_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); - - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint16_t surrogates_bitmask = - static_cast(_mm_movemask_epi8(surrogates_bytemask)); + __m256i surrogates_bytemask = + __lasx_xvseq_h(__lasx_xvand_v(in, __lasx_xvldi(-2568 /*0xF800*/)), + __lasx_xvldi(-2600 /*0xD800*/)); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. 
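The bytemask classification used above (ASCII test, <= 0x7FF test, surrogate test) mirrors the scalar emission rules applied in the fallback loop below; for a non-surrogate code unit they reduce to (hypothetical helper name, shown only to make the branch structure explicit):

```cpp
#include <cstddef>
#include <cstdint>

// Scalar emission rules for one non-surrogate UTF-16 code unit, matching the
// one-byte / two-byte / three-byte classification of the vector path.
inline size_t encode_utf16_unit(uint16_t word, uint8_t *out) {
  if ((word & 0xFF80u) == 0) {                // <= 0x7F: one byte
    out[0] = uint8_t(word);
    return 1;
  }
  if ((word & 0xF800u) == 0) {                // <= 0x7FF: two bytes
    out[0] = uint8_t((word >> 6) | 0xC0u);    // 110aaaaa
    out[1] = uint8_t((word & 0x3Fu) | 0x80u); // 10bbbbbb
    return 2;
  }
  out[0] = uint8_t((word >> 12) | 0xE0u);          // 1110aaaa
  out[1] = uint8_t(((word >> 6) & 0x3Fu) | 0x80u); // 10bbbbbb
  out[2] = uint8_t((word & 0x3Fu) | 0x80u);        // 10cccccc
  return 3;
}
```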
- if (surrogates_bitmask == 0x0000) { + if (__lasx_xbz_v(surrogates_bytemask)) { // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - two - UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - - three UTF-8 bytes + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. + We precompute byte 1 for case #1 and the common byte for cases #2 & #3 + in register t2. - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). 
+ */ /** * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: * t2 => [0ccc|cccc] [10cc|cccc] * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) */ -#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m128i t0 = _mm_shuffle_epi8(in, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); + __m256i t0 = __lasx_xvpickev_b(in, in); + t0 = __lasx_xvilvl_b(t0, t0); + + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|00cc|cccc] + __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); + __m256i t1 = __lasx_xvand_v(t0, v_3f7f); // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); + __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688)); - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m128i s0 = _mm_srli_epi16(in, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); - const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, - simdutf_vec(0b0100000000000000)); - const __m128i s4 = _mm_xor_si128(s3, m0); -#undef simdutf_vec + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m256i s0 = __lasx_xvsrli_h(in, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m256i s1 = __lasx_xvslli_h(in, 2); + // s1: [aabb|bbbb|cccc|cc00] => [00bb|bbbb|0000|0000] + s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + + // [00bb|bbbb|0000|aaaa] + __m256i s2 = __lasx_xvor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0)); + __m256i s3 = __lasx_xvor_v(s2, v_c0e0); + __m256i one_or_two_bytes_bytemask = __lasx_xvsle_hu(in, v_07ff); + __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, + __lasx_xvldi(-2752 /*0x4000*/)); + __m256i s4 = __lasx_xvxor_v(s3, m0); // 4. expand code units 16-bit => 32-bit - const __m128i out0 = _mm_unpacklo_epi16(t2, s4); - const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + __m256i out0 = __lasx_xvilvl_h(s4, t2); + __m256i out1 = __lasx_xvilvh_h(s4, t2); // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16_t mask = - (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); - if (mask == 0) { - // We only have three-byte code units. Use fast path. 
- const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, - 15, 13, -1, -1, -1, -1); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i *)utf8_output, utf8_0); - utf8_output += 12; - _mm_storeu_si128((__m128i *)utf8_output, utf8_1); - utf8_output += 12; - buf += 8; - continue; - } - const uint8_t mask0 = uint8_t(mask); - + __m256i one_byte_bytemask = __lasx_xvsle_hu(in, __lasx_xvrepli_h(0x7F)); + __m256i one_byte_bytemask_low = + __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask); + __m256i one_byte_bytemask_high = + __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask); + + __m256i one_or_two_bytes_bytemask_low = + __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero); + __m256i one_or_two_bytes_bytemask_high = + __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero); + + __m256i mask0 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_low, one_byte_bytemask_low)); + __m256i mask1 = __lasx_xvmskltz_h(__lasx_xvor_v( + one_or_two_bytes_bytemask_high, one_byte_bytemask_high)); + + uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0); const uint8_t *row0 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); - - const uint8_t mask1 = static_cast(mask >> 8); + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0); + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; + mask = __lasx_xvpickve2gr_wu(mask1, 0); const uint8_t *row1 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); - - _mm_storeu_si128((__m128i *)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_1); + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1); + __lsx_vst(utf8_1, utf8_output, 0); utf8_output += row1[0]; - buf += 8; + mask = __lasx_xvpickve2gr_wu(mask0, 4); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2); + __lsx_vst(utf8_2, utf8_output, 0); + utf8_output += row2[0]; + + mask = __lasx_xvpickve2gr_wu(mask1, 4); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle3 = __lsx_vld(row3, 1); + __m128i utf8_3 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3); + __lsx_vst(utf8_3, utf8_output, 0); + utf8_output += row3[0]; + + buf += 16; // surrogate pair(s) in a register } else { // Let us do a scalar fallback. @@ -36685,7 +51702,9 @@ sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, forward = size_t(end - buf - 1); } for (; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + uint16_t word = !match_system(big_endian) + ? 
scalar::utf16::swap_bytes(buf[k]) + : buf[k]; if ((word & 0xFF80) == 0) { *utf8_output++ = char(word); } else if ((word & 0xF800) == 0) { @@ -36698,14 +51717,15 @@ sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, } else { // must be a surrogate pair uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = - big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + uint16_t next_word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; k++; uint16_t diff2 = uint16_t(next_word - 0xDC00); if ((diff | diff2) > 0x3FF) { return std::make_pair( result(error_code::SURROGATE, buf - start + k - 1), - utf8_output); + reinterpret_cast(utf8_output)); } uint32_t value = (diff << 10) + diff2 + 0x10000; *utf8_output++ = char((value >> 18) | 0b11110000); @@ -36718,101 +51738,67 @@ sse_convert_utf16_to_utf8_with_errors(const char16_t *buf, size_t len, } } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); -} -/* end file src/westmere/sse_convert_utf16_to_utf8.cpp */ -/* begin file src/westmere/sse_convert_utf16_to_utf32.cpp */ -/* - The vectorized algorithm works on single SSE register i.e., it - loads eight 16-bit code units. - - We consider three cases: - 1. an input register contains no surrogates and each value - is in range 0x0000 .. 0x07ff. - 2. an input register contains no surrogates and values are - is in range 0x0000 .. 0xffff. - 3. an input register contains surrogates --- i.e. codepoints - can have 16 or 32 bits. - - Ad 1. - - When values are less than 0x0800, it means that a 16-bit code unit - can be converted into: 1) single UTF8 byte (when it's an ASCII - char) or 2) two UTF8 bytes. - - For this case we do only some shuffle to obtain these 2-byte - codes and finally compress the whole SSE register with a single - shuffle. - - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. - - Ad 2. - - When values fit in 16-bit code units, but are above 0x07ff, then - a single word may produce one, two or three UTF8 bytes. - - We prepare data for all these three cases in two registers. - The first register contains lower two UTF8 bytes (used in all - cases), while the second one contains just the third byte for - the three-UTF8-bytes case. - - Finally these two registers are interleaved forming eight-element - array of 32-bit values. The array spans two SSE registers. - The bytes from the registers are compressed using two shuffles. - - We need 256-entry lookup table to get a compression pattern - and the number of output bytes in the compressed vector register. - Each entry occupies 17 bytes. - - - To summarize: - - We need two 256-entry tables that have 8704 bytes in total. -*/ - -/* - Returns a pair: the first unprocessed byte from buf and utf8_output - A scalar routing should carry on the conversion of the tail. 
-*/ + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); +} +/* end file src/lasx/lasx_convert_utf16_to_utf8.cpp */ +/* begin file src/lasx/lasx_convert_utf16_to_utf32.cpp */ template std::pair -sse_convert_utf16_to_utf32(const char16_t *buf, size_t len, - char32_t *utf32_output) { +lasx_convert_utf16_to_utf32(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); const char16_t *end = buf + len; - const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)utf32_output & 0x1f) && buf < end) { + uint16_t word = + !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[0]) : buf[0]; + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + buf++; + } else { + if (buf + 1 >= end) { + return std::make_pair(nullptr, + reinterpret_cast(utf32_output)); + } + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[1]) + : buf[1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(nullptr, + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + buf += 2; + } + } - while (end - buf >= 8) { - __m128i in = _mm_loadu_si128((__m128i *)buf); + __m256i v_f800 = __lasx_xvldi(-2568); /*0xF800*/ + __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ - if (big_endian) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); + while (buf + 16 <= end) { + __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); + if (!match_system(big_endian)) { + in = lasx_swap_bytes(in); } - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. - const __m128i surrogates_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); - - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint16_t surrogates_bitmask = - static_cast(_mm_movemask_epi8(surrogates_bytemask)); + __m256i surrogates_bytemask = + __lasx_xvseq_h(__lasx_xvand_v(in, v_f800), v_d800); // It might seem like checking for surrogates_bitmask == 0xc000 could help. // However, it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x0000) { - // case: no surrogate pair, extend 16-bit code units to 32-bit code units - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), - _mm_cvtepu16_epi32(in)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), - _mm_cvtepu16_epi32(_mm_srli_si128(in, 8))); - utf32_output += 8; - buf += 8; + if (__lasx_xbz_v(surrogates_bytemask)) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code + // units + __m256i in_hi = __lasx_xvpermi_q(in, in, 0b00000001); + __lasx_xvst(__lasx_vext2xv_wu_hu(in), utf32_output, 0); + __lasx_xvst(__lasx_vext2xv_wu_hu(in_hi), utf32_output, 32); + utf32_output += 16; + buf += 16; // surrogate pair(s) in a register } else { // Let us do a scalar fallback. @@ -36824,18 +51810,22 @@ sse_convert_utf16_to_utf32(const char16_t *buf, size_t len, forward = size_t(end - buf - 1); } for (; k < forward; k++) { - uint16_t word = big_endian ? 
scalar::utf16::swap_bytes(buf[k]) : buf[k]; + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; if ((word & 0xF800) != 0xD800) { *utf32_output++ = char32_t(word); } else { // must be a surrogate pair uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = - big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + uint16_t next_word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; k++; uint16_t diff2 = uint16_t(next_word - 0xDC00); if ((diff | diff2) > 0x3FF) { - return std::make_pair(nullptr, utf32_output); + return std::make_pair(nullptr, + reinterpret_cast(utf32_output)); } uint32_t value = (diff << 10) + diff2 + 0x10000; *utf32_output++ = char32_t(value); @@ -36844,7 +51834,7 @@ sse_convert_utf16_to_utf32(const char16_t *buf, size_t len, buf += k; } } // while - return std::make_pair(buf, utf32_output); + return std::make_pair(buf, reinterpret_cast(utf32_output)); } /* @@ -36856,43 +51846,59 @@ sse_convert_utf16_to_utf32(const char16_t *buf, size_t len, */ template std::pair -sse_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, - char32_t *utf32_output) { +lasx_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, + char32_t *utf32_out) { + uint32_t *utf32_output = reinterpret_cast(utf32_out); const char16_t *start = buf; const char16_t *end = buf + len; - const __m128i v_f800 = _mm_set1_epi16((int16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((int16_t)0xd800); - - while (end - buf >= 8) { - __m128i in = _mm_loadu_si128((__m128i *)buf); - - if (big_endian) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - in = _mm_shuffle_epi8(in, swap); + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)utf32_output & 0x1f) && buf < end) { + uint16_t word = + !match_system(big_endian) ? scalar::utf16::swap_bytes(buf[0]) : buf[0]; + if ((word & 0xF800) != 0xD800) { + *utf32_output++ = char32_t(word); + buf++; + } else if (buf + 1 < end) { + // must be a surrogate pair + uint16_t diff = uint16_t(word - 0xD800); + uint16_t next_word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[1]) + : buf[1]; + uint16_t diff2 = uint16_t(next_word - 0xDC00); + if ((diff | diff2) > 0x3FF) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf32_output)); + } + uint32_t value = (diff << 10) + diff2 + 0x10000; + *utf32_output++ = char32_t(value); + buf += 2; + } else { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf32_output)); } + } - // 1. Check if there are any surrogate word in the input chunk. - // We have also deal with situation when there is a surrogate word - // at the end of a chunk. - const __m128i surrogates_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in, v_f800), v_d800); + __m256i v_f800 = __lasx_xvldi(-2568); /*0xF800*/ + __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + while (buf + 16 <= end) { + __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); + if (!match_system(big_endian)) { + in = lasx_swap_bytes(in); + } - // bitmask = 0x0000 if there are no surrogates - // = 0xc000 if the last word is a surrogate - const uint16_t surrogates_bitmask = - static_cast(_mm_movemask_epi8(surrogates_bytemask)); + __m256i surrogates_bytemask = + __lasx_xvseq_h(__lasx_xvand_v(in, v_f800), v_d800); // It might seem like checking for surrogates_bitmask == 0xc000 could help. 
// However, it is likely an uncommon occurrence. - if (surrogates_bitmask == 0x0000) { - // case: no surrogate pair, extend 16-bit code units to 32-bit code units - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output), - _mm_cvtepu16_epi32(in)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(utf32_output + 4), - _mm_cvtepu16_epi32(_mm_srli_si128(in, 8))); - utf32_output += 8; - buf += 8; + if (__lasx_xbz_v(surrogates_bytemask)) { + // case: no surrogate pairs, extend all 16-bit code units to 32-bit code + // units + __m256i in_hi = __lasx_xvpermi_q(in, in, 0b00000001); + __lasx_xvst(__lasx_vext2xv_wu_hu(in), utf32_output, 0); + __lasx_xvst(__lasx_vext2xv_wu_hu(in_hi), utf32_output, 32); + utf32_output += 16; + buf += 16; // surrogate pair(s) in a register } else { // Let us do a scalar fallback. @@ -36904,20 +51910,23 @@ sse_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, forward = size_t(end - buf - 1); } for (; k < forward; k++) { - uint16_t word = big_endian ? scalar::utf16::swap_bytes(buf[k]) : buf[k]; + uint16_t word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k]) + : buf[k]; if ((word & 0xF800) != 0xD800) { *utf32_output++ = char32_t(word); } else { // must be a surrogate pair uint16_t diff = uint16_t(word - 0xD800); - uint16_t next_word = - big_endian ? scalar::utf16::swap_bytes(buf[k + 1]) : buf[k + 1]; + uint16_t next_word = !match_system(big_endian) + ? scalar::utf16::swap_bytes(buf[k + 1]) + : buf[k + 1]; k++; uint16_t diff2 = uint16_t(next_word - 0xDC00); if ((diff | diff2) > 0x3FF) { return std::make_pair( result(error_code::SURROGATE, buf - start + k - 1), - utf32_output); + reinterpret_cast(utf32_output)); } uint32_t value = (diff << 10) + diff2 + 0x10000; *utf32_output++ = char32_t(value); @@ -36926,379 +51935,337 @@ sse_convert_utf16_to_utf32_with_errors(const char16_t *buf, size_t len, buf += k; } } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), utf32_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf32_output)); } -/* end file src/westmere/sse_convert_utf16_to_utf32.cpp */ +/* end file src/lasx/lasx_convert_utf16_to_utf32.cpp */ -/* begin file src/westmere/sse_convert_utf32_to_latin1.cpp */ +/* begin file src/lasx/lasx_convert_utf32_to_latin1.cpp */ std::pair -sse_convert_utf32_to_latin1(const char32_t *buf, size_t len, - char *latin1_output) { - const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - - __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); - __m128i shufmask = - _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); - - for (size_t i = 0; i < rounded_len; i += 16) { - __m128i in1 = _mm_loadu_si128((__m128i *)buf); - __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4)); - __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8)); - __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12)); +lasx_convert_utf32_to_latin1(const char32_t *buf, size_t len, + char *latin1_output) { + const char32_t *end = buf + len; + const __m256i shuf_mask = ____m256i( + (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}); + __m256i v_ff = __lasx_xvrepli_w(0xFF); - __m128i check_combined = _mm_or_si128(in1, in2); - check_combined = _mm_or_si128(check_combined, in3); - check_combined = _mm_or_si128(check_combined, in4); + while (buf + 16 <= end) { + __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i in2 = __lasx_xvld(reinterpret_cast(buf), 32); - if (!_mm_testz_si128(check_combined, 
high_bytes_mask)) { - return std::make_pair(nullptr, latin1_output); + __m256i in12 = __lasx_xvor_v(in1, in2); + if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) { + // 1. pack the bytes + __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask); + latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000); + __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp); + latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 16; + latin1_output += 16; + } else { + return std::make_pair(nullptr, reinterpret_cast(latin1_output)); } - __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask), - _mm_shuffle_epi8(in2, shufmask)); - __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask), - _mm_shuffle_epi8(in4, shufmask)); - __m128i pack = _mm_unpacklo_epi64(pack1, pack2); - _mm_storeu_si128((__m128i *)latin1_output, pack); - latin1_output += 16; - buf += 16; - } - + } // while return std::make_pair(buf, latin1_output); } std::pair -sse_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, - char *latin1_output) { +lasx_convert_utf32_to_latin1_with_errors(const char32_t *buf, size_t len, + char *latin1_output) { const char32_t *start = buf; - const size_t rounded_len = len & ~0xF; // Round down to nearest multiple of 16 - - __m128i high_bytes_mask = _mm_set1_epi32(0xFFFFFF00); - __m128i shufmask = - _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0); + const char32_t *end = buf + len; - for (size_t i = 0; i < rounded_len; i += 16) { - __m128i in1 = _mm_loadu_si128((__m128i *)buf); - __m128i in2 = _mm_loadu_si128((__m128i *)(buf + 4)); - __m128i in3 = _mm_loadu_si128((__m128i *)(buf + 8)); - __m128i in4 = _mm_loadu_si128((__m128i *)(buf + 12)); + const __m256i shuf_mask = ____m256i( + (__m128i)v16u8{0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0}); + __m256i v_ff = __lasx_xvrepli_w(0xFF); - __m128i check_combined = _mm_or_si128(in1, in2); - check_combined = _mm_or_si128(check_combined, in3); - check_combined = _mm_or_si128(check_combined, in4); + while (buf + 16 <= end) { + __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i in2 = __lasx_xvld(reinterpret_cast(buf), 32); - if (!_mm_testz_si128(check_combined, high_bytes_mask)) { - // Fallback to scalar code for handling errors + __m256i in12 = __lasx_xvor_v(in1, in2); + if (__lasx_xbz_v(__lasx_xvslt_wu(v_ff, in12))) { + // 1. pack the bytes + __m256i latin1_packed_tmp = __lasx_xvshuf_b(in2, in1, shuf_mask); + latin1_packed_tmp = __lasx_xvpermi_d(latin1_packed_tmp, 0b00001000); + __m128i latin1_packed = lasx_extracti128_lo(latin1_packed_tmp); + latin1_packed = __lsx_vpermi_w(latin1_packed, latin1_packed, 0b11011000); + // 2. store (8 bytes) + __lsx_vst(latin1_packed, reinterpret_cast(latin1_output), 0); + // 3. adjust pointers + buf += 16; + latin1_output += 16; + } else { + // Let us do a scalar fallback. 
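// Editor's note (not part of the simdutf diff): a scalar illustration of the
// range check the vector path above performs before packing to Latin-1. It ORs
// a whole block of code points together and tests the upper 24 bits once,
// since anything above 0xFF has no Latin-1 encoding:
//   uint32_t combined = 0;
//   for (int i = 0; i < 16; i++) { combined |= words[i]; }
//   bool fits = (combined & 0xFFFFFF00) == 0; // true iff every word <= 0xFF
// When that combined test fails, the loop below revisits the same block one
// word at a time so that TOO_LARGE can be reported at the exact offending
// index.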
for (int k = 0; k < 16; k++) { - char32_t codepoint = buf[k]; - if (codepoint <= 0xff) { - *latin1_output++ = char(codepoint); + uint32_t word = buf[k]; + if (word <= 0xff) { + *latin1_output++ = char(word); } else { return std::make_pair(result(error_code::TOO_LARGE, buf - start + k), latin1_output); } } - buf += 16; - continue; } - __m128i pack1 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in1, shufmask), - _mm_shuffle_epi8(in2, shufmask)); - __m128i pack2 = _mm_unpacklo_epi32(_mm_shuffle_epi8(in3, shufmask), - _mm_shuffle_epi8(in4, shufmask)); - __m128i pack = _mm_unpacklo_epi64(pack1, pack2); - _mm_storeu_si128((__m128i *)latin1_output, pack); - latin1_output += 16; - buf += 16; - } - + } // while return std::make_pair(result(error_code::SUCCESS, buf - start), latin1_output); } -/* end file src/westmere/sse_convert_utf32_to_latin1.cpp */ -/* begin file src/westmere/sse_convert_utf32_to_utf8.cpp */ +/* end file src/lasx/lasx_convert_utf32_to_latin1.cpp */ +/* begin file src/lasx/lasx_convert_utf32_to_utf8.cpp */ std::pair -sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { +lasx_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); const char32_t *end = buf + len; - const __m128i v_0000 = _mm_setzero_si128(); //__m128 = 128 bits - const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); // 1111 1000 0000 - // 0000 - const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); // 1100 0000 1000 - // 0000 - const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); // 1111 1111 1000 - // 0000 - const __m128i v_ffff0000 = _mm_set1_epi32( - (uint32_t)0xffff0000); // 1111 1111 1111 1111 0000 0000 0000 0000 - const __m128i v_7fffffff = _mm_set1_epi32( - (uint32_t)0x7fffffff); // 0111 1111 1111 1111 1111 1111 1111 1111 - __m128i running_max = _mm_setzero_si128(); - __m128i forbidden_bytemask = _mm_setzero_si128(); + // load addr align 32 + while (((uint64_t)buf & 0x1F) && buf < end) { + uint32_t word = *buf; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + buf++; + } + + __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xC080)); + __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7FF)); + __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xDFFF)); + __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i zero = __lasx_xvldi(0); + __m128i zero_128 = __lsx_vldi(0); + __m256i forbidden_bytemask = __lasx_xvldi(0x0); + const size_t safety_margin = 12; // to avoid overruns, see issue // https://github.com/simdutf/simdutf/issues/92 - while (end - buf >= - std::ptrdiff_t( - 16 + safety_margin)) { // buf is a char32_t pointer, each char32_t - // has 4 bytes or 32 bits, thus buf + 16 * - // 
char_32t = 512 bits = 64 bytes - // We load two 16 bytes registers for a total of 32 bytes or 16 characters. - __m128i in = _mm_loadu_si128((__m128i *)buf); - __m128i nextin = _mm_loadu_si128( - (__m128i *)buf + 1); // These two values can hold only 8 UTF32 chars - running_max = _mm_max_epu32( - _mm_max_epu32(in, running_max), // take element-wise max char32_t from - // in and running_max vector - nextin); // and take element-wise max element from nextin and - // running_max vector - - // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned - // saturation - __m128i in_16 = _mm_packus_epi32( - _mm_and_si128(in, v_7fffffff), - _mm_and_si128( - nextin, - v_7fffffff)); // in this context pack the two __m128 into a single - // By ensuring the highest bit is set to 0(&v_7fffffff), we are making sure - // all values are interpreted as non-negative, or specifically, the values - // are within the range of valid Unicode code points. remember : having - // leading byte 0 means a positive number by the two complements system. - // Unicode is well beneath the range where you'll start getting issues so - // that's OK. - - // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp + while (buf + 16 + safety_margin < end) { + __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i nextin = __lasx_xvld(reinterpret_cast(buf), 32); - // Check for ASCII fast path + // Check if no bits set above 16th + if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp) + __m256i utf16_packed = + __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000); - // ASCII fast path!!!! - // We eagerly load another 32 bytes, hoping that they will be ASCII too. - // The intuition is that we try to collect 16 ASCII characters which - // requires a total of 64 bytes of input. If we fail, we just pass thirdin - // and fourthin as our new inputs. - if (_mm_testz_si128(in_16, v_ff80)) { // if the first two blocks are ASCII - __m128i thirdin = _mm_loadu_si128((__m128i *)buf + 2); - __m128i fourthin = _mm_loadu_si128((__m128i *)buf + 3); - running_max = _mm_max_epu32( - _mm_max_epu32(thirdin, running_max), - fourthin); // take the running max of all 4 vectors thus far - __m128i nextin_16 = _mm_packus_epi32( - _mm_and_si128(thirdin, v_7fffffff), - _mm_and_si128(fourthin, - v_7fffffff)); // pack into 1 vector, now you have two - if (!_mm_testz_si128( - nextin_16, - v_ff80)) { // checks if the second packed vector is ASCII, if not: + if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F), + utf16_packed))) { // ASCII fast path!!!! // 1. pack the bytes // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16( - in_16, in_16); // creates two copy of in_16 in 1 vector - // 2. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, - utf8_packed); // put them into the output - // 3. adjust pointers - buf += 8; // the char32_t buffer pointer goes up 8 char32_t chars* 32 - // bits = 256 bits - utf8_output += - 8; // same with output, e.g. lift the first two blocks alone. - // Proceed with next input - in_16 = nextin_16; - // We need to update in and nextin because they are used later. - in = thirdin; - nextin = fourthin; - } else { - // 1. pack the bytes - const __m128i utf8_packed = _mm_packus_epi16(in_16, nextin_16); - // 2. 
store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); + __m256i utf8_packed = __lasx_xvpermi_d( + __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000); + // 2. store (8 bytes) + __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0); // 3. adjust pointers buf += 16; utf8_output += 16; continue; // we are done for this round! } - } - - // no bits set above 7th bit -- find out all the ASCII characters - const __m128i one_byte_bytemask = - _mm_cmpeq_epi16( // this takes four bytes at a time and compares: - _mm_and_si128(in_16, v_ff80), // the vector that get only the first - // 9 bits of each 16-bit/2-byte units - v_0000 // - ); // they should be all zero if they are ASCII. E.g. ASCII in UTF32 is - // of format 0000 0000 0000 0XXX XXXX - // _mm_cmpeq_epi16 should now return a 1111 1111 1111 1111 for equals, and - // 0000 0000 0000 0000 if not for each 16-bit/2-byte units - const uint16_t one_byte_bitmask = static_cast(_mm_movemask_epi8( - one_byte_bytemask)); // collect the MSB from previous vector and put - // them into uint16_t mas - - // no bits set above 11th bit - const __m128i one_or_two_bytes_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000); - const uint16_t one_or_two_bytes_bitmask = - static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); - - if (one_or_two_bytes_bitmask == 0xffff) { - // case: all code units either produce 1 or 2 UTF-8 bytes (at least one - // produces 2 bytes) - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m128i v_1f00 = - _mm_set1_epi16((int16_t)0x1f00); // 0001 1111 0000 0000 - const __m128i v_003f = - _mm_set1_epi16((int16_t)0x003f); // 0000 0000 0011 1111 - - // t0 = [000a|aaaa|bbbb|bb00] - const __m128i t0 = _mm_slli_epi16(in_16, 2); // shift packed vector by two - // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = - _mm_and_si128(t0, v_1f00); // potentital first utf8 byte - // t2 = [0000|0000|00bb|bbbb] - const __m128i t2 = - _mm_and_si128(in_16, v_003f); // potential second utf8 byte - // t3 = [000a|aaaa|00bb|bbbb] - const __m128i t3 = - _mm_or_si128(t1, t2); // first and second potential utf8 byte together - // t4 = [110a|aaaa|10bb|bbbb] - const __m128i t4 = _mm_or_si128( - t3, - v_c080); // t3 | 1100 0000 1000 0000 = full potential 2-byte utf8 unit - - // 2. merge ASCII and 2-byte codewords - const __m128i utf8_unpacked = - _mm_blendv_epi8(t4, in_16, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - - // MSB, a - LSB) - const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a - const uint16_t m1 = - static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 - const uint8_t m2 = - static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea - // 4. pack the bytes - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); - const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); - - // 5. store bytes - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - - // 6. 
adjust pointers - buf += 8; - utf8_output += row[0]; - continue; - } - - // Check for overflow in packing - - const __m128i saturation_bytemask = _mm_cmpeq_epi32( - _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = - static_cast(_mm_movemask_epi8(saturation_bytemask)); - if (saturation_bitmask == 0xffff) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes - const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); - forbidden_bytemask = - _mm_or_si128(forbidden_bytemask, - _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800)); - - const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - - three UTF-8 bytes - - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. - - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. - - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); + if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) { + // 1. prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m128i s0 = _mm_srli_epi16(in_16, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); - const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, - simdutf_vec(0b0100000000000000)); - const __m128i s4 = _mm_xor_si128(s3, m0); -#undef simdutf_vec + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = __lasx_xvor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = __lasx_xvor_v(t3, v_c080); + // 2. 
merge ASCII and 2-byte codewords + __m256i one_byte_bytemask = + __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/)); + __m256i utf8_unpacked = + __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask); + uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0); + uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4); + // 4. pack the bytes + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m1]][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_packed1 = __lsx_vshuf_b( + zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1); + + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_packed2 = __lsx_vshuf_b( + zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2); + // 5. store bytes + __lsx_vst(utf8_packed1, utf8_output, 0); + utf8_output += row1[0]; - // 4. expand code units 16-bit => 32-bit - const __m128i out0 = _mm_unpacklo_epi16(t2, s4); - const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + __lsx_vst(utf8_packed2, utf8_output, 0); + utf8_output += row2[0]; - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16_t mask = - (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); - if (mask == 0) { - // We only have three-byte code units. Use fast path. - const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, - 15, 13, -1, -1, -1, -1); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i *)utf8_output, utf8_0); - utf8_output += 12; - _mm_storeu_si128((__m128i *)utf8_output, utf8_1); - utf8_output += 12; - buf += 8; + buf += 16; continue; - } - const uint8_t mask0 = uint8_t(mask); + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + forbidden_bytemask = __lasx_xvor_v( + __lasx_xvand_v( + __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & + #3 in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). 
+ */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed); + t0 = __lasx_xvilvl_b(t0, t0); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); + __m256i t1 = __lasx_xvand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/)); - const uint8_t *row0 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m256i s1 = __lasx_xvslli_h(utf16_packed, 2); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + // [00bb|bbbb|0000|aaaa] + __m256i s2 = __lasx_xvor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0)); + __m256i s3 = __lasx_xvor_v(s2, v_c0e0); + // __m256i v_07ff = vmovq_n_u16((uint16_t)0x07FF); + __m256i one_or_two_bytes_bytemask = + __lasx_xvsle_hu(utf16_packed, v_07ff); + __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, + __lasx_xvldi(-2752 /*0x4000*/)); + __m256i s4 = __lasx_xvxor_v(s3, m0); - const uint8_t mask1 = static_cast(mask >> 8); + // 4. expand code units 16-bit => 32-bit + __m256i out0 = __lasx_xvilvl_h(s4, t2); + __m256i out1 = __lasx_xvilvh_h(s4, t2); - const uint8_t *row1 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + // 5. 
compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m256i one_byte_bytemask = + __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F)); + + __m256i one_or_two_bytes_bytemask_u16_to_u32_low = + __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero); + __m256i one_or_two_bytes_bytemask_u16_to_u32_high = + __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero); + + __m256i one_byte_bytemask_u16_to_u32_low = + __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask); + __m256i one_byte_bytemask_u16_to_u32_high = + __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask); + + __m256i mask0 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low, + one_byte_bytemask_u16_to_u32_low)); + __m256i mask1 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high, + one_byte_bytemask_u16_to_u32_high)); + + uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0); + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_1); - utf8_output += row1[0]; + mask = __lasx_xvpickve2gr_wu(mask1, 0); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1); + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; - buf += 8; + mask = __lasx_xvpickve2gr_wu(mask0, 4); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2); + __lsx_vst(utf8_2, utf8_output, 0); + utf8_output += row2[0]; + + mask = __lasx_xvpickve2gr_wu(mask1, 4); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle3 = __lsx_vld(row3, 1); + __m128i utf8_3 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3); + __lsx_vst(utf8_3, utf8_output, 0); + utf8_output += row3[0]; + + buf += 16; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. } else { - // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> - // will produce four UTF-8 bytes Let us do a scalar fallback. It may seem - // wasteful to use scalar code, but being efficient with SIMD in the - // presence of surrogate pairs may require non-trivial tables. + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
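// Editor's note: a small reference sketch (not part of the simdutf diff)
// spelling out the equivalence stated above: any code point at or above
// 0x10000 needs four UTF-8 bytes and, in UTF-16, a high/low surrogate pair
// built from its 20 payload bits.
#include <cstdint>
static inline void split_into_surrogates(uint32_t cp, uint16_t &hi,
                                         uint16_t &lo) {
  // valid only for 0x10000 <= cp <= 0x10FFFF
  const uint32_t v = cp - 0x10000;     // 20 payload bits
  hi = uint16_t(0xD800 | (v >> 10));   // high surrogate: top 10 bits
  lo = uint16_t(0xDC00 | (v & 0x3FF)); // low surrogate: bottom 10 bits
}
// The scalar fallback below encodes such a code point directly as four UTF-8
// bytes: 0b11110xxx followed by three 0b10xxxxxx continuation bytes.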
size_t forward = 15; size_t k = 0; if (size_t(end - buf) < forward + 1) { @@ -37313,14 +52280,16 @@ sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { *utf8_output++ = char((word & 0b111111) | 0b10000000); } else if ((word & 0xFFFF0000) == 0) { if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair(nullptr, utf8_output); + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); } *utf8_output++ = char((word >> 12) | 0b11100000); *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); *utf8_output++ = char((word & 0b111111) | 0b10000000); } else { if (word > 0x10FFFF) { - return std::make_pair(nullptr, utf8_output); + return std::make_pair(nullptr, + reinterpret_cast(utf8_output)); } *utf8_output++ = char((word >> 18) | 0b11110000); *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); @@ -37333,242 +52302,269 @@ sse_convert_utf32_to_utf8(const char32_t *buf, size_t len, char *utf8_output) { } // while // check for invalid input - const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff); - if (static_cast(_mm_movemask_epi8(_mm_cmpeq_epi32( - _mm_max_epu32(running_max, v_10ffff), v_10ffff))) != 0xffff) { - return std::make_pair(nullptr, utf8_output); - } - - if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { - return std::make_pair(nullptr, utf8_output); + if (__lasx_xbnz_v(forbidden_bytemask)) { + return std::make_pair(nullptr, reinterpret_cast(utf8_output)); } - - return std::make_pair(buf, utf8_output); + return std::make_pair(buf, reinterpret_cast(utf8_output)); } std::pair -sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, - char *utf8_output) { - const char32_t *end = buf + len; +lasx_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, + char *utf8_out) { + uint8_t *utf8_output = reinterpret_cast(utf8_out); const char32_t *start = buf; + const char32_t *end = buf + len; - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); - const __m128i v_c080 = _mm_set1_epi16((uint16_t)0xc080); - const __m128i v_ff80 = _mm_set1_epi16((uint16_t)0xff80); - const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); - const __m128i v_7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); - const __m128i v_10ffff = _mm_set1_epi32((uint32_t)0x10ffff); + // load addr align 32 + while (((uint64_t)buf & 0x1F) && buf < end) { + uint32_t word = *buf; + if ((word & 0xFFFFFF80) == 0) { + *utf8_output++ = char(word); + } else if ((word & 0xFFFFF800) == 0) { + *utf8_output++ = char((word >> 6) | 0b11000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else if ((word & 0xFFFF0000) == 0) { + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 12) | 0b11100000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } else { + if (word > 0x10FFFF) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start), + reinterpret_cast(utf8_output)); + } + *utf8_output++ = char((word >> 18) | 0b11110000); + *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); + *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); + *utf8_output++ = char((word & 0b111111) | 0b10000000); + } + buf++; + } + __m256i v_c080 = __lasx_xvreplgr2vr_h(uint16_t(0xC080)); + __m256i v_07ff = __lasx_xvreplgr2vr_h(uint16_t(0x7FF)); + __m256i v_dfff = 
__lasx_xvreplgr2vr_h(uint16_t(0xDFFF)); + __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i zero = __lasx_xvldi(0); + __m128i zero_128 = __lsx_vldi(0); + __m256i forbidden_bytemask = __lasx_xvldi(0x0); const size_t safety_margin = 12; // to avoid overruns, see issue // https://github.com/simdutf/simdutf/issues/92 - while (end - buf >= std::ptrdiff_t(16 + safety_margin)) { - // We load two 16 bytes registers for a total of 32 bytes or 8 characters. - __m128i in = _mm_loadu_si128((__m128i *)buf); - __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); - // Check for too large input - __m128i max_input = _mm_max_epu32(_mm_max_epu32(in, nextin), v_10ffff); - if (static_cast(_mm_movemask_epi8( - _mm_cmpeq_epi32(max_input, v_10ffff))) != 0xffff) { - return std::make_pair(result(error_code::TOO_LARGE, buf - start), - utf8_output); - } - - // Pack 32-bit UTF-32 code units to 16-bit UTF-16 code units with unsigned - // saturation - __m128i in_16 = _mm_packus_epi32(_mm_and_si128(in, v_7fffffff), - _mm_and_si128(nextin, v_7fffffff)); - - // Try to apply UTF-16 => UTF-8 from ./sse_convert_utf16_to_utf8.cpp - - // Check for ASCII fast path - if (_mm_testz_si128(in_16, v_ff80)) { // ASCII fast path!!!! - // 1. pack the bytes - // obviously suboptimal. - const __m128i utf8_packed = _mm_packus_epi16(in_16, in_16); - // 2. store (16 bytes) - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - // 3. adjust pointers - buf += 8; - utf8_output += 8; - continue; - } - - // no bits set above 7th bit - const __m128i one_byte_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in_16, v_ff80), v_0000); - const uint16_t one_byte_bitmask = - static_cast(_mm_movemask_epi8(one_byte_bytemask)); - - // no bits set above 11th bit - const __m128i one_or_two_bytes_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_0000); - const uint16_t one_or_two_bytes_bitmask = - static_cast(_mm_movemask_epi8(one_or_two_bytes_bytemask)); - - if (one_or_two_bytes_bitmask == 0xffff) { - // case: all code units either produce 1 or 2 UTF-8 bytes (at least one - // produces 2 bytes) - // 1. prepare 2-byte values - // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 - // expected output : [110a|aaaa|10bb|bbbb] x 8 - const __m128i v_1f00 = _mm_set1_epi16((int16_t)0x1f00); - const __m128i v_003f = _mm_set1_epi16((int16_t)0x003f); - - // t0 = [000a|aaaa|bbbb|bb00] - const __m128i t0 = _mm_slli_epi16(in_16, 2); - // t1 = [000a|aaaa|0000|0000] - const __m128i t1 = _mm_and_si128(t0, v_1f00); - // t2 = [0000|0000|00bb|bbbb] - const __m128i t2 = _mm_and_si128(in_16, v_003f); - // t3 = [000a|aaaa|00bb|bbbb] - const __m128i t3 = _mm_or_si128(t1, t2); - // t4 = [110a|aaaa|10bb|bbbb] - const __m128i t4 = _mm_or_si128(t3, v_c080); - - // 2. merge ASCII and 2-byte codewords - const __m128i utf8_unpacked = - _mm_blendv_epi8(t4, in_16, one_byte_bytemask); - - // 3. prepare bitmask for 8-bit lookup - // one_byte_bitmask = hhggffeeddccbbaa -- the bits are doubled (h - - // MSB, a - LSB) - const uint16_t m0 = one_byte_bitmask & 0x5555; // m0 = 0h0g0f0e0d0c0b0a - const uint16_t m1 = - static_cast(m0 >> 7); // m1 = 00000000h0g0f0e0 - const uint8_t m2 = - static_cast((m0 | m1) & 0xff); // m2 = hdgcfbea - // 4. pack the bytes - const uint8_t *row = - &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes[m2][0]; - const __m128i shuffle = _mm_loadu_si128((__m128i *)(row + 1)); - const __m128i utf8_packed = _mm_shuffle_epi8(utf8_unpacked, shuffle); - - // 5. store bytes - _mm_storeu_si128((__m128i *)utf8_output, utf8_packed); - - // 6. 
adjust pointers - buf += 8; - utf8_output += row[0]; - continue; - } - - // Check for overflow in packing - const __m128i saturation_bytemask = _mm_cmpeq_epi32( - _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = - static_cast(_mm_movemask_epi8(saturation_bytemask)); + while (buf + 16 + safety_margin < end) { + __m256i in = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i nextin = __lasx_xvld(reinterpret_cast(buf), 32); - if (saturation_bitmask == 0xffff) { - // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + // Check if no bits set above 16th + if (__lasx_xbz_v(__lasx_xvpickod_h(in, nextin))) { + // Pack UTF-32 to UTF-16 safely (without surrogate pairs) + // Apply UTF-16 => UTF-8 routine (lasx_convert_utf16_to_utf8.cpp) + __m256i utf16_packed = + __lasx_xvpermi_d(__lasx_xvpickev_h(nextin, in), 0b11011000); - // Check for illegal surrogate code units - const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); - const __m128i forbidden_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(in_16, v_f800), v_d800); - if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { - return std::make_pair(result(error_code::SURROGATE, buf - start), - utf8_output); + if (__lasx_xbz_v(__lasx_xvslt_hu(__lasx_xvrepli_h(0x7F), + utf16_packed))) { // ASCII fast path!!!! + // 1. pack the bytes + // obviously suboptimal. + __m256i utf8_packed = __lasx_xvpermi_d( + __lasx_xvpickev_b(utf16_packed, utf16_packed), 0b00001000); + // 2. store (8 bytes) + __lsx_vst(lasx_extracti128_lo(utf8_packed), utf8_output, 0); + // 3. adjust pointers + buf += 16; + utf8_output += 16; + continue; // we are done for this round! } - const __m128i dup_even = _mm_setr_epi16(0x0000, 0x0202, 0x0404, 0x0606, - 0x0808, 0x0a0a, 0x0c0c, 0x0e0e); - - /* In this branch we handle three cases: - 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - - single UFT-8 byte - 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - - two UTF-8 bytes - 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - - three UTF-8 bytes - - We expand the input word (16-bit) into two code units (32-bit), thus - we have room for four bytes. However, we need five distinct bit - layouts. Note that the last byte in cases #2 and #3 is the same. - - We precompute byte 1 for case #1 and the common byte for cases #2 & #3 - in register t2. - - We precompute byte 1 for case #3 and -- **conditionally** -- precompute - either byte 1 for case #2 or byte 2 for case #3. Note that they - differ by exactly one bit. - - Finally from these two code units we build proper UTF-8 sequence, taking - into account the case (i.e, the number of bytes to write). - */ - /** - * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: - * t2 => [0ccc|cccc] [10cc|cccc] - * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) - */ -#define simdutf_vec(x) _mm_set1_epi16(static_cast(x)) - // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] - const __m128i t0 = _mm_shuffle_epi8(in_16, dup_even); - // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] - const __m128i t1 = _mm_and_si128(t0, simdutf_vec(0b0011111101111111)); - // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] - const __m128i t2 = _mm_or_si128(t1, simdutf_vec(0b1000000000000000)); + if (__lasx_xbz_v(__lasx_xvslt_hu(v_07ff, utf16_packed))) { + // 1. 
prepare 2-byte values + // input 16-bit word : [0000|0aaa|aabb|bbbb] x 8 + // expected output : [110a|aaaa|10bb|bbbb] x 8 - // [aaaa|bbbb|bbcc|cccc] => [0000|aaaa|bbbb|bbcc] - const __m128i s0 = _mm_srli_epi16(in_16, 4); - // [0000|aaaa|bbbb|bbcc] => [0000|aaaa|bbbb|bb00] - const __m128i s1 = _mm_and_si128(s0, simdutf_vec(0b0000111111111100)); - // [0000|aaaa|bbbb|bb00] => [00bb|bbbb|0000|aaaa] - const __m128i s2 = _mm_maddubs_epi16(s1, simdutf_vec(0x0140)); - // [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] - const __m128i s3 = _mm_or_si128(s2, simdutf_vec(0b1100000011100000)); - const __m128i m0 = _mm_andnot_si128(one_or_two_bytes_bytemask, - simdutf_vec(0b0100000000000000)); - const __m128i s4 = _mm_xor_si128(s3, m0); -#undef simdutf_vec + // t0 = [000a|aaaa|bbbb|bb00] + const __m256i t0 = __lasx_xvslli_h(utf16_packed, 2); + // t1 = [000a|aaaa|0000|0000] + const __m256i t1 = __lasx_xvand_v(t0, __lasx_xvldi(-2785 /*0x1f00*/)); + // t2 = [0000|0000|00bb|bbbb] + const __m256i t2 = __lasx_xvand_v(utf16_packed, __lasx_xvrepli_h(0x3f)); + // t3 = [000a|aaaa|00bb|bbbb] + const __m256i t3 = __lasx_xvor_v(t1, t2); + // t4 = [110a|aaaa|10bb|bbbb] + const __m256i t4 = __lasx_xvor_v(t3, v_c080); + // 2. merge ASCII and 2-byte codewords + __m256i one_byte_bytemask = + __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F /*0x007F*/)); + __m256i utf8_unpacked = + __lasx_xvbitsel_v(t4, utf16_packed, one_byte_bytemask); + // 3. prepare bitmask for 8-bit lookup + __m256i mask = __lasx_xvmskltz_h(one_byte_bytemask); + uint32_t m1 = __lasx_xvpickve2gr_wu(mask, 0); + uint32_t m2 = __lasx_xvpickve2gr_wu(mask, 4); + // 4. pack the bytes + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m1]][0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_packed1 = __lsx_vshuf_b( + zero_128, lasx_extracti128_lo(utf8_unpacked), shuffle1); + + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_utf8_bytes + [lasx_1_2_utf8_bytes_mask[m2]][0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_packed2 = __lsx_vshuf_b( + zero_128, lasx_extracti128_hi(utf8_unpacked), shuffle2); + // 5. store bytes + __lsx_vst(utf8_packed1, utf8_output, 0); + utf8_output += row1[0]; - // 4. expand code units 16-bit => 32-bit - const __m128i out0 = _mm_unpacklo_epi16(t2, s4); - const __m128i out1 = _mm_unpackhi_epi16(t2, s4); + __lsx_vst(utf8_packed2, utf8_output, 0); + utf8_output += row2[0]; - // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle - const uint16_t mask = - (one_byte_bitmask & 0x5555) | (one_or_two_bytes_bitmask & 0xaaaa); - if (mask == 0) { - // We only have three-byte code units. Use fast path. 
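// Editor's note (not part of the simdutf diff): both lookup tables used by
// these converters (pack_1_2_utf8_bytes above, pack_1_2_3_utf8_bytes below)
// follow the same 17-byte-per-row convention: row[0] is the number of output
// bytes and the next 16 bytes are a shuffle pattern that compacts the useful
// bytes to the front of a 128-bit register. The row index encodes how many
// UTF-8 bytes each unit needs (one bit per 16-bit unit for the 1/2-byte table,
// two bits per expanded 32-bit slot for the 1/2/3-byte table). The write
// pattern is always:
//   const uint8_t *row = &pack_1_2_3_utf8_bytes[mask & 0xFF][0];
//   load the 16-byte shuffle from row + 1 and apply it to the register;
//   store the full 16 bytes, then advance utf8_output by row[0] only.
// A full 16-byte store is issued even when row[0] is smaller, which is
// presumably why these loops keep the safety_margin mentioned in the
// surrounding code.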
- const __m128i shuffle = _mm_setr_epi8(2, 3, 1, 6, 7, 5, 10, 11, 9, 14, - 15, 13, -1, -1, -1, -1); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle); - _mm_storeu_si128((__m128i *)utf8_output, utf8_0); - utf8_output += 12; - _mm_storeu_si128((__m128i *)utf8_output, utf8_1); - utf8_output += 12; - buf += 8; + buf += 16; continue; - } - const uint8_t mask0 = uint8_t(mask); + } else { + // case: code units from register produce either 1, 2 or 3 UTF-8 bytes + forbidden_bytemask = __lasx_xvor_v( + __lasx_xvand_v( + __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + if (__lasx_xbnz_v(forbidden_bytemask)) { + return std::make_pair(result(error_code::SURROGATE, buf - start), + reinterpret_cast(utf8_output)); + } + /* In this branch we handle three cases: + 1. [0000|0000|0ccc|cccc] => [0ccc|cccc] - + single UFT-8 byte + 2. [0000|0bbb|bbcc|cccc] => [110b|bbbb], [10cc|cccc] - + two UTF-8 bytes + 3. [aaaa|bbbb|bbcc|cccc] => [1110|aaaa], [10bb|bbbb], [10cc|cccc] - + three UTF-8 bytes + + We expand the input word (16-bit) into two code units (32-bit), thus + we have room for four bytes. However, we need five distinct bit + layouts. Note that the last byte in cases #2 and #3 is the same. + + We precompute byte 1 for case #1 and the common byte for cases #2 & + #3 in register t2. + + We precompute byte 1 for case #3 and -- **conditionally** -- + precompute either byte 1 for case #2 or byte 2 for case #3. Note that + they differ by exactly one bit. + + Finally from these two code units we build proper UTF-8 sequence, + taking into account the case (i.e, the number of bytes to write). + */ + /** + * Given [aaaa|bbbb|bbcc|cccc] our goal is to produce: + * t2 => [0ccc|cccc] [10cc|cccc] + * s4 => [1110|aaaa] ([110b|bbbb] OR [10bb|bbbb]) + */ + // [aaaa|bbbb|bbcc|cccc] => [bbcc|cccc|bbcc|cccc] + __m256i t0 = __lasx_xvpickev_b(utf16_packed, utf16_packed); + t0 = __lasx_xvilvl_b(t0, t0); + // [bbcc|cccc|bbcc|cccc] => [00cc|cccc|0bcc|cccc] + __m256i v_3f7f = __lasx_xvreplgr2vr_h(uint16_t(0x3F7F)); + __m256i t1 = __lasx_xvand_v(t0, v_3f7f); + // [00cc|cccc|0bcc|cccc] => [10cc|cccc|0bcc|cccc] + __m256i t2 = __lasx_xvor_v(t1, __lasx_xvldi(-2688 /*0x8000*/)); - const uint8_t *row0 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask0][0]; - const __m128i shuffle0 = _mm_loadu_si128((__m128i *)(row0 + 1)); - const __m128i utf8_0 = _mm_shuffle_epi8(out0, shuffle0); + // s0: [aaaa|bbbb|bbcc|cccc] => [0000|0000|0000|aaaa] + __m256i s0 = __lasx_xvsrli_h(utf16_packed, 12); + // s1: [aaaa|bbbb|bbcc|cccc] => [0000|bbbb|bb00|0000] + __m256i s1 = __lasx_xvslli_h(utf16_packed, 2); + // [0000|bbbb|bb00|0000] => [00bb|bbbb|0000|0000] + s1 = __lasx_xvand_v(s1, __lasx_xvldi(-2753 /*0x3F00*/)); + // [00bb|bbbb|0000|aaaa] + __m256i s2 = __lasx_xvor_v(s0, s1); + // s3: [00bb|bbbb|0000|aaaa] => [11bb|bbbb|1110|aaaa] + __m256i v_c0e0 = __lasx_xvreplgr2vr_h(uint16_t(0xC0E0)); + __m256i s3 = __lasx_xvor_v(s2, v_c0e0); + // __m256i v_07ff = vmovq_n_u16((uint16_t)0x07FF); + __m256i one_or_two_bytes_bytemask = + __lasx_xvsle_hu(utf16_packed, v_07ff); + __m256i m0 = __lasx_xvandn_v(one_or_two_bytes_bytemask, + __lasx_xvldi(-2752 /*0x4000*/)); + __m256i s4 = __lasx_xvxor_v(s3, m0); - const uint8_t mask1 = static_cast(mask >> 8); + // 4. 
expand code units 16-bit => 32-bit + __m256i out0 = __lasx_xvilvl_h(s4, t2); + __m256i out1 = __lasx_xvilvh_h(s4, t2); - const uint8_t *row1 = - &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask1][0]; - const __m128i shuffle1 = _mm_loadu_si128((__m128i *)(row1 + 1)); - const __m128i utf8_1 = _mm_shuffle_epi8(out1, shuffle1); + // 5. compress 32-bit code units into 1, 2 or 3 bytes -- 2 x shuffle + __m256i one_byte_bytemask = + __lasx_xvsle_hu(utf16_packed, __lasx_xvrepli_h(0x7F)); + + __m256i one_or_two_bytes_bytemask_u16_to_u32_low = + __lasx_xvilvl_h(one_or_two_bytes_bytemask, zero); + __m256i one_or_two_bytes_bytemask_u16_to_u32_high = + __lasx_xvilvh_h(one_or_two_bytes_bytemask, zero); + + __m256i one_byte_bytemask_u16_to_u32_low = + __lasx_xvilvl_h(one_byte_bytemask, one_byte_bytemask); + __m256i one_byte_bytemask_u16_to_u32_high = + __lasx_xvilvh_h(one_byte_bytemask, one_byte_bytemask); + + __m256i mask0 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_low, + one_byte_bytemask_u16_to_u32_low)); + __m256i mask1 = __lasx_xvmskltz_h( + __lasx_xvor_v(one_or_two_bytes_bytemask_u16_to_u32_high, + one_byte_bytemask_u16_to_u32_high)); + + uint32_t mask = __lasx_xvpickve2gr_wu(mask0, 0); + const uint8_t *row0 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle0 = __lsx_vld(row0, 1); + __m128i utf8_0 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out0), shuffle0); + __lsx_vst(utf8_0, utf8_output, 0); + utf8_output += row0[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_0); - utf8_output += row0[0]; - _mm_storeu_si128((__m128i *)utf8_output, utf8_1); - utf8_output += row1[0]; + mask = __lasx_xvpickve2gr_wu(mask1, 0); + const uint8_t *row1 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle1 = __lsx_vld(row1, 1); + __m128i utf8_1 = + __lsx_vshuf_b(zero_128, lasx_extracti128_lo(out1), shuffle1); + __lsx_vst(utf8_1, utf8_output, 0); + utf8_output += row1[0]; - buf += 8; + mask = __lasx_xvpickve2gr_wu(mask0, 4); + const uint8_t *row2 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle2 = __lsx_vld(row2, 1); + __m128i utf8_2 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out0), shuffle2); + __lsx_vst(utf8_2, utf8_output, 0); + utf8_output += row2[0]; + + mask = __lasx_xvpickve2gr_wu(mask1, 4); + const uint8_t *row3 = + &simdutf::tables::utf16_to_utf8::pack_1_2_3_utf8_bytes[mask & 0xFF] + [0]; + __m128i shuffle3 = __lsx_vld(row3, 1); + __m128i utf8_3 = + __lsx_vshuf_b(zero_128, lasx_extracti128_hi(out1), shuffle3); + __lsx_vst(utf8_3, utf8_output, 0); + utf8_output += row3[0]; + + buf += 16; + } + // At least one 32-bit word will produce a surrogate pair in UTF-16 <=> + // will produce four UTF-8 bytes. } else { - // case: at least one 32-bit word produce a surrogate pair in UTF-16 <=> - // will produce four UTF-8 bytes Let us do a scalar fallback. It may seem - // wasteful to use scalar code, but being efficient with SIMD in the - // presence of surrogate pairs may require non-trivial tables. + // Let us do a scalar fallback. + // It may seem wasteful to use scalar code, but being efficient with SIMD + // in the presence of surrogate pairs may require non-trivial tables. 
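The scalar fallback that follows handles what the packed path cannot: code points above U+FFFF (four UTF-8 bytes) and precise error reporting. For reference, a minimal scalar sketch of that per-code-point conversion, with error signalling simplified to a zero return (the real code returns a result carrying error_code::SURROGATE or error_code::TOO_LARGE plus the offset):

#include <cstddef>
#include <cstdint>

// Encode one UTF-32 code point as 1..4 UTF-8 bytes; returns bytes written,
// or 0 for a lone surrogate or a value above U+10FFFF.
static size_t encode_one_utf8(uint32_t word, char *out) {
  if (word < 0x80) {
    out[0] = char(word);
    return 1;
  }
  if (word < 0x800) {
    out[0] = char(0xC0 | (word >> 6));
    out[1] = char(0x80 | (word & 0x3F));
    return 2;
  }
  if (word < 0x10000) {
    if (word >= 0xD800 && word <= 0xDFFF) {
      return 0; // error_code::SURROGATE in the real code
    }
    out[0] = char(0xE0 | (word >> 12));
    out[1] = char(0x80 | ((word >> 6) & 0x3F));
    out[2] = char(0x80 | (word & 0x3F));
    return 3;
  }
  if (word > 0x10FFFF) {
    return 0; // error_code::TOO_LARGE in the real code
  }
  out[0] = char(0xF0 | (word >> 18));
  out[1] = char(0x80 | ((word >> 12) & 0x3F));
  out[2] = char(0x80 | ((word >> 6) & 0x3F));
  out[3] = char(0x80 | (word & 0x3F));
  return 4;
}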
size_t forward = 15; size_t k = 0; if (size_t(end - buf) < forward + 1) { @@ -37584,7 +52580,8 @@ sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, } else if ((word & 0xFFFF0000) == 0) { if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair( - result(error_code::SURROGATE, buf - start + k), utf8_output); + result(error_code::SURROGATE, buf - start + k), + reinterpret_cast(utf8_output)); } *utf8_output++ = char((word >> 12) | 0b11100000); *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000); @@ -37592,7 +52589,8 @@ sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, } else { if (word > 0x10FFFF) { return std::make_pair( - result(error_code::TOO_LARGE, buf - start + k), utf8_output); + result(error_code::TOO_LARGE, buf - start + k), + reinterpret_cast(utf8_output)); } *utf8_output++ = char((word >> 18) | 0b11110000); *utf8_output++ = char(((word >> 12) & 0b111111) | 0b10000000); @@ -37603,51 +52601,76 @@ sse_convert_utf32_to_utf8_with_errors(const char32_t *buf, size_t len, buf += k; } } // while - return std::make_pair(result(error_code::SUCCESS, buf - start), utf8_output); + + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf8_output)); } -/* end file src/westmere/sse_convert_utf32_to_utf8.cpp */ -/* begin file src/westmere/sse_convert_utf32_to_utf16.cpp */ +/* end file src/lasx/lasx_convert_utf32_to_utf8.cpp */ +/* begin file src/lasx/lasx_convert_utf32_to_utf16.cpp */ template std::pair -sse_convert_utf32_to_utf16(const char32_t *buf, size_t len, - char16_t *utf16_output) { - +lasx_convert_utf32_to_utf16(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); const char32_t *end = buf + len; - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); - __m128i forbidden_bytemask = _mm_setzero_si128(); + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)utf16_output & 0x1F) && buf < end) { + uint32_t word = *buf++; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(word >> 8 | word << 8) + : char16_t(word); + // buf++; + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + // buf++; + } + } - while (end - buf >= 8) { - __m128i in = _mm_loadu_si128((__m128i *)buf); - __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); - const __m128i saturation_bytemask = _mm_cmpeq_epi32( - _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = - static_cast(_mm_movemask_epi8(saturation_bytemask)); + __m256i forbidden_bytemask = __lasx_xvrepli_h(0); + __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xdfff)); + while (buf + 16 <= end) { + __m256i in0 = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 32); // Check if no bits set above 16th - if (saturation_bitmask == 0xffff) { - // Pack UTF-32 to UTF-16 - __m128i utf16_packed = _mm_packus_epi32(in, nextin); - - const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); - forbidden_bytemask = _mm_or_si128( - forbidden_bytemask, - _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800)); + if (__lasx_xbz_v(__lasx_xvpickod_h(in1, in0))) { + __m256i utf16_packed = + __lasx_xvpermi_d(__lasx_xvpickev_h(in1, in0), 0b11011000); + forbidden_bytemask = __lasx_xvor_v( + __lasx_xvand_v( + __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); - if (big_endian) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + if (!match_system(big_endian)) { + utf16_packed = lasx_swap_bytes(utf16_packed); } - - _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; + __lasx_xvst(utf16_packed, utf16_output, 0); + utf16_output += 16; + buf += 16; } else { - size_t forward = 7; + size_t forward = 15; size_t k = 0; if (size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1); @@ -37657,25 +52680,25 @@ sse_convert_utf32_to_utf16(const char32_t *buf, size_t len, if ((word & 0xFFFF0000) == 0) { // will not generate a surrogate pair if (word >= 0xD800 && word <= 0xDFFF) { - return std::make_pair(nullptr, utf16_output); + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); } - *utf16_output++ = - big_endian - ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) - : char16_t(word); + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(word >> 8 | word << 8) + : char16_t(word); } else { // will generate a surrogate pair if (word > 0x10FFFF) { - return std::make_pair(nullptr, utf16_output); + return std::make_pair(nullptr, + reinterpret_cast(utf16_output)); } word -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { + if (!match_system(big_endian)) { high_surrogate = - uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = - uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); } *utf16_output++ = char16_t(high_surrogate); *utf16_output++ = char16_t(low_surrogate); @@ -37686,56 +52709,80 @@ sse_convert_utf32_to_utf16(const char32_t *buf, size_t len, } // check for invalid input - if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { - return std::make_pair(nullptr, utf16_output); + if (__lasx_xbnz_v(forbidden_bytemask)) { + return std::make_pair(nullptr, reinterpret_cast(utf16_output)); } - - return std::make_pair(buf, utf16_output); + return std::make_pair(buf, reinterpret_cast(utf16_output)); } template std::pair -sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, - char16_t *utf16_output) { +lasx_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, + char16_t *utf16_out) { + uint16_t *utf16_output = reinterpret_cast(utf16_out); const char32_t *start = buf; const char32_t *end = buf + len; - const __m128i v_0000 = _mm_setzero_si128(); - const __m128i v_ffff0000 = _mm_set1_epi32((int32_t)0xffff0000); + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)utf16_output & 0x1F) && buf < end) { + uint32_t word = *buf++; + if ((word & 0xFFFF0000) == 0) { + // will not generate a surrogate pair + if (word >= 0xD800 && word <= 0xDFFF) { + return std::make_pair(result(error_code::SURROGATE, buf - start - 1), + reinterpret_cast(utf16_output)); + } + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(word >> 8 | word << 8) + : char16_t(word); + } else { + // will generate a surrogate pair + if (word > 0x10FFFF) { + return std::make_pair(result(error_code::TOO_LARGE, buf - start - 1), + reinterpret_cast(utf16_output)); + } + word -= 0x10000; + uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); + uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); + if (!match_system(big_endian)) { + high_surrogate = uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); + } + *utf16_output++ = char16_t(high_surrogate); + *utf16_output++ = char16_t(low_surrogate); + } + } - while (end - buf >= 8) { - __m128i in = _mm_loadu_si128((__m128i *)buf); - __m128i nextin = _mm_loadu_si128((__m128i *)buf + 1); - const __m128i saturation_bytemask = _mm_cmpeq_epi32( - _mm_and_si128(_mm_or_si128(in, nextin), v_ffff0000), v_0000); - const uint32_t saturation_bitmask = - static_cast(_mm_movemask_epi8(saturation_bytemask)); + __m256i forbidden_bytemask = __lasx_xvrepli_h(0); + __m256i v_d800 = __lasx_xvldi(-2600); /*0xD800*/ + __m256i v_dfff = __lasx_xvreplgr2vr_h(uint16_t(0xdfff)); + while (buf + 16 <= end) { + __m256i in0 = __lasx_xvld(reinterpret_cast(buf), 0); + __m256i in1 = __lasx_xvld(reinterpret_cast(buf), 32); // Check if no bits set above 16th - if (saturation_bitmask == 0xffff) { - // Pack UTF-32 to UTF-16 - __m128i utf16_packed = _mm_packus_epi32(in, nextin); - - const __m128i v_f800 = _mm_set1_epi16((uint16_t)0xf800); - const __m128i v_d800 = _mm_set1_epi16((uint16_t)0xd800); - const __m128i forbidden_bytemask = - _mm_cmpeq_epi16(_mm_and_si128(utf16_packed, v_f800), v_d800); - if (static_cast(_mm_movemask_epi8(forbidden_bytemask)) != 0) { + if (__lasx_xbz_v(__lasx_xvpickod_h(in1, in0))) { + __m256i utf16_packed = + __lasx_xvpermi_d(__lasx_xvpickev_h(in1, in0), 0b11011000); + forbidden_bytemask = __lasx_xvor_v( + __lasx_xvand_v( + __lasx_xvsle_h(utf16_packed, v_dfff), // utf16_packed <= 0xdfff + __lasx_xvsle_h(v_d800, utf16_packed)), // utf16_packed >= 0xd800 + forbidden_bytemask); + if (__lasx_xbnz_v(forbidden_bytemask)) { return std::make_pair(result(error_code::SURROGATE, buf - start), - utf16_output); + reinterpret_cast(utf16_output)); } - if (big_endian) { - const __m128i swap = - _mm_setr_epi8(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); - utf16_packed = _mm_shuffle_epi8(utf16_packed, swap); + if (!match_system(big_endian)) { + utf16_packed = lasx_swap_bytes(utf16_packed); } - _mm_storeu_si128((__m128i *)utf16_output, utf16_packed); - utf16_output += 8; - buf += 8; + __lasx_xvst(utf16_packed, utf16_output, 0); + utf16_output += 16; + buf += 16; } else { - size_t forward = 7; + size_t forward = 15; size_t k = 0; if (size_t(end - buf) < forward + 1) { forward = size_t(end - buf - 1); @@ -37746,26 +52793,26 @@ sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, // will not generate a surrogate pair if (word >= 0xD800 && word <= 0xDFFF) { return std::make_pair( - result(error_code::SURROGATE, buf - start + k), utf16_output); + result(error_code::SURROGATE, buf - start + k), + reinterpret_cast(utf16_output)); } - *utf16_output++ = - big_endian - ? char16_t((uint16_t(word) >> 8) | (uint16_t(word) << 8)) - : char16_t(word); + *utf16_output++ = !match_system(big_endian) + ? 
char16_t(word >> 8 | word << 8) + : char16_t(word); } else { // will generate a surrogate pair if (word > 0x10FFFF) { return std::make_pair( - result(error_code::TOO_LARGE, buf - start + k), utf16_output); + result(error_code::TOO_LARGE, buf - start + k), + reinterpret_cast(utf16_output)); } word -= 0x10000; uint16_t high_surrogate = uint16_t(0xD800 + (word >> 10)); uint16_t low_surrogate = uint16_t(0xDC00 + (word & 0x3FF)); - if (big_endian) { + if (!match_system(big_endian)) { high_surrogate = - uint16_t((high_surrogate >> 8) | (high_surrogate << 8)); - low_surrogate = - uint16_t((low_surrogate >> 8) | (low_surrogate << 8)); + uint16_t(high_surrogate >> 8 | high_surrogate << 8); + low_surrogate = uint16_t(low_surrogate << 8 | low_surrogate >> 8); } *utf16_output++ = char16_t(high_surrogate); *utf16_output++ = char16_t(low_surrogate); @@ -37775,10 +52822,11 @@ sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, } } - return std::make_pair(result(error_code::SUCCESS, buf - start), utf16_output); + return std::make_pair(result(error_code::SUCCESS, buf - start), + reinterpret_cast(utf16_output)); } -/* end file src/westmere/sse_convert_utf32_to_utf16.cpp */ -/* begin file src/westmere/sse_base64.cpp */ +/* end file src/lasx/lasx_convert_utf32_to_utf16.cpp */ +/* begin file src/lasx/lasx_base64.cpp */ /** * References and further reading: * @@ -37806,36 +52854,6 @@ sse_convert_utf32_to_utf16_with_errors(const char32_t *buf, size_t len, * Nick Kopp. 2013. Base64 Encoding on a GPU. * https://www.codeproject.com/Articles/276993/Base-Encoding-on-a-GPU. (2013). */ -template __m128i lookup_pshufb_improved(const __m128i input) { - // credit: Wojciech Muła - // reduce 0..51 -> 0 - // 52..61 -> 1 .. 10 - // 62 -> 11 - // 63 -> 12 - __m128i result = _mm_subs_epu8(input, _mm_set1_epi8(51)); - - // distinguish between ranges 0..25 and 26..51: - // 0 .. 25 -> remains 0 - // 26 .. 51 -> becomes 13 - const __m128i less = _mm_cmpgt_epi8(_mm_set1_epi8(26), input); - result = _mm_or_si128(result, _mm_and_si128(less, _mm_set1_epi8(13))); - - __m128i shift_LUT; - if (base64_url) { - shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '-' - 62, '_' - 63, 'A', 0, 0); - } else { - shift_LUT = _mm_setr_epi8('a' - 26, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '0' - 52, '0' - 52, '0' - 52, '0' - 52, - '0' - 52, '+' - 62, '/' - 63, 'A', 0, 0); - } - - // read shift - result = _mm_shuffle_epi8(shift_LUT, result); - - return _mm_add_epi8(result, input); -} template size_t encode_base64(char *dst, const char *src, size_t srclen, @@ -37843,71 +52861,124 @@ size_t encode_base64(char *dst, const char *src, size_t srclen, // credit: Wojciech Muła // SSE (lookup: pshufb improved unrolled) const uint8_t *input = (const uint8_t *)src; - + static const char *lookup_tbl = + isbase64url + ? 
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_" + : "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; uint8_t *out = (uint8_t *)dst; - const __m128i shuf = - _mm_set_epi8(10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1); + v32u8 shuf; + __m256i v_fc0fc00, v_3f03f0, shift_r, shift_l, base64_tbl0, base64_tbl1, + base64_tbl2, base64_tbl3; + if (srclen >= 28) { + shuf = v32u8{1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10, + 1, 0, 2, 1, 4, 3, 5, 4, 7, 6, 8, 7, 10, 9, 11, 10}; + + v_fc0fc00 = __lasx_xvreplgr2vr_w(uint32_t(0x0fc0fc00)); + v_3f03f0 = __lasx_xvreplgr2vr_w(uint32_t(0x003f03f0)); + shift_r = __lasx_xvreplgr2vr_w(uint32_t(0x0006000a)); + shift_l = __lasx_xvreplgr2vr_w(uint32_t(0x00080004)); + base64_tbl0 = ____m256i(__lsx_vld(lookup_tbl, 0)); + base64_tbl1 = ____m256i(__lsx_vld(lookup_tbl, 16)); + base64_tbl2 = ____m256i(__lsx_vld(lookup_tbl, 32)); + base64_tbl3 = ____m256i(__lsx_vld(lookup_tbl, 48)); + } size_t i = 0; - for (; i + 52 <= srclen; i += 48) { - __m128i in0 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 0)); - __m128i in1 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 1)); - __m128i in2 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 2)); - __m128i in3 = _mm_loadu_si128( - reinterpret_cast(input + i + 4 * 3 * 3)); - - in0 = _mm_shuffle_epi8(in0, shuf); - in1 = _mm_shuffle_epi8(in1, shuf); - in2 = _mm_shuffle_epi8(in2, shuf); - in3 = _mm_shuffle_epi8(in3, shuf); - - const __m128i t0_0 = _mm_and_si128(in0, _mm_set1_epi32(0x0fc0fc00)); - const __m128i t0_1 = _mm_and_si128(in1, _mm_set1_epi32(0x0fc0fc00)); - const __m128i t0_2 = _mm_and_si128(in2, _mm_set1_epi32(0x0fc0fc00)); - const __m128i t0_3 = _mm_and_si128(in3, _mm_set1_epi32(0x0fc0fc00)); - - const __m128i t1_0 = _mm_mulhi_epu16(t0_0, _mm_set1_epi32(0x04000040)); - const __m128i t1_1 = _mm_mulhi_epu16(t0_1, _mm_set1_epi32(0x04000040)); - const __m128i t1_2 = _mm_mulhi_epu16(t0_2, _mm_set1_epi32(0x04000040)); - const __m128i t1_3 = _mm_mulhi_epu16(t0_3, _mm_set1_epi32(0x04000040)); - - const __m128i t2_0 = _mm_and_si128(in0, _mm_set1_epi32(0x003f03f0)); - const __m128i t2_1 = _mm_and_si128(in1, _mm_set1_epi32(0x003f03f0)); - const __m128i t2_2 = _mm_and_si128(in2, _mm_set1_epi32(0x003f03f0)); - const __m128i t2_3 = _mm_and_si128(in3, _mm_set1_epi32(0x003f03f0)); - - const __m128i t3_0 = _mm_mullo_epi16(t2_0, _mm_set1_epi32(0x01000010)); - const __m128i t3_1 = _mm_mullo_epi16(t2_1, _mm_set1_epi32(0x01000010)); - const __m128i t3_2 = _mm_mullo_epi16(t2_2, _mm_set1_epi32(0x01000010)); - const __m128i t3_3 = _mm_mullo_epi16(t2_3, _mm_set1_epi32(0x01000010)); - - const __m128i input0 = _mm_or_si128(t1_0, t3_0); - const __m128i input1 = _mm_or_si128(t1_1, t3_1); - const __m128i input2 = _mm_or_si128(t1_2, t3_2); - const __m128i input3 = _mm_or_si128(t1_3, t3_3); - - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), - lookup_pshufb_improved(input0)); - out += 16; + for (; i + 100 <= srclen; i += 96) { + __m128i in0_lo = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 0); + __m128i in0_hi = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 1); + __m128i in1_lo = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 2); + __m128i in1_hi = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 3); + __m128i in2_lo = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 4); + __m128i in2_hi = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 5); + __m128i in3_lo = + __lsx_vld(reinterpret_cast(input + i), 4 * 3 * 6); + __m128i in3_hi = + __lsx_vld(reinterpret_cast(input + 
i), 4 * 3 * 7); + + __m256i in0 = lasx_set_q(in0_hi, in0_lo); + __m256i in1 = lasx_set_q(in1_hi, in1_lo); + __m256i in2 = lasx_set_q(in2_hi, in2_lo); + __m256i in3 = lasx_set_q(in3_hi, in3_lo); + + in0 = __lasx_xvshuf_b(in0, in0, (__m256i)shuf); + in1 = __lasx_xvshuf_b(in1, in1, (__m256i)shuf); + in2 = __lasx_xvshuf_b(in2, in2, (__m256i)shuf); + in3 = __lasx_xvshuf_b(in3, in3, (__m256i)shuf); + + __m256i t0_0 = __lasx_xvand_v(in0, v_fc0fc00); + __m256i t0_1 = __lasx_xvand_v(in1, v_fc0fc00); + __m256i t0_2 = __lasx_xvand_v(in2, v_fc0fc00); + __m256i t0_3 = __lasx_xvand_v(in3, v_fc0fc00); + + __m256i t1_0 = __lasx_xvsrl_h(t0_0, shift_r); + __m256i t1_1 = __lasx_xvsrl_h(t0_1, shift_r); + __m256i t1_2 = __lasx_xvsrl_h(t0_2, shift_r); + __m256i t1_3 = __lasx_xvsrl_h(t0_3, shift_r); + + __m256i t2_0 = __lasx_xvand_v(in0, v_3f03f0); + __m256i t2_1 = __lasx_xvand_v(in1, v_3f03f0); + __m256i t2_2 = __lasx_xvand_v(in2, v_3f03f0); + __m256i t2_3 = __lasx_xvand_v(in3, v_3f03f0); + + __m256i t3_0 = __lasx_xvsll_h(t2_0, shift_l); + __m256i t3_1 = __lasx_xvsll_h(t2_1, shift_l); + __m256i t3_2 = __lasx_xvsll_h(t2_2, shift_l); + __m256i t3_3 = __lasx_xvsll_h(t2_3, shift_l); + + __m256i input0 = __lasx_xvor_v(t1_0, t3_0); + __m256i input0_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input0); + __m256i input0_shuf1 = __lasx_xvshuf_b( + base64_tbl3, base64_tbl2, __lasx_xvsub_b(input0, __lasx_xvldi(32))); + __m256i input0_mask = __lasx_xvslei_bu(input0, 31); + __m256i input0_result = + __lasx_xvbitsel_v(input0_shuf1, input0_shuf0, input0_mask); + __lasx_xvst(input0_result, reinterpret_cast<__m256i *>(out), 0); + out += 32; - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), - lookup_pshufb_improved(input1)); - out += 16; + __m256i input1 = __lasx_xvor_v(t1_1, t3_1); + __m256i input1_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input1); + __m256i input1_shuf1 = __lasx_xvshuf_b( + base64_tbl3, base64_tbl2, __lasx_xvsub_b(input1, __lasx_xvldi(32))); + __m256i input1_mask = __lasx_xvslei_bu(input1, 31); + __m256i input1_result = + __lasx_xvbitsel_v(input1_shuf1, input1_shuf0, input1_mask); + __lasx_xvst(input1_result, reinterpret_cast<__m256i *>(out), 0); + out += 32; - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), - lookup_pshufb_improved(input2)); - out += 16; + __m256i input2 = __lasx_xvor_v(t1_2, t3_2); + __m256i input2_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input2); + __m256i input2_shuf1 = __lasx_xvshuf_b( + base64_tbl3, base64_tbl2, __lasx_xvsub_b(input2, __lasx_xvldi(32))); + __m256i input2_mask = __lasx_xvslei_bu(input2, 31); + __m256i input2_result = + __lasx_xvbitsel_v(input2_shuf1, input2_shuf0, input2_mask); + __lasx_xvst(input2_result, reinterpret_cast<__m256i *>(out), 0); + out += 32; - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), - lookup_pshufb_improved(input3)); - out += 16; + __m256i input3 = __lasx_xvor_v(t1_3, t3_3); + __m256i input3_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, input3); + __m256i input3_shuf1 = __lasx_xvshuf_b( + base64_tbl3, base64_tbl2, __lasx_xvsub_b(input3, __lasx_xvldi(32))); + __m256i input3_mask = __lasx_xvslei_bu(input3, 31); + __m256i input3_result = + __lasx_xvbitsel_v(input3_shuf1, input3_shuf0, input3_mask); + __lasx_xvst(input3_result, reinterpret_cast<__m256i *>(out), 0); + out += 32; } - for (; i + 16 <= srclen; i += 12) { + for (; i + 28 <= srclen; i += 24) { - __m128i in = _mm_loadu_si128(reinterpret_cast(input + i)); + __m128i in_lo = __lsx_vld(reinterpret_cast(input + i), 0); + __m128i in_hi = + 
__lsx_vld(reinterpret_cast(input + i), 4 * 3 * 1); + + __m256i in = lasx_set_q(in_hi, in_lo); // bytes from groups A, B and C are needed in separate 32-bit lanes // in = [DDDD|CCCC|BBBB|AAAA] @@ -37921,40 +52992,43 @@ size_t encode_base64(char *dst, const char *src, size_t srclen, // [bbbbcccc|ccdddddd|aaaaaabb|bbbbcccc] // ^^^^ ^^^^^^^^ ^^^^^^^^ ^^^^ // processed bits - in = _mm_shuffle_epi8(in, shuf); + in = __lasx_xvshuf_b(in, in, (__m256i)shuf); // unpacking - // t0 = [0000cccc|cc000000|aaaaaa00|00000000] - const __m128i t0 = _mm_and_si128(in, _mm_set1_epi32(0x0fc0fc00)); + __m256i t0 = __lasx_xvand_v(in, v_fc0fc00); // t1 = [00000000|00cccccc|00000000|00aaaaaa] - // (c * (1 << 10), a * (1 << 6)) >> 16 (note: an unsigned - // multiplication) - const __m128i t1 = _mm_mulhi_epu16(t0, _mm_set1_epi32(0x04000040)); + // ((c >> 6), (a >> 10)) + __m256i t1 = __lasx_xvsrl_h(t0, shift_r); // t2 = [00000000|00dddddd|000000bb|bbbb0000] - const __m128i t2 = _mm_and_si128(in, _mm_set1_epi32(0x003f03f0)); - // t3 = [00dddddd|00000000|00bbbbbb|00000000]( - // (d * (1 << 8), b * (1 << 4)) - const __m128i t3 = _mm_mullo_epi16(t2, _mm_set1_epi32(0x01000010)); + __m256i t2 = __lasx_xvand_v(in, v_3f03f0); + // t3 = [00dddddd|00000000|00bbbbbb|00000000] + // ((d << 8), (b << 4)) + __m256i t3 = __lasx_xvsll_h(t2, shift_l); // res = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] = t1 | t3 - const __m128i indices = _mm_or_si128(t1, t3); - - _mm_storeu_si128(reinterpret_cast<__m128i *>(out), - lookup_pshufb_improved(indices)); - out += 16; + __m256i indices = __lasx_xvor_v(t1, t3); + + __m256i indices_shuf0 = __lasx_xvshuf_b(base64_tbl1, base64_tbl0, indices); + __m256i indices_shuf1 = __lasx_xvshuf_b( + base64_tbl3, base64_tbl2, __lasx_xvsub_b(indices, __lasx_xvldi(32))); + __m256i indices_mask = __lasx_xvslei_bu(indices, 31); + __m256i indices_result = + __lasx_xvbitsel_v(indices_shuf1, indices_shuf0, indices_mask); + __lasx_xvst(indices_result, reinterpret_cast<__m256i *>(out), 0); + out += 32; } return i / 3 * 4 + scalar::base64::tail_encode_base64((char *)out, src + i, srclen - i, options); } + static inline void compress(__m128i data, uint16_t mask, char *output) { if (mask == 0) { - _mm_storeu_si128(reinterpret_cast<__m128i *>(output), data); + __lsx_vst(data, reinterpret_cast<__m128i *>(output), 0); return; } - // this particular implementation was inspired by work done by @animetosho // we do it in two steps, first 8 bytes and then second 8 bytes uint8_t mask1 = uint8_t(mask); // least significant 8 bits @@ -37963,13 +53037,15 @@ static inline void compress(__m128i data, uint16_t mask, char *output) { // thintable_epi8[mask2] into a 128-bit register, using only // two instructions on most compilers. - __m128i shufmask = _mm_set_epi64x(tables::base64::thintable_epi8[mask2], - tables::base64::thintable_epi8[mask1]); + v2u64 shufmask = {tables::base64::thintable_epi8[mask1], + tables::base64::thintable_epi8[mask2]}; + // we increment by 0x08 the second half of the mask - shufmask = - _mm_add_epi8(shufmask, _mm_set_epi32(0x08080808, 0x08080808, 0, 0)); + const v4u32 hi = {0, 0, 0x08080808, 0x08080808}; + __m128i shufmask1 = __lsx_vadd_b((__m128i)shufmask, (__m128i)hi); + // this is the version "nearly pruned" - __m128i pruned = _mm_shuffle_epi8(data, shufmask); + __m128i pruned = __lsx_vshuf_b(data, data, shufmask1); // we still need to put the two halves together. 
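Before the two pruned halves are joined by the popcount bookkeeping below, the net effect of compress() is easier to see in scalar form: every byte whose mask bit is set is dropped, the rest are written out contiguously. A sketch (name illustrative):

#include <cstddef>
#include <cstdint>

// Keep the bytes whose mask bit is clear; the count written equals
// popcount(~mask) over the low 16 bits, which is what the vector caller
// uses to advance its output pointer.
static size_t compress_scalar(const uint8_t data[16], uint16_t mask,
                              uint8_t *output) {
  size_t written = 0;
  for (int i = 0; i < 16; i++) {
    if (((mask >> i) & 1) == 0) {
      output[written++] = data[i];
    }
  }
  return written;
}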
// we compute the popcount of the first half: int pop1 = tables::base64::BitsSetTable256mul2[mask1]; @@ -37977,212 +53053,185 @@ static inline void compress(__m128i data, uint16_t mask, char *output) { // only the first pop1 bytes from the first 8 bytes, and then // it fills in with the bytes from the second 8 bytes + some filling // at the end. - __m128i compactmask = _mm_loadu_si128(reinterpret_cast( - tables::base64::pshufb_combine_table + pop1 * 8)); - __m128i answer = _mm_shuffle_epi8(pruned, compactmask); - _mm_storeu_si128(reinterpret_cast<__m128i *>(output), answer); + __m128i compactmask = + __lsx_vld(reinterpret_cast( + tables::base64::pshufb_combine_table + pop1 * 8), + 0); + __m128i answer = __lsx_vshuf_b(pruned, pruned, compactmask); + + __lsx_vst(answer, reinterpret_cast<__m128i *>(output), 0); } struct block64 { - __m128i chunks[4]; + __m256i chunks[2]; }; template -static inline uint16_t to_base64_mask(__m128i *src, uint32_t *error) { - const __m128i ascii_space_tbl = - _mm_setr_epi8(0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x9, 0xa, 0x0, - 0xc, 0xd, 0x0, 0x0); +static inline uint32_t to_base64_mask(__m256i *src, bool *error) { + __m256i ascii_space_tbl = + ____m256i((__m128i)v16u8{0x20, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x9, 0xa, 0x0, 0xc, 0xd, 0x0, 0x0}); // credit: aqrit - __m128i delta_asso; + __m256i delta_asso = + ____m256i((__m128i)v16u8{0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, + 0x0, 0x0, 0x0, 0xF, 0x0, 0xF}); + __m256i delta_values; if (base64_url) { - delta_asso = _mm_setr_epi8(0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x0, 0x0, - 0x0, 0x0, 0x0, 0xF, 0x0, 0xF); + delta_values = ____m256i( + (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x11), int8_t(0xC3), + int8_t(0xBF), int8_t(0xE0), int8_t(0xB9), int8_t(0xB9)}); } else { - - delta_asso = _mm_setr_epi8(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x0F); + delta_values = ____m256i( + (__m128i)v16i8{int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), + int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), + int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3), + int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)}); } - __m128i delta_values; - if (base64_url) { - delta_values = _mm_setr_epi8(0x0, 0x0, 0x0, 0x13, 0x4, uint8_t(0xBF), - uint8_t(0xBF), uint8_t(0xB9), uint8_t(0xB9), - 0x0, 0x11, uint8_t(0xC3), uint8_t(0xBF), - uint8_t(0xE0), uint8_t(0xB9), uint8_t(0xB9)); - } else { - delta_values = - _mm_setr_epi8(int8_t(0x00), int8_t(0x00), int8_t(0x00), int8_t(0x13), - int8_t(0x04), int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), - int8_t(0xB9), int8_t(0x00), int8_t(0x10), int8_t(0xC3), - int8_t(0xBF), int8_t(0xBF), int8_t(0xB9), int8_t(0xB9)); - } - __m128i check_asso; + __m256i check_asso; if (base64_url) { - check_asso = _mm_setr_epi8(0xD, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, 0x1, - 0x3, 0x7, 0xB, 0xE, 0xB, 0x6); + check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, + 0x0B, 0x06, 0x0B, 0x12}); } else { - - check_asso = _mm_setr_epi8(0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, - 0x01, 0x01, 0x03, 0x07, 0x0B, 0x0B, 0x0B, 0x0F); + check_asso = ____m256i((__m128i)v16u8{0x0D, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x03, 0x07, + 0x0B, 0x0B, 0x0B, 0x0F}); } - __m128i check_values; + + __m256i check_values; if (base64_url) { - check_values = 
_mm_setr_epi8(uint8_t(0x80), uint8_t(0x80), uint8_t(0x80), - uint8_t(0x80), uint8_t(0xCF), uint8_t(0xBF), - uint8_t(0xB6), uint8_t(0xA6), uint8_t(0xB5), - uint8_t(0xA1), 0x0, uint8_t(0x80), 0x0, - uint8_t(0x80), 0x0, uint8_t(0x80)); + check_values = ____m256i( + (__m128i)v16i8{int8_t(0x0), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD3), int8_t(0xA6), + int8_t(0xB5), int8_t(0x86), int8_t(0xD0), int8_t(0x80), + int8_t(0xB0), int8_t(0x80), int8_t(0x0), int8_t(0x0)}); } else { - - check_values = - _mm_setr_epi8(int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), - int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), - int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80), - int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)); - } - const __m128i shifted = _mm_srli_epi32(*src, 3); - - const __m128i delta_hash = - _mm_avg_epu8(_mm_shuffle_epi8(delta_asso, *src), shifted); - const __m128i check_hash = - _mm_avg_epu8(_mm_shuffle_epi8(check_asso, *src), shifted); - - const __m128i out = - _mm_adds_epi8(_mm_shuffle_epi8(delta_values, delta_hash), *src); - const __m128i chk = - _mm_adds_epi8(_mm_shuffle_epi8(check_values, check_hash), *src); - const int mask = _mm_movemask_epi8(chk); + check_values = ____m256i( + (__m128i)v16i8{int8_t(0x80), int8_t(0x80), int8_t(0x80), int8_t(0x80), + int8_t(0xCF), int8_t(0xBF), int8_t(0xD5), int8_t(0xA6), + int8_t(0xB5), int8_t(0x86), int8_t(0xD1), int8_t(0x80), + int8_t(0xB1), int8_t(0x80), int8_t(0x91), int8_t(0x80)}); + } + + __m256i shifted = __lasx_xvsrli_b(*src, 3); + __m256i asso_index = __lasx_xvand_v(*src, __lasx_xvldi(0xF)); + __m256i delta_hash = __lasx_xvavgr_bu( + __lasx_xvshuf_b(delta_asso, delta_asso, asso_index), shifted); + __m256i check_hash = __lasx_xvavgr_bu( + __lasx_xvshuf_b(check_asso, check_asso, asso_index), shifted); + + __m256i out = __lasx_xvsadd_b( + __lasx_xvshuf_b(delta_values, delta_values, delta_hash), *src); + __m256i chk = __lasx_xvsadd_b( + __lasx_xvshuf_b(check_values, check_values, check_hash), *src); + __m256i chk_ltz = __lasx_xvmskltz_b(chk); + unsigned int mask = __lasx_xvpickve2gr_wu(chk_ltz, 0); + mask = mask | (__lsx_vpickve2gr_hu(lasx_extracti128_hi(chk_ltz), 0) << 16); if (mask) { - __m128i ascii_space = - _mm_cmpeq_epi8(_mm_shuffle_epi8(ascii_space_tbl, *src), *src); - *error = (mask ^ _mm_movemask_epi8(ascii_space)); + __m256i ascii_space = __lasx_xvseq_b( + __lasx_xvshuf_b(ascii_space_tbl, ascii_space_tbl, asso_index), *src); + __m256i ascii_space_ltz = __lasx_xvmskltz_b(ascii_space); + unsigned int ascii_space_mask = __lasx_xvpickve2gr_wu(ascii_space_ltz, 0); + ascii_space_mask = + ascii_space_mask | + (__lsx_vpickve2gr_hu(lasx_extracti128_hi(ascii_space_ltz), 0) << 16); + *error |= (mask != ascii_space_mask); } + *src = out; - return (uint16_t)mask; + return (uint32_t)mask; } template -static inline uint64_t to_base64_mask(block64 *b, uint64_t *error) { - uint32_t err0 = 0; - uint32_t err1 = 0; - uint32_t err2 = 0; - uint32_t err3 = 0; - uint64_t m0 = to_base64_mask(&b->chunks[0], &err0); - uint64_t m1 = to_base64_mask(&b->chunks[1], &err1); - uint64_t m2 = to_base64_mask(&b->chunks[2], &err2); - uint64_t m3 = to_base64_mask(&b->chunks[3], &err3); - *error = (err0) | ((uint64_t)err1 << 16) | ((uint64_t)err2 << 32) | - ((uint64_t)err3 << 48); - return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48); -} - -#if defined(_MSC_VER) && !defined(__clang__) -static inline size_t simdutf_tzcnt_u64(uint64_t num) { - unsigned long ret; - if (num == 0) { - return 64; - } - 
_BitScanForward64(&ret, num); - return ret; -} -#else // GCC or Clang -static inline size_t simdutf_tzcnt_u64(uint64_t num) { - return num ? __builtin_ctzll(num) : 64; +static inline uint64_t to_base64_mask(block64 *b, bool *error) { + *error = 0; + uint64_t m0 = to_base64_mask(&b->chunks[0], error); + uint64_t m1 = to_base64_mask(&b->chunks[1], error); + return m0 | (m1 << 32); } -#endif static inline void copy_block(block64 *b, char *output) { - _mm_storeu_si128(reinterpret_cast<__m128i *>(output), b->chunks[0]); - _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 16), b->chunks[1]); - _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 32), b->chunks[2]); - _mm_storeu_si128(reinterpret_cast<__m128i *>(output + 48), b->chunks[3]); + __lasx_xvst(b->chunks[0], reinterpret_cast<__m256i *>(output), 0); + __lasx_xvst(b->chunks[1], reinterpret_cast<__m256i *>(output), 32); } static inline uint64_t compress_block(block64 *b, uint64_t mask, char *output) { uint64_t nmask = ~mask; - compress(b->chunks[0], uint16_t(mask), output); - compress(b->chunks[1], uint16_t(mask >> 16), - output + _mm_popcnt_u64(nmask & 0xFFFF)); - compress(b->chunks[2], uint16_t(mask >> 32), - output + _mm_popcnt_u64(nmask & 0xFFFFFFFF)); - compress(b->chunks[3], uint16_t(mask >> 48), - output + _mm_popcnt_u64(nmask & 0xFFFFFFFFFFFFULL)); - return _mm_popcnt_u64(nmask); + uint64_t count = + __lsx_vpickve2gr_d(__lsx_vpcnt_h(__lsx_vreplgr2vr_d(nmask)), 0); + uint16_t *count_ptr = (uint16_t *)&count; + compress(lasx_extracti128_lo(b->chunks[0]), uint16_t(mask), output); + compress(lasx_extracti128_hi(b->chunks[0]), uint16_t(mask >> 16), + output + count_ptr[0]); + compress(lasx_extracti128_lo(b->chunks[1]), uint16_t(mask >> 32), + output + count_ptr[0] + count_ptr[1]); + compress(lasx_extracti128_hi(b->chunks[1]), uint16_t(mask >> 48), + output + count_ptr[0] + count_ptr[1] + count_ptr[2]); + return count_ones(nmask); } // The caller of this function is responsible to ensure that there are 64 bytes // available from reading at src. The data is read into a block64 structure. static inline void load_block(block64 *b, const char *src) { - b->chunks[0] = _mm_loadu_si128(reinterpret_cast(src)); - b->chunks[1] = _mm_loadu_si128(reinterpret_cast(src + 16)); - b->chunks[2] = _mm_loadu_si128(reinterpret_cast(src + 32)); - b->chunks[3] = _mm_loadu_si128(reinterpret_cast(src + 48)); + b->chunks[0] = __lasx_xvld(reinterpret_cast(src), 0); + b->chunks[1] = __lasx_xvld(reinterpret_cast(src), 32); } // The caller of this function is responsible to ensure that there are 128 bytes // available from reading at src. The data is read into a block64 structure. 
static inline void load_block(block64 *b, const char16_t *src) { - __m128i m1 = _mm_loadu_si128(reinterpret_cast(src)); - __m128i m2 = _mm_loadu_si128(reinterpret_cast(src + 8)); - __m128i m3 = _mm_loadu_si128(reinterpret_cast(src + 16)); - __m128i m4 = _mm_loadu_si128(reinterpret_cast(src + 24)); - __m128i m5 = _mm_loadu_si128(reinterpret_cast(src + 32)); - __m128i m6 = _mm_loadu_si128(reinterpret_cast(src + 40)); - __m128i m7 = _mm_loadu_si128(reinterpret_cast(src + 48)); - __m128i m8 = _mm_loadu_si128(reinterpret_cast(src + 56)); - b->chunks[0] = _mm_packus_epi16(m1, m2); - b->chunks[1] = _mm_packus_epi16(m3, m4); - b->chunks[2] = _mm_packus_epi16(m5, m6); - b->chunks[3] = _mm_packus_epi16(m7, m8); + __m256i m1 = __lasx_xvld(reinterpret_cast(src), 0); + __m256i m2 = __lasx_xvld(reinterpret_cast(src), 32); + __m256i m3 = __lasx_xvld(reinterpret_cast(src), 64); + __m256i m4 = __lasx_xvld(reinterpret_cast(src), 96); + b->chunks[0] = __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(m2, m1, 0), 0b11011000); + b->chunks[1] = __lasx_xvpermi_d(__lasx_xvssrlni_bu_h(m4, m3, 0), 0b11011000); } -static inline void base64_decode(char *out, __m128i str) { - // credit: aqrit - - const __m128i pack_shuffle = - _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1); +static inline void base64_decode(char *out, __m256i str) { + __m256i t0 = __lasx_xvor_v( + __lasx_xvslli_w(str, 26), + __lasx_xvslli_w(__lasx_xvand_v(str, __lasx_xvldi(-1758 /*0x0000FF00*/)), + 12)); + __m256i t1 = __lasx_xvsrli_w( + __lasx_xvand_v(str, __lasx_xvldi(-3521 /*0x003F0000*/)), 2); + __m256i t2 = __lasx_xvor_v(t0, t1); + __m256i t3 = __lasx_xvor_v(t2, __lasx_xvsrli_w(str, 16)); + __m256i pack_shuffle = ____m256i( + (__m128i)v16u8{3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, 0, 0, 0, 0}); + t3 = __lasx_xvshuf_b(t3, t3, (__m256i)pack_shuffle); - const __m128i t0 = _mm_maddubs_epi16(str, _mm_set1_epi32(0x01400140)); - const __m128i t1 = _mm_madd_epi16(t0, _mm_set1_epi32(0x00011000)); - const __m128i t2 = _mm_shuffle_epi8(t1, pack_shuffle); // Store the output: - // this writes 16 bytes, but we only need 12. 
- _mm_storeu_si128((__m128i *)out, t2); + __lsx_vst(lasx_extracti128_lo(t3), out, 0); + __lsx_vst(lasx_extracti128_hi(t3), out, 12); } // decode 64 bytes and output 48 bytes static inline void base64_decode_block(char *out, const char *src) { - base64_decode(out, _mm_loadu_si128(reinterpret_cast(src))); - base64_decode(out + 12, - _mm_loadu_si128(reinterpret_cast(src + 16))); + base64_decode(out, __lasx_xvld(reinterpret_cast(src), 0)); base64_decode(out + 24, - _mm_loadu_si128(reinterpret_cast(src + 32))); - base64_decode(out + 36, - _mm_loadu_si128(reinterpret_cast(src + 48))); + __lasx_xvld(reinterpret_cast(src), 32)); } + static inline void base64_decode_block_safe(char *out, const char *src) { - base64_decode(out, _mm_loadu_si128(reinterpret_cast(src))); - base64_decode(out + 12, - _mm_loadu_si128(reinterpret_cast(src + 16))); - base64_decode(out + 24, - _mm_loadu_si128(reinterpret_cast(src + 32))); - char buffer[16]; + base64_decode(out, __lasx_xvld(reinterpret_cast(src), 0)); + char buffer[32]; base64_decode(buffer, - _mm_loadu_si128(reinterpret_cast(src + 48))); - std::memcpy(out + 36, buffer, 12); + __lasx_xvld(reinterpret_cast(src), 32)); + std::memcpy(out + 24, buffer, 24); } + static inline void base64_decode_block(char *out, block64 *b) { base64_decode(out, b->chunks[0]); - base64_decode(out + 12, b->chunks[1]); - base64_decode(out + 24, b->chunks[2]); - base64_decode(out + 36, b->chunks[3]); + base64_decode(out + 24, b->chunks[1]); } static inline void base64_decode_block_safe(char *out, block64 *b) { base64_decode(out, b->chunks[0]); - base64_decode(out + 12, b->chunks[1]); - base64_decode(out + 24, b->chunks[2]); - char buffer[16]; - base64_decode(buffer, b->chunks[3]); - std::memcpy(out + 36, buffer, 12); + char buffer[32]; + base64_decode(buffer, b->chunks[1]); + std::memcpy(out + 24, buffer, 24); } template @@ -38229,7 +53278,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, const chartype *const srcend = src + srclen; constexpr size_t block_size = 6; - static_assert(block_size >= 2, "block should of size 2 or more"); + static_assert(block_size >= 2, "block_size must be at least two"); char buffer[block_size * 64]; char *bufferptr = buffer; if (srclen >= 64) { @@ -38238,13 +53287,16 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, block64 b; load_block(&b, src); src += 64; - uint64_t error = 0; + bool error = false; uint64_t badcharmask = to_base64_mask(&b, &error); if (error) { src -= 64; - size_t error_offset = simdutf_tzcnt_u64(error); - return {error_code::INVALID_BASE64_CHARACTER, - size_t(src - srcinit + error_offset), size_t(dst - dstinit)}; + while (src < srcend && scalar::base64::is_eight_byte(*src) && + to_base64[uint8_t(*src)] <= 64) { + src++; + } + return {error_code::INVALID_BASE64_CHARACTER, size_t(src - srcinit), + size_t(dst - dstinit)}; } if (badcharmask != 0) { // optimization opportunity: check for simple masks like those made of @@ -38285,6 +53337,7 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, // time, otherwise, we should just decode directly. 
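Taken together, to_base64_mask, compress() and base64_decode give the same result as the following scalar routine: map each character to a 6-bit value, silently skip ASCII whitespace, stop on anything outside the alphabet, and pack every four 6-bit values into three bytes. The table layout here is illustrative (0..63 data, 64 whitespace, 255 invalid), and '=' padding and trailing partial quanta are left out, so this is a reference sketch rather than a drop-in replacement:

#include <cstddef>
#include <cstdint>
#include <cstring>

static size_t decode_base64_scalar(const char *src, size_t len, char *dst) {
  uint8_t table[256];
  std::memset(table, 255, sizeof(table)); // 255 = invalid
  const char *alphabet =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  for (int i = 0; i < 64; i++) {
    table[uint8_t(alphabet[i])] = uint8_t(i);
  }
  table[uint8_t(' ')] = table[uint8_t('\t')] = table[uint8_t('\n')] =
      table[uint8_t('\f')] = table[uint8_t('\r')] = 64; // 64 = whitespace

  uint32_t quantum = 0;
  int have = 0;
  size_t written = 0;
  for (size_t i = 0; i < len; i++) {
    uint8_t v = table[uint8_t(src[i])];
    if (v == 64) {
      continue; // whitespace is pruned, not decoded
    }
    if (v > 64) {
      break; // invalid character; the real code reports its position
    }
    quantum = (quantum << 6) | v;
    if (++have == 4) { // four 6-bit values -> three bytes
      dst[written++] = char(quantum >> 16);
      dst[written++] = char(quantum >> 8);
      dst[written++] = char(quantum);
      quantum = 0;
      have = 0;
    }
  }
  return written;
}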
int last_block = (int)((bufferptr - buffer_start) % 64); if (last_block != 0 && srcend - src + last_block >= 64) { + while ((bufferptr - buffer_start) % 64 != 0 && src < srcend) { uint8_t val = to_base64[uint8_t(*src)]; *bufferptr = char(val); @@ -38370,15 +53423,15 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen, } return {SUCCESS, srclen, size_t(dst - dstinit)}; } -/* end file src/westmere/sse_base64.cpp */ +/* end file src/lasx/lasx_base64.cpp */ -} // unnamed namespace -} // namespace westmere +} // namespace +} // namespace lasx } // namespace simdutf /* begin file src/generic/buf_block_reader.h */ namespace simdutf { -namespace westmere { +namespace lasx { namespace { // Walks through a buffer in block-sized increments, loading the last part with @@ -38484,12 +53537,12 @@ simdutf_really_inline void buf_block_reader::advance() { } } // unnamed namespace -} // namespace westmere +} // namespace lasx } // namespace simdutf /* end file src/generic/buf_block_reader.h */ /* begin file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ namespace simdutf { -namespace westmere { +namespace lasx { namespace { namespace utf8_validation { @@ -38709,12 +53762,12 @@ struct utf8_checker { using utf8_validation::utf8_checker; } // unnamed namespace -} // namespace westmere +} // namespace lasx } // namespace simdutf /* end file src/generic/utf8_validation/utf8_lookup4_algorithm.h */ /* begin file src/generic/utf8_validation/utf8_validator.h */ namespace simdutf { -namespace westmere { +namespace lasx { namespace { namespace utf8_validation { @@ -38843,103 +53896,31 @@ result generic_validate_ascii_with_errors(const uint8_t *input, size_t length) { } result generic_validate_ascii_with_errors(const char *input, size_t length) { - return generic_validate_ascii_with_errors( - reinterpret_cast(input), length); -} - -} // namespace utf8_validation -} // unnamed namespace -} // namespace westmere -} // namespace simdutf -/* end file src/generic/utf8_validation/utf8_validator.h */ -// transcoding from UTF-8 to UTF-16 -/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ - -namespace simdutf { -namespace westmere { -namespace { -namespace utf8_to_utf16 { - -using namespace simd; - -template -simdutf_warn_unused size_t convert_valid(const char *input, size_t size, - char16_t *utf16_output) noexcept { - // The implementation is not specific to haswell and should be moved to the - // generic directory. - size_t pos = 0; - char16_t *start{utf16_output}; - const size_t safety_margin = 16; // to avoid overruns! - while (pos + 64 + safety_margin <= size) { - // this loop could be unrolled further. For example, we could process the - // mask far more than 64 bytes. - simd8x64 in(reinterpret_cast(input + pos)); - if (in.is_ascii()) { - in.store_ascii_as_utf16(utf16_output); - utf16_output += 64; - pos += 64; - } else { - // Slow path. We hope that the compiler will recognize that this is a slow - // path. Anything that is not a continuation mask is a 'leading byte', - // that is, the start of a new code point. - uint64_t utf8_continuation_mask = in.lt(-65 + 1); - // -65 is 0b10111111 in two-complement's, so largest possible continuation - // byte - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - // The *start* of code points is not so useful, rather, we want the *end* - // of code points. - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. 
For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times when using solely - // the slow/regular path, and at least four times if there are fast paths. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - // - // Thus we may allow convert_masked_utf8_to_utf16 to process - // more bytes at a time under a fast-path mode where 16 bytes - // are consumed at once (e.g., when encountering ASCII). - size_t consumed = convert_masked_utf8_to_utf16( - input + pos, utf8_end_of_code_point_mask, utf16_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. - } - } - utf16_output += scalar::utf8_to_utf16::convert_valid( - input + pos, size - pos, utf16_output); - return utf16_output - start; + return generic_validate_ascii_with_errors( + reinterpret_cast(input), length); } -} // namespace utf8_to_utf16 +} // namespace utf8_validation } // unnamed namespace -} // namespace westmere +} // namespace lasx } // namespace simdutf -/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ -/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +/* end file src/generic/utf8_validation/utf8_validator.h */ + +// transcoding from UTF-8 to Latin 1 +/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ namespace simdutf { -namespace westmere { +namespace lasx { namespace { -namespace utf8_to_utf16 { +namespace utf8_to_latin1 { using namespace simd; simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { + // For UTF-8 to Latin 1, we can allow any ASCII character, and any + // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or + // 0b11000010 and nothing else. + // // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) // Bit 1 = Too Long (ASCII followed by continuation) // Bit 2 = Overlong 3-byte @@ -38966,6 +53947,7 @@ check_special_cases(const simd8 input, const simd8 prev1) { // 1111011_ 1000____ // 11111___ 1000____ constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ + constexpr const uint8_t FORBIDDEN = 0xff; const simd8 byte_1_high = prev1.shr<4>().lookup_16( // 0_______ ________ @@ -38976,11 +53958,11 @@ check_special_cases(const simd8 input, const simd8 prev1) { // 1100____ ________ TOO_SHORT | OVERLONG_2, // 1101____ ________ - TOO_SHORT, + FORBIDDEN, // 1110____ ________ - TOO_SHORT | OVERLONG_3 | SURROGATE, + FORBIDDEN, // 1111____ ________ - TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); + FORBIDDEN); constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . 
const simd8 byte_1_low = @@ -38994,23 +53976,16 @@ check_special_cases(const simd8 input, const simd8 prev1) { CARRY, CARRY, // ____0100 ________ - CARRY | TOO_LARGE, + FORBIDDEN, // ____0101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, + FORBIDDEN, // ____011_ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + FORBIDDEN, FORBIDDEN, // ____1___ ________ - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000, + FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, // ____1101 ________ - CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, - CARRY | TOO_LARGE | TOO_LARGE_1000, - CARRY | TOO_LARGE | TOO_LARGE_1000); + FORBIDDEN, FORBIDDEN, FORBIDDEN); const simd8 byte_2_high = input.shr<4>().lookup_16( // ________ 0_______ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, @@ -39029,17 +54004,6 @@ check_special_cases(const simd8 input, const simd8 prev1) { TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); return (byte_1_high & byte_1_low & byte_2_high); } -simdutf_really_inline simd8 -check_multibyte_lengths(const simd8 input, - const simd8 prev_input, - const simd8 sc) { - simd8 prev2 = input.prev<2>(prev_input); - simd8 prev3 = input.prev<3>(prev_input); - simd8 must23 = - simd8(must_be_2_3_continuation(prev2, prev3)); - simd8 must23_80 = must23 & uint8_t(0x80); - return must23_80 ^ sc; -} struct validating_transcoder { // If this is nonzero, there has been a UTF-8 error. @@ -39055,25 +54019,24 @@ struct validating_transcoder { // lead bytes (2, 3, 4-byte leads become large positive numbers instead of // small negative numbers) simd8 prev1 = input.prev<1>(prev_input); - simd8 sc = check_special_cases(input, prev1); - this->error |= check_multibyte_lengths(input, prev_input, sc); + this->error |= check_special_cases(input, prev1); } - template simdutf_really_inline size_t convert(const char *in, size_t size, - char16_t *utf16_output) { + char *latin1_output) { size_t pos = 0; - char16_t *start{utf16_output}; + char *start{latin1_output}; // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the // last 16 bytes, and if the data is valid, then it is entirely safe because // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 8 leading bytes, to give us a good margin. + // back from the end counting 16 leading bytes, to give us a good margin. size_t leading_byte = 0; size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > -65); + for (; margin > 0 && leading_byte < 16; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... } // If the input is long enough, then we have that margin-1 is the eight last // leading byte. 
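The Latin-1 transcoder above leans on a simple fact: the only UTF-8 sequences whose code points fit in U+0000..U+00FF are plain ASCII bytes and two-byte sequences led by 0xC2 or 0xC3, which is why every other lead byte is mapped to FORBIDDEN in the lookup tables. A scalar sketch of the same conversion, with errors collapsed into a zero return (the real code reports an error position instead):

#include <cstddef>
#include <cstdint>

static size_t utf8_to_latin1_scalar(const char *in, size_t len, char *out) {
  size_t written = 0;
  for (size_t i = 0; i < len;) {
    uint8_t b = uint8_t(in[i]);
    if (b < 0x80) { // ASCII
      out[written++] = char(b);
      i += 1;
    } else if ((b == 0xC2 || b == 0xC3) && i + 1 < len &&
               (uint8_t(in[i + 1]) & 0xC0) == 0x80) {
      // [1100 00ab] [10cc cccc] -> U+0080..U+00FF
      out[written++] =
          char(uint8_t((b & 0x03) << 6) | (uint8_t(in[i + 1]) & 0x3F));
      i += 2;
    } else {
      return 0; // malformed, or a code point above U+00FF
    }
  }
  return written;
}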
@@ -39081,8 +54044,8 @@ struct validating_transcoder { while (pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if (input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; + input.store((int8_t *)latin1_output); + latin1_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, @@ -39101,10 +54064,9 @@ struct validating_transcoder { this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - if (utf8_continuation_mask & 1) { - return 0; // error - } + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; // We process in blocks of up to 12 bytes except possibly @@ -39122,8 +54084,8 @@ struct validating_transcoder { // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16( - in + pos, utf8_end_of_code_point_mask, utf16_output); + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -39137,23 +54099,22 @@ struct validating_transcoder { return 0; } if (pos < size) { - size_t howmany = scalar::utf8_to_utf16::convert( - in + pos, size - pos, utf16_output); + size_t howmany = + scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); if (howmany == 0) { return 0; } - utf16_output += howmany; + latin1_output += howmany; } - return utf16_output - start; + return latin1_output - start; } - template simdutf_really_inline result convert_with_errors(const char *in, size_t size, - char16_t *utf16_output) { + char *latin1_output) { size_t pos = 0; - char16_t *start{utf16_output}; + char *start{latin1_output}; // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the // last 16 bytes, and if the data is valid, then it is entirely safe because // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot // generally assume that you have valid UTF-8 input, so we are going to go @@ -39169,8 +54130,8 @@ struct validating_transcoder { while (pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if (input.is_ascii()) { - input.store_ascii_as_utf16(utf16_output); - utf16_output += 64; + input.store((int8_t *)latin1_output); + latin1_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, @@ -39189,17 +54150,16 @@ struct validating_transcoder { this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - if (errors() || (utf8_continuation_mask & 1)) { + if (errors()) { // rewind_and_convert_with_errors will seek a potential error from // in+pos onward, with the ability to go back up to pos bytes, and // read size-pos bytes forward. 
- result res = - scalar::utf8_to_utf16::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf16_output); + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); res.count += pos; return res; } + uint64_t utf8_continuation_mask = input.lt(-65 + 1); uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; // We process in blocks of up to 12 bytes except possibly @@ -39217,8 +54177,8 @@ struct validating_transcoder { // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_utf16( - in + pos, utf8_end_of_code_point_mask, utf16_output); + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -39232,9 +54192,8 @@ struct validating_transcoder { // rewind_and_convert_with_errors will seek a potential error from in+pos // onward, with the ability to go back up to pos bytes, and read size-pos // bytes forward. - result res = - scalar::utf8_to_utf16::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf16_output); + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); res.count += pos; return res; } @@ -39242,17 +54201,16 @@ struct validating_transcoder { // rewind_and_convert_with_errors will seek a potential error from in+pos // onward, with the ability to go back up to pos bytes, and read size-pos // bytes forward. - result res = - scalar::utf8_to_utf16::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf16_output); + result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( + pos, in + pos, size - pos, latin1_output); if (res.error) { // In case of error, we want the error position res.count += pos; return res; } else { // In case of success, we want the number of word written - utf16_output += res.count; + latin1_output += res.count; } } - return result(error_code::SUCCESS, utf16_output - start); + return result(error_code::SUCCESS, latin1_output - start); } simdutf_really_inline bool errors() const { @@ -39260,63 +54218,176 @@ struct validating_transcoder { } }; // struct utf8_checker -} // namespace utf8_to_utf16 +} // namespace utf8_to_latin1 } // unnamed namespace -} // namespace westmere +} // namespace lasx } // namespace simdutf -/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ -// transcoding from UTF-8 to UTF-32 -/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ namespace simdutf { -namespace westmere { +namespace lasx { namespace { -namespace utf8_to_utf32 { +namespace utf8_to_latin1 { +using namespace simd; + +simdutf_really_inline size_t convert_valid(const char *in, size_t size, + char *latin1_output) { + size_t pos = 0; + char *start{latin1_output}; + // In the worst case, we have the haswell kernel which can cause an overflow + // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last + // 16 bytes, and if the data is valid, then it is entirely safe because 16 + // UTF-8 bytes generate much more than 8 bytes. 
However, you cannot generally + // assume that you have valid UTF-8 input, so we are going to go back from the + // end counting 8 leading bytes, to give us a good margin. + size_t leading_byte = 0; + size_t margin = size; + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > + -65); // twos complement of -65 is 1011 1111 ... + } + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. + const size_t safety_margin = size - margin + 1; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 input(reinterpret_cast(in + pos)); + if (input.is_ascii()) { + input.store((int8_t *)latin1_output); + latin1_output += 64; + pos += 64; + } else { + // you might think that a for-loop would work, but under Visual Studio, it + // is not good enough. + uint64_t utf8_continuation_mask = + input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in + // this case, we also have ASCII to account for. + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. + size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times. + while (pos < max_starting_point) { + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + size_t consumed = convert_masked_utf8_to_latin1( + in + pos, utf8_end_of_code_point_mask, latin1_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. + } + } + if (pos < size) { + size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, + latin1_output); + latin1_output += howmany; + } + return latin1_output - start; +} + +} // namespace utf8_to_latin1 +} // namespace +} // namespace lasx +} // namespace simdutf + // namespace simdutf +/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +// transcoding from UTF-8 to UTF-16 +/* begin file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ + +namespace simdutf { +namespace lasx { +namespace { +namespace utf8_to_utf16 { using namespace simd; +template simdutf_warn_unused size_t convert_valid(const char *input, size_t size, - char32_t *utf32_output) noexcept { + char16_t *utf16_output) noexcept { + // The implementation is not specific to haswell and should be moved to the + // generic directory. size_t pos = 0; - char32_t *start{utf32_output}; + char16_t *start{utf16_output}; const size_t safety_margin = 16; // to avoid overruns! while (pos + 64 + safety_margin <= size) { + // this loop could be unrolled further. For example, we could process the + // mask far more than 64 bytes. 
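// --------------------------------------------------------------------------
// Illustrative scalar sketch (not part of the patch): the safety margin
// computed above walks backwards from the end of the input until eight
// leading bytes have been seen; the main loop then stops early enough that
// the masked converters, which may read or write a few bytes past the current
// position, can never run past the buffers even on invalid input. The name
// below is hypothetical.
#include <cstddef>
#include <cstdint>

inline size_t utf8_tail_safety_margin(const char *in, size_t size) {
  size_t leading_byte = 0;
  size_t margin = size;
  for (; margin > 0 && leading_byte < 8; margin--) {
    // leading bytes (and ASCII) have a signed value > -65, i.e. not 0b10xxxxxx
    leading_byte += (int8_t(in[margin - 1]) > -65);
  }
  // the SIMD loop only runs while pos + 64 + safety_margin <= size
  return size - margin + 1;
}
// --------------------------------------------------------------------------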
simd8x64 in(reinterpret_cast(input + pos)); if (in.is_ascii()) { - in.store_ascii_as_utf32(utf32_output); - utf32_output += 64; + in.store_ascii_as_utf16(utf16_output); + utf16_output += 64; pos += 64; } else { + // Slow path. We hope that the compiler will recognize that this is a slow + // path. Anything that is not a continuation mask is a 'leading byte', + // that is, the start of a new code point. + uint64_t utf8_continuation_mask = in.lt(-65 + 1); // -65 is 0b10111111 in two-complement's, so largest possible continuation // byte - uint64_t utf8_continuation_mask = in.lt(-65 + 1); uint64_t utf8_leading_mask = ~utf8_continuation_mask; + // The *start* of code points is not so useful, rather, we want the *end* + // of code points. uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + // We process in blocks of up to 12 bytes except possibly + // for fast paths which may process up to 16 bytes. For the + // slow path to work, we should have at least 12 input bytes left. size_t max_starting_point = (pos + 64) - 12; + // Next loop is going to run at least five times when using solely + // the slow/regular path, and at least four times if there are fast paths. while (pos < max_starting_point) { - size_t consumed = convert_masked_utf8_to_utf32( - input + pos, utf8_end_of_code_point_mask, utf32_output); + // Performance note: our ability to compute 'consumed' and + // then shift and recompute is critical. If there is a + // latency of, say, 4 cycles on getting 'consumed', then + // the inner loop might have a total latency of about 6 cycles. + // Yet we process between 6 to 12 inputs bytes, thus we get + // a speed limit between 1 cycle/byte and 0.5 cycle/byte + // for this section of the code. Hence, there is a limit + // to how much we can further increase this latency before + // it seriously harms performance. + // + // Thus we may allow convert_masked_utf8_to_utf16 to process + // more bytes at a time under a fast-path mode where 16 bytes + // are consumed at once (e.g., when encountering ASCII). + size_t consumed = convert_masked_utf8_to_utf16( + input + pos, utf8_end_of_code_point_mask, utf16_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } + // At this point there may remain between 0 and 12 bytes in the + // 64-byte block. These bytes will be processed again. So we have an + // 80% efficiency (in the worst case). In practice we expect an + // 85% to 90% efficiency. 
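// --------------------------------------------------------------------------
// Illustrative scalar sketch (not part of the patch): the ASCII fast path
// above widens every byte of an all-ASCII 64-byte block to one UTF-16 code
// unit and advances the output by 64 units; the big-endian flavour stores the
// code units byte-swapped. The helper below is hypothetical and assumes a
// little-endian host for the swap.
#include <cstddef>
#include <cstdint>

template <bool big_endian>
inline void store_ascii_block_as_utf16(const char *in, char16_t *out) {
  for (size_t i = 0; i < 64; i++) {
    uint16_t unit = uint8_t(in[i]);                // zero-extend the ASCII byte
    if (big_endian) {
      unit = uint16_t((unit << 8) | (unit >> 8));  // emit UTF-16BE code units
    }
    out[i] = char16_t(unit);
  }
}
// --------------------------------------------------------------------------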
} } - utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, - utf32_output); - return utf32_output - start; + utf16_output += scalar::utf8_to_utf16::convert_valid( + input + pos, size - pos, utf16_output); + return utf16_output - start; } -} // namespace utf8_to_utf32 +} // namespace utf8_to_utf16 } // unnamed namespace -} // namespace westmere +} // namespace lasx } // namespace simdutf -/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ -/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ +/* end file src/generic/utf8_to_utf16/valid_utf8_to_utf16.h */ +/* begin file src/generic/utf8_to_utf16/utf8_to_utf16.h */ namespace simdutf { -namespace westmere { +namespace lasx { namespace { -namespace utf8_to_utf32 { +namespace utf8_to_utf16 { using namespace simd; simdutf_really_inline simd8 @@ -39440,29 +54511,30 @@ struct validating_transcoder { this->error |= check_multibyte_lengths(input, prev_input, sc); } + template simdutf_really_inline size_t convert(const char *in, size_t size, - char32_t *utf32_output) { + char16_t *utf16_output) { size_t pos = 0; - char32_t *start{utf32_output}; + char16_t *start{utf16_output}; // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the // last 16 bytes, and if the data is valid, then it is entirely safe because // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot // generally assume that you have valid UTF-8 input, so we are going to go - // back from the end counting 16 leading bytes, to give us a good margin. + // back from the end counting 8 leading bytes, to give us a good margin. size_t leading_byte = 0; size_t margin = size; for (; margin > 0 && leading_byte < 8; margin--) { leading_byte += (int8_t(in[margin - 1]) > -65); } - // If the input is long enough, then we have that margin-1 is the fourth - // last leading byte. + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. const size_t safety_margin = size - margin + 1; // to avoid overruns! while (pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if (input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, @@ -39483,7 +54555,7 @@ struct validating_transcoder { } uint64_t utf8_continuation_mask = input.lt(-65 + 1); if (utf8_continuation_mask & 1) { - return 0; // we have an error + return 0; // error } uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; @@ -39502,8 +54574,8 @@ struct validating_transcoder { // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. 
- size_t consumed = convert_masked_utf8_to_utf32( - in + pos, utf8_end_of_code_point_mask, utf32_output); + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -39517,22 +54589,23 @@ struct validating_transcoder { return 0; } if (pos < size) { - size_t howmany = - scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); + size_t howmany = scalar::utf8_to_utf16::convert( + in + pos, size - pos, utf16_output); if (howmany == 0) { return 0; } - utf32_output += howmany; + utf16_output += howmany; } - return utf32_output - start; + return utf16_output - start; } + template simdutf_really_inline result convert_with_errors(const char *in, size_t size, - char32_t *utf32_output) { + char16_t *utf16_output) { size_t pos = 0; - char32_t *start{utf32_output}; + char16_t *start{utf16_output}; // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the + // of 8 bytes when calling convert_masked_utf8_to_utf16. If you skip the // last 16 bytes, and if the data is valid, then it is entirely safe because // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot // generally assume that you have valid UTF-8 input, so we are going to go @@ -39542,14 +54615,14 @@ struct validating_transcoder { for (; margin > 0 && leading_byte < 8; margin--) { leading_byte += (int8_t(in[margin - 1]) > -65); } - // If the input is long enough, then we have that margin-1 is the fourth - // last leading byte. + // If the input is long enough, then we have that margin-1 is the eight last + // leading byte. const size_t safety_margin = size - margin + 1; // to avoid overruns! while (pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if (input.is_ascii()) { - input.store_ascii_as_utf32(utf32_output); - utf32_output += 64; + input.store_ascii_as_utf16(utf16_output); + utf16_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, @@ -39570,8 +54643,12 @@ struct validating_transcoder { } uint64_t utf8_continuation_mask = input.lt(-65 + 1); if (errors() || (utf8_continuation_mask & 1)) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf32_output); + // rewind_and_convert_with_errors will seek a potential error from + // in+pos onward, with the ability to go back up to pos bytes, and + // read size-pos bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); res.count += pos; return res; } @@ -39592,8 +54669,8 @@ struct validating_transcoder { // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. 
- size_t consumed = convert_masked_utf8_to_utf32( - in + pos, utf8_end_of_code_point_mask, utf32_output); + size_t consumed = convert_masked_utf8_to_utf16( + in + pos, utf8_end_of_code_point_mask, utf16_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -39604,22 +54681,30 @@ struct validating_transcoder { } } if (errors()) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf32_output); + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); res.count += pos; return res; } if (pos < size) { - result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( - pos, in + pos, size - pos, utf32_output); + // rewind_and_convert_with_errors will seek a potential error from in+pos + // onward, with the ability to go back up to pos bytes, and read size-pos + // bytes forward. + result res = + scalar::utf8_to_utf16::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf16_output); if (res.error) { // In case of error, we want the error position res.count += pos; return res; } else { // In case of success, we want the number of word written - utf32_output += res.count; + utf16_output += res.count; } } - return result(error_code::SUCCESS, utf32_output - start); + return result(error_code::SUCCESS, utf16_output - start); } simdutf_really_inline bool errors() const { @@ -39627,143 +54712,67 @@ struct validating_transcoder { } }; // struct utf8_checker -} // namespace utf8_to_utf32 +} // namespace utf8_to_utf16 } // unnamed namespace -} // namespace westmere +} // namespace lasx } // namespace simdutf -/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ -// other functions -/* begin file src/generic/utf8.h */ +/* end file src/generic/utf8_to_utf16/utf8_to_utf16.h */ +// transcoding from UTF-8 to UTF-32 +/* begin file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ namespace simdutf { -namespace westmere { +namespace lasx { namespace { -namespace utf8 { +namespace utf8_to_utf32 { using namespace simd; -simdutf_really_inline size_t count_code_points(const char *in, size_t size) { - size_t pos = 0; - size_t count = 0; - for (; pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.gt(-65); - count += count_ones(utf8_continuation_mask); - } - return count + scalar::utf8::count_code_points(in + pos, size - pos); -} - -simdutf_really_inline size_t utf16_length_from_utf8(const char *in, - size_t size) { - size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for (; pos + 64 <= size; pos += 64) { - simd8x64 input(reinterpret_cast(in + pos)); - uint64_t utf8_continuation_mask = input.lt(-65 + 1); - // We count one word for anything that is not a continuation (so - // leading bytes). 
- count += 64 - count_ones(utf8_continuation_mask); - int64_t utf8_4byte = input.gteq_unsigned(240); - count += count_ones(utf8_4byte); - } - return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); -} -} // namespace utf8 -} // unnamed namespace -} // namespace westmere -} // namespace simdutf -/* end file src/generic/utf8.h */ -/* begin file src/generic/utf16.h */ -namespace simdutf { -namespace westmere { -namespace { -namespace utf16 { - -template -simdutf_really_inline size_t count_code_points(const char16_t *in, - size_t size) { - size_t pos = 0; - size_t count = 0; - for (; pos < size / 32 * 32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { - input.swap_bytes(); - } - uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); - count += count_ones(not_pair) / 2; - } - return count + - scalar::utf16::count_code_points(in + pos, size - pos); -} - -template -simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, - size_t size) { +simdutf_warn_unused size_t convert_valid(const char *input, size_t size, + char32_t *utf32_output) noexcept { size_t pos = 0; - size_t count = 0; - // This algorithm could no doubt be improved! - for (; pos < size / 32 * 32; pos += 32) { - simd16x32 input(reinterpret_cast(in + pos)); - if (!match_system(big_endian)) { - input.swap_bytes(); + char32_t *start{utf32_output}; + const size_t safety_margin = 16; // to avoid overruns! + while (pos + 64 + safety_margin <= size) { + simd8x64 in(reinterpret_cast(input + pos)); + if (in.is_ascii()) { + in.store_ascii_as_utf32(utf32_output); + utf32_output += 64; + pos += 64; + } else { + // -65 is 0b10111111 in two-complement's, so largest possible continuation + // byte + uint64_t utf8_continuation_mask = in.lt(-65 + 1); + uint64_t utf8_leading_mask = ~utf8_continuation_mask; + uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; + size_t max_starting_point = (pos + 64) - 12; + while (pos < max_starting_point) { + size_t consumed = convert_masked_utf8_to_utf32( + input + pos, utf8_end_of_code_point_mask, utf32_output); + pos += consumed; + utf8_end_of_code_point_mask >>= consumed; + } } - uint64_t ascii_mask = input.lteq(0x7F); - uint64_t twobyte_mask = input.lteq(0x7FF); - uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); - - size_t ascii_count = count_ones(ascii_mask) / 2; - size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2; - size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2; - size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; - count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + - ascii_count; - } - return count + scalar::utf16::utf8_length_from_utf16(in + pos, - size - pos); -} - -template -simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, - size_t size) { - return count_code_points(in, size); -} - -simdutf_really_inline void -change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { - size_t pos = 0; - - while (pos < size / 32 * 32) { - simd16x32 input(reinterpret_cast(in + pos)); - input.swap_bytes(); - input.store(reinterpret_cast(output)); - pos += 32; - output += 32; } - - scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); + utf32_output += scalar::utf8_to_utf32::convert_valid(input + pos, size - pos, + utf32_output); + return utf32_output - start; } -} // namespace utf16 +} // namespace utf8_to_utf32 } // unnamed namespace -} // namespace westmere +} // namespace lasx } // namespace 
simdutf -/* end file src/generic/utf16.h */ -// transcoding from UTF-8 to Latin 1 -/* begin file src/generic/utf8_to_latin1/utf8_to_latin1.h */ +/* end file src/generic/utf8_to_utf32/valid_utf8_to_utf32.h */ +/* begin file src/generic/utf8_to_utf32/utf8_to_utf32.h */ namespace simdutf { -namespace westmere { +namespace lasx { namespace { -namespace utf8_to_latin1 { +namespace utf8_to_utf32 { using namespace simd; simdutf_really_inline simd8 check_special_cases(const simd8 input, const simd8 prev1) { - // For UTF-8 to Latin 1, we can allow any ASCII character, and any - // continuation byte, but the non-ASCII leading bytes must be 0b11000011 or - // 0b11000010 and nothing else. - // // Bit 0 = Too Short (lead byte/ASCII followed by lead byte/ASCII) // Bit 1 = Too Long (ASCII followed by continuation) // Bit 2 = Overlong 3-byte @@ -39790,7 +54799,6 @@ check_special_cases(const simd8 input, const simd8 prev1) { // 1111011_ 1000____ // 11111___ 1000____ constexpr const uint8_t OVERLONG_4 = 1 << 6; // 11110000 1000____ - constexpr const uint8_t FORBIDDEN = 0xff; const simd8 byte_1_high = prev1.shr<4>().lookup_16( // 0_______ ________ @@ -39801,11 +54809,11 @@ check_special_cases(const simd8 input, const simd8 prev1) { // 1100____ ________ TOO_SHORT | OVERLONG_2, // 1101____ ________ - FORBIDDEN, + TOO_SHORT, // 1110____ ________ - FORBIDDEN, + TOO_SHORT | OVERLONG_3 | SURROGATE, // 1111____ ________ - FORBIDDEN); + TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4); constexpr const uint8_t CARRY = TOO_SHORT | TOO_LONG | TWO_CONTS; // These all have ____ in byte 1 . const simd8 byte_1_low = @@ -39819,16 +54827,23 @@ check_special_cases(const simd8 input, const simd8 prev1) { CARRY, CARRY, // ____0100 ________ - FORBIDDEN, + CARRY | TOO_LARGE, // ____0101 ________ - FORBIDDEN, + CARRY | TOO_LARGE | TOO_LARGE_1000, // ____011_ ________ - FORBIDDEN, FORBIDDEN, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, // ____1___ ________ - FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, FORBIDDEN, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000, // ____1101 ________ - FORBIDDEN, FORBIDDEN, FORBIDDEN); + CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE, + CARRY | TOO_LARGE | TOO_LARGE_1000, + CARRY | TOO_LARGE | TOO_LARGE_1000); const simd8 byte_2_high = input.shr<4>().lookup_16( // ________ 0_______ TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, @@ -39847,6 +54862,17 @@ check_special_cases(const simd8 input, const simd8 prev1) { TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT); return (byte_1_high & byte_1_low & byte_2_high); } +simdutf_really_inline simd8 +check_multibyte_lengths(const simd8 input, + const simd8 prev_input, + const simd8 sc) { + simd8 prev2 = input.prev<2>(prev_input); + simd8 prev3 = input.prev<3>(prev_input); + simd8 must23 = + simd8(must_be_2_3_continuation(prev2, prev3)); + simd8 must23_80 = must23 & uint8_t(0x80); + return must23_80 ^ sc; +} struct validating_transcoder { // If this is nonzero, there has been a UTF-8 error. 
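// --------------------------------------------------------------------------
// Illustrative scalar sketch (not part of the patch): check_special_cases
// above is the usual three-nibble classifier. The high and low nibble of the
// previous byte and the high nibble of the current byte each index a 16-entry
// table of error flags (TOO_SHORT, TOO_LONG, OVERLONG_*, SURROGATE, TOO_LARGE,
// TWO_CONTS), and only flags present in all three lookups survive the AND.
// check_multibyte_lengths then covers 3- and 4-byte sequences, which a
// two-byte window cannot see on its own: a byte must be flagged as a
// continuation-following-a-continuation (the 0x80 / TWO_CONTS bit) exactly
// when a 3- or 4-byte lead sits two positions back or a 4-byte lead sits three
// positions back, and must23_80 ^ sc reports any disagreement. The helpers
// below are hypothetical restatements of those two checks.
#include <cstddef>
#include <cstdint>

inline uint8_t classify_byte_pair(uint8_t prev1, uint8_t cur,
                                  const uint8_t byte_1_high[16],
                                  const uint8_t byte_1_low[16],
                                  const uint8_t byte_2_high[16]) {
  return byte_1_high[prev1 >> 4] & byte_1_low[prev1 & 0x0F] &
         byte_2_high[cur >> 4];
}

inline bool multibyte_lengths_agree(const uint8_t *buf, size_t i) {
  const bool must_be_2nd_or_3rd_continuation =
      (i >= 2 && buf[i - 2] >= 0xE0) ||  // 3- or 4-byte lead two back
      (i >= 3 && buf[i - 3] >= 0xF0);    // 4-byte lead three back
  const bool continuation_after_continuation =
      i >= 1 && (buf[i] & 0xC0) == 0x80 && (buf[i - 1] & 0xC0) == 0x80;
  return must_be_2nd_or_3rd_continuation == continuation_after_continuation;
}
// --------------------------------------------------------------------------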
@@ -39862,33 +54888,33 @@ struct validating_transcoder { // lead bytes (2, 3, 4-byte leads become large positive numbers instead of // small negative numbers) simd8 prev1 = input.prev<1>(prev_input); - this->error |= check_special_cases(input, prev1); + simd8 sc = check_special_cases(input, prev1); + this->error |= check_multibyte_lengths(input, prev_input, sc); } simdutf_really_inline size_t convert(const char *in, size_t size, - char *latin1_output) { + char32_t *utf32_output) { size_t pos = 0; - char *start{latin1_output}; + char32_t *start{utf32_output}; // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // of 8 words when calling convert_masked_utf8_to_utf32. If you skip the // last 16 bytes, and if the data is valid, then it is entirely safe because // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot // generally assume that you have valid UTF-8 input, so we are going to go // back from the end counting 16 leading bytes, to give us a good margin. size_t leading_byte = 0; size_t margin = size; - for (; margin > 0 && leading_byte < 16; margin--) { - leading_byte += (int8_t(in[margin - 1]) > - -65); // twos complement of -65 is 1011 1111 ... + for (; margin > 0 && leading_byte < 8; margin--) { + leading_byte += (int8_t(in[margin - 1]) > -65); } - // If the input is long enough, then we have that margin-1 is the eight last - // leading byte. + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. const size_t safety_margin = size - margin + 1; // to avoid overruns! while (pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if (input.is_ascii()) { - input.store((int8_t *)latin1_output); - latin1_output += 64; + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, @@ -39907,9 +54933,10 @@ struct validating_transcoder { this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - uint64_t utf8_continuation_mask = - input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in - // this case, we also have ASCII to account for. + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (utf8_continuation_mask & 1) { + return 0; // we have an error + } uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; // We process in blocks of up to 12 bytes except possibly @@ -39927,8 +54954,8 @@ struct validating_transcoder { // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. 
- size_t consumed = convert_masked_utf8_to_latin1( - in + pos, utf8_end_of_code_point_mask, latin1_output); + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -39943,21 +54970,21 @@ struct validating_transcoder { } if (pos < size) { size_t howmany = - scalar::utf8_to_latin1::convert(in + pos, size - pos, latin1_output); + scalar::utf8_to_utf32::convert(in + pos, size - pos, utf32_output); if (howmany == 0) { return 0; } - latin1_output += howmany; + utf32_output += howmany; } - return latin1_output - start; + return utf32_output - start; } simdutf_really_inline result convert_with_errors(const char *in, size_t size, - char *latin1_output) { + char32_t *utf32_output) { size_t pos = 0; - char *start{latin1_output}; + char32_t *start{utf32_output}; // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the + // of 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the // last 16 bytes, and if the data is valid, then it is entirely safe because // 16 UTF-8 bytes generate much more than 8 bytes. However, you cannot // generally assume that you have valid UTF-8 input, so we are going to go @@ -39967,14 +54994,14 @@ struct validating_transcoder { for (; margin > 0 && leading_byte < 8; margin--) { leading_byte += (int8_t(in[margin - 1]) > -65); } - // If the input is long enough, then we have that margin-1 is the eight last - // leading byte. + // If the input is long enough, then we have that margin-1 is the fourth + // last leading byte. const size_t safety_margin = size - margin + 1; // to avoid overruns! while (pos + 64 + safety_margin <= size) { simd8x64 input(reinterpret_cast(in + pos)); if (input.is_ascii()) { - input.store((int8_t *)latin1_output); - latin1_output += 64; + input.store_ascii_as_utf32(utf32_output); + utf32_output += 64; pos += 64; } else { // you might think that a for-loop would work, but under Visual Studio, @@ -39993,16 +55020,13 @@ struct validating_transcoder { this->check_utf8_bytes(input.chunks[2], input.chunks[1]); this->check_utf8_bytes(input.chunks[3], input.chunks[2]); } - if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from - // in+pos onward, with the ability to go back up to pos bytes, and - // read size-pos bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( - pos, in + pos, size - pos, latin1_output); + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + if (errors() || (utf8_continuation_mask & 1)) { + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); res.count += pos; return res; } - uint64_t utf8_continuation_mask = input.lt(-65 + 1); uint64_t utf8_leading_mask = ~utf8_continuation_mask; uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; // We process in blocks of up to 12 bytes except possibly @@ -40020,8 +55044,8 @@ struct validating_transcoder { // for this section of the code. Hence, there is a limit // to how much we can further increase this latency before // it seriously harms performance. 
- size_t consumed = convert_masked_utf8_to_latin1( - in + pos, utf8_end_of_code_point_mask, latin1_output); + size_t consumed = convert_masked_utf8_to_utf32( + in + pos, utf8_end_of_code_point_mask, utf32_output); pos += consumed; utf8_end_of_code_point_mask >>= consumed; } @@ -40032,28 +55056,22 @@ struct validating_transcoder { } } if (errors()) { - // rewind_and_convert_with_errors will seek a potential error from in+pos - // onward, with the ability to go back up to pos bytes, and read size-pos - // bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( - pos, in + pos, size - pos, latin1_output); + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); res.count += pos; return res; } if (pos < size) { - // rewind_and_convert_with_errors will seek a potential error from in+pos - // onward, with the ability to go back up to pos bytes, and read size-pos - // bytes forward. - result res = scalar::utf8_to_latin1::rewind_and_convert_with_errors( - pos, in + pos, size - pos, latin1_output); + result res = scalar::utf8_to_utf32::rewind_and_convert_with_errors( + pos, in + pos, size - pos, utf32_output); if (res.error) { // In case of error, we want the error position res.count += pos; return res; } else { // In case of success, we want the number of word written - latin1_output += res.count; + utf32_output += res.count; } } - return result(error_code::SUCCESS, latin1_output - start); + return result(error_code::SUCCESS, utf32_output - start); } simdutf_really_inline bool errors() const { @@ -40061,99 +55079,136 @@ struct validating_transcoder { } }; // struct utf8_checker -} // namespace utf8_to_latin1 +} // namespace utf8_to_utf32 } // unnamed namespace -} // namespace westmere +} // namespace lasx } // namespace simdutf -/* end file src/generic/utf8_to_latin1/utf8_to_latin1.h */ -/* begin file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +/* end file src/generic/utf8_to_utf32/utf8_to_utf32.h */ + + +// other functions +/* begin file src/generic/utf8.h */ namespace simdutf { -namespace westmere { +namespace lasx { namespace { -namespace utf8_to_latin1 { +namespace utf8 { + using namespace simd; -simdutf_really_inline size_t convert_valid(const char *in, size_t size, - char *latin1_output) { +simdutf_really_inline size_t count_code_points(const char *in, size_t size) { size_t pos = 0; - char *start{latin1_output}; - // In the worst case, we have the haswell kernel which can cause an overflow - // of 8 bytes when calling convert_masked_utf8_to_latin1. If you skip the last - // 16 bytes, and if the data is valid, then it is entirely safe because 16 - // UTF-8 bytes generate much more than 8 bytes. However, you cannot generally - // assume that you have valid UTF-8 input, so we are going to go back from the - // end counting 8 leading bytes, to give us a good margin. - size_t leading_byte = 0; - size_t margin = size; - for (; margin > 0 && leading_byte < 8; margin--) { - leading_byte += (int8_t(in[margin - 1]) > - -65); // twos complement of -65 is 1011 1111 ... + size_t count = 0; + for (; pos + 64 <= size; pos += 64) { + simd8x64 input(reinterpret_cast(in + pos)); + uint64_t utf8_continuation_mask = input.gt(-65); + count += count_ones(utf8_continuation_mask); } - // If the input is long enough, then we have that margin-1 is the eight last - // leading byte. - const size_t safety_margin = size - margin + 1; // to avoid overruns! 
- while (pos + 64 + safety_margin <= size) { + return count + scalar::utf8::count_code_points(in + pos, size - pos); +} + +simdutf_really_inline size_t utf16_length_from_utf8(const char *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! + for (; pos + 64 <= size; pos += 64) { simd8x64 input(reinterpret_cast(in + pos)); - if (input.is_ascii()) { - input.store((int8_t *)latin1_output); - latin1_output += 64; - pos += 64; - } else { - // you might think that a for-loop would work, but under Visual Studio, it - // is not good enough. - uint64_t utf8_continuation_mask = - input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in - // this case, we also have ASCII to account for. - uint64_t utf8_leading_mask = ~utf8_continuation_mask; - uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1; - // We process in blocks of up to 12 bytes except possibly - // for fast paths which may process up to 16 bytes. For the - // slow path to work, we should have at least 12 input bytes left. - size_t max_starting_point = (pos + 64) - 12; - // Next loop is going to run at least five times. - while (pos < max_starting_point) { - // Performance note: our ability to compute 'consumed' and - // then shift and recompute is critical. If there is a - // latency of, say, 4 cycles on getting 'consumed', then - // the inner loop might have a total latency of about 6 cycles. - // Yet we process between 6 to 12 inputs bytes, thus we get - // a speed limit between 1 cycle/byte and 0.5 cycle/byte - // for this section of the code. Hence, there is a limit - // to how much we can further increase this latency before - // it seriously harms performance. - size_t consumed = convert_masked_utf8_to_latin1( - in + pos, utf8_end_of_code_point_mask, latin1_output); - pos += consumed; - utf8_end_of_code_point_mask >>= consumed; - } - // At this point there may remain between 0 and 12 bytes in the - // 64-byte block. These bytes will be processed again. So we have an - // 80% efficiency (in the worst case). In practice we expect an - // 85% to 90% efficiency. + uint64_t utf8_continuation_mask = input.lt(-65 + 1); + // We count one word for anything that is not a continuation (so + // leading bytes). + count += 64 - count_ones(utf8_continuation_mask); + int64_t utf8_4byte = input.gteq_unsigned(240); + count += count_ones(utf8_4byte); + } + return count + scalar::utf8::utf16_length_from_utf8(in + pos, size - pos); +} +} // namespace utf8 +} // unnamed namespace +} // namespace lasx +} // namespace simdutf +/* end file src/generic/utf8.h */ +/* begin file src/generic/utf16.h */ +namespace simdutf { +namespace lasx { +namespace { +namespace utf16 { + +template +simdutf_really_inline size_t count_code_points(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input.swap_bytes(); } + uint64_t not_pair = input.not_in_range(0xDC00, 0xDFFF); + count += count_ones(not_pair) / 2; } - if (pos < size) { - size_t howmany = scalar::utf8_to_latin1::convert_valid(in + pos, size - pos, - latin1_output); - latin1_output += howmany; + return count + + scalar::utf16::count_code_points(in + pos, size - pos); +} + +template +simdutf_really_inline size_t utf8_length_from_utf16(const char16_t *in, + size_t size) { + size_t pos = 0; + size_t count = 0; + // This algorithm could no doubt be improved! 
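// --------------------------------------------------------------------------
// Illustrative scalar sketch (not part of the patch): the UTF-8 length helpers
// above reduce to simple byte classifications. Every byte that is not a
// continuation starts a code point, and every 4-byte lead (0xF0 and above)
// contributes one extra UTF-16 code unit because it becomes a surrogate pair.
// The names below are hypothetical.
#include <cstddef>
#include <cstdint>

inline size_t count_utf8_code_points(const char *in, size_t size) {
  size_t count = 0;
  for (size_t i = 0; i < size; i++) {
    count += (int8_t(in[i]) > -65);   // not 0b10xxxxxx => a new code point
  }
  return count;
}

inline size_t utf16_units_from_utf8(const char *in, size_t size) {
  size_t count = 0;
  for (size_t i = 0; i < size; i++) {
    count += (int8_t(in[i]) > -65);   // one unit per code point ...
    count += (uint8_t(in[i]) >= 240); // ... plus one more for surrogate pairs
  }
  return count;
}
// --------------------------------------------------------------------------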
+ for (; pos < size / 32 * 32; pos += 32) { + simd16x32 input(reinterpret_cast(in + pos)); + if (!match_system(big_endian)) { + input.swap_bytes(); + } + uint64_t ascii_mask = input.lteq(0x7F); + uint64_t twobyte_mask = input.lteq(0x7FF); + uint64_t not_pair_mask = input.not_in_range(0xD800, 0xDFFF); + + size_t ascii_count = count_ones(ascii_mask) / 2; + size_t twobyte_count = count_ones(twobyte_mask & ~ascii_mask) / 2; + size_t threebyte_count = count_ones(not_pair_mask & ~twobyte_mask) / 2; + size_t fourbyte_count = 32 - count_ones(not_pair_mask) / 2; + count += 2 * fourbyte_count + 3 * threebyte_count + 2 * twobyte_count + + ascii_count; } - return latin1_output - start; + return count + scalar::utf16::utf8_length_from_utf16(in + pos, + size - pos); } -} // namespace utf8_to_latin1 -} // namespace -} // namespace westmere +template +simdutf_really_inline size_t utf32_length_from_utf16(const char16_t *in, + size_t size) { + return count_code_points(in, size); +} + +simdutf_really_inline void +change_endianness_utf16(const char16_t *in, size_t size, char16_t *output) { + size_t pos = 0; + + while (pos < size / 32 * 32) { + simd16x32 input(reinterpret_cast(in + pos)); + input.swap_bytes(); + input.store(reinterpret_cast(output)); + pos += 32; + output += 32; + } + + scalar::utf16::change_endianness_utf16(in + pos, size - pos, output); +} + +} // namespace utf16 +} // unnamed namespace +} // namespace lasx } // namespace simdutf - // namespace simdutf -/* end file src/generic/utf8_to_latin1/valid_utf8_to_latin1.h */ +/* end file src/generic/utf16.h */ // // Implementation-specific overrides // - namespace simdutf { -namespace westmere { +namespace lasx { simdutf_warn_unused int implementation::detect_encodings(const char *input, @@ -40184,34 +55239,32 @@ implementation::detect_encodings(const char *input, simdutf_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept { - return westmere::utf8_validation::generic_validate_utf8(buf, len); + return lasx::utf8_validation::generic_validate_utf8(buf, len); } simdutf_warn_unused result implementation::validate_utf8_with_errors( const char *buf, size_t len) const noexcept { - return westmere::utf8_validation::generic_validate_utf8_with_errors(buf, len); + return lasx::utf8_validation::generic_validate_utf8_with_errors(buf, len); } simdutf_warn_unused bool implementation::validate_ascii(const char *buf, size_t len) const noexcept { - return westmere::utf8_validation::generic_validate_ascii(buf, len); + return lasx::utf8_validation::generic_validate_ascii(buf, len); } simdutf_warn_unused result implementation::validate_ascii_with_errors( const char *buf, size_t len) const noexcept { - return westmere::utf8_validation::generic_validate_ascii_with_errors(buf, - len); + return lasx::utf8_validation::generic_validate_ascii_with_errors(buf, len); } simdutf_warn_unused bool implementation::validate_utf16le(const char16_t *buf, size_t len) const noexcept { if (simdutf_unlikely(len == 0)) { - // empty input is valid UTF-16. protect the implementation from - // handling nullptr + // empty input is valid. protected the implementation from nullptr. 
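// --------------------------------------------------------------------------
// Illustrative scalar sketch (not part of the patch): the UTF-16 helpers above
// count per code unit. A code point is counted on every unit that is not a
// low surrogate (0xDC00..0xDFFF), and the UTF-8 size is 1 byte for ASCII, 2
// bytes up to U+07FF, 3 bytes for other non-surrogates, and 2 bytes for each
// half of a surrogate pair (4 per pair). The helpers below are hypothetical
// and assume the input is already in native byte order.
#include <cstddef>
#include <cstdint>

inline size_t count_utf16_code_points(const char16_t *in, size_t size) {
  size_t count = 0;
  for (size_t i = 0; i < size; i++) {
    const uint16_t w = uint16_t(in[i]);
    count += !(w >= 0xDC00 && w <= 0xDFFF);  // skip low surrogates
  }
  return count;
}

inline size_t utf8_bytes_from_utf16(const char16_t *in, size_t size) {
  size_t bytes = 0;
  for (size_t i = 0; i < size; i++) {
    const uint16_t w = uint16_t(in[i]);
    if (w <= 0x7F)                     bytes += 1;
    else if (w <= 0x7FF)               bytes += 2;
    else if (w < 0xD800 || w > 0xDFFF) bytes += 3;
    else                               bytes += 2; // half of a surrogate pair
  }
  return bytes;
}
// --------------------------------------------------------------------------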
return true; } - const char16_t *tail = sse_validate_utf16(buf, len); + const char16_t *tail = lasx_validate_utf16(buf, len); if (tail) { return scalar::utf16::validate(tail, len - (tail - buf)); @@ -40224,11 +55277,10 @@ simdutf_warn_unused bool implementation::validate_utf16be(const char16_t *buf, size_t len) const noexcept { if (simdutf_unlikely(len == 0)) { - // empty input is valid UTF-16. protect the implementation from - // handling nullptr + // empty input is valid. protected the implementation from nullptr. return true; } - const char16_t *tail = sse_validate_utf16(buf, len); + const char16_t *tail = lasx_validate_utf16(buf, len); if (tail) { return scalar::utf16::validate(tail, len - (tail - buf)); } else { @@ -40238,7 +55290,10 @@ implementation::validate_utf16be(const char16_t *buf, simdutf_warn_unused result implementation::validate_utf16le_with_errors( const char16_t *buf, size_t len) const noexcept { - result res = sse_validate_utf16_with_errors(buf, len); + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = lasx_validate_utf16_with_errors(buf, len); if (res.count != len) { result scalar_res = scalar::utf16::validate_with_errors( buf + res.count, len - res.count); @@ -40250,7 +55305,10 @@ simdutf_warn_unused result implementation::validate_utf16le_with_errors( simdutf_warn_unused result implementation::validate_utf16be_with_errors( const char16_t *buf, size_t len) const noexcept { - result res = sse_validate_utf16_with_errors(buf, len); + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } + result res = lasx_validate_utf16_with_errors(buf, len); if (res.count != len) { result scalar_res = scalar::utf16::validate_with_errors( buf + res.count, len - res.count); @@ -40263,11 +55321,10 @@ simdutf_warn_unused result implementation::validate_utf16be_with_errors( simdutf_warn_unused bool implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { if (simdutf_unlikely(len == 0)) { - // empty input is valid UTF-32. protect the implementation from - // handling nullptr + // empty input is valid. protected the implementation from nullptr. return true; } - const char32_t *tail = sse_validate_utf32le(buf, len); + const char32_t *tail = lasx_validate_utf32le(buf, len); if (tail) { return scalar::utf32::validate(tail, len - (tail - buf)); } else { @@ -40277,12 +55334,10 @@ implementation::validate_utf32(const char32_t *buf, size_t len) const noexcept { simdutf_warn_unused result implementation::validate_utf32_with_errors( const char32_t *buf, size_t len) const noexcept { - if (len == 0) { - // empty input is valid UTF-32. 
protect the implementation from - // handling nullptr + if (simdutf_unlikely(len == 0)) { return result(error_code::SUCCESS, 0); } - result res = sse_validate_utf32le_with_errors(buf, len); + result res = lasx_validate_utf32le_with_errors(buf, len); if (res.count != len) { result scalar_res = scalar::utf32::validate_with_errors(buf + res.count, len - res.count); @@ -40294,9 +55349,8 @@ simdutf_warn_unused result implementation::validate_utf32_with_errors( simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( const char *buf, size_t len, char *utf8_output) const noexcept { - std::pair ret = - sse_convert_latin1_to_utf8(buf, len, utf8_output); + lasx_convert_latin1_to_utf8(buf, len, utf8_output); size_t converted_chars = ret.second - utf8_output; if (ret.first != buf + len) { @@ -40304,25 +55358,18 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf8( ret.first, len - (ret.first - buf), ret.second); converted_chars += scalar_converted_chars; } - return converted_chars; } simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( const char *buf, size_t len, char16_t *utf16_output) const noexcept { std::pair ret = - sse_convert_latin1_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { - return 0; - } + lasx_convert_latin1_to_utf16le(buf, len, utf16_output); size_t converted_chars = ret.second - utf16_output; if (ret.first != buf + len) { const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( ret.first, len - (ret.first - buf), ret.second); - if (scalar_converted_chars == 0) { - return 0; - } converted_chars += scalar_converted_chars; } return converted_chars; @@ -40331,18 +55378,12 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16le( simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( const char *buf, size_t len, char16_t *utf16_output) const noexcept { std::pair ret = - sse_convert_latin1_to_utf16(buf, len, utf16_output); - if (ret.first == nullptr) { - return 0; - } + lasx_convert_latin1_to_utf16be(buf, len, utf16_output); size_t converted_chars = ret.second - utf16_output; if (ret.first != buf + len) { const size_t scalar_converted_chars = scalar::latin1_to_utf16::convert( ret.first, len - (ret.first - buf), ret.second); - if (scalar_converted_chars == 0) { - return 0; - } converted_chars += scalar_converted_chars; } return converted_chars; @@ -40351,17 +55392,11 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf16be( simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( const char *buf, size_t len, char32_t *utf32_output) const noexcept { std::pair ret = - sse_convert_latin1_to_utf32(buf, len, utf32_output); - if (ret.first == nullptr) { - return 0; - } + lasx_convert_latin1_to_utf32(buf, len, utf32_output); size_t converted_chars = ret.second - utf32_output; if (ret.first != buf + len) { const size_t scalar_converted_chars = scalar::latin1_to_utf32::convert( ret.first, len - (ret.first - buf), ret.second); - if (scalar_converted_chars == 0) { - return 0; - } converted_chars += scalar_converted_chars; } return converted_chars; @@ -40369,19 +55404,117 @@ simdutf_warn_unused size_t implementation::convert_latin1_to_utf32( simdutf_warn_unused size_t implementation::convert_utf8_to_latin1( const char *buf, size_t len, char *latin1_output) const noexcept { + size_t pos = 0; + char *output_start{latin1_output}; + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)latin1_output & 0x1F) && pos < len) { + if (buf[pos] & 
0x80) { + if (pos + 1 >= len) + return 0; + if ((buf[pos] & 0b11100000) == 0b11000000) { + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + return 0; + uint32_t code_point = + (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111); + if (code_point < 0x80 || 0xFF < code_point) { + return 0; + } + *latin1_output++ = char(code_point); + pos += 2; + } else { + return 0; + } + } else { + *latin1_output++ = char(buf[pos]); + pos++; + } + } + size_t convert_size = latin1_output - output_start; + if (pos == len) + return convert_size; utf8_to_latin1::validating_transcoder converter; - return converter.convert(buf, len, latin1_output); + size_t convert_result = + converter.convert(buf + pos, len - pos, latin1_output); + return convert_result ? convert_size + convert_result : 0; } simdutf_warn_unused result implementation::convert_utf8_to_latin1_with_errors( const char *buf, size_t len, char *latin1_output) const noexcept { + size_t pos = 0; + char *output_start{latin1_output}; + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)latin1_output & 0x1F) && pos < len) { + if (buf[pos] & 0x80) { + if ((buf[pos] & 0b11100000) == 0b11000000) { + if (pos + 1 >= len) + return result(error_code::TOO_SHORT, pos); + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + return result(error_code::TOO_SHORT, pos); + uint32_t code_point = + (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111); + if (code_point < 0x80) + return result(error_code::OVERLONG, pos); + if (0xFF < code_point) + return result(error_code::TOO_LARGE, pos); + *latin1_output++ = char(code_point); + pos += 2; + } else if ((buf[pos] & 0b11110000) == 0b11100000) { + return result(error_code::TOO_LARGE, pos); + } else if ((buf[pos] & 0b11111000) == 0b11110000) { + return result(error_code::TOO_LARGE, pos); + } else { + if ((buf[pos] & 0b11000000) == 0b10000000) { + return result(error_code::TOO_LONG, pos); + } + return result(error_code::HEADER_BITS, pos); + } + } else { + *latin1_output++ = char(buf[pos]); + pos++; + } + } + size_t convert_size = latin1_output - output_start; + if (pos == len) + return result(error_code::SUCCESS, convert_size); + utf8_to_latin1::validating_transcoder converter; - return converter.convert_with_errors(buf, len, latin1_output); + result res = + converter.convert_with_errors(buf + pos, len - pos, latin1_output); + return res.error ? result(res.error, res.count + pos) + : result(res.error, res.count + convert_size); } simdutf_warn_unused size_t implementation::convert_valid_utf8_to_latin1( const char *buf, size_t len, char *latin1_output) const noexcept { - return westmere::utf8_to_latin1::convert_valid(buf, len, latin1_output); + size_t pos = 0; + char *output_start{latin1_output}; + // Performance degradation when memory address is not 32-byte aligned + while (((uint64_t)latin1_output & 0x1F) && pos < len) { + if (buf[pos] & 0x80) { + if (pos + 1 >= len) + break; + if ((buf[pos] & 0b11100000) == 0b11000000) { + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + return 0; + uint32_t code_point = + (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111); + *latin1_output++ = char(code_point); + pos += 2; + } else { + return 0; + } + } else { + *latin1_output++ = char(buf[pos]); + pos++; + } + } + size_t convert_size = latin1_output - output_start; + if (pos == len) + return convert_size; + + size_t convert_result = + lasx::utf8_to_latin1::convert_valid(buf + pos, len - pos, latin1_output); + return convert_result ? 
convert_size + convert_result : 0; } simdutf_warn_unused size_t implementation::convert_utf8_to_utf16le( @@ -40441,7 +55574,7 @@ simdutf_warn_unused size_t implementation::convert_valid_utf8_to_utf32( simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( const char16_t *buf, size_t len, char *latin1_output) const noexcept { std::pair ret = - sse_convert_utf16_to_latin1(buf, len, latin1_output); + lasx_convert_utf16_to_latin1(buf, len, latin1_output); if (ret.first == nullptr) { return 0; } @@ -40462,7 +55595,7 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_latin1( simdutf_warn_unused size_t implementation::convert_utf16be_to_latin1( const char16_t *buf, size_t len, char *latin1_output) const noexcept { std::pair ret = - sse_convert_utf16_to_latin1(buf, len, latin1_output); + lasx_convert_utf16_to_latin1(buf, len, latin1_output); if (ret.first == nullptr) { return 0; } @@ -40484,7 +55617,7 @@ simdutf_warn_unused result implementation::convert_utf16le_to_latin1_with_errors( const char16_t *buf, size_t len, char *latin1_output) const noexcept { std::pair ret = - sse_convert_utf16_to_latin1_with_errors( + lasx_convert_utf16_to_latin1_with_errors( buf, len, latin1_output); if (ret.first.error) { return ret.first; @@ -40511,8 +55644,8 @@ simdutf_warn_unused result implementation::convert_utf16be_to_latin1_with_errors( const char16_t *buf, size_t len, char *latin1_output) const noexcept { std::pair ret = - sse_convert_utf16_to_latin1_with_errors(buf, len, - latin1_output); + lasx_convert_utf16_to_latin1_with_errors(buf, len, + latin1_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct @@ -40536,20 +55669,20 @@ implementation::convert_utf16be_to_latin1_with_errors( simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_latin1( const char16_t *buf, size_t len, char *latin1_output) const noexcept { - // optimization opportunity: we could provide an optimized function. + // optimization opportunity: implement a custom function. return convert_utf16be_to_latin1(buf, len, latin1_output); } simdutf_warn_unused size_t implementation::convert_valid_utf16le_to_latin1( const char16_t *buf, size_t len, char *latin1_output) const noexcept { - // optimization opportunity: we could provide an optimized function. + // optimization opportunity: implement a custom function. 
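// --------------------------------------------------------------------------
// Illustrative scalar sketch (not part of the patch): the 32-byte-alignment
// pre-loops in the convert_utf8_to_latin1 family above decode input by hand
// until the output pointer is aligned. Only one- and two-byte UTF-8 sequences
// can occur because Latin-1 stops at U+00FF; anything longer, an overlong
// two-byte form, or a truncated sequence is an error. The standalone helper
// below is hypothetical.
#include <cstddef>
#include <cstdint>

// Returns true and advances pos on success; out receives the Latin-1 byte.
inline bool decode_one_latin1(const uint8_t *buf, size_t len, size_t &pos,
                              uint8_t &out) {
  if ((buf[pos] & 0x80) == 0) {                    // ASCII maps 1:1
    out = buf[pos++];
    return true;
  }
  if (pos + 1 >= len ||
      (buf[pos] & 0b11100000) != 0b11000000 ||     // must be a 2-byte lead
      (buf[pos + 1] & 0b11000000) != 0b10000000) { // followed by a continuation
    return false;
  }
  const uint32_t code_point =
      (buf[pos] & 0b00011111) << 6 | (buf[pos + 1] & 0b00111111);
  if (code_point < 0x80 || code_point > 0xFF) {    // overlong or beyond Latin-1
    return false;
  }
  out = uint8_t(code_point);
  pos += 2;
  return true;
}
// --------------------------------------------------------------------------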
return convert_utf16le_to_latin1(buf, len, latin1_output); } simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( const char16_t *buf, size_t len, char *utf8_output) const noexcept { std::pair ret = - sse_convert_utf16_to_utf8(buf, len, utf8_output); + lasx_convert_utf16_to_utf8(buf, len, utf8_output); if (ret.first == nullptr) { return 0; } @@ -40569,7 +55702,7 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_utf8( simdutf_warn_unused size_t implementation::convert_utf16be_to_utf8( const char16_t *buf, size_t len, char *utf8_output) const noexcept { std::pair ret = - sse_convert_utf16_to_utf8(buf, len, utf8_output); + lasx_convert_utf16_to_utf8(buf, len, utf8_output); if (ret.first == nullptr) { return 0; } @@ -40591,8 +55724,8 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf8_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - westmere::sse_convert_utf16_to_utf8_with_errors( - buf, len, utf8_output); + lasx_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct @@ -40619,8 +55752,8 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf8_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - westmere::sse_convert_utf16_to_utf8_with_errors( - buf, len, utf8_output); + lasx_convert_utf16_to_utf8_with_errors(buf, len, + utf8_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct @@ -40652,59 +55785,13 @@ simdutf_warn_unused size_t implementation::convert_valid_utf16be_to_utf8( return convert_utf16be_to_utf8(buf, len, utf8_output); } -simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - std::pair ret = - sse_convert_utf32_to_latin1(buf, len, latin1_output); - if (ret.first == nullptr) { - return 0; - } - size_t saved_bytes = ret.second - latin1_output; - // if (ret.first != buf + len) { - if (ret.first < buf + len) { - const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( - ret.first, len - (ret.first - buf), ret.second); - if (scalar_saved_bytes == 0) { - return 0; - } - saved_bytes += scalar_saved_bytes; - } - return saved_bytes; -} - -simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - // ret.first.count is always the position in the buffer, not the number of - // code units written even if finished - std::pair ret = - westmere::sse_convert_utf32_to_latin1_with_errors(buf, len, - latin1_output); - if (ret.first.count != len) { - result scalar_res = scalar::utf32_to_latin1::convert_with_errors( - buf + ret.first.count, len - ret.first.count, ret.second); - if (scalar_res.error) { - scalar_res.count += ret.first.count; - return scalar_res; - } else { - ret.second += scalar_res.count; - } - } - ret.first.count = - ret.second - - latin1_output; // Set count to the number of 8-bit code units written - return ret.first; -} - -simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( - const char32_t *buf, size_t len, char *latin1_output) const noexcept { - // optimization opportunity: we could provide an optimized function. 
- return convert_utf32_to_latin1(buf, len, latin1_output); -} - simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( const char32_t *buf, size_t len, char *utf8_output) const noexcept { + if (simdutf_unlikely(len == 0)) { + return 0; + } std::pair ret = - sse_convert_utf32_to_utf8(buf, len, utf8_output); + lasx_convert_utf32_to_utf8(buf, len, utf8_output); if (ret.first == nullptr) { return 0; } @@ -40722,10 +55809,13 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf8( simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( const char32_t *buf, size_t len, char *utf8_output) const noexcept { + if (simdutf_unlikely(len == 0)) { + return result(error_code::SUCCESS, 0); + } // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - westmere::sse_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); + lasx_convert_utf32_to_utf8_with_errors(buf, len, utf8_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf8::convert_with_errors( buf + ret.first.count, len - ret.first.count, ret.second); @@ -40745,7 +55835,7 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf8_with_errors( simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { std::pair ret = - sse_convert_utf16_to_utf32(buf, len, utf32_output); + lasx_convert_utf16_to_utf32(buf, len, utf32_output); if (ret.first == nullptr) { return 0; } @@ -40765,7 +55855,7 @@ simdutf_warn_unused size_t implementation::convert_utf16le_to_utf32( simdutf_warn_unused size_t implementation::convert_utf16be_to_utf32( const char16_t *buf, size_t len, char32_t *utf32_output) const noexcept { std::pair ret = - sse_convert_utf16_to_utf32(buf, len, utf32_output); + lasx_convert_utf16_to_utf32(buf, len, utf32_output); if (ret.first == nullptr) { return 0; } @@ -40787,8 +55877,8 @@ simdutf_warn_unused result implementation::convert_utf16le_to_utf32_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - westmere::sse_convert_utf16_to_utf32_with_errors( - buf, len, utf32_output); + lasx_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct @@ -40815,8 +55905,8 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - westmere::sse_convert_utf16_to_utf32_with_errors( - buf, len, utf32_output); + lasx_convert_utf16_to_utf32_with_errors(buf, len, + utf32_output); if (ret.first.error) { return ret.first; } // Can return directly since scalar fallback already found correct @@ -40838,15 +55928,77 @@ simdutf_warn_unused result implementation::convert_utf16be_to_utf32_with_errors( return ret.first; } +simdutf_warn_unused size_t implementation::convert_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lasx_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert( + ret.first, len - (ret.first - buf), ret.second); + if 
(scalar_saved_bytes == 0) { + return 0; + } + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + +simdutf_warn_unused result implementation::convert_utf32_to_latin1_with_errors( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lasx_convert_utf32_to_latin1_with_errors(buf, len, latin1_output); + if (ret.first.error) { + return ret.first; + } // Can return directly since scalar fallback already found correct + // ret.first.count + if (ret.first.count != len) { // All good so far, but not finished + result scalar_res = scalar::utf32_to_latin1::convert_with_errors( + buf + ret.first.count, len - ret.first.count, ret.second); + if (scalar_res.error) { + scalar_res.count += ret.first.count; + return scalar_res; + } else { + ret.second += scalar_res.count; + } + } + ret.first.count = + ret.second - + latin1_output; // Set count to the number of 8-bit code units written + return ret.first; +} + +simdutf_warn_unused size_t implementation::convert_valid_utf32_to_latin1( + const char32_t *buf, size_t len, char *latin1_output) const noexcept { + std::pair ret = + lasx_convert_utf32_to_latin1(buf, len, latin1_output); + if (ret.first == nullptr) { + return 0; + } + size_t saved_bytes = ret.second - latin1_output; + + if (ret.first != buf + len) { + const size_t scalar_saved_bytes = scalar::utf32_to_latin1::convert_valid( + ret.first, len - (ret.first - buf), ret.second); + saved_bytes += scalar_saved_bytes; + } + return saved_bytes; +} + simdutf_warn_unused size_t implementation::convert_valid_utf32_to_utf8( const char32_t *buf, size_t len, char *utf8_output) const noexcept { + // optimization opportunity: implement a custom function. return convert_utf32_to_utf8(buf, len, utf8_output); } simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { std::pair ret = - sse_convert_utf32_to_utf16(buf, len, utf16_output); + lasx_convert_utf32_to_utf16(buf, len, utf16_output); if (ret.first == nullptr) { return 0; } @@ -40860,13 +56012,14 @@ simdutf_warn_unused size_t implementation::convert_utf32_to_utf16le( } saved_bytes += scalar_saved_bytes; } + return saved_bytes; } simdutf_warn_unused size_t implementation::convert_utf32_to_utf16be( const char32_t *buf, size_t len, char16_t *utf16_output) const noexcept { std::pair ret = - sse_convert_utf32_to_utf16(buf, len, utf16_output); + lasx_convert_utf32_to_utf16(buf, len, utf16_output); if (ret.first == nullptr) { return 0; } @@ -40888,8 +56041,8 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16le_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - westmere::sse_convert_utf32_to_utf16_with_errors( - buf, len, utf16_output); + lasx_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf16::convert_with_errors( @@ -40912,8 +56065,8 @@ simdutf_warn_unused result implementation::convert_utf32_to_utf16be_with_errors( // ret.first.count is always the position in the buffer, not the number of // code units written even if finished std::pair ret = - westmere::sse_convert_utf32_to_utf16_with_errors( - buf, len, utf16_output); + lasx_convert_utf32_to_utf16_with_errors(buf, len, + utf16_output); if (ret.first.count != len) { result scalar_res = scalar::utf32_to_utf16::convert_with_errors( @@ -40969,7 +56122,23 @@ simdutf_warn_unused size_t 
implementation::count_utf16be( simdutf_warn_unused size_t implementation::count_utf8(const char *input, size_t length) const noexcept { - return utf8::count_code_points(input, length); + size_t pos = 0; + size_t count = 0; + // Performance degradation when memory address is not 32-byte aligned + while ((((uint64_t)input + pos) & 0x1F && pos < length)) { + if (input[pos++] > -65) { + count++; + } + } + __m256i v_bf = __lasx_xvldi(0xBF); // 0b10111111 + for (; pos + 32 <= length; pos += 32) { + __m256i in = __lasx_xvld(reinterpret_cast(input + pos), 0); + __m256i utf8_count = + __lasx_xvpcnt_h(__lasx_xvmskltz_b(__lasx_xvslt_b(v_bf, in))); + count = count + __lasx_xvpickve2gr_wu(utf8_count, 0) + + __lasx_xvpickve2gr_wu(utf8_count, 4); + } + return count + scalar::utf8::count_code_points(input + pos, length - pos); } simdutf_warn_unused size_t implementation::latin1_length_from_utf8( @@ -40979,12 +56148,29 @@ simdutf_warn_unused size_t implementation::latin1_length_from_utf8( simdutf_warn_unused size_t implementation::latin1_length_from_utf16(size_t length) const noexcept { - return scalar::utf16::latin1_length_from_utf16(length); + return length; } simdutf_warn_unused size_t implementation::latin1_length_from_utf32(size_t length) const noexcept { - return scalar::utf32::latin1_length_from_utf32(length); + return length; +} + +simdutf_warn_unused size_t implementation::utf8_length_from_latin1( + const char *input, size_t length) const noexcept { + const uint8_t *data = reinterpret_cast(input); + const uint8_t *data_end = data + length; + uint64_t result = 0; + while (data + 16 < data_end) { + uint64_t two_bytes = 0; + __m128i input_vec = __lsx_vld(data, 0); + two_bytes = + __lsx_vpickve2gr_hu(__lsx_vpcnt_h(__lsx_vmskltz_b(input_vec)), 0); + result += 16 + two_bytes; + data += 16; + } + return result + scalar::latin1::utf8_length_from_latin1((const char *)data, + data_end - data); } simdutf_warn_unused size_t implementation::utf8_length_from_utf16le( @@ -40999,72 +56185,12 @@ simdutf_warn_unused size_t implementation::utf8_length_from_utf16be( simdutf_warn_unused size_t implementation::utf16_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf16_length_from_latin1(length); + return length; } simdutf_warn_unused size_t implementation::utf32_length_from_latin1(size_t length) const noexcept { - return scalar::latin1::utf32_length_from_latin1(length); -} - -simdutf_warn_unused size_t implementation::utf8_length_from_latin1( - const char *input, size_t len) const noexcept { - const uint8_t *str = reinterpret_cast(input); - size_t answer = len / sizeof(__m128i) * sizeof(__m128i); - size_t i = 0; - if (answer >= 2048) { // long strings optimization - __m128i two_64bits = _mm_setzero_si128(); - while (i + sizeof(__m128i) <= len) { - __m128i runner = _mm_setzero_si128(); - size_t iterations = (len - i) / sizeof(__m128i); - if (iterations > 255) { - iterations = 255; - } - size_t max_i = i + iterations * sizeof(__m128i) - sizeof(__m128i); - for (; i + 4 * sizeof(__m128i) <= max_i; i += 4 * sizeof(__m128i)) { - __m128i input1 = _mm_loadu_si128((const __m128i *)(str + i)); - __m128i input2 = - _mm_loadu_si128((const __m128i *)(str + i + sizeof(__m128i))); - __m128i input3 = - _mm_loadu_si128((const __m128i *)(str + i + 2 * sizeof(__m128i))); - __m128i input4 = - _mm_loadu_si128((const __m128i *)(str + i + 3 * sizeof(__m128i))); - __m128i input12 = - _mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input1), - _mm_cmpgt_epi8(_mm_setzero_si128(), input2)); - __m128i input34 = - 
_mm_add_epi8(_mm_cmpgt_epi8(_mm_setzero_si128(), input3), - _mm_cmpgt_epi8(_mm_setzero_si128(), input4)); - __m128i input1234 = _mm_add_epi8(input12, input34); - runner = _mm_sub_epi8(runner, input1234); - } - for (; i <= max_i; i += sizeof(__m128i)) { - __m128i more_input = _mm_loadu_si128((const __m128i *)(str + i)); - runner = _mm_sub_epi8(runner, - _mm_cmpgt_epi8(_mm_setzero_si128(), more_input)); - } - two_64bits = - _mm_add_epi64(two_64bits, _mm_sad_epu8(runner, _mm_setzero_si128())); - } - answer += - _mm_extract_epi64(two_64bits, 0) + _mm_extract_epi64(two_64bits, 1); - } else if (answer > 0) { // short string optimization - for (; i + 2 * sizeof(__m128i) <= len; i += 2 * sizeof(__m128i)) { - __m128i latin = _mm_loadu_si128((const __m128i *)(input + i)); - uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin); - answer += count_ones(non_ascii); - latin = _mm_loadu_si128((const __m128i *)(input + i) + 1); - non_ascii = (uint16_t)_mm_movemask_epi8(latin); - answer += count_ones(non_ascii); - } - for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i)) { - __m128i latin = _mm_loadu_si128((const __m128i *)(input + i)); - uint16_t non_ascii = (uint16_t)_mm_movemask_epi8(latin); - answer += count_ones(non_ascii); - } - } - return answer + scalar::latin1::utf8_length_from_latin1( - reinterpret_cast(str + i), len - i); + return length; } simdutf_warn_unused size_t implementation::utf32_length_from_utf16le( @@ -41084,35 +56210,35 @@ simdutf_warn_unused size_t implementation::utf16_length_from_utf8( simdutf_warn_unused size_t implementation::utf8_length_from_utf32( const char32_t *input, size_t length) const noexcept { - const __m128i v_00000000 = _mm_setzero_si128(); - const __m128i v_ffffff80 = _mm_set1_epi32((uint32_t)0xffffff80); - const __m128i v_fffff800 = _mm_set1_epi32((uint32_t)0xfffff800); - const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); + __m256i v_80 = __lasx_xvrepli_w(0x80); /*0x00000080*/ + __m256i v_800 = __lasx_xvldi(-3832); /*0x00000800*/ + __m256i v_10000 = __lasx_xvldi(-3583); /*0x00010000*/ size_t pos = 0; size_t count = 0; - for (; pos + 4 <= length; pos += 4) { - __m128i in = _mm_loadu_si128((__m128i *)(input + pos)); - const __m128i ascii_bytes_bytemask = - _mm_cmpeq_epi32(_mm_and_si128(in, v_ffffff80), v_00000000); - const __m128i one_two_bytes_bytemask = - _mm_cmpeq_epi32(_mm_and_si128(in, v_fffff800), v_00000000); - const __m128i two_bytes_bytemask = - _mm_xor_si128(one_two_bytes_bytemask, ascii_bytes_bytemask); - const __m128i one_two_three_bytes_bytemask = - _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000); - const __m128i three_bytes_bytemask = - _mm_xor_si128(one_two_three_bytes_bytemask, one_two_bytes_bytemask); - const uint16_t ascii_bytes_bitmask = - static_cast(_mm_movemask_epi8(ascii_bytes_bytemask)); - const uint16_t two_bytes_bitmask = - static_cast(_mm_movemask_epi8(two_bytes_bytemask)); - const uint16_t three_bytes_bitmask = - static_cast(_mm_movemask_epi8(three_bytes_bytemask)); - - size_t ascii_count = count_ones(ascii_bytes_bitmask) / 4; - size_t two_bytes_count = count_ones(two_bytes_bitmask) / 4; - size_t three_bytes_count = count_ones(three_bytes_bitmask) / 4; - count += 16 - 3 * ascii_count - 2 * two_bytes_count - three_bytes_count; + for (; pos + 8 <= length; pos += 8) { + __m256i in = + __lasx_xvld(reinterpret_cast(input + pos), 0); + __m256i ascii_bytes_bytemask = __lasx_xvslt_w(in, v_80); + __m256i one_two_bytes_bytemask = __lasx_xvslt_w(in, v_800); + __m256i two_bytes_bytemask = + 
__lasx_xvxor_v(one_two_bytes_bytemask, ascii_bytes_bytemask); + __m256i three_bytes_bytemask = + __lasx_xvxor_v(__lasx_xvslt_w(in, v_10000), one_two_bytes_bytemask); + + __m256i ascii_bytes = + __lasx_xvpcnt_w(__lasx_xvmskltz_w(ascii_bytes_bytemask)); + const uint32_t ascii_bytes_count = __lasx_xvpickve2gr_wu(ascii_bytes, 0) + + __lasx_xvpickve2gr_wu(ascii_bytes, 4); + __m256i two_bytes = __lasx_xvpcnt_w(__lasx_xvmskltz_w(two_bytes_bytemask)); + const uint32_t two_bytes_count = __lasx_xvpickve2gr_wu(two_bytes, 0) + + __lasx_xvpickve2gr_wu(two_bytes, 4); + __m256i three_bytes = + __lasx_xvpcnt_w(__lasx_xvmskltz_w(three_bytes_bytemask)); + const uint32_t three_bytes_count = __lasx_xvpickve2gr_wu(three_bytes, 0) + + __lasx_xvpickve2gr_wu(three_bytes, 4); + + count += + 32 - 3 * ascii_bytes_count - 2 * two_bytes_count - three_bytes_count; } return count + scalar::utf32::utf8_length_from_utf32(input + pos, length - pos); @@ -41120,17 +56246,14 @@ simdutf_warn_unused size_t implementation::utf8_length_from_utf32( simdutf_warn_unused size_t implementation::utf16_length_from_utf32( const char32_t *input, size_t length) const noexcept { - const __m128i v_00000000 = _mm_setzero_si128(); - const __m128i v_ffff0000 = _mm_set1_epi32((uint32_t)0xffff0000); + __m128i v_ffff = __lsx_vldi(-2304); /*0x0000ffff*/ size_t pos = 0; size_t count = 0; for (; pos + 4 <= length; pos += 4) { - __m128i in = _mm_loadu_si128((__m128i *)(input + pos)); - const __m128i surrogate_bytemask = - _mm_cmpeq_epi32(_mm_and_si128(in, v_ffff0000), v_00000000); - const uint16_t surrogate_bitmask = - static_cast(_mm_movemask_epi8(surrogate_bytemask)); - size_t surrogate_count = (16 - count_ones(surrogate_bitmask)) / 4; + __m128i in = __lsx_vld(reinterpret_cast(input + pos), 0); + __m128i surrogate_bytemask = __lsx_vslt_wu(v_ffff, in); + size_t surrogate_count = __lsx_vpickve2gr_bu( + __lsx_vpcnt_b(__lsx_vmskltz_w(surrogate_bytemask)), 0); count += 4 + surrogate_count; } return count + @@ -41206,18 +56329,12 @@ size_t implementation::binary_to_base64(const char *input, size_t length, return encode_base64(output, input, length, options); } } -} // namespace westmere +} // namespace lasx } // namespace simdutf -/* begin file src/simdutf/westmere/end.h */ -#if SIMDUTF_CAN_ALWAYS_RUN_WESTMERE -// nothing needed. -#else -SIMDUTF_UNTARGET_REGION -#endif - -/* end file src/simdutf/westmere/end.h */ -/* end file src/westmere/implementation.cpp */ +/* begin file src/simdutf/lasx/end.h */ +/* end file src/simdutf/lasx/end.h */ +/* end file src/lasx/implementation.cpp */ #endif SIMDUTF_POP_DISABLE_WARNINGS diff --git a/deps/simdutf/simdutf.h b/deps/simdutf/simdutf.h index 5f82ca372ccfe3..2d984f40e7bc3f 100644 --- a/deps/simdutf/simdutf.h +++ b/deps/simdutf/simdutf.h @@ -1,4 +1,4 @@ -/* auto-generated on 2024-11-21 10:33:28 -0500. Do not edit! */ +/* auto-generated on 2024-12-10 14:54:53 -0500. Do not edit! 
*/ /* begin file include/simdutf.h */ #ifndef SIMDUTF_H #define SIMDUTF_H @@ -178,7 +178,12 @@ #endif #elif defined(__loongarch_lp64) -// LoongArch 64-bit + #if defined(__loongarch_sx) && defined(__loongarch_asx) + #define SIMDUTF_IS_LSX 1 + #define SIMDUTF_IS_LASX 1 + #elif defined(__loongarch_sx) + #define SIMDUTF_IS_LSX 1 + #endif #else // The simdutf library is designed // for 64-bit processors and it seems that you are not @@ -670,7 +675,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS #define SIMDUTF_SIMDUTF_VERSION_H /** The version of simdutf being used (major.minor.revision) */ -#define SIMDUTF_VERSION "5.6.3" +#define SIMDUTF_VERSION "5.6.4" namespace simdutf { enum { @@ -685,7 +690,7 @@ enum { /** * The revision (major.minor.REVISION) of simdutf being used. */ - SIMDUTF_VERSION_REVISION = 3 + SIMDUTF_VERSION_REVISION = 4 }; } // namespace simdutf @@ -796,6 +801,8 @@ enum instruction_set { AVX512VPOPCNTDQ = 0x2000, RVV = 0x4000, ZVBB = 0x8000, + LSX = 0x40000, + LASX = 0x80000, }; #if defined(__PPC64__) @@ -987,6 +994,28 @@ static inline uint32_t detect_supported_architectures() { } return host_isa; } +#elif defined(__loongarch__) + #if defined(__linux__) + #include + // bits/hwcap.h + // #define HWCAP_LOONGARCH_LSX (1 << 4) + // #define HWCAP_LOONGARCH_LASX (1 << 5) + #endif + +static inline uint32_t detect_supported_architectures() { + uint32_t host_isa = instruction_set::DEFAULT; + #if defined(__linux__) + uint64_t hwcap = 0; + hwcap = getauxval(AT_HWCAP); + if (hwcap & HWCAP_LOONGARCH_LSX) { + host_isa |= instruction_set::LSX; + } + if (hwcap & HWCAP_LOONGARCH_LASX) { + host_isa |= instruction_set::LASX; + } + #endif + return host_isa; +} #else // fallback // includes 32-bit ARM. diff --git a/deps/sqlite/sqlite3.c b/deps/sqlite/sqlite3.c index 099c5482f68df0..c748d033461fae 100644 --- a/deps/sqlite/sqlite3.c +++ b/deps/sqlite/sqlite3.c @@ -1,6 +1,6 @@ /****************************************************************************** ** This file is an amalgamation of many separate C source files from SQLite -** version 3.47.1. By combining all the individual C code files into this +** version 3.47.2. By combining all the individual C code files into this ** single large file, the entire code can be compiled as a single translation ** unit. This allows many compilers to do optimizations that would not be ** possible if the files were compiled separately. Performance improvements @@ -18,7 +18,7 @@ ** separate file. This file contains only code for the core SQLite library. ** ** The content in this amalgamation comes from Fossil check-in -** b95d11e958643b969c47a8e5857f3793b9e6. +** 2aabe05e2e8cae4847a802ee2daddc1d7413. */ #define SQLITE_CORE 1 #define SQLITE_AMALGAMATION 1 @@ -462,9 +462,9 @@ extern "C" { ** [sqlite3_libversion_number()], [sqlite3_sourceid()], ** [sqlite_version()] and [sqlite_source_id()]. 
*/ -#define SQLITE_VERSION "3.47.1" -#define SQLITE_VERSION_NUMBER 3047001 -#define SQLITE_SOURCE_ID "2024-11-25 12:07:48 b95d11e958643b969c47a8e5857f3793b9e69700b8f1469371386369a26e577e" +#define SQLITE_VERSION "3.47.2" +#define SQLITE_VERSION_NUMBER 3047002 +#define SQLITE_SOURCE_ID "2024-12-07 20:39:59 2aabe05e2e8cae4847a802ee2daddc1d7413d8fc560254d93ee3e72c14685b6c" /* ** CAPI3REF: Run-Time Library Version Numbers @@ -35697,8 +35697,8 @@ SQLITE_PRIVATE int sqlite3AtoF(const char *z, double *pResult, int length, u8 en int eValid = 1; /* True exponent is either not used or is well-formed */ int nDigit = 0; /* Number of digits processed */ int eType = 1; /* 1: pure integer, 2+: fractional -1 or less: bad UTF16 */ + u64 s2; /* round-tripped significand */ double rr[2]; - u64 s2; assert( enc==SQLITE_UTF8 || enc==SQLITE_UTF16LE || enc==SQLITE_UTF16BE ); *pResult = 0.0; /* Default return value, in case of an error */ @@ -35801,7 +35801,7 @@ SQLITE_PRIVATE int sqlite3AtoF(const char *z, double *pResult, int length, u8 en e = (e*esign) + d; /* Try to adjust the exponent to make it smaller */ - while( e>0 && s<(LARGEST_UINT64/10) ){ + while( e>0 && s<((LARGEST_UINT64-0x7ff)/10) ){ s *= 10; e--; } @@ -35811,11 +35811,16 @@ SQLITE_PRIVATE int sqlite3AtoF(const char *z, double *pResult, int length, u8 en } rr[0] = (double)s; - s2 = (u64)rr[0]; -#if defined(_MSC_VER) && _MSC_VER<1700 - if( s2==0x8000000000000000LL ){ s2 = 2*(u64)(0.5*rr[0]); } -#endif - rr[1] = s>=s2 ? (double)(s - s2) : -(double)(s2 - s); + assert( sizeof(s2)==sizeof(rr[0]) ); + memcpy(&s2, &rr[0], sizeof(s2)); + if( s2<=0x43efffffffffffffLL ){ + s2 = (u64)rr[0]; + rr[1] = s>=s2 ? (double)(s - s2) : -(double)(s2 - s); + }else{ + rr[1] = 0.0; + } + assert( rr[1]<=1.0e-10*rr[0] ); /* Equal only when rr[0]==0.0 */ + if( e>0 ){ while( e>=100 ){ e -= 100; @@ -147605,32 +147610,32 @@ static Expr *substExpr( if( pSubst->isOuterJoin ){ ExprSetProperty(pNew, EP_CanBeNull); } - if( ExprHasProperty(pExpr,EP_OuterON|EP_InnerON) ){ - sqlite3SetJoinExpr(pNew, pExpr->w.iJoin, - pExpr->flags & (EP_OuterON|EP_InnerON)); - } - sqlite3ExprDelete(db, pExpr); - pExpr = pNew; - if( pExpr->op==TK_TRUEFALSE ){ - pExpr->u.iValue = sqlite3ExprTruthValue(pExpr); - pExpr->op = TK_INTEGER; - ExprSetProperty(pExpr, EP_IntValue); + if( pNew->op==TK_TRUEFALSE ){ + pNew->u.iValue = sqlite3ExprTruthValue(pNew); + pNew->op = TK_INTEGER; + ExprSetProperty(pNew, EP_IntValue); } /* Ensure that the expression now has an implicit collation sequence, ** just as it did when it was a column of a view or sub-query. */ { - CollSeq *pNat = sqlite3ExprCollSeq(pSubst->pParse, pExpr); + CollSeq *pNat = sqlite3ExprCollSeq(pSubst->pParse, pNew); CollSeq *pColl = sqlite3ExprCollSeq(pSubst->pParse, pSubst->pCList->a[iColumn].pExpr ); - if( pNat!=pColl || (pExpr->op!=TK_COLUMN && pExpr->op!=TK_COLLATE) ){ - pExpr = sqlite3ExprAddCollateString(pSubst->pParse, pExpr, + if( pNat!=pColl || (pNew->op!=TK_COLUMN && pNew->op!=TK_COLLATE) ){ + pNew = sqlite3ExprAddCollateString(pSubst->pParse, pNew, (pColl ? 
pColl->zName : "BINARY") ); } } - ExprClearProperty(pExpr, EP_Collate); + ExprClearProperty(pNew, EP_Collate); + if( ExprHasProperty(pExpr,EP_OuterON|EP_InnerON) ){ + sqlite3SetJoinExpr(pNew, pExpr->w.iJoin, + pExpr->flags & (EP_OuterON|EP_InnerON)); + } + sqlite3ExprDelete(db, pExpr); + pExpr = pNew; } } }else{ @@ -254938,7 +254943,7 @@ static void fts5SourceIdFunc( ){ assert( nArg==0 ); UNUSED_PARAM2(nArg, apUnused); - sqlite3_result_text(pCtx, "fts5: 2024-11-25 12:07:48 b95d11e958643b969c47a8e5857f3793b9e69700b8f1469371386369a26e577e", -1, SQLITE_TRANSIENT); + sqlite3_result_text(pCtx, "fts5: 2024-12-07 20:39:59 2aabe05e2e8cae4847a802ee2daddc1d7413d8fc560254d93ee3e72c14685b6c", -1, SQLITE_TRANSIENT); } /* diff --git a/deps/sqlite/sqlite3.h b/deps/sqlite/sqlite3.h index dbecc3fe896cf7..d8ce1482a352af 100644 --- a/deps/sqlite/sqlite3.h +++ b/deps/sqlite/sqlite3.h @@ -146,9 +146,9 @@ extern "C" { ** [sqlite3_libversion_number()], [sqlite3_sourceid()], ** [sqlite_version()] and [sqlite_source_id()]. */ -#define SQLITE_VERSION "3.47.1" -#define SQLITE_VERSION_NUMBER 3047001 -#define SQLITE_SOURCE_ID "2024-11-25 12:07:48 b95d11e958643b969c47a8e5857f3793b9e69700b8f1469371386369a26e577e" +#define SQLITE_VERSION "3.47.2" +#define SQLITE_VERSION_NUMBER 3047002 +#define SQLITE_SOURCE_ID "2024-12-07 20:39:59 2aabe05e2e8cae4847a802ee2daddc1d7413d8fc560254d93ee3e72c14685b6c" /* ** CAPI3REF: Run-Time Library Version Numbers diff --git a/deps/uv/AUTHORS b/deps/uv/AUTHORS index 807440b30e8488..041b7aff610f57 100644 --- a/deps/uv/AUTHORS +++ b/deps/uv/AUTHORS @@ -588,3 +588,5 @@ Raihaan Shouhell Rialbat Adam Poul T Lomholt +dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> +Thad House diff --git a/deps/uv/ChangeLog b/deps/uv/ChangeLog index e1d1aa32989124..dc2dd2790c57d3 100644 --- a/deps/uv/ChangeLog +++ b/deps/uv/ChangeLog @@ -1,4 +1,22 @@ -2024.10.11, Version 1.49.1 (Stable) +2024.10.18, Version 1.49.2 (Stable) + +Changes since version 1.49.1: + +* win,fs: remove trailing slash in junctions (Hüseyin Açacak) + +* Revert "linux: eliminate a read on eventfd per wakeup" (Ben Noordhuis) + +* win: Fix linked list logic in getaddrinfo (Thad House) + +* win: fix compilation against Windows 24H2 SDK (Thad House) + +* win: remap ERROR_NOACCESS and ERROR_BUFFER_OVERFLOW (Jameson Nash) + +* win,fs: match trailing slash presence in junctions to user input (Jameson + Nash) + + +2024.10.11, Version 1.49.1 (Stable), 8be336f4ee296d20e1c071a44d6adf279e202236 Changes since version 1.49.0: diff --git a/deps/uv/configure.ac b/deps/uv/configure.ac index e3ee8a840c6872..98c59363026f86 100644 --- a/deps/uv/configure.ac +++ b/deps/uv/configure.ac @@ -13,7 +13,7 @@ # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
AC_PREREQ(2.57) -AC_INIT([libuv], [1.49.1], [https://github.com/libuv/libuv/issues]) +AC_INIT([libuv], [1.49.2], [https://github.com/libuv/libuv/issues]) AC_CONFIG_MACRO_DIR([m4]) m4_include([m4/libuv-extra-automake-flags.m4]) m4_include([m4/as_case.m4]) diff --git a/deps/uv/include/uv/version.h b/deps/uv/include/uv/version.h index 77a8b2541749f9..cfa7871322e690 100644 --- a/deps/uv/include/uv/version.h +++ b/deps/uv/include/uv/version.h @@ -32,7 +32,7 @@ #define UV_VERSION_MAJOR 1 #define UV_VERSION_MINOR 49 -#define UV_VERSION_PATCH 1 +#define UV_VERSION_PATCH 2 #define UV_VERSION_IS_RELEASE 1 #define UV_VERSION_SUFFIX "" diff --git a/deps/uv/src/unix/async.c b/deps/uv/src/unix/async.c index bc97ec54c4fcc6..0ff2669e30a628 100644 --- a/deps/uv/src/unix/async.c +++ b/deps/uv/src/unix/async.c @@ -38,34 +38,6 @@ #include #endif -#if UV__KQUEUE_EVFILT_USER -static uv_once_t kqueue_runtime_detection_guard = UV_ONCE_INIT; -static int kqueue_evfilt_user_support = 1; - - -static void uv__kqueue_runtime_detection(void) { - int kq; - struct kevent ev[2]; - struct timespec timeout = {0, 0}; - - /* Perform the runtime detection to ensure that kqueue with - * EVFILT_USER actually works. */ - kq = kqueue(); - EV_SET(ev, UV__KQUEUE_EVFILT_USER_IDENT, EVFILT_USER, - EV_ADD | EV_CLEAR, 0, 0, 0); - EV_SET(ev + 1, UV__KQUEUE_EVFILT_USER_IDENT, EVFILT_USER, - 0, NOTE_TRIGGER, 0, 0); - if (kevent(kq, ev, 2, ev, 1, &timeout) < 1 || - ev[0].filter != EVFILT_USER || - ev[0].ident != UV__KQUEUE_EVFILT_USER_IDENT || - ev[0].flags & EV_ERROR) - /* If we wind up here, we can assume that EVFILT_USER is defined but - * broken on the current system. */ - kqueue_evfilt_user_support = 0; - uv__close(kq); -} -#endif - static void uv__async_send(uv_loop_t* loop); static int uv__async_start(uv_loop_t* loop); static void uv__cpu_relax(void); @@ -158,10 +130,8 @@ void uv__async_close(uv_async_t* handle) { static void uv__async_io(uv_loop_t* loop, uv__io_t* w, unsigned int events) { -#ifndef __linux__ char buf[1024]; ssize_t r; -#endif struct uv__queue queue; struct uv__queue* q; uv_async_t* h; @@ -169,12 +139,7 @@ static void uv__async_io(uv_loop_t* loop, uv__io_t* w, unsigned int events) { assert(w == &loop->async_io_watcher); -#ifndef __linux__ -#if UV__KQUEUE_EVFILT_USER - for (;!kqueue_evfilt_user_support;) { -#else for (;;) { -#endif r = read(w->fd, buf, sizeof(buf)); if (r == sizeof(buf)) @@ -191,7 +156,6 @@ static void uv__async_io(uv_loop_t* loop, uv__io_t* w, unsigned int events) { abort(); } -#endif /* !__linux__ */ uv__queue_move(&loop->async_handles, &queue); while (!uv__queue_empty(&queue)) { @@ -215,58 +179,34 @@ static void uv__async_io(uv_loop_t* loop, uv__io_t* w, unsigned int events) { static void uv__async_send(uv_loop_t* loop) { + const void* buf; + ssize_t len; int fd; - ssize_t r; -#ifdef __linux__ - uint64_t val; - - fd = loop->async_io_watcher.fd; /* eventfd */ - for (val = 1; /* empty */; val = 1) { - r = write(fd, &val, sizeof(uint64_t)); - if (r < 0) { - /* When EAGAIN occurs, the eventfd counter hits the maximum value of the unsigned 64-bit. - * We need to first drain the eventfd and then write again. - * - * Check out https://man7.org/linux/man-pages/man2/eventfd.2.html for details. - */ - if (errno == EAGAIN) { - /* It's ready to retry. */ - if (read(fd, &val, sizeof(uint64_t)) > 0 || errno == EAGAIN) { - continue; - } - } - /* Unknown error occurs. 
*/ - break; - } - return; - } -#else -#if UV__KQUEUE_EVFILT_USER - struct kevent ev; - - if (kqueue_evfilt_user_support) { - fd = loop->async_io_watcher.fd; /* magic number for EVFILT_USER */ - EV_SET(&ev, fd, EVFILT_USER, 0, NOTE_TRIGGER, 0, 0); - r = kevent(loop->backend_fd, &ev, 1, NULL, 0, NULL); - if (r == 0) - return; - else - abort(); + int r; + + buf = ""; + len = 1; + fd = loop->async_wfd; + +#if defined(__linux__) + if (fd == -1) { + static const uint64_t val = 1; + buf = &val; + len = sizeof(val); + fd = loop->async_io_watcher.fd; /* eventfd */ } #endif - fd = loop->async_wfd; /* write end of the pipe */ do - r = write(fd, "x", 1); + r = write(fd, buf, len); while (r == -1 && errno == EINTR); - if (r == 1) + if (r == len) return; if (r == -1) if (errno == EAGAIN || errno == EWOULDBLOCK) return; -#endif abort(); } @@ -275,9 +215,6 @@ static void uv__async_send(uv_loop_t* loop) { static int uv__async_start(uv_loop_t* loop) { int pipefd[2]; int err; -#if UV__KQUEUE_EVFILT_USER - struct kevent ev; -#endif if (loop->async_io_watcher.fd != -1) return 0; @@ -289,36 +226,6 @@ static int uv__async_start(uv_loop_t* loop) { pipefd[0] = err; pipefd[1] = -1; -#elif UV__KQUEUE_EVFILT_USER - uv_once(&kqueue_runtime_detection_guard, uv__kqueue_runtime_detection); - if (kqueue_evfilt_user_support) { - /* In order not to break the generic pattern of I/O polling, a valid - * file descriptor is required to take up a room in loop->watchers, - * thus we create one for that, but this fd will not be actually used, - * it's just a placeholder and magic number which is going to be closed - * during the cleanup, as other FDs. */ - err = uv__open_cloexec("/dev/null", O_RDONLY); - if (err < 0) - return err; - - pipefd[0] = err; - pipefd[1] = -1; - - /* When using EVFILT_USER event to wake up the kqueue, this event must be - * registered beforehand. Otherwise, calling kevent() to issue an - * unregistered EVFILT_USER event will get an ENOENT. - * Since uv__async_send() may happen before uv__io_poll() with multi-threads, - * we can't defer this registration of EVFILT_USER event as we did for other - * events, but must perform it right away. */ - EV_SET(&ev, err, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, 0); - err = kevent(loop->backend_fd, &ev, 1, NULL, 0, NULL); - if (err < 0) - return UV__ERR(errno); - } else { - err = uv__make_pipe(pipefd, UV_NONBLOCK_PIPE); - if (err < 0) - return err; - } #else err = uv__make_pipe(pipefd, UV_NONBLOCK_PIPE); if (err < 0) @@ -329,13 +236,6 @@ static int uv__async_start(uv_loop_t* loop) { uv__io_start(loop, &loop->async_io_watcher, POLLIN); loop->async_wfd = pipefd[1]; -#if UV__KQUEUE_EVFILT_USER - /* Prevent the EVFILT_USER event from being added to kqueue redundantly - * and mistakenly later in uv__io_poll(). */ - if (kqueue_evfilt_user_support) - loop->async_io_watcher.events = loop->async_io_watcher.pevents; -#endif - return 0; } diff --git a/deps/uv/src/unix/internal.h b/deps/uv/src/unix/internal.h index 568a55b55acb35..8d586b0b64a96c 100644 --- a/deps/uv/src/unix/internal.h +++ b/deps/uv/src/unix/internal.h @@ -35,10 +35,6 @@ #include #include #include -#if defined(__APPLE__) || defined(__DragonFly__) || \ - defined(__FreeBSD__) || defined(__NetBSD__) -#include -#endif #define uv__msan_unpoison(p, n) \ do { \ @@ -508,22 +504,4 @@ int uv__get_constrained_cpu(uv__cpu_constraint* constraint); #endif #endif -#if defined(EVFILT_USER) && defined(NOTE_TRIGGER) -/* EVFILT_USER is available since OS X 10.6, DragonFlyBSD 4.0, - * FreeBSD 8.1, and NetBSD 10.0. 
- * - * Note that even though EVFILT_USER is defined on the current system, - * it may still fail to work at runtime somehow. In that case, we fall - * back to pipe-based signaling. - */ -#define UV__KQUEUE_EVFILT_USER 1 -/* Magic number of identifier used for EVFILT_USER during runtime detection. - * There are no Google hits for this number when I create it. That way, - * people will be directed here if this number gets printed due to some - * kqueue error and they google for help. */ -#define UV__KQUEUE_EVFILT_USER_IDENT 0x1e7e7711 -#else -#define UV__KQUEUE_EVFILT_USER 0 -#endif - #endif /* UV_UNIX_INTERNAL_H_ */ diff --git a/deps/uv/src/unix/kqueue.c b/deps/uv/src/unix/kqueue.c index 876b717086c609..66aa166f053f52 100644 --- a/deps/uv/src/unix/kqueue.c +++ b/deps/uv/src/unix/kqueue.c @@ -367,17 +367,6 @@ void uv__io_poll(uv_loop_t* loop, int timeout) { continue; } -#if UV__KQUEUE_EVFILT_USER - if (ev->filter == EVFILT_USER) { - w = &loop->async_io_watcher; - assert(fd == w->fd); - uv__metrics_update_idle_time(loop); - w->cb(loop, w, w->events); - nevents++; - continue; - } -#endif - if (ev->filter == EVFILT_VNODE) { assert(w->events == POLLIN); assert(w->pevents == POLLIN); diff --git a/deps/uv/src/unix/linux.c b/deps/uv/src/unix/linux.c index 803a9a9d3f04c9..857a4ef8a6686f 100644 --- a/deps/uv/src/unix/linux.c +++ b/deps/uv/src/unix/linux.c @@ -1414,12 +1414,6 @@ void uv__io_poll(uv_loop_t* loop, int timeout) { w->events = w->pevents; e.events = w->pevents; - if (w == &loop->async_io_watcher) - /* Enable edge-triggered mode on async_io_watcher(eventfd), - * so that we're able to eliminate the overhead of reading - * the eventfd via system call on each event loop wakeup. - */ - e.events |= EPOLLET; e.data.fd = w->fd; fd = w->fd; diff --git a/deps/uv/src/win/error.c b/deps/uv/src/win/error.c index 58587c5fb785ea..7abf906bb5c823 100644 --- a/deps/uv/src/win/error.c +++ b/deps/uv/src/win/error.c @@ -69,7 +69,6 @@ int uv_translate_sys_error(int sys_errno) { } switch (sys_errno) { - case ERROR_NOACCESS: return UV_EACCES; case WSAEACCES: return UV_EACCES; case ERROR_ELEVATION_REQUIRED: return UV_EACCES; case ERROR_CANT_ACCESS_FILE: return UV_EACCES; @@ -96,7 +95,7 @@ int uv_translate_sys_error(int sys_errno) { case WSAECONNRESET: return UV_ECONNRESET; case ERROR_ALREADY_EXISTS: return UV_EEXIST; case ERROR_FILE_EXISTS: return UV_EEXIST; - case ERROR_BUFFER_OVERFLOW: return UV_EFAULT; + case ERROR_NOACCESS: return UV_EFAULT; case WSAEFAULT: return UV_EFAULT; case ERROR_HOST_UNREACHABLE: return UV_EHOSTUNREACH; case WSAEHOSTUNREACH: return UV_EHOSTUNREACH; @@ -127,6 +126,7 @@ int uv_translate_sys_error(int sys_errno) { case ERROR_TOO_MANY_OPEN_FILES: return UV_EMFILE; case WSAEMFILE: return UV_EMFILE; case WSAEMSGSIZE: return UV_EMSGSIZE; + case ERROR_BUFFER_OVERFLOW: return UV_ENAMETOOLONG; case ERROR_FILENAME_EXCED_RANGE: return UV_ENAMETOOLONG; case ERROR_NETWORK_UNREACHABLE: return UV_ENETUNREACH; case WSAENETUNREACH: return UV_ENETUNREACH; diff --git a/deps/uv/src/win/fs.c b/deps/uv/src/win/fs.c index 08b42eb14c972a..f2215bb3082178 100644 --- a/deps/uv/src/win/fs.c +++ b/deps/uv/src/win/fs.c @@ -2566,16 +2566,17 @@ static void fs__create_junction(uv_fs_t* req, const WCHAR* path, path_buf[path_buf_len++] = path[i]; } - path_buf[path_buf_len++] = L'\\'; + if (add_slash) + path_buf[path_buf_len++] = L'\\'; len = path_buf_len - start; + /* Insert null terminator */ + path_buf[path_buf_len++] = L'\0'; + /* Set the info about the substitute name */ 
buffer->MountPointReparseBuffer.SubstituteNameOffset = start * sizeof(WCHAR); buffer->MountPointReparseBuffer.SubstituteNameLength = len * sizeof(WCHAR); - /* Insert null terminator */ - path_buf[path_buf_len++] = L'\0'; - /* Copy the print name of the target path */ start = path_buf_len; add_slash = 0; @@ -2593,18 +2594,18 @@ static void fs__create_junction(uv_fs_t* req, const WCHAR* path, path_buf[path_buf_len++] = path[i]; } len = path_buf_len - start; - if (len == 2) { + if (len == 2 || add_slash) { path_buf[path_buf_len++] = L'\\'; len++; } + /* Insert another null terminator */ + path_buf[path_buf_len++] = L'\0'; + /* Set the info about the print name */ buffer->MountPointReparseBuffer.PrintNameOffset = start * sizeof(WCHAR); buffer->MountPointReparseBuffer.PrintNameLength = len * sizeof(WCHAR); - /* Insert another null terminator */ - path_buf[path_buf_len++] = L'\0'; - /* Calculate how much buffer space was actually used */ used_buf_size = FIELD_OFFSET(REPARSE_DATA_BUFFER, MountPointReparseBuffer.PathBuffer) + path_buf_len * sizeof(WCHAR); diff --git a/deps/uv/src/win/getaddrinfo.c b/deps/uv/src/win/getaddrinfo.c index f20e10d49d974a..4b8ee75a0622f6 100644 --- a/deps/uv/src/win/getaddrinfo.c +++ b/deps/uv/src/win/getaddrinfo.c @@ -191,8 +191,9 @@ static void uv__getaddrinfo_done(struct uv__work* w, int status) { if (addrinfow_ptr == NULL) break; cur_off = align_offset(cur_off, sizeof(void *)); - addrinfo_ptr = (struct addrinfo *)(alloc_ptr + cur_off); - addrinfo_ptr->ai_next = addrinfo_ptr; + struct addrinfo *next_addrinfo_ptr = (struct addrinfo *)(alloc_ptr + cur_off); + addrinfo_ptr->ai_next = next_addrinfo_ptr; + addrinfo_ptr = next_addrinfo_ptr; } req->addrinfo = (struct addrinfo*)alloc_ptr; } else { diff --git a/deps/uv/src/win/winapi.h b/deps/uv/src/win/winapi.h index 548081f23a9276..5800e70dfd7d11 100644 --- a/deps/uv/src/win/winapi.h +++ b/deps/uv/src/win/winapi.h @@ -4125,6 +4125,12 @@ typedef const UNICODE_STRING *PCUNICODE_STRING; # define DEVICE_TYPE DWORD #endif +#ifndef NTDDI_WIN11_ZN +# define NTDDI_WIN11_ZN 0x0A00000E +#endif + +/* API is defined in newer SDKS */ +#if (NTDDI_VERSION < NTDDI_WIN11_ZN) typedef struct _FILE_STAT_BASIC_INFORMATION { LARGE_INTEGER FileId; LARGE_INTEGER CreationTime; @@ -4142,6 +4148,7 @@ typedef struct _FILE_STAT_BASIC_INFORMATION { FILE_ID_128 FileId128; LARGE_INTEGER VolumeSerialNumber; } FILE_STAT_BASIC_INFORMATION; +#endif /* MinGW already has a definition for REPARSE_DATA_BUFFER, but mingw-w64 does * not. 
@@ -4783,6 +4790,8 @@ typedef struct _TCP_INITIAL_RTO_PARAMETERS { #endif /* from winnt.h */ +/* API is defined in newer SDKS */ +#if (NTDDI_VERSION < NTDDI_WIN11_ZN) typedef enum _FILE_INFO_BY_NAME_CLASS { FileStatByNameInfo, FileStatLxByNameInfo, @@ -4790,6 +4799,7 @@ typedef enum _FILE_INFO_BY_NAME_CLASS { FileStatBasicByNameInfo, MaximumFileInfoByNameClass } FILE_INFO_BY_NAME_CLASS; +#endif typedef BOOL(WINAPI* sGetFileInformationByName)( PCWSTR FileName, diff --git a/deps/uv/test/test-error.c b/deps/uv/test/test-error.c index 2c6d0ca49790e0..b6e18b0f052eae 100644 --- a/deps/uv/test/test-error.c +++ b/deps/uv/test/test-error.c @@ -64,7 +64,7 @@ TEST_IMPL(error_message) { TEST_IMPL(sys_error) { #if defined(_WIN32) - ASSERT_EQ(uv_translate_sys_error(ERROR_NOACCESS), UV_EACCES); + ASSERT_EQ(uv_translate_sys_error(ERROR_NOACCESS), UV_EFAULT); ASSERT_EQ(uv_translate_sys_error(ERROR_ELEVATION_REQUIRED), UV_EACCES); ASSERT_EQ(uv_translate_sys_error(WSAEADDRINUSE), UV_EADDRINUSE); ASSERT_EQ(uv_translate_sys_error(ERROR_BAD_PIPE), UV_EPIPE); diff --git a/deps/uv/test/test-fs.c b/deps/uv/test/test-fs.c index ff0f9fc89a2d1b..33cbd428707c36 100644 --- a/deps/uv/test/test-fs.c +++ b/deps/uv/test/test-fs.c @@ -2379,8 +2379,8 @@ int test_symlink_dir_impl(int type) { strcpy(test_dir_abs_buf, "\\\\?\\"); uv_cwd(test_dir_abs_buf + 4, &test_dir_abs_size); test_dir_abs_size += 4; - strcat(test_dir_abs_buf, "\\test_dir\\"); - test_dir_abs_size += strlen("\\test_dir\\"); + strcat(test_dir_abs_buf, "\\test_dir"); + test_dir_abs_size += strlen("\\test_dir"); test_dir = test_dir_abs_buf; #else uv_cwd(test_dir_abs_buf, &test_dir_abs_size); @@ -2435,8 +2435,8 @@ int test_symlink_dir_impl(int type) { r = uv_fs_realpath(NULL, &req, "test_dir_symlink", NULL); ASSERT_OK(r); #ifdef _WIN32 - ASSERT_EQ(strlen(req.ptr), test_dir_abs_size - 5); - ASSERT_OK(_strnicmp(req.ptr, test_dir + 4, test_dir_abs_size - 5)); + ASSERT_EQ(strlen(req.ptr), test_dir_abs_size - 4); + ASSERT_OK(_strnicmp(req.ptr, test_dir + 4, test_dir_abs_size - 4)); #else ASSERT_OK(strcmp(req.ptr, test_dir_abs_buf)); #endif diff --git a/deps/v8/include/v8config.h b/deps/v8/include/v8config.h index b6d087b958edc1..73a6a91d49bf0e 100644 --- a/deps/v8/include/v8config.h +++ b/deps/v8/include/v8config.h @@ -581,11 +581,15 @@ path. Add it with -I to the command line // functions. // Use like: // V8_NOINLINE V8_PRESERVE_MOST void UnlikelyMethod(); +#if V8_OS_WIN +# define V8_PRESERVE_MOST +#else #if V8_HAS_ATTRIBUTE_PRESERVE_MOST # define V8_PRESERVE_MOST __attribute__((preserve_most)) #else # define V8_PRESERVE_MOST /* NOT SUPPORTED */ #endif +#endif // A macro (V8_DEPRECATED) to mark classes or functions as deprecated. diff --git a/doc/api/addons.md b/doc/api/addons.md index e0e00dca0b9e8b..8e2864952e0841 100644 --- a/doc/api/addons.md +++ b/doc/api/addons.md @@ -72,6 +72,7 @@ namespace demo { using v8::FunctionCallbackInfo; using v8::Isolate; using v8::Local; +using v8::NewStringType; using v8::Object; using v8::String; using v8::Value; @@ -79,7 +80,7 @@ using v8::Value; void Method(const FunctionCallbackInfo& args) { Isolate* isolate = args.GetIsolate(); args.GetReturnValue().Set(String::NewFromUtf8( - isolate, "world").ToLocalChecked()); + isolate, "world", NewStringType::kNormal).ToLocalChecked()); } void Initialize(Local exports) { diff --git a/doc/api/buffer.md b/doc/api/buffer.md index d72e8720c688fa..c07443601c8b67 100644 --- a/doc/api/buffer.md +++ b/doc/api/buffer.md @@ -1042,7 +1042,8 @@ in `list` by adding their lengths. 
If `totalLength` is provided, it is coerced to an unsigned integer. If the combined length of the `Buffer`s in `list` exceeds `totalLength`, the result is -truncated to `totalLength`. +truncated to `totalLength`. If the combined length of the `Buffer`s in `list` is +less than `totalLength`, the remaining space is filled with zeros. ```mjs import { Buffer } from 'node:buffer'; diff --git a/doc/api/cli.md b/doc/api/cli.md index 8a4f376360008f..271ede0755cefb 100644 --- a/doc/api/cli.md +++ b/doc/api/cli.md @@ -123,7 +123,7 @@ require('nodejs-addon-example'); ``` ```console -$ node --experimental-permission --allow-fs-read=* index.js +$ node --permission --allow-fs-read=* index.js node:internal/modules/cjs/loader:1319 return process.dlopen(module, path.toNamespacedPath(filename)); ^ @@ -165,7 +165,7 @@ childProcess.spawn('node', ['-e', 'require("fs").writeFileSync("/new-file", "exa ``` ```console -$ node --experimental-permission --allow-fs-read=* index.js +$ node --permission --allow-fs-read=* index.js node:internal/child_process:388 const err = this._handle.spawn(options); ^ @@ -189,12 +189,15 @@ Error: Access to this API has been restricted -> Stability: 1.1 - Active development +> Stability: 2 - Stable. This flag configures file system read permissions using the [Permission Model][]. @@ -210,7 +213,7 @@ Examples can be found in the [File System Permissions][] documentation. The initializer module also needs to be allowed. Consider the following example: ```console -$ node --experimental-permission index.js +$ node --permission index.js Error: Access to this API has been restricted at node:internal/main/run_main_module:23:47 { @@ -223,7 +226,7 @@ Error: Access to this API has been restricted The process needs to have access to the `index.js` module: ```bash -node --experimental-permission --allow-fs-read=/path/to/index.js index.js +node --permission --allow-fs-read=/path/to/index.js index.js ``` ### `--allow-fs-write` @@ -231,12 +234,15 @@ node --experimental-permission --allow-fs-read=/path/to/index.js index.js -> Stability: 1.1 - Active development +> Stability: 2 - Stable. This flag configures file system write permissions using the [Permission Model][]. @@ -282,7 +288,7 @@ new WASI({ ``` ```console -$ node --experimental-permission --allow-fs-read=* index.js +$ node --permission --allow-fs-read=* index.js Error: Access to this API has been restricted at node:internal/main/run_main_module:30:49 { @@ -313,7 +319,7 @@ new Worker(__filename); ``` ```console -$ node --experimental-permission --allow-fs-read=* index.js +$ node --permission --allow-fs-read=* index.js Error: Access to this API has been restricted at node:internal/main/run_main_module:17:47 { @@ -949,24 +955,6 @@ added: Enable experimental support for the network inspection with Chrome DevTools. -### `--experimental-permission` - - - -> Stability: 1.1 - Active development - -Enable the Permission Model for current process. When enabled, the -following permissions are restricted: - -* File System - manageable through - [`--allow-fs-read`][], [`--allow-fs-write`][] flags -* Child Process - manageable through [`--allow-child-process`][] flag -* Worker Threads - manageable through [`--allow-worker`][] flag -* WASI - manageable through [`--allow-wasi`][] flag -* Addons - manageable through [`--allow-addons`][] flag - ### `--experimental-print-required-tla` + +> Stability: 2 - Stable. + +Enable the Permission Model for current process. 
When enabled, the +following permissions are restricted: + +* File System - manageable through + [`--allow-fs-read`][], [`--allow-fs-write`][] flags +* Child Process - manageable through [`--allow-child-process`][] flag +* Worker Threads - manageable through [`--allow-worker`][] flag +* WASI - manageable through [`--allow-wasi`][] flag +* Addons - manageable through [`--allow-addons`][] flag + ### `--preserve-symlinks` + +Prints information about usage of [Loading ECMAScript modules using `require()`][]. + +When `mode` is `all`, all usage is printed. When `mode` is `no-node-modules`, usage +from the `node_modules` folder is excluded. + ### `--trace-sigint` + +An error occurred while loading a SQLite extension. + ### `ERR_MEMORY_ALLOCATION_FAILED` diff --git a/doc/api/esm.md b/doc/api/esm.md index ecb27680129737..23348d28c3649c 100644 --- a/doc/api/esm.md +++ b/doc/api/esm.md @@ -340,7 +340,7 @@ modules it can be used to load ES modules. * {Object} The `import.meta` meta property is an `Object` that contains the following -properties. +properties. It is only supported in ES modules. ### `import.meta.dirname` diff --git a/doc/api/events.md b/doc/api/events.md index 30985b1ce0c12f..be9de5d9aca0a2 100644 --- a/doc/api/events.md +++ b/doc/api/events.md @@ -1171,6 +1171,10 @@ that a "possible EventEmitter memory leak" has been detected. For any single `EventEmitter`, the `emitter.getMaxListeners()` and `emitter.setMaxListeners()` methods can be used to temporarily avoid this warning: +`defaultMaxListeners` has no effect on `AbortSignal` instances. While it is +still possible to use [`emitter.setMaxListeners(n)`][] to set a warning limit +for individual `AbortSignal` instances, per default `AbortSignal` instances will not warn. + ```mjs import { EventEmitter } from 'node:events'; const emitter = new EventEmitter(); diff --git a/doc/api/http.md b/doc/api/http.md index 9b9175a003f56c..886bbe26ecc95c 100644 --- a/doc/api/http.md +++ b/doc/api/http.md @@ -3848,8 +3848,13 @@ changes: * `port` {number} Port of remote server. **Default:** `defaultPort` if set, else `80`. * `protocol` {string} Protocol to use. **Default:** `'http:'`. + * `setDefaultHeaders` {boolean}: Specifies whether or not to automatically add + default headers such as `Connection`, `Content-Length`, `Transfer-Encoding`, + and `Host`. If set to `false` then all necessary headers must be added + manually. Defaults to `true`. * `setHost` {boolean}: Specifies whether or not to automatically add the - `Host` header. Defaults to `true`. + `Host` header. If provided, this overrides `setDefaultHeaders`. Defaults to + `true`. * `signal` {AbortSignal}: An AbortSignal that may be used to abort an ongoing request. * `socketPath` {string} Unix domain socket. Cannot be used if one of `host` diff --git a/doc/api/module.md b/doc/api/module.md index 19346182bf42d7..40615c39728cab 100644 --- a/doc/api/module.md +++ b/doc/api/module.md @@ -21,6 +21,10 @@ added: - v9.3.0 - v8.10.0 - v6.13.0 +changes: + - version: v23.5.0 + pr-url: https://github.com/nodejs/node/pull/56185 + description: The list now also contains prefix-only modules. --> * {string\[]} @@ -28,8 +32,6 @@ added: A list of the names of all modules provided by Node.js. Can be used to verify if a module is maintained by a third party or not. -Note: the list doesn't contain [prefix-only modules][] like `node:test`. - `module` in this context isn't the same object that's provided by the [module wrapper][]. 
To access it, require the `Module` module: @@ -64,159 +66,6 @@ const require = createRequire(import.meta.url); const siblingModule = require('./sibling-module'); ``` -### `module.constants.compileCacheStatus` - - - -> Stability: 1.1 - Active Development - -The following constants are returned as the `status` field in the object returned by -[`module.enableCompileCache()`][] to indicate the result of the attempt to enable the -[module compile cache][]. - - - - - - - - - - - - - - - - - - - - - - -
- Constant | Description
- ENABLED | Node.js has enabled the compile cache successfully. The directory used to store the compile cache will be returned in the directory field in the returned object.
- ALREADY_ENABLED | The compile cache has already been enabled before, either by a previous call to module.enableCompileCache(), or by the NODE_COMPILE_CACHE=dir environment variable. The directory used to store the compile cache will be returned in the directory field in the returned object.
- FAILED | Node.js fails to enable the compile cache. This can be caused by the lack of permission to use the specified directory, or various kinds of file system errors. The detail of the failure will be returned in the message field in the returned object.
- DISABLED | Node.js cannot enable the compile cache because the environment variable NODE_DISABLE_COMPILE_CACHE=1 has been set.
- -### `module.enableCompileCache([cacheDir])` - - - -> Stability: 1.1 - Active Development - -* `cacheDir` {string|undefined} Optional path to specify the directory where the compile cache - will be stored/retrieved. -* Returns: {Object} - * `status` {integer} One of the [`module.constants.compileCacheStatus`][] - * `message` {string|undefined} If Node.js cannot enable the compile cache, this contains - the error message. Only set if `status` is `module.constants.compileCacheStatus.FAILED`. - * `directory` {string|undefined} If the compile cache is enabled, this contains the directory - where the compile cache is stored. Only set if `status` is - `module.constants.compileCacheStatus.ENABLED` or - `module.constants.compileCacheStatus.ALREADY_ENABLED`. - -Enable [module compile cache][] in the current Node.js instance. - -If `cacheDir` is not specified, Node.js will either use the directory specified by the -[`NODE_COMPILE_CACHE=dir`][] environment variable if it's set, or use -`path.join(os.tmpdir(), 'node-compile-cache')` otherwise. For general use cases, it's -recommended to call `module.enableCompileCache()` without specifying the `cacheDir`, -so that the directory can be overridden by the `NODE_COMPILE_CACHE` environment -variable when necessary. - -Since compile cache is supposed to be a quiet optimization that is not required for the -application to be functional, this method is designed to not throw any exception when the -compile cache cannot be enabled. Instead, it will return an object containing an error -message in the `message` field to aid debugging. -If compile cache is enabled successfully, the `directory` field in the returned object -contains the path to the directory where the compile cache is stored. The `status` -field in the returned object would be one of the `module.constants.compileCacheStatus` -values to indicate the result of the attempt to enable the [module compile cache][]. - -This method only affects the current Node.js instance. To enable it in child worker threads, -either call this method in child worker threads too, or set the -`process.env.NODE_COMPILE_CACHE` value to compile cache directory so the behavior can -be inherited into the child workers. The directory can be obtained either from the -`directory` field returned by this method, or with [`module.getCompileCacheDir()`][]. - -#### Module compile cache - - - -The module compile cache can be enabled either using the [`module.enableCompileCache()`][] -method or the [`NODE_COMPILE_CACHE=dir`][] environment variable. After it is enabled, -whenever Node.js compiles a CommonJS or a ECMAScript Module, it will use on-disk -[V8 code cache][] persisted in the specified directory to speed up the compilation. -This may slow down the first load of a module graph, but subsequent loads of the same module -graph may get a significant speedup if the contents of the modules do not change. - -To clean up the generated compile cache on disk, simply remove the cache directory. The cache -directory will be recreated the next time the same directory is used for for compile cache -storage. To avoid filling up the disk with stale cache, it is recommended to use a directory -under the [`os.tmpdir()`][]. If the compile cache is enabled by a call to -[`module.enableCompileCache()`][] without specifying the directory, Node.js will use -the [`NODE_COMPILE_CACHE=dir`][] environment variable if it's set, or defaults -to `path.join(os.tmpdir(), 'node-compile-cache')` otherwise. 
To locate the compile cache -directory used by a running Node.js instance, use [`module.getCompileCacheDir()`][]. - -Currently when using the compile cache with [V8 JavaScript code coverage][], the -coverage being collected by V8 may be less precise in functions that are -deserialized from the code cache. It's recommended to turn this off when -running tests to generate precise coverage. - -The enabled module compile cache can be disabled by the [`NODE_DISABLE_COMPILE_CACHE=1`][] -environment variable. This can be useful when the compile cache leads to unexpected or -undesired behaviors (e.g. less precise test coverage). - -Compilation cache generated by one version of Node.js can not be reused by a different -version of Node.js. Cache generated by different versions of Node.js will be stored -separately if the same base directory is used to persist the cache, so they can co-exist. - -At the moment, when the compile cache is enabled and a module is loaded afresh, the -code cache is generated from the compiled code immediately, but will only be written -to disk when the Node.js instance is about to exit. This is subject to change. The -[`module.flushCompileCache()`][] method can be used to ensure the accumulated code cache -is flushed to disk in case the application wants to spawn other Node.js instances -and let them share the cache long before the parent exits. - -### `module.getCompileCacheDir()` - - - -> Stability: 1.1 - Active Development - -* Returns: {string|undefined} Path to the [module compile cache][] directory if it is enabled, - or `undefined` otherwise. - ### `module.findPackageJSON(specifier[, base])` + +> Stability: 1.1 - Active development + +* `options` {Object} + * `load` {Function|undefined} See [load hook][]. **Default:** `undefined`. + * `resolve` {Function|undefined} See [resolve hook][]. **Default:** `undefined`. + +Register [hooks][] that customize Node.js module resolution and loading behavior. +See [Customization hooks][]. + +### `module.stripTypeScriptTypes(code[, options])` + +The module compile cache can be enabled either using the [`module.enableCompileCache()`][] +method or the [`NODE_COMPILE_CACHE=dir`][] environment variable. After it is enabled, +whenever Node.js compiles a CommonJS or a ECMAScript Module, it will use on-disk +[V8 code cache][] persisted in the specified directory to speed up the compilation. +This may slow down the first load of a module graph, but subsequent loads of the same module +graph may get a significant speedup if the contents of the modules do not change. + +To clean up the generated compile cache on disk, simply remove the cache directory. The cache +directory will be recreated the next time the same directory is used for for compile cache +storage. To avoid filling up the disk with stale cache, it is recommended to use a directory +under the [`os.tmpdir()`][]. If the compile cache is enabled by a call to +[`module.enableCompileCache()`][] without specifying the directory, Node.js will use +the [`NODE_COMPILE_CACHE=dir`][] environment variable if it's set, or defaults +to `path.join(os.tmpdir(), 'node-compile-cache')` otherwise. To locate the compile cache +directory used by a running Node.js instance, use [`module.getCompileCacheDir()`][]. + +Currently when using the compile cache with [V8 JavaScript code coverage][], the +coverage being collected by V8 may be less precise in functions that are +deserialized from the code cache. It's recommended to turn this off when +running tests to generate precise coverage. 
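To make the workflow just described concrete, here is a minimal sketch that opts into the compile cache at startup and reports the directory in use. It relies only on the `module.enableCompileCache()`, `module.getCompileCacheDir()` and `module.constants.compileCacheStatus` APIs described in this section; the log messages are illustrative, not prescribed behavior.

```mjs
// A minimal sketch: opt into the module compile cache and report the directory in use.
import { enableCompileCache, getCompileCacheDir, constants } from 'node:module';

const result = enableCompileCache();
if (result.status === constants.compileCacheStatus.FAILED) {
  // The cache is an optional optimization; failure to enable it should not break the app.
  console.warn(`Compile cache not enabled: ${result.message}`);
} else if (result.directory !== undefined) {
  console.log(`Compile cache stored in ${getCompileCacheDir()}`);
}
```

Running a snippet like this before other modules are loaded lets subsequent loads of the same module graph reuse the on-disk V8 code cache, as described above.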
+ +The enabled module compile cache can be disabled by the [`NODE_DISABLE_COMPILE_CACHE=1`][] +environment variable. This can be useful when the compile cache leads to unexpected or +undesired behaviors (e.g. less precise test coverage). + +Compilation cache generated by one version of Node.js can not be reused by a different +version of Node.js. Cache generated by different versions of Node.js will be stored +separately if the same base directory is used to persist the cache, so they can co-exist. + +At the moment, when the compile cache is enabled and a module is loaded afresh, the +code cache is generated from the compiled code immediately, but will only be written +to disk when the Node.js instance is about to exit. This is subject to change. The +[`module.flushCompileCache()`][] method can be used to ensure the accumulated code cache +is flushed to disk in case the application wants to spawn other Node.js instances +and let them share the cache long before the parent exits. + +### `module.constants.compileCacheStatus` + + + +> Stability: 1.1 - Active Development + +The following constants are returned as the `status` field in the object returned by +[`module.enableCompileCache()`][] to indicate the result of the attempt to enable the +[module compile cache][]. + + + + + + + + + + + + + + + + + + + + + + +
+ Constant | Description
+ ENABLED | Node.js has enabled the compile cache successfully. The directory used to store the compile cache will be returned in the directory field in the returned object.
+ ALREADY_ENABLED | The compile cache has already been enabled before, either by a previous call to module.enableCompileCache(), or by the NODE_COMPILE_CACHE=dir environment variable. The directory used to store the compile cache will be returned in the directory field in the returned object.
+ FAILED | Node.js fails to enable the compile cache. This can be caused by the lack of permission to use the specified directory, or various kinds of file system errors. The detail of the failure will be returned in the message field in the returned object.
+ DISABLED | Node.js cannot enable the compile cache because the environment variable NODE_DISABLE_COMPILE_CACHE=1 has been set.
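As a rough illustration of how the constants in the table above might be consumed, the following sketch maps the `status` returned by `module.enableCompileCache()` back to the constant names; the wording of the log output is an assumption made for demonstration only.

```mjs
// Sketch: interpret the status returned by enableCompileCache() using the documented constants.
import { enableCompileCache, constants } from 'node:module';

const { ENABLED, ALREADY_ENABLED, FAILED, DISABLED } = constants.compileCacheStatus;
const { status, message, directory } = enableCompileCache();

switch (status) {
  case ENABLED:
  case ALREADY_ENABLED:
    console.log(`Compile cache active in ${directory}`);
    break;
  case FAILED:
    console.warn(`Could not enable the compile cache: ${message}`);
    break;
  case DISABLED:
    console.log('Compile cache disabled by NODE_DISABLE_COMPILE_CACHE=1');
    break;
}
```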
+ +### `module.enableCompileCache([cacheDir])` + + + +> Stability: 1.1 - Active Development + +* `cacheDir` {string|undefined} Optional path to specify the directory where the compile cache + will be stored/retrieved. +* Returns: {Object} + * `status` {integer} One of the [`module.constants.compileCacheStatus`][] + * `message` {string|undefined} If Node.js cannot enable the compile cache, this contains + the error message. Only set if `status` is `module.constants.compileCacheStatus.FAILED`. + * `directory` {string|undefined} If the compile cache is enabled, this contains the directory + where the compile cache is stored. Only set if `status` is + `module.constants.compileCacheStatus.ENABLED` or + `module.constants.compileCacheStatus.ALREADY_ENABLED`. + +Enable [module compile cache][] in the current Node.js instance. + +If `cacheDir` is not specified, Node.js will either use the directory specified by the +[`NODE_COMPILE_CACHE=dir`][] environment variable if it's set, or use +`path.join(os.tmpdir(), 'node-compile-cache')` otherwise. For general use cases, it's +recommended to call `module.enableCompileCache()` without specifying the `cacheDir`, +so that the directory can be overridden by the `NODE_COMPILE_CACHE` environment +variable when necessary. + +Since compile cache is supposed to be a quiet optimization that is not required for the +application to be functional, this method is designed to not throw any exception when the +compile cache cannot be enabled. Instead, it will return an object containing an error +message in the `message` field to aid debugging. +If compile cache is enabled successfully, the `directory` field in the returned object +contains the path to the directory where the compile cache is stored. The `status` +field in the returned object would be one of the `module.constants.compileCacheStatus` +values to indicate the result of the attempt to enable the [module compile cache][]. + +This method only affects the current Node.js instance. To enable it in child worker threads, +either call this method in child worker threads too, or set the +`process.env.NODE_COMPILE_CACHE` value to compile cache directory so the behavior can +be inherited into the child workers. The directory can be obtained either from the +`directory` field returned by this method, or with [`module.getCompileCacheDir()`][]. + +### `module.flushCompileCache()` + + + +> Stability: 1.1 - Active Development + +Flush the [module compile cache][] accumulated from modules already loaded +in the current Node.js instance to disk. This returns after all the flushing +file system operations come to an end, no matter they succeed or not. If there +are any errors, this will fail silently, since compile cache misses should not +interfere with the actual operation of the application. + +### `module.getCompileCacheDir()` + + + +> Stability: 1.1 - Active Development + +* Returns: {string|undefined} Path to the [module compile cache][] directory if it is enabled, + or `undefined` otherwise. + ## Customization Hooks @@ -497,6 +529,9 @@ import('node:fs').then((esmFS) => { -> Stability: 1.2 - Release candidate - +> Stability: 1.2 - Release candidate (asynchronous version) +> Stability: 1.1 - Active development (synchronous version) + +There are two types of module customization hooks that are currently supported: + +1. `module.register(specifier[, parentURL][, options])` which takes a module that + exports asynchronous hook functions. The functions are run on a separate loader + thread. +2. 
`module.registerHooks(options)` which takes synchronous hook functions that are + run directly on the thread where the module is loaded. + ### Enabling -Module resolution and loading can be customized by registering a file which -exports a set of hooks. This can be done using the [`register`][] method -from `node:module`, which you can run before your application code by -using the `--import` flag: +Module resolution and loading can be customized by: + +1. Registering a file which exports a set of asynchronous hook functions, using the + [`register`][] method from `node:module`, +2. Registering a set of synchronous hook functions using the [`registerHooks`][] method + from `node:module`. + +The hooks can be registered before the application code is run by using the +[`--import`][] or [`--require`][] flag: ```bash node --import ./register-hooks.js ./my-app.js +node --require ./register-hooks.js ./my-app.js ``` ```mjs // register-hooks.js +// This file can only be require()-ed if it doesn't contain top-level await. +// Use module.register() to register asynchronous hooks in a dedicated thread. import { register } from 'node:module'; - register('./hooks.mjs', import.meta.url); ``` @@ -541,24 +592,46 @@ register('./hooks.mjs', import.meta.url); // register-hooks.js const { register } = require('node:module'); const { pathToFileURL } = require('node:url'); - +// Use module.register() to register asynchronous hooks in a dedicated thread. register('./hooks.mjs', pathToFileURL(__filename)); ``` -The file passed to `--import` can also be an export from a dependency: +```mjs +// Use module.registerHooks() to register synchronous hooks in the main thread. +import { registerHooks } from 'node:module'; +registerHooks({ + resolve(specifier, context, nextResolve) { /* implementation */ }, + load(url, context, nextLoad) { /* implementation */ }, +}); +``` + +```cjs +// Use module.registerHooks() to register synchronous hooks in the main thread. +const { registerHooks } = require('node:module'); +registerHooks({ + resolve(specifier, context, nextResolve) { /* implementation */ }, + load(url, context, nextLoad) { /* implementation */ }, +}); +``` + +The file passed to `--import` or `--require` can also be an export from a dependency: ```bash node --import some-package/register ./my-app.js +node --require some-package/register ./my-app.js ``` Where `some-package` has an [`"exports"`][] field defining the `/register` export to map to a file that calls `register()`, like the following `register-hooks.js` example. -Using `--import` ensures that the hooks are registered before any application -files are imported, including the entry point of the application. Alternatively, -`register` can be called from the entry point, but dynamic `import()` must be -used for any code that should be run after the hooks are registered: +Using `--import` or `--require` ensures that the hooks are registered before any +application files are imported, including the entry point of the application and for +any worker threads by default as well. + +Alternatively, `register()` and `registerHooks()` can be called from the entry point, +though dynamic `import()` must be used for any ESM code that should be run after the hooks +are registered. 
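For the synchronous variant, a minimal sketch (the pass-through hook bodies are
placeholders) registers the hooks at the top of the entry point before any dynamic
`import()`:

```mjs
// entrypoint.mjs
import { registerHooks } from 'node:module';

registerHooks({
  // Placeholder hooks that simply defer to the rest of the chain.
  resolve(specifier, context, nextResolve) { return nextResolve(specifier, context); },
  load(url, context, nextLoad) { return nextLoad(url, context); },
});

// Modules loaded from here on, including via dynamic import(), see the hooks.
import('./my-app.js');
```

The asynchronous equivalent using `register()` is shown below.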
```mjs import { register } from 'node:module'; @@ -581,18 +654,52 @@ register('http-to-https', pathToFileURL(__filename)); import('./my-app.js'); ``` +Customization hooks will run for any modules loaded later than the registration +and the modules they reference via `import` and the built-in `require`. +`require` function created by users using `module.createRequire()` can only be +customized by the synchronous hooks. + In this example, we are registering the `http-to-https` hooks, but they will -only be available for subsequently imported modules—in this case, `my-app.js` -and anything it references via `import` (and optionally `require`). If the -`import('./my-app.js')` had instead been a static `import './my-app.js'`, the +only be available for subsequently imported modules — in this case, `my-app.js` +and anything it references via `import` or built-in `require` in CommonJS dependencies. + +If the `import('./my-app.js')` had instead been a static `import './my-app.js'`, the app would have _already_ been loaded **before** the `http-to-https` hooks were registered. This due to the ES modules specification, where static imports are evaluated from the leaves of the tree first, then back to the trunk. There can be static imports _within_ `my-app.js`, which will not be evaluated until `my-app.js` is dynamically imported. -`my-app.js` can also be CommonJS. Customization hooks will run for any -modules that it references via `import` (and optionally `require`). +If synchronous hooks are used, both `import`, `require` and user `require` created +using `createRequire()` are supported. + +```mjs +import { registerHooks, createRequire } from 'node:module'; + +registerHooks({ /* implementation of synchronous hooks */ }); + +const require = createRequire(import.meta.url); + +// The synchronous hooks affect import, require() and user require() function +// created through createRequire(). +await import('./my-app.js'); +require('./my-app-2.js'); +``` + +```cjs +const { register, registerHooks } = require('node:module'); +const { pathToFileURL } = require('node:url'); + +registerHooks({ /* implementation of synchronous hooks */ }); + +const userRequire = createRequire(__filename); + +// The synchronous hooks affect import, require() and user require() function +// created through createRequire(). +import('./my-app.js'); +require('./my-app-2.js'); +userRequire('./my-app-3.js'); +``` Finally, if all you want to do is register hooks before your app runs and you don't want to create a separate file for that purpose, you can pass a `data:` @@ -642,9 +749,36 @@ earlier registered hooks transpile into JavaScript. The `register` method cannot be called from within the module that defines the hooks. +Chaining of `registerHooks` work similarly. If synchronous and asynchronous +hooks are mixed, the synchronous hooks are always run first before the asynchronous +hooks start running, that is, in the last synchronous hook being run, its next +hook includes invocation of the asynchronous hooks. + +```mjs +// entrypoint.mjs +import { registerHooks } from 'node:module'; + +const hook1 = { /* implementation of hooks */ }; +const hook2 = { /* implementation of hooks */ }; +// hook2 run before hook1. +registerHooks(hook1); +registerHooks(hook2); +``` + +```cjs +// entrypoint.cjs +const { registerHooks } = require('node:module'); + +const hook1 = { /* implementation of hooks */ }; +const hook2 = { /* implementation of hooks */ }; +// hook2 run before hook1. 
+registerHooks(hook1); +registerHooks(hook2); +``` + ### Communication with module customization hooks -Module customization hooks run on a dedicated thread, separate from the main +Asynchronous hooks run on a dedicated thread, separate from the main thread that runs application code. This means mutating global variables won't affect the other thread(s), and message channels must be used to communicate between the threads. @@ -693,8 +827,13 @@ register('./my-hooks.mjs', { }); ``` +Synchronous module hooks are run on the same thread where the application code is +run. They can directly mutate the globals of the context accessed by the main thread. + ### Hooks +#### Asynchronous hooks accepted by `module.register()` + The [`register`][] method can be used to register a module that exports a set of hooks. The hooks are functions that are called by Node.js to customize the module resolution and loading process. The exported functions must have specific @@ -714,6 +853,46 @@ export async function load(url, context, nextLoad) { } ``` +Asynchronous hooks are run in a separate thread, isolated from the main thread where +application code runs. That means it is a different [realm][]. The hooks thread +may be terminated by the main thread at any time, so do not depend on +asynchronous operations (like `console.log`) to complete. They are inherited into +child workers by default. + +#### Synchronous hooks accepted by `module.registerHooks()` + + + +> Stability: 1.1 - Active development + +The `module.registerHooks()` method accepts synchronous hook functions. +`initialize()` is not supported nor necessary, as the hook implementer +can simply run the initialization code directly before the call to +`module.registerHooks()`. + +```mjs +function resolve(specifier, context, nextResolve) { + // Take an `import` or `require` specifier and resolve it to a URL. +} + +function load(url, context, nextLoad) { + // Take a resolved URL and return the source code to be evaluated. +} +``` + +Synchronous hooks are run in the same thread and the same [realm][] where the modules +are loaded. Unlike the asynchronous hooks they are not inherited into child worker +threads by default, though if the hooks are registered using a file preloaded by +[`--import`][] or [`--require`][], child worker threads can inherit the preloaded scripts +via `process.execArgv` inheritance. See [the documentation of `Worker`][] for detail. + +In synchronous hooks, users can expect `console.log()` to complete in the same way that +they expect `console.log()` in module code to complete. + +#### Conventions of hooks + Hooks are part of a [chain][], even if that chain consists of only one custom (user-provided) hook and the default hook, which is always present. Hook functions nest: each one must always return a plain object, and chaining happens @@ -726,11 +905,6 @@ hook that returns without calling `next()` _and_ without returning prevent unintentional breaks in the chain. Return `shortCircuit: true` from a hook to signal that the chain is intentionally ending at your hook. -Hooks are run in a separate thread, isolated from the main thread where -application code runs. That means it is a different [realm][]. The hooks thread -may be terminated by the main thread at any time, so do not depend on -asynchronous operations (like `console.log`) to complete. 
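As an illustrative sketch (the `virtual:answer` specifier is hypothetical), a hook that
intentionally ends the chain returns without calling `next()` and sets
`shortCircuit: true`:

```mjs
import { registerHooks } from 'node:module';

registerHooks({
  resolve(specifier, context, nextResolve) {
    if (specifier === 'virtual:answer') {
      // Ending the chain on purpose: no call to nextResolve(), so shortCircuit
      // must be set to signal that this is intentional.
      return {
        url: 'data:text/javascript,export default 42;',
        shortCircuit: true,
      };
    }
    // Everything else falls through to the rest of the chain.
    return nextResolve(specifier, context);
  },
});
```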
- #### `initialize()` -> Stability: 1.2 - Release candidate +> Stability: 1.2 - Release candidate (asynchronous version) +> Stability: 1.1 - Active development (synchronous version) * `specifier` {string} * `context` {Object} @@ -848,7 +1030,9 @@ changes: Node.js default `resolve` hook after the last user-supplied `resolve` hook * `specifier` {string} * `context` {Object} -* Returns: {Object|Promise} +* Returns: {Object|Promise} The asynchronous version takes either an object containing the + following properties, or a `Promise` that will resolve to such an object. The + synchronous version only accepts an object returned synchronously. * `format` {string|null|undefined} A hint to the load hook (it might be ignored) `'builtin' | 'commonjs' | 'json' | 'module' | 'wasm'` @@ -858,8 +1042,9 @@ changes: terminate the chain of `resolve` hooks. **Default:** `false` * `url` {string} The absolute URL to which this input resolves -> **Warning** Despite support for returning promises and async functions, calls -> to `resolve` may block the main thread which can impact performance. +> **Warning** In the case of the asynchronous version, despite support for returning +> promises and async functions, calls to `resolve` may still block the main thread which +> can impact performance. The `resolve` hook chain is responsible for telling Node.js where to find and how to cache a given `import` statement or expression, or `require` call. It can @@ -874,8 +1059,8 @@ the internal module cache. The `resolve` hook is responsible for returning an `importAttributes` object if the module should be cached with different attributes than were present in the source code. -The `conditions` property in `context` is an array of conditions for -[package exports conditions][Conditional exports] that apply to this resolution +The `conditions` property in `context` is an array of conditions that will be used +to match [package exports conditions][Conditional exports] for this resolution request. They can be used for looking up conditional mappings elsewhere or to modify the list when calling the default resolution logic. @@ -885,7 +1070,11 @@ Node.js module specifier resolution behavior_ when calling `defaultResolve`, the `context.conditions` array passed to it _must_ include _all_ elements of the `context.conditions` array originally passed into the `resolve` hook. + + ```mjs +// Asynchronous version accepted by module.register(). export async function resolve(specifier, context, nextResolve) { const { parentURL = null } = context; @@ -915,10 +1104,21 @@ export async function resolve(specifier, context, nextResolve) { } ``` +```mjs +// Synchronous version accepted by module.registerHooks(). +function resolve(specifier, context, nextResolve) { + // Similar to the asynchronous resolve() above, since that one does not have + // any asynchronous logic. +} +``` + #### `load(url, context, nextLoad)` -> Stability: 1.2 - Release candidate +> Stability: 1.2 - Release candidate (asynchronous version) +> Stability: 1.1 - Active development (synchronous version) * `url` {string} The URL returned by the `resolve` chain * `context` {Object} @@ -943,7 +1144,9 @@ changes: Node.js default `load` hook after the last user-supplied `load` hook * `url` {string} * `context` {Object} -* Returns: {Object} +* Returns: {Object|Promise} The asynchronous version takes either an object containing the + following properties, or a `Promise` that will resolve to such an object. The + synchronous version only accepts an object returned synchronously. 
* `format` {string} * `shortCircuit` {undefined|boolean} A signal that this hook intends to terminate the chain of `load` hooks. **Default:** `false` @@ -966,7 +1169,10 @@ The final value of `format` must be one of the following: The value of `source` is ignored for type `'builtin'` because currently it is not possible to replace the value of a Node.js builtin (core) module. -Omitting vs providing a `source` for `'commonjs'` has very different effects: +##### Caveat in the asynchronous `load` hook + +When using the asynchronous `load` hook, omitting vs providing a `source` for +`'commonjs'` has very different effects: * When a `source` is provided, all `require` calls from this module will be processed by the ESM loader with registered `resolve` and `load` hooks; all @@ -980,7 +1186,12 @@ Omitting vs providing a `source` for `'commonjs'` has very different effects: registered hooks. This behavior for nullish `source` is temporary — in the future, nullish `source` will not be supported. -The Node.js internal `load` implementation, which is the value of `next` for the +These caveats do not apply to the synchronous `load` hook, in which case +the complete set of CommonJS APIs available to the customized CommonJS +modules, and `require`/`require.resolve` always go through the registered +hooks. + +The Node.js internal asynchronous `load` implementation, which is the value of `next` for the last hook in the `load` chain, returns `null` for `source` when `format` is `'commonjs'` for backward compatibility. Here is an example hook that would opt-in to using the non-default behavior: @@ -988,6 +1199,8 @@ opt-in to using the non-default behavior: ```mjs import { readFile } from 'node:fs/promises'; +// Asynchronous version accepted by module.register(). This fix is not needed +// for the synchronous version accepted by module.registerSync(). export async function load(url, context, nextLoad) { const result = await nextLoad(url, context); if (result.format === 'commonjs') { @@ -997,9 +1210,14 @@ export async function load(url, context, nextLoad) { } ``` -> **Warning**: The ESM `load` hook and namespaced exports from CommonJS modules -> are incompatible. Attempting to use them together will result in an empty -> object from the import. This may be addressed in the future. +This doesn't apply to the synchronous `load` hook either, in which case the +`source` returned contains source code loaded by the next hook, regardless +of module format. + +> **Warning**: The asynchronous `load` hook and namespaced exports from CommonJS +> modules are incompatible. Attempting to use them together will result in an empty +> object from the import. This may be addressed in the future. This does not apply +> to the synchronous `load` hook, in which case exports can be used as usual. > These types all correspond to classes defined in ECMAScript. @@ -1015,6 +1233,7 @@ reading files from disk. It could also be used to map an unrecognized format to a supported one, for example `yaml` to `module`. ```mjs +// Asynchronous version accepted by module.register(). export async function load(url, context, nextLoad) { const { format } = context; @@ -1038,6 +1257,14 @@ export async function load(url, context, nextLoad) { } ``` +```mjs +// Synchronous version accepted by module.registerHooks(). +function load(url, context, nextLoad) { + // Similar to the asynchronous load() above, since that one does not have + // any asynchronous logic. 
+} +``` + In a more advanced scenario, this can also be used to transform an unsupported source to a supported one (see [Examples](#examples) below). @@ -1096,6 +1323,10 @@ With the preceding hooks module, running prints the current version of CoffeeScript per the module at the URL in `main.mjs`. + + #### Transpilation Sources that are in formats Node.js doesn't understand can be converted into @@ -1104,6 +1335,8 @@ JavaScript using the [`load` hook][load hook]. This is less performant than transpiling source files before running Node.js; transpiler hooks should only be used for development and testing purposes. +##### Asynchronous version + ```mjs // coffeescript-hooks.mjs import { readFile } from 'node:fs/promises'; @@ -1169,6 +1402,57 @@ async function getPackageType(url) { } ``` +##### Synchronous version + +```mjs +// coffeescript-sync-hooks.mjs +import { readFileSync } from 'node:fs/promises'; +import { registerHooks } from 'node:module'; +import { dirname, extname, resolve as resolvePath } from 'node:path'; +import { cwd } from 'node:process'; +import { fileURLToPath, pathToFileURL } from 'node:url'; +import coffeescript from 'coffeescript'; + +const extensionsRegex = /\.(coffee|litcoffee|coffee\.md)$/; + +function load(url, context, nextLoad) { + if (extensionsRegex.test(url)) { + const format = getPackageType(url); + + const { source: rawSource } = nextLoad(url, { ...context, format }); + const transformedSource = coffeescript.compile(rawSource.toString(), url); + + return { + format, + shortCircuit: true, + source: transformedSource, + }; + } + + return nextLoad(url); +} + +function getPackageType(url) { + const isFilePath = !!extname(url); + const dir = isFilePath ? dirname(fileURLToPath(url)) : url; + const packagePath = resolvePath(dir, 'package.json'); + + let type; + try { + const filestring = readFileSync(packagePath, { encoding: 'utf8' }); + type = JSON.parse(filestring).type; + } catch (err) { + if (err?.code !== 'ENOENT') console.error(err); + } + if (type) return type; + return dir.length > 1 && getPackageType(resolvePath(dir, '..')); +} + +registerHooks({ load }); +``` + +#### Running hooks + ```coffee # main.coffee import { scream } from './scream.coffee' @@ -1183,8 +1467,9 @@ console.log "Brought to you by Node.js version #{version}" export scream = (str) -> str.toUpperCase() ``` -With the preceding hooks module, running +With the preceding hooks modules, running `node --import 'data:text/javascript,import { register } from "node:module"; import { pathToFileURL } from "node:url"; register(pathToFileURL("./coffeescript-hooks.mjs"));' ./main.coffee` +or `node --import ./coffeescript-sync-hooks.mjs ./main.coffee` causes `main.coffee` to be turned into JavaScript after its source code is loaded from disk but before Node.js executes it; and so on for any `.coffee`, `.litcoffee` or `.coffee.md` files referenced via `import` statements of any @@ -1197,6 +1482,8 @@ The previous two examples defined `load` hooks. This is an example of a which specifiers to override to other URLs (this is a very simplistic implementation of a small subset of the "import maps" specification). 
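For reference, a minimal `import-map.json` consumed by the hooks below could look like the
following (illustrative contents; the complete example files appear under "Using the
hooks"):

```json
{
  "imports": {
    "a-module": "./some-module.js"
  }
}
```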
+##### Asynchronous version + ```mjs // import-map-hooks.js import fs from 'node:fs/promises'; @@ -1212,6 +1499,28 @@ export async function resolve(specifier, context, nextResolve) { } ``` +##### Synchronous version + +```mjs +// import-map-sync-hooks.js +import fs from 'node:fs/promises'; +import module from 'node:module'; + +const { imports } = JSON.parse(fs.readFileSync('import-map.json', 'utf-8')); + +function resolve(specifier, context, nextResolve) { + if (Object.hasOwn(imports, specifier)) { + return nextResolve(imports[specifier], context); + } + + return nextResolve(specifier, context); +} + +module.registerHooks({ resolve }); +``` + +##### Using the hooks + With these files: ```mjs @@ -1234,6 +1543,7 @@ console.log('some module!'); ``` Running `node --import 'data:text/javascript,import { register } from "node:module"; import { pathToFileURL } from "node:url"; register(pathToFileURL("./import-map-hooks.js"));' main.js` +or `node --import ./import-map-sync-hooks.js main.js` should print `some module!`. ## Source map v3 support @@ -1285,21 +1595,6 @@ added: `path` is the resolved path for the file for which a corresponding source map should be fetched. -### `module.flushCompileCache()` - - - -> Stability: 1.1 - Active Development - -Flush the [module compile cache][] accumulated from modules already loaded -in the current Node.js instance to disk. This returns after all the flushing -file system operations come to an end, no matter they succeed or not. If there -are any errors, this will fail silently, since compile cache misses should not -interfere with the actual operation of the application. - ### Class: `module.SourceMap` @@ -1267,6 +1273,7 @@ This section was moved to [GLOBAL_FOLDERS]: #loading-from-the-global-folders [`"main"`]: packages.md#main [`"type"`]: packages.md#type +[`--trace-require-module`]: cli.md#--trace-require-modulemode [`ERR_REQUIRE_ASYNC_MODULE`]: errors.md#err_require_async_module [`ERR_UNSUPPORTED_DIR_IMPORT`]: errors.md#err_unsupported_dir_import [`MODULE_NOT_FOUND`]: errors.md#module_not_found diff --git a/doc/api/perf_hooks.md b/doc/api/perf_hooks.md index 308f2cfc698dac..58f4114b39a3de 100644 --- a/doc/api/perf_hooks.md +++ b/doc/api/perf_hooks.md @@ -17,7 +17,26 @@ Node.js supports the following [Web Performance APIs][]: * [User Timing][] * [Resource Timing][] -```js +```mjs +import { performance, PerformanceObserver } from 'node:perf_hooks'; + +const obs = new PerformanceObserver((items) => { + console.log(items.getEntries()[0].duration); + performance.clearMarks(); +}); +obs.observe({ type: 'measure' }); +performance.measure('Start to Now'); + +performance.mark('A'); +doSomeLongRunningProcess(() => { + performance.measure('A to Now', 'A'); + + performance.mark('B'); + performance.measure('A to B', 'A', 'B'); +}); +``` + +```cjs const { PerformanceObserver, performance } = require('node:perf_hooks'); const obs = new PerformanceObserver((items) => { @@ -138,7 +157,18 @@ loop has spent outside the event loop's event provider (e.g. `epoll_wait`). No other CPU idle time is taken into consideration. The following is an example of how a mostly idle process will have a high ELU. 
-```js +```mjs +import { eventLoopUtilization } from 'node:perf_hooks'; +import { spawnSync } from 'node:child_process'; + +setImmediate(() => { + const elu = eventLoopUtilization(); + spawnSync('sleep', ['5']); + console.log(eventLoopUtilization(elu).utilization); +}); +``` + +```cjs 'use strict'; const { eventLoopUtilization } = require('node:perf_hooks').performance; const { spawnSync } = require('node:child_process'); @@ -415,7 +445,29 @@ Wraps a function within a new function that measures the running time of the wrapped function. A `PerformanceObserver` must be subscribed to the `'function'` event type in order for the timing details to be accessed. -```js +```mjs +import { performance, PerformanceObserver } from 'node:perf_hooks'; + +function someFunction() { + console.log('hello world'); +} + +const wrapped = performance.timerify(someFunction); + +const obs = new PerformanceObserver((list) => { + console.log(list.getEntries()[0].duration); + + performance.clearMarks(); + performance.clearMeasures(); + obs.disconnect(); +}); +obs.observe({ entryTypes: ['function'] }); + +// A performance timeline entry will be created +wrapped(); +``` + +```cjs const { performance, PerformanceObserver, @@ -1258,7 +1310,22 @@ changes: `PerformanceObserver` objects provide notifications when new `PerformanceEntry` instances have been added to the Performance Timeline. -```js +```mjs +import { performance, PerformanceObserver } from 'node:perf_hooks'; + +const obs = new PerformanceObserver((list, observer) => { + console.log(list.getEntries()); + + performance.clearMarks(); + performance.clearMeasures(); + observer.disconnect(); +}); +obs.observe({ entryTypes: ['mark'], buffered: true }); + +performance.mark('test'); +``` + +```cjs const { performance, PerformanceObserver, @@ -1324,7 +1391,19 @@ Subscribes the {PerformanceObserver} instance to notifications of new {PerformanceEntry} instances identified either by `options.entryTypes` or `options.type`: -```js +```mjs +import { performance, PerformanceObserver } from 'node:perf_hooks'; + +const obs = new PerformanceObserver((list, observer) => { + // Called once asynchronously. `list` contains three items. +}); +obs.observe({ type: 'mark' }); + +for (let n = 0; n < 3; n++) + performance.mark(`test${n}`); +``` + +```cjs const { performance, PerformanceObserver, @@ -1368,7 +1447,41 @@ added: v8.5.0 Returns a list of `PerformanceEntry` objects in chronological order with respect to `performanceEntry.startTime`. -```js +```mjs +import { performance, PerformanceObserver } from 'node:perf_hooks'; + +const obs = new PerformanceObserver((perfObserverList, observer) => { + console.log(perfObserverList.getEntries()); + /** + * [ + * PerformanceEntry { + * name: 'test', + * entryType: 'mark', + * startTime: 81.465639, + * duration: 0, + * detail: null + * }, + * PerformanceEntry { + * name: 'meow', + * entryType: 'mark', + * startTime: 81.860064, + * duration: 0, + * detail: null + * } + * ] + */ + + performance.clearMarks(); + performance.clearMeasures(); + observer.disconnect(); +}); +obs.observe({ type: 'mark' }); + +performance.mark('test'); +performance.mark('meow'); +``` + +```cjs const { performance, PerformanceObserver, @@ -1420,7 +1533,49 @@ with respect to `performanceEntry.startTime` whose `performanceEntry.name` is equal to `name`, and optionally, whose `performanceEntry.entryType` is equal to `type`. 
-```js +```mjs +import { performance, PerformanceObserver } from 'node:perf_hooks'; + +const obs = new PerformanceObserver((perfObserverList, observer) => { + console.log(perfObserverList.getEntriesByName('meow')); + /** + * [ + * PerformanceEntry { + * name: 'meow', + * entryType: 'mark', + * startTime: 98.545991, + * duration: 0, + * detail: null + * } + * ] + */ + console.log(perfObserverList.getEntriesByName('nope')); // [] + + console.log(perfObserverList.getEntriesByName('test', 'mark')); + /** + * [ + * PerformanceEntry { + * name: 'test', + * entryType: 'mark', + * startTime: 63.518931, + * duration: 0, + * detail: null + * } + * ] + */ + console.log(perfObserverList.getEntriesByName('test', 'measure')); // [] + + performance.clearMarks(); + performance.clearMeasures(); + observer.disconnect(); +}); +obs.observe({ entryTypes: ['mark', 'measure'] }); + +performance.mark('test'); +performance.mark('meow'); +``` + +```cjs const { performance, PerformanceObserver, @@ -1478,7 +1633,40 @@ Returns a list of `PerformanceEntry` objects in chronological order with respect to `performanceEntry.startTime` whose `performanceEntry.entryType` is equal to `type`. -```js +```mjs +import { performance, PerformanceObserver } from 'node:perf_hooks'; + +const obs = new PerformanceObserver((perfObserverList, observer) => { + console.log(perfObserverList.getEntriesByType('mark')); + /** + * [ + * PerformanceEntry { + * name: 'test', + * entryType: 'mark', + * startTime: 55.897834, + * duration: 0, + * detail: null + * }, + * PerformanceEntry { + * name: 'meow', + * entryType: 'mark', + * startTime: 56.350146, + * duration: 0, + * detail: null + * } + * ] + */ + performance.clearMarks(); + performance.clearMeasures(); + observer.disconnect(); +}); +obs.observe({ type: 'mark' }); + +performance.mark('test'); +performance.mark('meow'); +``` + +```cjs const { performance, PerformanceObserver, @@ -1556,7 +1744,23 @@ event loop. That is, a delay in the loop will cause a delay in the execution of the timer, and those delays are specifically what this API is intended to detect. -```js +```mjs +import { monitorEventLoopDelay } from 'node:perf_hooks'; + +const h = monitorEventLoopDelay({ resolution: 20 }); +h.enable(); +// Do something. +h.disable(); +console.log(h.min); +console.log(h.max); +console.log(h.mean); +console.log(h.stddev); +console.log(h.percentiles); +console.log(h.percentile(50)); +console.log(h.percentile(99)); +``` + +```cjs const { monitorEventLoopDelay } = require('node:perf_hooks'); const h = monitorEventLoopDelay({ resolution: 20 }); h.enable(); @@ -1824,7 +2028,42 @@ The following example uses the [Async Hooks][] and Performance APIs to measure the actual duration of a Timeout operation (including the amount of time it took to execute the callback). 
-```js +```mjs +import { createHook } from 'node:async_hooks'; +import { performance, PerformanceObserver } from 'node:perf_hooks'; + +const set = new Set(); +const hook = createHook({ + init(id, type) { + if (type === 'Timeout') { + performance.mark(`Timeout-${id}-Init`); + set.add(id); + } + }, + destroy(id) { + if (set.has(id)) { + set.delete(id); + performance.mark(`Timeout-${id}-Destroy`); + performance.measure(`Timeout-${id}`, + `Timeout-${id}-Init`, + `Timeout-${id}-Destroy`); + } + }, +}); +hook.enable(); + +const obs = new PerformanceObserver((list, observer) => { + console.log(list.getEntries()[0]); + performance.clearMarks(); + performance.clearMeasures(); + observer.disconnect(); +}); +obs.observe({ entryTypes: ['measure'], buffered: true }); + +setTimeout(() => {}, 1000); +``` + +```cjs 'use strict'; const async_hooks = require('node:async_hooks'); const { @@ -1870,7 +2109,29 @@ dependencies: -```js +```mjs +import { performance, PerformanceObserver } from 'node:perf_hooks'; + +// Activate the observer +const obs = new PerformanceObserver((list) => { + const entries = list.getEntries(); + entries.forEach((entry) => { + console.log(`import('${entry[0]}')`, entry.duration); + }); + performance.clearMarks(); + performance.clearMeasures(); + obs.disconnect(); +}); +obs.observe({ entryTypes: ['function'], buffered: true }); + +const timedImport = performance.timerify(async (module) => { + return await import(module); +}); + +await timedImport('some-module'); +``` + +```cjs 'use strict'; const { performance, @@ -1906,7 +2167,28 @@ it means the time interval between starting the request and receiving the response, and for HTTP request, it means the time interval between receiving the request and sending the response: -```js +```mjs +import { PerformanceObserver } from 'node:perf_hooks'; +import { createServer, get } from 'node:http'; + +const obs = new PerformanceObserver((items) => { + items.getEntries().forEach((item) => { + console.log(item); + }); +}); + +obs.observe({ entryTypes: ['http'] }); + +const PORT = 8080; + +createServer((req, res) => { + res.end('ok'); +}).listen(PORT, () => { + get(`http://127.0.0.1:${PORT}`); +}); +``` + +```cjs 'use strict'; const { PerformanceObserver } = require('node:perf_hooks'); const http = require('node:http'); @@ -1930,7 +2212,25 @@ http.createServer((req, res) => { ### Measuring how long the `net.connect` (only for TCP) takes when the connection is successful -```js +```mjs +import { PerformanceObserver } from 'node:perf_hooks'; +import { connect, createServer } from 'node:net'; + +const obs = new PerformanceObserver((items) => { + items.getEntries().forEach((item) => { + console.log(item); + }); +}); +obs.observe({ entryTypes: ['net'] }); +const PORT = 8080; +createServer((socket) => { + socket.destroy(); +}).listen(PORT, () => { + connect(PORT); +}); +``` + +```cjs 'use strict'; const { PerformanceObserver } = require('node:perf_hooks'); const net = require('node:net'); @@ -1950,7 +2250,21 @@ net.createServer((socket) => { ### Measuring how long the DNS takes when the request is successful -```js +```mjs +import { PerformanceObserver } from 'node:perf_hooks'; +import { lookup, promises } from 'node:dns'; + +const obs = new PerformanceObserver((items) => { + items.getEntries().forEach((item) => { + console.log(item); + }); +}); +obs.observe({ entryTypes: ['dns'] }); +lookup('localhost', () => {}); +promises.resolve('localhost'); +``` + +```cjs 'use strict'; const { PerformanceObserver } = require('node:perf_hooks'); const dns = 
require('node:dns'); diff --git a/doc/api/permissions.md b/doc/api/permissions.md index a03285e28641e8..ea3ccc17b306b7 100644 --- a/doc/api/permissions.md +++ b/doc/api/permissions.md @@ -28,25 +28,25 @@ If you find a potential security vulnerability, please refer to our -> Stability: 1.1 - Active development +> Stability: 2 - Stable. The Node.js Permission Model is a mechanism for restricting access to specific resources during execution. -The API exists behind a flag [`--experimental-permission`][] which when enabled, +The API exists behind a flag [`--permission`][] which when enabled, will restrict access to all available permissions. -The available permissions are documented by the [`--experimental-permission`][] +The available permissions are documented by the [`--permission`][] flag. -When starting Node.js with `--experimental-permission`, +When starting Node.js with `--permission`, the ability to access the file system through the `fs` module, spawn processes, use `node:worker_threads`, use native addons, use WASI, and enable the runtime inspector will be restricted. ```console -$ node --experimental-permission index.js +$ node --permission index.js Error: Access to this API has been restricted at node:internal/main/run_main_module:23:47 { @@ -64,7 +64,7 @@ flag. For WASI, use the [`--allow-wasi`][] flag. #### Runtime API -When enabling the Permission Model through the [`--experimental-permission`][] +When enabling the Permission Model through the [`--permission`][] flag a new property `permission` is added to the `process` object. This property contains one function: @@ -90,10 +90,8 @@ To allow access to the file system, use the [`--allow-fs-read`][] and [`--allow-fs-write`][] flags: ```console -$ node --experimental-permission --allow-fs-read=* --allow-fs-write=* index.js +$ node --permission --allow-fs-read=* --allow-fs-write=* index.js Hello world! -(node:19836) ExperimentalWarning: Permission is an experimental feature -(Use `node --trace-warnings ...` to show where the warning was created) ``` The valid arguments for both flags are: @@ -147,6 +145,8 @@ There are constraints you need to know before using this system: flags that can be set via runtime through `v8.setFlagsFromString`. * OpenSSL engines cannot be requested at runtime when the Permission Model is enabled, affecting the built-in crypto, https, and tls modules. +* Run-Time Loadable Extensions cannot be loaded when the Permission Model is + enabled, affecting the sqlite module. * Using existing file descriptors via the `node:fs` module bypasses the Permission Model. @@ -165,5 +165,5 @@ There are constraints you need to know before using this system: [`--allow-fs-write`]: cli.md#--allow-fs-write [`--allow-wasi`]: cli.md#--allow-wasi [`--allow-worker`]: cli.md#--allow-worker -[`--experimental-permission`]: cli.md#--experimental-permission +[`--permission`]: cli.md#--permission [`permission.has()`]: process.md#processpermissionhasscope-reference diff --git a/doc/api/process.md b/doc/api/process.md index 5bf05b1d909860..379694b95d8472 100644 --- a/doc/api/process.md +++ b/doc/api/process.md @@ -3103,7 +3103,7 @@ added: v20.0.0 * {Object} -This API is available through the [`--experimental-permission`][] flag. +This API is available through the [`--permission`][] flag. `process.permission` is an object whose methods are used to manage permissions for the current process. 
Additional documentation is available in the @@ -4440,8 +4440,8 @@ cases: [`'exit'`]: #event-exit [`'message'`]: child_process.md#event-message [`'uncaughtException'`]: #event-uncaughtexception -[`--experimental-permission`]: cli.md#--experimental-permission [`--no-deprecation`]: cli.md#--no-deprecation +[`--permission`]: cli.md#--permission [`--unhandled-rejections`]: cli.md#--unhandled-rejectionsmode [`Buffer`]: buffer.md [`ChildProcess.disconnect()`]: child_process.md#subprocessdisconnect diff --git a/doc/api/readline.md b/doc/api/readline.md index bf0951fdd1b55c..cb1aa52605dc4a 100644 --- a/doc/api/readline.md +++ b/doc/api/readline.md @@ -703,9 +703,18 @@ added: v17.0.0 The `readlinePromises.createInterface()` method creates a new `readlinePromises.Interface` instance. -```js -const readlinePromises = require('node:readline/promises'); -const rl = readlinePromises.createInterface({ +```mjs +import { createInterface } from 'node:readline/promises'; +import { stdin, stdout } from 'node:process'; +const rl = createInterface({ + input: stdin, + output: stdout, +}); +``` + +```cjs +const { createInterface } = require('node:readline/promises'); +const rl = createInterface({ input: process.stdin, output: process.stdout, }); @@ -960,9 +969,18 @@ changes: The `readline.createInterface()` method creates a new `readline.Interface` instance. -```js -const readline = require('node:readline'); -const rl = readline.createInterface({ +```mjs +import { createInterface } from 'node:readline'; +import { stdin, stdout } from 'node:process'; +const rl = createInterface({ + input: stdin, + output: stdout, +}); +``` + +```cjs +const { createInterface } = require('node:readline'); +const rl = createInterface({ input: process.stdin, output: process.stdout, }); @@ -1098,9 +1116,36 @@ if (process.stdin.isTTY) The following example illustrates the use of `readline.Interface` class to implement a small command-line interface: -```js -const readline = require('node:readline'); -const rl = readline.createInterface({ +```mjs +import { createInterface } from 'node:readline'; +import { exit, stdin, stdout } from 'node:process'; +const rl = createInterface({ + input: stdin, + output: stdout, + prompt: 'OHAI> ', +}); + +rl.prompt(); + +rl.on('line', (line) => { + switch (line.trim()) { + case 'hello': + console.log('world!'); + break; + default: + console.log(`Say what? I might have heard '${line.trim()}'`); + break; + } + rl.prompt(); +}).on('close', () => { + console.log('Have a great day!'); + exit(0); +}); +``` + +```cjs +const { createInterface } = require('node:readline'); +const rl = createInterface({ input: process.stdin, output: process.stdout, prompt: 'OHAI> ', @@ -1130,14 +1175,37 @@ A common use case for `readline` is to consume an input file one line at a time. The easiest way to do so is leveraging the [`fs.ReadStream`][] API as well as a `for await...of` loop: -```js -const fs = require('node:fs'); -const readline = require('node:readline'); +```mjs +import { createReadStream } from 'node:fs'; +import { createInterface } from 'node:readline'; async function processLineByLine() { - const fileStream = fs.createReadStream('input.txt'); + const fileStream = createReadStream('input.txt'); - const rl = readline.createInterface({ + const rl = createInterface({ + input: fileStream, + crlfDelay: Infinity, + }); + // Note: we use the crlfDelay option to recognize all instances of CR LF + // ('\r\n') in input.txt as a single line break. 
+ + for await (const line of rl) { + // Each line in input.txt will be successively available here as `line`. + console.log(`Line from file: ${line}`); + } +} + +processLineByLine(); +``` + +```cjs +const { createReadStream } = require('node:fs'); +const { createInterface } = require('node:readline'); + +async function processLineByLine() { + const fileStream = createReadStream('input.txt'); + + const rl = createInterface({ input: fileStream, crlfDelay: Infinity, }); @@ -1155,12 +1223,26 @@ processLineByLine(); Alternatively, one could use the [`'line'`][] event: -```js -const fs = require('node:fs'); -const readline = require('node:readline'); +```mjs +import { createReadStream } from 'node:fs'; +import { createInterface } from 'node:readline'; -const rl = readline.createInterface({ - input: fs.createReadStream('sample.txt'), +const rl = createInterface({ + input: createReadStream('sample.txt'), + crlfDelay: Infinity, +}); + +rl.on('line', (line) => { + console.log(`Line from file: ${line}`); +}); +``` + +```cjs +const { createReadStream } = require('node:fs'); +const { createInterface } = require('node:readline'); + +const rl = createInterface({ + input: createReadStream('sample.txt'), crlfDelay: Infinity, }); @@ -1172,7 +1254,32 @@ rl.on('line', (line) => { Currently, `for await...of` loop can be a bit slower. If `async` / `await` flow and speed are both essential, a mixed approach can be applied: -```js +```mjs +import { once } from 'node:events'; +import { createReadStream } from 'node:fs'; +import { createInterface } from 'node:readline'; + +(async function processLineByLine() { + try { + const rl = createInterface({ + input: createReadStream('big-file.txt'), + crlfDelay: Infinity, + }); + + rl.on('line', (line) => { + // Process the line. + }); + + await once(rl, 'close'); + + console.log('File processed.'); + } catch (err) { + console.error(err); + } +})(); +``` + +```cjs const { once } = require('node:events'); const { createReadStream } = require('node:fs'); const { createInterface } = require('node:readline'); diff --git a/doc/api/repl.md b/doc/api/repl.md index 8d00cdeed3916a..a134c493a54812 100644 --- a/doc/api/repl.md +++ b/doc/api/repl.md @@ -10,7 +10,11 @@ The `node:repl` module provides a Read-Eval-Print-Loop (REPL) implementation that is available both as a standalone program or includible in other applications. It can be accessed using: -```js +```mjs +import repl from 'node:repl'; +``` + +```cjs const repl = require('node:repl'); ``` @@ -106,7 +110,14 @@ The default evaluator provides access to any variables that exist in the global scope. It is possible to expose a variable to the REPL explicitly by assigning it to the `context` object associated with each `REPLServer`: -```js +```mjs +import repl from 'node:repl'; +const msg = 'message'; + +repl.start('> ').context.m = msg; +``` + +```cjs const repl = require('node:repl'); const msg = 'message'; @@ -124,7 +135,19 @@ $ node repl_test.js Context properties are not read-only by default. To specify read-only globals, context properties must be defined using `Object.defineProperty()`: -```js +```mjs +import repl from 'node:repl'; +const msg = 'message'; + +const r = repl.start('> '); +Object.defineProperty(r.context, 'm', { + configurable: false, + enumerable: true, + value: msg, +}); +``` + +```cjs const repl = require('node:repl'); const msg = 'message'; @@ -280,20 +303,34 @@ When a new [`repl.REPLServer`][] is created, a custom evaluation function may be provided. 
This can be used, for instance, to implement fully customized REPL applications. -The following illustrates a hypothetical example of a REPL that performs -translation of text from one language to another: +The following illustrates an example of a REPL that squares a given number: -```js +```mjs +import repl from 'node:repl'; + +function byThePowerOfTwo(number) { + return number * number; +} + +function myEval(cmd, context, filename, callback) { + callback(null, byThePowerOfTwo(cmd)); +} + +repl.start({ prompt: 'Enter a number: ', eval: myEval }); +``` + +```cjs const repl = require('node:repl'); -const { Translator } = require('translator'); -const myTranslator = new Translator('en', 'fr'); +function byThePowerOfTwo(number) { + return number * number; +} function myEval(cmd, context, filename, callback) { - callback(null, myTranslator.translate(cmd)); + callback(null, byThePowerOfTwo(cmd)); } -repl.start({ prompt: '> ', eval: myEval }); +repl.start({ prompt: 'Enter a number: ', eval: myEval }); ``` #### Recoverable errors @@ -354,7 +391,21 @@ To fully customize the output of a [`repl.REPLServer`][] instance pass in a new function for the `writer` option on construction. The following example, for instance, simply converts any input text to upper case: -```js +```mjs +import repl from 'node:repl'; + +const r = repl.start({ prompt: '> ', eval: myEval, writer: myWriter }); + +function myEval(cmd, context, filename, callback) { + callback(null, cmd); +} + +function myWriter(output) { + return output.toUpperCase(); +} +``` + +```cjs const repl = require('node:repl'); const r = repl.start({ prompt: '> ', eval: myEval, writer: myWriter }); @@ -380,7 +431,16 @@ added: v0.1.91 Instances of `repl.REPLServer` are created using the [`repl.start()`][] method or directly using the JavaScript `new` keyword. -```js +```mjs +import repl from 'node:repl'; + +const options = { useColors: true }; + +const firstInstance = repl.start(options); +const secondInstance = new repl.REPLServer(options); +``` + +```cjs const repl = require('node:repl'); const options = { useColors: true }; @@ -424,7 +484,20 @@ reference to the `context` object as the only argument. This can be used primarily to re-initialize REPL context to some pre-defined state: -```js +```mjs +import repl from 'node:repl'; + +function initializeContext(context) { + context.m = 'test'; +} + +const r = repl.start({ prompt: '> ' }); +initializeContext(r.context); + +r.on('reset', initializeContext); +``` + +```cjs const repl = require('node:repl'); function initializeContext(context) { @@ -475,7 +548,25 @@ properties: The following example shows two new commands added to the REPL instance: -```js +```mjs +import repl from 'node:repl'; + +const replServer = repl.start({ prompt: '> ' }); +replServer.defineCommand('sayhello', { + help: 'Say hello', + action(name) { + this.clearBufferedCommand(); + console.log(`Hello, ${name}!`); + this.displayPrompt(); + }, +}); +replServer.defineCommand('saybye', function saybye() { + console.log('Goodbye!'); + this.close(); +}); +``` + +```cjs const repl = require('node:repl'); const replServer = repl.start({ prompt: '> ' }); @@ -637,7 +728,14 @@ The `repl.start()` method creates and starts a [`repl.REPLServer`][] instance. If `options` is a string, then it specifies the input prompt: -```js +```mjs +import repl from 'node:repl'; + +// a Unix style prompt +repl.start('$ '); +``` + +```cjs const repl = require('node:repl'); // a Unix style prompt @@ -709,7 +807,43 @@ separate I/O interfaces. 
The following example, for instance, provides separate REPLs on `stdin`, a Unix socket, and a TCP socket: -```js +```mjs +import net from 'node:net'; +import repl from 'node:repl'; +import process from 'node:process'; + +let connections = 0; + +repl.start({ + prompt: 'Node.js via stdin> ', + input: process.stdin, + output: process.stdout, +}); + +net.createServer((socket) => { + connections += 1; + repl.start({ + prompt: 'Node.js via Unix socket> ', + input: socket, + output: socket, + }).on('exit', () => { + socket.end(); + }); +}).listen('/tmp/node-repl-sock'); + +net.createServer((socket) => { + connections += 1; + repl.start({ + prompt: 'Node.js via TCP socket> ', + input: socket, + output: socket, + }).on('exit', () => { + socket.end(); + }); +}).listen(5001); +``` + +```cjs const net = require('node:net'); const repl = require('node:repl'); let connections = 0; diff --git a/doc/api/report.md b/doc/api/report.md index c74dc7e4c0880b..ad4e5418234e92 100644 --- a/doc/api/report.md +++ b/doc/api/report.md @@ -35,7 +35,7 @@ is provided below for reference. ```json { "header": { - "reportVersion": 4, + "reportVersion": 5, "event": "exception", "trigger": "Exception", "filename": "report.20181221.005011.8974.0.001.json", @@ -392,7 +392,7 @@ is provided below for reference. "soft": "", "hard": "unlimited" }, - "data_seg_size_kbytes": { + "data_seg_size_bytes": { "soft": "unlimited", "hard": "unlimited" }, @@ -404,7 +404,7 @@ is provided below for reference. "soft": "unlimited", "hard": 65536 }, - "max_memory_size_kbytes": { + "max_memory_size_bytes": { "soft": "unlimited", "hard": "unlimited" }, @@ -424,7 +424,7 @@ is provided below for reference. "soft": "unlimited", "hard": 4127290 }, - "virtual_memory_kbytes": { + "virtual_memory_bytes": { "soft": "unlimited", "hard": "unlimited" } @@ -588,6 +588,41 @@ Report version definitions are consistent across LTS releases. ### Version history +#### Version 5 + + + +Replace the keys `data_seg_size_kbytes`, `max_memory_size_kbytes`, and `virtual_memory_kbytes` +with `data_seg_size_bytes`, `max_memory_size_bytes`, and `virtual_memory_bytes` +respectively in the `userLimits` section, as these values are given in bytes. + +```json +{ + "userLimits": { + // Skip some keys ... + "data_seg_size_bytes": { // replacing data_seg_size_kbytes + "soft": "unlimited", + "hard": "unlimited" + }, + // ... + "max_memory_size_bytes": { // replacing max_memory_size_kbytes + "soft": "unlimited", + "hard": "unlimited" + }, + // ... + "virtual_memory_bytes": { // replacing virtual_memory_kbytes + "soft": "unlimited", + "hard": "unlimited" + } + } +} +``` + #### Version 4 + +* `path` {string} The path to the shared library to load. + +Loads a shared library into the database connection. This method is a wrapper +around [`sqlite3_load_extension()`][]. It is required to enable the +`allowExtension` option when constructing the `DatabaseSync` instance. + +### `database.enableLoadExtension(allow)` + + + +* `allow` {boolean} Whether to allow loading extensions. + +Enables or disables the `loadExtension` SQL function, and the `loadExtension()` +method. When `allowExtension` is `false` when constructing, you cannot enable +loading extensions for security reasons. + ### `database.exec(sql)` + +* `name` {string} The name of the SQLite function to create. +* `options` {Object} Optional configuration settings for the function. The + following properties are supported: + * `deterministic` {boolean} If `true`, the [`SQLITE_DETERMINISTIC`][] flag is + set on the created function. 
**Default:** `false`. + * `directOnly` {boolean} If `true`, the [`SQLITE_DIRECTONLY`][] flag is set on + the created function. **Default:** `false`. + * `useBigIntArguments` {boolean} If `true`, integer arguments to `function` + are converted to `BigInt`s. If `false`, integer arguments are passed as + JavaScript numbers. **Default:** `false`. + * `varargs` {boolean} If `true`, `function` can accept a variable number of + arguments. If `false`, `function` must be invoked with exactly + `function.length` arguments. **Default:** `false`. +* `function` {Function} The JavaScript function to call when the SQLite + function is invoked. + +This method is used to create SQLite user-defined functions. This method is a +wrapper around [`sqlite3_create_function_v2()`][]. + ### `database.open()` + +* {Object} + +An object containing commonly used constants for SQLite operations. -The following constants are exported by the `node:sqlite` module. +### SQLite constants -### SQLite Session constants +The following constants are exported by the `sqlite.constants` object. #### Conflict-resolution constants @@ -444,7 +505,7 @@ The following constants are meant for use with [`database.applyChangeset()`](#da SQLITE_CHANGESET_ABORT - Abort when a change encounters a conflict and roll back databsase. + Abort when a change encounters a conflict and roll back database. @@ -452,11 +513,15 @@ The following constants are meant for use with [`database.applyChangeset()`](#da [SQL injection]: https://en.wikipedia.org/wiki/SQL_injection [`ATTACH DATABASE`]: https://www.sqlite.org/lang_attach.html [`PRAGMA foreign_keys`]: https://www.sqlite.org/pragma.html#pragma_foreign_keys +[`SQLITE_DETERMINISTIC`]: https://www.sqlite.org/c3ref/c_deterministic.html +[`SQLITE_DIRECTONLY`]: https://www.sqlite.org/c3ref/c_deterministic.html [`sqlite3_changes64()`]: https://www.sqlite.org/c3ref/changes.html [`sqlite3_close_v2()`]: https://www.sqlite.org/c3ref/close.html +[`sqlite3_create_function_v2()`]: https://www.sqlite.org/c3ref/create_function.html [`sqlite3_exec()`]: https://www.sqlite.org/c3ref/exec.html [`sqlite3_expanded_sql()`]: https://www.sqlite.org/c3ref/expanded_sql.html [`sqlite3_last_insert_rowid()`]: https://www.sqlite.org/c3ref/last_insert_rowid.html +[`sqlite3_load_extension()`]: https://www.sqlite.org/c3ref/load_extension.html [`sqlite3_prepare_v2()`]: https://www.sqlite.org/c3ref/prepare.html [`sqlite3_sql()`]: https://www.sqlite.org/c3ref/expanded_sql.html [`sqlite3changeset_apply()`]: https://www.sqlite.org/session/sqlite3changeset_apply.html diff --git a/doc/api/test.md b/doc/api/test.md index 4acc6b8c76005a..2d2245083d1ac4 100644 --- a/doc/api/test.md +++ b/doc/api/test.md @@ -476,8 +476,10 @@ all tests have completed. If the [`NODE_V8_COVERAGE`][] environment variable is used to specify a code coverage directory, the generated V8 coverage files are written to that directory. Node.js core modules and files within `node_modules/` directories are, by default, not included in the coverage report. -However, they can be explicitly included via the [`--test-coverage-include`][] flag. If -coverage is enabled, the coverage report is sent to any [test reporters][] via +However, they can be explicitly included via the [`--test-coverage-include`][] flag. +By default all the matching test files are excluded from the coverage report. +Exclusions can be overridden by using the [`--test-coverage-exclude`][] flag. +If coverage is enabled, the coverage report is sent to any [test reporters][] via the `'test:coverage'` event. 
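For example, an illustrative invocation (the glob patterns are placeholders) that pulls a
dependency back into the report while keeping the test files themselves excluded:

```bash
node --test --experimental-test-coverage \
  --test-coverage-include='node_modules/some-dep/**' \
  --test-coverage-exclude='test/**'
```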
Coverage can be disabled on a series of lines using the following @@ -3592,6 +3594,7 @@ Can be used to abort test subtasks when the test has been aborted. [`--experimental-test-module-mocks`]: cli.md#--experimental-test-module-mocks [`--import`]: cli.md#--importmodule [`--test-concurrency`]: cli.md#--test-concurrency +[`--test-coverage-exclude`]: cli.md#--test-coverage-exclude [`--test-coverage-include`]: cli.md#--test-coverage-include [`--test-name-pattern`]: cli.md#--test-name-pattern [`--test-only`]: cli.md#--test-only diff --git a/doc/api/tls.md b/doc/api/tls.md index 3436ba6697950f..8c52daf294bc1f 100644 --- a/doc/api/tls.md +++ b/doc/api/tls.md @@ -10,7 +10,11 @@ The `node:tls` module provides an implementation of the Transport Layer Security (TLS) and Secure Socket Layer (SSL) protocols that is built on top of OpenSSL. The module can be accessed using: -```js +```mjs +import tls from 'node:tls'; +``` + +```cjs const tls = require('node:tls'); ``` @@ -461,17 +465,31 @@ To adjust the security level in your Node.js application, you can include `@SECL within a cipher string, where `X` is the desired security level. For example, to set the security level to 0 while using the default OpenSSL cipher list, you could use: -```js -const tls = require('node:tls'); +```mjs +import { createServer, connect } from 'node:tls'; +const port = 443; + +createServer({ ciphers: 'DEFAULT@SECLEVEL=0', minVersion: 'TLSv1' }, function(socket) { + console.log('Client connected with protocol:', socket.getProtocol()); + socket.end(); + this.close(); +}) +.listen(port, () => { + connect(port, { ciphers: 'DEFAULT@SECLEVEL=0', maxVersion: 'TLSv1' }); +}); +``` + +```cjs +const { createServer, connect } = require('node:tls'); const port = 443; -tls.createServer({ ciphers: 'DEFAULT@SECLEVEL=0', minVersion: 'TLSv1' }, function(socket) { +createServer({ ciphers: 'DEFAULT@SECLEVEL=0', minVersion: 'TLSv1' }, function(socket) { console.log('Client connected with protocol:', socket.getProtocol()); socket.end(); this.close(); }) .listen(port, () => { - tls.connect(port, { ciphers: 'DEFAULT@SECLEVEL=0', maxVersion: 'TLSv1' }); + connect(port, { ciphers: 'DEFAULT@SECLEVEL=0', maxVersion: 'TLSv1' }); }); ``` @@ -1785,24 +1803,57 @@ to `host`. The following illustrates a client for the echo server example from [`tls.createServer()`][]: -```js +```mjs // Assumes an echo server that is listening on port 8000. -const tls = require('node:tls'); -const fs = require('node:fs'); +import { connect } from 'node:tls'; +import { readFileSync } from 'node:fs'; +import { stdin } from 'node:process'; + +const options = { + // Necessary only if the server requires client certificate authentication. + key: readFileSync('client-key.pem'), + cert: readFileSync('client-cert.pem'), + + // Necessary only if the server uses a self-signed certificate. + ca: [ readFileSync('server-cert.pem') ], + + // Necessary only if the server's cert isn't for "localhost". + checkServerIdentity: () => { return null; }, +}; + +const socket = connect(8000, options, () => { + console.log('client connected', + socket.authorized ? 'authorized' : 'unauthorized'); + stdin.pipe(socket); + stdin.resume(); +}); +socket.setEncoding('utf8'); +socket.on('data', (data) => { + console.log(data); +}); +socket.on('end', () => { + console.log('server ends connection'); +}); +``` + +```cjs +// Assumes an echo server that is listening on port 8000. 
+const { connect } = require('node:tls'); +const { readFileSync } = require('node:fs'); const options = { // Necessary only if the server requires client certificate authentication. - key: fs.readFileSync('client-key.pem'), - cert: fs.readFileSync('client-cert.pem'), + key: readFileSync('client-key.pem'), + cert: readFileSync('client-cert.pem'), // Necessary only if the server uses a self-signed certificate. - ca: [ fs.readFileSync('server-cert.pem') ], + ca: [ readFileSync('server-cert.pem') ], // Necessary only if the server's cert isn't for "localhost". checkServerIdentity: () => { return null; }, }; -const socket = tls.connect(8000, options, () => { +const socket = connect(8000, options, () => { console.log('client connected', socket.authorized ? 'authorized' : 'unauthorized'); process.stdin.pipe(socket); @@ -1817,6 +1868,20 @@ socket.on('end', () => { }); ``` +To generate the certificate and key for this example, run: + +```bash +openssl req -x509 -newkey rsa:2048 -nodes -sha256 -subj '/CN=localhost' \ + -keyout client-key.pem -out client-cert.pem +``` + +Then, to generate the `server-cert.pem` certificate for this example, run: + +```bash +openssl pkcs12 -certpbe AES-256-CBC -export -out server-cert.pem \ + -inkey client-key.pem -in client-cert.pem +``` + ## `tls.connect(path[, options][, callback])`