From 7d06f26bd278badc61537ed945b3fedf161ffb5b Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Thu, 12 Dec 2024 20:06:04 +0530
Subject: [PATCH] Revert "Enable Displaying lunch menu for A15"

Enable lunch target trunk_staging for GAS EB.

This reverts commit 94f755403d60ee8ffd08df719a9ecce0f51c4f14.

Tests: EB build is successful and the image boots.
Tracked-On: OAM-128506
Signed-off-by: Ankit Agrawal
---
 ...memory-functions-with-avx2-instructi.patch | 3779 ---------------
 ...string-functions-with-avx-implementa.patch | 4169 -----------------
 ...-avx2-implementation-for-memmove-api.patch |  645 ---
 .../0006-Obtain-x86-cache-info-from-CPU.patch |  594 ---
 vendorsetup.sh                                |    9 +-
 5 files changed, 2 insertions(+), 9194 deletions(-)
 delete mode 100644 aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch
 delete mode 100644 aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch
 delete mode 100644 aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch
 delete mode 100644 aosp_diff/preliminary/bionic/0006-Obtain-x86-cache-info-from-CPU.patch

diff --git a/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch b/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch
deleted file mode 100644
index db61807bff..0000000000
--- a/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch
+++ /dev/null
@@ -1,3779 +0,0 @@
-From 013b505284379453df6637f009a224f6d5c6f3bd Mon Sep 17 00:00:00 2001
-From: "Reddy, Alavala Srinivasa"
-Date: Wed, 13 Sep 2023 18:36:21 +0530
-Subject: [PATCH 3/5] Optimize bionic memory functions with avx2 instructions
-
-Following memory related functions are optimized with
-avx2 implementation ported from glibc 2.20
-(only for 64-bit)
- - memchr
- - memcmp
- - memrchr
-
-Test done: Build and boot is fine, Run the benchmarks suite.
- -Change-Id: I956773c79b9bcebee69726820eaa74c709df7081 -Signed-off-by: ahs -Signed-off-by: Ravi Kumar Soni ---- - libc/Android.bp | 36 +- - .../kabylake/string/avx2-memcpy-kbl.S | 2052 +++++++++++++++++ - .../arch-x86_64/dynamic_function_dispatch.cpp | 38 + - libc/arch-x86_64/generic/string/memchr.c | 20 + - libc/arch-x86_64/generic/string/memrchr.c | 20 + - libc/arch-x86_64/generic/string/wmemset.c | 20 + - libc/arch-x86_64/{string => include}/cache.h | 0 - .../kabylake/string/avx2-memchr-kbl.S | 371 +++ - .../kabylake/string/avx2-memcmp-kbl.S | 428 ++++ - .../kabylake/string/avx2-memrchr-kbl.S | 408 ++++ - .../kabylake/string/avx2-wmemset-kbl.S | 140 ++ - .../string/sse2-memmove-slm.S | 4 +- - .../{ => silvermont}/string/sse2-memset-slm.S | 0 - .../{ => silvermont}/string/sse2-stpcpy-slm.S | 0 - .../string/sse2-stpncpy-slm.S | 0 - .../{ => silvermont}/string/sse2-strcat-slm.S | 0 - .../{ => silvermont}/string/sse2-strcpy-slm.S | 0 - .../{ => silvermont}/string/sse2-strlen-slm.S | 0 - .../string/sse2-strncat-slm.S | 0 - .../string/sse2-strncpy-slm.S | 0 - .../{ => silvermont}/string/sse4-memcmp-slm.S | 2 +- - .../string/ssse3-strcmp-slm.S | 0 - .../string/ssse3-strncmp-slm.S | 0 - libc/arch-x86_64/static_function_dispatch.S | 6 + - 24 files changed, 3528 insertions(+), 17 deletions(-) - create mode 100644 libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S - create mode 100644 libc/arch-x86_64/generic/string/memchr.c - create mode 100644 libc/arch-x86_64/generic/string/memrchr.c - create mode 100644 libc/arch-x86_64/generic/string/wmemset.c - rename libc/arch-x86_64/{string => include}/cache.h (100%) - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S - rename libc/arch-x86_64/{ => silvermont}/string/sse2-memmove-slm.S (99%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-memset-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-stpcpy-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-stpncpy-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-strcat-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-strcpy-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-strlen-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-strncat-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-strncpy-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse4-memcmp-slm.S (99%) - rename libc/arch-x86_64/{ => silvermont}/string/ssse3-strcmp-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/ssse3-strncmp-slm.S (100%) - -diff --git a/libc/Android.bp b/libc/Android.bp -index 943d41fba..530ce9111 100644 ---- a/libc/Android.bp -+++ b/libc/Android.bp -@@ -617,8 +617,6 @@ cc_library_static { - }, - x86_64: { - srcs: [ -- "upstream-openbsd/lib/libc/string/memchr.c", -- "upstream-openbsd/lib/libc/string/memrchr.c", - "upstream-openbsd/lib/libc/string/strlcat.c", - "upstream-openbsd/lib/libc/string/strlcpy.c", - ], -@@ -1187,6 +1185,7 @@ cc_library_static { - ], - }, - x86_64: { -+ include_dirs: ["bionic/libc/arch-x86_64/include"], - srcs: [ - "arch-x86_64/bionic/__bionic_clone.S", - "arch-x86_64/bionic/_exit_with_stack_teardown.S", -@@ -1196,18 +1195,27 @@ cc_library_static { - "arch-x86_64/bionic/vfork.S", - - 
"arch-x86_64/string/avx2-memset-kbl.S", -- "arch-x86_64/string/sse2-memmove-slm.S", -- "arch-x86_64/string/sse2-memset-slm.S", -- "arch-x86_64/string/sse2-stpcpy-slm.S", -- "arch-x86_64/string/sse2-stpncpy-slm.S", -- "arch-x86_64/string/sse2-strcat-slm.S", -- "arch-x86_64/string/sse2-strcpy-slm.S", -- "arch-x86_64/string/sse2-strlen-slm.S", -- "arch-x86_64/string/sse2-strncat-slm.S", -- "arch-x86_64/string/sse2-strncpy-slm.S", -- "arch-x86_64/string/sse4-memcmp-slm.S", -- "arch-x86_64/string/ssse3-strcmp-slm.S", -- "arch-x86_64/string/ssse3-strncmp-slm.S", -+ "arch-x86_64/silvermont/string/sse2-memmove-slm.S", -+ "arch-x86_64/silvermont/string/sse2-memset-slm.S", -+ "arch-x86_64/silvermont/string/sse2-stpcpy-slm.S", -+ "arch-x86_64/silvermont/string/sse2-stpncpy-slm.S", -+ "arch-x86_64/silvermont/string/sse2-strcat-slm.S", -+ "arch-x86_64/silvermont/string/sse2-strcpy-slm.S", -+ "arch-x86_64/silvermont/string/sse2-strlen-slm.S", -+ "arch-x86_64/silvermont/string/sse2-strncat-slm.S", -+ "arch-x86_64/silvermont/string/sse2-strncpy-slm.S", -+ "arch-x86_64/silvermont/string/sse4-memcmp-slm.S", -+ "arch-x86_64/silvermont/string/ssse3-strcmp-slm.S", -+ "arch-x86_64/silvermont/string/ssse3-strncmp-slm.S", -+ -+ //"arch-x86_64/generic/string/wmemset.c" -+ "arch-x86_64/generic/string/memchr.c", -+ "arch-x86_64/generic/string/memrchr.c", -+ -+ //"arch-x86_64/kabylake/string/avx2-wmemset-kbl.S" -+ "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", - - "bionic/strchr.cpp", - "bionic/strchrnul.cpp", -diff --git a/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S b/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S -new file mode 100644 -index 000000000..69fca7cf1 ---- /dev/null -+++ b/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S -@@ -0,0 +1,2052 @@ -+#define ENTRY(f) \ -+ .text; \ -+ .globl f; \ -+ .p2align 4, 0x90; \ -+ .type f,@function; \ -+ f: \ -+ -+#define END(f) -+ .size f, .-f; \ -+ .section .rodata,"a",@progbits; \ -+ .p2align 2 \ -+ -+ENTRY(memcpy_avx2) -+# %bb.0: -+ pushl %ebp -+ pushl %ebx -+ pushl %edi -+ pushl %esi -+ movl 28(%esp), %ebx -+ movl 24(%esp), %ecx -+ movl 20(%esp), %eax -+ calll .L0$pb -+.L0$pb: -+ popl %esi -+.Ltmp0: -+ addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %esi -+ cmpl $256, %ebx # imm = 0x100 -+ ja .LBB0_251 -+# %bb.1: -+ leal -1(%ebx), %edi -+ cmpl $255, %edi -+ ja .LBB0_270 -+# %bb.2: -+ addl .LJTI0_1@GOTOFF(%esi,%edi,4), %esi -+ leal (%eax,%ebx), %edx -+ addl %ebx, %ecx -+ jmpl *%esi -+.LBB0_251: -+ movl %eax, %ebp -+ vmovups (%ecx), %ymm0 -+ movl %ebx, %edi -+ negl %ebp -+ andl $31, %ebp -+ subl %ebp, %edi -+ addl %ebp, %ecx -+ leal (%eax,%ebp), %edx -+ cmpl $2097152, %edi # imm = 0x200000 -+ vmovups %ymm0, (%eax) -+ ja .LBB0_256 -+# %bb.252: -+ cmpl $256, %edi # imm = 0x100 -+ jb .LBB0_260 -+# %bb.253: -+ subl %ebp, %ebx -+ .p2align 4, 0x90 -+.LBB0_254: # =>This Inner Loop Header: Depth=1 -+ vmovups (%ecx), %ymm0 -+ vmovups 32(%ecx), %ymm1 -+ vmovups 64(%ecx), %ymm2 -+ vmovups 96(%ecx), %ymm3 -+ vmovups 128(%ecx), %ymm4 -+ vmovups 160(%ecx), %ymm5 -+ vmovups 192(%ecx), %ymm6 -+ vmovups 224(%ecx), %ymm7 -+ prefetchnta 512(%ecx) -+ addl $-256, %edi -+ addl $256, %ecx # imm = 0x100 -+ vmovups %ymm0, (%edx) -+ vmovups %ymm1, 32(%edx) -+ vmovups %ymm2, 64(%edx) -+ vmovups %ymm3, 96(%edx) -+ vmovups %ymm4, 128(%edx) -+ vmovups %ymm5, 160(%edx) -+ vmovups %ymm6, 192(%edx) -+ vmovups %ymm7, 224(%edx) -+ addl $256, %edx # imm = 0x100 -+ cmpl $255, %edi -+ ja .LBB0_254 
-+# %bb.255: -+ movzbl %bl, %edi -+ leal -1(%edi), %ebx -+ cmpl $255, %ebx -+ jbe .LBB0_261 -+ jmp .LBB0_270 -+.LBB0_256: -+ prefetchnta (%ecx) -+ subl %ebp, %ebx -+ testb $31, %cl -+ je .LBB0_257 -+ .p2align 4, 0x90 -+.LBB0_258: # =>This Inner Loop Header: Depth=1 -+ vmovups (%ecx), %ymm0 -+ vmovups 32(%ecx), %ymm1 -+ vmovups 64(%ecx), %ymm2 -+ vmovups 96(%ecx), %ymm3 -+ vmovups 128(%ecx), %ymm4 -+ vmovups 160(%ecx), %ymm5 -+ vmovups 192(%ecx), %ymm6 -+ vmovups 224(%ecx), %ymm7 -+ prefetchnta 512(%ecx) -+ addl $-256, %edi -+ addl $256, %ecx # imm = 0x100 -+ vmovntps %ymm0, (%edx) -+ vmovntps %ymm1, 32(%edx) -+ vmovntps %ymm2, 64(%edx) -+ vmovntps %ymm3, 96(%edx) -+ vmovntps %ymm4, 128(%edx) -+ vmovntps %ymm5, 160(%edx) -+ vmovntps %ymm6, 192(%edx) -+ vmovntps %ymm7, 224(%edx) -+ addl $256, %edx # imm = 0x100 -+ cmpl $255, %edi -+ ja .LBB0_258 -+ jmp .LBB0_259 -+ .p2align 4, 0x90 -+.LBB0_257: # =>This Inner Loop Header: Depth=1 -+ vmovaps (%ecx), %ymm0 -+ vmovaps 32(%ecx), %ymm1 -+ vmovaps 64(%ecx), %ymm2 -+ vmovaps 96(%ecx), %ymm3 -+ vmovaps 128(%ecx), %ymm4 -+ vmovaps 160(%ecx), %ymm5 -+ vmovaps 192(%ecx), %ymm6 -+ vmovaps 224(%ecx), %ymm7 -+ prefetchnta 512(%ecx) -+ addl $-256, %edi -+ addl $256, %ecx # imm = 0x100 -+ vmovntps %ymm0, (%edx) -+ vmovntps %ymm1, 32(%edx) -+ vmovntps %ymm2, 64(%edx) -+ vmovntps %ymm3, 96(%edx) -+ vmovntps %ymm4, 128(%edx) -+ vmovntps %ymm5, 160(%edx) -+ vmovntps %ymm6, 192(%edx) -+ vmovntps %ymm7, 224(%edx) -+ addl $256, %edx # imm = 0x100 -+ cmpl $255, %edi -+ ja .LBB0_257 -+.LBB0_259: -+ sfence -+ movzbl %bl, %edi -+.LBB0_260: -+ leal -1(%edi), %ebx -+ cmpl $255, %ebx -+ ja .LBB0_270 -+.LBB0_261: -+ addl .LJTI0_0@GOTOFF(%esi,%ebx,4), %esi -+ addl %edi, %edx -+ addl %edi, %ecx -+ jmpl *%esi -+.LBB0_11: -+ vmovups -131(%ecx), %ymm0 -+ vmovups %ymm0, -131(%edx) -+ vmovups -99(%ecx), %ymm0 -+ vmovups %ymm0, -99(%edx) -+ vmovups -67(%ecx), %ymm0 -+ vmovups %ymm0, -67(%edx) -+ vmovups -35(%ecx), %ymm0 -+ vmovups %ymm0, -35(%edx) -+.LBB0_12: -+ movzwl -3(%ecx), %esi -+ movw %si, -3(%edx) -+ jmp .LBB0_6 -+.LBB0_17: -+ vmovups -133(%ecx), %ymm0 -+ vmovups %ymm0, -133(%edx) -+ vmovups -101(%ecx), %ymm0 -+ vmovups %ymm0, -101(%edx) -+ vmovups -69(%ecx), %ymm0 -+ vmovups %ymm0, -69(%edx) -+ vmovups -37(%ecx), %ymm0 -+ vmovups %ymm0, -37(%edx) -+.LBB0_18: -+ movl -5(%ecx), %esi -+ movl %esi, -5(%edx) -+ jmp .LBB0_6 -+.LBB0_19: -+ vmovups -134(%ecx), %ymm0 -+ vmovups %ymm0, -134(%edx) -+ vmovups -102(%ecx), %ymm0 -+ vmovups %ymm0, -102(%edx) -+ vmovups -70(%ecx), %ymm0 -+ vmovups %ymm0, -70(%edx) -+ vmovups -38(%ecx), %ymm0 -+ vmovups %ymm0, -38(%edx) -+.LBB0_20: -+ movl -6(%ecx), %esi -+ movl %esi, -6(%edx) -+ jmp .LBB0_10 -+.LBB0_21: -+ vmovups -135(%ecx), %ymm0 -+ vmovups %ymm0, -135(%edx) -+ vmovups -103(%ecx), %ymm0 -+ vmovups %ymm0, -103(%edx) -+ vmovups -71(%ecx), %ymm0 -+ vmovups %ymm0, -71(%edx) -+ vmovups -39(%ecx), %ymm0 -+ vmovups %ymm0, -39(%edx) -+.LBB0_22: -+ movl -7(%ecx), %esi -+ movl %esi, -7(%edx) -+ jmp .LBB0_16 -+.LBB0_27: -+ vmovups -137(%ecx), %ymm0 -+ vmovups %ymm0, -137(%edx) -+ vmovups -105(%ecx), %ymm0 -+ vmovups %ymm0, -105(%edx) -+ vmovups -73(%ecx), %ymm0 -+ vmovups %ymm0, -73(%edx) -+ vmovups -41(%ecx), %ymm0 -+ vmovups %ymm0, -41(%edx) -+.LBB0_28: -+ vmovsd -9(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -9(%edx) -+ jmp .LBB0_6 -+.LBB0_29: -+ vmovups -138(%ecx), %ymm0 -+ vmovups %ymm0, -138(%edx) -+ vmovups -106(%ecx), %ymm0 -+ vmovups %ymm0, -106(%edx) -+ vmovups -74(%ecx), %ymm0 -+ vmovups %ymm0, -74(%edx) -+ vmovups -42(%ecx), 
%ymm0 -+ vmovups %ymm0, -42(%edx) -+.LBB0_30: -+ vmovsd -10(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -10(%edx) -+ jmp .LBB0_10 -+.LBB0_31: -+ vmovups -139(%ecx), %ymm0 -+ vmovups %ymm0, -139(%edx) -+ vmovups -107(%ecx), %ymm0 -+ vmovups %ymm0, -107(%edx) -+ vmovups -75(%ecx), %ymm0 -+ vmovups %ymm0, -75(%edx) -+ vmovups -43(%ecx), %ymm0 -+ vmovups %ymm0, -43(%edx) -+.LBB0_32: -+ vmovsd -11(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -11(%edx) -+ jmp .LBB0_16 -+.LBB0_33: -+ vmovups -140(%ecx), %ymm0 -+ vmovups %ymm0, -140(%edx) -+ vmovups -108(%ecx), %ymm0 -+ vmovups %ymm0, -108(%edx) -+ vmovups -76(%ecx), %ymm0 -+ vmovups %ymm0, -76(%edx) -+ vmovups -44(%ecx), %ymm0 -+ vmovups %ymm0, -44(%edx) -+.LBB0_34: -+ vmovsd -12(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -12(%edx) -+ jmp .LBB0_16 -+.LBB0_35: -+ vmovups -141(%ecx), %ymm0 -+ vmovups %ymm0, -141(%edx) -+ vmovups -109(%ecx), %ymm0 -+ vmovups %ymm0, -109(%edx) -+ vmovups -77(%ecx), %ymm0 -+ vmovups %ymm0, -77(%edx) -+ vmovups -45(%ecx), %ymm0 -+ vmovups %ymm0, -45(%edx) -+.LBB0_36: -+ vmovsd -13(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -13(%edx) -+ jmp .LBB0_26 -+.LBB0_37: -+ vmovups -142(%ecx), %ymm0 -+ vmovups %ymm0, -142(%edx) -+ vmovups -110(%ecx), %ymm0 -+ vmovups %ymm0, -110(%edx) -+ vmovups -78(%ecx), %ymm0 -+ vmovups %ymm0, -78(%edx) -+ vmovups -46(%ecx), %ymm0 -+ vmovups %ymm0, -46(%edx) -+.LBB0_38: -+ vmovsd -14(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -14(%edx) -+ jmp .LBB0_26 -+.LBB0_39: -+ vmovups -143(%ecx), %ymm0 -+ vmovups %ymm0, -143(%edx) -+ vmovups -111(%ecx), %ymm0 -+ vmovups %ymm0, -111(%edx) -+ vmovups -79(%ecx), %ymm0 -+ vmovups %ymm0, -79(%edx) -+ vmovups -47(%ecx), %ymm0 -+ vmovups %ymm0, -47(%edx) -+.LBB0_40: -+ vmovsd -15(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -15(%edx) -+ jmp .LBB0_26 -+.LBB0_45: -+ vmovups -145(%ecx), %ymm0 -+ vmovups %ymm0, -145(%edx) -+ vmovups -113(%ecx), %ymm0 -+ vmovups %ymm0, -113(%edx) -+ vmovups -81(%ecx), %ymm0 -+ vmovups %ymm0, -81(%edx) -+ vmovups -49(%ecx), %ymm0 -+ vmovups %ymm0, -49(%edx) -+.LBB0_46: -+ vmovups -17(%ecx), %xmm0 -+ vmovups %xmm0, -17(%edx) -+ jmp .LBB0_6 -+.LBB0_47: -+ vmovups -146(%ecx), %ymm0 -+ vmovups %ymm0, -146(%edx) -+ vmovups -114(%ecx), %ymm0 -+ vmovups %ymm0, -114(%edx) -+ vmovups -82(%ecx), %ymm0 -+ vmovups %ymm0, -82(%edx) -+ vmovups -50(%ecx), %ymm0 -+ vmovups %ymm0, -50(%edx) -+.LBB0_48: -+ vmovups -18(%ecx), %xmm0 -+ vmovups %xmm0, -18(%edx) -+ jmp .LBB0_10 -+.LBB0_49: -+ vmovups -147(%ecx), %ymm0 -+ vmovups %ymm0, -147(%edx) -+ vmovups -115(%ecx), %ymm0 -+ vmovups %ymm0, -115(%edx) -+ vmovups -83(%ecx), %ymm0 -+ vmovups %ymm0, -83(%edx) -+ vmovups -51(%ecx), %ymm0 -+ vmovups %ymm0, -51(%edx) -+.LBB0_50: -+ vmovups -19(%ecx), %xmm0 -+ vmovups %xmm0, -19(%edx) -+ jmp .LBB0_16 -+.LBB0_51: -+ vmovups -148(%ecx), %ymm0 -+ vmovups %ymm0, -148(%edx) -+ vmovups -116(%ecx), %ymm0 -+ vmovups %ymm0, -116(%edx) -+ vmovups -84(%ecx), %ymm0 -+ vmovups %ymm0, -84(%edx) -+ vmovups -52(%ecx), %ymm0 -+ vmovups %ymm0, -52(%edx) -+.LBB0_52: -+ vmovups -20(%ecx), %xmm0 -+ vmovups %xmm0, -20(%edx) -+ jmp .LBB0_16 -+.LBB0_53: -+ vmovups -149(%ecx), %ymm0 -+ vmovups %ymm0, -149(%edx) -+ vmovups -117(%ecx), %ymm0 -+ vmovups %ymm0, -117(%edx) -+ vmovups -85(%ecx), %ymm0 -+ vmovups %ymm0, -85(%edx) -+ vmovups -53(%ecx), %ymm0 -+ vmovups %ymm0, -53(%edx) -+.LBB0_54: -+ vmovups -21(%ecx), %xmm0 -+ vmovups %xmm0, -21(%edx) -+ jmp .LBB0_26 -+.LBB0_55: -+ vmovups -150(%ecx), %ymm0 -+ vmovups %ymm0, -150(%edx) -+ vmovups 
-118(%ecx), %ymm0 -+ vmovups %ymm0, -118(%edx) -+ vmovups -86(%ecx), %ymm0 -+ vmovups %ymm0, -86(%edx) -+ vmovups -54(%ecx), %ymm0 -+ vmovups %ymm0, -54(%edx) -+.LBB0_56: -+ vmovups -22(%ecx), %xmm0 -+ vmovups %xmm0, -22(%edx) -+ jmp .LBB0_26 -+.LBB0_57: -+ vmovups -151(%ecx), %ymm0 -+ vmovups %ymm0, -151(%edx) -+ vmovups -119(%ecx), %ymm0 -+ vmovups %ymm0, -119(%edx) -+ vmovups -87(%ecx), %ymm0 -+ vmovups %ymm0, -87(%edx) -+ vmovups -55(%ecx), %ymm0 -+ vmovups %ymm0, -55(%edx) -+.LBB0_58: -+ vmovups -23(%ecx), %xmm0 -+ vmovups %xmm0, -23(%edx) -+ jmp .LBB0_26 -+.LBB0_59: -+ vmovups -152(%ecx), %ymm0 -+ vmovups %ymm0, -152(%edx) -+ vmovups -120(%ecx), %ymm0 -+ vmovups %ymm0, -120(%edx) -+ vmovups -88(%ecx), %ymm0 -+ vmovups %ymm0, -88(%edx) -+ vmovups -56(%ecx), %ymm0 -+ vmovups %ymm0, -56(%edx) -+.LBB0_60: -+ vmovups -24(%ecx), %xmm0 -+ vmovups %xmm0, -24(%edx) -+ jmp .LBB0_26 -+.LBB0_61: -+ vmovups -153(%ecx), %ymm0 -+ vmovups %ymm0, -153(%edx) -+ vmovups -121(%ecx), %ymm0 -+ vmovups %ymm0, -121(%edx) -+ vmovups -89(%ecx), %ymm0 -+ vmovups %ymm0, -89(%edx) -+ vmovups -57(%ecx), %ymm0 -+ vmovups %ymm0, -57(%edx) -+.LBB0_62: -+ vmovups -25(%ecx), %xmm0 -+ vmovups %xmm0, -25(%edx) -+ jmp .LBB0_44 -+.LBB0_63: -+ vmovups -154(%ecx), %ymm0 -+ vmovups %ymm0, -154(%edx) -+ vmovups -122(%ecx), %ymm0 -+ vmovups %ymm0, -122(%edx) -+ vmovups -90(%ecx), %ymm0 -+ vmovups %ymm0, -90(%edx) -+ vmovups -58(%ecx), %ymm0 -+ vmovups %ymm0, -58(%edx) -+.LBB0_64: -+ vmovups -26(%ecx), %xmm0 -+ vmovups %xmm0, -26(%edx) -+ jmp .LBB0_44 -+.LBB0_65: -+ vmovups -155(%ecx), %ymm0 -+ vmovups %ymm0, -155(%edx) -+ vmovups -123(%ecx), %ymm0 -+ vmovups %ymm0, -123(%edx) -+ vmovups -91(%ecx), %ymm0 -+ vmovups %ymm0, -91(%edx) -+ vmovups -59(%ecx), %ymm0 -+ vmovups %ymm0, -59(%edx) -+.LBB0_66: -+ vmovups -27(%ecx), %xmm0 -+ vmovups %xmm0, -27(%edx) -+ jmp .LBB0_44 -+.LBB0_67: -+ vmovups -156(%ecx), %ymm0 -+ vmovups %ymm0, -156(%edx) -+ vmovups -124(%ecx), %ymm0 -+ vmovups %ymm0, -124(%edx) -+ vmovups -92(%ecx), %ymm0 -+ vmovups %ymm0, -92(%edx) -+ vmovups -60(%ecx), %ymm0 -+ vmovups %ymm0, -60(%edx) -+.LBB0_68: -+ vmovups -28(%ecx), %xmm0 -+ vmovups %xmm0, -28(%edx) -+ jmp .LBB0_44 -+.LBB0_69: -+ vmovups -157(%ecx), %ymm0 -+ vmovups %ymm0, -157(%edx) -+ vmovups -125(%ecx), %ymm0 -+ vmovups %ymm0, -125(%edx) -+ vmovups -93(%ecx), %ymm0 -+ vmovups %ymm0, -93(%edx) -+ vmovups -61(%ecx), %ymm0 -+ vmovups %ymm0, -61(%edx) -+.LBB0_70: -+ vmovups -29(%ecx), %xmm0 -+ vmovups %xmm0, -29(%edx) -+ jmp .LBB0_44 -+.LBB0_71: -+ vmovups -158(%ecx), %ymm0 -+ vmovups %ymm0, -158(%edx) -+ vmovups -126(%ecx), %ymm0 -+ vmovups %ymm0, -126(%edx) -+ vmovups -94(%ecx), %ymm0 -+ vmovups %ymm0, -94(%edx) -+ vmovups -62(%ecx), %ymm0 -+ vmovups %ymm0, -62(%edx) -+.LBB0_72: -+ vmovups -30(%ecx), %xmm0 -+ vmovups %xmm0, -30(%edx) -+ jmp .LBB0_44 -+.LBB0_73: -+ vmovups -159(%ecx), %ymm0 -+ vmovups %ymm0, -159(%edx) -+ vmovups -127(%ecx), %ymm0 -+ vmovups %ymm0, -127(%edx) -+ vmovups -95(%ecx), %ymm0 -+ vmovups %ymm0, -95(%edx) -+ vmovups -63(%ecx), %ymm0 -+ vmovups %ymm0, -63(%edx) -+.LBB0_74: -+ vmovups -31(%ecx), %xmm0 -+ vmovups %xmm0, -31(%edx) -+ jmp .LBB0_44 -+.LBB0_75: -+ vmovups -193(%ecx), %ymm0 -+ vmovups %ymm0, -193(%edx) -+.LBB0_76: -+ vmovups -161(%ecx), %ymm0 -+ vmovups %ymm0, -161(%edx) -+.LBB0_3: -+ vmovups -129(%ecx), %ymm0 -+ vmovups %ymm0, -129(%edx) -+ vmovups -97(%ecx), %ymm0 -+ vmovups %ymm0, -97(%edx) -+.LBB0_4: -+ vmovups -65(%ecx), %ymm0 -+ vmovups %ymm0, -65(%edx) -+.LBB0_5: -+ vmovups -33(%ecx), %ymm0 -+ vmovups %ymm0, 
-33(%edx) -+.LBB0_6: -+ movb -1(%ecx), %cl -+ movb %cl, -1(%edx) -+ jmp .LBB0_270 -+.LBB0_77: -+ vmovups -194(%ecx), %ymm0 -+ vmovups %ymm0, -194(%edx) -+.LBB0_78: -+ vmovups -162(%ecx), %ymm0 -+ vmovups %ymm0, -162(%edx) -+.LBB0_7: -+ vmovups -130(%ecx), %ymm0 -+ vmovups %ymm0, -130(%edx) -+ vmovups -98(%ecx), %ymm0 -+ vmovups %ymm0, -98(%edx) -+.LBB0_8: -+ vmovups -66(%ecx), %ymm0 -+ vmovups %ymm0, -66(%edx) -+.LBB0_9: -+ vmovups -34(%ecx), %ymm0 -+ vmovups %ymm0, -34(%edx) -+.LBB0_10: -+ movzwl -2(%ecx), %ecx -+ movw %cx, -2(%edx) -+ jmp .LBB0_270 -+.LBB0_79: -+ vmovups -195(%ecx), %ymm0 -+ vmovups %ymm0, -195(%edx) -+.LBB0_80: -+ vmovups -163(%ecx), %ymm0 -+ vmovups %ymm0, -163(%edx) -+ vmovups -131(%ecx), %ymm0 -+ vmovups %ymm0, -131(%edx) -+ vmovups -99(%ecx), %ymm0 -+ vmovups %ymm0, -99(%edx) -+.LBB0_81: -+ vmovups -67(%ecx), %ymm0 -+ vmovups %ymm0, -67(%edx) -+.LBB0_82: -+ vmovups -35(%ecx), %ymm0 -+ vmovups %ymm0, -35(%edx) -+ jmp .LBB0_16 -+.LBB0_83: -+ vmovups -196(%ecx), %ymm0 -+ vmovups %ymm0, -196(%edx) -+.LBB0_84: -+ vmovups -164(%ecx), %ymm0 -+ vmovups %ymm0, -164(%edx) -+.LBB0_13: -+ vmovups -132(%ecx), %ymm0 -+ vmovups %ymm0, -132(%edx) -+ vmovups -100(%ecx), %ymm0 -+ vmovups %ymm0, -100(%edx) -+.LBB0_14: -+ vmovups -68(%ecx), %ymm0 -+ vmovups %ymm0, -68(%edx) -+.LBB0_15: -+ vmovups -36(%ecx), %ymm0 -+ vmovups %ymm0, -36(%edx) -+.LBB0_16: -+ movl -4(%ecx), %ecx -+ movl %ecx, -4(%edx) -+ jmp .LBB0_270 -+.LBB0_85: -+ vmovups -197(%ecx), %ymm0 -+ vmovups %ymm0, -197(%edx) -+.LBB0_86: -+ vmovups -165(%ecx), %ymm0 -+ vmovups %ymm0, -165(%edx) -+ vmovups -133(%ecx), %ymm0 -+ vmovups %ymm0, -133(%edx) -+ vmovups -101(%ecx), %ymm0 -+ vmovups %ymm0, -101(%edx) -+.LBB0_87: -+ vmovups -69(%ecx), %ymm0 -+ vmovups %ymm0, -69(%edx) -+.LBB0_88: -+ vmovups -37(%ecx), %ymm0 -+ vmovups %ymm0, -37(%edx) -+ jmp .LBB0_26 -+.LBB0_89: -+ vmovups -198(%ecx), %ymm0 -+ vmovups %ymm0, -198(%edx) -+.LBB0_90: -+ vmovups -166(%ecx), %ymm0 -+ vmovups %ymm0, -166(%edx) -+ vmovups -134(%ecx), %ymm0 -+ vmovups %ymm0, -134(%edx) -+ vmovups -102(%ecx), %ymm0 -+ vmovups %ymm0, -102(%edx) -+.LBB0_91: -+ vmovups -70(%ecx), %ymm0 -+ vmovups %ymm0, -70(%edx) -+.LBB0_92: -+ vmovups -38(%ecx), %ymm0 -+ vmovups %ymm0, -38(%edx) -+ jmp .LBB0_26 -+.LBB0_93: -+ vmovups -199(%ecx), %ymm0 -+ vmovups %ymm0, -199(%edx) -+.LBB0_94: -+ vmovups -167(%ecx), %ymm0 -+ vmovups %ymm0, -167(%edx) -+ vmovups -135(%ecx), %ymm0 -+ vmovups %ymm0, -135(%edx) -+ vmovups -103(%ecx), %ymm0 -+ vmovups %ymm0, -103(%edx) -+.LBB0_95: -+ vmovups -71(%ecx), %ymm0 -+ vmovups %ymm0, -71(%edx) -+.LBB0_96: -+ vmovups -39(%ecx), %ymm0 -+ vmovups %ymm0, -39(%edx) -+ jmp .LBB0_26 -+.LBB0_97: -+ vmovups -200(%ecx), %ymm0 -+ vmovups %ymm0, -200(%edx) -+.LBB0_98: -+ vmovups -168(%ecx), %ymm0 -+ vmovups %ymm0, -168(%edx) -+.LBB0_23: -+ vmovups -136(%ecx), %ymm0 -+ vmovups %ymm0, -136(%edx) -+ vmovups -104(%ecx), %ymm0 -+ vmovups %ymm0, -104(%edx) -+.LBB0_24: -+ vmovups -72(%ecx), %ymm0 -+ vmovups %ymm0, -72(%edx) -+.LBB0_25: -+ vmovups -40(%ecx), %ymm0 -+ vmovups %ymm0, -40(%edx) -+.LBB0_26: -+ vmovsd -8(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -8(%edx) -+ jmp .LBB0_270 -+.LBB0_99: -+ vmovups -201(%ecx), %ymm0 -+ vmovups %ymm0, -201(%edx) -+.LBB0_100: -+ vmovups -169(%ecx), %ymm0 -+ vmovups %ymm0, -169(%edx) -+ vmovups -137(%ecx), %ymm0 -+ vmovups %ymm0, -137(%edx) -+ vmovups -105(%ecx), %ymm0 -+ vmovups %ymm0, -105(%edx) -+.LBB0_101: -+ vmovups -73(%ecx), %ymm0 -+ vmovups %ymm0, -73(%edx) -+.LBB0_102: -+ vmovups -41(%ecx), %ymm0 -+ vmovups 
%ymm0, -41(%edx) -+ jmp .LBB0_44 -+.LBB0_103: -+ vmovups -202(%ecx), %ymm0 -+ vmovups %ymm0, -202(%edx) -+.LBB0_104: -+ vmovups -170(%ecx), %ymm0 -+ vmovups %ymm0, -170(%edx) -+ vmovups -138(%ecx), %ymm0 -+ vmovups %ymm0, -138(%edx) -+ vmovups -106(%ecx), %ymm0 -+ vmovups %ymm0, -106(%edx) -+.LBB0_105: -+ vmovups -74(%ecx), %ymm0 -+ vmovups %ymm0, -74(%edx) -+.LBB0_106: -+ vmovups -42(%ecx), %ymm0 -+ vmovups %ymm0, -42(%edx) -+ jmp .LBB0_44 -+.LBB0_107: -+ vmovups -203(%ecx), %ymm0 -+ vmovups %ymm0, -203(%edx) -+.LBB0_108: -+ vmovups -171(%ecx), %ymm0 -+ vmovups %ymm0, -171(%edx) -+ vmovups -139(%ecx), %ymm0 -+ vmovups %ymm0, -139(%edx) -+ vmovups -107(%ecx), %ymm0 -+ vmovups %ymm0, -107(%edx) -+.LBB0_109: -+ vmovups -75(%ecx), %ymm0 -+ vmovups %ymm0, -75(%edx) -+.LBB0_110: -+ vmovups -43(%ecx), %ymm0 -+ vmovups %ymm0, -43(%edx) -+ jmp .LBB0_44 -+.LBB0_111: -+ vmovups -204(%ecx), %ymm0 -+ vmovups %ymm0, -204(%edx) -+.LBB0_112: -+ vmovups -172(%ecx), %ymm0 -+ vmovups %ymm0, -172(%edx) -+ vmovups -140(%ecx), %ymm0 -+ vmovups %ymm0, -140(%edx) -+ vmovups -108(%ecx), %ymm0 -+ vmovups %ymm0, -108(%edx) -+.LBB0_113: -+ vmovups -76(%ecx), %ymm0 -+ vmovups %ymm0, -76(%edx) -+.LBB0_114: -+ vmovups -44(%ecx), %ymm0 -+ vmovups %ymm0, -44(%edx) -+ jmp .LBB0_44 -+.LBB0_115: -+ vmovups -205(%ecx), %ymm0 -+ vmovups %ymm0, -205(%edx) -+.LBB0_116: -+ vmovups -173(%ecx), %ymm0 -+ vmovups %ymm0, -173(%edx) -+ vmovups -141(%ecx), %ymm0 -+ vmovups %ymm0, -141(%edx) -+ vmovups -109(%ecx), %ymm0 -+ vmovups %ymm0, -109(%edx) -+.LBB0_117: -+ vmovups -77(%ecx), %ymm0 -+ vmovups %ymm0, -77(%edx) -+.LBB0_118: -+ vmovups -45(%ecx), %ymm0 -+ vmovups %ymm0, -45(%edx) -+ jmp .LBB0_44 -+.LBB0_119: -+ vmovups -206(%ecx), %ymm0 -+ vmovups %ymm0, -206(%edx) -+.LBB0_120: -+ vmovups -174(%ecx), %ymm0 -+ vmovups %ymm0, -174(%edx) -+ vmovups -142(%ecx), %ymm0 -+ vmovups %ymm0, -142(%edx) -+ vmovups -110(%ecx), %ymm0 -+ vmovups %ymm0, -110(%edx) -+.LBB0_121: -+ vmovups -78(%ecx), %ymm0 -+ vmovups %ymm0, -78(%edx) -+.LBB0_122: -+ vmovups -46(%ecx), %ymm0 -+ vmovups %ymm0, -46(%edx) -+ jmp .LBB0_44 -+.LBB0_123: -+ vmovups -207(%ecx), %ymm0 -+ vmovups %ymm0, -207(%edx) -+.LBB0_124: -+ vmovups -175(%ecx), %ymm0 -+ vmovups %ymm0, -175(%edx) -+ vmovups -143(%ecx), %ymm0 -+ vmovups %ymm0, -143(%edx) -+ vmovups -111(%ecx), %ymm0 -+ vmovups %ymm0, -111(%edx) -+.LBB0_125: -+ vmovups -79(%ecx), %ymm0 -+ vmovups %ymm0, -79(%edx) -+.LBB0_126: -+ vmovups -47(%ecx), %ymm0 -+ vmovups %ymm0, -47(%edx) -+ jmp .LBB0_44 -+.LBB0_127: -+ vmovups -208(%ecx), %ymm0 -+ vmovups %ymm0, -208(%edx) -+.LBB0_128: -+ vmovups -176(%ecx), %ymm0 -+ vmovups %ymm0, -176(%edx) -+.LBB0_41: -+ vmovups -144(%ecx), %ymm0 -+ vmovups %ymm0, -144(%edx) -+ vmovups -112(%ecx), %ymm0 -+ vmovups %ymm0, -112(%edx) -+.LBB0_42: -+ vmovups -80(%ecx), %ymm0 -+ vmovups %ymm0, -80(%edx) -+.LBB0_43: -+ vmovups -48(%ecx), %ymm0 -+ vmovups %ymm0, -48(%edx) -+.LBB0_44: -+ vmovups -16(%ecx), %xmm0 -+ vmovups %xmm0, -16(%edx) -+ jmp .LBB0_270 -+.LBB0_129: -+ vmovups -209(%ecx), %ymm0 -+ vmovups %ymm0, -209(%edx) -+.LBB0_130: -+ vmovups -177(%ecx), %ymm0 -+ vmovups %ymm0, -177(%edx) -+ vmovups -145(%ecx), %ymm0 -+ vmovups %ymm0, -145(%edx) -+ vmovups -113(%ecx), %ymm0 -+ vmovups %ymm0, -113(%edx) -+.LBB0_131: -+ vmovups -81(%ecx), %ymm0 -+ vmovups %ymm0, -81(%edx) -+.LBB0_132: -+ vmovups -49(%ecx), %ymm0 -+ vmovups %ymm0, -49(%edx) -+ jmp .LBB0_269 -+.LBB0_133: -+ vmovups -210(%ecx), %ymm0 -+ vmovups %ymm0, -210(%edx) -+.LBB0_134: -+ vmovups -178(%ecx), %ymm0 -+ vmovups %ymm0, -178(%edx) -+ 
vmovups -146(%ecx), %ymm0 -+ vmovups %ymm0, -146(%edx) -+ vmovups -114(%ecx), %ymm0 -+ vmovups %ymm0, -114(%edx) -+.LBB0_135: -+ vmovups -82(%ecx), %ymm0 -+ vmovups %ymm0, -82(%edx) -+.LBB0_136: -+ vmovups -50(%ecx), %ymm0 -+ vmovups %ymm0, -50(%edx) -+ jmp .LBB0_269 -+.LBB0_137: -+ vmovups -211(%ecx), %ymm0 -+ vmovups %ymm0, -211(%edx) -+.LBB0_138: -+ vmovups -179(%ecx), %ymm0 -+ vmovups %ymm0, -179(%edx) -+ vmovups -147(%ecx), %ymm0 -+ vmovups %ymm0, -147(%edx) -+ vmovups -115(%ecx), %ymm0 -+ vmovups %ymm0, -115(%edx) -+.LBB0_139: -+ vmovups -83(%ecx), %ymm0 -+ vmovups %ymm0, -83(%edx) -+.LBB0_140: -+ vmovups -51(%ecx), %ymm0 -+ vmovups %ymm0, -51(%edx) -+ jmp .LBB0_269 -+.LBB0_141: -+ vmovups -212(%ecx), %ymm0 -+ vmovups %ymm0, -212(%edx) -+.LBB0_142: -+ vmovups -180(%ecx), %ymm0 -+ vmovups %ymm0, -180(%edx) -+ vmovups -148(%ecx), %ymm0 -+ vmovups %ymm0, -148(%edx) -+ vmovups -116(%ecx), %ymm0 -+ vmovups %ymm0, -116(%edx) -+.LBB0_143: -+ vmovups -84(%ecx), %ymm0 -+ vmovups %ymm0, -84(%edx) -+.LBB0_144: -+ vmovups -52(%ecx), %ymm0 -+ vmovups %ymm0, -52(%edx) -+ jmp .LBB0_269 -+.LBB0_145: -+ vmovups -213(%ecx), %ymm0 -+ vmovups %ymm0, -213(%edx) -+.LBB0_146: -+ vmovups -181(%ecx), %ymm0 -+ vmovups %ymm0, -181(%edx) -+ vmovups -149(%ecx), %ymm0 -+ vmovups %ymm0, -149(%edx) -+ vmovups -117(%ecx), %ymm0 -+ vmovups %ymm0, -117(%edx) -+.LBB0_147: -+ vmovups -85(%ecx), %ymm0 -+ vmovups %ymm0, -85(%edx) -+.LBB0_148: -+ vmovups -53(%ecx), %ymm0 -+ vmovups %ymm0, -53(%edx) -+ jmp .LBB0_269 -+.LBB0_149: -+ vmovups -214(%ecx), %ymm0 -+ vmovups %ymm0, -214(%edx) -+.LBB0_150: -+ vmovups -182(%ecx), %ymm0 -+ vmovups %ymm0, -182(%edx) -+ vmovups -150(%ecx), %ymm0 -+ vmovups %ymm0, -150(%edx) -+ vmovups -118(%ecx), %ymm0 -+ vmovups %ymm0, -118(%edx) -+.LBB0_151: -+ vmovups -86(%ecx), %ymm0 -+ vmovups %ymm0, -86(%edx) -+.LBB0_152: -+ vmovups -54(%ecx), %ymm0 -+ vmovups %ymm0, -54(%edx) -+ jmp .LBB0_269 -+.LBB0_153: -+ vmovups -215(%ecx), %ymm0 -+ vmovups %ymm0, -215(%edx) -+.LBB0_154: -+ vmovups -183(%ecx), %ymm0 -+ vmovups %ymm0, -183(%edx) -+ vmovups -151(%ecx), %ymm0 -+ vmovups %ymm0, -151(%edx) -+ vmovups -119(%ecx), %ymm0 -+ vmovups %ymm0, -119(%edx) -+.LBB0_155: -+ vmovups -87(%ecx), %ymm0 -+ vmovups %ymm0, -87(%edx) -+.LBB0_156: -+ vmovups -55(%ecx), %ymm0 -+ vmovups %ymm0, -55(%edx) -+ jmp .LBB0_269 -+.LBB0_157: -+ vmovups -216(%ecx), %ymm0 -+ vmovups %ymm0, -216(%edx) -+.LBB0_158: -+ vmovups -184(%ecx), %ymm0 -+ vmovups %ymm0, -184(%edx) -+ vmovups -152(%ecx), %ymm0 -+ vmovups %ymm0, -152(%edx) -+ vmovups -120(%ecx), %ymm0 -+ vmovups %ymm0, -120(%edx) -+.LBB0_159: -+ vmovups -88(%ecx), %ymm0 -+ vmovups %ymm0, -88(%edx) -+.LBB0_160: -+ vmovups -56(%ecx), %ymm0 -+ vmovups %ymm0, -56(%edx) -+ jmp .LBB0_269 -+.LBB0_161: -+ vmovups -217(%ecx), %ymm0 -+ vmovups %ymm0, -217(%edx) -+.LBB0_162: -+ vmovups -185(%ecx), %ymm0 -+ vmovups %ymm0, -185(%edx) -+ vmovups -153(%ecx), %ymm0 -+ vmovups %ymm0, -153(%edx) -+ vmovups -121(%ecx), %ymm0 -+ vmovups %ymm0, -121(%edx) -+.LBB0_163: -+ vmovups -89(%ecx), %ymm0 -+ vmovups %ymm0, -89(%edx) -+.LBB0_164: -+ vmovups -57(%ecx), %ymm0 -+ vmovups %ymm0, -57(%edx) -+ jmp .LBB0_269 -+.LBB0_165: -+ vmovups -218(%ecx), %ymm0 -+ vmovups %ymm0, -218(%edx) -+.LBB0_166: -+ vmovups -186(%ecx), %ymm0 -+ vmovups %ymm0, -186(%edx) -+ vmovups -154(%ecx), %ymm0 -+ vmovups %ymm0, -154(%edx) -+ vmovups -122(%ecx), %ymm0 -+ vmovups %ymm0, -122(%edx) -+.LBB0_167: -+ vmovups -90(%ecx), %ymm0 -+ vmovups %ymm0, -90(%edx) -+.LBB0_168: -+ vmovups -58(%ecx), %ymm0 -+ vmovups %ymm0, -58(%edx) 
-+ jmp .LBB0_269 -+.LBB0_169: -+ vmovups -219(%ecx), %ymm0 -+ vmovups %ymm0, -219(%edx) -+.LBB0_170: -+ vmovups -187(%ecx), %ymm0 -+ vmovups %ymm0, -187(%edx) -+ vmovups -155(%ecx), %ymm0 -+ vmovups %ymm0, -155(%edx) -+ vmovups -123(%ecx), %ymm0 -+ vmovups %ymm0, -123(%edx) -+.LBB0_171: -+ vmovups -91(%ecx), %ymm0 -+ vmovups %ymm0, -91(%edx) -+.LBB0_172: -+ vmovups -59(%ecx), %ymm0 -+ vmovups %ymm0, -59(%edx) -+ jmp .LBB0_269 -+.LBB0_173: -+ vmovups -220(%ecx), %ymm0 -+ vmovups %ymm0, -220(%edx) -+.LBB0_174: -+ vmovups -188(%ecx), %ymm0 -+ vmovups %ymm0, -188(%edx) -+ vmovups -156(%ecx), %ymm0 -+ vmovups %ymm0, -156(%edx) -+ vmovups -124(%ecx), %ymm0 -+ vmovups %ymm0, -124(%edx) -+.LBB0_175: -+ vmovups -92(%ecx), %ymm0 -+ vmovups %ymm0, -92(%edx) -+.LBB0_176: -+ vmovups -60(%ecx), %ymm0 -+ vmovups %ymm0, -60(%edx) -+ jmp .LBB0_269 -+.LBB0_177: -+ vmovups -221(%ecx), %ymm0 -+ vmovups %ymm0, -221(%edx) -+.LBB0_178: -+ vmovups -189(%ecx), %ymm0 -+ vmovups %ymm0, -189(%edx) -+ vmovups -157(%ecx), %ymm0 -+ vmovups %ymm0, -157(%edx) -+ vmovups -125(%ecx), %ymm0 -+ vmovups %ymm0, -125(%edx) -+.LBB0_179: -+ vmovups -93(%ecx), %ymm0 -+ vmovups %ymm0, -93(%edx) -+.LBB0_180: -+ vmovups -61(%ecx), %ymm0 -+ vmovups %ymm0, -61(%edx) -+ jmp .LBB0_269 -+.LBB0_181: -+ vmovups -222(%ecx), %ymm0 -+ vmovups %ymm0, -222(%edx) -+.LBB0_182: -+ vmovups -190(%ecx), %ymm0 -+ vmovups %ymm0, -190(%edx) -+ vmovups -158(%ecx), %ymm0 -+ vmovups %ymm0, -158(%edx) -+ vmovups -126(%ecx), %ymm0 -+ vmovups %ymm0, -126(%edx) -+.LBB0_183: -+ vmovups -94(%ecx), %ymm0 -+ vmovups %ymm0, -94(%edx) -+.LBB0_184: -+ vmovups -62(%ecx), %ymm0 -+ vmovups %ymm0, -62(%edx) -+ jmp .LBB0_269 -+.LBB0_185: -+ vmovups -223(%ecx), %ymm0 -+ vmovups %ymm0, -223(%edx) -+.LBB0_186: -+ vmovups -191(%ecx), %ymm0 -+ vmovups %ymm0, -191(%edx) -+ vmovups -159(%ecx), %ymm0 -+ vmovups %ymm0, -159(%edx) -+ vmovups -127(%ecx), %ymm0 -+ vmovups %ymm0, -127(%edx) -+.LBB0_187: -+ vmovups -95(%ecx), %ymm0 -+ vmovups %ymm0, -95(%edx) -+.LBB0_188: -+ vmovups -63(%ecx), %ymm0 -+ vmovups %ymm0, -63(%edx) -+ jmp .LBB0_269 -+.LBB0_189: -+ vmovups -225(%ecx), %ymm0 -+ vmovups %ymm0, -225(%edx) -+ vmovups -193(%ecx), %ymm0 -+ vmovups %ymm0, -193(%edx) -+ vmovups -161(%ecx), %ymm0 -+ vmovups %ymm0, -161(%edx) -+ vmovups -129(%ecx), %ymm0 -+ vmovups %ymm0, -129(%edx) -+.LBB0_190: -+ vmovups -97(%ecx), %ymm0 -+ vmovups %ymm0, -97(%edx) -+ vmovups -65(%ecx), %ymm0 -+ vmovups %ymm0, -65(%edx) -+ jmp .LBB0_268 -+.LBB0_191: -+ vmovups -226(%ecx), %ymm0 -+ vmovups %ymm0, -226(%edx) -+ vmovups -194(%ecx), %ymm0 -+ vmovups %ymm0, -194(%edx) -+ vmovups -162(%ecx), %ymm0 -+ vmovups %ymm0, -162(%edx) -+ vmovups -130(%ecx), %ymm0 -+ vmovups %ymm0, -130(%edx) -+.LBB0_192: -+ vmovups -98(%ecx), %ymm0 -+ vmovups %ymm0, -98(%edx) -+ vmovups -66(%ecx), %ymm0 -+ vmovups %ymm0, -66(%edx) -+ jmp .LBB0_268 -+.LBB0_193: -+ vmovups -227(%ecx), %ymm0 -+ vmovups %ymm0, -227(%edx) -+ vmovups -195(%ecx), %ymm0 -+ vmovups %ymm0, -195(%edx) -+ vmovups -163(%ecx), %ymm0 -+ vmovups %ymm0, -163(%edx) -+ vmovups -131(%ecx), %ymm0 -+ vmovups %ymm0, -131(%edx) -+.LBB0_194: -+ vmovups -99(%ecx), %ymm0 -+ vmovups %ymm0, -99(%edx) -+ vmovups -67(%ecx), %ymm0 -+ vmovups %ymm0, -67(%edx) -+ jmp .LBB0_268 -+.LBB0_195: -+ vmovups -228(%ecx), %ymm0 -+ vmovups %ymm0, -228(%edx) -+ vmovups -196(%ecx), %ymm0 -+ vmovups %ymm0, -196(%edx) -+ vmovups -164(%ecx), %ymm0 -+ vmovups %ymm0, -164(%edx) -+ vmovups -132(%ecx), %ymm0 -+ vmovups %ymm0, -132(%edx) -+.LBB0_196: -+ vmovups -100(%ecx), %ymm0 -+ vmovups %ymm0, 
-100(%edx) -+ vmovups -68(%ecx), %ymm0 -+ vmovups %ymm0, -68(%edx) -+ jmp .LBB0_268 -+.LBB0_197: -+ vmovups -229(%ecx), %ymm0 -+ vmovups %ymm0, -229(%edx) -+ vmovups -197(%ecx), %ymm0 -+ vmovups %ymm0, -197(%edx) -+ vmovups -165(%ecx), %ymm0 -+ vmovups %ymm0, -165(%edx) -+ vmovups -133(%ecx), %ymm0 -+ vmovups %ymm0, -133(%edx) -+.LBB0_198: -+ vmovups -101(%ecx), %ymm0 -+ vmovups %ymm0, -101(%edx) -+ vmovups -69(%ecx), %ymm0 -+ vmovups %ymm0, -69(%edx) -+ jmp .LBB0_268 -+.LBB0_199: -+ vmovups -230(%ecx), %ymm0 -+ vmovups %ymm0, -230(%edx) -+ vmovups -198(%ecx), %ymm0 -+ vmovups %ymm0, -198(%edx) -+ vmovups -166(%ecx), %ymm0 -+ vmovups %ymm0, -166(%edx) -+ vmovups -134(%ecx), %ymm0 -+ vmovups %ymm0, -134(%edx) -+.LBB0_200: -+ vmovups -102(%ecx), %ymm0 -+ vmovups %ymm0, -102(%edx) -+ vmovups -70(%ecx), %ymm0 -+ vmovups %ymm0, -70(%edx) -+ jmp .LBB0_268 -+.LBB0_201: -+ vmovups -231(%ecx), %ymm0 -+ vmovups %ymm0, -231(%edx) -+ vmovups -199(%ecx), %ymm0 -+ vmovups %ymm0, -199(%edx) -+ vmovups -167(%ecx), %ymm0 -+ vmovups %ymm0, -167(%edx) -+ vmovups -135(%ecx), %ymm0 -+ vmovups %ymm0, -135(%edx) -+.LBB0_202: -+ vmovups -103(%ecx), %ymm0 -+ vmovups %ymm0, -103(%edx) -+ vmovups -71(%ecx), %ymm0 -+ vmovups %ymm0, -71(%edx) -+ jmp .LBB0_268 -+.LBB0_203: -+ vmovups -232(%ecx), %ymm0 -+ vmovups %ymm0, -232(%edx) -+ vmovups -200(%ecx), %ymm0 -+ vmovups %ymm0, -200(%edx) -+ vmovups -168(%ecx), %ymm0 -+ vmovups %ymm0, -168(%edx) -+ vmovups -136(%ecx), %ymm0 -+ vmovups %ymm0, -136(%edx) -+.LBB0_204: -+ vmovups -104(%ecx), %ymm0 -+ vmovups %ymm0, -104(%edx) -+ vmovups -72(%ecx), %ymm0 -+ vmovups %ymm0, -72(%edx) -+ jmp .LBB0_268 -+.LBB0_205: -+ vmovups -233(%ecx), %ymm0 -+ vmovups %ymm0, -233(%edx) -+ vmovups -201(%ecx), %ymm0 -+ vmovups %ymm0, -201(%edx) -+ vmovups -169(%ecx), %ymm0 -+ vmovups %ymm0, -169(%edx) -+ vmovups -137(%ecx), %ymm0 -+ vmovups %ymm0, -137(%edx) -+.LBB0_206: -+ vmovups -105(%ecx), %ymm0 -+ vmovups %ymm0, -105(%edx) -+ vmovups -73(%ecx), %ymm0 -+ vmovups %ymm0, -73(%edx) -+ jmp .LBB0_268 -+.LBB0_207: -+ vmovups -234(%ecx), %ymm0 -+ vmovups %ymm0, -234(%edx) -+ vmovups -202(%ecx), %ymm0 -+ vmovups %ymm0, -202(%edx) -+ vmovups -170(%ecx), %ymm0 -+ vmovups %ymm0, -170(%edx) -+ vmovups -138(%ecx), %ymm0 -+ vmovups %ymm0, -138(%edx) -+.LBB0_208: -+ vmovups -106(%ecx), %ymm0 -+ vmovups %ymm0, -106(%edx) -+ vmovups -74(%ecx), %ymm0 -+ vmovups %ymm0, -74(%edx) -+ jmp .LBB0_268 -+.LBB0_209: -+ vmovups -235(%ecx), %ymm0 -+ vmovups %ymm0, -235(%edx) -+ vmovups -203(%ecx), %ymm0 -+ vmovups %ymm0, -203(%edx) -+ vmovups -171(%ecx), %ymm0 -+ vmovups %ymm0, -171(%edx) -+ vmovups -139(%ecx), %ymm0 -+ vmovups %ymm0, -139(%edx) -+.LBB0_210: -+ vmovups -107(%ecx), %ymm0 -+ vmovups %ymm0, -107(%edx) -+ vmovups -75(%ecx), %ymm0 -+ vmovups %ymm0, -75(%edx) -+ jmp .LBB0_268 -+.LBB0_211: -+ vmovups -236(%ecx), %ymm0 -+ vmovups %ymm0, -236(%edx) -+ vmovups -204(%ecx), %ymm0 -+ vmovups %ymm0, -204(%edx) -+ vmovups -172(%ecx), %ymm0 -+ vmovups %ymm0, -172(%edx) -+ vmovups -140(%ecx), %ymm0 -+ vmovups %ymm0, -140(%edx) -+.LBB0_212: -+ vmovups -108(%ecx), %ymm0 -+ vmovups %ymm0, -108(%edx) -+ vmovups -76(%ecx), %ymm0 -+ vmovups %ymm0, -76(%edx) -+ jmp .LBB0_268 -+.LBB0_213: -+ vmovups -237(%ecx), %ymm0 -+ vmovups %ymm0, -237(%edx) -+ vmovups -205(%ecx), %ymm0 -+ vmovups %ymm0, -205(%edx) -+ vmovups -173(%ecx), %ymm0 -+ vmovups %ymm0, -173(%edx) -+ vmovups -141(%ecx), %ymm0 -+ vmovups %ymm0, -141(%edx) -+.LBB0_214: -+ vmovups -109(%ecx), %ymm0 -+ vmovups %ymm0, -109(%edx) -+ vmovups -77(%ecx), %ymm0 -+ vmovups 
%ymm0, -77(%edx) -+ jmp .LBB0_268 -+.LBB0_215: -+ vmovups -238(%ecx), %ymm0 -+ vmovups %ymm0, -238(%edx) -+ vmovups -206(%ecx), %ymm0 -+ vmovups %ymm0, -206(%edx) -+ vmovups -174(%ecx), %ymm0 -+ vmovups %ymm0, -174(%edx) -+ vmovups -142(%ecx), %ymm0 -+ vmovups %ymm0, -142(%edx) -+.LBB0_216: -+ vmovups -110(%ecx), %ymm0 -+ vmovups %ymm0, -110(%edx) -+ vmovups -78(%ecx), %ymm0 -+ vmovups %ymm0, -78(%edx) -+ jmp .LBB0_268 -+.LBB0_217: -+ vmovups -239(%ecx), %ymm0 -+ vmovups %ymm0, -239(%edx) -+ vmovups -207(%ecx), %ymm0 -+ vmovups %ymm0, -207(%edx) -+ vmovups -175(%ecx), %ymm0 -+ vmovups %ymm0, -175(%edx) -+ vmovups -143(%ecx), %ymm0 -+ vmovups %ymm0, -143(%edx) -+.LBB0_218: -+ vmovups -111(%ecx), %ymm0 -+ vmovups %ymm0, -111(%edx) -+ vmovups -79(%ecx), %ymm0 -+ vmovups %ymm0, -79(%edx) -+ jmp .LBB0_268 -+.LBB0_219: -+ vmovups -240(%ecx), %ymm0 -+ vmovups %ymm0, -240(%edx) -+ vmovups -208(%ecx), %ymm0 -+ vmovups %ymm0, -208(%edx) -+ vmovups -176(%ecx), %ymm0 -+ vmovups %ymm0, -176(%edx) -+ vmovups -144(%ecx), %ymm0 -+ vmovups %ymm0, -144(%edx) -+.LBB0_220: -+ vmovups -112(%ecx), %ymm0 -+ vmovups %ymm0, -112(%edx) -+ vmovups -80(%ecx), %ymm0 -+ vmovups %ymm0, -80(%edx) -+ jmp .LBB0_268 -+.LBB0_221: -+ vmovups -241(%ecx), %ymm0 -+ vmovups %ymm0, -241(%edx) -+ vmovups -209(%ecx), %ymm0 -+ vmovups %ymm0, -209(%edx) -+ vmovups -177(%ecx), %ymm0 -+ vmovups %ymm0, -177(%edx) -+ vmovups -145(%ecx), %ymm0 -+ vmovups %ymm0, -145(%edx) -+.LBB0_222: -+ vmovups -113(%ecx), %ymm0 -+ vmovups %ymm0, -113(%edx) -+ vmovups -81(%ecx), %ymm0 -+ vmovups %ymm0, -81(%edx) -+ jmp .LBB0_268 -+.LBB0_223: -+ vmovups -242(%ecx), %ymm0 -+ vmovups %ymm0, -242(%edx) -+ vmovups -210(%ecx), %ymm0 -+ vmovups %ymm0, -210(%edx) -+ vmovups -178(%ecx), %ymm0 -+ vmovups %ymm0, -178(%edx) -+ vmovups -146(%ecx), %ymm0 -+ vmovups %ymm0, -146(%edx) -+.LBB0_224: -+ vmovups -114(%ecx), %ymm0 -+ vmovups %ymm0, -114(%edx) -+ vmovups -82(%ecx), %ymm0 -+ vmovups %ymm0, -82(%edx) -+ jmp .LBB0_268 -+.LBB0_225: -+ vmovups -243(%ecx), %ymm0 -+ vmovups %ymm0, -243(%edx) -+ vmovups -211(%ecx), %ymm0 -+ vmovups %ymm0, -211(%edx) -+ vmovups -179(%ecx), %ymm0 -+ vmovups %ymm0, -179(%edx) -+ vmovups -147(%ecx), %ymm0 -+ vmovups %ymm0, -147(%edx) -+.LBB0_226: -+ vmovups -115(%ecx), %ymm0 -+ vmovups %ymm0, -115(%edx) -+ vmovups -83(%ecx), %ymm0 -+ vmovups %ymm0, -83(%edx) -+ jmp .LBB0_268 -+.LBB0_227: -+ vmovups -244(%ecx), %ymm0 -+ vmovups %ymm0, -244(%edx) -+ vmovups -212(%ecx), %ymm0 -+ vmovups %ymm0, -212(%edx) -+ vmovups -180(%ecx), %ymm0 -+ vmovups %ymm0, -180(%edx) -+ vmovups -148(%ecx), %ymm0 -+ vmovups %ymm0, -148(%edx) -+.LBB0_228: -+ vmovups -116(%ecx), %ymm0 -+ vmovups %ymm0, -116(%edx) -+ vmovups -84(%ecx), %ymm0 -+ vmovups %ymm0, -84(%edx) -+ jmp .LBB0_268 -+.LBB0_229: -+ vmovups -245(%ecx), %ymm0 -+ vmovups %ymm0, -245(%edx) -+ vmovups -213(%ecx), %ymm0 -+ vmovups %ymm0, -213(%edx) -+ vmovups -181(%ecx), %ymm0 -+ vmovups %ymm0, -181(%edx) -+ vmovups -149(%ecx), %ymm0 -+ vmovups %ymm0, -149(%edx) -+.LBB0_230: -+ vmovups -117(%ecx), %ymm0 -+ vmovups %ymm0, -117(%edx) -+ vmovups -85(%ecx), %ymm0 -+ vmovups %ymm0, -85(%edx) -+ jmp .LBB0_268 -+.LBB0_231: -+ vmovups -246(%ecx), %ymm0 -+ vmovups %ymm0, -246(%edx) -+ vmovups -214(%ecx), %ymm0 -+ vmovups %ymm0, -214(%edx) -+ vmovups -182(%ecx), %ymm0 -+ vmovups %ymm0, -182(%edx) -+ vmovups -150(%ecx), %ymm0 -+ vmovups %ymm0, -150(%edx) -+.LBB0_232: -+ vmovups -118(%ecx), %ymm0 -+ vmovups %ymm0, -118(%edx) -+ vmovups -86(%ecx), %ymm0 -+ vmovups %ymm0, -86(%edx) -+ jmp .LBB0_268 -+.LBB0_233: -+ 
vmovups -247(%ecx), %ymm0 -+ vmovups %ymm0, -247(%edx) -+ vmovups -215(%ecx), %ymm0 -+ vmovups %ymm0, -215(%edx) -+ vmovups -183(%ecx), %ymm0 -+ vmovups %ymm0, -183(%edx) -+ vmovups -151(%ecx), %ymm0 -+ vmovups %ymm0, -151(%edx) -+.LBB0_234: -+ vmovups -119(%ecx), %ymm0 -+ vmovups %ymm0, -119(%edx) -+ vmovups -87(%ecx), %ymm0 -+ vmovups %ymm0, -87(%edx) -+ jmp .LBB0_268 -+.LBB0_235: -+ vmovups -248(%ecx), %ymm0 -+ vmovups %ymm0, -248(%edx) -+ vmovups -216(%ecx), %ymm0 -+ vmovups %ymm0, -216(%edx) -+ vmovups -184(%ecx), %ymm0 -+ vmovups %ymm0, -184(%edx) -+ vmovups -152(%ecx), %ymm0 -+ vmovups %ymm0, -152(%edx) -+.LBB0_236: -+ vmovups -120(%ecx), %ymm0 -+ vmovups %ymm0, -120(%edx) -+ vmovups -88(%ecx), %ymm0 -+ vmovups %ymm0, -88(%edx) -+ jmp .LBB0_268 -+.LBB0_237: -+ vmovups -249(%ecx), %ymm0 -+ vmovups %ymm0, -249(%edx) -+ vmovups -217(%ecx), %ymm0 -+ vmovups %ymm0, -217(%edx) -+ vmovups -185(%ecx), %ymm0 -+ vmovups %ymm0, -185(%edx) -+ vmovups -153(%ecx), %ymm0 -+ vmovups %ymm0, -153(%edx) -+.LBB0_238: -+ vmovups -121(%ecx), %ymm0 -+ vmovups %ymm0, -121(%edx) -+ vmovups -89(%ecx), %ymm0 -+ vmovups %ymm0, -89(%edx) -+ jmp .LBB0_268 -+.LBB0_239: -+ vmovups -250(%ecx), %ymm0 -+ vmovups %ymm0, -250(%edx) -+ vmovups -218(%ecx), %ymm0 -+ vmovups %ymm0, -218(%edx) -+ vmovups -186(%ecx), %ymm0 -+ vmovups %ymm0, -186(%edx) -+ vmovups -154(%ecx), %ymm0 -+ vmovups %ymm0, -154(%edx) -+.LBB0_240: -+ vmovups -122(%ecx), %ymm0 -+ vmovups %ymm0, -122(%edx) -+ vmovups -90(%ecx), %ymm0 -+ vmovups %ymm0, -90(%edx) -+ jmp .LBB0_268 -+.LBB0_241: -+ vmovups -251(%ecx), %ymm0 -+ vmovups %ymm0, -251(%edx) -+ vmovups -219(%ecx), %ymm0 -+ vmovups %ymm0, -219(%edx) -+ vmovups -187(%ecx), %ymm0 -+ vmovups %ymm0, -187(%edx) -+ vmovups -155(%ecx), %ymm0 -+ vmovups %ymm0, -155(%edx) -+.LBB0_242: -+ vmovups -123(%ecx), %ymm0 -+ vmovups %ymm0, -123(%edx) -+ vmovups -91(%ecx), %ymm0 -+ vmovups %ymm0, -91(%edx) -+ jmp .LBB0_268 -+.LBB0_243: -+ vmovups -252(%ecx), %ymm0 -+ vmovups %ymm0, -252(%edx) -+ vmovups -220(%ecx), %ymm0 -+ vmovups %ymm0, -220(%edx) -+ vmovups -188(%ecx), %ymm0 -+ vmovups %ymm0, -188(%edx) -+ vmovups -156(%ecx), %ymm0 -+ vmovups %ymm0, -156(%edx) -+.LBB0_244: -+ vmovups -124(%ecx), %ymm0 -+ vmovups %ymm0, -124(%edx) -+ vmovups -92(%ecx), %ymm0 -+ vmovups %ymm0, -92(%edx) -+ jmp .LBB0_268 -+.LBB0_245: -+ vmovups -253(%ecx), %ymm0 -+ vmovups %ymm0, -253(%edx) -+ vmovups -221(%ecx), %ymm0 -+ vmovups %ymm0, -221(%edx) -+ vmovups -189(%ecx), %ymm0 -+ vmovups %ymm0, -189(%edx) -+ vmovups -157(%ecx), %ymm0 -+ vmovups %ymm0, -157(%edx) -+.LBB0_246: -+ vmovups -125(%ecx), %ymm0 -+ vmovups %ymm0, -125(%edx) -+ vmovups -93(%ecx), %ymm0 -+ vmovups %ymm0, -93(%edx) -+ jmp .LBB0_268 -+.LBB0_247: -+ vmovups -254(%ecx), %ymm0 -+ vmovups %ymm0, -254(%edx) -+ vmovups -222(%ecx), %ymm0 -+ vmovups %ymm0, -222(%edx) -+ vmovups -190(%ecx), %ymm0 -+ vmovups %ymm0, -190(%edx) -+ vmovups -158(%ecx), %ymm0 -+ vmovups %ymm0, -158(%edx) -+.LBB0_248: -+ vmovups -126(%ecx), %ymm0 -+ vmovups %ymm0, -126(%edx) -+ vmovups -94(%ecx), %ymm0 -+ vmovups %ymm0, -94(%edx) -+ jmp .LBB0_268 -+.LBB0_249: -+ vmovups -255(%ecx), %ymm0 -+ vmovups %ymm0, -255(%edx) -+ vmovups -223(%ecx), %ymm0 -+ vmovups %ymm0, -223(%edx) -+ vmovups -191(%ecx), %ymm0 -+ vmovups %ymm0, -191(%edx) -+ vmovups -159(%ecx), %ymm0 -+ vmovups %ymm0, -159(%edx) -+.LBB0_250: -+ vmovups -127(%ecx), %ymm0 -+ vmovups %ymm0, -127(%edx) -+ vmovups -95(%ecx), %ymm0 -+ vmovups %ymm0, -95(%edx) -+ jmp .LBB0_268 -+.LBB0_262: -+ vmovups -256(%ecx), %ymm0 -+ vmovups %ymm0, 
-256(%edx) -+.LBB0_263: -+ vmovups -224(%ecx), %ymm0 -+ vmovups %ymm0, -224(%edx) -+.LBB0_264: -+ vmovups -192(%ecx), %ymm0 -+ vmovups %ymm0, -192(%edx) -+.LBB0_265: -+ vmovups -160(%ecx), %ymm0 -+ vmovups %ymm0, -160(%edx) -+.LBB0_266: -+ vmovups -128(%ecx), %ymm0 -+ vmovups %ymm0, -128(%edx) -+.LBB0_267: -+ vmovups -96(%ecx), %ymm0 -+ vmovups %ymm0, -96(%edx) -+.LBB0_268: -+ vmovups -64(%ecx), %ymm0 -+ vmovups %ymm0, -64(%edx) -+.LBB0_269: -+ vmovups -32(%ecx), %ymm0 -+ vmovups %ymm0, -32(%edx) -+.LBB0_270: -+ vzeroupper -+ popl %esi -+ popl %edi -+ popl %ebx -+ popl %ebp -+ retl -+END(memcpy_avx2) -+ -+/*.Lfunc_end0: -+ .size memcpy_avx2, .Lfunc_end0-memcpy_avx2 -+ .section .rodata,"a",@progbits -+ .p2align 2*/ -+.LJTI0_0: -+ .long .LBB0_6@GOTOFF -+ .long .LBB0_10@GOTOFF -+ .long .LBB0_12@GOTOFF -+ .long .LBB0_16@GOTOFF -+ .long .LBB0_18@GOTOFF -+ .long .LBB0_20@GOTOFF -+ .long .LBB0_22@GOTOFF -+ .long .LBB0_26@GOTOFF -+ .long .LBB0_28@GOTOFF -+ .long .LBB0_30@GOTOFF -+ .long .LBB0_32@GOTOFF -+ .long .LBB0_34@GOTOFF -+ .long .LBB0_36@GOTOFF -+ .long .LBB0_38@GOTOFF -+ .long .LBB0_40@GOTOFF -+ .long .LBB0_44@GOTOFF -+ .long .LBB0_46@GOTOFF -+ .long .LBB0_48@GOTOFF -+ .long .LBB0_50@GOTOFF -+ .long .LBB0_52@GOTOFF -+ .long .LBB0_54@GOTOFF -+ .long .LBB0_56@GOTOFF -+ .long .LBB0_58@GOTOFF -+ .long .LBB0_60@GOTOFF -+ .long .LBB0_62@GOTOFF -+ .long .LBB0_64@GOTOFF -+ .long .LBB0_66@GOTOFF -+ .long .LBB0_68@GOTOFF -+ .long .LBB0_70@GOTOFF -+ .long .LBB0_72@GOTOFF -+ .long .LBB0_74@GOTOFF -+ .long .LBB0_269@GOTOFF -+ .long .LBB0_5@GOTOFF -+ .long .LBB0_9@GOTOFF -+ .long .LBB0_82@GOTOFF -+ .long .LBB0_15@GOTOFF -+ .long .LBB0_88@GOTOFF -+ .long .LBB0_92@GOTOFF -+ .long .LBB0_96@GOTOFF -+ .long .LBB0_25@GOTOFF -+ .long .LBB0_102@GOTOFF -+ .long .LBB0_106@GOTOFF -+ .long .LBB0_110@GOTOFF -+ .long .LBB0_114@GOTOFF -+ .long .LBB0_118@GOTOFF -+ .long .LBB0_122@GOTOFF -+ .long .LBB0_126@GOTOFF -+ .long .LBB0_43@GOTOFF -+ .long .LBB0_132@GOTOFF -+ .long .LBB0_136@GOTOFF -+ .long .LBB0_140@GOTOFF -+ .long .LBB0_144@GOTOFF -+ .long .LBB0_148@GOTOFF -+ .long .LBB0_152@GOTOFF -+ .long .LBB0_156@GOTOFF -+ .long .LBB0_160@GOTOFF -+ .long .LBB0_164@GOTOFF -+ .long .LBB0_168@GOTOFF -+ .long .LBB0_172@GOTOFF -+ .long .LBB0_176@GOTOFF -+ .long .LBB0_180@GOTOFF -+ .long .LBB0_184@GOTOFF -+ .long .LBB0_188@GOTOFF -+ .long .LBB0_268@GOTOFF -+ .long .LBB0_4@GOTOFF -+ .long .LBB0_8@GOTOFF -+ .long .LBB0_81@GOTOFF -+ .long .LBB0_14@GOTOFF -+ .long .LBB0_87@GOTOFF -+ .long .LBB0_91@GOTOFF -+ .long .LBB0_95@GOTOFF -+ .long .LBB0_24@GOTOFF -+ .long .LBB0_101@GOTOFF -+ .long .LBB0_105@GOTOFF -+ .long .LBB0_109@GOTOFF -+ .long .LBB0_113@GOTOFF -+ .long .LBB0_117@GOTOFF -+ .long .LBB0_121@GOTOFF -+ .long .LBB0_125@GOTOFF -+ .long .LBB0_42@GOTOFF -+ .long .LBB0_131@GOTOFF -+ .long .LBB0_135@GOTOFF -+ .long .LBB0_139@GOTOFF -+ .long .LBB0_143@GOTOFF -+ .long .LBB0_147@GOTOFF -+ .long .LBB0_151@GOTOFF -+ .long .LBB0_155@GOTOFF -+ .long .LBB0_159@GOTOFF -+ .long .LBB0_163@GOTOFF -+ .long .LBB0_167@GOTOFF -+ .long .LBB0_171@GOTOFF -+ .long .LBB0_175@GOTOFF -+ .long .LBB0_179@GOTOFF -+ .long .LBB0_183@GOTOFF -+ .long .LBB0_187@GOTOFF -+ .long .LBB0_267@GOTOFF -+ .long .LBB0_190@GOTOFF -+ .long .LBB0_192@GOTOFF -+ .long .LBB0_194@GOTOFF -+ .long .LBB0_196@GOTOFF -+ .long .LBB0_198@GOTOFF -+ .long .LBB0_200@GOTOFF -+ .long .LBB0_202@GOTOFF -+ .long .LBB0_204@GOTOFF -+ .long .LBB0_206@GOTOFF -+ .long .LBB0_208@GOTOFF -+ .long .LBB0_210@GOTOFF -+ .long .LBB0_212@GOTOFF -+ .long .LBB0_214@GOTOFF -+ .long .LBB0_216@GOTOFF -+ .long 
.LBB0_218@GOTOFF -+ .long .LBB0_220@GOTOFF -+ .long .LBB0_222@GOTOFF -+ .long .LBB0_224@GOTOFF -+ .long .LBB0_226@GOTOFF -+ .long .LBB0_228@GOTOFF -+ .long .LBB0_230@GOTOFF -+ .long .LBB0_232@GOTOFF -+ .long .LBB0_234@GOTOFF -+ .long .LBB0_236@GOTOFF -+ .long .LBB0_238@GOTOFF -+ .long .LBB0_240@GOTOFF -+ .long .LBB0_242@GOTOFF -+ .long .LBB0_244@GOTOFF -+ .long .LBB0_246@GOTOFF -+ .long .LBB0_248@GOTOFF -+ .long .LBB0_250@GOTOFF -+ .long .LBB0_266@GOTOFF -+ .long .LBB0_3@GOTOFF -+ .long .LBB0_7@GOTOFF -+ .long .LBB0_11@GOTOFF -+ .long .LBB0_13@GOTOFF -+ .long .LBB0_17@GOTOFF -+ .long .LBB0_19@GOTOFF -+ .long .LBB0_21@GOTOFF -+ .long .LBB0_23@GOTOFF -+ .long .LBB0_27@GOTOFF -+ .long .LBB0_29@GOTOFF -+ .long .LBB0_31@GOTOFF -+ .long .LBB0_33@GOTOFF -+ .long .LBB0_35@GOTOFF -+ .long .LBB0_37@GOTOFF -+ .long .LBB0_39@GOTOFF -+ .long .LBB0_41@GOTOFF -+ .long .LBB0_45@GOTOFF -+ .long .LBB0_47@GOTOFF -+ .long .LBB0_49@GOTOFF -+ .long .LBB0_51@GOTOFF -+ .long .LBB0_53@GOTOFF -+ .long .LBB0_55@GOTOFF -+ .long .LBB0_57@GOTOFF -+ .long .LBB0_59@GOTOFF -+ .long .LBB0_61@GOTOFF -+ .long .LBB0_63@GOTOFF -+ .long .LBB0_65@GOTOFF -+ .long .LBB0_67@GOTOFF -+ .long .LBB0_69@GOTOFF -+ .long .LBB0_71@GOTOFF -+ .long .LBB0_73@GOTOFF -+ .long .LBB0_265@GOTOFF -+ .long .LBB0_76@GOTOFF -+ .long .LBB0_78@GOTOFF -+ .long .LBB0_80@GOTOFF -+ .long .LBB0_84@GOTOFF -+ .long .LBB0_86@GOTOFF -+ .long .LBB0_90@GOTOFF -+ .long .LBB0_94@GOTOFF -+ .long .LBB0_98@GOTOFF -+ .long .LBB0_100@GOTOFF -+ .long .LBB0_104@GOTOFF -+ .long .LBB0_108@GOTOFF -+ .long .LBB0_112@GOTOFF -+ .long .LBB0_116@GOTOFF -+ .long .LBB0_120@GOTOFF -+ .long .LBB0_124@GOTOFF -+ .long .LBB0_128@GOTOFF -+ .long .LBB0_130@GOTOFF -+ .long .LBB0_134@GOTOFF -+ .long .LBB0_138@GOTOFF -+ .long .LBB0_142@GOTOFF -+ .long .LBB0_146@GOTOFF -+ .long .LBB0_150@GOTOFF -+ .long .LBB0_154@GOTOFF -+ .long .LBB0_158@GOTOFF -+ .long .LBB0_162@GOTOFF -+ .long .LBB0_166@GOTOFF -+ .long .LBB0_170@GOTOFF -+ .long .LBB0_174@GOTOFF -+ .long .LBB0_178@GOTOFF -+ .long .LBB0_182@GOTOFF -+ .long .LBB0_186@GOTOFF -+ .long .LBB0_264@GOTOFF -+ .long .LBB0_75@GOTOFF -+ .long .LBB0_77@GOTOFF -+ .long .LBB0_79@GOTOFF -+ .long .LBB0_83@GOTOFF -+ .long .LBB0_85@GOTOFF -+ .long .LBB0_89@GOTOFF -+ .long .LBB0_93@GOTOFF -+ .long .LBB0_97@GOTOFF -+ .long .LBB0_99@GOTOFF -+ .long .LBB0_103@GOTOFF -+ .long .LBB0_107@GOTOFF -+ .long .LBB0_111@GOTOFF -+ .long .LBB0_115@GOTOFF -+ .long .LBB0_119@GOTOFF -+ .long .LBB0_123@GOTOFF -+ .long .LBB0_127@GOTOFF -+ .long .LBB0_129@GOTOFF -+ .long .LBB0_133@GOTOFF -+ .long .LBB0_137@GOTOFF -+ .long .LBB0_141@GOTOFF -+ .long .LBB0_145@GOTOFF -+ .long .LBB0_149@GOTOFF -+ .long .LBB0_153@GOTOFF -+ .long .LBB0_157@GOTOFF -+ .long .LBB0_161@GOTOFF -+ .long .LBB0_165@GOTOFF -+ .long .LBB0_169@GOTOFF -+ .long .LBB0_173@GOTOFF -+ .long .LBB0_177@GOTOFF -+ .long .LBB0_181@GOTOFF -+ .long .LBB0_185@GOTOFF -+ .long .LBB0_263@GOTOFF -+ .long .LBB0_189@GOTOFF -+ .long .LBB0_191@GOTOFF -+ .long .LBB0_193@GOTOFF -+ .long .LBB0_195@GOTOFF -+ .long .LBB0_197@GOTOFF -+ .long .LBB0_199@GOTOFF -+ .long .LBB0_201@GOTOFF -+ .long .LBB0_203@GOTOFF -+ .long .LBB0_205@GOTOFF -+ .long .LBB0_207@GOTOFF -+ .long .LBB0_209@GOTOFF -+ .long .LBB0_211@GOTOFF -+ .long .LBB0_213@GOTOFF -+ .long .LBB0_215@GOTOFF -+ .long .LBB0_217@GOTOFF -+ .long .LBB0_219@GOTOFF -+ .long .LBB0_221@GOTOFF -+ .long .LBB0_223@GOTOFF -+ .long .LBB0_225@GOTOFF -+ .long .LBB0_227@GOTOFF -+ .long .LBB0_229@GOTOFF -+ .long .LBB0_231@GOTOFF -+ .long .LBB0_233@GOTOFF -+ .long .LBB0_235@GOTOFF -+ .long .LBB0_237@GOTOFF 
-+ .long .LBB0_239@GOTOFF -+ .long .LBB0_241@GOTOFF -+ .long .LBB0_243@GOTOFF -+ .long .LBB0_245@GOTOFF -+ .long .LBB0_247@GOTOFF -+ .long .LBB0_249@GOTOFF -+ .long .LBB0_262@GOTOFF -+.LJTI0_1: -+ .long .LBB0_6@GOTOFF -+ .long .LBB0_10@GOTOFF -+ .long .LBB0_12@GOTOFF -+ .long .LBB0_16@GOTOFF -+ .long .LBB0_18@GOTOFF -+ .long .LBB0_20@GOTOFF -+ .long .LBB0_22@GOTOFF -+ .long .LBB0_26@GOTOFF -+ .long .LBB0_28@GOTOFF -+ .long .LBB0_30@GOTOFF -+ .long .LBB0_32@GOTOFF -+ .long .LBB0_34@GOTOFF -+ .long .LBB0_36@GOTOFF -+ .long .LBB0_38@GOTOFF -+ .long .LBB0_40@GOTOFF -+ .long .LBB0_44@GOTOFF -+ .long .LBB0_46@GOTOFF -+ .long .LBB0_48@GOTOFF -+ .long .LBB0_50@GOTOFF -+ .long .LBB0_52@GOTOFF -+ .long .LBB0_54@GOTOFF -+ .long .LBB0_56@GOTOFF -+ .long .LBB0_58@GOTOFF -+ .long .LBB0_60@GOTOFF -+ .long .LBB0_62@GOTOFF -+ .long .LBB0_64@GOTOFF -+ .long .LBB0_66@GOTOFF -+ .long .LBB0_68@GOTOFF -+ .long .LBB0_70@GOTOFF -+ .long .LBB0_72@GOTOFF -+ .long .LBB0_74@GOTOFF -+ .long .LBB0_269@GOTOFF -+ .long .LBB0_5@GOTOFF -+ .long .LBB0_9@GOTOFF -+ .long .LBB0_82@GOTOFF -+ .long .LBB0_15@GOTOFF -+ .long .LBB0_88@GOTOFF -+ .long .LBB0_92@GOTOFF -+ .long .LBB0_96@GOTOFF -+ .long .LBB0_25@GOTOFF -+ .long .LBB0_102@GOTOFF -+ .long .LBB0_106@GOTOFF -+ .long .LBB0_110@GOTOFF -+ .long .LBB0_114@GOTOFF -+ .long .LBB0_118@GOTOFF -+ .long .LBB0_122@GOTOFF -+ .long .LBB0_126@GOTOFF -+ .long .LBB0_43@GOTOFF -+ .long .LBB0_132@GOTOFF -+ .long .LBB0_136@GOTOFF -+ .long .LBB0_140@GOTOFF -+ .long .LBB0_144@GOTOFF -+ .long .LBB0_148@GOTOFF -+ .long .LBB0_152@GOTOFF -+ .long .LBB0_156@GOTOFF -+ .long .LBB0_160@GOTOFF -+ .long .LBB0_164@GOTOFF -+ .long .LBB0_168@GOTOFF -+ .long .LBB0_172@GOTOFF -+ .long .LBB0_176@GOTOFF -+ .long .LBB0_180@GOTOFF -+ .long .LBB0_184@GOTOFF -+ .long .LBB0_188@GOTOFF -+ .long .LBB0_268@GOTOFF -+ .long .LBB0_4@GOTOFF -+ .long .LBB0_8@GOTOFF -+ .long .LBB0_81@GOTOFF -+ .long .LBB0_14@GOTOFF -+ .long .LBB0_87@GOTOFF -+ .long .LBB0_91@GOTOFF -+ .long .LBB0_95@GOTOFF -+ .long .LBB0_24@GOTOFF -+ .long .LBB0_101@GOTOFF -+ .long .LBB0_105@GOTOFF -+ .long .LBB0_109@GOTOFF -+ .long .LBB0_113@GOTOFF -+ .long .LBB0_117@GOTOFF -+ .long .LBB0_121@GOTOFF -+ .long .LBB0_125@GOTOFF -+ .long .LBB0_42@GOTOFF -+ .long .LBB0_131@GOTOFF -+ .long .LBB0_135@GOTOFF -+ .long .LBB0_139@GOTOFF -+ .long .LBB0_143@GOTOFF -+ .long .LBB0_147@GOTOFF -+ .long .LBB0_151@GOTOFF -+ .long .LBB0_155@GOTOFF -+ .long .LBB0_159@GOTOFF -+ .long .LBB0_163@GOTOFF -+ .long .LBB0_167@GOTOFF -+ .long .LBB0_171@GOTOFF -+ .long .LBB0_175@GOTOFF -+ .long .LBB0_179@GOTOFF -+ .long .LBB0_183@GOTOFF -+ .long .LBB0_187@GOTOFF -+ .long .LBB0_267@GOTOFF -+ .long .LBB0_190@GOTOFF -+ .long .LBB0_192@GOTOFF -+ .long .LBB0_194@GOTOFF -+ .long .LBB0_196@GOTOFF -+ .long .LBB0_198@GOTOFF -+ .long .LBB0_200@GOTOFF -+ .long .LBB0_202@GOTOFF -+ .long .LBB0_204@GOTOFF -+ .long .LBB0_206@GOTOFF -+ .long .LBB0_208@GOTOFF -+ .long .LBB0_210@GOTOFF -+ .long .LBB0_212@GOTOFF -+ .long .LBB0_214@GOTOFF -+ .long .LBB0_216@GOTOFF -+ .long .LBB0_218@GOTOFF -+ .long .LBB0_220@GOTOFF -+ .long .LBB0_222@GOTOFF -+ .long .LBB0_224@GOTOFF -+ .long .LBB0_226@GOTOFF -+ .long .LBB0_228@GOTOFF -+ .long .LBB0_230@GOTOFF -+ .long .LBB0_232@GOTOFF -+ .long .LBB0_234@GOTOFF -+ .long .LBB0_236@GOTOFF -+ .long .LBB0_238@GOTOFF -+ .long .LBB0_240@GOTOFF -+ .long .LBB0_242@GOTOFF -+ .long .LBB0_244@GOTOFF -+ .long .LBB0_246@GOTOFF -+ .long .LBB0_248@GOTOFF -+ .long .LBB0_250@GOTOFF -+ .long .LBB0_266@GOTOFF -+ .long .LBB0_3@GOTOFF -+ .long .LBB0_7@GOTOFF -+ .long .LBB0_11@GOTOFF -+ .long 
.LBB0_13@GOTOFF -+ .long .LBB0_17@GOTOFF -+ .long .LBB0_19@GOTOFF -+ .long .LBB0_21@GOTOFF -+ .long .LBB0_23@GOTOFF -+ .long .LBB0_27@GOTOFF -+ .long .LBB0_29@GOTOFF -+ .long .LBB0_31@GOTOFF -+ .long .LBB0_33@GOTOFF -+ .long .LBB0_35@GOTOFF -+ .long .LBB0_37@GOTOFF -+ .long .LBB0_39@GOTOFF -+ .long .LBB0_41@GOTOFF -+ .long .LBB0_45@GOTOFF -+ .long .LBB0_47@GOTOFF -+ .long .LBB0_49@GOTOFF -+ .long .LBB0_51@GOTOFF -+ .long .LBB0_53@GOTOFF -+ .long .LBB0_55@GOTOFF -+ .long .LBB0_57@GOTOFF -+ .long .LBB0_59@GOTOFF -+ .long .LBB0_61@GOTOFF -+ .long .LBB0_63@GOTOFF -+ .long .LBB0_65@GOTOFF -+ .long .LBB0_67@GOTOFF -+ .long .LBB0_69@GOTOFF -+ .long .LBB0_71@GOTOFF -+ .long .LBB0_73@GOTOFF -+ .long .LBB0_265@GOTOFF -+ .long .LBB0_76@GOTOFF -+ .long .LBB0_78@GOTOFF -+ .long .LBB0_80@GOTOFF -+ .long .LBB0_84@GOTOFF -+ .long .LBB0_86@GOTOFF -+ .long .LBB0_90@GOTOFF -+ .long .LBB0_94@GOTOFF -+ .long .LBB0_98@GOTOFF -+ .long .LBB0_100@GOTOFF -+ .long .LBB0_104@GOTOFF -+ .long .LBB0_108@GOTOFF -+ .long .LBB0_112@GOTOFF -+ .long .LBB0_116@GOTOFF -+ .long .LBB0_120@GOTOFF -+ .long .LBB0_124@GOTOFF -+ .long .LBB0_128@GOTOFF -+ .long .LBB0_130@GOTOFF -+ .long .LBB0_134@GOTOFF -+ .long .LBB0_138@GOTOFF -+ .long .LBB0_142@GOTOFF -+ .long .LBB0_146@GOTOFF -+ .long .LBB0_150@GOTOFF -+ .long .LBB0_154@GOTOFF -+ .long .LBB0_158@GOTOFF -+ .long .LBB0_162@GOTOFF -+ .long .LBB0_166@GOTOFF -+ .long .LBB0_170@GOTOFF -+ .long .LBB0_174@GOTOFF -+ .long .LBB0_178@GOTOFF -+ .long .LBB0_182@GOTOFF -+ .long .LBB0_186@GOTOFF -+ .long .LBB0_264@GOTOFF -+ .long .LBB0_75@GOTOFF -+ .long .LBB0_77@GOTOFF -+ .long .LBB0_79@GOTOFF -+ .long .LBB0_83@GOTOFF -+ .long .LBB0_85@GOTOFF -+ .long .LBB0_89@GOTOFF -+ .long .LBB0_93@GOTOFF -+ .long .LBB0_97@GOTOFF -+ .long .LBB0_99@GOTOFF -+ .long .LBB0_103@GOTOFF -+ .long .LBB0_107@GOTOFF -+ .long .LBB0_111@GOTOFF -+ .long .LBB0_115@GOTOFF -+ .long .LBB0_119@GOTOFF -+ .long .LBB0_123@GOTOFF -+ .long .LBB0_127@GOTOFF -+ .long .LBB0_129@GOTOFF -+ .long .LBB0_133@GOTOFF -+ .long .LBB0_137@GOTOFF -+ .long .LBB0_141@GOTOFF -+ .long .LBB0_145@GOTOFF -+ .long .LBB0_149@GOTOFF -+ .long .LBB0_153@GOTOFF -+ .long .LBB0_157@GOTOFF -+ .long .LBB0_161@GOTOFF -+ .long .LBB0_165@GOTOFF -+ .long .LBB0_169@GOTOFF -+ .long .LBB0_173@GOTOFF -+ .long .LBB0_177@GOTOFF -+ .long .LBB0_181@GOTOFF -+ .long .LBB0_185@GOTOFF -+ .long .LBB0_263@GOTOFF -+ .long .LBB0_189@GOTOFF -+ .long .LBB0_191@GOTOFF -+ .long .LBB0_193@GOTOFF -+ .long .LBB0_195@GOTOFF -+ .long .LBB0_197@GOTOFF -+ .long .LBB0_199@GOTOFF -+ .long .LBB0_201@GOTOFF -+ .long .LBB0_203@GOTOFF -+ .long .LBB0_205@GOTOFF -+ .long .LBB0_207@GOTOFF -+ .long .LBB0_209@GOTOFF -+ .long .LBB0_211@GOTOFF -+ .long .LBB0_213@GOTOFF -+ .long .LBB0_215@GOTOFF -+ .long .LBB0_217@GOTOFF -+ .long .LBB0_219@GOTOFF -+ .long .LBB0_221@GOTOFF -+ .long .LBB0_223@GOTOFF -+ .long .LBB0_225@GOTOFF -+ .long .LBB0_227@GOTOFF -+ .long .LBB0_229@GOTOFF -+ .long .LBB0_231@GOTOFF -+ .long .LBB0_233@GOTOFF -+ .long .LBB0_235@GOTOFF -+ .long .LBB0_237@GOTOFF -+ .long .LBB0_239@GOTOFF -+ .long .LBB0_241@GOTOFF -+ .long .LBB0_243@GOTOFF -+ .long .LBB0_245@GOTOFF -+ .long .LBB0_247@GOTOFF -+ .long .LBB0_249@GOTOFF -+ .long .LBB0_262@GOTOFF -+ # -- End function -diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp -index c846ded45..43aaebb54 100644 ---- a/libc/arch-x86_64/dynamic_function_dispatch.cpp -+++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp -@@ -46,4 +46,42 @@ DEFINE_IFUNC_FOR(__memset_chk) { - 
RETURN_FUNC(__memset_chk_func, __memset_chk_generic); - } - -+typedef int memcmp_func(const void* __lhs, const void* __rhs, size_t __n); -+DEFINE_IFUNC_FOR(memcmp) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memcmp_func, memcmp_avx2) -+ RETURN_FUNC(memcmp_func, memcmp_generic); -+} -+ -+typedef void* memmove_func(void* __dst, const void* __src, size_t __n); -+DEFINE_IFUNC_FOR(memmove) { -+ RETURN_FUNC(memmove_func, memmove_generic); -+} -+ -+typedef void* memcpy_func(void* __dst, const void* __src, size_t __n); -+DEFINE_IFUNC_FOR(memcpy) { -+ return memmove_resolver(); -+} -+ -+typedef void* memchr_func(const void* __s, int __ch, size_t __n); -+DEFINE_IFUNC_FOR(memchr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memchr_func, memchr_avx2); -+ RETURN_FUNC(memchr_func, memchr_openbsd); -+} -+ -+typedef void* memrchr_func(const void* __s, int __ch, size_t __n); -+DEFINE_IFUNC_FOR(memrchr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memrchr_func, memrchr_avx2); -+ RETURN_FUNC(memrchr_func, memrchr_openbsd); -+} -+ -+// typedef int wmemset_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n); -+// DEFINE_IFUNC_FOR(wmemset) { -+// __builtin_cpu_init(); -+// if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wmemset_func, wmemset_avx2); -+// RETURN_FUNC(wmemset_func, wmemset_freebsd); -+// } -+ - } // extern "C" -diff --git a/libc/arch-x86_64/generic/string/memchr.c b/libc/arch-x86_64/generic/string/memchr.c -new file mode 100644 -index 000000000..86ee02e0b ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/memchr.c -@@ -0,0 +1,20 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#include -+#define memchr memchr_openbsd -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/memrchr.c b/libc/arch-x86_64/generic/string/memrchr.c -new file mode 100644 -index 000000000..c803009f5 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/memrchr.c -@@ -0,0 +1,20 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. 
-+*/ -+ -+#include -+#define memrchr memrchr_openbsd -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wmemset.c b/libc/arch-x86_64/generic/string/wmemset.c -new file mode 100644 -index 000000000..ac6bd7ec4 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wmemset.c -@@ -0,0 +1,20 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#include -+#define wmemset wmemset_freebsd -+ -+#include -diff --git a/libc/arch-x86_64/string/cache.h b/libc/arch-x86_64/include/cache.h -similarity index 100% -rename from libc/arch-x86_64/string/cache.h -rename to libc/arch-x86_64/include/cache.h -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S -new file mode 100644 -index 000000000..da667c9b3 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S -@@ -0,0 +1,371 @@ -+#ifndef L -+# define L(label) .L##label -+#endif -+ -+#ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+#endif -+ -+#ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+#endif -+ -+#ifndef cfi_rel_offset -+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -+#endif -+ -+#ifndef cfi_restore -+# define cfi_restore(reg) .cfi_restore reg -+#endif -+ -+#ifndef cfi_adjust_cfa_offset -+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -+#endif -+ -+#ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+#endif -+ -+#ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+#endif -+ -+#define CFI_PUSH(REG) \ -+ cfi_adjust_cfa_offset (4); \ -+ cfi_rel_offset (REG, 0) -+ -+#define CFI_POP(REG) \ -+ cfi_adjust_cfa_offset (-4); \ -+ cfi_restore (REG) -+ -+#define PUSH(REG) push REG; -+#define POP(REG) pop REG; -+ -+#define ENTRANCE PUSH (%rbx); -+#define RETURN_END POP (%rbx); ret -+#define RETURN RETURN_END; -+ -+# ifndef MEMCHR -+# define MEMCHR memchr_avx2 -+# endif -+ -+# ifdef USE_AS_WMEMCHR -+# define VPCMPEQ vpcmpeqd -+# else -+# define VPCMPEQ vpcmpeqb -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (MEMCHR) -+# ifndef USE_AS_RAWMEMCHR -+ /* Check for zero length. */ -+ testq %rdx, %rdx -+ jz L(null) -+# endif -+ movl %edi, %ecx -+ /* Broadcast CHAR to YMM0. */ -+ vmovd %esi, %xmm0 -+# ifdef USE_AS_WMEMCHR -+ shl $2, %rdx -+ vpbroadcastd %xmm0, %ymm0 -+# else -+ vpbroadcastb %xmm0, %ymm0 -+# endif -+ /* Check if we may cross page boundary with one vector load. */ -+ andl $(2 * VEC_SIZE - 1), %ecx -+ cmpl $VEC_SIZE, %ecx -+ ja L(cros_page_boundary) -+ -+ /* Check the first VEC_SIZE bytes. */ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+# ifndef USE_AS_RAWMEMCHR -+ jnz L(first_vec_x0_check) -+ /* Adjust length and check the end of data. 
*/ -+ subq $VEC_SIZE, %rdx -+ jbe L(zero) -+# else -+ jnz L(first_vec_x0) -+# endif -+ -+ /* Align data for aligned loads in the loop. */ -+ addq $VEC_SIZE, %rdi -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ -+# ifndef USE_AS_RAWMEMCHR -+ /* Adjust length. */ -+ addq %rcx, %rdx -+ -+ subq $(VEC_SIZE * 4), %rdx -+ jbe L(last_4x_vec_or_less) -+# endif -+ jmp L(more_4x_vec) -+ -+ .p2align 4 -+L(cros_page_boundary): -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ /* Remove the leading bytes. */ -+ sarl %cl, %eax -+ testl %eax, %eax -+ jz L(aligned_more) -+ tzcntl %eax, %eax -+# ifndef USE_AS_RAWMEMCHR -+ /* Check the end of data. */ -+ cmpq %rax, %rdx -+ jbe L(zero) -+# endif -+ addq %rdi, %rax -+ addq %rcx, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(aligned_more): -+# ifndef USE_AS_RAWMEMCHR -+ /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" -+ instead of "(rdx + rcx) - VEC_SIZE" to void possible addition -+ overflow. */ -+ negq %rcx -+ addq $VEC_SIZE, %rcx -+ -+ /* Check the end of data. */ -+ subq %rcx, %rdx -+ jbe L(zero) -+# endif -+ -+ addq $VEC_SIZE, %rdi -+ -+# ifndef USE_AS_RAWMEMCHR -+ subq $(VEC_SIZE * 4), %rdx -+ jbe L(last_4x_vec_or_less) -+# endif -+ -+L(more_4x_vec): -+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time -+ since data is only aligned to VEC_SIZE. */ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x3) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+# ifndef USE_AS_RAWMEMCHR -+ subq $(VEC_SIZE * 4), %rdx -+ jbe L(last_4x_vec_or_less) -+# endif -+ -+ /* Align data to 4 * VEC_SIZE. */ -+ movq %rdi, %rcx -+ andl $(4 * VEC_SIZE - 1), %ecx -+ andq $-(4 * VEC_SIZE), %rdi -+ -+# ifndef USE_AS_RAWMEMCHR -+ /* Adjust length. */ -+ addq %rcx, %rdx -+# endif -+ -+ .p2align 4 -+L(loop_4x_vec): -+ /* Compare 4 * VEC at a time forward. */ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 -+ -+ vpor %ymm1, %ymm2, %ymm5 -+ vpor %ymm3, %ymm4, %ymm6 -+ vpor %ymm5, %ymm6, %ymm5 -+ -+ vpmovmskb %ymm5, %eax -+ testl %eax, %eax -+ jnz L(4x_vec_end) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+# ifdef USE_AS_RAWMEMCHR -+ jmp L(loop_4x_vec) -+# else -+ subq $(VEC_SIZE * 4), %rdx -+ ja L(loop_4x_vec) -+ -+L(last_4x_vec_or_less): -+ /* Less than 4 * VEC and aligned to VEC_SIZE. 
*/ -+ addl $(VEC_SIZE * 2), %edx -+ jle L(last_2x_vec) -+ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x2_check) -+ subl $VEC_SIZE, %edx -+ jle L(zero) -+ -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x3_check) -+ xorl %eax, %eax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_2x_vec): -+ addl $(VEC_SIZE * 2), %edx -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x0_check) -+ subl $VEC_SIZE, %edx -+ jle L(zero) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1_check) -+ xorl %eax, %eax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x0_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rdx -+ jbe L(zero) -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x1_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rdx -+ jbe L(zero) -+ addq $VEC_SIZE, %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x2_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rdx -+ jbe L(zero) -+ addq $(VEC_SIZE * 2), %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x3_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rdx -+ jbe L(zero) -+ addq $(VEC_SIZE * 3), %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(zero): -+ VZEROUPPER -+L(null): -+ xorl %eax, %eax -+ ret -+# endif -+ -+ .p2align 4 -+L(first_vec_x0): -+ tzcntl %eax, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x1): -+ tzcntl %eax, %eax -+ addq $VEC_SIZE, %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x2): -+ tzcntl %eax, %eax -+ addq $(VEC_SIZE * 2), %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(4x_vec_end): -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ vpmovmskb %ymm4, %eax -+ testl %eax, %eax -+L(first_vec_x3): -+ tzcntl %eax, %eax -+ addq $(VEC_SIZE * 3), %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+END (MEMCHR) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S -new file mode 100644 -index 000000000..e9778ca5a ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S -@@ -0,0 +1,428 @@ -+/* Copyright (C) 2017-2019 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+/* memcmp/wmemcmp is implemented as: -+ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap -+ to avoid branches. -+ 2. Use overlapping compare to avoid branch. -+ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 -+ bytes for wmemcmp. -+ 4. If size is 8 * VEC_SIZE or less, unroll the loop. -+ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory -+ area. -+ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. -+ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. -+ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ -+ -+ -+#ifndef MEMCMP -+# define MEMCMP memcmp_avx2 -+#endif -+ -+#ifndef L -+# define L(label) .L##label -+#endif -+ -+#ifndef ALIGN -+# define ALIGN(n) .p2align n -+#endif -+ -+#ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+#endif -+ -+#ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+#endif -+ -+#ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+#endif -+ -+#ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+#endif -+ -+#ifndef ALIGN -+# define ALIGN(n) .p2align n -+#endif -+ -+# ifdef USE_AS_WMEMCMP -+# define VPCMPEQ vpcmpeqd -+# else -+# define VPCMPEQ vpcmpeqb -+# endif -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+# define VEC_SIZE 32 -+# define VEC_MASK ((1 << VEC_SIZE) - 1) -+ .section .text.avx,"ax",@progbits -+ENTRY (MEMCMP) -+# ifdef USE_AS_WMEMCMP -+ shl $2, %RDX_LP -+# elif defined __ILP32__ -+ /* Clear the upper 32 bits. */ -+ movl %edx, %edx -+# endif -+ cmp $VEC_SIZE, %rdx -+ jb L(less_vec) -+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ cmpq $(VEC_SIZE * 2), %rdx -+ jbe L(last_vec) -+ VPCMPEQ %ymm0, %ymm0, %ymm0 -+ /* More than 2 * VEC. */ -+ cmpq $(VEC_SIZE * 8), %rdx -+ ja L(more_8x_vec) -+ cmpq $(VEC_SIZE * 4), %rdx -+ jb L(last_4x_vec) -+ /* From 4 * VEC to 8 * VEC, inclusively. */ -+ vmovdqu (%rsi), %ymm1 -+ VPCMPEQ (%rdi), %ymm1, %ymm1 -+ vmovdqu VEC_SIZE(%rsi), %ymm2 -+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 -+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 -+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 -+ vpand %ymm1, %ymm2, %ymm5 -+ vpand %ymm3, %ymm4, %ymm6 -+ vpand %ymm5, %ymm6, %ymm5 -+ vptest %ymm0, %ymm5 -+ jnc L(4x_vec_end) -+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi -+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi -+ vmovdqu (%rsi), %ymm1 -+ VPCMPEQ (%rdi), %ymm1, %ymm1 -+ vmovdqu VEC_SIZE(%rsi), %ymm2 -+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 -+ vpand %ymm2, %ymm1, %ymm5 -+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 -+ vpand %ymm3, %ymm5, %ymm5 -+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 -+ vpand %ymm4, %ymm5, %ymm5 -+ vptest %ymm0, %ymm5 -+ jnc L(4x_vec_end) -+ xorl %eax, %eax -+ VZEROUPPER -+ ret -+ .p2align 4 -+L(last_2x_vec): -+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+L(last_vec): -+ /* Use overlapping loads to avoid branches. 
*/ -+ leaq -VEC_SIZE(%rdi, %rdx), %rdi -+ leaq -VEC_SIZE(%rsi, %rdx), %rsi -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ VZEROUPPER -+ ret -+ .p2align 4 -+L(first_vec): -+ /* A byte or int32 is different within 16 or 32 bytes. */ -+ tzcntl %eax, %ecx -+# ifdef USE_AS_WMEMCMP -+ xorl %eax, %eax -+ movl (%rdi, %rcx), %edx -+ cmpl (%rsi, %rcx), %edx -+L(wmemcmp_return): -+ setl %al -+ negl %eax -+ orl $1, %eax -+# else -+ movzbl (%rdi, %rcx), %eax -+ movzbl (%rsi, %rcx), %edx -+ sub %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+# ifdef USE_AS_WMEMCMP -+ .p2align 4 -+L(4): -+ xorl %eax, %eax -+ movl (%rdi), %edx -+ cmpl (%rsi), %edx -+ jne L(wmemcmp_return) -+ ret -+# else -+ -+L(between_4_7): -+ /* Load as big endian with overlapping movbe to avoid branches. */ -+ movbe (%rdi), %eax -+ movbe (%rsi), %ecx -+ shlq $32, %rax -+ shlq $32, %rcx -+ movbe -4(%rdi, %rdx), %edi -+ movbe -4(%rsi, %rdx), %esi -+ orq %rdi, %rax -+ orq %rsi, %rcx -+ subq %rcx, %rax -+ je L(exit) -+ sbbl %eax, %eax -+ orl $1, %eax -+ ret -+ .p2align 4 -+/*L(8): -+ giving two failures -+ movl (%rdi), %eax -+ subl (%rsi), %eax -+ je L(between_4_7) -+ retq */ -+ -+L(exit): -+ ret -+ .p2align 4 -+L(between_2_3): -+ /* Load as big endian to avoid branches. */ -+ movzwl (%rdi), %eax -+ movzwl (%rsi), %ecx -+ shll $8, %eax -+ shll $8, %ecx -+ bswap %eax -+ bswap %ecx -+ movb -1(%rdi, %rdx), %al -+ movb -1(%rsi, %rdx), %cl -+ /* Subtraction is okay because the upper 8 bits are zero. */ -+ subl %ecx, %eax -+ ret -+ .p2align 4 -+L(1): -+ movzbl (%rdi), %eax -+ movzbl (%rsi), %ecx -+ sub %ecx, %eax -+ ret -+# endif -+ .p2align 4 -+L(zero): -+ xorl %eax, %eax -+ ret -+ .p2align 4 -+L(less_vec): -+# ifdef USE_AS_WMEMCMP -+ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ -+ cmpb $4, %dl -+ je L(4) -+ jb L(zero) -+# else -+/* cmpb $8, %dl -+ jne L(tmp) -+ movl (%rdi), %eax -+ subl (%rsi), %eax -+ jne L(exit) -+L(temp): -+ movl %edx, %edx -+ //jmp L(tmp) -+L(tmp):*/ -+ -+ cmpb $1, %dl -+ je L(1) -+ jb L(zero) -+ -+ cmpb $4, %dl -+ jb L(between_2_3) -+ cmpb $8, %dl -+ //je L(8) -+ jb L(between_4_7) -+# endif -+ cmpb $16, %dl -+ jae L(between_16_31) -+ /* It is between 8 and 15 bytes. */ -+ vmovq (%rdi), %xmm1 -+ vmovq (%rsi), %xmm2 -+ VPCMPEQ %xmm1, %xmm2, %xmm2 -+ vpmovmskb %xmm2, %eax -+ subl $0xffff, %eax -+ jnz L(first_vec) -+ /* Use overlapping loads to avoid branches. */ -+ leaq -8(%rdi, %rdx), %rdi -+ leaq -8(%rsi, %rdx), %rsi -+ vmovq (%rdi), %xmm1 -+ vmovq (%rsi), %xmm2 -+ VPCMPEQ %xmm1, %xmm2, %xmm2 -+ vpmovmskb %xmm2, %eax -+ subl $0xffff, %eax -+ jnz L(first_vec) -+ ret -+ .p2align 4 -+L(between_16_31): -+ /* From 16 to 31 bytes. No branch when size == 16. */ -+ vmovdqu (%rsi), %xmm2 -+ VPCMPEQ (%rdi), %xmm2, %xmm2 -+ vpmovmskb %xmm2, %eax -+ subl $0xffff, %eax -+ jnz L(first_vec) -+ /* Use overlapping loads to avoid branches. */ -+ leaq -16(%rdi, %rdx), %rdi -+ leaq -16(%rsi, %rdx), %rsi -+ vmovdqu (%rsi), %xmm2 -+ VPCMPEQ (%rdi), %xmm2, %xmm2 -+ vpmovmskb %xmm2, %eax -+ subl $0xffff, %eax -+ jnz L(first_vec) -+ ret -+ .p2align 4 -+L(more_8x_vec): -+ /* More than 8 * VEC. Check the first VEC. */ -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ /* Align the first memory area for aligned loads in the loop. -+ Compute how much the first memory area is misaligned. */ -+ movq %rdi, %rcx -+ andl $(VEC_SIZE - 1), %ecx -+ /* Get the negative of offset for alignment. 
*/ -+ subq $VEC_SIZE, %rcx -+ /* Adjust the second memory area. */ -+ subq %rcx, %rsi -+ /* Adjust the first memory area which should be aligned now. */ -+ subq %rcx, %rdi -+ /* Adjust length. */ -+ addq %rcx, %rdx -+L(loop_4x_vec): -+ /* Compare 4 * VEC at a time forward. */ -+ vmovdqu (%rsi), %ymm1 -+ VPCMPEQ (%rdi), %ymm1, %ymm1 -+ vmovdqu VEC_SIZE(%rsi), %ymm2 -+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 -+ vpand %ymm2, %ymm1, %ymm5 -+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 -+ vpand %ymm3, %ymm5, %ymm5 -+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 -+ vpand %ymm4, %ymm5, %ymm5 -+ vptest %ymm0, %ymm5 -+ jnc L(4x_vec_end) -+ addq $(VEC_SIZE * 4), %rdi -+ addq $(VEC_SIZE * 4), %rsi -+ subq $(VEC_SIZE * 4), %rdx -+ cmpq $(VEC_SIZE * 4), %rdx -+ jae L(loop_4x_vec) -+ /* Less than 4 * VEC. */ -+ cmpq $VEC_SIZE, %rdx -+ jbe L(last_vec) -+ cmpq $(VEC_SIZE * 2), %rdx -+ jbe L(last_2x_vec) -+L(last_4x_vec): -+ /* From 2 * VEC to 4 * VEC. */ -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ addq $VEC_SIZE, %rdi -+ addq $VEC_SIZE, %rsi -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ /* Use overlapping loads to avoid branches. */ -+ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi -+ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ addq $VEC_SIZE, %rdi -+ addq $VEC_SIZE, %rsi -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ VZEROUPPER -+ ret -+ .p2align 4 -+L(4x_vec_end): -+ vpmovmskb %ymm1, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec_x1) -+ vpmovmskb %ymm3, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec_x2) -+ vpmovmskb %ymm4, %eax -+ subl $VEC_MASK, %eax -+ tzcntl %eax, %ecx -+# ifdef USE_AS_WMEMCMP -+ xorl %eax, %eax -+ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx -+ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx -+ jmp L(wmemcmp_return) -+# else -+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax -+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx -+ sub %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+ .p2align 4 -+L(first_vec_x1): -+ tzcntl %eax, %ecx -+# ifdef USE_AS_WMEMCMP -+ xorl %eax, %eax -+ movl VEC_SIZE(%rdi, %rcx), %edx -+ cmpl VEC_SIZE(%rsi, %rcx), %edx -+ jmp L(wmemcmp_return) -+# else -+ movzbl VEC_SIZE(%rdi, %rcx), %eax -+ movzbl VEC_SIZE(%rsi, %rcx), %edx -+ sub %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+ .p2align 4 -+L(first_vec_x2): -+ tzcntl %eax, %ecx -+# ifdef USE_AS_WMEMCMP -+ xorl %eax, %eax -+ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx -+ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx -+ jmp L(wmemcmp_return) -+# else -+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax -+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx -+ sub %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+END (MEMCMP) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S -new file mode 100644 -index 000000000..a958fb56d ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S -@@ -0,0 +1,408 @@ -+/* memrchr optimized with AVX2. -+ Copyright (C) 2017-2019 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef L -+# define L(label) .L##label -+#endif -+ -+#ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+#endif -+ -+#ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+#endif -+ -+#ifndef cfi_rel_offset -+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -+#endif -+ -+#ifndef cfi_restore -+# define cfi_restore(reg) .cfi_restore reg -+#endif -+ -+#ifndef cfi_adjust_cfa_offset -+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -+#endif -+ -+#ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+#endif -+ -+#ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+#endif -+ -+#define CFI_PUSH(REG) \ -+ cfi_adjust_cfa_offset (4); \ -+ cfi_rel_offset (REG, 0) -+ -+#define CFI_POP(REG) \ -+ cfi_adjust_cfa_offset (-4); \ -+ cfi_restore (REG) -+ -+#define PUSH(REG) pushl REG; CFI_PUSH (REG) -+#define POP(REG) popl REG; CFI_POP (REG) -+ -+# ifndef MEMRCHR -+# define MEMRCHR memrchr_avx2 -+# endif -+ -+#ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (MEMRCHR) -+ /* Broadcast CHAR to YMM0. */ -+ vmovd %esi, %xmm0 -+ vpbroadcastb %xmm0, %ymm0 -+ -+ sub $VEC_SIZE, %rdx -+ jbe L(last_vec_or_less) -+ -+ add %rdx, %rdi -+ -+ /* Check the last VEC_SIZE bytes. */ -+ vpcmpeqb (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x0) -+ -+ subq $(VEC_SIZE * 4), %rdi -+ movl %edi, %ecx -+ andl $(VEC_SIZE - 1), %ecx -+ jz L(aligned_more) -+ -+ /* Align data for aligned loads in the loop. */ -+ addq $VEC_SIZE, %rdi -+ addq $VEC_SIZE, %rdx -+ andq $-VEC_SIZE, %rdi -+ subq %rcx, %rdx -+ -+ .p2align 4 -+L(aligned_more): -+ subq $(VEC_SIZE * 4), %rdx -+ jbe L(last_4x_vec_or_less) -+ -+ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time -+ since data is only aligned to VEC_SIZE. */ -+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x3) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x2) -+ -+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x1) -+ -+ vpcmpeqb (%rdi), %ymm0, %ymm4 -+ vpmovmskb %ymm4, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x0) -+ -+ /* Align data to 4 * VEC_SIZE for loop with fewer branches. -+ There are some overlaps with above if data isn't aligned -+ to 4 * VEC_SIZE. */ -+ movl %edi, %ecx -+ andl $(VEC_SIZE * 4 - 1), %ecx -+ jz L(loop_4x_vec) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ addq $(VEC_SIZE * 4), %rdx -+ andq $-(VEC_SIZE * 4), %rdi -+ subq %rcx, %rdx -+ -+ .p2align 4 -+L(loop_4x_vec): -+ /* Compare 4 * VEC at a time forward. 
*/ -+ subq $(VEC_SIZE * 4), %rdi -+ subq $(VEC_SIZE * 4), %rdx -+ jbe L(last_4x_vec_or_less) -+ -+ vmovdqa (%rdi), %ymm1 -+ vmovdqa VEC_SIZE(%rdi), %ymm2 -+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 -+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 -+ -+ vpcmpeqb %ymm1, %ymm0, %ymm1 -+ vpcmpeqb %ymm2, %ymm0, %ymm2 -+ vpcmpeqb %ymm3, %ymm0, %ymm3 -+ vpcmpeqb %ymm4, %ymm0, %ymm4 -+ -+ vpor %ymm1, %ymm2, %ymm5 -+ vpor %ymm3, %ymm4, %ymm6 -+ vpor %ymm5, %ymm6, %ymm5 -+ -+ vpmovmskb %ymm5, %eax -+ testl %eax, %eax -+ jz L(loop_4x_vec) -+ -+ /* There is a match. */ -+ vpmovmskb %ymm4, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x3) -+ -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x2) -+ -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x1) -+ -+ vpmovmskb %ymm1, %eax -+ bsrl %eax, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_4x_vec_or_less): -+ addl $(VEC_SIZE * 4), %edx -+ cmpl $(VEC_SIZE * 2), %edx -+ jbe L(last_2x_vec) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x3) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x2) -+ -+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x1_check) -+ cmpl $(VEC_SIZE * 3), %edx -+ jbe L(zero) -+ -+ vpcmpeqb (%rdi), %ymm0, %ymm4 -+ vpmovmskb %ymm4, %eax -+ testl %eax, %eax -+ jz L(zero) -+ bsrl %eax, %eax -+ subq $(VEC_SIZE * 4), %rdx -+ addq %rax, %rdx -+ jl L(zero) -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_2x_vec): -+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x3_check) -+ cmpl $VEC_SIZE, %edx -+ jbe L(zero) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jz L(zero) -+ bsrl %eax, %eax -+ subq $(VEC_SIZE * 2), %rdx -+ addq %rax, %rdx -+ jl L(zero) -+ addl $(VEC_SIZE * 2), %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_x0): -+ bsrl %eax, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_x1): -+ bsrl %eax, %eax -+ addl $VEC_SIZE, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_x2): -+ bsrl %eax, %eax -+ addl $(VEC_SIZE * 2), %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_x3): -+ bsrl %eax, %eax -+ addl $(VEC_SIZE * 3), %eax -+ addq %rdi, %rax -+ ret -+ -+ .p2align 4 -+L(last_vec_x1_check): -+ bsrl %eax, %eax -+ subq $(VEC_SIZE * 3), %rdx -+ addq %rax, %rdx -+ jl L(zero) -+ addl $VEC_SIZE, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_x3_check): -+ bsrl %eax, %eax -+ subq $VEC_SIZE, %rdx -+ addq %rax, %rdx -+ jl L(zero) -+ addl $(VEC_SIZE * 3), %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(zero): -+ VZEROUPPER -+L(null): -+ xorl %eax, %eax -+ ret -+ -+ .p2align 4 -+L(last_vec_or_less_aligned): -+ movl %edx, %ecx -+ -+ vpcmpeqb (%rdi), %ymm0, %ymm1 -+ -+ movl $1, %edx -+ /* Support rdx << 32. */ -+ salq %cl, %rdx -+ subq $1, %rdx -+ -+ vpmovmskb %ymm1, %eax -+ -+ /* Remove the trailing bytes. */ -+ andl %edx, %eax -+ testl %eax, %eax -+ jz L(zero) -+ -+ bsrl %eax, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_or_less): -+ addl $VEC_SIZE, %edx -+ -+ /* Check for zero length. 
*/ -+ testl %edx, %edx -+ jz L(null) -+ -+ movl %edi, %ecx -+ andl $(VEC_SIZE - 1), %ecx -+ jz L(last_vec_or_less_aligned) -+ -+ movl %ecx, %esi -+ movl %ecx, %r8d -+ addl %edx, %esi -+ andq $-VEC_SIZE, %rdi -+ -+ subl $VEC_SIZE, %esi -+ ja L(last_vec_2x_aligned) -+ -+ /* Check the last VEC. */ -+ vpcmpeqb (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ -+ /* Remove the leading and trailing bytes. */ -+ sarl %cl, %eax -+ movl %edx, %ecx -+ -+ movl $1, %edx -+ sall %cl, %edx -+ subl $1, %edx -+ -+ andl %edx, %eax -+ testl %eax, %eax -+ jz L(zero) -+ -+ bsrl %eax, %eax -+ addq %rdi, %rax -+ addq %r8, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_2x_aligned): -+ movl %esi, %ecx -+ -+ /* Check the last VEC. */ -+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 -+ -+ movl $1, %edx -+ sall %cl, %edx -+ subl $1, %edx -+ -+ vpmovmskb %ymm1, %eax -+ -+ /* Remove the trailing bytes. */ -+ andl %edx, %eax -+ -+ testl %eax, %eax -+ jnz L(last_vec_x1) -+ -+ /* Check the second last VEC. */ -+ vpcmpeqb (%rdi), %ymm0, %ymm1 -+ -+ movl %r8d, %ecx -+ -+ vpmovmskb %ymm1, %eax -+ -+ /* Remove the leading bytes. Must use unsigned right shift for -+ bsrl below. */ -+ shrl %cl, %eax -+ testl %eax, %eax -+ jz L(zero) -+ -+ bsrl %eax, %eax -+ addq %rdi, %rax -+ addq %r8, %rax -+ VZEROUPPER -+ ret -+END (MEMRCHR) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S -new file mode 100644 -index 000000000..7c485cf70 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S -@@ -0,0 +1,140 @@ -+/* -+Copyright (C) 2019 The Android Open Source Project -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions -+are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in -+ the documentation and/or other materials provided with the -+ distribution. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -+COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+SUCH DAMAGE. 
-+*/ -+ -+#include -+ -+#ifndef WMEMSET -+ #define WMEMSET wmemset_avx2 -+#endif -+ -+ .section .text.avx2,"ax",@progbits -+ -+ENTRY (WMEMSET) -+# BB#0: -+ testq %rdx, %rdx -+ je .LBB0_14 -+# BB#1: -+ cmpq $32, %rdx -+ jae .LBB0_3 -+# BB#2: -+ xorl %r8d, %r8d -+ movq %rdi, %rax -+ jmp .LBB0_12 -+.LBB0_3: -+ movq %rdx, %r8 -+ andq $-32, %r8 -+ vmovd %esi, %xmm0 -+ vpbroadcastd %xmm0, %ymm0 -+ leaq -32(%r8), %rcx -+ movq %rcx, %rax -+ shrq $5, %rax -+ leal 1(%rax), %r9d -+ andl $7, %r9d -+ cmpq $224, %rcx -+ jae .LBB0_5 -+# BB#4: -+ xorl %eax, %eax -+ testq %r9, %r9 -+ jne .LBB0_8 -+ jmp .LBB0_10 -+.LBB0_5: -+ leaq 992(%rdi), %rcx -+ leaq -1(%r9), %r10 -+ subq %rax, %r10 -+ xorl %eax, %eax -+ .p2align 4, 0x90 -+.LBB0_6: # =>This Inner Loop Header: Depth=1 -+ vmovdqu %ymm0, -992(%rcx,%rax,4) -+ vmovdqu %ymm0, -960(%rcx,%rax,4) -+ vmovdqu %ymm0, -928(%rcx,%rax,4) -+ vmovdqu %ymm0, -896(%rcx,%rax,4) -+ vmovdqu %ymm0, -864(%rcx,%rax,4) -+ vmovdqu %ymm0, -832(%rcx,%rax,4) -+ vmovdqu %ymm0, -800(%rcx,%rax,4) -+ vmovdqu %ymm0, -768(%rcx,%rax,4) -+ vmovdqu %ymm0, -736(%rcx,%rax,4) -+ vmovdqu %ymm0, -704(%rcx,%rax,4) -+ vmovdqu %ymm0, -672(%rcx,%rax,4) -+ vmovdqu %ymm0, -640(%rcx,%rax,4) -+ vmovdqu %ymm0, -608(%rcx,%rax,4) -+ vmovdqu %ymm0, -576(%rcx,%rax,4) -+ vmovdqu %ymm0, -544(%rcx,%rax,4) -+ vmovdqu %ymm0, -512(%rcx,%rax,4) -+ vmovdqu %ymm0, -480(%rcx,%rax,4) -+ vmovdqu %ymm0, -448(%rcx,%rax,4) -+ vmovdqu %ymm0, -416(%rcx,%rax,4) -+ vmovdqu %ymm0, -384(%rcx,%rax,4) -+ vmovdqu %ymm0, -352(%rcx,%rax,4) -+ vmovdqu %ymm0, -320(%rcx,%rax,4) -+ vmovdqu %ymm0, -288(%rcx,%rax,4) -+ vmovdqu %ymm0, -256(%rcx,%rax,4) -+ vmovdqu %ymm0, -224(%rcx,%rax,4) -+ vmovdqu %ymm0, -192(%rcx,%rax,4) -+ vmovdqu %ymm0, -160(%rcx,%rax,4) -+ vmovdqu %ymm0, -128(%rcx,%rax,4) -+ vmovdqu %ymm0, -96(%rcx,%rax,4) -+ vmovdqu %ymm0, -64(%rcx,%rax,4) -+ vmovdqu %ymm0, -32(%rcx,%rax,4) -+ vmovdqu %ymm0, (%rcx,%rax,4) -+ addq $256, %rax # imm = 0x100 -+ addq $8, %r10 -+ jne .LBB0_6 -+# BB#7: -+ testq %r9, %r9 -+ je .LBB0_10 -+.LBB0_8: -+ leaq (%rdi,%rax,4), %rax -+ addq $96, %rax -+ negq %r9 -+ .p2align 4, 0x90 -+.LBB0_9: # =>This Inner Loop Header: Depth=1 -+ vmovdqu %ymm0, -96(%rax) -+ vmovdqu %ymm0, -64(%rax) -+ vmovdqu %ymm0, -32(%rax) -+ vmovdqu %ymm0, (%rax) -+ subq $-128, %rax -+ addq $1, %r9 -+ jne .LBB0_9 -+.LBB0_10: -+ cmpq %rdx, %r8 -+ je .LBB0_14 -+# BB#11: -+ leaq (%rdi,%r8,4), %rax -+.LBB0_12: -+ subq %r8, %rdx -+ .p2align 4, 0x90 -+.LBB0_13: # =>This Inner Loop Header: Depth=1 -+ movl %esi, (%rax) -+ addq $4, %rax -+ addq $-1, %rdx -+ jne .LBB0_13 -+.LBB0_14: -+ movq %rdi, %rax -+ vzeroupper -+ retq -+END(WMEMSET) -diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -similarity index 99% -rename from libc/arch-x86_64/string/sse2-memmove-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -index 739502888..7024f4950 100644 ---- a/libc/arch-x86_64/string/sse2-memmove-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- #include "cache.h" - - #ifndef MEMMOVE --# define MEMMOVE memmove -+# define MEMMOVE memmove_generic - #endif - - #ifndef L -@@ -515,4 +515,4 @@ L(mm_large_page_loop_backward): - - END (MEMMOVE) - --ALIAS_SYMBOL(memcpy, MEMMOVE) -+//ALIAS_SYMBOL(memcpy, MEMMOVE) -diff --git a/libc/arch-x86_64/string/sse2-memset-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-memset-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-memset-slm.S -diff --git a/libc/arch-x86_64/string/sse2-stpcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-stpcpy-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S -diff --git a/libc/arch-x86_64/string/sse2-stpncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-stpncpy-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S -diff --git a/libc/arch-x86_64/string/sse2-strcat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-strcat-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S -diff --git a/libc/arch-x86_64/string/sse2-strcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-strcpy-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S -diff --git a/libc/arch-x86_64/string/sse2-strlen-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-strlen-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S -diff --git a/libc/arch-x86_64/string/sse2-strncat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-strncat-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S -diff --git a/libc/arch-x86_64/string/sse2-strncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-strncpy-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S -diff --git a/libc/arch-x86_64/string/sse4-memcmp-slm.S b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -similarity index 99% -rename from libc/arch-x86_64/string/sse4-memcmp-slm.S -rename to libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -index 8a8b180a2..6cfcd767f 100644 ---- a/libc/arch-x86_64/string/sse4-memcmp-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- #include "cache.h" - - #ifndef MEMCMP --# define MEMCMP memcmp -+# define MEMCMP memcmp_generic - #endif - - #ifndef L -diff --git a/libc/arch-x86_64/string/ssse3-strcmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/ssse3-strcmp-slm.S -rename to libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S -diff --git a/libc/arch-x86_64/string/ssse3-strncmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/ssse3-strncmp-slm.S -rename to libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S -diff --git a/libc/arch-x86_64/static_function_dispatch.S b/libc/arch-x86_64/static_function_dispatch.S -index 93ff5f2fc..979ce4f18 100644 ---- a/libc/arch-x86_64/static_function_dispatch.S -+++ b/libc/arch-x86_64/static_function_dispatch.S -@@ -35,3 +35,9 @@ END(name) - - FUNCTION_DELEGATE(memset, memset_generic) - FUNCTION_DELEGATE(__memset_chk, __memset_chk_generic) -+FUNCTION_DELEGATE(memcmp, memcmp_generic) -+FUNCTION_DELEGATE(memcpy, memmove_generic) -+FUNCTION_DELEGATE(memmove, memmove_generic) -+FUNCTION_DELEGATE(memchr, memchr_openbsd) -+FUNCTION_DELEGATE(memrchr, memrchr_openbsd) -+//FUNCTION_DELEGATE(wmemset, wmemset_freebsd) --- -2.25.1 - diff --git a/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch b/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch deleted file mode 100644 index 0432f627fd..0000000000 --- a/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch +++ /dev/null @@ -1,4169 +0,0 @@ -From b6a7f45aa68426f4e32a4bf51e71ec5453f25f8d Mon Sep 17 00:00:00 2001 -From: Ravi Kumar Soni -Date: Mon, 28 Oct 2024 15:08:14 +0530 -Subject: [PATCH 4/5] Optimize bionic string functions with avx implementation - -Following are the string functions that has been -optimized with avx2 implementation from glibc 2.32 version. - - strcmp, strncmp - - strlen, strnlen - - strchr, strrchr - - strcpy, strncpy - - stpcpy, stpncpy - - strcat, strncat - - wcscmp, wcsncmp - - wcslen, wcsnlen - - wcschr, wcsrchr - -Test done: Build and boot is fine, Run the benchmarks suite. 
- -Change-Id: I7f08a7507d25447ce886e9fde0482527c3f7a178 -Signed-off-by: ahs -Signed-off-by: Ravi Kumar Soni ---- - libc/Android.bp | 45 +- - .../arch-x86_64/dynamic_function_dispatch.cpp | 133 ++- - libc/arch-x86_64/generic/string/memchr.c | 2 +- - libc/arch-x86_64/generic/string/memrchr.c | 2 +- - libc/arch-x86_64/generic/string/strchr.cpp | 19 + - libc/arch-x86_64/generic/string/strnlen.cpp | 19 + - libc/arch-x86_64/generic/string/strrchr.cpp | 19 + - libc/arch-x86_64/generic/string/wcschr.c | 19 + - libc/arch-x86_64/generic/string/wcscmp.c | 19 + - libc/arch-x86_64/generic/string/wcslen.c | 19 + - libc/arch-x86_64/generic/string/wcsncmp.c | 19 + - libc/arch-x86_64/generic/string/wcsnlen.c | 19 + - libc/arch-x86_64/generic/string/wcsrchr.c | 19 + - libc/arch-x86_64/generic/string/wmemset.c | 2 +- - .../{ => kabylake}/string/avx2-memset-kbl.S | 0 - .../kabylake/string/avx2-stpcpy-kbl.S | 3 + - .../kabylake/string/avx2-stpncpy-kbl.S | 5 + - .../kabylake/string/avx2-strcat-kbl.S | 299 +++++ - .../kabylake/string/avx2-strchr-kbl.S | 277 +++++ - .../kabylake/string/avx2-strcmp-kbl.S | 885 ++++++++++++++ - .../kabylake/string/avx2-strcpy-kbl.S | 1046 +++++++++++++++++ - .../kabylake/string/avx2-strlen-kbl.S | 418 +++++++ - .../kabylake/string/avx2-strncat-kbl.S | 3 + - .../kabylake/string/avx2-strncmp-kbl.S | 4 + - .../kabylake/string/avx2-strncpy-kbl.S | 4 + - .../kabylake/string/avx2-strnlen-kbl.S | 4 + - .../kabylake/string/avx2-strrchr-kbl.S | 258 ++++ - .../kabylake/string/avx2-wcschr-kbl.S | 3 + - .../kabylake/string/avx2-wcscmp-kbl.S | 4 + - .../kabylake/string/avx2-wcslen-kbl.S | 4 + - .../kabylake/string/avx2-wcsncmp-kbl.S | 6 + - .../kabylake/string/avx2-wcsnlen-kbl.S | 6 + - .../kabylake/string/avx2-wcsrchr-kbl.S | 3 + - libc/arch-x86_64/kabylake/string/avx_regs.h | 26 + - .../{include => kabylake/string}/cache.h | 0 - libc/arch-x86_64/silvermont/string/cache.h | 36 + - .../silvermont/string/sse2-stpcpy-slm.S | 2 +- - .../silvermont/string/sse2-stpncpy-slm.S | 2 +- - .../silvermont/string/sse2-strcat-slm.S | 2 +- - .../silvermont/string/sse2-strcpy-slm.S | 2 +- - .../silvermont/string/sse2-strlen-slm.S | 2 +- - .../silvermont/string/sse2-strncat-slm.S | 2 +- - .../silvermont/string/sse2-strncpy-slm.S | 2 +- - .../silvermont/string/ssse3-strcmp-slm.S | 2 +- - .../silvermont/string/ssse3-strncmp-slm.S | 2 +- - libc/arch-x86_64/static_function_dispatch.S | 25 +- - 46 files changed, 3669 insertions(+), 23 deletions(-) - create mode 100644 libc/arch-x86_64/generic/string/strchr.cpp - create mode 100644 libc/arch-x86_64/generic/string/strnlen.cpp - create mode 100644 libc/arch-x86_64/generic/string/strrchr.cpp - create mode 100644 libc/arch-x86_64/generic/string/wcschr.c - create mode 100644 libc/arch-x86_64/generic/string/wcscmp.c - create mode 100644 libc/arch-x86_64/generic/string/wcslen.c - create mode 100644 libc/arch-x86_64/generic/string/wcsncmp.c - create mode 100644 libc/arch-x86_64/generic/string/wcsnlen.c - create mode 100644 libc/arch-x86_64/generic/string/wcsrchr.c - rename libc/arch-x86_64/{ => kabylake}/string/avx2-memset-kbl.S (100%) - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S - create mode 
100644 libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx_regs.h - rename libc/arch-x86_64/{include => kabylake/string}/cache.h (100%) - create mode 100644 libc/arch-x86_64/silvermont/string/cache.h - -diff --git a/libc/Android.bp b/libc/Android.bp -index 530ce9111..92483e833 100644 ---- a/libc/Android.bp -+++ b/libc/Android.bp -@@ -377,6 +377,17 @@ cc_library_static { - "upstream-freebsd/lib/libc/string/wmemcmp.c", - ], - }, -+ x86_64: { -+ exclude_srcs: [ -+ "upstream-freebsd/lib/libc/string/wcscmp.c", -+ "upstream-freebsd/lib/libc/string/wcsncmp.c", -+ "upstream-freebsd/lib/libc/string/wcslen.c", -+ "upstream-freebsd/lib/libc/string/wcsnlen.c", -+ "upstream-freebsd/lib/libc/string/wcschr.c", -+ "upstream-freebsd/lib/libc/string/wcsrchr.c", -+ -+ ], -+ }, - }, - - cflags: [ -@@ -1185,7 +1196,6 @@ cc_library_static { - ], - }, - x86_64: { -- include_dirs: ["bionic/libc/arch-x86_64/include"], - srcs: [ - "arch-x86_64/bionic/__bionic_clone.S", - "arch-x86_64/bionic/_exit_with_stack_teardown.S", -@@ -1194,7 +1204,7 @@ cc_library_static { - "arch-x86_64/bionic/syscall.S", - "arch-x86_64/bionic/vfork.S", - -- "arch-x86_64/string/avx2-memset-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-memset-kbl.S", - "arch-x86_64/silvermont/string/sse2-memmove-slm.S", - "arch-x86_64/silvermont/string/sse2-memset-slm.S", - "arch-x86_64/silvermont/string/sse2-stpcpy-slm.S", -@@ -1211,17 +1221,42 @@ cc_library_static { - //"arch-x86_64/generic/string/wmemset.c" - "arch-x86_64/generic/string/memchr.c", - "arch-x86_64/generic/string/memrchr.c", -+ "arch-x86_64/generic/string/strchr.cpp", -+ "arch-x86_64/generic/string/strrchr.cpp", -+ "arch-x86_64/generic/string/strnlen.cpp", -+ "arch-x86_64/generic/string/wcscmp.c", -+ "arch-x86_64/generic/string/wcsncmp.c", -+ "arch-x86_64/generic/string/wcslen.c", -+ "arch-x86_64/generic/string/wcsnlen.c", -+ "arch-x86_64/generic/string/wcschr.c", -+ "arch-x86_64/generic/string/wcsrchr.c", - - //"arch-x86_64/kabylake/string/avx2-wmemset-kbl.S" - "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", - "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", - "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strcmp-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strncmp-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strlen-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strnlen-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strchr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strrchr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strcpy-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strncpy-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S", -+ 
"arch-x86_64/kabylake/string/avx2-strcat-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strncat-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcslen-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcschr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S", - -- "bionic/strchr.cpp", - "bionic/strchrnul.cpp", -- "bionic/strnlen.cpp", -- "bionic/strrchr.cpp", - ], -+ - }, - }, - -diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp -index 43aaebb54..182eb4200 100644 ---- a/libc/arch-x86_64/dynamic_function_dispatch.cpp -+++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp -@@ -67,21 +67,148 @@ typedef void* memchr_func(const void* __s, int __ch, size_t __n); - DEFINE_IFUNC_FOR(memchr) { - __builtin_cpu_init(); - if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memchr_func, memchr_avx2); -- RETURN_FUNC(memchr_func, memchr_openbsd); -+ RETURN_FUNC(memchr_func, memchr_generic); - } - - typedef void* memrchr_func(const void* __s, int __ch, size_t __n); - DEFINE_IFUNC_FOR(memrchr) { - __builtin_cpu_init(); - if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memrchr_func, memrchr_avx2); -- RETURN_FUNC(memrchr_func, memrchr_openbsd); -+ RETURN_FUNC(memrchr_func, memrchr_generic); - } - - // typedef int wmemset_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n); - // DEFINE_IFUNC_FOR(wmemset) { - // __builtin_cpu_init(); - // if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wmemset_func, wmemset_avx2); --// RETURN_FUNC(wmemset_func, wmemset_freebsd); -+// RETURN_FUNC(wmemset_func, wmemset_generic); - // } - -+typedef int strcmp_func(const char* __lhs, const char* __rhs); -+DEFINE_IFUNC_FOR(strcmp) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcmp_func, strcmp_avx2); -+ RETURN_FUNC(strcmp_func, strcmp_generic); -+} -+ -+typedef int strncmp_func(const char* __lhs, const char* __rhs, size_t __n); -+DEFINE_IFUNC_FOR(strncmp) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncmp_func, strncmp_avx2); -+ RETURN_FUNC(strncmp_func, strncmp_generic); -+} -+ -+typedef char* strcpy_func(char* __dst, const char* __src); -+DEFINE_IFUNC_FOR(strcpy) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcpy_func, strcpy_avx2); -+ RETURN_FUNC(strcpy_func, strcpy_generic); -+} -+ -+typedef char* strncpy_func(char* __dst, const char* __src, size_t __n); -+DEFINE_IFUNC_FOR(strncpy) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncpy_func, strncpy_avx2); -+ RETURN_FUNC(strncpy_func, strncpy_generic); -+} -+ -+typedef char* stpcpy_func(char* __dst, const char* __src); -+DEFINE_IFUNC_FOR(stpcpy) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(stpcpy_func, stpcpy_avx2); -+ RETURN_FUNC(stpcpy_func, stpcpy_generic); -+} -+ -+typedef char* stpncpy_func(char* __dst, const char* __src, size_t __n); -+DEFINE_IFUNC_FOR(stpncpy) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(stpncpy_func, stpncpy_avx2); -+ RETURN_FUNC(stpncpy_func, stpncpy_generic); -+} -+ -+typedef size_t strlen_func(const char* __s); -+DEFINE_IFUNC_FOR(strlen) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strlen_func, strlen_avx2); -+ RETURN_FUNC(strlen_func, strlen_generic); -+} -+ -+ -+typedef size_t strnlen_func(const char* 
__s, size_t __n); -+DEFINE_IFUNC_FOR(strnlen) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strnlen_func, strnlen_avx2); -+ RETURN_FUNC(strnlen_func, strnlen_generic); -+} -+ -+typedef char* strchr_func(const char* __s, int __ch); -+DEFINE_IFUNC_FOR(strchr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strchr_func, strchr_avx2); -+ RETURN_FUNC(strchr_func, strchr_generic); -+} -+ -+typedef char* strrchr_func(const char* __s, int __ch); -+DEFINE_IFUNC_FOR(strrchr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strrchr_func, strrchr_avx2); -+ RETURN_FUNC(strrchr_func, strrchr_generic); -+} -+ -+typedef char* strcat_func(char* __dst, const char* __src); -+DEFINE_IFUNC_FOR(strcat) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcat_func, strcat_avx2); -+ RETURN_FUNC(strcat_func, strcat_generic); -+} -+ -+typedef char* strncat_func(char* __dst, const char* __src, size_t __n); -+DEFINE_IFUNC_FOR(strncat) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncat_func, strncat_avx2); -+ RETURN_FUNC(strncat_func, strncat_generic); -+} -+ -+typedef int wcscmp_func(const wchar_t* __lhs, const wchar_t* __rhs); -+DEFINE_IFUNC_FOR(wcscmp) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcscmp_func, wcscmp_avx2); -+ RETURN_FUNC(wcscmp_func, wcscmp_generic); -+} -+ -+typedef int wcsncmp_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n); -+DEFINE_IFUNC_FOR(wcsncmp) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsncmp_func, wcsncmp_avx2); -+ RETURN_FUNC(wcsncmp_func, wcsncmp_generic); -+} -+ -+typedef size_t wcslen_func(const wchar_t* __s); -+DEFINE_IFUNC_FOR(wcslen) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcslen_func, wcslen_avx2); -+ RETURN_FUNC(wcslen_func, wcslen_generic); -+} -+ -+typedef size_t wcsnlen_func(const wchar_t* __s, size_t __n); -+DEFINE_IFUNC_FOR(wcsnlen) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsnlen_func, wcsnlen_avx2); -+ RETURN_FUNC(wcsnlen_func, wcsnlen_generic); -+} -+ -+typedef wchar_t* wcschr_func(const wchar_t* __s, wchar_t __wc); -+DEFINE_IFUNC_FOR(wcschr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcschr_func, wcschr_avx2); -+ RETURN_FUNC(wcschr_func, wcschr_generic); -+} -+ -+typedef wchar_t* wcsrchr_func(const wchar_t* __s, wchar_t __wc); -+DEFINE_IFUNC_FOR(wcsrchr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsrchr_func, wcsrchr_avx2); -+ RETURN_FUNC(wcsrchr_func, wcsrchr_generic); -+} -+ - } // extern "C" -diff --git a/libc/arch-x86_64/generic/string/memchr.c b/libc/arch-x86_64/generic/string/memchr.c -index 86ee02e0b..e6fc3eb84 100644 ---- a/libc/arch-x86_64/generic/string/memchr.c -+++ b/libc/arch-x86_64/generic/string/memchr.c -@@ -15,6 +15,6 @@ - */ - - #include --#define memchr memchr_openbsd -+#define memchr memchr_generic - - #include -diff --git a/libc/arch-x86_64/generic/string/memrchr.c b/libc/arch-x86_64/generic/string/memrchr.c -index c803009f5..ee085e384 100644 ---- a/libc/arch-x86_64/generic/string/memrchr.c -+++ b/libc/arch-x86_64/generic/string/memrchr.c -@@ -15,6 +15,6 @@ - */ - - #include --#define memrchr memrchr_openbsd -+#define memrchr memrchr_generic - - #include -diff --git a/libc/arch-x86_64/generic/string/strchr.cpp b/libc/arch-x86_64/generic/string/strchr.cpp -new file mode 
100644 -index 000000000..8a3d6d619 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/strchr.cpp -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define strchr strchr_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/strnlen.cpp b/libc/arch-x86_64/generic/string/strnlen.cpp -new file mode 100644 -index 000000000..f60348656 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/strnlen.cpp -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define strnlen strnlen_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/strrchr.cpp b/libc/arch-x86_64/generic/string/strrchr.cpp -new file mode 100644 -index 000000000..9f0f33fd2 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/strrchr.cpp -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define strrchr strrchr_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcschr.c b/libc/arch-x86_64/generic/string/wcschr.c -new file mode 100644 -index 000000000..d45e45d20 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcschr.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. 
-+*/ -+ -+#define wcschr wcschr_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcscmp.c b/libc/arch-x86_64/generic/string/wcscmp.c -new file mode 100644 -index 000000000..e55bab549 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcscmp.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define wcscmp wcscmp_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcslen.c b/libc/arch-x86_64/generic/string/wcslen.c -new file mode 100644 -index 000000000..5b873fc30 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcslen.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define wcslen wcslen_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcsncmp.c b/libc/arch-x86_64/generic/string/wcsncmp.c -new file mode 100644 -index 000000000..40b2ca2f3 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcsncmp.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define wcsncmp wcsncmp_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcsnlen.c b/libc/arch-x86_64/generic/string/wcsnlen.c -new file mode 100644 -index 000000000..91051cea7 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcsnlen.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. 
-+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define wcsnlen wcsnlen_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcsrchr.c b/libc/arch-x86_64/generic/string/wcsrchr.c -new file mode 100644 -index 000000000..73e8c25bc ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcsrchr.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define wcsrchr wcsrchr_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wmemset.c b/libc/arch-x86_64/generic/string/wmemset.c -index ac6bd7ec4..9675fe91f 100644 ---- a/libc/arch-x86_64/generic/string/wmemset.c -+++ b/libc/arch-x86_64/generic/string/wmemset.c -@@ -15,6 +15,6 @@ - */ - - #include --#define wmemset wmemset_freebsd -+#define wmemset wmemset_generic - - #include -diff --git a/libc/arch-x86_64/string/avx2-memset-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S -similarity index 100% -rename from libc/arch-x86_64/string/avx2-memset-kbl.S -rename to libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S -diff --git a/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S -new file mode 100644 -index 000000000..63f9ba25b ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S -@@ -0,0 +1,3 @@ -+#define USE_AS_STPCPY -+#define STRCPY stpcpy_avx2 -+#include "avx2-strcpy-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S -new file mode 100644 -index 000000000..c1bbdb29e ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S -@@ -0,0 +1,5 @@ -+#define USE_AS_STPCPY -+#define USE_AS_STRNCPY -+#define STRCPY stpncpy_avx2 -+#include "avx_regs.h" -+#include "avx2-strcpy-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S -new file mode 100644 -index 000000000..d1e9b4b38 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S -@@ -0,0 +1,299 @@ -+/* strcat with AVX2 -+ Copyright (C) 2011-2020 Free Software Foundation, Inc. -+ Contributed by Intel Corporation. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. 
-+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+ -+ -+# ifndef STRCAT -+# define STRCAT strcat_avx2 -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+# define USE_AS_STRCAT -+ -+/* Number of bytes in a vector register */ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRCAT) -+ mov %rdi, %r9 -+# ifdef USE_AS_STRNCAT -+ mov %rdx, %r8 -+# endif -+ -+ xor %eax, %eax -+ mov %edi, %ecx -+ and $((VEC_SIZE * 4) - 1), %ecx -+ vpxor %xmm6, %xmm6, %xmm6 -+ cmp $(VEC_SIZE * 3), %ecx -+ ja L(fourth_vector_boundary) -+ vpcmpeqb (%rdi), %ymm6, %ymm0 -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_first_vector) -+ mov %rdi, %rax -+ and $-VEC_SIZE, %rax -+ jmp L(align_vec_size_start) -+L(fourth_vector_boundary): -+ mov %rdi, %rax -+ and $-VEC_SIZE, %rax -+ vpcmpeqb (%rax), %ymm6, %ymm0 -+ mov $-1, %r10d -+ sub %rax, %rcx -+ shl %cl, %r10d -+ vpmovmskb %ymm0, %edx -+ and %r10d, %edx -+ jnz L(exit) -+ -+L(align_vec_size_start): -+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0 -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_second_vector) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 -+ vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_third_vector) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fourth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 -+ vpmovmskb %ymm3, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fifth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 -+ add $(VEC_SIZE * 4), %rax -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_second_vector) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 -+ vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_third_vector) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fourth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 -+ vpmovmskb %ymm3, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fifth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 -+ add $(VEC_SIZE * 4), %rax -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_second_vector) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 -+ vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_third_vector) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fourth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 -+ vpmovmskb %ymm3, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fifth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 -+ add $(VEC_SIZE * 4), %rax -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_second_vector) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 -+ 
vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_third_vector) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fourth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 -+ vpmovmskb %ymm3, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fifth_vector) -+ -+ test $((VEC_SIZE * 4) - 1), %rax -+ jz L(align_four_vec_loop) -+ -+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 -+ add $(VEC_SIZE * 5), %rax -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit) -+ -+ test $((VEC_SIZE * 4) - 1), %rax -+ jz L(align_four_vec_loop) -+ -+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1 -+ add $VEC_SIZE, %rax -+ vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit) -+ -+ test $((VEC_SIZE * 4) - 1), %rax -+ jz L(align_four_vec_loop) -+ -+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2 -+ add $VEC_SIZE, %rax -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit) -+ -+ test $((VEC_SIZE * 4) - 1), %rax -+ jz L(align_four_vec_loop) -+ -+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3 -+ add $VEC_SIZE, %rax -+ vpmovmskb %ymm3, %edx -+ test %edx, %edx -+ jnz L(exit) -+ -+ add $VEC_SIZE, %rax -+ -+ .p2align 4 -+L(align_four_vec_loop): -+ vmovaps (%rax), %ymm4 -+ vpminub VEC_SIZE(%rax), %ymm4, %ymm4 -+ vmovaps (VEC_SIZE * 2)(%rax), %ymm5 -+ vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5 -+ add $(VEC_SIZE * 4), %rax -+ vpminub %ymm4, %ymm5, %ymm5 -+ vpcmpeqb %ymm5, %ymm6, %ymm5 -+ vpmovmskb %ymm5, %edx -+ test %edx, %edx -+ jz L(align_four_vec_loop) -+ -+ vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0 -+ sub $(VEC_SIZE * 5), %rax -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_second_vector) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 -+ vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_third_vector) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fourth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 -+ vpmovmskb %ymm3, %edx -+ sub %rdi, %rax -+ bsf %rdx, %rdx -+ add %rdx, %rax -+ add $(VEC_SIZE * 4), %rax -+ jmp L(StartStrcpyPart) -+ -+ .p2align 4 -+L(exit): -+ sub %rdi, %rax -+L(exit_null_on_first_vector): -+ bsf %rdx, %rdx -+ add %rdx, %rax -+ jmp L(StartStrcpyPart) -+ -+ .p2align 4 -+L(exit_null_on_second_vector): -+ sub %rdi, %rax -+ bsf %rdx, %rdx -+ add %rdx, %rax -+ add $VEC_SIZE, %rax -+ jmp L(StartStrcpyPart) -+ -+ .p2align 4 -+L(exit_null_on_third_vector): -+ sub %rdi, %rax -+ bsf %rdx, %rdx -+ add %rdx, %rax -+ add $(VEC_SIZE * 2), %rax -+ jmp L(StartStrcpyPart) -+ -+ .p2align 4 -+L(exit_null_on_fourth_vector): -+ sub %rdi, %rax -+ bsf %rdx, %rdx -+ add %rdx, %rax -+ add $(VEC_SIZE * 3), %rax -+ jmp L(StartStrcpyPart) -+ -+ .p2align 4 -+L(exit_null_on_fifth_vector): -+ sub %rdi, %rax -+ bsf %rdx, %rdx -+ add %rdx, %rax -+ add $(VEC_SIZE * 4), %rax -+ -+ .p2align 4 -+L(StartStrcpyPart): -+ lea (%r9, %rax), %rdi -+ mov %rsi, %rcx -+ mov %r9, %rax /* save result */ -+ -+# ifdef USE_AS_STRNCAT -+ test %r8, %r8 -+ jz L(ExitZero) -+# define USE_AS_STRNCPY -+# endif -+ -+# include "avx2-strcpy-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S -new file mode 100644 -index 000000000..7d8a44c81 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S -@@ -0,0 +1,277 @@ -+/* strchr/strchrnul optimized with AVX2. -+ Copyright (C) 2017-2020 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+ -+# ifndef STRCHR -+# define STRCHR strchr_avx2 -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+# ifdef USE_AS_WCSCHR -+# define VPBROADCAST vpbroadcastd -+# define VPCMPEQ vpcmpeqd -+# define CHAR_REG esi -+# else -+# define VPBROADCAST vpbroadcastb -+# define VPCMPEQ vpcmpeqb -+# define CHAR_REG sil -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRCHR) -+ movl %edi, %ecx -+ /* Broadcast CHAR to YMM0. */ -+ vmovd %esi, %xmm0 -+ vpxor %xmm9, %xmm9, %xmm9 -+ VPBROADCAST %xmm0, %ymm0 -+ /* Check if we may cross page boundary with one vector load. */ -+ andl $(2 * VEC_SIZE - 1), %ecx -+ cmpl $VEC_SIZE, %ecx -+ ja L(cros_page_boundary) -+ -+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the -+ null byte. */ -+ vmovdqu (%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ /* Align data for aligned loads in the loop. */ -+ addq $VEC_SIZE, %rdi -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ -+ jmp L(more_4x_vec) -+ -+ .p2align 4 -+L(cros_page_boundary): -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ vmovdqu (%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ /* Remove the leading bytes. */ -+ sarl %cl, %eax -+ testl %eax, %eax -+ jz L(aligned_more) -+ /* Found CHAR or the null byte. */ -+ tzcntl %eax, %eax -+ addq %rcx, %rax -+# ifdef USE_AS_STRCHRNUL -+ addq %rdi, %rax -+# else -+ xorl %edx, %edx -+ leaq (%rdi, %rax), %rax -+ cmp (%rax), %CHAR_REG -+ cmovne %rdx, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(aligned_more): -+ addq $VEC_SIZE, %rdi -+ -+L(more_4x_vec): -+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time -+ since data is only aligned to VEC_SIZE. 
*/ -+ vmovdqa (%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ vmovdqa VEC_SIZE(%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ -+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ -+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x3) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+ /* Align data to 4 * VEC_SIZE. */ -+ movq %rdi, %rcx -+ andl $(4 * VEC_SIZE - 1), %ecx -+ andq $-(4 * VEC_SIZE), %rdi -+ -+ .p2align 4 -+L(loop_4x_vec): -+ /* Compare 4 * VEC at a time forward. */ -+ vmovdqa (%rdi), %ymm5 -+ vmovdqa VEC_SIZE(%rdi), %ymm6 -+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 -+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 -+ -+ VPCMPEQ %ymm5, %ymm0, %ymm1 -+ VPCMPEQ %ymm6, %ymm0, %ymm2 -+ VPCMPEQ %ymm7, %ymm0, %ymm3 -+ VPCMPEQ %ymm8, %ymm0, %ymm4 -+ -+ VPCMPEQ %ymm5, %ymm9, %ymm5 -+ VPCMPEQ %ymm6, %ymm9, %ymm6 -+ VPCMPEQ %ymm7, %ymm9, %ymm7 -+ VPCMPEQ %ymm8, %ymm9, %ymm8 -+ -+ vpor %ymm1, %ymm5, %ymm1 -+ vpor %ymm2, %ymm6, %ymm2 -+ vpor %ymm3, %ymm7, %ymm3 -+ vpor %ymm4, %ymm8, %ymm4 -+ -+ vpor %ymm1, %ymm2, %ymm5 -+ vpor %ymm3, %ymm4, %ymm6 -+ -+ vpor %ymm5, %ymm6, %ymm5 -+ -+ vpmovmskb %ymm5, %eax -+ testl %eax, %eax -+ jnz L(4x_vec_end) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+ jmp L(loop_4x_vec) -+ -+ .p2align 4 -+L(first_vec_x0): -+ /* Found CHAR or the null byte. */ -+ tzcntl %eax, %eax -+# ifdef USE_AS_STRCHRNUL -+ addq %rdi, %rax -+# else -+ xorl %edx, %edx -+ leaq (%rdi, %rax), %rax -+ cmp (%rax), %CHAR_REG -+ cmovne %rdx, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x1): -+ tzcntl %eax, %eax -+# ifdef USE_AS_STRCHRNUL -+ addq $VEC_SIZE, %rax -+ addq %rdi, %rax -+# else -+ xorl %edx, %edx -+ leaq VEC_SIZE(%rdi, %rax), %rax -+ cmp (%rax), %CHAR_REG -+ cmovne %rdx, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x2): -+ tzcntl %eax, %eax -+# ifdef USE_AS_STRCHRNUL -+ addq $(VEC_SIZE * 2), %rax -+ addq %rdi, %rax -+# else -+ xorl %edx, %edx -+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax -+ cmp (%rax), %CHAR_REG -+ cmovne %rdx, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(4x_vec_end): -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ vpmovmskb %ymm4, %eax -+ testl %eax, %eax -+L(first_vec_x3): -+ tzcntl %eax, %eax -+# ifdef USE_AS_STRCHRNUL -+ addq $(VEC_SIZE * 3), %rax -+ addq %rdi, %rax -+# else -+ xorl %edx, %edx -+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax -+ cmp (%rax), %CHAR_REG -+ cmovne %rdx, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+END (STRCHR) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S -new file mode 100644 -index 000000000..b241812d8 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S -@@ -0,0 +1,885 @@ -+/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2. -+ Copyright (C) 2018-2020 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. 
-+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+ -+# ifndef STRCMP -+# define STRCMP strcmp_avx2 -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+# define PAGE_SIZE 4096 -+ -+/* VEC_SIZE = Number of bytes in a ymm register */ -+# define VEC_SIZE 32 -+ -+/* Shift for dividing by (VEC_SIZE * 4). */ -+# define DIVIDE_BY_VEC_4_SHIFT 7 -+# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) -+# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) -+# endif -+ -+# ifdef USE_AS_WCSCMP -+/* Compare packed dwords. */ -+# define VPCMPEQ vpcmpeqd -+/* Compare packed dwords and store minimum. */ -+# define VPMINU vpminud -+/* 1 dword char == 4 bytes. */ -+# define SIZE_OF_CHAR 4 -+# else -+/* Compare packed bytes. */ -+# define VPCMPEQ vpcmpeqb -+/* Compare packed bytes and store minimum. */ -+# define VPMINU vpminub -+/* 1 byte char == 1 byte. */ -+# define SIZE_OF_CHAR 1 -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+/* Warning! -+ wcscmp/wcsncmp have to use SIGNED comparison for elements. -+ strcmp/strncmp have to use UNSIGNED comparison for elements. -+*/ -+ -+/* The main idea of the string comparison (byte or dword) using AVX2 -+ consists of comparing (VPCMPEQ) two ymm vectors. The latter can be on -+ either packed bytes or dwords depending on USE_AS_WCSCMP. In order -+ to check the null char, algorithm keeps the matched bytes/dwords, -+ requiring two more AVX2 instructions (VPMINU and VPCMPEQ). In general, -+ the costs of comparing VEC_SIZE bytes (32-bytes) are two VPCMPEQ and -+ one VPMINU instructions, together with movdqu and testl instructions. -+ Main loop (away from from page boundary) compares 4 vectors are a time, -+ effectively comparing 4 x VEC_SIZE bytes (128 bytes) on each loop. -+ -+ The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic -+ is the same as strcmp, except that an a maximum offset is tracked. If -+ the maximum offset is reached before a difference is found, zero is -+ returned. */ -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRCMP) -+# ifdef USE_AS_STRNCMP -+ /* Check for simple cases (0 or 1) in offset. */ -+ cmp $1, %RDX_LP -+ je L(char0) -+ jb L(zero) -+# ifdef USE_AS_WCSCMP -+ /* Convert units: from wide to byte char. */ -+ shl $2, %RDX_LP -+# endif -+ /* Register %r11 tracks the maximum offset. */ -+ mov %RDX_LP, %R11_LP -+# endif -+ movl %edi, %eax -+ xorl %edx, %edx -+ /* Make %xmm7 (%ymm7) all zeros in this function. 
*/ -+ vpxor %xmm7, %xmm7, %xmm7 -+ orl %esi, %eax -+ andl $(PAGE_SIZE - 1), %eax -+ cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax -+ jg L(cross_page) -+ /* Start comparing 4 vectors. */ -+ vmovdqu (%rdi), %ymm1 -+ VPCMPEQ (%rsi), %ymm1, %ymm0 -+ VPMINU %ymm1, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, %ymm0, %ymm0 -+ vpmovmskb %ymm0, %ecx -+ testl %ecx, %ecx -+ je L(next_3_vectors) -+ tzcntl %ecx, %edx -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the mismatched index (%rdx) is after the maximum -+ offset (%r11). */ -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi, %rdx), %ecx -+ cmpl (%rsi, %rdx), %ecx -+ je L(return) -+L(wcscmp_return): -+ setl %al -+ negl %eax -+ orl $1, %eax -+L(return): -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(return_vec_size): -+ tzcntl %ecx, %edx -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after -+ the maximum offset (%r11). */ -+ addq $VEC_SIZE, %rdx -+ cmpq %r11, %rdx -+ jae L(zero) -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi, %rdx), %ecx -+ cmpl (%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl VEC_SIZE(%rdi, %rdx), %ecx -+ cmpl VEC_SIZE(%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl VEC_SIZE(%rdi, %rdx), %eax -+ movzbl VEC_SIZE(%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(return_2_vec_size): -+ tzcntl %ecx, %edx -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is -+ after the maximum offset (%r11). */ -+ addq $(VEC_SIZE * 2), %rdx -+ cmpq %r11, %rdx -+ jae L(zero) -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi, %rdx), %ecx -+ cmpl (%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx -+ cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax -+ movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(return_3_vec_size): -+ tzcntl %ecx, %edx -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is -+ after the maximum offset (%r11). 
*/ -+ addq $(VEC_SIZE * 3), %rdx -+ cmpq %r11, %rdx -+ jae L(zero) -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi, %rdx), %ecx -+ cmpl (%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx -+ cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax -+ movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(next_3_vectors): -+ vmovdqu VEC_SIZE(%rdi), %ymm6 -+ VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 -+ VPMINU %ymm6, %ymm3, %ymm3 -+ VPCMPEQ %ymm7, %ymm3, %ymm3 -+ vpmovmskb %ymm3, %ecx -+ testl %ecx, %ecx -+ jne L(return_vec_size) -+ vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 -+ vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 -+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 -+ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 -+ VPMINU %ymm5, %ymm2, %ymm2 -+ VPCMPEQ %ymm4, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, %ymm2, %ymm2 -+ vpmovmskb %ymm2, %ecx -+ testl %ecx, %ecx -+ jne L(return_2_vec_size) -+ VPMINU %ymm4, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, %ymm0, %ymm0 -+ vpmovmskb %ymm0, %ecx -+ testl %ecx, %ecx -+ jne L(return_3_vec_size) -+L(main_loop_header): -+ leaq (VEC_SIZE * 4)(%rdi), %rdx -+ movl $PAGE_SIZE, %ecx -+ /* Align load via RAX. */ -+ andq $-(VEC_SIZE * 4), %rdx -+ subq %rdi, %rdx -+ leaq (%rdi, %rdx), %rax -+# ifdef USE_AS_STRNCMP -+ /* Starting from this point, the maximum offset, or simply the -+ 'offset', DECREASES by the same amount when base pointers are -+ moved forward. Return 0 when: -+ 1) On match: offset <= the matched vector index. -+ 2) On mistmach, offset is before the mistmatched index. -+ */ -+ subq %rdx, %r11 -+ jbe L(zero) -+# endif -+ addq %rsi, %rdx -+ movq %rdx, %rsi -+ andl $(PAGE_SIZE - 1), %esi -+ /* Number of bytes before page crossing. */ -+ subq %rsi, %rcx -+ /* Number of VEC_SIZE * 4 blocks before page crossing. */ -+ shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx -+ /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ -+ movl %ecx, %esi -+ jmp L(loop_start) -+ -+ .p2align 4 -+L(loop): -+# ifdef USE_AS_STRNCMP -+ /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease -+ the maximum offset (%r11) by the same amount. */ -+ subq $(VEC_SIZE * 4), %r11 -+ jbe L(zero) -+# endif -+ addq $(VEC_SIZE * 4), %rax -+ addq $(VEC_SIZE * 4), %rdx -+L(loop_start): -+ testl %esi, %esi -+ leal -1(%esi), %esi -+ je L(loop_cross_page) -+L(back_to_loop): -+ /* Main loop, comparing 4 vectors are a time. */ -+ vmovdqa (%rax), %ymm0 -+ vmovdqa VEC_SIZE(%rax), %ymm3 -+ VPCMPEQ (%rdx), %ymm0, %ymm4 -+ VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 -+ VPMINU %ymm0, %ymm4, %ymm4 -+ VPMINU %ymm3, %ymm1, %ymm1 -+ vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 -+ VPMINU %ymm1, %ymm4, %ymm0 -+ vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 -+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 -+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 -+ VPMINU %ymm2, %ymm5, %ymm5 -+ VPMINU %ymm3, %ymm6, %ymm6 -+ VPMINU %ymm5, %ymm0, %ymm0 -+ VPMINU %ymm6, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, %ymm0, %ymm0 -+ -+ /* Test each mask (32 bits) individually because for VEC_SIZE -+ == 32 is not possible to OR the four masks and keep all bits -+ in a 64-bit integer register, differing from SSE2 strcmp -+ where ORing is possible. 
*/ -+ vpmovmskb %ymm0, %ecx -+ testl %ecx, %ecx -+ je L(loop) -+ VPCMPEQ %ymm7, %ymm4, %ymm0 -+ vpmovmskb %ymm0, %edi -+ testl %edi, %edi -+ je L(test_vec) -+ tzcntl %edi, %ecx -+# ifdef USE_AS_STRNCMP -+ cmpq %rcx, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %edi -+ cmpl (%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %edi -+ cmpl (%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(test_vec): -+# ifdef USE_AS_STRNCMP -+ /* The first vector matched. Return 0 if the maximum offset -+ (%r11) <= VEC_SIZE. */ -+ cmpq $VEC_SIZE, %r11 -+ jbe L(zero) -+# endif -+ VPCMPEQ %ymm7, %ymm1, %ymm1 -+ vpmovmskb %ymm1, %ecx -+ testl %ecx, %ecx -+ je L(test_2_vec) -+ tzcntl %ecx, %edi -+# ifdef USE_AS_STRNCMP -+ addq $VEC_SIZE, %rdi -+ cmpq %rdi, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rdi), %ecx -+ cmpl (%rdx, %rdi), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rdi), %eax -+ movzbl (%rdx, %rdi), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl VEC_SIZE(%rsi, %rdi), %ecx -+ cmpl VEC_SIZE(%rdx, %rdi), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl VEC_SIZE(%rax, %rdi), %eax -+ movzbl VEC_SIZE(%rdx, %rdi), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(test_2_vec): -+# ifdef USE_AS_STRNCMP -+ /* The first 2 vectors matched. Return 0 if the maximum offset -+ (%r11) <= 2 * VEC_SIZE. */ -+ cmpq $(VEC_SIZE * 2), %r11 -+ jbe L(zero) -+# endif -+ VPCMPEQ %ymm7, %ymm5, %ymm5 -+ vpmovmskb %ymm5, %ecx -+ testl %ecx, %ecx -+ je L(test_3_vec) -+ tzcntl %ecx, %edi -+# ifdef USE_AS_STRNCMP -+ addq $(VEC_SIZE * 2), %rdi -+ cmpq %rdi, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rdi), %ecx -+ cmpl (%rdx, %rdi), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rdi), %eax -+ movzbl (%rdx, %rdi), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx -+ cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax -+ movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(test_3_vec): -+# ifdef USE_AS_STRNCMP -+ /* The first 3 vectors matched. Return 0 if the maximum offset -+ (%r11) <= 3 * VEC_SIZE. 
*/ -+ cmpq $(VEC_SIZE * 3), %r11 -+ jbe L(zero) -+# endif -+ VPCMPEQ %ymm7, %ymm6, %ymm6 -+ vpmovmskb %ymm6, %esi -+ tzcntl %esi, %ecx -+# ifdef USE_AS_STRNCMP -+ addq $(VEC_SIZE * 3), %rcx -+ cmpq %rcx, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %esi -+ cmpl (%rdx, %rcx), %esi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (VEC_SIZE * 3)(%rsi, %rcx), %esi -+ cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi -+ jne L(wcscmp_return) -+# else -+ movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax -+ movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(loop_cross_page): -+ xorl %r10d, %r10d -+ movq %rdx, %rcx -+ /* Align load via RDX. We load the extra ECX bytes which should -+ be ignored. */ -+ andl $((VEC_SIZE * 4) - 1), %ecx -+ /* R10 is -RCX. */ -+ subq %rcx, %r10 -+ -+ /* This works only if VEC_SIZE * 2 == 64. */ -+# if (VEC_SIZE * 2) != 64 -+# error (VEC_SIZE * 2) != 64 -+# endif -+ -+ /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ -+ cmpl $(VEC_SIZE * 2), %ecx -+ jge L(loop_cross_page_2_vec) -+ -+ vmovdqu (%rax, %r10), %ymm2 -+ vmovdqu VEC_SIZE(%rax, %r10), %ymm3 -+ VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 -+ VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 -+ VPMINU %ymm2, %ymm0, %ymm0 -+ VPMINU %ymm3, %ymm1, %ymm1 -+ VPCMPEQ %ymm7, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, %ymm1, %ymm1 -+ -+ vpmovmskb %ymm0, %edi -+ vpmovmskb %ymm1, %esi -+ -+ salq $32, %rsi -+ xorq %rsi, %rdi -+ -+ /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ -+ shrq %cl, %rdi -+ -+ testq %rdi, %rdi -+ je L(loop_cross_page_2_vec) -+ tzcntq %rdi, %rcx -+# ifdef USE_AS_STRNCMP -+ cmpq %rcx, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %edi -+ cmpl (%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %edi -+ cmpl (%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(loop_cross_page_2_vec): -+ /* The first VEC_SIZE * 2 bytes match or are ignored. */ -+ vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 -+ vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 -+ VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 -+ VPMINU %ymm2, %ymm5, %ymm5 -+ VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 -+ VPCMPEQ %ymm7, %ymm5, %ymm5 -+ VPMINU %ymm3, %ymm6, %ymm6 -+ VPCMPEQ %ymm7, %ymm6, %ymm6 -+ -+ vpmovmskb %ymm5, %edi -+ vpmovmskb %ymm6, %esi -+ -+ salq $32, %rsi -+ xorq %rsi, %rdi -+ -+ xorl %r8d, %r8d -+ /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ -+ subl $(VEC_SIZE * 2), %ecx -+ jle 1f -+ /* Skip ECX bytes. */ -+ shrq %cl, %rdi -+ /* R8 has number of bytes skipped. */ -+ movl %ecx, %r8d -+1: -+ /* Before jumping back to the loop, set ESI to the number of -+ VEC_SIZE * 4 blocks before page crossing. */ -+ movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi -+ -+ testq %rdi, %rdi -+# ifdef USE_AS_STRNCMP -+ /* At this point, if %rdi value is 0, it already tested -+ VEC_SIZE*4+%r10 byte starting from %rax. This label -+ checks whether strncmp maximum offset reached or not. 
*/ -+ je L(string_nbyte_offset_check) -+# else -+ je L(back_to_loop) -+# endif -+ tzcntq %rdi, %rcx -+ addq %r10, %rcx -+ /* Adjust for number of bytes skipped. */ -+ addq %r8, %rcx -+# ifdef USE_AS_STRNCMP -+ addq $(VEC_SIZE * 2), %rcx -+ subq %rcx, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %edi -+ cmpl (%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (VEC_SIZE * 2)(%rsi, %rcx), %edi -+ cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax -+ movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+# ifdef USE_AS_STRNCMP -+L(string_nbyte_offset_check): -+ leaq (VEC_SIZE * 4)(%r10), %r10 -+ cmpq %r10, %r11 -+ jbe L(zero) -+ jmp L(back_to_loop) -+# endif -+ -+ .p2align 4 -+L(cross_page_loop): -+ /* Check one byte/dword at a time. */ -+# ifdef USE_AS_WCSCMP -+ cmpl %ecx, %eax -+# else -+ subl %ecx, %eax -+# endif -+ jne L(different) -+ addl $SIZE_OF_CHAR, %edx -+ cmpl $(VEC_SIZE * 4), %edx -+ je L(main_loop_header) -+# ifdef USE_AS_STRNCMP -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+# ifdef USE_AS_WCSCMP -+ movl (%rdi, %rdx), %eax -+ movl (%rsi, %rdx), %ecx -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %ecx -+# endif -+ /* Check null char. */ -+ testl %eax, %eax -+ jne L(cross_page_loop) -+ /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED -+ comparisons. */ -+ subl %ecx, %eax -+# ifndef USE_AS_WCSCMP -+L(different): -+# endif -+ VZEROUPPER -+ ret -+ -+# ifdef USE_AS_WCSCMP -+ .p2align 4 -+L(different): -+ /* Use movl to avoid modifying EFLAGS. */ -+ movl $0, %eax -+ setl %al -+ negl %eax -+ orl $1, %eax -+ VZEROUPPER -+ ret -+# endif -+ -+# ifdef USE_AS_STRNCMP -+ .p2align 4 -+L(zero): -+ xorl %eax, %eax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(char0): -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi), %ecx -+ cmpl (%rsi), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rsi), %ecx -+ movzbl (%rdi), %eax -+ subl %ecx, %eax -+# endif -+ VZEROUPPER -+ ret -+# endif -+ -+ .p2align 4 -+L(last_vector): -+ addq %rdx, %rdi -+ addq %rdx, %rsi -+# ifdef USE_AS_STRNCMP -+ subq %rdx, %r11 -+# endif -+ tzcntl %ecx, %edx -+# ifdef USE_AS_STRNCMP -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi, %rdx), %ecx -+ cmpl (%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+ -+ /* Comparing on page boundary region requires special treatment: -+ It must done one vector at the time, starting with the wider -+ ymm vector if possible, if not, with xmm. If fetching 16 bytes -+ (xmm) still passes the boundary, byte comparison must be done. -+ */ -+ .p2align 4 -+L(cross_page): -+ /* Try one ymm vector at a time. */ -+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax -+ jg L(cross_page_1_vector) -+L(loop_1_vector): -+ vmovdqu (%rdi, %rdx), %ymm1 -+ VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 -+ VPMINU %ymm1, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, %ymm0, %ymm0 -+ vpmovmskb %ymm0, %ecx -+ testl %ecx, %ecx -+ jne L(last_vector) -+ -+ addl $VEC_SIZE, %edx -+ -+ addl $VEC_SIZE, %eax -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the current offset (%rdx) >= the maximum offset -+ (%r11). 
*/ -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax -+ jle L(loop_1_vector) -+L(cross_page_1_vector): -+ /* Less than 32 bytes to check, try one xmm vector. */ -+ cmpl $(PAGE_SIZE - 16), %eax -+ jg L(cross_page_1_xmm) -+ vmovdqu (%rdi, %rdx), %xmm1 -+ VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 -+ VPMINU %xmm1, %xmm0, %xmm0 -+ VPCMPEQ %xmm7, %xmm0, %xmm0 -+ vpmovmskb %xmm0, %ecx -+ testl %ecx, %ecx -+ jne L(last_vector) -+ -+ addl $16, %edx -+# ifndef USE_AS_WCSCMP -+ addl $16, %eax -+# endif -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the current offset (%rdx) >= the maximum offset -+ (%r11). */ -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+ -+L(cross_page_1_xmm): -+# ifndef USE_AS_WCSCMP -+ /* Less than 16 bytes to check, try 8 byte vector. NB: No need -+ for wcscmp nor wcsncmp since wide char is 4 bytes. */ -+ cmpl $(PAGE_SIZE - 8), %eax -+ jg L(cross_page_8bytes) -+ vmovq (%rdi, %rdx), %xmm1 -+ vmovq (%rsi, %rdx), %xmm0 -+ VPCMPEQ %xmm0, %xmm1, %xmm0 -+ VPMINU %xmm1, %xmm0, %xmm0 -+ VPCMPEQ %xmm7, %xmm0, %xmm0 -+ vpmovmskb %xmm0, %ecx -+ /* Only last 8 bits are valid. */ -+ andl $0xff, %ecx -+ testl %ecx, %ecx -+ jne L(last_vector) -+ -+ addl $8, %edx -+ addl $8, %eax -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the current offset (%rdx) >= the maximum offset -+ (%r11). */ -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+ -+L(cross_page_8bytes): -+ /* Less than 8 bytes to check, try 4 byte vector. */ -+ cmpl $(PAGE_SIZE - 4), %eax -+ jg L(cross_page_4bytes) -+ vmovd (%rdi, %rdx), %xmm1 -+ vmovd (%rsi, %rdx), %xmm0 -+ VPCMPEQ %xmm0, %xmm1, %xmm0 -+ VPMINU %xmm1, %xmm0, %xmm0 -+ VPCMPEQ %xmm7, %xmm0, %xmm0 -+ vpmovmskb %xmm0, %ecx -+ /* Only last 4 bits are valid. */ -+ andl $0xf, %ecx -+ testl %ecx, %ecx -+ jne L(last_vector) -+ -+ addl $4, %edx -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the current offset (%rdx) >= the maximum offset -+ (%r11). */ -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+ -+L(cross_page_4bytes): -+# endif -+ /* Less than 4 bytes to check, try one byte/dword at a time. */ -+# ifdef USE_AS_STRNCMP -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+# ifdef USE_AS_WCSCMP -+ movl (%rdi, %rdx), %eax -+ movl (%rsi, %rdx), %ecx -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %ecx -+# endif -+ testl %eax, %eax -+ jne L(cross_page_loop) -+ subl %ecx, %eax -+ VZEROUPPER -+ ret -+END (STRCMP) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S -new file mode 100644 -index 000000000..809a9ac00 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S -@@ -0,0 +1,1046 @@ -+/* strcpy with AVX2 -+ Copyright (C) 2011-2020 Free Software Foundation, Inc. -+ Contributed by Intel Corporation. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+ -+# ifndef USE_AS_STRCAT -+ -+# ifndef STRCPY -+# define STRCPY strcpy_avx2 -+# endif -+ -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+/* Number of bytes in a vector register */ -+# ifndef VEC_SIZE -+# define VEC_SIZE 32 -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+/* zero register */ -+#define xmmZ xmm0 -+#define ymmZ ymm0 -+ -+/* mask register */ -+#define ymmM ymm1 -+ -+# ifndef USE_AS_STRCAT -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRCPY) -+# ifdef USE_AS_STRNCPY -+ mov %RDX_LP, %R8_LP -+ test %R8_LP, %R8_LP -+ jz L(ExitZero) -+# endif -+ mov %rsi, %rcx -+# ifndef USE_AS_STPCPY -+ mov %rdi, %rax /* save result */ -+# endif -+ -+# endif -+ -+ vpxor %xmmZ, %xmmZ, %xmmZ -+ -+ and $((VEC_SIZE * 4) - 1), %ecx -+ cmp $(VEC_SIZE * 2), %ecx -+ jbe L(SourceStringAlignmentLessTwoVecSize) -+ -+ and $-VEC_SIZE, %rsi -+ and $(VEC_SIZE - 1), %ecx -+ -+ vpcmpeqb (%rsi), %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ shr %cl, %rdx -+ -+# ifdef USE_AS_STRNCPY -+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT -+ mov $VEC_SIZE, %r10 -+ sub %rcx, %r10 -+ cmp %r10, %r8 -+# else -+ mov $(VEC_SIZE + 1), %r10 -+ sub %rcx, %r10 -+ cmp %r10, %r8 -+# endif -+ jbe L(CopyVecSizeTailCase2OrCase3) -+# endif -+ test %edx, %edx -+ jnz L(CopyVecSizeTail) -+ -+ vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2 -+ vpmovmskb %ymm2, %edx -+ -+# ifdef USE_AS_STRNCPY -+ add $VEC_SIZE, %r10 -+ cmp %r10, %r8 -+ jbe L(CopyTwoVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+ jnz L(CopyTwoVecSize) -+ -+ vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */ -+ vmovdqu %ymm2, (%rdi) -+ -+/* If source address alignment != destination address alignment */ -+ .p2align 4 -+L(UnalignVecSizeBoth): -+ sub %rcx, %rdi -+# ifdef USE_AS_STRNCPY -+ add %rcx, %r8 -+ sbb %rcx, %rcx -+ or %rcx, %r8 -+# endif -+ mov $VEC_SIZE, %rcx -+ vmovdqa (%rsi, %rcx), %ymm2 -+ vmovdqu %ymm2, (%rdi, %rcx) -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 -+ vpcmpeqb %ymm2, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $(VEC_SIZE * 3), %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec2) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqu %ymm2, (%rdi, %rcx) -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 -+ vpcmpeqb %ymm3, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec3) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqu %ymm3, (%rdi, %rcx) -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4 -+ vpcmpeqb %ymm4, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec4) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqu %ymm4, (%rdi, %rcx) -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 
-+ vpcmpeqb %ymm2, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec2) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqu %ymm2, (%rdi, %rcx) -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 -+ vpcmpeqb %ymm2, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec2) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 -+ vmovdqu %ymm2, (%rdi, %rcx) -+ vpcmpeqb %ymm3, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec3) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqu %ymm3, (%rdi, %rcx) -+ mov %rsi, %rdx -+ lea VEC_SIZE(%rsi, %rcx), %rsi -+ and $-(VEC_SIZE * 4), %rsi -+ sub %rsi, %rdx -+ sub %rdx, %rdi -+# ifdef USE_AS_STRNCPY -+ lea (VEC_SIZE * 8)(%r8, %rdx), %r8 -+# endif -+L(UnalignedFourVecSizeLoop): -+ vmovdqa (%rsi), %ymm4 -+ vmovdqa VEC_SIZE(%rsi), %ymm5 -+ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 -+ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 -+ vpminub %ymm5, %ymm4, %ymm2 -+ vpminub %ymm7, %ymm6, %ymm3 -+ vpminub %ymm2, %ymm3, %ymm3 -+ vpcmpeqb %ymmM, %ymm3, %ymm3 -+ vpmovmskb %ymm3, %edx -+# ifdef USE_AS_STRNCPY -+ sub $(VEC_SIZE * 4), %r8 -+ jbe L(UnalignedLeaveCase2OrCase3) -+# endif -+ test %edx, %edx -+ jnz L(UnalignedFourVecSizeLeave) -+ -+L(UnalignedFourVecSizeLoop_start): -+ add $(VEC_SIZE * 4), %rdi -+ add $(VEC_SIZE * 4), %rsi -+ vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi) -+ vmovdqa (%rsi), %ymm4 -+ vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi) -+ vmovdqa VEC_SIZE(%rsi), %ymm5 -+ vpminub %ymm5, %ymm4, %ymm2 -+ vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi) -+ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 -+ vmovdqu %ymm7, -VEC_SIZE(%rdi) -+ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 -+ vpminub %ymm7, %ymm6, %ymm3 -+ vpminub %ymm2, %ymm3, %ymm3 -+ vpcmpeqb %ymmM, %ymm3, %ymm3 -+ vpmovmskb %ymm3, %edx -+# ifdef USE_AS_STRNCPY -+ sub $(VEC_SIZE * 4), %r8 -+ jbe L(UnalignedLeaveCase2OrCase3) -+# endif -+ test %edx, %edx -+ jz L(UnalignedFourVecSizeLoop_start) -+ -+L(UnalignedFourVecSizeLeave): -+ vpcmpeqb %ymm4, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ test %edx, %edx -+ jnz L(CopyVecSizeUnaligned_0) -+ -+ vpcmpeqb %ymm5, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %ecx -+ test %ecx, %ecx -+ jnz L(CopyVecSizeUnaligned_16) -+ -+ vpcmpeqb %ymm6, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ test %edx, %edx -+ jnz L(CopyVecSizeUnaligned_32) -+ -+ vpcmpeqb %ymm7, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %ecx -+ bsf %ecx, %edx -+ vmovdqu %ymm4, (%rdi) -+ vmovdqu %ymm5, VEC_SIZE(%rdi) -+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+# ifdef USE_AS_STPCPY -+ lea (VEC_SIZE * 3)(%rdi, %rdx), %rax -+# endif -+ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) -+ add $(VEC_SIZE - 1), %r8 -+ sub %rdx, %r8 -+ lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi -+ jmp L(StrncpyFillTailWithZero) -+# else -+ add $(VEC_SIZE * 3), %rsi -+ add $(VEC_SIZE * 3), %rdi -+ jmp L(CopyVecSizeExit) -+# endif -+ -+/* If source address alignment == destination address alignment */ -+ 
-+L(SourceStringAlignmentLessTwoVecSize): -+ vmovdqu (%rsi), %ymm3 -+ vmovdqu VEC_SIZE(%rsi), %ymm2 -+ vpcmpeqb %ymm3, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ -+# ifdef USE_AS_STRNCPY -+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT -+ cmp $VEC_SIZE, %r8 -+# else -+ cmp $(VEC_SIZE + 1), %r8 -+# endif -+ jbe L(CopyVecSizeTail1Case2OrCase3) -+# endif -+ test %edx, %edx -+ jnz L(CopyVecSizeTail1) -+ -+ vmovdqu %ymm3, (%rdi) -+ vpcmpeqb %ymm2, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ -+# ifdef USE_AS_STRNCPY -+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT -+ cmp $(VEC_SIZE * 2), %r8 -+# else -+ cmp $((VEC_SIZE * 2) + 1), %r8 -+# endif -+ jbe L(CopyTwoVecSize1Case2OrCase3) -+# endif -+ test %edx, %edx -+ jnz L(CopyTwoVecSize1) -+ -+ and $-VEC_SIZE, %rsi -+ and $(VEC_SIZE - 1), %ecx -+ jmp L(UnalignVecSizeBoth) -+ -+/*------End of main part with loops---------------------*/ -+ -+/* Case1 */ -+ -+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) -+ .p2align 4 -+L(CopyVecSize): -+ add %rcx, %rdi -+# endif -+L(CopyVecSizeTail): -+ add %rcx, %rsi -+L(CopyVecSizeTail1): -+ bsf %edx, %edx -+L(CopyVecSizeExit): -+ cmp $32, %edx -+ jae L(Exit32_63) -+ cmp $16, %edx -+ jae L(Exit16_31) -+ cmp $8, %edx -+ jae L(Exit8_15) -+ cmp $4, %edx -+ jae L(Exit4_7) -+ cmp $3, %edx -+ je L(Exit3) -+ cmp $1, %edx -+ ja L(Exit2) -+ je L(Exit1) -+ movb $0, (%rdi) -+# ifdef USE_AS_STPCPY -+ lea (%rdi), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub $1, %r8 -+ lea 1(%rdi), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(CopyTwoVecSize1): -+ add $VEC_SIZE, %rsi -+ add $VEC_SIZE, %rdi -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub $VEC_SIZE, %r8 -+# endif -+ jmp L(CopyVecSizeTail1) -+ -+ .p2align 4 -+L(CopyTwoVecSize): -+ bsf %edx, %edx -+ add %rcx, %rsi -+ add $VEC_SIZE, %edx -+ sub %ecx, %edx -+ jmp L(CopyVecSizeExit) -+ -+ .p2align 4 -+L(CopyVecSizeUnaligned_0): -+ bsf %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), %rax -+# endif -+ vmovdqu %ymm4, (%rdi) -+ add $((VEC_SIZE * 4) - 1), %r8 -+ sub %rdx, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ jmp L(StrncpyFillTailWithZero) -+# else -+ jmp L(CopyVecSizeExit) -+# endif -+ -+ .p2align 4 -+L(CopyVecSizeUnaligned_16): -+ bsf %ecx, %edx -+ vmovdqu %ymm4, (%rdi) -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+# ifdef USE_AS_STPCPY -+ lea VEC_SIZE(%rdi, %rdx), %rax -+# endif -+ vmovdqu %ymm5, VEC_SIZE(%rdi) -+ add $((VEC_SIZE * 3) - 1), %r8 -+ sub %rdx, %r8 -+ lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi -+ jmp L(StrncpyFillTailWithZero) -+# else -+ add $VEC_SIZE, %rsi -+ add $VEC_SIZE, %rdi -+ jmp L(CopyVecSizeExit) -+# endif -+ -+ .p2align 4 -+L(CopyVecSizeUnaligned_32): -+ bsf %edx, %edx -+ vmovdqu %ymm4, (%rdi) -+ vmovdqu %ymm5, VEC_SIZE(%rdi) -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+# ifdef USE_AS_STPCPY -+ lea (VEC_SIZE * 2)(%rdi, %rdx), %rax -+# endif -+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) -+ add $((VEC_SIZE * 2) - 1), %r8 -+ sub %rdx, %r8 -+ lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi -+ jmp L(StrncpyFillTailWithZero) -+# else -+ add $(VEC_SIZE * 2), %rsi -+ add $(VEC_SIZE * 2), %rdi -+ jmp L(CopyVecSizeExit) -+# endif -+ -+# ifdef USE_AS_STRNCPY -+# ifndef USE_AS_STRCAT -+ .p2align 4 -+L(CopyVecSizeUnalignedVec6): -+ vmovdqu %ymm6, (%rdi, %rcx) -+ jmp L(CopyVecSizeVecExit) -+ -+ .p2align 4 -+L(CopyVecSizeUnalignedVec5): -+ vmovdqu %ymm5, (%rdi, %rcx) -+ jmp L(CopyVecSizeVecExit) -+ -+ 
.p2align 4 -+L(CopyVecSizeUnalignedVec4): -+ vmovdqu %ymm4, (%rdi, %rcx) -+ jmp L(CopyVecSizeVecExit) -+ -+ .p2align 4 -+L(CopyVecSizeUnalignedVec3): -+ vmovdqu %ymm3, (%rdi, %rcx) -+ jmp L(CopyVecSizeVecExit) -+# endif -+ -+/* Case2 */ -+ -+ .p2align 4 -+L(CopyVecSizeCase2): -+ add $VEC_SIZE, %r8 -+ add %rcx, %rdi -+ add %rcx, %rsi -+ bsf %edx, %edx -+ cmp %r8d, %edx -+ jb L(CopyVecSizeExit) -+ jmp L(StrncpyExit) -+ -+ .p2align 4 -+L(CopyTwoVecSizeCase2): -+ add %rcx, %rsi -+ bsf %edx, %edx -+ add $VEC_SIZE, %edx -+ sub %ecx, %edx -+ cmp %r8d, %edx -+ jb L(CopyVecSizeExit) -+ jmp L(StrncpyExit) -+ -+L(CopyVecSizeTailCase2): -+ add %rcx, %rsi -+ bsf %edx, %edx -+ cmp %r8d, %edx -+ jb L(CopyVecSizeExit) -+ jmp L(StrncpyExit) -+ -+L(CopyVecSizeTail1Case2): -+ bsf %edx, %edx -+ cmp %r8d, %edx -+ jb L(CopyVecSizeExit) -+ jmp L(StrncpyExit) -+ -+/* Case2 or Case3, Case3 */ -+ -+ .p2align 4 -+L(CopyVecSizeCase2OrCase3): -+ test %rdx, %rdx -+ jnz L(CopyVecSizeCase2) -+L(CopyVecSizeCase3): -+ add $VEC_SIZE, %r8 -+ add %rcx, %rdi -+ add %rcx, %rsi -+ jmp L(StrncpyExit) -+ -+ .p2align 4 -+L(CopyTwoVecSizeCase2OrCase3): -+ test %rdx, %rdx -+ jnz L(CopyTwoVecSizeCase2) -+ add %rcx, %rsi -+ jmp L(StrncpyExit) -+ -+ .p2align 4 -+L(CopyVecSizeTailCase2OrCase3): -+ test %rdx, %rdx -+ jnz L(CopyVecSizeTailCase2) -+ add %rcx, %rsi -+ jmp L(StrncpyExit) -+ -+ .p2align 4 -+L(CopyTwoVecSize1Case2OrCase3): -+ add $VEC_SIZE, %rdi -+ add $VEC_SIZE, %rsi -+ sub $VEC_SIZE, %r8 -+L(CopyVecSizeTail1Case2OrCase3): -+ test %rdx, %rdx -+ jnz L(CopyVecSizeTail1Case2) -+ jmp L(StrncpyExit) -+# endif -+ -+/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ -+ -+ .p2align 4 -+L(Exit1): -+ movzwl (%rsi), %edx -+ mov %dx, (%rdi) -+# ifdef USE_AS_STPCPY -+ lea 1(%rdi), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub $2, %r8 -+ lea 2(%rdi), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit2): -+ movzwl (%rsi), %ecx -+ mov %cx, (%rdi) -+ movb $0, 2(%rdi) -+# ifdef USE_AS_STPCPY -+ lea 2(%rdi), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub $3, %r8 -+ lea 3(%rdi), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit3): -+ mov (%rsi), %edx -+ mov %edx, (%rdi) -+# ifdef USE_AS_STPCPY -+ lea 3(%rdi), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub $4, %r8 -+ lea 4(%rdi), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit4_7): -+ mov (%rsi), %ecx -+ mov %ecx, (%rdi) -+ mov -3(%rsi, %rdx), %ecx -+ mov %ecx, -3(%rdi, %rdx) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub %rdx, %r8 -+ sub $1, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit8_15): -+ mov (%rsi), %rcx -+ mov -7(%rsi, %rdx), %r9 -+ mov %rcx, (%rdi) -+ mov %r9, -7(%rdi, %rdx) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub %rdx, %r8 -+ sub $1, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit16_31): -+ vmovdqu (%rsi), %xmm2 -+ vmovdqu -15(%rsi, %rdx), %xmm3 -+ vmovdqu %xmm2, (%rdi) -+ vmovdqu %xmm3, -15(%rdi, %rdx) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined 
USE_AS_STRCAT -+ sub %rdx, %r8 -+ sub $1, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit32_63): -+ vmovdqu (%rsi), %ymm2 -+ vmovdqu -31(%rsi, %rdx), %ymm3 -+ vmovdqu %ymm2, (%rdi) -+ vmovdqu %ymm3, -31(%rdi, %rdx) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub %rdx, %r8 -+ sub $1, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+# ifdef USE_AS_STRNCPY -+ -+ .p2align 4 -+L(StrncpyExit1): -+ movzbl (%rsi), %edx -+ mov %dl, (%rdi) -+# ifdef USE_AS_STPCPY -+ lea 1(%rdi), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, 1(%rdi) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit2): -+ movzwl (%rsi), %edx -+ mov %dx, (%rdi) -+# ifdef USE_AS_STPCPY -+ lea 2(%rdi), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, 2(%rdi) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit3_4): -+ movzwl (%rsi), %ecx -+ movzwl -2(%rsi, %r8), %edx -+ mov %cx, (%rdi) -+ mov %dx, -2(%rdi, %r8) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %r8), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi, %r8) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit5_8): -+ mov (%rsi), %ecx -+ mov -4(%rsi, %r8), %edx -+ mov %ecx, (%rdi) -+ mov %edx, -4(%rdi, %r8) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %r8), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi, %r8) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit9_16): -+ mov (%rsi), %rcx -+ mov -8(%rsi, %r8), %rdx -+ mov %rcx, (%rdi) -+ mov %rdx, -8(%rdi, %r8) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %r8), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi, %r8) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit17_32): -+ vmovdqu (%rsi), %xmm2 -+ vmovdqu -16(%rsi, %r8), %xmm3 -+ vmovdqu %xmm2, (%rdi) -+ vmovdqu %xmm3, -16(%rdi, %r8) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %r8), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi, %r8) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit33_64): -+ /* 0/32, 31/16 */ -+ vmovdqu (%rsi), %ymm2 -+ vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3 -+ vmovdqu %ymm2, (%rdi) -+ vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %r8), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi, %r8) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit65): -+ /* 0/32, 32/32, 64/1 */ -+ vmovdqu (%rsi), %ymm2 -+ vmovdqu 32(%rsi), %ymm3 -+ mov 64(%rsi), %cl -+ vmovdqu %ymm2, (%rdi) -+ vmovdqu %ymm3, 32(%rdi) -+ mov %cl, 64(%rdi) -+# ifdef USE_AS_STPCPY -+ lea 65(%rdi), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, 65(%rdi) -+# endif -+ VZEROUPPER -+ ret -+ -+# ifndef USE_AS_STRCAT -+ -+ .p2align 4 -+L(Fill1): -+ mov %dl, (%rdi) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Fill2): -+ mov %dx, (%rdi) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Fill3_4): -+ mov %dx, (%rdi) -+ mov %dx, -2(%rdi, %r8) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Fill5_8): -+ mov %edx, (%rdi) -+ mov %edx, -4(%rdi, %r8) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Fill9_16): -+ mov %rdx, (%rdi) -+ mov %rdx, -8(%rdi, %r8) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Fill17_32): -+ vmovdqu %xmmZ, (%rdi) -+ vmovdqu %xmmZ, -16(%rdi, %r8) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(CopyVecSizeUnalignedVec2): -+ vmovdqu %ymm2, (%rdi, %rcx) -+ -+ .p2align 4 -+L(CopyVecSizeVecExit): -+ bsf %edx, %edx -+ add $(VEC_SIZE - 1), %r8 -+ add %rcx, %rdi -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), 
%rax -+# endif -+ sub %rdx, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ -+ .p2align 4 -+L(StrncpyFillTailWithZero): -+ xor %edx, %edx -+ sub $VEC_SIZE, %r8 -+ jbe L(StrncpyFillExit) -+ -+ vmovdqu %ymmZ, (%rdi) -+ add $VEC_SIZE, %rdi -+ -+ mov %rdi, %rsi -+ and $(VEC_SIZE - 1), %esi -+ sub %rsi, %rdi -+ add %rsi, %r8 -+ sub $(VEC_SIZE * 4), %r8 -+ jb L(StrncpyFillLessFourVecSize) -+ -+L(StrncpyFillLoopVmovdqa): -+ vmovdqa %ymmZ, (%rdi) -+ vmovdqa %ymmZ, VEC_SIZE(%rdi) -+ vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi) -+ vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi) -+ add $(VEC_SIZE * 4), %rdi -+ sub $(VEC_SIZE * 4), %r8 -+ jae L(StrncpyFillLoopVmovdqa) -+ -+L(StrncpyFillLessFourVecSize): -+ add $(VEC_SIZE * 2), %r8 -+ jl L(StrncpyFillLessTwoVecSize) -+ vmovdqa %ymmZ, (%rdi) -+ vmovdqa %ymmZ, VEC_SIZE(%rdi) -+ add $(VEC_SIZE * 2), %rdi -+ sub $VEC_SIZE, %r8 -+ jl L(StrncpyFillExit) -+ vmovdqa %ymmZ, (%rdi) -+ add $VEC_SIZE, %rdi -+ jmp L(Fill) -+ -+ .p2align 4 -+L(StrncpyFillLessTwoVecSize): -+ add $VEC_SIZE, %r8 -+ jl L(StrncpyFillExit) -+ vmovdqa %ymmZ, (%rdi) -+ add $VEC_SIZE, %rdi -+ jmp L(Fill) -+ -+ .p2align 4 -+L(StrncpyFillExit): -+ add $VEC_SIZE, %r8 -+L(Fill): -+ cmp $17, %r8d -+ jae L(Fill17_32) -+ cmp $9, %r8d -+ jae L(Fill9_16) -+ cmp $5, %r8d -+ jae L(Fill5_8) -+ cmp $3, %r8d -+ jae L(Fill3_4) -+ cmp $1, %r8d -+ ja L(Fill2) -+ je L(Fill1) -+ VZEROUPPER -+ ret -+ -+/* end of ifndef USE_AS_STRCAT */ -+# endif -+ -+ .p2align 4 -+L(UnalignedLeaveCase2OrCase3): -+ test %rdx, %rdx -+ jnz L(UnalignedFourVecSizeLeaveCase2) -+L(UnalignedFourVecSizeLeaveCase3): -+ lea (VEC_SIZE * 4)(%r8), %rcx -+ and $-VEC_SIZE, %rcx -+ add $(VEC_SIZE * 3), %r8 -+ jl L(CopyVecSizeCase3) -+ vmovdqu %ymm4, (%rdi) -+ sub $VEC_SIZE, %r8 -+ jb L(CopyVecSizeCase3) -+ vmovdqu %ymm5, VEC_SIZE(%rdi) -+ sub $VEC_SIZE, %r8 -+ jb L(CopyVecSizeCase3) -+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) -+ sub $VEC_SIZE, %r8 -+ jb L(CopyVecSizeCase3) -+ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) -+# ifdef USE_AS_STPCPY -+ lea (VEC_SIZE * 4)(%rdi), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (VEC_SIZE * 4)(%rdi) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(UnalignedFourVecSizeLeaveCase2): -+ xor %ecx, %ecx -+ vpcmpeqb %ymm4, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $(VEC_SIZE * 3), %r8 -+ jle L(CopyVecSizeCase2OrCase3) -+ test %edx, %edx -+# ifndef USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec4) -+# else -+ jnz L(CopyVecSize) -+# endif -+ vpcmpeqb %ymm5, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ vmovdqu %ymm4, (%rdi) -+ add $VEC_SIZE, %rcx -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+ test %edx, %edx -+# ifndef USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec5) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vpcmpeqb %ymm6, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ vmovdqu %ymm5, VEC_SIZE(%rdi) -+ add $VEC_SIZE, %rcx -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+ test %edx, %edx -+# ifndef USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec6) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vpcmpeqb %ymm7, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) -+ lea VEC_SIZE(%rdi, %rcx), %rdi -+ lea VEC_SIZE(%rsi, %rcx), %rsi -+ bsf %edx, %edx -+ cmp %r8d, %edx -+ jb L(CopyVecSizeExit) -+L(StrncpyExit): -+ cmp $65, %r8d -+ je L(StrncpyExit65) -+ cmp $33, %r8d -+ jae L(StrncpyExit33_64) -+ cmp $17, %r8d -+ jae L(StrncpyExit17_32) -+ cmp $9, %r8d -+ jae L(StrncpyExit9_16) -+ cmp $5, %r8d -+ jae L(StrncpyExit5_8) -+ cmp $3, %r8d -+ jae L(StrncpyExit3_4) -+ cmp $1, %r8d -+ ja L(StrncpyExit2) -+ je L(StrncpyExit1) -+# 
ifdef USE_AS_STPCPY -+ mov %rdi, %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(ExitZero): -+# ifndef USE_AS_STRCAT -+ mov %rdi, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+# endif -+ -+# ifndef USE_AS_STRCAT -+END (STRCPY) -+# else -+END (STRCAT) -+# endif -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S -new file mode 100644 -index 000000000..912d771b4 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S -@@ -0,0 +1,418 @@ -+/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2. -+ Copyright (C) 2017-2020 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+# ifndef STRLEN -+# define STRLEN strlen_avx2 -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+# ifdef USE_AS_WCSLEN -+# define VPCMPEQ vpcmpeqd -+# define VPMINU vpminud -+# else -+# define VPCMPEQ vpcmpeqb -+# define VPMINU vpminub -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRLEN) -+# ifdef USE_AS_STRNLEN -+ /* Check for zero length. */ -+ test %RSI_LP, %RSI_LP -+ jz L(zero) -+# ifdef USE_AS_WCSLEN -+ shl $2, %RSI_LP -+# elif defined __ILP32__ -+ /* Clear the upper 32 bits. */ -+ movl %esi, %esi -+# endif -+ mov %RSI_LP, %R8_LP -+# endif -+ movl %edi, %ecx -+ movq %rdi, %rdx -+ vpxor %xmm0, %xmm0, %xmm0 -+ -+ /* Check if we may cross page boundary with one vector load. */ -+ andl $(2 * VEC_SIZE - 1), %ecx -+ cmpl $VEC_SIZE, %ecx -+ ja L(cros_page_boundary) -+ -+ /* Check the first VEC_SIZE bytes. */ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+# ifdef USE_AS_STRNLEN -+ jnz L(first_vec_x0_check) -+ /* Adjust length and check the end of data. */ -+ subq $VEC_SIZE, %rsi -+ jbe L(max) -+# else -+ jnz L(first_vec_x0) -+# endif -+ -+ /* Align data for aligned loads in the loop. */ -+ addq $VEC_SIZE, %rdi -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ -+# ifdef USE_AS_STRNLEN -+ /* Adjust length. */ -+ addq %rcx, %rsi -+ -+ subq $(VEC_SIZE * 4), %rsi -+ jbe L(last_4x_vec_or_less) -+# endif -+ jmp L(more_4x_vec) -+ -+ .p2align 4 -+L(cros_page_boundary): -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ /* Remove the leading bytes. 
*/ -+ sarl %cl, %eax -+ testl %eax, %eax -+ jz L(aligned_more) -+ tzcntl %eax, %eax -+# ifdef USE_AS_STRNLEN -+ /* Check the end of data. */ -+ cmpq %rax, %rsi -+ jbe L(max) -+# endif -+ addq %rdi, %rax -+ addq %rcx, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(aligned_more): -+# ifdef USE_AS_STRNLEN -+ /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" -+ with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" -+ to void possible addition overflow. */ -+ negq %rcx -+ addq $VEC_SIZE, %rcx -+ -+ /* Check the end of data. */ -+ subq %rcx, %rsi -+ jbe L(max) -+# endif -+ -+ addq $VEC_SIZE, %rdi -+ -+# ifdef USE_AS_STRNLEN -+ subq $(VEC_SIZE * 4), %rsi -+ jbe L(last_4x_vec_or_less) -+# endif -+ -+L(more_4x_vec): -+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time -+ since data is only aligned to VEC_SIZE. */ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x3) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+# ifdef USE_AS_STRNLEN -+ subq $(VEC_SIZE * 4), %rsi -+ jbe L(last_4x_vec_or_less) -+# endif -+ -+ /* Align data to 4 * VEC_SIZE. */ -+ movq %rdi, %rcx -+ andl $(4 * VEC_SIZE - 1), %ecx -+ andq $-(4 * VEC_SIZE), %rdi -+ -+# ifdef USE_AS_STRNLEN -+ /* Adjust length. */ -+ addq %rcx, %rsi -+# endif -+ -+ .p2align 4 -+L(loop_4x_vec): -+ /* Compare 4 * VEC at a time forward. */ -+ vmovdqa (%rdi), %ymm1 -+ vmovdqa VEC_SIZE(%rdi), %ymm2 -+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 -+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 -+ VPMINU %ymm1, %ymm2, %ymm5 -+ VPMINU %ymm3, %ymm4, %ymm6 -+ VPMINU %ymm5, %ymm6, %ymm5 -+ -+ VPCMPEQ %ymm5, %ymm0, %ymm5 -+ vpmovmskb %ymm5, %eax -+ testl %eax, %eax -+ jnz L(4x_vec_end) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+# ifndef USE_AS_STRNLEN -+ jmp L(loop_4x_vec) -+# else -+ subq $(VEC_SIZE * 4), %rsi -+ ja L(loop_4x_vec) -+ -+L(last_4x_vec_or_less): -+ /* Less than 4 * VEC and aligned to VEC_SIZE. */ -+ addl $(VEC_SIZE * 2), %esi -+ jle L(last_2x_vec) -+ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x2_check) -+ subl $VEC_SIZE, %esi -+ jle L(max) -+ -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x3_check) -+ movq %r8, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_2x_vec): -+ addl $(VEC_SIZE * 2), %esi -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x0_check) -+ subl $VEC_SIZE, %esi -+ jle L(max) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1_check) -+ movq %r8, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x0_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. 
*/ -+ cmpq %rax, %rsi -+ jbe L(max) -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x1_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rsi -+ jbe L(max) -+ addq $VEC_SIZE, %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x2_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rsi -+ jbe L(max) -+ addq $(VEC_SIZE * 2), %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x3_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rsi -+ jbe L(max) -+ addq $(VEC_SIZE * 3), %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(max): -+ movq %r8, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(zero): -+ xorl %eax, %eax -+ ret -+# endif -+ -+ .p2align 4 -+L(first_vec_x0): -+ tzcntl %eax, %eax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x1): -+ tzcntl %eax, %eax -+ addq $VEC_SIZE, %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x2): -+ tzcntl %eax, %eax -+ addq $(VEC_SIZE * 2), %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(4x_vec_end): -+ VPCMPEQ %ymm1, %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ VPCMPEQ %ymm2, %ymm0, %ymm2 -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ VPCMPEQ %ymm3, %ymm0, %ymm3 -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ VPCMPEQ %ymm4, %ymm0, %ymm4 -+ vpmovmskb %ymm4, %eax -+L(first_vec_x3): -+ tzcntl %eax, %eax -+ addq $(VEC_SIZE * 3), %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+END (STRLEN) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S -new file mode 100644 -index 000000000..71e1a46c2 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S -@@ -0,0 +1,3 @@ -+#define USE_AS_STRNCAT -+#define STRCAT strncat_avx2 -+#include "avx2-strcat-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S -new file mode 100644 -index 000000000..b21a19134 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S -@@ -0,0 +1,4 @@ -+#define STRCMP strncmp_avx2 -+#define USE_AS_STRNCMP 1 -+#include "avx_regs.h" -+#include "avx2-strcmp-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S -new file mode 100644 -index 000000000..7ad840667 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S -@@ -0,0 +1,4 @@ -+#define USE_AS_STRNCPY -+#define STRCPY strncpy_avx2 -+#include "avx_regs.h" -+#include "avx2-strcpy-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S -new file mode 100644 -index 000000000..22cc5c527 ---- /dev/null -+++ 
b/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S -@@ -0,0 +1,4 @@ -+#define STRLEN strnlen_avx2 -+#define USE_AS_STRNLEN 1 -+#include "avx_regs.h" -+#include "avx2-strlen-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S -new file mode 100644 -index 000000000..b3a65fbc6 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S -@@ -0,0 +1,258 @@ -+/* strrchr/wcsrchr optimized with AVX2. -+ Copyright (C) 2017-2020 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+ -+# ifndef STRRCHR -+# define STRRCHR strrchr_avx2 -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+# ifdef USE_AS_WCSRCHR -+# define VPBROADCAST vpbroadcastd -+# define VPCMPEQ vpcmpeqd -+# else -+# define VPBROADCAST vpbroadcastb -+# define VPCMPEQ vpcmpeqb -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRRCHR) -+ movd %esi, %xmm4 -+ movl %edi, %ecx -+ /* Broadcast CHAR to YMM4. */ -+ VPBROADCAST %xmm4, %ymm4 -+ vpxor %xmm0, %xmm0, %xmm0 -+ -+ /* Check if we may cross page boundary with one vector load. */ -+ andl $(2 * VEC_SIZE - 1), %ecx -+ cmpl $VEC_SIZE, %ecx -+ ja L(cros_page_boundary) -+ -+ vmovdqu (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %ecx -+ vpmovmskb %ymm3, %eax -+ addq $VEC_SIZE, %rdi -+ -+ testl %eax, %eax -+ jnz L(first_vec) -+ -+ testl %ecx, %ecx -+ jnz L(return_null) -+ -+ andq $-VEC_SIZE, %rdi -+ xorl %edx, %edx -+ jmp L(aligned_loop) -+ -+ .p2align 4 -+L(first_vec): -+ /* Check if there is a nul CHAR. */ -+ testl %ecx, %ecx -+ jnz L(char_and_nul_in_first_vec) -+ -+ /* Remember the match and keep searching. */ -+ movl %eax, %edx -+ movq %rdi, %rsi -+ andq $-VEC_SIZE, %rdi -+ jmp L(aligned_loop) -+ -+ .p2align 4 -+L(cros_page_boundary): -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ vmovdqa (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %edx -+ vpmovmskb %ymm3, %eax -+ shrl %cl, %edx -+ shrl %cl, %eax -+ addq $VEC_SIZE, %rdi -+ -+ /* Check if there is a CHAR. */ -+ testl %eax, %eax -+ jnz L(found_char) -+ -+ testl %edx, %edx -+ jnz L(return_null) -+ -+ jmp L(aligned_loop) -+ -+ .p2align 4 -+L(found_char): -+ testl %edx, %edx -+ jnz L(char_and_nul) -+ -+ /* Remember the match and keep searching. 
*/ -+ movl %eax, %edx -+ leaq (%rdi, %rcx), %rsi -+ -+ .p2align 4 -+L(aligned_loop): -+ vmovdqa (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ addq $VEC_SIZE, %rdi -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %ecx -+ vpmovmskb %ymm3, %eax -+ orl %eax, %ecx -+ jnz L(char_nor_null) -+ -+ vmovdqa (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ add $VEC_SIZE, %rdi -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %ecx -+ vpmovmskb %ymm3, %eax -+ orl %eax, %ecx -+ jnz L(char_nor_null) -+ -+ vmovdqa (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ addq $VEC_SIZE, %rdi -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %ecx -+ vpmovmskb %ymm3, %eax -+ orl %eax, %ecx -+ jnz L(char_nor_null) -+ -+ vmovdqa (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ addq $VEC_SIZE, %rdi -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %ecx -+ vpmovmskb %ymm3, %eax -+ orl %eax, %ecx -+ jz L(aligned_loop) -+ -+ .p2align 4 -+L(char_nor_null): -+ /* Find a CHAR or a nul CHAR in a loop. */ -+ testl %eax, %eax -+ jnz L(match) -+L(return_value): -+ testl %edx, %edx -+ jz L(return_null) -+ movl %edx, %eax -+ movq %rsi, %rdi -+ -+# ifdef USE_AS_WCSRCHR -+ /* Keep the first bit for each matching CHAR for bsr. */ -+ andl $0x11111111, %eax -+# endif -+ bsrl %eax, %eax -+ leaq -VEC_SIZE(%rdi, %rax), %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(match): -+ /* Find a CHAR. Check if there is a nul CHAR. */ -+ vpmovmskb %ymm2, %ecx -+ testl %ecx, %ecx -+ jnz L(find_nul) -+ -+ /* Remember the match and keep searching. */ -+ movl %eax, %edx -+ movq %rdi, %rsi -+ jmp L(aligned_loop) -+ -+ .p2align 4 -+L(find_nul): -+# ifdef USE_AS_WCSRCHR -+ /* Keep the first bit for each matching CHAR for bsr. */ -+ andl $0x11111111, %ecx -+ andl $0x11111111, %eax -+# endif -+ /* Mask out any matching bits after the nul CHAR. */ -+ movl %ecx, %r8d -+ subl $1, %r8d -+ xorl %ecx, %r8d -+ andl %r8d, %eax -+ testl %eax, %eax -+ /* If there is no CHAR here, return the remembered one. */ -+ jz L(return_value) -+ bsrl %eax, %eax -+ leaq -VEC_SIZE(%rdi, %rax), %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(char_and_nul): -+ /* Find both a CHAR and a nul CHAR. */ -+ addq %rcx, %rdi -+ movl %edx, %ecx -+L(char_and_nul_in_first_vec): -+# ifdef USE_AS_WCSRCHR -+ /* Keep the first bit for each matching CHAR for bsr. */ -+ andl $0x11111111, %ecx -+ andl $0x11111111, %eax -+# endif -+ /* Mask out any matching bits after the nul CHAR. */ -+ movl %ecx, %r8d -+ subl $1, %r8d -+ xorl %ecx, %r8d -+ andl %r8d, %eax -+ testl %eax, %eax -+ /* Return null pointer if the nul CHAR comes first. 
*/ -+ jz L(return_null) -+ bsrl %eax, %eax -+ leaq -VEC_SIZE(%rdi, %rax), %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(return_null): -+ xorl %eax, %eax -+ VZEROUPPER -+ ret -+ -+END (STRRCHR) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S -new file mode 100644 -index 000000000..b03124767 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S -@@ -0,0 +1,3 @@ -+#define STRCHR wcschr_avx2 -+#define USE_AS_WCSCHR 1 -+#include "avx2-strchr-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S -new file mode 100644 -index 000000000..bcbcd4ce7 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S -@@ -0,0 +1,4 @@ -+#define STRCMP wcscmp_avx2 -+#define USE_AS_WCSCMP 1 -+ -+#include "avx2-strcmp-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S -new file mode 100644 -index 000000000..f1b973572 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S -@@ -0,0 +1,4 @@ -+#define STRLEN wcslen_avx2 -+#define USE_AS_WCSLEN 1 -+ -+#include "avx2-strlen-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S -new file mode 100644 -index 000000000..7603169c1 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S -@@ -0,0 +1,6 @@ -+#define STRCMP wcsncmp_avx2 -+#define USE_AS_STRNCMP 1 -+#define USE_AS_WCSCMP 1 -+ -+#include "avx_regs.h" -+#include "avx2-strcmp-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S -new file mode 100644 -index 000000000..2095cd8e0 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S -@@ -0,0 +1,6 @@ -+#define STRLEN wcsnlen_avx2 -+#define USE_AS_WCSLEN 1 -+#define USE_AS_STRNLEN 1 -+ -+#include "avx_regs.h" -+#include "avx2-strlen-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S -new file mode 100644 -index 000000000..fbec1286c ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S -@@ -0,0 +1,3 @@ -+#define STRRCHR wcsrchr_avx2 -+#define USE_AS_WCSRCHR 1 -+#include "avx2-strrchr-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx_regs.h b/libc/arch-x86_64/kabylake/string/avx_regs.h -new file mode 100644 -index 000000000..223d97e3e ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx_regs.h -@@ -0,0 +1,26 @@ -+/* Long and pointer size in bytes. */ -+#define LP_SIZE 8 -+ -+/* Instruction to operate on long and pointer. */ -+#define LP_OP(insn) insn##q -+ -+/* Assembler address directive. */ -+#define ASM_ADDR .quad -+ -+/* Registers to hold long and pointer. 
*/ -+#define RAX_LP rax -+#define RBP_LP rbp -+#define RBX_LP rbx -+#define RCX_LP rcx -+#define RDI_LP rdi -+#define RDX_LP rdx -+#define RSI_LP rsi -+#define RSP_LP rsp -+#define R8_LP r8 -+#define R9_LP r9 -+#define R10_LP r10 -+#define R11_LP r11 -+#define R12_LP r12 -+#define R13_LP r13 -+#define R14_LP r14 -+#define R15_LP r15 -diff --git a/libc/arch-x86_64/include/cache.h b/libc/arch-x86_64/kabylake/string/cache.h -similarity index 100% -rename from libc/arch-x86_64/include/cache.h -rename to libc/arch-x86_64/kabylake/string/cache.h -diff --git a/libc/arch-x86_64/silvermont/string/cache.h b/libc/arch-x86_64/silvermont/string/cache.h -new file mode 100644 -index 000000000..3606d2a1a ---- /dev/null -+++ b/libc/arch-x86_64/silvermont/string/cache.h -@@ -0,0 +1,36 @@ -+/* -+Copyright (c) 2014, Intel Corporation -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ -+ * Redistributions of source code must retain the above copyright notice, -+ * this list of conditions and the following disclaimer. -+ -+ * Redistributions in binary form must reproduce the above copyright notice, -+ * this list of conditions and the following disclaimer in the documentation -+ * and/or other materials provided with the distribution. -+ -+ * Neither the name of Intel Corporation nor the names of its contributors -+ * may be used to endorse or promote products derived from this software -+ * without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*/ -+ -+/* Values are optimized for Silvermont */ -+#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */ -+#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */ -+ -+#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) -+#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) -diff --git a/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S -index 0ad2d44cf..ce15cdf1c 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S -@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - #define USE_AS_STPCPY --#define STRCPY stpcpy -+#define STRCPY stpcpy_generic - #include "sse2-strcpy-slm.S" -diff --git a/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S -index 30666850b..02b4df02d 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S -@@ -30,5 +30,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - #define USE_AS_STRNCPY - #define USE_AS_STPCPY --#define STRCPY stpncpy -+#define STRCPY stpncpy_generic - #include "sse2-strcpy-slm.S" -diff --git a/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S -index dd8207ff5..007adfe95 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S -@@ -29,7 +29,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - #ifndef STRCAT --# define STRCAT strcat -+# define STRCAT strcat_generic - #endif - - #ifndef L -diff --git a/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S -index 3e146bfbc..ade9eac4f 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S -@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #ifndef USE_AS_STRCAT - - # ifndef STRCPY --# define STRCPY strcpy -+# define STRCPY strcpy_generic - # endif - - # ifndef L -diff --git a/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S -index 3772fe770..df24f9de2 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S -@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #ifndef USE_AS_STRCAT - - #ifndef STRLEN --# define STRLEN strlen -+# define STRLEN strlen_generic - #endif - - #ifndef L -diff --git a/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S -index 6b4a43084..c5394f9d5 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S -@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - #define USE_AS_STRNCAT --#define STRCAT strncat -+#define STRCAT strncat_generic - #include "sse2-strcat-slm.S" -diff --git a/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S -index 594e78f74..2e8d68d12 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S -@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - #define USE_AS_STRNCPY --#define STRCPY strncpy -+#define STRCPY strncpy_generic - #include "sse2-strcpy-slm.S" -diff --git a/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S -index e8acd5ba4..fa2542f00 100644 ---- a/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S -+++ b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S -@@ -43,7 +43,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #else - #define UPDATE_STRNCMP_COUNTER - #ifndef STRCMP --#define STRCMP strcmp -+#define STRCMP strcmp_generic - #endif - #endif - -diff --git a/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S -index 0e4077517..5d20a483f 100644 ---- a/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S -+++ b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S -@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - - #define USE_AS_STRNCMP --#define STRCMP strncmp -+#define STRCMP strncmp_generic - #include "ssse3-strcmp-slm.S" -diff --git a/libc/arch-x86_64/static_function_dispatch.S b/libc/arch-x86_64/static_function_dispatch.S -index 979ce4f18..5c0f1f2ba 100644 ---- a/libc/arch-x86_64/static_function_dispatch.S -+++ b/libc/arch-x86_64/static_function_dispatch.S -@@ -38,6 +38,25 @@ FUNCTION_DELEGATE(__memset_chk, __memset_chk_generic) - FUNCTION_DELEGATE(memcmp, memcmp_generic) - FUNCTION_DELEGATE(memcpy, memmove_generic) - FUNCTION_DELEGATE(memmove, memmove_generic) --FUNCTION_DELEGATE(memchr, memchr_openbsd) --FUNCTION_DELEGATE(memrchr, memrchr_openbsd) --//FUNCTION_DELEGATE(wmemset, wmemset_freebsd) -+FUNCTION_DELEGATE(memchr, memchr_generic) -+FUNCTION_DELEGATE(memrchr, memrchr_generic) -+//FUNCTION_DELEGATE(wmemset, wmemset_generic) -+FUNCTION_DELEGATE(strcmp, strcmp_generic) -+FUNCTION_DELEGATE(strncmp, strncmp_generic) -+FUNCTION_DELEGATE(strcpy, strcpy_generic) -+FUNCTION_DELEGATE(strncpy, strncpy_generic) -+FUNCTION_DELEGATE(stpcpy, stpcpy_generic) -+FUNCTION_DELEGATE(stpncpy, stpncpy_generic) -+FUNCTION_DELEGATE(strlen, strlen_generic) -+FUNCTION_DELEGATE(strnlen, strnlen_generic) -+FUNCTION_DELEGATE(strchr, strchr_generic) -+FUNCTION_DELEGATE(strrchr, strrchr_generic) -+FUNCTION_DELEGATE(strcat, strcat_generic) -+FUNCTION_DELEGATE(strncat, strncat_generic) -+FUNCTION_DELEGATE(wcscmp, wcscmp_generic) -+FUNCTION_DELEGATE(wcsncmp, wcsncmp_generic) -+FUNCTION_DELEGATE(wcslen, wcslen_generic) -+FUNCTION_DELEGATE(wcsnlen, wcsnlen_generic) -+FUNCTION_DELEGATE(wcschr, wcschr_generic) -+FUNCTION_DELEGATE(wcsrchr, wcsrchr_generic) -+ --- -2.25.1 - diff --git a/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch b/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch deleted file mode 100644 index 6f47b3414b..0000000000 --- a/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch +++ /dev/null @@ -1,645 +0,0 @@ -From 05ace70e6407263d0bef91800005942a079058d6 Mon Sep 17 00:00:00 2001 -From: "Reddy, Alavala Srinivasa" -Date: Wed, 1 Nov 2023 18:43:18 +0530 -Subject: [PATCH 5/5] avx2 implementation for memmove api - -This patch includes handwritten avx2 assembly -implementation for memmove 64-bit. - -Test done: Build and boot is fine, Run the benchmarks suite. 
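For reference, the dispatch half of this memmove patch reduces to a CPU-feature probe that prefers memmove_avx2 over memmove_generic. A minimal standalone sketch of that selection logic (hypothetical names, a plain function pointer rather than bionic's actual DEFINE_IFUNC_FOR/RETURN_FUNC ifunc machinery) might look like:

    #include <stddef.h>
    #include <stdio.h>
    #include <string.h>

    typedef void* memmove_func(void* dst, const void* src, size_t n);

    /* Stand-ins for the patch's memmove_avx2 / memmove_generic assembly entry
       points; both simply defer to libc memmove so this sketch links. */
    static void* memmove_avx2_sketch(void* dst, const void* src, size_t n) {
      return memmove(dst, src, n);
    }
    static void* memmove_generic_sketch(void* dst, const void* src, size_t n) {
      return memmove(dst, src, n);
    }

    /* Same decision the patch adds to the memmove resolver: probe the CPU once
       and hand back the widest implementation it supports. */
    static memmove_func* choose_memmove(void) {
      __builtin_cpu_init();
      if (__builtin_cpu_supports("avx2")) return memmove_avx2_sketch;
      return memmove_generic_sketch;
    }

    int main(void) {
      char buf[] = "overlapping move";
      memmove_func* impl = choose_memmove();
      impl(buf + 4, buf, 8); /* overlapping ranges are what memmove exists for */
      printf("picked %s, buf=\"%s\"\n",
             impl == memmove_avx2_sketch ? "avx2" : "generic", buf);
      return 0;
    }

In the patch itself the choice is made once at load time through bionic's ifunc resolver, so ordinary memmove calls never repeat the feature check.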
- -Signed-off-by: ahs ---- - libc/Android.bp | 1 + - .../arch-x86_64/dynamic_function_dispatch.cpp | 2 + - .../kabylake/string/avx2-memmove-kbl.S | 593 ++++++++++++++++++ - 3 files changed, 596 insertions(+) - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S - -diff --git a/libc/Android.bp b/libc/Android.bp -index 92483e833..5deb88b48 100644 ---- a/libc/Android.bp -+++ b/libc/Android.bp -@@ -1235,6 +1235,7 @@ cc_library_static { - "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", - "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", - "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-memmove-kbl.S", - "arch-x86_64/kabylake/string/avx2-strcmp-kbl.S", - "arch-x86_64/kabylake/string/avx2-strncmp-kbl.S", - "arch-x86_64/kabylake/string/avx2-strlen-kbl.S", -diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp -index 182eb4200..5bcf63e4c 100644 ---- a/libc/arch-x86_64/dynamic_function_dispatch.cpp -+++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp -@@ -55,6 +55,8 @@ DEFINE_IFUNC_FOR(memcmp) { - - typedef void* memmove_func(void* __dst, const void* __src, size_t __n); - DEFINE_IFUNC_FOR(memmove) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memmove_func, memmove_avx2); - RETURN_FUNC(memmove_func, memmove_generic); - } - -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S -new file mode 100644 -index 000000000..02e9ec1d2 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S -@@ -0,0 +1,593 @@ -+/* -+Copyright (c) 2014, Intel Corporation -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ -+ * Redistributions of source code must retain the above copyright notice, -+ * this list of conditions and the following disclaimer. -+ -+ * Redistributions in binary form must reproduce the above copyright notice, -+ * this list of conditions and the following disclaimer in the documentation -+ * and/or other materials provided with the distribution. -+ -+ * Neither the name of Intel Corporation nor the names of its contributors -+ * may be used to endorse or promote products derived from this software -+ * without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*/ -+ -+#include "cache.h" -+ -+#ifndef MEMMOVE -+# define MEMMOVE memmove_avx2 -+#endif -+ -+#ifndef L -+# define L(label) .L##label -+#endif -+ -+#ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+#endif -+ -+#ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+#endif -+ -+#ifndef cfi_rel_offset -+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -+#endif -+ -+#ifndef cfi_restore -+# define cfi_restore(reg) .cfi_restore reg -+#endif -+ -+#ifndef cfi_adjust_cfa_offset -+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -+#endif -+ -+#ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+#endif -+ -+#ifndef ALIAS_SYMBOL -+# define ALIAS_SYMBOL(alias, original) \ -+ .globl alias; \ -+ .equ alias, original -+#endif -+ -+#ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+#endif -+ -+#define CFI_PUSH(REG) \ -+ cfi_adjust_cfa_offset (4); \ -+ cfi_rel_offset (REG, 0) -+ -+#define CFI_POP(REG) \ -+ cfi_adjust_cfa_offset (-4); \ -+ cfi_restore (REG) -+ -+#define PUSH(REG) push REG; -+#define POP(REG) pop REG; -+ -+#define ENTRANCE PUSH (%rbx); -+#define RETURN_END POP (%rbx); ret -+#define RETURN RETURN_END; -+ -+ .section .text.avx2,"ax",@progbits -+ENTRY (MEMMOVE) -+ ENTRANCE -+ mov %rdi, %rax -+ -+/* Check whether we should copy backward or forward. */ -+ cmp %rsi, %rdi -+ je L(mm_return) -+ jg L(mm_len_0_or_more_backward) -+ -+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] -+ separately. */ -+ cmp $16, %rdx -+ jbe L(mm_len_0_16_bytes_forward) -+ -+ cmp $32, %rdx -+ ja L(mm_len_32_or_more_forward) -+ -+/* Copy [0..32] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu -16(%rsi, %rdx), %xmm1 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_32_or_more_forward): -+ cmp $64, %rdx -+ ja L(mm_len_64_or_more_forward) -+ -+/* Copy [0..64] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu -16(%rsi, %rdx), %xmm2 -+ movdqu -32(%rsi, %rdx), %xmm3 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, -16(%rdi, %rdx) -+ movdqu %xmm3, -32(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_64_or_more_forward): -+ cmp $128, %rdx -+ ja L(mm_len_128_or_more_forward) -+ -+/* Copy [0..128] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu 32(%rsi), %xmm2 -+ movdqu 48(%rsi), %xmm3 -+ movdqu -64(%rsi, %rdx), %xmm4 -+ movdqu -48(%rsi, %rdx), %xmm5 -+ movdqu -32(%rsi, %rdx), %xmm6 -+ movdqu -16(%rsi, %rdx), %xmm7 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, 32(%rdi) -+ movdqu %xmm3, 48(%rdi) -+ movdqu %xmm4, -64(%rdi, %rdx) -+ movdqu %xmm5, -48(%rdi, %rdx) -+ movdqu %xmm6, -32(%rdi, %rdx) -+ movdqu %xmm7, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_128_or_more_forward): -+ cmp $256, %rdx -+ ja L(mm_len_256_or_more_forward) -+ -+/* Copy [0..256] and return. 
*/ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu 32(%rsi), %xmm2 -+ movdqu 48(%rsi), %xmm3 -+ movdqu 64(%rsi), %xmm4 -+ movdqu 80(%rsi), %xmm5 -+ movdqu 96(%rsi), %xmm6 -+ movdqu 112(%rsi), %xmm7 -+ movdqu -128(%rsi, %rdx), %xmm8 -+ movdqu -112(%rsi, %rdx), %xmm9 -+ movdqu -96(%rsi, %rdx), %xmm10 -+ movdqu -80(%rsi, %rdx), %xmm11 -+ movdqu -64(%rsi, %rdx), %xmm12 -+ movdqu -48(%rsi, %rdx), %xmm13 -+ movdqu -32(%rsi, %rdx), %xmm14 -+ movdqu -16(%rsi, %rdx), %xmm15 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, 32(%rdi) -+ movdqu %xmm3, 48(%rdi) -+ movdqu %xmm4, 64(%rdi) -+ movdqu %xmm5, 80(%rdi) -+ movdqu %xmm6, 96(%rdi) -+ movdqu %xmm7, 112(%rdi) -+ movdqu %xmm8, -128(%rdi, %rdx) -+ movdqu %xmm9, -112(%rdi, %rdx) -+ movdqu %xmm10, -96(%rdi, %rdx) -+ movdqu %xmm11, -80(%rdi, %rdx) -+ movdqu %xmm12, -64(%rdi, %rdx) -+ movdqu %xmm13, -48(%rdi, %rdx) -+ movdqu %xmm14, -32(%rdi, %rdx) -+ movdqu %xmm15, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_256_or_more_forward): -+/* Aligning the address of destination. */ -+/* save first unaligned 128 bytes */ -+ vmovdqu (%rsi), %ymm0 -+ vmovdqu 32(%rsi), %ymm1 -+ vmovdqu 64(%rsi), %ymm2 -+ vmovdqu 96(%rsi), %ymm3 -+ -+ lea 128(%rdi), %r8 -+ and $-128, %r8 /* r8 now aligned to next 128 byte boundary */ -+ sub %rdi, %rsi /* rsi = src - dst = diff */ -+ -+ vmovdqu (%r8, %rsi), %ymm4 -+ vmovdqu 32(%r8, %rsi), %ymm5 -+ vmovdqu 64(%r8, %rsi), %ymm6 -+ vmovdqu 96(%r8, %rsi), %ymm7 -+ -+ vmovdqu %ymm0, (%rdi) -+ vmovdqu %ymm1, 32(%rdi) -+ vmovdqu %ymm2, 64(%rdi) -+ vmovdqu %ymm3, 96(%rdi) -+ vmovdqa %ymm4, (%r8) -+ vmovaps %ymm5, 32(%r8) -+ vmovaps %ymm6, 64(%r8) -+ vmovaps %ymm7, 96(%r8) -+ add $128, %r8 -+ -+ lea (%rdi, %rdx), %rbx -+ and $-128, %rbx -+ cmp %r8, %rbx -+ jbe L(mm_copy_remaining_forward) -+ -+ cmp $SHARED_CACHE_SIZE_HALF, %rdx -+ jae L(mm_large_page_loop_forward) -+ -+ .p2align 4 -+L(mm_main_loop_forward): -+ prefetcht0 128(%r8, %rsi) -+ vmovdqu (%r8, %rsi), %ymm0 -+ vmovdqu 32(%r8, %rsi), %ymm1 -+ vmovdqa %ymm0, (%r8) -+ vmovaps %ymm1, 32(%r8) -+ lea 64(%r8), %r8 -+ cmp %r8, %rbx -+ ja L(mm_main_loop_forward) -+ -+L(mm_copy_remaining_forward): -+ add %rdi, %rdx -+ sub %r8, %rdx -+/* We copied all up till %rdi position in the dst. -+ In %rdx now is how many bytes are left to copy. -+ Now we need to advance %r8. 
*/ -+ lea (%r8, %rsi), %r9 -+ -+L(mm_remaining_0_128_bytes_forward): -+ cmp $64, %rdx -+ ja L(mm_remaining_65_128_bytes_forward) -+ cmp $32, %rdx -+ ja L(mm_remaining_33_64_bytes_forward) -+ vzeroupper -+ cmp $16, %rdx -+ ja L(mm_remaining_17_32_bytes_forward) -+ test %rdx, %rdx -+ .p2align 4,,2 -+ je L(mm_return) -+ -+ cmpb $8, %dl -+ ja L(mm_remaining_9_16_bytes_forward) -+ cmpb $4, %dl -+ .p2align 4,,5 -+ ja L(mm_remaining_5_8_bytes_forward) -+ cmpb $2, %dl -+ .p2align 4,,1 -+ ja L(mm_remaining_3_4_bytes_forward) -+ movzbl -1(%r9,%rdx), %esi -+ movzbl (%r9), %ebx -+ movb %sil, -1(%r8,%rdx) -+ movb %bl, (%r8) -+ jmp L(mm_return) -+ -+L(mm_remaining_65_128_bytes_forward): -+ vmovdqu (%r9), %ymm0 -+ vmovdqu 32(%r9), %ymm1 -+ vmovdqu -64(%r9, %rdx), %ymm2 -+ vmovdqu -32(%r9, %rdx), %ymm3 -+ vmovdqu %ymm0, (%r8) -+ vmovdqu %ymm1, 32(%r8) -+ vmovdqu %ymm2, -64(%r8, %rdx) -+ vmovdqu %ymm3, -32(%r8, %rdx) -+ jmp L(mm_return) -+ -+L(mm_remaining_33_64_bytes_forward): -+ vmovdqu (%r9), %ymm0 -+ vmovdqu -32(%r9, %rdx), %ymm1 -+ vmovdqu %ymm0, (%r8) -+ vmovdqu %ymm1, -32(%r8, %rdx) -+ jmp L(mm_return) -+ -+L(mm_remaining_17_32_bytes_forward): -+ movdqu (%r9), %xmm0 -+ movdqu -16(%r9, %rdx), %xmm1 -+ movdqu %xmm0, (%r8) -+ movdqu %xmm1, -16(%r8, %rdx) -+ jmp L(mm_return) -+ -+L(mm_remaining_5_8_bytes_forward): -+ movl (%r9), %esi -+ movl -4(%r9,%rdx), %ebx -+ movl %esi, (%r8) -+ movl %ebx, -4(%r8,%rdx) -+ jmp L(mm_return) -+ -+L(mm_remaining_9_16_bytes_forward): -+ mov (%r9), %rsi -+ mov -8(%r9, %rdx), %rbx -+ mov %rsi, (%r8) -+ mov %rbx, -8(%r8, %rdx) -+ jmp L(mm_return) -+ -+L(mm_remaining_3_4_bytes_forward): -+ movzwl -2(%r9,%rdx), %esi -+ movzwl (%r9), %ebx -+ movw %si, -2(%r8,%rdx) -+ movw %bx, (%r8) -+ jmp L(mm_return) -+ -+L(mm_len_0_16_bytes_forward): -+ testb $24, %dl -+ jne L(mm_len_9_16_bytes_forward) -+ testb $4, %dl -+ .p2align 4,,5 -+ jne L(mm_len_5_8_bytes_forward) -+ test %rdx, %rdx -+ .p2align 4,,2 -+ je L(mm_return) -+ testb $2, %dl -+ .p2align 4,,1 -+ jne L(mm_len_2_4_bytes_forward) -+ movzbl -1(%rsi,%rdx), %ebx -+ movzbl (%rsi), %esi -+ movb %bl, -1(%rdi,%rdx) -+ movb %sil, (%rdi) -+ jmp L(mm_return) -+ -+L(mm_len_2_4_bytes_forward): -+ movzwl -2(%rsi,%rdx), %ebx -+ movzwl (%rsi), %esi -+ movw %bx, -2(%rdi,%rdx) -+ movw %si, (%rdi) -+ jmp L(mm_return) -+ -+L(mm_len_5_8_bytes_forward): -+ movl (%rsi), %ebx -+ movl -4(%rsi,%rdx), %esi -+ movl %ebx, (%rdi) -+ movl %esi, -4(%rdi,%rdx) -+ jmp L(mm_return) -+ -+L(mm_len_9_16_bytes_forward): -+ mov (%rsi), %rbx -+ mov -8(%rsi, %rdx), %rsi -+ mov %rbx, (%rdi) -+ mov %rsi, -8(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_recalc_len): -+/* Compute in %rdx how many bytes are left to copy after -+ the main loop stops. */ -+ vzeroupper -+ mov %rbx, %rdx -+ sub %rdi, %rdx -+/* The code for copying backwards. */ -+L(mm_len_0_or_more_backward): -+ -+/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128] -+ separately. */ -+ cmp $16, %rdx -+ jbe L(mm_len_0_16_bytes_backward) -+ -+ cmp $32, %rdx -+ ja L(mm_len_32_or_more_backward) -+ -+/* Copy [0..32] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu -16(%rsi, %rdx), %xmm1 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_32_or_more_backward): -+ cmp $64, %rdx -+ ja L(mm_len_64_or_more_backward) -+ -+/* Copy [0..64] and return. 
*/ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu -16(%rsi, %rdx), %xmm2 -+ movdqu -32(%rsi, %rdx), %xmm3 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, -16(%rdi, %rdx) -+ movdqu %xmm3, -32(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_64_or_more_backward): -+ cmp $128, %rdx -+ ja L(mm_len_128_or_more_backward) -+ -+/* Copy [0..128] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu 32(%rsi), %xmm2 -+ movdqu 48(%rsi), %xmm3 -+ movdqu -64(%rsi, %rdx), %xmm4 -+ movdqu -48(%rsi, %rdx), %xmm5 -+ movdqu -32(%rsi, %rdx), %xmm6 -+ movdqu -16(%rsi, %rdx), %xmm7 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, 32(%rdi) -+ movdqu %xmm3, 48(%rdi) -+ movdqu %xmm4, -64(%rdi, %rdx) -+ movdqu %xmm5, -48(%rdi, %rdx) -+ movdqu %xmm6, -32(%rdi, %rdx) -+ movdqu %xmm7, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_128_or_more_backward): -+ cmp $256, %rdx -+ ja L(mm_len_256_or_more_backward) -+ -+/* Copy [0..256] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu 32(%rsi), %xmm2 -+ movdqu 48(%rsi), %xmm3 -+ movdqu 64(%rsi), %xmm4 -+ movdqu 80(%rsi), %xmm5 -+ movdqu 96(%rsi), %xmm6 -+ movdqu 112(%rsi), %xmm7 -+ movdqu -128(%rsi, %rdx), %xmm8 -+ movdqu -112(%rsi, %rdx), %xmm9 -+ movdqu -96(%rsi, %rdx), %xmm10 -+ movdqu -80(%rsi, %rdx), %xmm11 -+ movdqu -64(%rsi, %rdx), %xmm12 -+ movdqu -48(%rsi, %rdx), %xmm13 -+ movdqu -32(%rsi, %rdx), %xmm14 -+ movdqu -16(%rsi, %rdx), %xmm15 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, 32(%rdi) -+ movdqu %xmm3, 48(%rdi) -+ movdqu %xmm4, 64(%rdi) -+ movdqu %xmm5, 80(%rdi) -+ movdqu %xmm6, 96(%rdi) -+ movdqu %xmm7, 112(%rdi) -+ movdqu %xmm8, -128(%rdi, %rdx) -+ movdqu %xmm9, -112(%rdi, %rdx) -+ movdqu %xmm10, -96(%rdi, %rdx) -+ movdqu %xmm11, -80(%rdi, %rdx) -+ movdqu %xmm12, -64(%rdi, %rdx) -+ movdqu %xmm13, -48(%rdi, %rdx) -+ movdqu %xmm14, -32(%rdi, %rdx) -+ movdqu %xmm15, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_256_or_more_backward): -+/* Aligning the address of destination. We need to save -+ 128 bytes from the source in order not to overwrite them. */ -+ vmovdqu -32(%rsi, %rdx), %ymm0 -+ vmovdqu -64(%rsi, %rdx), %ymm1 -+ vmovdqu -96(%rsi, %rdx), %ymm2 -+ vmovdqu -128(%rsi, %rdx), %ymm3 -+ -+ lea (%rdi, %rdx), %r9 -+ and $-128, %r9 /* r9 = aligned dst */ -+ -+ mov %rsi, %r8 -+ sub %rdi, %r8 /* r8 = src - dst, diff */ -+ -+ vmovdqu -32(%r9, %r8), %ymm4 -+ vmovdqu -64(%r9, %r8), %ymm5 -+ vmovdqu -96(%r9, %r8), %ymm6 -+ vmovdqu -128(%r9, %r8), %ymm7 -+ -+ vmovdqu %ymm0, -32(%rdi, %rdx) -+ vmovdqu %ymm1, -64(%rdi, %rdx) -+ vmovdqu %ymm2, -96(%rdi, %rdx) -+ vmovdqu %ymm3, -128(%rdi, %rdx) -+ vmovdqa %ymm4, -32(%r9) -+ vmovdqa %ymm5, -64(%r9) -+ vmovdqa %ymm6, -96(%r9) -+ vmovdqa %ymm7, -128(%r9) -+ lea -128(%r9), %r9 -+ -+ lea 128(%rdi), %rbx -+ and $-128, %rbx -+ -+ cmp %r9, %rbx -+ jae L(mm_recalc_len) -+ -+ cmp $SHARED_CACHE_SIZE_HALF, %rdx -+ jae L(mm_large_page_loop_backward) -+ -+ .p2align 4 -+L(mm_main_loop_backward): -+ prefetcht0 -128(%r9, %r8) -+ -+ vmovdqu -64(%r9, %r8), %ymm0 -+ vmovdqu -32(%r9, %r8), %ymm1 -+ vmovdqa %ymm0, -64(%r9) -+ vmovaps %ymm1, -32(%r9) -+ lea -64(%r9), %r9 -+ cmp %r9, %rbx -+ jb L(mm_main_loop_backward) -+ jmp L(mm_recalc_len) -+ -+/* Copy [0..16] and return. 
*/ -+L(mm_len_0_16_bytes_backward): -+ testb $24, %dl -+ jnz L(mm_len_9_16_bytes_backward) -+ testb $4, %dl -+ .p2align 4,,5 -+ jnz L(mm_len_5_8_bytes_backward) -+ test %rdx, %rdx -+ .p2align 4,,2 -+ je L(mm_return) -+ testb $2, %dl -+ .p2align 4,,1 -+ jne L(mm_len_3_4_bytes_backward) -+ movzbl -1(%rsi,%rdx), %ebx -+ movzbl (%rsi), %ecx -+ movb %bl, -1(%rdi,%rdx) -+ movb %cl, (%rdi) -+ jmp L(mm_return) -+ -+L(mm_len_3_4_bytes_backward): -+ movzwl -2(%rsi,%rdx), %ebx -+ movzwl (%rsi), %ecx -+ movw %bx, -2(%rdi,%rdx) -+ movw %cx, (%rdi) -+ jmp L(mm_return) -+ -+L(mm_len_9_16_bytes_backward): -+ movl -4(%rsi,%rdx), %ebx -+ movl -8(%rsi,%rdx), %ecx -+ movl %ebx, -4(%rdi,%rdx) -+ movl %ecx, -8(%rdi,%rdx) -+ sub $8, %rdx -+ jmp L(mm_len_0_16_bytes_backward) -+ -+L(mm_len_5_8_bytes_backward): -+ movl (%rsi), %ebx -+ movl -4(%rsi,%rdx), %ecx -+ movl %ebx, (%rdi) -+ movl %ecx, -4(%rdi,%rdx) -+ -+L(mm_return): -+ vzeroupper -+ RETURN -+ -+/* Big length copy forward part. */ -+ -+ .p2align 4 -+L(mm_large_page_loop_forward): -+ vmovdqu (%r8, %rsi), %ymm0 -+ vmovdqu 32(%r8, %rsi), %ymm1 -+ vmovdqu 64(%r8, %rsi), %ymm2 -+ vmovdqu 96(%r8, %rsi), %ymm3 -+ vmovntdq %ymm0, (%r8) -+ vmovntdq %ymm1, 32(%r8) -+ vmovntdq %ymm2, 64(%r8) -+ vmovntdq %ymm3, 96(%r8) -+ lea 128(%r8), %r8 -+ cmp %r8, %rbx -+ ja L(mm_large_page_loop_forward) -+ sfence -+ jmp L(mm_copy_remaining_forward) -+ -+/* Big length copy backward part. */ -+ .p2align 4 -+L(mm_large_page_loop_backward): -+ vmovdqu -64(%r9, %r8), %ymm0 -+ vmovdqu -32(%r9, %r8), %ymm1 -+ vmovntdq %ymm0, -64(%r9) -+ vmovntdq %ymm1, -32(%r9) -+ lea -64(%r9), %r9 -+ cmp %r9, %rbx -+ jb L(mm_large_page_loop_backward) -+ sfence -+ jmp L(mm_recalc_len) -+ -+END (MEMMOVE) -+ -+//ALIAS_SYMBOL(memcpy, MEMMOVE) --- -2.25.1 - diff --git a/aosp_diff/preliminary/bionic/0006-Obtain-x86-cache-info-from-CPU.patch b/aosp_diff/preliminary/bionic/0006-Obtain-x86-cache-info-from-CPU.patch deleted file mode 100644 index 204371f263..0000000000 --- a/aosp_diff/preliminary/bionic/0006-Obtain-x86-cache-info-from-CPU.patch +++ /dev/null @@ -1,594 +0,0 @@ -From e4ddc78e40f68994a1822c2e126e517c8f4060c5 Mon Sep 17 00:00:00 2001 -From: Elliott Hughes -Date: Fri, 19 Jul 2024 12:00:17 +0000 -Subject: [PATCH] Obtain x86 cache info from CPU - -The cache info today is hardcoded in cache.h -May not be optimal across various uarchs/SKUs -Leverage bionic sysconf to get the underlying cache. - -Improvements seen on RPL, for various sizes -memmove_non_overlapping -1.25M - 31% -1.5M - 30% -1.75M - 28% - -memcpy -1.25M - 31% -1.5M - 31% -1.75M - 30% - -The bionic benchmarks (which only go up to 128KiB) show no change, as -you'd expect. 
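For reference, the runtime query this patch leans on is an ordinary sysconf lookup. A minimal standalone sketch (a hypothetical test program, not bionic's actual libc_init code, which caches the results in globals such as __x86_shared_cache_size for the assembly to read) might look like:

    #include <stdio.h>
    #include <unistd.h>

    /* Ask the C library for the real cache geometry instead of baking in
       Silvermont-era constants. The _SC_LEVEL*_CACHE_SIZE names are
       glibc/bionic extensions. */
    int main(void) {
      long l1d = sysconf(_SC_LEVEL1_DCACHE_SIZE);
      long l2  = sysconf(_SC_LEVEL2_CACHE_SIZE);
      long l3  = sysconf(_SC_LEVEL3_CACHE_SIZE);

      /* The string routines switch to non-temporal stores once a copy is
         larger than half of the shared cache, so that is the derived value. */
      long shared = (l3 > 0) ? l3 : l2;
      printf("L1d=%ld L2=%ld L3=%ld shared/2=%ld\n", l1d, l2, l3, shared / 2);
      return 0;
    }

The patch swaps the hardcoded SHARED_CACHE_SIZE_HALF / DATA_CACHE_SIZE_HALF constants for these runtime values: below the threshold the copy loops stay on regular vmovdqa/vmovdqu stores, above it the code takes the vmovntdq non-temporal path.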
- -Test: bionic/tests/run-on-host.sh 64 && bionic/tests/run-on-host.sh 32 -Bug: 202102347 -Change-Id: I4bbad51794758873744149d0f58b86bb92ee307f -Signed-off-by: Vinay Prasad Kompella -Signed-off-by: Soni, Ravi Kumar ---- - libc/arch-x86/string/cache.h | 41 ------------------- - libc/arch-x86/string/sse2-memmove-slm.S | 19 +++++++-- - libc/arch-x86/string/sse2-memset-atom.S | 13 ++++-- - libc/arch-x86/string/sse2-memset-slm.S | 11 +++-- - libc/arch-x86/string/ssse3-memcpy-atom.S | 1 - - .../kabylake/string/avx2-memmove-kbl.S | 26 +++++++++--- - .../kabylake/string/avx2-memset-kbl.S | 8 +--- - libc/arch-x86_64/kabylake/string/cache.h | 36 ---------------- - libc/arch-x86_64/silvermont/string/cache.h | 36 ---------------- - .../silvermont/string/sse2-memmove-slm.S | 26 +++++++++--- - .../silvermont/string/sse2-memset-slm.S | 8 +--- - .../silvermont/string/sse4-memcmp-slm.S | 13 +----- - libc/bionic/libc_init_common.cpp | 26 ++++++++++++ - 13 files changed, 108 insertions(+), 156 deletions(-) - delete mode 100644 libc/arch-x86/string/cache.h - delete mode 100644 libc/arch-x86_64/kabylake/string/cache.h - delete mode 100644 libc/arch-x86_64/silvermont/string/cache.h - -diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/string/cache.h -deleted file mode 100644 -index 33719a0cb..000000000 ---- a/libc/arch-x86/string/cache.h -+++ /dev/null -@@ -1,41 +0,0 @@ --/* --Copyright (c) 2010, Intel Corporation --All rights reserved. -- --Redistribution and use in source and binary forms, with or without --modification, are permitted provided that the following conditions are met: -- -- * Redistributions of source code must retain the above copyright notice, -- * this list of conditions and the following disclaimer. -- -- * Redistributions in binary form must reproduce the above copyright notice, -- * this list of conditions and the following disclaimer in the documentation -- * and/or other materials provided with the distribution. -- -- * Neither the name of Intel Corporation nor the names of its contributors -- * may be used to endorse or promote products derived from this software -- * without specific prior written permission. -- --THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND --ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED --WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE --DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR --ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES --(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; --LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON --ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT --(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS --SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
--*/ -- --#ifdef FOR_ATOM --#define SHARED_CACHE_SIZE (512 * 1024) /* Atom L2 Cache */ --#endif --#ifdef FOR_SILVERMONT --#define SHARED_CACHE_SIZE (1024 * 1024) /* Silvermont L2 Cache */ --#endif -- --#define DATA_CACHE_SIZE (24 * 1024) /* Atom and Silvermont L1 Data Cache */ -- --#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) --#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) -diff --git a/libc/arch-x86/string/sse2-memmove-slm.S b/libc/arch-x86/string/sse2-memmove-slm.S -index 79b5d1b7e..7f4237486 100644 ---- a/libc/arch-x86/string/sse2-memmove-slm.S -+++ b/libc/arch-x86/string/sse2-memmove-slm.S -@@ -29,7 +29,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - #define FOR_SILVERMONT --#include "cache.h" - - #ifndef MEMMOVE - # define MEMMOVE memmove_generic -@@ -94,6 +93,8 @@ name: \ - #define RETURN_END POP (%ebx); ret - #define RETURN RETURN_END; CFI_PUSH (%ebx) - -+#define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x -+ - .section .text.sse2,"ax",@progbits - ENTRY (MEMMOVE) - ENTRANCE -@@ -193,7 +194,13 @@ L(mm_len_128_or_more_forward): - cmp %edi, %ebx - jbe L(mm_copy_remaining_forward) - -- cmp $SHARED_CACHE_SIZE_HALF, %ecx -+ PUSH(%ebx) -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx -+ /* Restore ebx. We can place a pop before jump as it doesn't affect any flags. */ -+ POP(%ebx) -+ - jae L(mm_large_page_loop_forward) - - .p2align 4 -@@ -424,7 +431,13 @@ L(mm_len_128_or_more_backward): - cmp %edi, %ebx - jae L(mm_main_loop_backward_end) - -- cmp $SHARED_CACHE_SIZE_HALF, %ecx -+ PUSH(%ebx) -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx -+ /* Restore ebx. We can place a pop before jump as it doesn't affect any flags. */ -+ POP(%ebx) -+ - jae L(mm_large_page_loop_backward) - - .p2align 4 -diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/string/sse2-memset-atom.S -index 320afec11..e43ead0d1 100644 ---- a/libc/arch-x86/string/sse2-memset-atom.S -+++ b/libc/arch-x86/string/sse2-memset-atom.S -@@ -31,7 +31,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #include - - #define FOR_ATOM --#include "cache.h" - - #ifndef L - # define L(label) .L##label -@@ -64,6 +63,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #define RETURN RETURN_END; CFI_PUSH(%ebx) - #define JMPTBL(I, B) I - B - -+#define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x -+ - /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. */ - # define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ -@@ -256,14 +257,20 @@ L(aligned_16_less128bytes): - ALIGN(4) - L(128bytesormore): - PUSH(%ebx) -- mov $SHARED_CACHE_SIZE, %ebx -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx - cmp %ebx, %ecx - jae L(128bytesormore_nt_start) - - - POP(%ebx) - # define RESTORE_EBX_STATE CFI_PUSH(%ebx) -- cmp $DATA_CACHE_SIZE, %ecx -+ PUSH(%ebx) -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx -+ POP(%ebx) - - jae L(128bytes_L2_normal) - subl $128, %ecx -diff --git a/libc/arch-x86/string/sse2-memset-slm.S b/libc/arch-x86/string/sse2-memset-slm.S -index 5cff141ad..e4c8fa108 100644 ---- a/libc/arch-x86/string/sse2-memset-slm.S -+++ b/libc/arch-x86/string/sse2-memset-slm.S -@@ -31,7 +31,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- #include - - #define FOR_SILVERMONT --#include "cache.h" - - #ifndef L - # define L(label) .L##label -@@ -64,6 +63,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - # define RETURN RETURN_END; CFI_PUSH(%ebx) - # define JMPTBL(I, B) I - B - -+#define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x -+ - /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. */ - # define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ -@@ -177,14 +178,18 @@ L(aligned_16_less128bytes): - ALIGN(4) - L(128bytesormore): - PUSH(%ebx) -- mov $SHARED_CACHE_SIZE, %ebx -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx - cmp %ebx, %ecx - jae L(128bytesormore_nt_start) - - POP(%ebx) - - PUSH(%ebx) -- mov $DATA_CACHE_SIZE, %ebx -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ mov __x86_data_cache_size@GOTOFF(%ebx), %ebx - - cmp %ebx, %ecx - jae L(128bytes_L2_normal) -diff --git a/libc/arch-x86/string/ssse3-memcpy-atom.S b/libc/arch-x86/string/ssse3-memcpy-atom.S -index fe3082ee7..83e198504 100644 ---- a/libc/arch-x86/string/ssse3-memcpy-atom.S -+++ b/libc/arch-x86/string/ssse3-memcpy-atom.S -@@ -29,7 +29,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - #define FOR_ATOM --#include "cache.h" - - #ifndef MEMCPY - # define MEMCPY memcpy_atom -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S -index 02e9ec1d2..77d628eb0 100644 ---- a/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S -+++ b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S -@@ -28,7 +28,7 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - --#include "cache.h" -+ - - #ifndef MEMMOVE - # define MEMMOVE memmove_avx2 -@@ -228,8 +228,9 @@ L(mm_len_256_or_more_forward): - cmp %r8, %rbx - jbe L(mm_copy_remaining_forward) - -- cmp $SHARED_CACHE_SIZE_HALF, %rdx -- jae L(mm_large_page_loop_forward) -+ cmp __x86_shared_cache_size_half(%rip), %rdx -+ -+ ja L(mm_overlapping_check_forward) - - .p2align 4 - L(mm_main_loop_forward): -@@ -497,8 +498,10 @@ L(mm_len_256_or_more_backward): - cmp %r9, %rbx - jae L(mm_recalc_len) - -- cmp $SHARED_CACHE_SIZE_HALF, %rdx -- jae L(mm_large_page_loop_backward) -+ cmp __x86_shared_cache_size_half(%rip), %rdx -+ -+ ja L(mm_overlapping_check_backward) -+ - - .p2align 4 - L(mm_main_loop_backward): -@@ -560,6 +563,12 @@ L(mm_return): - /* Big length copy forward part. */ - - .p2align 4 -+L(mm_overlapping_check_forward): -+ mov %rsi, %r9 -+ add %rdx, %r9 -+ cmp __x86_shared_cache_size(%rip), %r9 -+ jbe L(mm_main_loop_forward) -+ - L(mm_large_page_loop_forward): - vmovdqu (%r8, %rsi), %ymm0 - vmovdqu 32(%r8, %rsi), %ymm1 -@@ -577,6 +586,13 @@ L(mm_large_page_loop_forward): - - /* Big length copy backward part. 
*/ - .p2align 4 -+L(mm_overlapping_check_backward): -+ mov %rdi, %r11 -+ sub %rsi, %r11 /* r11 = dst - src, diff */ -+ add %rdx, %r11 -+ cmp __x86_shared_cache_size(%rip), %r11 -+ jbe L(mm_main_loop_backward) -+ - L(mm_large_page_loop_backward): - vmovdqu -64(%r9, %r8), %ymm0 - vmovdqu -32(%r9, %r8), %ymm1 -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S -index ca62a9f8c..35d682a5d 100644 ---- a/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S -+++ b/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S -@@ -30,7 +30,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - #include - --#include "cache.h" - - #ifndef L - # define L(label) .L##label -@@ -117,11 +116,8 @@ L(16bytesormore): - cmpq %rcx, %rdx - je L(done) - --#ifdef SHARED_CACHE_SIZE -- cmp $SHARED_CACHE_SIZE, %r8 --#else -- cmp __x86_64_shared_cache_size(%rip), %r8 --#endif -+ cmp __x86_shared_cache_size(%rip), %r8 -+ - ja L(non_temporal_loop) - - ALIGN (4) -diff --git a/libc/arch-x86_64/kabylake/string/cache.h b/libc/arch-x86_64/kabylake/string/cache.h -deleted file mode 100644 -index 4131509fb..000000000 ---- a/libc/arch-x86_64/kabylake/string/cache.h -+++ /dev/null -@@ -1,36 +0,0 @@ --/* --Copyright (c) 2014, Intel Corporation --All rights reserved. -- --Redistribution and use in source and binary forms, with or without --modification, are permitted provided that the following conditions are met: -- -- * Redistributions of source code must retain the above copyright notice, -- * this list of conditions and the following disclaimer. -- -- * Redistributions in binary form must reproduce the above copyright notice, -- * this list of conditions and the following disclaimer in the documentation -- * and/or other materials provided with the distribution. -- -- * Neither the name of Intel Corporation nor the names of its contributors -- * may be used to endorse or promote products derived from this software -- * without specific prior written permission. -- --THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND --ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED --WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE --DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR --ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES --(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; --LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON --ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT --(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS --SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --*/ -- --/* Values are optimized for Core Architecture */ --#define SHARED_CACHE_SIZE (4096*1024) /* Core Architecture L2 Cache */ --#define DATA_CACHE_SIZE (24*1024) /* Core Architecture L1 Data Cache */ -- --#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) --#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) -diff --git a/libc/arch-x86_64/silvermont/string/cache.h b/libc/arch-x86_64/silvermont/string/cache.h -deleted file mode 100644 -index 3606d2a1a..000000000 ---- a/libc/arch-x86_64/silvermont/string/cache.h -+++ /dev/null -@@ -1,36 +0,0 @@ --/* --Copyright (c) 2014, Intel Corporation --All rights reserved. 
-- --Redistribution and use in source and binary forms, with or without --modification, are permitted provided that the following conditions are met: -- -- * Redistributions of source code must retain the above copyright notice, -- * this list of conditions and the following disclaimer. -- -- * Redistributions in binary form must reproduce the above copyright notice, -- * this list of conditions and the following disclaimer in the documentation -- * and/or other materials provided with the distribution. -- -- * Neither the name of Intel Corporation nor the names of its contributors -- * may be used to endorse or promote products derived from this software -- * without specific prior written permission. -- --THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND --ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED --WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE --DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR --ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES --(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; --LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON --ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT --(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS --SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --*/ -- --/* Values are optimized for Silvermont */ --#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */ --#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */ -- --#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) --#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) -diff --git a/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -index 7024f4950..0530a6f59 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -@@ -28,7 +28,6 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - --#include "cache.h" - - #ifndef MEMMOVE - # define MEMMOVE memmove_generic -@@ -189,8 +188,9 @@ L(mm_len_128_or_more_forward): - cmp %r8, %rbx - jbe L(mm_copy_remaining_forward) - -- cmp $SHARED_CACHE_SIZE_HALF, %rdx -- jae L(mm_large_page_loop_forward) -+ cmp __x86_shared_cache_size_half(%rip), %rdx -+ -+ ja L(mm_overlapping_check_forward) - - .p2align 4 - L(mm_main_loop_forward): -@@ -414,8 +414,9 @@ L(mm_len_128_or_more_backward): - cmp %r9, %rbx - jae L(mm_recalc_len) - -- cmp $SHARED_CACHE_SIZE_HALF, %rdx -- jae L(mm_large_page_loop_backward) -+ cmp __x86_shared_cache_size_half(%rip), %rdx -+ -+ ja L(mm_overlapping_check_backward) - - .p2align 4 - L(mm_main_loop_backward): -@@ -481,6 +482,13 @@ L(mm_return): - /* Big length copy forward part. */ - - .p2align 4 -+ -+L(mm_overlapping_check_forward): -+ mov %rsi, %r9 -+ add %rdx, %r9 -+ cmp __x86_shared_cache_size(%rip), %r9 -+ jbe L(mm_main_loop_forward) -+ - L(mm_large_page_loop_forward): - movdqu (%r8, %rsi), %xmm0 - movdqu 16(%r8, %rsi), %xmm1 -@@ -498,6 +506,14 @@ L(mm_large_page_loop_forward): - - /* Big length copy backward part. 
*/ - .p2align 4 -+ -+L(mm_overlapping_check_backward): -+ mov %rdi, %r11 -+ sub %rsi, %r11 /* r11 = dst - src, diff */ -+ add %rdx, %r11 -+ cmp __x86_shared_cache_size(%rip), %r11 -+ jbe L(mm_main_loop_backward) -+ - L(mm_large_page_loop_backward): - movdqu -64(%r9, %r8), %xmm0 - movdqu -48(%r9, %r8), %xmm1 -diff --git a/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S -index cceadd297..84ab327c9 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S -@@ -30,7 +30,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - #include - --#include "cache.h" - - #ifndef L - # define L(label) .L##label -@@ -116,11 +115,8 @@ L(128bytesmore): - cmpq %rcx, %rdx - je L(return) - --#ifdef SHARED_CACHE_SIZE -- cmp $SHARED_CACHE_SIZE, %r8 --#else -- cmp __x86_64_shared_cache_size(%rip), %r8 --#endif -+ cmp __x86_shared_cache_size(%rip), %r8 -+ - ja L(128bytesmore_nt) - - ALIGN (4) -diff --git a/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -index 6cfcd767f..c5980d431 100644 ---- a/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -@@ -28,7 +28,6 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - --#include "cache.h" - - #ifndef MEMCMP - # define MEMCMP memcmp_generic -@@ -353,11 +352,7 @@ L(less32bytesin256): - - ALIGN (4) - L(512bytesormore): --#ifdef DATA_CACHE_SIZE_HALF -- mov $DATA_CACHE_SIZE_HALF, %r8 --#else -- mov __x86_64_data_cache_size_half(%rip), %r8 --#endif -+ mov __x86_data_cache_size_half(%rip), %r8 - mov %r8, %r9 - shr $1, %r8 - add %r9, %r8 -@@ -669,11 +664,7 @@ L(less32bytesin256in2alinged): - - ALIGN (4) - L(512bytesormorein2aligned): --#ifdef DATA_CACHE_SIZE_HALF -- mov $DATA_CACHE_SIZE_HALF, %r8 --#else -- mov __x86_64_data_cache_size_half(%rip), %r8 --#endif -+ mov __x86_data_cache_size_half(%rip), %r8 - mov %r8, %r9 - shr $1, %r8 - add %r9, %r8 -diff --git a/libc/bionic/libc_init_common.cpp b/libc/bionic/libc_init_common.cpp -index c82c52e9d..6e4213a90 100644 ---- a/libc/bionic/libc_init_common.cpp -+++ b/libc/bionic/libc_init_common.cpp -@@ -63,6 +63,28 @@ __LIBC_HIDDEN__ constinit _Atomic(bool) __libc_memtag_stack; - __BIONIC_WEAK_VARIABLE_FOR_NATIVE_BRIDGE - const char* __progname; - -+#if defined(__i386__) || defined(__x86_64__) -+// Default sizes based on the old hard-coded values for Atom/Silvermont (x86) and Core 2 (x86-64)... -+size_t __x86_data_cache_size = 24 * 1024; -+size_t __x86_data_cache_size_half = __x86_data_cache_size / 2; -+size_t __x86_shared_cache_size = sizeof(long) == 8 ? 4096 * 1024 : 1024 * 1024; -+size_t __x86_shared_cache_size_half = __x86_shared_cache_size / 2; -+// ...overwritten at runtime based on the cpu's reported cache sizes. -+static void __libc_init_x86_cache_info() { -+ // Handle the case where during early boot /sys fs may not yet be ready, -+ // resulting in sysconf() returning 0, leading to crashes. -+ // In that case (basically just init), we keep the defaults. 
-+ if (sysconf(_SC_LEVEL1_DCACHE_SIZE) != 0) { -+ __x86_data_cache_size = sysconf(_SC_LEVEL1_DCACHE_SIZE); -+ __x86_data_cache_size_half = __x86_data_cache_size / 2; -+ } -+ if (sysconf(_SC_LEVEL2_CACHE_SIZE) != 0) { -+ __x86_shared_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); -+ __x86_shared_cache_size_half = __x86_shared_cache_size / 2; -+ } -+} -+#endif -+ - void __libc_init_globals() { - // Initialize libc globals that are needed in both the linker and in libc. - // In dynamic binaries, this is run at least twice for different copies of the -@@ -172,6 +194,10 @@ void __libc_init_common() { - __system_properties_init(); // Requires 'environ'. - __libc_init_fdsan(); // Requires system properties (for debug.fdsan). - __libc_init_fdtrack(); -+ -+#if defined(__i386__) || defined(__x86_64__) -+ __libc_init_x86_cache_info(); -+#endif - } - - void __libc_init_fork_handler() { --- -2.34.1 - diff --git a/vendorsetup.sh b/vendorsetup.sh index 51f9c94771..6c328b8203 100755 --- a/vendorsetup.sh +++ b/vendorsetup.sh @@ -35,12 +35,8 @@ function lunch } # Get the exact value of a build variable. -function _get_build_var_cached() +function get_build_var() { - # Set the TARGET_RELEASE variable to the release_config for - # which we want to build CELADON. It should be one among - # $(TOP)/build/release/release_configs/* - TARGET_RELEASE=ap3a if [ "$1" = "COMMON_LUNCH_CHOICES" ] then valid_targets=`mixinup -t` @@ -50,8 +46,7 @@ function _get_build_var_cached() array=(${t/-/ }) target=${array[0]} if [[ "${valid_targets}" =~ "$target" ]]; then - tgt=$target-$TARGET_RELEASE-${array[1]} - LUNCH_MENU_CHOICES+=($tgt) + LUNCH_MENU_CHOICES+=($t) fi done echo ${LUNCH_MENU_CHOICES[@]}
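
Note (not part of the patch): the __libc_init_x86_cache_info() hunk above replaces the old compile-time cache constants with values read at runtime via sysconf(). The standalone sketch below illustrates that technique under stated assumptions only: it assumes a libc (glibc or bionic) that implements the non-POSIX _SC_LEVEL1_DCACHE_SIZE and _SC_LEVEL2_CACHE_SIZE names, and the file name cache_probe.cpp is hypothetical. It mirrors the fallback behaviour shown above, where sysconf() can report 0 early in boot before /sys is mounted and the hard-coded defaults are kept.

    // cache_probe.cpp -- hypothetical standalone sketch, not part of this patch.
    // Mirrors the sysconf()-based sizing with hard-coded fallbacks shown above.
    #include <stdio.h>
    #include <unistd.h>

    int main() {
      // Defaults match the old compile-time values: 24 KiB L1 data cache,
      // 4 MiB shared (L2) cache on 64-bit, 1 MiB on 32-bit.
      long data_cache = 24 * 1024;
      long shared_cache = sizeof(long) == 8 ? 4096 * 1024 : 1024 * 1024;

      // sysconf() may return 0 (or -1) if the kernel has not exposed the
      // cache topology yet; keep the defaults in that case, as the patch does.
      long l1d = sysconf(_SC_LEVEL1_DCACHE_SIZE);
      if (l1d > 0) data_cache = l1d;
      long l2 = sysconf(_SC_LEVEL2_CACHE_SIZE);
      if (l2 > 0) shared_cache = l2;

      printf("data cache:   %ld bytes (half: %ld)\n", data_cache, data_cache / 2);
      printf("shared cache: %ld bytes (half: %ld)\n", shared_cache, shared_cache / 2);
      return 0;
    }

The halved values matter because the assembly routines above (sse2-memmove-slm.S, avx2-memset-kbl.S, sse4-memcmp-slm.S) compare the copy length against __x86_shared_cache_size_half / __x86_data_cache_size_half to decide between the regular cached loops and the non-temporal (large-page) paths.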