diff --git a/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch b/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch deleted file mode 100644 index db61807bff..0000000000 --- a/aosp_diff/preliminary/bionic/0003-Optimize-bionic-memory-functions-with-avx2-instructi.patch +++ /dev/null @@ -1,3779 +0,0 @@ -From 013b505284379453df6637f009a224f6d5c6f3bd Mon Sep 17 00:00:00 2001 -From: "Reddy, Alavala Srinivasa" -Date: Wed, 13 Sep 2023 18:36:21 +0530 -Subject: [PATCH 3/5] Optimize bionic memory functions with avx2 instructions - -Following memory related functions are optimized with -avx2 implementation ported from glibc 2.20 -(only for 64-bit) - - memchr - - memcmp - - memrchr - -Test done: Build and boot is fine, Run the benchmarks suite. - -Change-Id: I956773c79b9bcebee69726820eaa74c709df7081 -Signed-off-by: ahs -Signed-off-by: Ravi Kumar Soni ---- - libc/Android.bp | 36 +- - .../kabylake/string/avx2-memcpy-kbl.S | 2052 +++++++++++++++++ - .../arch-x86_64/dynamic_function_dispatch.cpp | 38 + - libc/arch-x86_64/generic/string/memchr.c | 20 + - libc/arch-x86_64/generic/string/memrchr.c | 20 + - libc/arch-x86_64/generic/string/wmemset.c | 20 + - libc/arch-x86_64/{string => include}/cache.h | 0 - .../kabylake/string/avx2-memchr-kbl.S | 371 +++ - .../kabylake/string/avx2-memcmp-kbl.S | 428 ++++ - .../kabylake/string/avx2-memrchr-kbl.S | 408 ++++ - .../kabylake/string/avx2-wmemset-kbl.S | 140 ++ - .../string/sse2-memmove-slm.S | 4 +- - .../{ => silvermont}/string/sse2-memset-slm.S | 0 - .../{ => silvermont}/string/sse2-stpcpy-slm.S | 0 - .../string/sse2-stpncpy-slm.S | 0 - .../{ => silvermont}/string/sse2-strcat-slm.S | 0 - .../{ => silvermont}/string/sse2-strcpy-slm.S | 0 - .../{ => silvermont}/string/sse2-strlen-slm.S | 0 - .../string/sse2-strncat-slm.S | 0 - .../string/sse2-strncpy-slm.S | 0 - .../{ => silvermont}/string/sse4-memcmp-slm.S | 2 +- - .../string/ssse3-strcmp-slm.S | 0 - 
.../string/ssse3-strncmp-slm.S | 0 - libc/arch-x86_64/static_function_dispatch.S | 6 + - 24 files changed, 3528 insertions(+), 17 deletions(-) - create mode 100644 libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S - create mode 100644 libc/arch-x86_64/generic/string/memchr.c - create mode 100644 libc/arch-x86_64/generic/string/memrchr.c - create mode 100644 libc/arch-x86_64/generic/string/wmemset.c - rename libc/arch-x86_64/{string => include}/cache.h (100%) - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S - rename libc/arch-x86_64/{ => silvermont}/string/sse2-memmove-slm.S (99%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-memset-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-stpcpy-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-stpncpy-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-strcat-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-strcpy-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-strlen-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-strncat-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse2-strncpy-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/sse4-memcmp-slm.S (99%) - rename libc/arch-x86_64/{ => silvermont}/string/ssse3-strcmp-slm.S (100%) - rename libc/arch-x86_64/{ => silvermont}/string/ssse3-strncmp-slm.S (100%) - -diff --git a/libc/Android.bp b/libc/Android.bp -index 943d41fba..530ce9111 100644 ---- a/libc/Android.bp -+++ b/libc/Android.bp -@@ -617,8 +617,6 @@ cc_library_static { - }, - x86_64: { - srcs: [ -- "upstream-openbsd/lib/libc/string/memchr.c", -- "upstream-openbsd/lib/libc/string/memrchr.c", - "upstream-openbsd/lib/libc/string/strlcat.c", 
- "upstream-openbsd/lib/libc/string/strlcpy.c", - ], -@@ -1187,6 +1185,7 @@ cc_library_static { - ], - }, - x86_64: { -+ include_dirs: ["bionic/libc/arch-x86_64/include"], - srcs: [ - "arch-x86_64/bionic/__bionic_clone.S", - "arch-x86_64/bionic/_exit_with_stack_teardown.S", -@@ -1196,18 +1195,27 @@ cc_library_static { - "arch-x86_64/bionic/vfork.S", - - "arch-x86_64/string/avx2-memset-kbl.S", -- "arch-x86_64/string/sse2-memmove-slm.S", -- "arch-x86_64/string/sse2-memset-slm.S", -- "arch-x86_64/string/sse2-stpcpy-slm.S", -- "arch-x86_64/string/sse2-stpncpy-slm.S", -- "arch-x86_64/string/sse2-strcat-slm.S", -- "arch-x86_64/string/sse2-strcpy-slm.S", -- "arch-x86_64/string/sse2-strlen-slm.S", -- "arch-x86_64/string/sse2-strncat-slm.S", -- "arch-x86_64/string/sse2-strncpy-slm.S", -- "arch-x86_64/string/sse4-memcmp-slm.S", -- "arch-x86_64/string/ssse3-strcmp-slm.S", -- "arch-x86_64/string/ssse3-strncmp-slm.S", -+ "arch-x86_64/silvermont/string/sse2-memmove-slm.S", -+ "arch-x86_64/silvermont/string/sse2-memset-slm.S", -+ "arch-x86_64/silvermont/string/sse2-stpcpy-slm.S", -+ "arch-x86_64/silvermont/string/sse2-stpncpy-slm.S", -+ "arch-x86_64/silvermont/string/sse2-strcat-slm.S", -+ "arch-x86_64/silvermont/string/sse2-strcpy-slm.S", -+ "arch-x86_64/silvermont/string/sse2-strlen-slm.S", -+ "arch-x86_64/silvermont/string/sse2-strncat-slm.S", -+ "arch-x86_64/silvermont/string/sse2-strncpy-slm.S", -+ "arch-x86_64/silvermont/string/sse4-memcmp-slm.S", -+ "arch-x86_64/silvermont/string/ssse3-strcmp-slm.S", -+ "arch-x86_64/silvermont/string/ssse3-strncmp-slm.S", -+ -+ //"arch-x86_64/generic/string/wmemset.c" -+ "arch-x86_64/generic/string/memchr.c", -+ "arch-x86_64/generic/string/memrchr.c", -+ -+ //"arch-x86_64/kabylake/string/avx2-wmemset-kbl.S" -+ "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", - - "bionic/strchr.cpp", - "bionic/strchrnul.cpp", -diff --git 
a/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S b/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S -new file mode 100644 -index 000000000..69fca7cf1 ---- /dev/null -+++ b/libc/arch-x86/kabylake/string/avx2-memcpy-kbl.S -@@ -0,0 +1,2052 @@ -+#define ENTRY(f) /* Open function f: global symbol in .text, aligned to 2^4 bytes with 0x90 fill. */ \ -+ .text; \ -+ .globl f; \ -+ .p2align 4, 0x90; \ -+ .type f,@function; \ -+ f: \ -+ -+#define END(f) /* Close function f: record its size, then switch to .rodata aligned to 2^2 bytes. The continuation backslash is required; without it END(f) expands to nothing and the directives below are emitted at file scope against an undefined symbol f. */ \ -+ .size f, .-f; \ -+ .section .rodata,"a",@progbits; \ -+ .p2align 2 \ -+ -+ENTRY(memcpy_avx2) -+# %bb.0: -+ pushl %ebp -+ pushl %ebx -+ pushl %edi -+ pushl %esi -+ movl 28(%esp), %ebx -+ movl 24(%esp), %ecx -+ movl 20(%esp), %eax -+ calll .L0$pb -+.L0$pb: -+ popl %esi -+.Ltmp0: -+ addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %esi -+ cmpl $256, %ebx # imm = 0x100 -+ ja .LBB0_251 -+# %bb.1: -+ leal -1(%ebx), %edi -+ cmpl $255, %edi -+ ja .LBB0_270 -+# %bb.2: -+ addl .LJTI0_1@GOTOFF(%esi,%edi,4), %esi -+ leal (%eax,%ebx), %edx -+ addl %ebx, %ecx -+ jmpl *%esi -+.LBB0_251: -+ movl %eax, %ebp -+ vmovups (%ecx), %ymm0 -+ movl %ebx, %edi -+ negl %ebp -+ andl $31, %ebp -+ subl %ebp, %edi -+ addl %ebp, %ecx -+ leal (%eax,%ebp), %edx -+ cmpl $2097152, %edi # imm = 0x200000 -+ vmovups %ymm0, (%eax) -+ ja .LBB0_256 -+# %bb.252: -+ cmpl $256, %edi # imm = 0x100 -+ jb .LBB0_260 -+# %bb.253: -+ subl %ebp, %ebx -+ .p2align 4, 0x90 -+.LBB0_254: # =>This Inner Loop Header: Depth=1 -+ vmovups (%ecx), %ymm0 -+ vmovups 32(%ecx), %ymm1 -+ vmovups 64(%ecx), %ymm2 -+ vmovups 96(%ecx), %ymm3 -+ vmovups 128(%ecx), %ymm4 -+ vmovups 160(%ecx), %ymm5 -+ vmovups 192(%ecx), %ymm6 -+ vmovups 224(%ecx), %ymm7 -+ prefetchnta 512(%ecx) -+ addl $-256, %edi -+ addl $256, %ecx # imm = 0x100 -+ vmovups %ymm0, (%edx) -+ vmovups %ymm1, 32(%edx) -+ vmovups %ymm2, 64(%edx) -+ vmovups %ymm3, 96(%edx) -+ vmovups %ymm4, 128(%edx) -+ vmovups %ymm5, 160(%edx) -+ vmovups %ymm6, 192(%edx) -+ vmovups %ymm7, 224(%edx) -+ addl $256, %edx # imm = 0x100 -+ cmpl $255, %edi -+ ja .LBB0_254 -+# %bb.255: -+ movzbl %bl, %edi -+ leal -1(%edi), %ebx -+ cmpl $255, %ebx 
-+ jbe .LBB0_261 -+ jmp .LBB0_270 -+.LBB0_256: -+ prefetchnta (%ecx) -+ subl %ebp, %ebx -+ testb $31, %cl -+ je .LBB0_257 -+ .p2align 4, 0x90 -+.LBB0_258: # =>This Inner Loop Header: Depth=1 -+ vmovups (%ecx), %ymm0 -+ vmovups 32(%ecx), %ymm1 -+ vmovups 64(%ecx), %ymm2 -+ vmovups 96(%ecx), %ymm3 -+ vmovups 128(%ecx), %ymm4 -+ vmovups 160(%ecx), %ymm5 -+ vmovups 192(%ecx), %ymm6 -+ vmovups 224(%ecx), %ymm7 -+ prefetchnta 512(%ecx) -+ addl $-256, %edi -+ addl $256, %ecx # imm = 0x100 -+ vmovntps %ymm0, (%edx) -+ vmovntps %ymm1, 32(%edx) -+ vmovntps %ymm2, 64(%edx) -+ vmovntps %ymm3, 96(%edx) -+ vmovntps %ymm4, 128(%edx) -+ vmovntps %ymm5, 160(%edx) -+ vmovntps %ymm6, 192(%edx) -+ vmovntps %ymm7, 224(%edx) -+ addl $256, %edx # imm = 0x100 -+ cmpl $255, %edi -+ ja .LBB0_258 -+ jmp .LBB0_259 -+ .p2align 4, 0x90 -+.LBB0_257: # =>This Inner Loop Header: Depth=1 -+ vmovaps (%ecx), %ymm0 -+ vmovaps 32(%ecx), %ymm1 -+ vmovaps 64(%ecx), %ymm2 -+ vmovaps 96(%ecx), %ymm3 -+ vmovaps 128(%ecx), %ymm4 -+ vmovaps 160(%ecx), %ymm5 -+ vmovaps 192(%ecx), %ymm6 -+ vmovaps 224(%ecx), %ymm7 -+ prefetchnta 512(%ecx) -+ addl $-256, %edi -+ addl $256, %ecx # imm = 0x100 -+ vmovntps %ymm0, (%edx) -+ vmovntps %ymm1, 32(%edx) -+ vmovntps %ymm2, 64(%edx) -+ vmovntps %ymm3, 96(%edx) -+ vmovntps %ymm4, 128(%edx) -+ vmovntps %ymm5, 160(%edx) -+ vmovntps %ymm6, 192(%edx) -+ vmovntps %ymm7, 224(%edx) -+ addl $256, %edx # imm = 0x100 -+ cmpl $255, %edi -+ ja .LBB0_257 -+.LBB0_259: -+ sfence -+ movzbl %bl, %edi -+.LBB0_260: -+ leal -1(%edi), %ebx -+ cmpl $255, %ebx -+ ja .LBB0_270 -+.LBB0_261: -+ addl .LJTI0_0@GOTOFF(%esi,%ebx,4), %esi -+ addl %edi, %edx -+ addl %edi, %ecx -+ jmpl *%esi -+.LBB0_11: -+ vmovups -131(%ecx), %ymm0 -+ vmovups %ymm0, -131(%edx) -+ vmovups -99(%ecx), %ymm0 -+ vmovups %ymm0, -99(%edx) -+ vmovups -67(%ecx), %ymm0 -+ vmovups %ymm0, -67(%edx) -+ vmovups -35(%ecx), %ymm0 -+ vmovups %ymm0, -35(%edx) -+.LBB0_12: -+ movzwl -3(%ecx), %esi -+ movw %si, -3(%edx) -+ jmp .LBB0_6 
-+.LBB0_17: -+ vmovups -133(%ecx), %ymm0 -+ vmovups %ymm0, -133(%edx) -+ vmovups -101(%ecx), %ymm0 -+ vmovups %ymm0, -101(%edx) -+ vmovups -69(%ecx), %ymm0 -+ vmovups %ymm0, -69(%edx) -+ vmovups -37(%ecx), %ymm0 -+ vmovups %ymm0, -37(%edx) -+.LBB0_18: -+ movl -5(%ecx), %esi -+ movl %esi, -5(%edx) -+ jmp .LBB0_6 -+.LBB0_19: -+ vmovups -134(%ecx), %ymm0 -+ vmovups %ymm0, -134(%edx) -+ vmovups -102(%ecx), %ymm0 -+ vmovups %ymm0, -102(%edx) -+ vmovups -70(%ecx), %ymm0 -+ vmovups %ymm0, -70(%edx) -+ vmovups -38(%ecx), %ymm0 -+ vmovups %ymm0, -38(%edx) -+.LBB0_20: -+ movl -6(%ecx), %esi -+ movl %esi, -6(%edx) -+ jmp .LBB0_10 -+.LBB0_21: -+ vmovups -135(%ecx), %ymm0 -+ vmovups %ymm0, -135(%edx) -+ vmovups -103(%ecx), %ymm0 -+ vmovups %ymm0, -103(%edx) -+ vmovups -71(%ecx), %ymm0 -+ vmovups %ymm0, -71(%edx) -+ vmovups -39(%ecx), %ymm0 -+ vmovups %ymm0, -39(%edx) -+.LBB0_22: -+ movl -7(%ecx), %esi -+ movl %esi, -7(%edx) -+ jmp .LBB0_16 -+.LBB0_27: -+ vmovups -137(%ecx), %ymm0 -+ vmovups %ymm0, -137(%edx) -+ vmovups -105(%ecx), %ymm0 -+ vmovups %ymm0, -105(%edx) -+ vmovups -73(%ecx), %ymm0 -+ vmovups %ymm0, -73(%edx) -+ vmovups -41(%ecx), %ymm0 -+ vmovups %ymm0, -41(%edx) -+.LBB0_28: -+ vmovsd -9(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -9(%edx) -+ jmp .LBB0_6 -+.LBB0_29: -+ vmovups -138(%ecx), %ymm0 -+ vmovups %ymm0, -138(%edx) -+ vmovups -106(%ecx), %ymm0 -+ vmovups %ymm0, -106(%edx) -+ vmovups -74(%ecx), %ymm0 -+ vmovups %ymm0, -74(%edx) -+ vmovups -42(%ecx), %ymm0 -+ vmovups %ymm0, -42(%edx) -+.LBB0_30: -+ vmovsd -10(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -10(%edx) -+ jmp .LBB0_10 -+.LBB0_31: -+ vmovups -139(%ecx), %ymm0 -+ vmovups %ymm0, -139(%edx) -+ vmovups -107(%ecx), %ymm0 -+ vmovups %ymm0, -107(%edx) -+ vmovups -75(%ecx), %ymm0 -+ vmovups %ymm0, -75(%edx) -+ vmovups -43(%ecx), %ymm0 -+ vmovups %ymm0, -43(%edx) -+.LBB0_32: -+ vmovsd -11(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -11(%edx) -+ jmp .LBB0_16 -+.LBB0_33: -+ vmovups 
-140(%ecx), %ymm0 -+ vmovups %ymm0, -140(%edx) -+ vmovups -108(%ecx), %ymm0 -+ vmovups %ymm0, -108(%edx) -+ vmovups -76(%ecx), %ymm0 -+ vmovups %ymm0, -76(%edx) -+ vmovups -44(%ecx), %ymm0 -+ vmovups %ymm0, -44(%edx) -+.LBB0_34: -+ vmovsd -12(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -12(%edx) -+ jmp .LBB0_16 -+.LBB0_35: -+ vmovups -141(%ecx), %ymm0 -+ vmovups %ymm0, -141(%edx) -+ vmovups -109(%ecx), %ymm0 -+ vmovups %ymm0, -109(%edx) -+ vmovups -77(%ecx), %ymm0 -+ vmovups %ymm0, -77(%edx) -+ vmovups -45(%ecx), %ymm0 -+ vmovups %ymm0, -45(%edx) -+.LBB0_36: -+ vmovsd -13(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -13(%edx) -+ jmp .LBB0_26 -+.LBB0_37: -+ vmovups -142(%ecx), %ymm0 -+ vmovups %ymm0, -142(%edx) -+ vmovups -110(%ecx), %ymm0 -+ vmovups %ymm0, -110(%edx) -+ vmovups -78(%ecx), %ymm0 -+ vmovups %ymm0, -78(%edx) -+ vmovups -46(%ecx), %ymm0 -+ vmovups %ymm0, -46(%edx) -+.LBB0_38: -+ vmovsd -14(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -14(%edx) -+ jmp .LBB0_26 -+.LBB0_39: -+ vmovups -143(%ecx), %ymm0 -+ vmovups %ymm0, -143(%edx) -+ vmovups -111(%ecx), %ymm0 -+ vmovups %ymm0, -111(%edx) -+ vmovups -79(%ecx), %ymm0 -+ vmovups %ymm0, -79(%edx) -+ vmovups -47(%ecx), %ymm0 -+ vmovups %ymm0, -47(%edx) -+.LBB0_40: -+ vmovsd -15(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -15(%edx) -+ jmp .LBB0_26 -+.LBB0_45: -+ vmovups -145(%ecx), %ymm0 -+ vmovups %ymm0, -145(%edx) -+ vmovups -113(%ecx), %ymm0 -+ vmovups %ymm0, -113(%edx) -+ vmovups -81(%ecx), %ymm0 -+ vmovups %ymm0, -81(%edx) -+ vmovups -49(%ecx), %ymm0 -+ vmovups %ymm0, -49(%edx) -+.LBB0_46: -+ vmovups -17(%ecx), %xmm0 -+ vmovups %xmm0, -17(%edx) -+ jmp .LBB0_6 -+.LBB0_47: -+ vmovups -146(%ecx), %ymm0 -+ vmovups %ymm0, -146(%edx) -+ vmovups -114(%ecx), %ymm0 -+ vmovups %ymm0, -114(%edx) -+ vmovups -82(%ecx), %ymm0 -+ vmovups %ymm0, -82(%edx) -+ vmovups -50(%ecx), %ymm0 -+ vmovups %ymm0, -50(%edx) -+.LBB0_48: -+ vmovups -18(%ecx), %xmm0 -+ vmovups %xmm0, -18(%edx) -+ jmp .LBB0_10 
-+.LBB0_49: -+ vmovups -147(%ecx), %ymm0 -+ vmovups %ymm0, -147(%edx) -+ vmovups -115(%ecx), %ymm0 -+ vmovups %ymm0, -115(%edx) -+ vmovups -83(%ecx), %ymm0 -+ vmovups %ymm0, -83(%edx) -+ vmovups -51(%ecx), %ymm0 -+ vmovups %ymm0, -51(%edx) -+.LBB0_50: -+ vmovups -19(%ecx), %xmm0 -+ vmovups %xmm0, -19(%edx) -+ jmp .LBB0_16 -+.LBB0_51: -+ vmovups -148(%ecx), %ymm0 -+ vmovups %ymm0, -148(%edx) -+ vmovups -116(%ecx), %ymm0 -+ vmovups %ymm0, -116(%edx) -+ vmovups -84(%ecx), %ymm0 -+ vmovups %ymm0, -84(%edx) -+ vmovups -52(%ecx), %ymm0 -+ vmovups %ymm0, -52(%edx) -+.LBB0_52: -+ vmovups -20(%ecx), %xmm0 -+ vmovups %xmm0, -20(%edx) -+ jmp .LBB0_16 -+.LBB0_53: -+ vmovups -149(%ecx), %ymm0 -+ vmovups %ymm0, -149(%edx) -+ vmovups -117(%ecx), %ymm0 -+ vmovups %ymm0, -117(%edx) -+ vmovups -85(%ecx), %ymm0 -+ vmovups %ymm0, -85(%edx) -+ vmovups -53(%ecx), %ymm0 -+ vmovups %ymm0, -53(%edx) -+.LBB0_54: -+ vmovups -21(%ecx), %xmm0 -+ vmovups %xmm0, -21(%edx) -+ jmp .LBB0_26 -+.LBB0_55: -+ vmovups -150(%ecx), %ymm0 -+ vmovups %ymm0, -150(%edx) -+ vmovups -118(%ecx), %ymm0 -+ vmovups %ymm0, -118(%edx) -+ vmovups -86(%ecx), %ymm0 -+ vmovups %ymm0, -86(%edx) -+ vmovups -54(%ecx), %ymm0 -+ vmovups %ymm0, -54(%edx) -+.LBB0_56: -+ vmovups -22(%ecx), %xmm0 -+ vmovups %xmm0, -22(%edx) -+ jmp .LBB0_26 -+.LBB0_57: -+ vmovups -151(%ecx), %ymm0 -+ vmovups %ymm0, -151(%edx) -+ vmovups -119(%ecx), %ymm0 -+ vmovups %ymm0, -119(%edx) -+ vmovups -87(%ecx), %ymm0 -+ vmovups %ymm0, -87(%edx) -+ vmovups -55(%ecx), %ymm0 -+ vmovups %ymm0, -55(%edx) -+.LBB0_58: -+ vmovups -23(%ecx), %xmm0 -+ vmovups %xmm0, -23(%edx) -+ jmp .LBB0_26 -+.LBB0_59: -+ vmovups -152(%ecx), %ymm0 -+ vmovups %ymm0, -152(%edx) -+ vmovups -120(%ecx), %ymm0 -+ vmovups %ymm0, -120(%edx) -+ vmovups -88(%ecx), %ymm0 -+ vmovups %ymm0, -88(%edx) -+ vmovups -56(%ecx), %ymm0 -+ vmovups %ymm0, -56(%edx) -+.LBB0_60: -+ vmovups -24(%ecx), %xmm0 -+ vmovups %xmm0, -24(%edx) -+ jmp .LBB0_26 -+.LBB0_61: -+ vmovups -153(%ecx), %ymm0 -+ vmovups 
%ymm0, -153(%edx) -+ vmovups -121(%ecx), %ymm0 -+ vmovups %ymm0, -121(%edx) -+ vmovups -89(%ecx), %ymm0 -+ vmovups %ymm0, -89(%edx) -+ vmovups -57(%ecx), %ymm0 -+ vmovups %ymm0, -57(%edx) -+.LBB0_62: -+ vmovups -25(%ecx), %xmm0 -+ vmovups %xmm0, -25(%edx) -+ jmp .LBB0_44 -+.LBB0_63: -+ vmovups -154(%ecx), %ymm0 -+ vmovups %ymm0, -154(%edx) -+ vmovups -122(%ecx), %ymm0 -+ vmovups %ymm0, -122(%edx) -+ vmovups -90(%ecx), %ymm0 -+ vmovups %ymm0, -90(%edx) -+ vmovups -58(%ecx), %ymm0 -+ vmovups %ymm0, -58(%edx) -+.LBB0_64: -+ vmovups -26(%ecx), %xmm0 -+ vmovups %xmm0, -26(%edx) -+ jmp .LBB0_44 -+.LBB0_65: -+ vmovups -155(%ecx), %ymm0 -+ vmovups %ymm0, -155(%edx) -+ vmovups -123(%ecx), %ymm0 -+ vmovups %ymm0, -123(%edx) -+ vmovups -91(%ecx), %ymm0 -+ vmovups %ymm0, -91(%edx) -+ vmovups -59(%ecx), %ymm0 -+ vmovups %ymm0, -59(%edx) -+.LBB0_66: -+ vmovups -27(%ecx), %xmm0 -+ vmovups %xmm0, -27(%edx) -+ jmp .LBB0_44 -+.LBB0_67: -+ vmovups -156(%ecx), %ymm0 -+ vmovups %ymm0, -156(%edx) -+ vmovups -124(%ecx), %ymm0 -+ vmovups %ymm0, -124(%edx) -+ vmovups -92(%ecx), %ymm0 -+ vmovups %ymm0, -92(%edx) -+ vmovups -60(%ecx), %ymm0 -+ vmovups %ymm0, -60(%edx) -+.LBB0_68: -+ vmovups -28(%ecx), %xmm0 -+ vmovups %xmm0, -28(%edx) -+ jmp .LBB0_44 -+.LBB0_69: -+ vmovups -157(%ecx), %ymm0 -+ vmovups %ymm0, -157(%edx) -+ vmovups -125(%ecx), %ymm0 -+ vmovups %ymm0, -125(%edx) -+ vmovups -93(%ecx), %ymm0 -+ vmovups %ymm0, -93(%edx) -+ vmovups -61(%ecx), %ymm0 -+ vmovups %ymm0, -61(%edx) -+.LBB0_70: -+ vmovups -29(%ecx), %xmm0 -+ vmovups %xmm0, -29(%edx) -+ jmp .LBB0_44 -+.LBB0_71: -+ vmovups -158(%ecx), %ymm0 -+ vmovups %ymm0, -158(%edx) -+ vmovups -126(%ecx), %ymm0 -+ vmovups %ymm0, -126(%edx) -+ vmovups -94(%ecx), %ymm0 -+ vmovups %ymm0, -94(%edx) -+ vmovups -62(%ecx), %ymm0 -+ vmovups %ymm0, -62(%edx) -+.LBB0_72: -+ vmovups -30(%ecx), %xmm0 -+ vmovups %xmm0, -30(%edx) -+ jmp .LBB0_44 -+.LBB0_73: -+ vmovups -159(%ecx), %ymm0 -+ vmovups %ymm0, -159(%edx) -+ vmovups -127(%ecx), %ymm0 -+ 
vmovups %ymm0, -127(%edx) -+ vmovups -95(%ecx), %ymm0 -+ vmovups %ymm0, -95(%edx) -+ vmovups -63(%ecx), %ymm0 -+ vmovups %ymm0, -63(%edx) -+.LBB0_74: -+ vmovups -31(%ecx), %xmm0 -+ vmovups %xmm0, -31(%edx) -+ jmp .LBB0_44 -+.LBB0_75: -+ vmovups -193(%ecx), %ymm0 -+ vmovups %ymm0, -193(%edx) -+.LBB0_76: -+ vmovups -161(%ecx), %ymm0 -+ vmovups %ymm0, -161(%edx) -+.LBB0_3: -+ vmovups -129(%ecx), %ymm0 -+ vmovups %ymm0, -129(%edx) -+ vmovups -97(%ecx), %ymm0 -+ vmovups %ymm0, -97(%edx) -+.LBB0_4: -+ vmovups -65(%ecx), %ymm0 -+ vmovups %ymm0, -65(%edx) -+.LBB0_5: -+ vmovups -33(%ecx), %ymm0 -+ vmovups %ymm0, -33(%edx) -+.LBB0_6: -+ movb -1(%ecx), %cl -+ movb %cl, -1(%edx) -+ jmp .LBB0_270 -+.LBB0_77: -+ vmovups -194(%ecx), %ymm0 -+ vmovups %ymm0, -194(%edx) -+.LBB0_78: -+ vmovups -162(%ecx), %ymm0 -+ vmovups %ymm0, -162(%edx) -+.LBB0_7: -+ vmovups -130(%ecx), %ymm0 -+ vmovups %ymm0, -130(%edx) -+ vmovups -98(%ecx), %ymm0 -+ vmovups %ymm0, -98(%edx) -+.LBB0_8: -+ vmovups -66(%ecx), %ymm0 -+ vmovups %ymm0, -66(%edx) -+.LBB0_9: -+ vmovups -34(%ecx), %ymm0 -+ vmovups %ymm0, -34(%edx) -+.LBB0_10: -+ movzwl -2(%ecx), %ecx -+ movw %cx, -2(%edx) -+ jmp .LBB0_270 -+.LBB0_79: -+ vmovups -195(%ecx), %ymm0 -+ vmovups %ymm0, -195(%edx) -+.LBB0_80: -+ vmovups -163(%ecx), %ymm0 -+ vmovups %ymm0, -163(%edx) -+ vmovups -131(%ecx), %ymm0 -+ vmovups %ymm0, -131(%edx) -+ vmovups -99(%ecx), %ymm0 -+ vmovups %ymm0, -99(%edx) -+.LBB0_81: -+ vmovups -67(%ecx), %ymm0 -+ vmovups %ymm0, -67(%edx) -+.LBB0_82: -+ vmovups -35(%ecx), %ymm0 -+ vmovups %ymm0, -35(%edx) -+ jmp .LBB0_16 -+.LBB0_83: -+ vmovups -196(%ecx), %ymm0 -+ vmovups %ymm0, -196(%edx) -+.LBB0_84: -+ vmovups -164(%ecx), %ymm0 -+ vmovups %ymm0, -164(%edx) -+.LBB0_13: -+ vmovups -132(%ecx), %ymm0 -+ vmovups %ymm0, -132(%edx) -+ vmovups -100(%ecx), %ymm0 -+ vmovups %ymm0, -100(%edx) -+.LBB0_14: -+ vmovups -68(%ecx), %ymm0 -+ vmovups %ymm0, -68(%edx) -+.LBB0_15: -+ vmovups -36(%ecx), %ymm0 -+ vmovups %ymm0, -36(%edx) -+.LBB0_16: -+ movl 
-4(%ecx), %ecx -+ movl %ecx, -4(%edx) -+ jmp .LBB0_270 -+.LBB0_85: -+ vmovups -197(%ecx), %ymm0 -+ vmovups %ymm0, -197(%edx) -+.LBB0_86: -+ vmovups -165(%ecx), %ymm0 -+ vmovups %ymm0, -165(%edx) -+ vmovups -133(%ecx), %ymm0 -+ vmovups %ymm0, -133(%edx) -+ vmovups -101(%ecx), %ymm0 -+ vmovups %ymm0, -101(%edx) -+.LBB0_87: -+ vmovups -69(%ecx), %ymm0 -+ vmovups %ymm0, -69(%edx) -+.LBB0_88: -+ vmovups -37(%ecx), %ymm0 -+ vmovups %ymm0, -37(%edx) -+ jmp .LBB0_26 -+.LBB0_89: -+ vmovups -198(%ecx), %ymm0 -+ vmovups %ymm0, -198(%edx) -+.LBB0_90: -+ vmovups -166(%ecx), %ymm0 -+ vmovups %ymm0, -166(%edx) -+ vmovups -134(%ecx), %ymm0 -+ vmovups %ymm0, -134(%edx) -+ vmovups -102(%ecx), %ymm0 -+ vmovups %ymm0, -102(%edx) -+.LBB0_91: -+ vmovups -70(%ecx), %ymm0 -+ vmovups %ymm0, -70(%edx) -+.LBB0_92: -+ vmovups -38(%ecx), %ymm0 -+ vmovups %ymm0, -38(%edx) -+ jmp .LBB0_26 -+.LBB0_93: -+ vmovups -199(%ecx), %ymm0 -+ vmovups %ymm0, -199(%edx) -+.LBB0_94: -+ vmovups -167(%ecx), %ymm0 -+ vmovups %ymm0, -167(%edx) -+ vmovups -135(%ecx), %ymm0 -+ vmovups %ymm0, -135(%edx) -+ vmovups -103(%ecx), %ymm0 -+ vmovups %ymm0, -103(%edx) -+.LBB0_95: -+ vmovups -71(%ecx), %ymm0 -+ vmovups %ymm0, -71(%edx) -+.LBB0_96: -+ vmovups -39(%ecx), %ymm0 -+ vmovups %ymm0, -39(%edx) -+ jmp .LBB0_26 -+.LBB0_97: -+ vmovups -200(%ecx), %ymm0 -+ vmovups %ymm0, -200(%edx) -+.LBB0_98: -+ vmovups -168(%ecx), %ymm0 -+ vmovups %ymm0, -168(%edx) -+.LBB0_23: -+ vmovups -136(%ecx), %ymm0 -+ vmovups %ymm0, -136(%edx) -+ vmovups -104(%ecx), %ymm0 -+ vmovups %ymm0, -104(%edx) -+.LBB0_24: -+ vmovups -72(%ecx), %ymm0 -+ vmovups %ymm0, -72(%edx) -+.LBB0_25: -+ vmovups -40(%ecx), %ymm0 -+ vmovups %ymm0, -40(%edx) -+.LBB0_26: -+ vmovsd -8(%ecx), %xmm0 # xmm0 = mem[0],zero -+ vmovsd %xmm0, -8(%edx) -+ jmp .LBB0_270 -+.LBB0_99: -+ vmovups -201(%ecx), %ymm0 -+ vmovups %ymm0, -201(%edx) -+.LBB0_100: -+ vmovups -169(%ecx), %ymm0 -+ vmovups %ymm0, -169(%edx) -+ vmovups -137(%ecx), %ymm0 -+ vmovups %ymm0, -137(%edx) -+ vmovups 
-105(%ecx), %ymm0 -+ vmovups %ymm0, -105(%edx) -+.LBB0_101: -+ vmovups -73(%ecx), %ymm0 -+ vmovups %ymm0, -73(%edx) -+.LBB0_102: -+ vmovups -41(%ecx), %ymm0 -+ vmovups %ymm0, -41(%edx) -+ jmp .LBB0_44 -+.LBB0_103: -+ vmovups -202(%ecx), %ymm0 -+ vmovups %ymm0, -202(%edx) -+.LBB0_104: -+ vmovups -170(%ecx), %ymm0 -+ vmovups %ymm0, -170(%edx) -+ vmovups -138(%ecx), %ymm0 -+ vmovups %ymm0, -138(%edx) -+ vmovups -106(%ecx), %ymm0 -+ vmovups %ymm0, -106(%edx) -+.LBB0_105: -+ vmovups -74(%ecx), %ymm0 -+ vmovups %ymm0, -74(%edx) -+.LBB0_106: -+ vmovups -42(%ecx), %ymm0 -+ vmovups %ymm0, -42(%edx) -+ jmp .LBB0_44 -+.LBB0_107: -+ vmovups -203(%ecx), %ymm0 -+ vmovups %ymm0, -203(%edx) -+.LBB0_108: -+ vmovups -171(%ecx), %ymm0 -+ vmovups %ymm0, -171(%edx) -+ vmovups -139(%ecx), %ymm0 -+ vmovups %ymm0, -139(%edx) -+ vmovups -107(%ecx), %ymm0 -+ vmovups %ymm0, -107(%edx) -+.LBB0_109: -+ vmovups -75(%ecx), %ymm0 -+ vmovups %ymm0, -75(%edx) -+.LBB0_110: -+ vmovups -43(%ecx), %ymm0 -+ vmovups %ymm0, -43(%edx) -+ jmp .LBB0_44 -+.LBB0_111: -+ vmovups -204(%ecx), %ymm0 -+ vmovups %ymm0, -204(%edx) -+.LBB0_112: -+ vmovups -172(%ecx), %ymm0 -+ vmovups %ymm0, -172(%edx) -+ vmovups -140(%ecx), %ymm0 -+ vmovups %ymm0, -140(%edx) -+ vmovups -108(%ecx), %ymm0 -+ vmovups %ymm0, -108(%edx) -+.LBB0_113: -+ vmovups -76(%ecx), %ymm0 -+ vmovups %ymm0, -76(%edx) -+.LBB0_114: -+ vmovups -44(%ecx), %ymm0 -+ vmovups %ymm0, -44(%edx) -+ jmp .LBB0_44 -+.LBB0_115: -+ vmovups -205(%ecx), %ymm0 -+ vmovups %ymm0, -205(%edx) -+.LBB0_116: -+ vmovups -173(%ecx), %ymm0 -+ vmovups %ymm0, -173(%edx) -+ vmovups -141(%ecx), %ymm0 -+ vmovups %ymm0, -141(%edx) -+ vmovups -109(%ecx), %ymm0 -+ vmovups %ymm0, -109(%edx) -+.LBB0_117: -+ vmovups -77(%ecx), %ymm0 -+ vmovups %ymm0, -77(%edx) -+.LBB0_118: -+ vmovups -45(%ecx), %ymm0 -+ vmovups %ymm0, -45(%edx) -+ jmp .LBB0_44 -+.LBB0_119: -+ vmovups -206(%ecx), %ymm0 -+ vmovups %ymm0, -206(%edx) -+.LBB0_120: -+ vmovups -174(%ecx), %ymm0 -+ vmovups %ymm0, -174(%edx) -+ 
vmovups -142(%ecx), %ymm0 -+ vmovups %ymm0, -142(%edx) -+ vmovups -110(%ecx), %ymm0 -+ vmovups %ymm0, -110(%edx) -+.LBB0_121: -+ vmovups -78(%ecx), %ymm0 -+ vmovups %ymm0, -78(%edx) -+.LBB0_122: -+ vmovups -46(%ecx), %ymm0 -+ vmovups %ymm0, -46(%edx) -+ jmp .LBB0_44 -+.LBB0_123: -+ vmovups -207(%ecx), %ymm0 -+ vmovups %ymm0, -207(%edx) -+.LBB0_124: -+ vmovups -175(%ecx), %ymm0 -+ vmovups %ymm0, -175(%edx) -+ vmovups -143(%ecx), %ymm0 -+ vmovups %ymm0, -143(%edx) -+ vmovups -111(%ecx), %ymm0 -+ vmovups %ymm0, -111(%edx) -+.LBB0_125: -+ vmovups -79(%ecx), %ymm0 -+ vmovups %ymm0, -79(%edx) -+.LBB0_126: -+ vmovups -47(%ecx), %ymm0 -+ vmovups %ymm0, -47(%edx) -+ jmp .LBB0_44 -+.LBB0_127: -+ vmovups -208(%ecx), %ymm0 -+ vmovups %ymm0, -208(%edx) -+.LBB0_128: -+ vmovups -176(%ecx), %ymm0 -+ vmovups %ymm0, -176(%edx) -+.LBB0_41: -+ vmovups -144(%ecx), %ymm0 -+ vmovups %ymm0, -144(%edx) -+ vmovups -112(%ecx), %ymm0 -+ vmovups %ymm0, -112(%edx) -+.LBB0_42: -+ vmovups -80(%ecx), %ymm0 -+ vmovups %ymm0, -80(%edx) -+.LBB0_43: -+ vmovups -48(%ecx), %ymm0 -+ vmovups %ymm0, -48(%edx) -+.LBB0_44: -+ vmovups -16(%ecx), %xmm0 -+ vmovups %xmm0, -16(%edx) -+ jmp .LBB0_270 -+.LBB0_129: -+ vmovups -209(%ecx), %ymm0 -+ vmovups %ymm0, -209(%edx) -+.LBB0_130: -+ vmovups -177(%ecx), %ymm0 -+ vmovups %ymm0, -177(%edx) -+ vmovups -145(%ecx), %ymm0 -+ vmovups %ymm0, -145(%edx) -+ vmovups -113(%ecx), %ymm0 -+ vmovups %ymm0, -113(%edx) -+.LBB0_131: -+ vmovups -81(%ecx), %ymm0 -+ vmovups %ymm0, -81(%edx) -+.LBB0_132: -+ vmovups -49(%ecx), %ymm0 -+ vmovups %ymm0, -49(%edx) -+ jmp .LBB0_269 -+.LBB0_133: -+ vmovups -210(%ecx), %ymm0 -+ vmovups %ymm0, -210(%edx) -+.LBB0_134: -+ vmovups -178(%ecx), %ymm0 -+ vmovups %ymm0, -178(%edx) -+ vmovups -146(%ecx), %ymm0 -+ vmovups %ymm0, -146(%edx) -+ vmovups -114(%ecx), %ymm0 -+ vmovups %ymm0, -114(%edx) -+.LBB0_135: -+ vmovups -82(%ecx), %ymm0 -+ vmovups %ymm0, -82(%edx) -+.LBB0_136: -+ vmovups -50(%ecx), %ymm0 -+ vmovups %ymm0, -50(%edx) -+ jmp .LBB0_269 
-+.LBB0_137: -+ vmovups -211(%ecx), %ymm0 -+ vmovups %ymm0, -211(%edx) -+.LBB0_138: -+ vmovups -179(%ecx), %ymm0 -+ vmovups %ymm0, -179(%edx) -+ vmovups -147(%ecx), %ymm0 -+ vmovups %ymm0, -147(%edx) -+ vmovups -115(%ecx), %ymm0 -+ vmovups %ymm0, -115(%edx) -+.LBB0_139: -+ vmovups -83(%ecx), %ymm0 -+ vmovups %ymm0, -83(%edx) -+.LBB0_140: -+ vmovups -51(%ecx), %ymm0 -+ vmovups %ymm0, -51(%edx) -+ jmp .LBB0_269 -+.LBB0_141: -+ vmovups -212(%ecx), %ymm0 -+ vmovups %ymm0, -212(%edx) -+.LBB0_142: -+ vmovups -180(%ecx), %ymm0 -+ vmovups %ymm0, -180(%edx) -+ vmovups -148(%ecx), %ymm0 -+ vmovups %ymm0, -148(%edx) -+ vmovups -116(%ecx), %ymm0 -+ vmovups %ymm0, -116(%edx) -+.LBB0_143: -+ vmovups -84(%ecx), %ymm0 -+ vmovups %ymm0, -84(%edx) -+.LBB0_144: -+ vmovups -52(%ecx), %ymm0 -+ vmovups %ymm0, -52(%edx) -+ jmp .LBB0_269 -+.LBB0_145: -+ vmovups -213(%ecx), %ymm0 -+ vmovups %ymm0, -213(%edx) -+.LBB0_146: -+ vmovups -181(%ecx), %ymm0 -+ vmovups %ymm0, -181(%edx) -+ vmovups -149(%ecx), %ymm0 -+ vmovups %ymm0, -149(%edx) -+ vmovups -117(%ecx), %ymm0 -+ vmovups %ymm0, -117(%edx) -+.LBB0_147: -+ vmovups -85(%ecx), %ymm0 -+ vmovups %ymm0, -85(%edx) -+.LBB0_148: -+ vmovups -53(%ecx), %ymm0 -+ vmovups %ymm0, -53(%edx) -+ jmp .LBB0_269 -+.LBB0_149: -+ vmovups -214(%ecx), %ymm0 -+ vmovups %ymm0, -214(%edx) -+.LBB0_150: -+ vmovups -182(%ecx), %ymm0 -+ vmovups %ymm0, -182(%edx) -+ vmovups -150(%ecx), %ymm0 -+ vmovups %ymm0, -150(%edx) -+ vmovups -118(%ecx), %ymm0 -+ vmovups %ymm0, -118(%edx) -+.LBB0_151: -+ vmovups -86(%ecx), %ymm0 -+ vmovups %ymm0, -86(%edx) -+.LBB0_152: -+ vmovups -54(%ecx), %ymm0 -+ vmovups %ymm0, -54(%edx) -+ jmp .LBB0_269 -+.LBB0_153: -+ vmovups -215(%ecx), %ymm0 -+ vmovups %ymm0, -215(%edx) -+.LBB0_154: -+ vmovups -183(%ecx), %ymm0 -+ vmovups %ymm0, -183(%edx) -+ vmovups -151(%ecx), %ymm0 -+ vmovups %ymm0, -151(%edx) -+ vmovups -119(%ecx), %ymm0 -+ vmovups %ymm0, -119(%edx) -+.LBB0_155: -+ vmovups -87(%ecx), %ymm0 -+ vmovups %ymm0, -87(%edx) -+.LBB0_156: -+ 
vmovups -55(%ecx), %ymm0 -+ vmovups %ymm0, -55(%edx) -+ jmp .LBB0_269 -+.LBB0_157: -+ vmovups -216(%ecx), %ymm0 -+ vmovups %ymm0, -216(%edx) -+.LBB0_158: -+ vmovups -184(%ecx), %ymm0 -+ vmovups %ymm0, -184(%edx) -+ vmovups -152(%ecx), %ymm0 -+ vmovups %ymm0, -152(%edx) -+ vmovups -120(%ecx), %ymm0 -+ vmovups %ymm0, -120(%edx) -+.LBB0_159: -+ vmovups -88(%ecx), %ymm0 -+ vmovups %ymm0, -88(%edx) -+.LBB0_160: -+ vmovups -56(%ecx), %ymm0 -+ vmovups %ymm0, -56(%edx) -+ jmp .LBB0_269 -+.LBB0_161: -+ vmovups -217(%ecx), %ymm0 -+ vmovups %ymm0, -217(%edx) -+.LBB0_162: -+ vmovups -185(%ecx), %ymm0 -+ vmovups %ymm0, -185(%edx) -+ vmovups -153(%ecx), %ymm0 -+ vmovups %ymm0, -153(%edx) -+ vmovups -121(%ecx), %ymm0 -+ vmovups %ymm0, -121(%edx) -+.LBB0_163: -+ vmovups -89(%ecx), %ymm0 -+ vmovups %ymm0, -89(%edx) -+.LBB0_164: -+ vmovups -57(%ecx), %ymm0 -+ vmovups %ymm0, -57(%edx) -+ jmp .LBB0_269 -+.LBB0_165: -+ vmovups -218(%ecx), %ymm0 -+ vmovups %ymm0, -218(%edx) -+.LBB0_166: -+ vmovups -186(%ecx), %ymm0 -+ vmovups %ymm0, -186(%edx) -+ vmovups -154(%ecx), %ymm0 -+ vmovups %ymm0, -154(%edx) -+ vmovups -122(%ecx), %ymm0 -+ vmovups %ymm0, -122(%edx) -+.LBB0_167: -+ vmovups -90(%ecx), %ymm0 -+ vmovups %ymm0, -90(%edx) -+.LBB0_168: -+ vmovups -58(%ecx), %ymm0 -+ vmovups %ymm0, -58(%edx) -+ jmp .LBB0_269 -+.LBB0_169: -+ vmovups -219(%ecx), %ymm0 -+ vmovups %ymm0, -219(%edx) -+.LBB0_170: -+ vmovups -187(%ecx), %ymm0 -+ vmovups %ymm0, -187(%edx) -+ vmovups -155(%ecx), %ymm0 -+ vmovups %ymm0, -155(%edx) -+ vmovups -123(%ecx), %ymm0 -+ vmovups %ymm0, -123(%edx) -+.LBB0_171: -+ vmovups -91(%ecx), %ymm0 -+ vmovups %ymm0, -91(%edx) -+.LBB0_172: -+ vmovups -59(%ecx), %ymm0 -+ vmovups %ymm0, -59(%edx) -+ jmp .LBB0_269 -+.LBB0_173: -+ vmovups -220(%ecx), %ymm0 -+ vmovups %ymm0, -220(%edx) -+.LBB0_174: -+ vmovups -188(%ecx), %ymm0 -+ vmovups %ymm0, -188(%edx) -+ vmovups -156(%ecx), %ymm0 -+ vmovups %ymm0, -156(%edx) -+ vmovups -124(%ecx), %ymm0 -+ vmovups %ymm0, -124(%edx) -+.LBB0_175: -+ 
vmovups -92(%ecx), %ymm0 -+ vmovups %ymm0, -92(%edx) -+.LBB0_176: -+ vmovups -60(%ecx), %ymm0 -+ vmovups %ymm0, -60(%edx) -+ jmp .LBB0_269 -+.LBB0_177: -+ vmovups -221(%ecx), %ymm0 -+ vmovups %ymm0, -221(%edx) -+.LBB0_178: -+ vmovups -189(%ecx), %ymm0 -+ vmovups %ymm0, -189(%edx) -+ vmovups -157(%ecx), %ymm0 -+ vmovups %ymm0, -157(%edx) -+ vmovups -125(%ecx), %ymm0 -+ vmovups %ymm0, -125(%edx) -+.LBB0_179: -+ vmovups -93(%ecx), %ymm0 -+ vmovups %ymm0, -93(%edx) -+.LBB0_180: -+ vmovups -61(%ecx), %ymm0 -+ vmovups %ymm0, -61(%edx) -+ jmp .LBB0_269 -+.LBB0_181: -+ vmovups -222(%ecx), %ymm0 -+ vmovups %ymm0, -222(%edx) -+.LBB0_182: -+ vmovups -190(%ecx), %ymm0 -+ vmovups %ymm0, -190(%edx) -+ vmovups -158(%ecx), %ymm0 -+ vmovups %ymm0, -158(%edx) -+ vmovups -126(%ecx), %ymm0 -+ vmovups %ymm0, -126(%edx) -+.LBB0_183: -+ vmovups -94(%ecx), %ymm0 -+ vmovups %ymm0, -94(%edx) -+.LBB0_184: -+ vmovups -62(%ecx), %ymm0 -+ vmovups %ymm0, -62(%edx) -+ jmp .LBB0_269 -+.LBB0_185: -+ vmovups -223(%ecx), %ymm0 -+ vmovups %ymm0, -223(%edx) -+.LBB0_186: -+ vmovups -191(%ecx), %ymm0 -+ vmovups %ymm0, -191(%edx) -+ vmovups -159(%ecx), %ymm0 -+ vmovups %ymm0, -159(%edx) -+ vmovups -127(%ecx), %ymm0 -+ vmovups %ymm0, -127(%edx) -+.LBB0_187: -+ vmovups -95(%ecx), %ymm0 -+ vmovups %ymm0, -95(%edx) -+.LBB0_188: -+ vmovups -63(%ecx), %ymm0 -+ vmovups %ymm0, -63(%edx) -+ jmp .LBB0_269 -+.LBB0_189: -+ vmovups -225(%ecx), %ymm0 -+ vmovups %ymm0, -225(%edx) -+ vmovups -193(%ecx), %ymm0 -+ vmovups %ymm0, -193(%edx) -+ vmovups -161(%ecx), %ymm0 -+ vmovups %ymm0, -161(%edx) -+ vmovups -129(%ecx), %ymm0 -+ vmovups %ymm0, -129(%edx) -+.LBB0_190: -+ vmovups -97(%ecx), %ymm0 -+ vmovups %ymm0, -97(%edx) -+ vmovups -65(%ecx), %ymm0 -+ vmovups %ymm0, -65(%edx) -+ jmp .LBB0_268 -+.LBB0_191: -+ vmovups -226(%ecx), %ymm0 -+ vmovups %ymm0, -226(%edx) -+ vmovups -194(%ecx), %ymm0 -+ vmovups %ymm0, -194(%edx) -+ vmovups -162(%ecx), %ymm0 -+ vmovups %ymm0, -162(%edx) -+ vmovups -130(%ecx), %ymm0 -+ vmovups %ymm0, 
-130(%edx) -+.LBB0_192: -+ vmovups -98(%ecx), %ymm0 -+ vmovups %ymm0, -98(%edx) -+ vmovups -66(%ecx), %ymm0 -+ vmovups %ymm0, -66(%edx) -+ jmp .LBB0_268 -+.LBB0_193: -+ vmovups -227(%ecx), %ymm0 -+ vmovups %ymm0, -227(%edx) -+ vmovups -195(%ecx), %ymm0 -+ vmovups %ymm0, -195(%edx) -+ vmovups -163(%ecx), %ymm0 -+ vmovups %ymm0, -163(%edx) -+ vmovups -131(%ecx), %ymm0 -+ vmovups %ymm0, -131(%edx) -+.LBB0_194: -+ vmovups -99(%ecx), %ymm0 -+ vmovups %ymm0, -99(%edx) -+ vmovups -67(%ecx), %ymm0 -+ vmovups %ymm0, -67(%edx) -+ jmp .LBB0_268 -+.LBB0_195: -+ vmovups -228(%ecx), %ymm0 -+ vmovups %ymm0, -228(%edx) -+ vmovups -196(%ecx), %ymm0 -+ vmovups %ymm0, -196(%edx) -+ vmovups -164(%ecx), %ymm0 -+ vmovups %ymm0, -164(%edx) -+ vmovups -132(%ecx), %ymm0 -+ vmovups %ymm0, -132(%edx) -+.LBB0_196: -+ vmovups -100(%ecx), %ymm0 -+ vmovups %ymm0, -100(%edx) -+ vmovups -68(%ecx), %ymm0 -+ vmovups %ymm0, -68(%edx) -+ jmp .LBB0_268 -+.LBB0_197: -+ vmovups -229(%ecx), %ymm0 -+ vmovups %ymm0, -229(%edx) -+ vmovups -197(%ecx), %ymm0 -+ vmovups %ymm0, -197(%edx) -+ vmovups -165(%ecx), %ymm0 -+ vmovups %ymm0, -165(%edx) -+ vmovups -133(%ecx), %ymm0 -+ vmovups %ymm0, -133(%edx) -+.LBB0_198: -+ vmovups -101(%ecx), %ymm0 -+ vmovups %ymm0, -101(%edx) -+ vmovups -69(%ecx), %ymm0 -+ vmovups %ymm0, -69(%edx) -+ jmp .LBB0_268 -+.LBB0_199: -+ vmovups -230(%ecx), %ymm0 -+ vmovups %ymm0, -230(%edx) -+ vmovups -198(%ecx), %ymm0 -+ vmovups %ymm0, -198(%edx) -+ vmovups -166(%ecx), %ymm0 -+ vmovups %ymm0, -166(%edx) -+ vmovups -134(%ecx), %ymm0 -+ vmovups %ymm0, -134(%edx) -+.LBB0_200: -+ vmovups -102(%ecx), %ymm0 -+ vmovups %ymm0, -102(%edx) -+ vmovups -70(%ecx), %ymm0 -+ vmovups %ymm0, -70(%edx) -+ jmp .LBB0_268 -+.LBB0_201: -+ vmovups -231(%ecx), %ymm0 -+ vmovups %ymm0, -231(%edx) -+ vmovups -199(%ecx), %ymm0 -+ vmovups %ymm0, -199(%edx) -+ vmovups -167(%ecx), %ymm0 -+ vmovups %ymm0, -167(%edx) -+ vmovups -135(%ecx), %ymm0 -+ vmovups %ymm0, -135(%edx) -+.LBB0_202: -+ vmovups -103(%ecx), %ymm0 -+ 
vmovups %ymm0, -103(%edx) -+ vmovups -71(%ecx), %ymm0 -+ vmovups %ymm0, -71(%edx) -+ jmp .LBB0_268 -+.LBB0_203: -+ vmovups -232(%ecx), %ymm0 -+ vmovups %ymm0, -232(%edx) -+ vmovups -200(%ecx), %ymm0 -+ vmovups %ymm0, -200(%edx) -+ vmovups -168(%ecx), %ymm0 -+ vmovups %ymm0, -168(%edx) -+ vmovups -136(%ecx), %ymm0 -+ vmovups %ymm0, -136(%edx) -+.LBB0_204: -+ vmovups -104(%ecx), %ymm0 -+ vmovups %ymm0, -104(%edx) -+ vmovups -72(%ecx), %ymm0 -+ vmovups %ymm0, -72(%edx) -+ jmp .LBB0_268 -+.LBB0_205: -+ vmovups -233(%ecx), %ymm0 -+ vmovups %ymm0, -233(%edx) -+ vmovups -201(%ecx), %ymm0 -+ vmovups %ymm0, -201(%edx) -+ vmovups -169(%ecx), %ymm0 -+ vmovups %ymm0, -169(%edx) -+ vmovups -137(%ecx), %ymm0 -+ vmovups %ymm0, -137(%edx) -+.LBB0_206: -+ vmovups -105(%ecx), %ymm0 -+ vmovups %ymm0, -105(%edx) -+ vmovups -73(%ecx), %ymm0 -+ vmovups %ymm0, -73(%edx) -+ jmp .LBB0_268 -+.LBB0_207: -+ vmovups -234(%ecx), %ymm0 -+ vmovups %ymm0, -234(%edx) -+ vmovups -202(%ecx), %ymm0 -+ vmovups %ymm0, -202(%edx) -+ vmovups -170(%ecx), %ymm0 -+ vmovups %ymm0, -170(%edx) -+ vmovups -138(%ecx), %ymm0 -+ vmovups %ymm0, -138(%edx) -+.LBB0_208: -+ vmovups -106(%ecx), %ymm0 -+ vmovups %ymm0, -106(%edx) -+ vmovups -74(%ecx), %ymm0 -+ vmovups %ymm0, -74(%edx) -+ jmp .LBB0_268 -+.LBB0_209: -+ vmovups -235(%ecx), %ymm0 -+ vmovups %ymm0, -235(%edx) -+ vmovups -203(%ecx), %ymm0 -+ vmovups %ymm0, -203(%edx) -+ vmovups -171(%ecx), %ymm0 -+ vmovups %ymm0, -171(%edx) -+ vmovups -139(%ecx), %ymm0 -+ vmovups %ymm0, -139(%edx) -+.LBB0_210: -+ vmovups -107(%ecx), %ymm0 -+ vmovups %ymm0, -107(%edx) -+ vmovups -75(%ecx), %ymm0 -+ vmovups %ymm0, -75(%edx) -+ jmp .LBB0_268 -+.LBB0_211: -+ vmovups -236(%ecx), %ymm0 -+ vmovups %ymm0, -236(%edx) -+ vmovups -204(%ecx), %ymm0 -+ vmovups %ymm0, -204(%edx) -+ vmovups -172(%ecx), %ymm0 -+ vmovups %ymm0, -172(%edx) -+ vmovups -140(%ecx), %ymm0 -+ vmovups %ymm0, -140(%edx) -+.LBB0_212: -+ vmovups -108(%ecx), %ymm0 -+ vmovups %ymm0, -108(%edx) -+ vmovups -76(%ecx), %ymm0 
-+ vmovups %ymm0, -76(%edx) -+ jmp .LBB0_268 -+.LBB0_213: -+ vmovups -237(%ecx), %ymm0 -+ vmovups %ymm0, -237(%edx) -+ vmovups -205(%ecx), %ymm0 -+ vmovups %ymm0, -205(%edx) -+ vmovups -173(%ecx), %ymm0 -+ vmovups %ymm0, -173(%edx) -+ vmovups -141(%ecx), %ymm0 -+ vmovups %ymm0, -141(%edx) -+.LBB0_214: -+ vmovups -109(%ecx), %ymm0 -+ vmovups %ymm0, -109(%edx) -+ vmovups -77(%ecx), %ymm0 -+ vmovups %ymm0, -77(%edx) -+ jmp .LBB0_268 -+.LBB0_215: -+ vmovups -238(%ecx), %ymm0 -+ vmovups %ymm0, -238(%edx) -+ vmovups -206(%ecx), %ymm0 -+ vmovups %ymm0, -206(%edx) -+ vmovups -174(%ecx), %ymm0 -+ vmovups %ymm0, -174(%edx) -+ vmovups -142(%ecx), %ymm0 -+ vmovups %ymm0, -142(%edx) -+.LBB0_216: -+ vmovups -110(%ecx), %ymm0 -+ vmovups %ymm0, -110(%edx) -+ vmovups -78(%ecx), %ymm0 -+ vmovups %ymm0, -78(%edx) -+ jmp .LBB0_268 -+.LBB0_217: -+ vmovups -239(%ecx), %ymm0 -+ vmovups %ymm0, -239(%edx) -+ vmovups -207(%ecx), %ymm0 -+ vmovups %ymm0, -207(%edx) -+ vmovups -175(%ecx), %ymm0 -+ vmovups %ymm0, -175(%edx) -+ vmovups -143(%ecx), %ymm0 -+ vmovups %ymm0, -143(%edx) -+.LBB0_218: -+ vmovups -111(%ecx), %ymm0 -+ vmovups %ymm0, -111(%edx) -+ vmovups -79(%ecx), %ymm0 -+ vmovups %ymm0, -79(%edx) -+ jmp .LBB0_268 -+.LBB0_219: -+ vmovups -240(%ecx), %ymm0 -+ vmovups %ymm0, -240(%edx) -+ vmovups -208(%ecx), %ymm0 -+ vmovups %ymm0, -208(%edx) -+ vmovups -176(%ecx), %ymm0 -+ vmovups %ymm0, -176(%edx) -+ vmovups -144(%ecx), %ymm0 -+ vmovups %ymm0, -144(%edx) -+.LBB0_220: -+ vmovups -112(%ecx), %ymm0 -+ vmovups %ymm0, -112(%edx) -+ vmovups -80(%ecx), %ymm0 -+ vmovups %ymm0, -80(%edx) -+ jmp .LBB0_268 -+.LBB0_221: -+ vmovups -241(%ecx), %ymm0 -+ vmovups %ymm0, -241(%edx) -+ vmovups -209(%ecx), %ymm0 -+ vmovups %ymm0, -209(%edx) -+ vmovups -177(%ecx), %ymm0 -+ vmovups %ymm0, -177(%edx) -+ vmovups -145(%ecx), %ymm0 -+ vmovups %ymm0, -145(%edx) -+.LBB0_222: -+ vmovups -113(%ecx), %ymm0 -+ vmovups %ymm0, -113(%edx) -+ vmovups -81(%ecx), %ymm0 -+ vmovups %ymm0, -81(%edx) -+ jmp .LBB0_268 
-+.LBB0_223: -+ vmovups -242(%ecx), %ymm0 -+ vmovups %ymm0, -242(%edx) -+ vmovups -210(%ecx), %ymm0 -+ vmovups %ymm0, -210(%edx) -+ vmovups -178(%ecx), %ymm0 -+ vmovups %ymm0, -178(%edx) -+ vmovups -146(%ecx), %ymm0 -+ vmovups %ymm0, -146(%edx) -+.LBB0_224: -+ vmovups -114(%ecx), %ymm0 -+ vmovups %ymm0, -114(%edx) -+ vmovups -82(%ecx), %ymm0 -+ vmovups %ymm0, -82(%edx) -+ jmp .LBB0_268 -+.LBB0_225: -+ vmovups -243(%ecx), %ymm0 -+ vmovups %ymm0, -243(%edx) -+ vmovups -211(%ecx), %ymm0 -+ vmovups %ymm0, -211(%edx) -+ vmovups -179(%ecx), %ymm0 -+ vmovups %ymm0, -179(%edx) -+ vmovups -147(%ecx), %ymm0 -+ vmovups %ymm0, -147(%edx) -+.LBB0_226: -+ vmovups -115(%ecx), %ymm0 -+ vmovups %ymm0, -115(%edx) -+ vmovups -83(%ecx), %ymm0 -+ vmovups %ymm0, -83(%edx) -+ jmp .LBB0_268 -+.LBB0_227: -+ vmovups -244(%ecx), %ymm0 -+ vmovups %ymm0, -244(%edx) -+ vmovups -212(%ecx), %ymm0 -+ vmovups %ymm0, -212(%edx) -+ vmovups -180(%ecx), %ymm0 -+ vmovups %ymm0, -180(%edx) -+ vmovups -148(%ecx), %ymm0 -+ vmovups %ymm0, -148(%edx) -+.LBB0_228: -+ vmovups -116(%ecx), %ymm0 -+ vmovups %ymm0, -116(%edx) -+ vmovups -84(%ecx), %ymm0 -+ vmovups %ymm0, -84(%edx) -+ jmp .LBB0_268 -+.LBB0_229: -+ vmovups -245(%ecx), %ymm0 -+ vmovups %ymm0, -245(%edx) -+ vmovups -213(%ecx), %ymm0 -+ vmovups %ymm0, -213(%edx) -+ vmovups -181(%ecx), %ymm0 -+ vmovups %ymm0, -181(%edx) -+ vmovups -149(%ecx), %ymm0 -+ vmovups %ymm0, -149(%edx) -+.LBB0_230: -+ vmovups -117(%ecx), %ymm0 -+ vmovups %ymm0, -117(%edx) -+ vmovups -85(%ecx), %ymm0 -+ vmovups %ymm0, -85(%edx) -+ jmp .LBB0_268 -+.LBB0_231: -+ vmovups -246(%ecx), %ymm0 -+ vmovups %ymm0, -246(%edx) -+ vmovups -214(%ecx), %ymm0 -+ vmovups %ymm0, -214(%edx) -+ vmovups -182(%ecx), %ymm0 -+ vmovups %ymm0, -182(%edx) -+ vmovups -150(%ecx), %ymm0 -+ vmovups %ymm0, -150(%edx) -+.LBB0_232: -+ vmovups -118(%ecx), %ymm0 -+ vmovups %ymm0, -118(%edx) -+ vmovups -86(%ecx), %ymm0 -+ vmovups %ymm0, -86(%edx) -+ jmp .LBB0_268 -+.LBB0_233: -+ vmovups -247(%ecx), %ymm0 -+ vmovups 
%ymm0, -247(%edx) -+ vmovups -215(%ecx), %ymm0 -+ vmovups %ymm0, -215(%edx) -+ vmovups -183(%ecx), %ymm0 -+ vmovups %ymm0, -183(%edx) -+ vmovups -151(%ecx), %ymm0 -+ vmovups %ymm0, -151(%edx) -+.LBB0_234: -+ vmovups -119(%ecx), %ymm0 -+ vmovups %ymm0, -119(%edx) -+ vmovups -87(%ecx), %ymm0 -+ vmovups %ymm0, -87(%edx) -+ jmp .LBB0_268 -+.LBB0_235: -+ vmovups -248(%ecx), %ymm0 -+ vmovups %ymm0, -248(%edx) -+ vmovups -216(%ecx), %ymm0 -+ vmovups %ymm0, -216(%edx) -+ vmovups -184(%ecx), %ymm0 -+ vmovups %ymm0, -184(%edx) -+ vmovups -152(%ecx), %ymm0 -+ vmovups %ymm0, -152(%edx) -+.LBB0_236: -+ vmovups -120(%ecx), %ymm0 -+ vmovups %ymm0, -120(%edx) -+ vmovups -88(%ecx), %ymm0 -+ vmovups %ymm0, -88(%edx) -+ jmp .LBB0_268 -+.LBB0_237: -+ vmovups -249(%ecx), %ymm0 -+ vmovups %ymm0, -249(%edx) -+ vmovups -217(%ecx), %ymm0 -+ vmovups %ymm0, -217(%edx) -+ vmovups -185(%ecx), %ymm0 -+ vmovups %ymm0, -185(%edx) -+ vmovups -153(%ecx), %ymm0 -+ vmovups %ymm0, -153(%edx) -+.LBB0_238: -+ vmovups -121(%ecx), %ymm0 -+ vmovups %ymm0, -121(%edx) -+ vmovups -89(%ecx), %ymm0 -+ vmovups %ymm0, -89(%edx) -+ jmp .LBB0_268 -+.LBB0_239: -+ vmovups -250(%ecx), %ymm0 -+ vmovups %ymm0, -250(%edx) -+ vmovups -218(%ecx), %ymm0 -+ vmovups %ymm0, -218(%edx) -+ vmovups -186(%ecx), %ymm0 -+ vmovups %ymm0, -186(%edx) -+ vmovups -154(%ecx), %ymm0 -+ vmovups %ymm0, -154(%edx) -+.LBB0_240: -+ vmovups -122(%ecx), %ymm0 -+ vmovups %ymm0, -122(%edx) -+ vmovups -90(%ecx), %ymm0 -+ vmovups %ymm0, -90(%edx) -+ jmp .LBB0_268 -+.LBB0_241: -+ vmovups -251(%ecx), %ymm0 -+ vmovups %ymm0, -251(%edx) -+ vmovups -219(%ecx), %ymm0 -+ vmovups %ymm0, -219(%edx) -+ vmovups -187(%ecx), %ymm0 -+ vmovups %ymm0, -187(%edx) -+ vmovups -155(%ecx), %ymm0 -+ vmovups %ymm0, -155(%edx) -+.LBB0_242: -+ vmovups -123(%ecx), %ymm0 -+ vmovups %ymm0, -123(%edx) -+ vmovups -91(%ecx), %ymm0 -+ vmovups %ymm0, -91(%edx) -+ jmp .LBB0_268 -+.LBB0_243: -+ vmovups -252(%ecx), %ymm0 -+ vmovups %ymm0, -252(%edx) -+ vmovups -220(%ecx), %ymm0 -+ 
vmovups %ymm0, -220(%edx) -+ vmovups -188(%ecx), %ymm0 -+ vmovups %ymm0, -188(%edx) -+ vmovups -156(%ecx), %ymm0 -+ vmovups %ymm0, -156(%edx) -+.LBB0_244: -+ vmovups -124(%ecx), %ymm0 -+ vmovups %ymm0, -124(%edx) -+ vmovups -92(%ecx), %ymm0 -+ vmovups %ymm0, -92(%edx) -+ jmp .LBB0_268 -+.LBB0_245: -+ vmovups -253(%ecx), %ymm0 -+ vmovups %ymm0, -253(%edx) -+ vmovups -221(%ecx), %ymm0 -+ vmovups %ymm0, -221(%edx) -+ vmovups -189(%ecx), %ymm0 -+ vmovups %ymm0, -189(%edx) -+ vmovups -157(%ecx), %ymm0 -+ vmovups %ymm0, -157(%edx) -+.LBB0_246: -+ vmovups -125(%ecx), %ymm0 -+ vmovups %ymm0, -125(%edx) -+ vmovups -93(%ecx), %ymm0 -+ vmovups %ymm0, -93(%edx) -+ jmp .LBB0_268 -+.LBB0_247: -+ vmovups -254(%ecx), %ymm0 -+ vmovups %ymm0, -254(%edx) -+ vmovups -222(%ecx), %ymm0 -+ vmovups %ymm0, -222(%edx) -+ vmovups -190(%ecx), %ymm0 -+ vmovups %ymm0, -190(%edx) -+ vmovups -158(%ecx), %ymm0 -+ vmovups %ymm0, -158(%edx) -+.LBB0_248: -+ vmovups -126(%ecx), %ymm0 -+ vmovups %ymm0, -126(%edx) -+ vmovups -94(%ecx), %ymm0 -+ vmovups %ymm0, -94(%edx) -+ jmp .LBB0_268 -+.LBB0_249: -+ vmovups -255(%ecx), %ymm0 -+ vmovups %ymm0, -255(%edx) -+ vmovups -223(%ecx), %ymm0 -+ vmovups %ymm0, -223(%edx) -+ vmovups -191(%ecx), %ymm0 -+ vmovups %ymm0, -191(%edx) -+ vmovups -159(%ecx), %ymm0 -+ vmovups %ymm0, -159(%edx) -+.LBB0_250: -+ vmovups -127(%ecx), %ymm0 -+ vmovups %ymm0, -127(%edx) -+ vmovups -95(%ecx), %ymm0 -+ vmovups %ymm0, -95(%edx) -+ jmp .LBB0_268 -+.LBB0_262: -+ vmovups -256(%ecx), %ymm0 -+ vmovups %ymm0, -256(%edx) -+.LBB0_263: -+ vmovups -224(%ecx), %ymm0 -+ vmovups %ymm0, -224(%edx) -+.LBB0_264: -+ vmovups -192(%ecx), %ymm0 -+ vmovups %ymm0, -192(%edx) -+.LBB0_265: -+ vmovups -160(%ecx), %ymm0 -+ vmovups %ymm0, -160(%edx) -+.LBB0_266: -+ vmovups -128(%ecx), %ymm0 -+ vmovups %ymm0, -128(%edx) -+.LBB0_267: -+ vmovups -96(%ecx), %ymm0 -+ vmovups %ymm0, -96(%edx) -+.LBB0_268: -+ vmovups -64(%ecx), %ymm0 -+ vmovups %ymm0, -64(%edx) -+.LBB0_269: -+ vmovups -32(%ecx), %ymm0 -+ vmovups 
%ymm0, -32(%edx) -+.LBB0_270: -+ vzeroupper -+ popl %esi -+ popl %edi -+ popl %ebx -+ popl %ebp -+ retl -+END(memcpy_avx2) -+ -+/*.Lfunc_end0: -+ .size memcpy_avx2, .Lfunc_end0-memcpy_avx2 -+ .section .rodata,"a",@progbits -+ .p2align 2*/ -+.LJTI0_0: -+ .long .LBB0_6@GOTOFF -+ .long .LBB0_10@GOTOFF -+ .long .LBB0_12@GOTOFF -+ .long .LBB0_16@GOTOFF -+ .long .LBB0_18@GOTOFF -+ .long .LBB0_20@GOTOFF -+ .long .LBB0_22@GOTOFF -+ .long .LBB0_26@GOTOFF -+ .long .LBB0_28@GOTOFF -+ .long .LBB0_30@GOTOFF -+ .long .LBB0_32@GOTOFF -+ .long .LBB0_34@GOTOFF -+ .long .LBB0_36@GOTOFF -+ .long .LBB0_38@GOTOFF -+ .long .LBB0_40@GOTOFF -+ .long .LBB0_44@GOTOFF -+ .long .LBB0_46@GOTOFF -+ .long .LBB0_48@GOTOFF -+ .long .LBB0_50@GOTOFF -+ .long .LBB0_52@GOTOFF -+ .long .LBB0_54@GOTOFF -+ .long .LBB0_56@GOTOFF -+ .long .LBB0_58@GOTOFF -+ .long .LBB0_60@GOTOFF -+ .long .LBB0_62@GOTOFF -+ .long .LBB0_64@GOTOFF -+ .long .LBB0_66@GOTOFF -+ .long .LBB0_68@GOTOFF -+ .long .LBB0_70@GOTOFF -+ .long .LBB0_72@GOTOFF -+ .long .LBB0_74@GOTOFF -+ .long .LBB0_269@GOTOFF -+ .long .LBB0_5@GOTOFF -+ .long .LBB0_9@GOTOFF -+ .long .LBB0_82@GOTOFF -+ .long .LBB0_15@GOTOFF -+ .long .LBB0_88@GOTOFF -+ .long .LBB0_92@GOTOFF -+ .long .LBB0_96@GOTOFF -+ .long .LBB0_25@GOTOFF -+ .long .LBB0_102@GOTOFF -+ .long .LBB0_106@GOTOFF -+ .long .LBB0_110@GOTOFF -+ .long .LBB0_114@GOTOFF -+ .long .LBB0_118@GOTOFF -+ .long .LBB0_122@GOTOFF -+ .long .LBB0_126@GOTOFF -+ .long .LBB0_43@GOTOFF -+ .long .LBB0_132@GOTOFF -+ .long .LBB0_136@GOTOFF -+ .long .LBB0_140@GOTOFF -+ .long .LBB0_144@GOTOFF -+ .long .LBB0_148@GOTOFF -+ .long .LBB0_152@GOTOFF -+ .long .LBB0_156@GOTOFF -+ .long .LBB0_160@GOTOFF -+ .long .LBB0_164@GOTOFF -+ .long .LBB0_168@GOTOFF -+ .long .LBB0_172@GOTOFF -+ .long .LBB0_176@GOTOFF -+ .long .LBB0_180@GOTOFF -+ .long .LBB0_184@GOTOFF -+ .long .LBB0_188@GOTOFF -+ .long .LBB0_268@GOTOFF -+ .long .LBB0_4@GOTOFF -+ .long .LBB0_8@GOTOFF -+ .long .LBB0_81@GOTOFF -+ .long .LBB0_14@GOTOFF -+ .long .LBB0_87@GOTOFF -+ 
.long .LBB0_91@GOTOFF -+ .long .LBB0_95@GOTOFF -+ .long .LBB0_24@GOTOFF -+ .long .LBB0_101@GOTOFF -+ .long .LBB0_105@GOTOFF -+ .long .LBB0_109@GOTOFF -+ .long .LBB0_113@GOTOFF -+ .long .LBB0_117@GOTOFF -+ .long .LBB0_121@GOTOFF -+ .long .LBB0_125@GOTOFF -+ .long .LBB0_42@GOTOFF -+ .long .LBB0_131@GOTOFF -+ .long .LBB0_135@GOTOFF -+ .long .LBB0_139@GOTOFF -+ .long .LBB0_143@GOTOFF -+ .long .LBB0_147@GOTOFF -+ .long .LBB0_151@GOTOFF -+ .long .LBB0_155@GOTOFF -+ .long .LBB0_159@GOTOFF -+ .long .LBB0_163@GOTOFF -+ .long .LBB0_167@GOTOFF -+ .long .LBB0_171@GOTOFF -+ .long .LBB0_175@GOTOFF -+ .long .LBB0_179@GOTOFF -+ .long .LBB0_183@GOTOFF -+ .long .LBB0_187@GOTOFF -+ .long .LBB0_267@GOTOFF -+ .long .LBB0_190@GOTOFF -+ .long .LBB0_192@GOTOFF -+ .long .LBB0_194@GOTOFF -+ .long .LBB0_196@GOTOFF -+ .long .LBB0_198@GOTOFF -+ .long .LBB0_200@GOTOFF -+ .long .LBB0_202@GOTOFF -+ .long .LBB0_204@GOTOFF -+ .long .LBB0_206@GOTOFF -+ .long .LBB0_208@GOTOFF -+ .long .LBB0_210@GOTOFF -+ .long .LBB0_212@GOTOFF -+ .long .LBB0_214@GOTOFF -+ .long .LBB0_216@GOTOFF -+ .long .LBB0_218@GOTOFF -+ .long .LBB0_220@GOTOFF -+ .long .LBB0_222@GOTOFF -+ .long .LBB0_224@GOTOFF -+ .long .LBB0_226@GOTOFF -+ .long .LBB0_228@GOTOFF -+ .long .LBB0_230@GOTOFF -+ .long .LBB0_232@GOTOFF -+ .long .LBB0_234@GOTOFF -+ .long .LBB0_236@GOTOFF -+ .long .LBB0_238@GOTOFF -+ .long .LBB0_240@GOTOFF -+ .long .LBB0_242@GOTOFF -+ .long .LBB0_244@GOTOFF -+ .long .LBB0_246@GOTOFF -+ .long .LBB0_248@GOTOFF -+ .long .LBB0_250@GOTOFF -+ .long .LBB0_266@GOTOFF -+ .long .LBB0_3@GOTOFF -+ .long .LBB0_7@GOTOFF -+ .long .LBB0_11@GOTOFF -+ .long .LBB0_13@GOTOFF -+ .long .LBB0_17@GOTOFF -+ .long .LBB0_19@GOTOFF -+ .long .LBB0_21@GOTOFF -+ .long .LBB0_23@GOTOFF -+ .long .LBB0_27@GOTOFF -+ .long .LBB0_29@GOTOFF -+ .long .LBB0_31@GOTOFF -+ .long .LBB0_33@GOTOFF -+ .long .LBB0_35@GOTOFF -+ .long .LBB0_37@GOTOFF -+ .long .LBB0_39@GOTOFF -+ .long .LBB0_41@GOTOFF -+ .long .LBB0_45@GOTOFF -+ .long .LBB0_47@GOTOFF -+ .long .LBB0_49@GOTOFF 
-+ .long .LBB0_51@GOTOFF -+ .long .LBB0_53@GOTOFF -+ .long .LBB0_55@GOTOFF -+ .long .LBB0_57@GOTOFF -+ .long .LBB0_59@GOTOFF -+ .long .LBB0_61@GOTOFF -+ .long .LBB0_63@GOTOFF -+ .long .LBB0_65@GOTOFF -+ .long .LBB0_67@GOTOFF -+ .long .LBB0_69@GOTOFF -+ .long .LBB0_71@GOTOFF -+ .long .LBB0_73@GOTOFF -+ .long .LBB0_265@GOTOFF -+ .long .LBB0_76@GOTOFF -+ .long .LBB0_78@GOTOFF -+ .long .LBB0_80@GOTOFF -+ .long .LBB0_84@GOTOFF -+ .long .LBB0_86@GOTOFF -+ .long .LBB0_90@GOTOFF -+ .long .LBB0_94@GOTOFF -+ .long .LBB0_98@GOTOFF -+ .long .LBB0_100@GOTOFF -+ .long .LBB0_104@GOTOFF -+ .long .LBB0_108@GOTOFF -+ .long .LBB0_112@GOTOFF -+ .long .LBB0_116@GOTOFF -+ .long .LBB0_120@GOTOFF -+ .long .LBB0_124@GOTOFF -+ .long .LBB0_128@GOTOFF -+ .long .LBB0_130@GOTOFF -+ .long .LBB0_134@GOTOFF -+ .long .LBB0_138@GOTOFF -+ .long .LBB0_142@GOTOFF -+ .long .LBB0_146@GOTOFF -+ .long .LBB0_150@GOTOFF -+ .long .LBB0_154@GOTOFF -+ .long .LBB0_158@GOTOFF -+ .long .LBB0_162@GOTOFF -+ .long .LBB0_166@GOTOFF -+ .long .LBB0_170@GOTOFF -+ .long .LBB0_174@GOTOFF -+ .long .LBB0_178@GOTOFF -+ .long .LBB0_182@GOTOFF -+ .long .LBB0_186@GOTOFF -+ .long .LBB0_264@GOTOFF -+ .long .LBB0_75@GOTOFF -+ .long .LBB0_77@GOTOFF -+ .long .LBB0_79@GOTOFF -+ .long .LBB0_83@GOTOFF -+ .long .LBB0_85@GOTOFF -+ .long .LBB0_89@GOTOFF -+ .long .LBB0_93@GOTOFF -+ .long .LBB0_97@GOTOFF -+ .long .LBB0_99@GOTOFF -+ .long .LBB0_103@GOTOFF -+ .long .LBB0_107@GOTOFF -+ .long .LBB0_111@GOTOFF -+ .long .LBB0_115@GOTOFF -+ .long .LBB0_119@GOTOFF -+ .long .LBB0_123@GOTOFF -+ .long .LBB0_127@GOTOFF -+ .long .LBB0_129@GOTOFF -+ .long .LBB0_133@GOTOFF -+ .long .LBB0_137@GOTOFF -+ .long .LBB0_141@GOTOFF -+ .long .LBB0_145@GOTOFF -+ .long .LBB0_149@GOTOFF -+ .long .LBB0_153@GOTOFF -+ .long .LBB0_157@GOTOFF -+ .long .LBB0_161@GOTOFF -+ .long .LBB0_165@GOTOFF -+ .long .LBB0_169@GOTOFF -+ .long .LBB0_173@GOTOFF -+ .long .LBB0_177@GOTOFF -+ .long .LBB0_181@GOTOFF -+ .long .LBB0_185@GOTOFF -+ .long .LBB0_263@GOTOFF -+ .long .LBB0_189@GOTOFF 
-+ .long .LBB0_191@GOTOFF -+ .long .LBB0_193@GOTOFF -+ .long .LBB0_195@GOTOFF -+ .long .LBB0_197@GOTOFF -+ .long .LBB0_199@GOTOFF -+ .long .LBB0_201@GOTOFF -+ .long .LBB0_203@GOTOFF -+ .long .LBB0_205@GOTOFF -+ .long .LBB0_207@GOTOFF -+ .long .LBB0_209@GOTOFF -+ .long .LBB0_211@GOTOFF -+ .long .LBB0_213@GOTOFF -+ .long .LBB0_215@GOTOFF -+ .long .LBB0_217@GOTOFF -+ .long .LBB0_219@GOTOFF -+ .long .LBB0_221@GOTOFF -+ .long .LBB0_223@GOTOFF -+ .long .LBB0_225@GOTOFF -+ .long .LBB0_227@GOTOFF -+ .long .LBB0_229@GOTOFF -+ .long .LBB0_231@GOTOFF -+ .long .LBB0_233@GOTOFF -+ .long .LBB0_235@GOTOFF -+ .long .LBB0_237@GOTOFF -+ .long .LBB0_239@GOTOFF -+ .long .LBB0_241@GOTOFF -+ .long .LBB0_243@GOTOFF -+ .long .LBB0_245@GOTOFF -+ .long .LBB0_247@GOTOFF -+ .long .LBB0_249@GOTOFF -+ .long .LBB0_262@GOTOFF -+.LJTI0_1: -+ .long .LBB0_6@GOTOFF -+ .long .LBB0_10@GOTOFF -+ .long .LBB0_12@GOTOFF -+ .long .LBB0_16@GOTOFF -+ .long .LBB0_18@GOTOFF -+ .long .LBB0_20@GOTOFF -+ .long .LBB0_22@GOTOFF -+ .long .LBB0_26@GOTOFF -+ .long .LBB0_28@GOTOFF -+ .long .LBB0_30@GOTOFF -+ .long .LBB0_32@GOTOFF -+ .long .LBB0_34@GOTOFF -+ .long .LBB0_36@GOTOFF -+ .long .LBB0_38@GOTOFF -+ .long .LBB0_40@GOTOFF -+ .long .LBB0_44@GOTOFF -+ .long .LBB0_46@GOTOFF -+ .long .LBB0_48@GOTOFF -+ .long .LBB0_50@GOTOFF -+ .long .LBB0_52@GOTOFF -+ .long .LBB0_54@GOTOFF -+ .long .LBB0_56@GOTOFF -+ .long .LBB0_58@GOTOFF -+ .long .LBB0_60@GOTOFF -+ .long .LBB0_62@GOTOFF -+ .long .LBB0_64@GOTOFF -+ .long .LBB0_66@GOTOFF -+ .long .LBB0_68@GOTOFF -+ .long .LBB0_70@GOTOFF -+ .long .LBB0_72@GOTOFF -+ .long .LBB0_74@GOTOFF -+ .long .LBB0_269@GOTOFF -+ .long .LBB0_5@GOTOFF -+ .long .LBB0_9@GOTOFF -+ .long .LBB0_82@GOTOFF -+ .long .LBB0_15@GOTOFF -+ .long .LBB0_88@GOTOFF -+ .long .LBB0_92@GOTOFF -+ .long .LBB0_96@GOTOFF -+ .long .LBB0_25@GOTOFF -+ .long .LBB0_102@GOTOFF -+ .long .LBB0_106@GOTOFF -+ .long .LBB0_110@GOTOFF -+ .long .LBB0_114@GOTOFF -+ .long .LBB0_118@GOTOFF -+ .long .LBB0_122@GOTOFF -+ .long .LBB0_126@GOTOFF 
-+ .long .LBB0_43@GOTOFF -+ .long .LBB0_132@GOTOFF -+ .long .LBB0_136@GOTOFF -+ .long .LBB0_140@GOTOFF -+ .long .LBB0_144@GOTOFF -+ .long .LBB0_148@GOTOFF -+ .long .LBB0_152@GOTOFF -+ .long .LBB0_156@GOTOFF -+ .long .LBB0_160@GOTOFF -+ .long .LBB0_164@GOTOFF -+ .long .LBB0_168@GOTOFF -+ .long .LBB0_172@GOTOFF -+ .long .LBB0_176@GOTOFF -+ .long .LBB0_180@GOTOFF -+ .long .LBB0_184@GOTOFF -+ .long .LBB0_188@GOTOFF -+ .long .LBB0_268@GOTOFF -+ .long .LBB0_4@GOTOFF -+ .long .LBB0_8@GOTOFF -+ .long .LBB0_81@GOTOFF -+ .long .LBB0_14@GOTOFF -+ .long .LBB0_87@GOTOFF -+ .long .LBB0_91@GOTOFF -+ .long .LBB0_95@GOTOFF -+ .long .LBB0_24@GOTOFF -+ .long .LBB0_101@GOTOFF -+ .long .LBB0_105@GOTOFF -+ .long .LBB0_109@GOTOFF -+ .long .LBB0_113@GOTOFF -+ .long .LBB0_117@GOTOFF -+ .long .LBB0_121@GOTOFF -+ .long .LBB0_125@GOTOFF -+ .long .LBB0_42@GOTOFF -+ .long .LBB0_131@GOTOFF -+ .long .LBB0_135@GOTOFF -+ .long .LBB0_139@GOTOFF -+ .long .LBB0_143@GOTOFF -+ .long .LBB0_147@GOTOFF -+ .long .LBB0_151@GOTOFF -+ .long .LBB0_155@GOTOFF -+ .long .LBB0_159@GOTOFF -+ .long .LBB0_163@GOTOFF -+ .long .LBB0_167@GOTOFF -+ .long .LBB0_171@GOTOFF -+ .long .LBB0_175@GOTOFF -+ .long .LBB0_179@GOTOFF -+ .long .LBB0_183@GOTOFF -+ .long .LBB0_187@GOTOFF -+ .long .LBB0_267@GOTOFF -+ .long .LBB0_190@GOTOFF -+ .long .LBB0_192@GOTOFF -+ .long .LBB0_194@GOTOFF -+ .long .LBB0_196@GOTOFF -+ .long .LBB0_198@GOTOFF -+ .long .LBB0_200@GOTOFF -+ .long .LBB0_202@GOTOFF -+ .long .LBB0_204@GOTOFF -+ .long .LBB0_206@GOTOFF -+ .long .LBB0_208@GOTOFF -+ .long .LBB0_210@GOTOFF -+ .long .LBB0_212@GOTOFF -+ .long .LBB0_214@GOTOFF -+ .long .LBB0_216@GOTOFF -+ .long .LBB0_218@GOTOFF -+ .long .LBB0_220@GOTOFF -+ .long .LBB0_222@GOTOFF -+ .long .LBB0_224@GOTOFF -+ .long .LBB0_226@GOTOFF -+ .long .LBB0_228@GOTOFF -+ .long .LBB0_230@GOTOFF -+ .long .LBB0_232@GOTOFF -+ .long .LBB0_234@GOTOFF -+ .long .LBB0_236@GOTOFF -+ .long .LBB0_238@GOTOFF -+ .long .LBB0_240@GOTOFF -+ .long .LBB0_242@GOTOFF -+ .long .LBB0_244@GOTOFF -+ .long 
.LBB0_246@GOTOFF -+ .long .LBB0_248@GOTOFF -+ .long .LBB0_250@GOTOFF -+ .long .LBB0_266@GOTOFF -+ .long .LBB0_3@GOTOFF -+ .long .LBB0_7@GOTOFF -+ .long .LBB0_11@GOTOFF -+ .long .LBB0_13@GOTOFF -+ .long .LBB0_17@GOTOFF -+ .long .LBB0_19@GOTOFF -+ .long .LBB0_21@GOTOFF -+ .long .LBB0_23@GOTOFF -+ .long .LBB0_27@GOTOFF -+ .long .LBB0_29@GOTOFF -+ .long .LBB0_31@GOTOFF -+ .long .LBB0_33@GOTOFF -+ .long .LBB0_35@GOTOFF -+ .long .LBB0_37@GOTOFF -+ .long .LBB0_39@GOTOFF -+ .long .LBB0_41@GOTOFF -+ .long .LBB0_45@GOTOFF -+ .long .LBB0_47@GOTOFF -+ .long .LBB0_49@GOTOFF -+ .long .LBB0_51@GOTOFF -+ .long .LBB0_53@GOTOFF -+ .long .LBB0_55@GOTOFF -+ .long .LBB0_57@GOTOFF -+ .long .LBB0_59@GOTOFF -+ .long .LBB0_61@GOTOFF -+ .long .LBB0_63@GOTOFF -+ .long .LBB0_65@GOTOFF -+ .long .LBB0_67@GOTOFF -+ .long .LBB0_69@GOTOFF -+ .long .LBB0_71@GOTOFF -+ .long .LBB0_73@GOTOFF -+ .long .LBB0_265@GOTOFF -+ .long .LBB0_76@GOTOFF -+ .long .LBB0_78@GOTOFF -+ .long .LBB0_80@GOTOFF -+ .long .LBB0_84@GOTOFF -+ .long .LBB0_86@GOTOFF -+ .long .LBB0_90@GOTOFF -+ .long .LBB0_94@GOTOFF -+ .long .LBB0_98@GOTOFF -+ .long .LBB0_100@GOTOFF -+ .long .LBB0_104@GOTOFF -+ .long .LBB0_108@GOTOFF -+ .long .LBB0_112@GOTOFF -+ .long .LBB0_116@GOTOFF -+ .long .LBB0_120@GOTOFF -+ .long .LBB0_124@GOTOFF -+ .long .LBB0_128@GOTOFF -+ .long .LBB0_130@GOTOFF -+ .long .LBB0_134@GOTOFF -+ .long .LBB0_138@GOTOFF -+ .long .LBB0_142@GOTOFF -+ .long .LBB0_146@GOTOFF -+ .long .LBB0_150@GOTOFF -+ .long .LBB0_154@GOTOFF -+ .long .LBB0_158@GOTOFF -+ .long .LBB0_162@GOTOFF -+ .long .LBB0_166@GOTOFF -+ .long .LBB0_170@GOTOFF -+ .long .LBB0_174@GOTOFF -+ .long .LBB0_178@GOTOFF -+ .long .LBB0_182@GOTOFF -+ .long .LBB0_186@GOTOFF -+ .long .LBB0_264@GOTOFF -+ .long .LBB0_75@GOTOFF -+ .long .LBB0_77@GOTOFF -+ .long .LBB0_79@GOTOFF -+ .long .LBB0_83@GOTOFF -+ .long .LBB0_85@GOTOFF -+ .long .LBB0_89@GOTOFF -+ .long .LBB0_93@GOTOFF -+ .long .LBB0_97@GOTOFF -+ .long .LBB0_99@GOTOFF -+ .long .LBB0_103@GOTOFF -+ .long .LBB0_107@GOTOFF -+ 
.long .LBB0_111@GOTOFF -+ .long .LBB0_115@GOTOFF -+ .long .LBB0_119@GOTOFF -+ .long .LBB0_123@GOTOFF -+ .long .LBB0_127@GOTOFF -+ .long .LBB0_129@GOTOFF -+ .long .LBB0_133@GOTOFF -+ .long .LBB0_137@GOTOFF -+ .long .LBB0_141@GOTOFF -+ .long .LBB0_145@GOTOFF -+ .long .LBB0_149@GOTOFF -+ .long .LBB0_153@GOTOFF -+ .long .LBB0_157@GOTOFF -+ .long .LBB0_161@GOTOFF -+ .long .LBB0_165@GOTOFF -+ .long .LBB0_169@GOTOFF -+ .long .LBB0_173@GOTOFF -+ .long .LBB0_177@GOTOFF -+ .long .LBB0_181@GOTOFF -+ .long .LBB0_185@GOTOFF -+ .long .LBB0_263@GOTOFF -+ .long .LBB0_189@GOTOFF -+ .long .LBB0_191@GOTOFF -+ .long .LBB0_193@GOTOFF -+ .long .LBB0_195@GOTOFF -+ .long .LBB0_197@GOTOFF -+ .long .LBB0_199@GOTOFF -+ .long .LBB0_201@GOTOFF -+ .long .LBB0_203@GOTOFF -+ .long .LBB0_205@GOTOFF -+ .long .LBB0_207@GOTOFF -+ .long .LBB0_209@GOTOFF -+ .long .LBB0_211@GOTOFF -+ .long .LBB0_213@GOTOFF -+ .long .LBB0_215@GOTOFF -+ .long .LBB0_217@GOTOFF -+ .long .LBB0_219@GOTOFF -+ .long .LBB0_221@GOTOFF -+ .long .LBB0_223@GOTOFF -+ .long .LBB0_225@GOTOFF -+ .long .LBB0_227@GOTOFF -+ .long .LBB0_229@GOTOFF -+ .long .LBB0_231@GOTOFF -+ .long .LBB0_233@GOTOFF -+ .long .LBB0_235@GOTOFF -+ .long .LBB0_237@GOTOFF -+ .long .LBB0_239@GOTOFF -+ .long .LBB0_241@GOTOFF -+ .long .LBB0_243@GOTOFF -+ .long .LBB0_245@GOTOFF -+ .long .LBB0_247@GOTOFF -+ .long .LBB0_249@GOTOFF -+ .long .LBB0_262@GOTOFF -+ # -- End function -diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp -index c846ded45..43aaebb54 100644 ---- a/libc/arch-x86_64/dynamic_function_dispatch.cpp -+++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp -@@ -46,4 +46,42 @@ DEFINE_IFUNC_FOR(__memset_chk) { - RETURN_FUNC(__memset_chk_func, __memset_chk_generic); - } - -+typedef int memcmp_func(const void* __lhs, const void* __rhs, size_t __n); -+DEFINE_IFUNC_FOR(memcmp) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memcmp_func, memcmp_avx2) -+ RETURN_FUNC(memcmp_func, 
memcmp_generic); -+} -+ -+typedef void* memmove_func(void* __dst, const void* __src, size_t __n); -+DEFINE_IFUNC_FOR(memmove) { -+ RETURN_FUNC(memmove_func, memmove_generic); -+} -+ -+typedef void* memcpy_func(void* __dst, const void* __src, size_t __n); -+DEFINE_IFUNC_FOR(memcpy) { -+ return memmove_resolver(); -+} -+ -+typedef void* memchr_func(const void* __s, int __ch, size_t __n); -+DEFINE_IFUNC_FOR(memchr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memchr_func, memchr_avx2); -+ RETURN_FUNC(memchr_func, memchr_openbsd); -+} -+ -+typedef void* memrchr_func(const void* __s, int __ch, size_t __n); -+DEFINE_IFUNC_FOR(memrchr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memrchr_func, memrchr_avx2); -+ RETURN_FUNC(memrchr_func, memrchr_openbsd); -+} -+ -+// typedef wchar_t* wmemset_func(wchar_t* __dst, wchar_t __ch, size_t __n); -+// DEFINE_IFUNC_FOR(wmemset) { -+// __builtin_cpu_init(); -+// if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wmemset_func, wmemset_avx2); -+// RETURN_FUNC(wmemset_func, wmemset_freebsd); -+// } -+ -+} // extern "C" -diff --git a/libc/arch-x86_64/generic/string/memchr.c b/libc/arch-x86_64/generic/string/memchr.c -new file mode 100644 -index 000000000..86ee02e0b ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/memchr.c -@@ -0,0 +1,20 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#include -+#define memchr memchr_openbsd -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/memrchr.c b/libc/arch-x86_64/generic/string/memrchr.c -new file mode 100644 -index 000000000..c803009f5 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/memrchr.c -@@ -0,0 +1,20 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#include -+#define memrchr memrchr_openbsd -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wmemset.c b/libc/arch-x86_64/generic/string/wmemset.c -new file mode 100644 -index 000000000..ac6bd7ec4 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wmemset.c -@@ -0,0 +1,20 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. 
-+*/ -+ -+#include -+#define wmemset wmemset_freebsd -+ -+#include -diff --git a/libc/arch-x86_64/string/cache.h b/libc/arch-x86_64/include/cache.h -similarity index 100% -rename from libc/arch-x86_64/string/cache.h -rename to libc/arch-x86_64/include/cache.h -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S -new file mode 100644 -index 000000000..da667c9b3 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-memchr-kbl.S -@@ -0,0 +1,371 @@ -+#ifndef L -+# define L(label) .L##label -+#endif -+ -+#ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+#endif -+ -+#ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+#endif -+ -+#ifndef cfi_rel_offset -+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -+#endif -+ -+#ifndef cfi_restore -+# define cfi_restore(reg) .cfi_restore reg -+#endif -+ -+#ifndef cfi_adjust_cfa_offset -+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -+#endif -+ -+#ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+#endif -+ -+#ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+#endif -+ -+#define CFI_PUSH(REG) \ -+ cfi_adjust_cfa_offset (4); \ -+ cfi_rel_offset (REG, 0) -+ -+#define CFI_POP(REG) \ -+ cfi_adjust_cfa_offset (-4); \ -+ cfi_restore (REG) -+ -+#define PUSH(REG) push REG; -+#define POP(REG) pop REG; -+ -+#define ENTRANCE PUSH (%rbx); -+#define RETURN_END POP (%rbx); ret -+#define RETURN RETURN_END; -+ -+# ifndef MEMCHR -+# define MEMCHR memchr_avx2 -+# endif -+ -+# ifdef USE_AS_WMEMCHR -+# define VPCMPEQ vpcmpeqd -+# else -+# define VPCMPEQ vpcmpeqb -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (MEMCHR) -+# ifndef USE_AS_RAWMEMCHR -+ /* Check for zero length. 
*/ -+ testq %rdx, %rdx -+ jz L(null) -+# endif -+ movl %edi, %ecx -+ /* Broadcast CHAR to YMM0. */ -+ vmovd %esi, %xmm0 -+# ifdef USE_AS_WMEMCHR -+ shl $2, %rdx -+ vpbroadcastd %xmm0, %ymm0 -+# else -+ vpbroadcastb %xmm0, %ymm0 -+# endif -+ /* Check if we may cross page boundary with one vector load: the slow path is taken whenever the pointer's offset within a 2 * VEC_SIZE block exceeds VEC_SIZE. */ -+ andl $(2 * VEC_SIZE - 1), %ecx -+ cmpl $VEC_SIZE, %ecx -+ ja L(cros_page_boundary) -+ -+ /* Check the first VEC_SIZE bytes. */ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+# ifndef USE_AS_RAWMEMCHR -+ jnz L(first_vec_x0_check) -+ /* Adjust length and check the end of data. */ -+ subq $VEC_SIZE, %rdx -+ jbe L(zero) -+# else -+ jnz L(first_vec_x0) -+# endif -+ -+ /* Align data for aligned loads in the loop. */ -+ addq $VEC_SIZE, %rdi -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ -+# ifndef USE_AS_RAWMEMCHR -+ /* Adjust length. */ -+ addq %rcx, %rdx -+ -+ subq $(VEC_SIZE * 4), %rdx -+ jbe L(last_4x_vec_or_less) -+# endif -+ jmp L(more_4x_vec) -+ -+ .p2align 4 -+L(cros_page_boundary): -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ /* Remove the leading bytes: shift out the mask bits for bytes before the original pointer. */ -+ sarl %cl, %eax -+ testl %eax, %eax -+ jz L(aligned_more) -+ tzcntl %eax, %eax -+# ifndef USE_AS_RAWMEMCHR -+ /* Check the end of data. */ -+ cmpq %rax, %rdx -+ jbe L(zero) -+# endif -+ addq %rdi, %rax -+ addq %rcx, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(aligned_more): -+# ifndef USE_AS_RAWMEMCHR -+ /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" -+ instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition -+ overflow. */ -+ negq %rcx -+ addq $VEC_SIZE, %rcx -+ -+ /* Check the end of data. */ -+ subq %rcx, %rdx -+ jbe L(zero) -+# endif -+ -+ addq $VEC_SIZE, %rdi -+ -+# ifndef USE_AS_RAWMEMCHR -+ subq $(VEC_SIZE * 4), %rdx -+ jbe L(last_4x_vec_or_less) -+# endif -+ -+L(more_4x_vec): -+ /* Check the first 4 * VEC_SIZE.
Only one VEC_SIZE at a time -+ since data is only aligned to VEC_SIZE. */ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x3) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+# ifndef USE_AS_RAWMEMCHR -+ subq $(VEC_SIZE * 4), %rdx -+ jbe L(last_4x_vec_or_less) -+# endif -+ -+ /* Align data to 4 * VEC_SIZE. */ -+ movq %rdi, %rcx -+ andl $(4 * VEC_SIZE - 1), %ecx -+ andq $-(4 * VEC_SIZE), %rdi -+ -+# ifndef USE_AS_RAWMEMCHR -+ /* Adjust length. */ -+ addq %rcx, %rdx -+# endif -+ -+ .p2align 4 -+L(loop_4x_vec): -+ /* Compare 4 * VEC at a time forward. */ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 -+ -+ vpor %ymm1, %ymm2, %ymm5 -+ vpor %ymm3, %ymm4, %ymm6 -+ vpor %ymm5, %ymm6, %ymm5 -+ -+ vpmovmskb %ymm5, %eax -+ testl %eax, %eax -+ jnz L(4x_vec_end) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+# ifdef USE_AS_RAWMEMCHR -+ jmp L(loop_4x_vec) -+# else -+ subq $(VEC_SIZE * 4), %rdx -+ ja L(loop_4x_vec) -+ -+L(last_4x_vec_or_less): -+ /* Less than 4 * VEC and aligned to VEC_SIZE. 
*/ -+ addl $(VEC_SIZE * 2), %edx -+ jle L(last_2x_vec) -+ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x2_check) -+ subl $VEC_SIZE, %edx -+ jle L(zero) -+ -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x3_check) -+ xorl %eax, %eax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_2x_vec): -+ addl $(VEC_SIZE * 2), %edx -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x0_check) -+ subl $VEC_SIZE, %edx -+ jle L(zero) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1_check) -+ xorl %eax, %eax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x0_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rdx -+ jbe L(zero) -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x1_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rdx -+ jbe L(zero) -+ addq $VEC_SIZE, %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x2_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rdx -+ jbe L(zero) -+ addq $(VEC_SIZE * 2), %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x3_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. 
*/ -+ cmpq %rax, %rdx -+ jbe L(zero) -+ addq $(VEC_SIZE * 3), %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(zero): -+ VZEROUPPER -+L(null): -+ xorl %eax, %eax -+ ret -+# endif -+ -+ .p2align 4 -+L(first_vec_x0): -+ tzcntl %eax, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x1): -+ tzcntl %eax, %eax -+ addq $VEC_SIZE, %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x2): -+ tzcntl %eax, %eax -+ addq $(VEC_SIZE * 2), %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(4x_vec_end): -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ vpmovmskb %ymm4, %eax -+ testl %eax, %eax -+L(first_vec_x3): -+ tzcntl %eax, %eax -+ addq $(VEC_SIZE * 3), %rax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+END (MEMCHR) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S -new file mode 100644 -index 000000000..e9778ca5a ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-memcmp-kbl.S -@@ -0,0 +1,428 @@ -+/* Copyright (C) 2017-2019 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+/* memcmp/wmemcmp is implemented as: -+ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap -+ to avoid branches. -+ 2. Use overlapping compare to avoid branch. -+ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 -+ bytes for wmemcmp. -+ 4. If size is 8 * VEC_SIZE or less, unroll the loop. -+ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory -+ area. -+ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. -+ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. -+ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ -+ -+ -+#ifndef MEMCMP -+# define MEMCMP memcmp_avx2 -+#endif -+ -+#ifndef L -+# define L(label) .L##label -+#endif -+ -+#ifndef ALIGN -+# define ALIGN(n) .p2align n -+#endif -+ -+#ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+#endif -+ -+#ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+#endif -+ -+#ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+#endif -+ -+#ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+#endif -+ -+#ifndef ALIGN -+# define ALIGN(n) .p2align n -+#endif -+ -+# ifdef USE_AS_WMEMCMP -+# define VPCMPEQ vpcmpeqd -+# else -+# define VPCMPEQ vpcmpeqb -+# endif -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+# define VEC_SIZE 32 -+# define VEC_MASK ((1 << VEC_SIZE) - 1) -+ .section .text.avx,"ax",@progbits -+ENTRY (MEMCMP) -+# ifdef USE_AS_WMEMCMP -+ shl $2, %RDX_LP -+# elif defined __ILP32__ -+ /* Clear the upper 32 bits. */ -+ movl %edx, %edx -+# endif -+ cmp $VEC_SIZE, %rdx -+ jb L(less_vec) -+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ cmpq $(VEC_SIZE * 2), %rdx -+ jbe L(last_vec) -+ VPCMPEQ %ymm0, %ymm0, %ymm0 -+ /* More than 2 * VEC. 
*/ -+ cmpq $(VEC_SIZE * 8), %rdx -+ ja L(more_8x_vec) -+ cmpq $(VEC_SIZE * 4), %rdx -+ jb L(last_4x_vec) -+ /* From 4 * VEC to 8 * VEC, inclusively. */ -+ vmovdqu (%rsi), %ymm1 -+ VPCMPEQ (%rdi), %ymm1, %ymm1 -+ vmovdqu VEC_SIZE(%rsi), %ymm2 -+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 -+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 -+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 -+ vpand %ymm1, %ymm2, %ymm5 -+ vpand %ymm3, %ymm4, %ymm6 -+ vpand %ymm5, %ymm6, %ymm5 -+ vptest %ymm0, %ymm5 -+ jnc L(4x_vec_end) -+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi -+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi -+ vmovdqu (%rsi), %ymm1 -+ VPCMPEQ (%rdi), %ymm1, %ymm1 -+ vmovdqu VEC_SIZE(%rsi), %ymm2 -+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 -+ vpand %ymm2, %ymm1, %ymm5 -+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 -+ vpand %ymm3, %ymm5, %ymm5 -+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 -+ vpand %ymm4, %ymm5, %ymm5 -+ vptest %ymm0, %ymm5 -+ jnc L(4x_vec_end) -+ xorl %eax, %eax -+ VZEROUPPER -+ ret -+ .p2align 4 -+L(last_2x_vec): -+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+L(last_vec): -+ /* Use overlapping loads to avoid branches. */ -+ leaq -VEC_SIZE(%rdi, %rdx), %rdi -+ leaq -VEC_SIZE(%rsi, %rdx), %rsi -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ VZEROUPPER -+ ret -+ .p2align 4 -+L(first_vec): -+ /* A byte or int32 is different within 16 or 32 bytes. 
*/ -+ tzcntl %eax, %ecx -+# ifdef USE_AS_WMEMCMP -+ xorl %eax, %eax -+ movl (%rdi, %rcx), %edx -+ cmpl (%rsi, %rcx), %edx -+L(wmemcmp_return): -+ /* Turn the preceding signed cmpl into -1 (first < second) or 1. */ -+ setl %al -+ negl %eax -+ orl $1, %eax -+# else -+ movzbl (%rdi, %rcx), %eax -+ movzbl (%rsi, %rcx), %edx -+ sub %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+# ifdef USE_AS_WMEMCMP -+ .p2align 4 -+L(4): -+ xorl %eax, %eax -+ movl (%rdi), %edx -+ cmpl (%rsi), %edx -+ jne L(wmemcmp_return) -+ ret -+# else -+ -+L(between_4_7): -+ /* Load as big endian with overlapping movbe to avoid branches. */ -+ movbe (%rdi), %eax -+ movbe (%rsi), %ecx -+ shlq $32, %rax -+ shlq $32, %rcx -+ movbe -4(%rdi, %rdx), %edi -+ movbe -4(%rsi, %rdx), %esi -+ orq %rdi, %rax -+ orq %rsi, %rcx -+ subq %rcx, %rax -+ je L(exit) -+ /* Not equal: the borrow from subq selects -1 (first < second) or 1. */ -+ sbbl %eax, %eax -+ orl $1, %eax -+ ret -+ .p2align 4 -+ -+L(exit): -+ /* Reached from L(between_4_7) when the inputs are equal; %rax is already zero. */ -+ ret -+ .p2align 4 -+L(between_2_3): -+ /* Load as big endian to avoid branches. */ -+ movzwl (%rdi), %eax -+ movzwl (%rsi), %ecx -+ shll $8, %eax -+ shll $8, %ecx -+ bswap %eax -+ bswap %ecx -+ movb -1(%rdi, %rdx), %al -+ movb -1(%rsi, %rdx), %cl -+ /* Subtraction is okay because the upper 8 bits are zero. */ -+ subl %ecx, %eax -+ ret -+ .p2align 4 -+L(1): -+ movzbl (%rdi), %eax -+ movzbl (%rsi), %ecx -+ sub %ecx, %eax -+ ret -+# endif -+ .p2align 4 -+L(zero): -+ xorl %eax, %eax -+ ret -+ .p2align 4 -+L(less_vec): -+# ifdef USE_AS_WMEMCMP -+ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ -+ cmpb $4, %dl -+ je L(4) -+ jb L(zero) -+# else -+ /* Dispatch on the byte count in %dl: 0 -> L(zero), 1 -> L(1), 2..3 -> L(between_2_3), 4..7 -> L(between_4_7). */ -+ -+ cmpb $1, %dl -+ je L(1) -+ jb L(zero) -+ -+ cmpb $4, %dl -+ jb L(between_2_3) -+ cmpb $8, %dl -+ jb L(between_4_7) -+# endif -+ cmpb $16, %dl -+ jae L(between_16_31) -+ /* It is between 8 and 15 bytes.
*/ -+ vmovq (%rdi), %xmm1 -+ vmovq (%rsi), %xmm2 -+ VPCMPEQ %xmm1, %xmm2, %xmm2 -+ vpmovmskb %xmm2, %eax -+ subl $0xffff, %eax -+ jnz L(first_vec) -+ /* Use overlapping loads to avoid branches. */ -+ leaq -8(%rdi, %rdx), %rdi -+ leaq -8(%rsi, %rdx), %rsi -+ vmovq (%rdi), %xmm1 -+ vmovq (%rsi), %xmm2 -+ VPCMPEQ %xmm1, %xmm2, %xmm2 -+ vpmovmskb %xmm2, %eax -+ subl $0xffff, %eax -+ jnz L(first_vec) -+ ret -+ .p2align 4 -+L(between_16_31): -+ /* From 16 to 31 bytes. No branch when size == 16. */ -+ vmovdqu (%rsi), %xmm2 -+ VPCMPEQ (%rdi), %xmm2, %xmm2 -+ vpmovmskb %xmm2, %eax -+ subl $0xffff, %eax -+ jnz L(first_vec) -+ /* Use overlapping loads to avoid branches. */ -+ leaq -16(%rdi, %rdx), %rdi -+ leaq -16(%rsi, %rdx), %rsi -+ vmovdqu (%rsi), %xmm2 -+ VPCMPEQ (%rdi), %xmm2, %xmm2 -+ vpmovmskb %xmm2, %eax -+ subl $0xffff, %eax -+ jnz L(first_vec) -+ ret -+ .p2align 4 -+L(more_8x_vec): -+ /* More than 8 * VEC. Check the first VEC. */ -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ /* Align the first memory area for aligned loads in the loop. -+ Compute how much the first memory area is misaligned. */ -+ movq %rdi, %rcx -+ andl $(VEC_SIZE - 1), %ecx -+ /* Get the negative of offset for alignment. */ -+ subq $VEC_SIZE, %rcx -+ /* Adjust the second memory area. */ -+ subq %rcx, %rsi -+ /* Adjust the first memory area which should be aligned now. */ -+ subq %rcx, %rdi -+ /* Adjust length. */ -+ addq %rcx, %rdx -+L(loop_4x_vec): -+ /* Compare 4 * VEC at a time forward. 
*/ -+ vmovdqu (%rsi), %ymm1 -+ VPCMPEQ (%rdi), %ymm1, %ymm1 -+ vmovdqu VEC_SIZE(%rsi), %ymm2 -+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 -+ vpand %ymm2, %ymm1, %ymm5 -+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 -+ vpand %ymm3, %ymm5, %ymm5 -+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 -+ vpand %ymm4, %ymm5, %ymm5 -+ vptest %ymm0, %ymm5 -+ jnc L(4x_vec_end) -+ addq $(VEC_SIZE * 4), %rdi -+ addq $(VEC_SIZE * 4), %rsi -+ subq $(VEC_SIZE * 4), %rdx -+ cmpq $(VEC_SIZE * 4), %rdx -+ jae L(loop_4x_vec) -+ /* Less than 4 * VEC. */ -+ cmpq $VEC_SIZE, %rdx -+ jbe L(last_vec) -+ cmpq $(VEC_SIZE * 2), %rdx -+ jbe L(last_2x_vec) -+L(last_4x_vec): -+ /* From 2 * VEC to 4 * VEC. */ -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ addq $VEC_SIZE, %rdi -+ addq $VEC_SIZE, %rsi -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ /* Use overlapping loads to avoid branches. 
*/ -+ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi -+ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ addq $VEC_SIZE, %rdi -+ addq $VEC_SIZE, %rsi -+ vmovdqu (%rsi), %ymm2 -+ VPCMPEQ (%rdi), %ymm2, %ymm2 -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ VZEROUPPER -+ ret -+ .p2align 4 -+L(4x_vec_end): -+ vpmovmskb %ymm1, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec) -+ vpmovmskb %ymm2, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec_x1) -+ vpmovmskb %ymm3, %eax -+ subl $VEC_MASK, %eax -+ jnz L(first_vec_x2) -+ vpmovmskb %ymm4, %eax -+ subl $VEC_MASK, %eax -+ tzcntl %eax, %ecx -+# ifdef USE_AS_WMEMCMP -+ xorl %eax, %eax -+ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx -+ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx -+ jmp L(wmemcmp_return) -+# else -+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax -+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx -+ sub %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+ .p2align 4 -+L(first_vec_x1): -+ tzcntl %eax, %ecx -+# ifdef USE_AS_WMEMCMP -+ xorl %eax, %eax -+ movl VEC_SIZE(%rdi, %rcx), %edx -+ cmpl VEC_SIZE(%rsi, %rcx), %edx -+ jmp L(wmemcmp_return) -+# else -+ movzbl VEC_SIZE(%rdi, %rcx), %eax -+ movzbl VEC_SIZE(%rsi, %rcx), %edx -+ sub %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+ .p2align 4 -+L(first_vec_x2): -+ tzcntl %eax, %ecx -+# ifdef USE_AS_WMEMCMP -+ xorl %eax, %eax -+ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx -+ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx -+ jmp L(wmemcmp_return) -+# else -+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax -+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx -+ sub %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+END (MEMCMP) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S -new file mode 100644 -index 000000000..a958fb56d ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-memrchr-kbl.S -@@ -0,0 +1,408 @@ -+/* memrchr optimized with 
AVX2. -+ Copyright (C) 2017-2019 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#ifndef L -+# define L(label) .L##label -+#endif -+ -+#ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+#endif -+ -+#ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+#endif -+ -+#ifndef cfi_rel_offset -+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -+#endif -+ -+#ifndef cfi_restore -+# define cfi_restore(reg) .cfi_restore reg -+#endif -+ -+#ifndef cfi_adjust_cfa_offset -+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -+#endif -+ -+#ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+#endif -+ -+#ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+#endif -+ -+#define CFI_PUSH(REG) \ -+ cfi_adjust_cfa_offset (4); \ -+ cfi_rel_offset (REG, 0) -+ -+#define CFI_POP(REG) \ -+ cfi_adjust_cfa_offset (-4); \ -+ cfi_restore (REG) -+ -+#define PUSH(REG) pushl REG; CFI_PUSH (REG) -+#define POP(REG) popl REG; CFI_POP (REG) -+ -+# ifndef MEMRCHR -+# define MEMRCHR memrchr_avx2 -+# endif -+ -+#ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (MEMRCHR) -+ /* Broadcast CHAR to YMM0. 
*/ -+ vmovd %esi, %xmm0 -+ vpbroadcastb %xmm0, %ymm0 -+ -+ /* %rdx is the length in bytes. Inputs of at most VEC_SIZE bytes take L(last_vec_or_less); otherwise point %rdi at the final VEC_SIZE bytes of the buffer. */ -+ sub $VEC_SIZE, %rdx -+ jbe L(last_vec_or_less) -+ -+ add %rdx, %rdi -+ -+ /* Check the last VEC_SIZE bytes. */ -+ vpcmpeqb (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x0) -+ -+ subq $(VEC_SIZE * 4), %rdi -+ movl %edi, %ecx -+ andl $(VEC_SIZE - 1), %ecx -+ jz L(aligned_more) -+ -+ /* Align data for aligned loads in the loop. */ -+ addq $VEC_SIZE, %rdi -+ addq $VEC_SIZE, %rdx -+ andq $-VEC_SIZE, %rdi -+ subq %rcx, %rdx -+ -+ .p2align 4 -+L(aligned_more): -+ subq $(VEC_SIZE * 4), %rdx -+ jbe L(last_4x_vec_or_less) -+ -+ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time -+ since data is only aligned to VEC_SIZE. */ -+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x3) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x2) -+ -+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x1) -+ -+ vpcmpeqb (%rdi), %ymm0, %ymm4 -+ vpmovmskb %ymm4, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x0) -+ -+ /* Align data to 4 * VEC_SIZE for loop with fewer branches. -+ There are some overlaps with above if data isn't aligned -+ to 4 * VEC_SIZE. */ -+ movl %edi, %ecx -+ andl $(VEC_SIZE * 4 - 1), %ecx -+ jz L(loop_4x_vec) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ addq $(VEC_SIZE * 4), %rdx -+ andq $-(VEC_SIZE * 4), %rdi -+ subq %rcx, %rdx -+ -+ .p2align 4 -+L(loop_4x_vec): -+ /* Compare 4 * VEC at a time backward: %rdi steps down by 4 * VEC_SIZE on every iteration.
*/ -+ subq $(VEC_SIZE * 4), %rdi -+ subq $(VEC_SIZE * 4), %rdx -+ jbe L(last_4x_vec_or_less) -+ -+ vmovdqa (%rdi), %ymm1 -+ vmovdqa VEC_SIZE(%rdi), %ymm2 -+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 -+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 -+ -+ vpcmpeqb %ymm1, %ymm0, %ymm1 -+ vpcmpeqb %ymm2, %ymm0, %ymm2 -+ vpcmpeqb %ymm3, %ymm0, %ymm3 -+ vpcmpeqb %ymm4, %ymm0, %ymm4 -+ -+ vpor %ymm1, %ymm2, %ymm5 -+ vpor %ymm3, %ymm4, %ymm6 -+ vpor %ymm5, %ymm6, %ymm5 -+ -+ vpmovmskb %ymm5, %eax -+ testl %eax, %eax -+ jz L(loop_4x_vec) -+ -+ /* There is a match. Test the highest-addressed vector first so the last occurrence wins. */ -+ vpmovmskb %ymm4, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x3) -+ -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x2) -+ -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x1) -+ -+ vpmovmskb %ymm1, %eax -+ bsrl %eax, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_4x_vec_or_less): -+ addl $(VEC_SIZE * 4), %edx -+ cmpl $(VEC_SIZE * 2), %edx -+ jbe L(last_2x_vec) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x3) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x2) -+ -+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x1_check) -+ cmpl $(VEC_SIZE * 3), %edx -+ jbe L(zero) -+ -+ vpcmpeqb (%rdi), %ymm0, %ymm4 -+ vpmovmskb %ymm4, %eax -+ testl %eax, %eax -+ jz L(zero) -+ bsrl %eax, %eax -+ subq $(VEC_SIZE * 4), %rdx -+ addq %rax, %rdx -+ jl L(zero) -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_2x_vec): -+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(last_vec_x3_check) -+ cmpl $VEC_SIZE, %edx -+ jbe L(zero) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jz L(zero) -+ bsrl %eax, %eax -+ subq $(VEC_SIZE * 2), %rdx -+ addq %rax, %rdx -+ jl L(zero) -+ addl $(VEC_SIZE * 2), %eax -+
addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_x0): -+ bsrl %eax, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_x1): -+ bsrl %eax, %eax -+ addl $VEC_SIZE, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_x2): -+ bsrl %eax, %eax -+ addl $(VEC_SIZE * 2), %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_x3): -+ bsrl %eax, %eax -+ addl $(VEC_SIZE * 3), %eax -+ addq %rdi, %rax -+ /* Clear the upper YMM state before returning, like the sibling L(last_vec_x0), L(last_vec_x1) and L(last_vec_x2) exits. */ -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_x1_check): -+ bsrl %eax, %eax -+ subq $(VEC_SIZE * 3), %rdx -+ addq %rax, %rdx -+ jl L(zero) -+ addl $VEC_SIZE, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_x3_check): -+ bsrl %eax, %eax -+ subq $VEC_SIZE, %rdx -+ addq %rax, %rdx -+ jl L(zero) -+ addl $(VEC_SIZE * 3), %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(zero): -+ VZEROUPPER -+L(null): -+ xorl %eax, %eax -+ ret -+ -+ .p2align 4 -+L(last_vec_or_less_aligned): -+ movl %edx, %ecx -+ -+ vpcmpeqb (%rdi), %ymm0, %ymm1 -+ -+ movl $1, %edx -+ /* Support rdx << 32. */ -+ salq %cl, %rdx -+ subq $1, %rdx -+ -+ vpmovmskb %ymm1, %eax -+ -+ /* Remove the trailing bytes. */ -+ andl %edx, %eax -+ testl %eax, %eax -+ jz L(zero) -+ -+ bsrl %eax, %eax -+ addq %rdi, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_or_less): -+ addl $VEC_SIZE, %edx -+ -+ /* Check for zero length. %ymm0 was already written by the broadcast at entry, so leave through L(zero) to run VZEROUPPER. */ -+ testl %edx, %edx -+ jz L(zero) -+ -+ movl %edi, %ecx -+ andl $(VEC_SIZE - 1), %ecx -+ jz L(last_vec_or_less_aligned) -+ -+ movl %ecx, %esi -+ movl %ecx, %r8d -+ addl %edx, %esi -+ andq $-VEC_SIZE, %rdi -+ -+ subl $VEC_SIZE, %esi -+ ja L(last_vec_2x_aligned) -+ -+ /* Check the last VEC. */ -+ vpcmpeqb (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ -+ /* Remove the leading and trailing bytes.
*/ -+ sarl %cl, %eax -+ movl %edx, %ecx -+ -+ movl $1, %edx -+ sall %cl, %edx -+ subl $1, %edx -+ -+ andl %edx, %eax -+ testl %eax, %eax -+ jz L(zero) -+ -+ bsrl %eax, %eax -+ addq %rdi, %rax -+ addq %r8, %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_vec_2x_aligned): -+ movl %esi, %ecx -+ -+ /* Check the last VEC. */ -+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 -+ -+ movl $1, %edx -+ sall %cl, %edx -+ subl $1, %edx -+ -+ vpmovmskb %ymm1, %eax -+ -+ /* Remove the trailing bytes. */ -+ andl %edx, %eax -+ -+ testl %eax, %eax -+ jnz L(last_vec_x1) -+ -+ /* Check the second last VEC. */ -+ vpcmpeqb (%rdi), %ymm0, %ymm1 -+ -+ movl %r8d, %ecx -+ -+ vpmovmskb %ymm1, %eax -+ -+ /* Remove the leading bytes. Must use unsigned right shift for -+ bsrl below. */ -+ shrl %cl, %eax -+ testl %eax, %eax -+ jz L(zero) -+ -+ bsrl %eax, %eax -+ addq %rdi, %rax -+ addq %r8, %rax -+ VZEROUPPER -+ ret -+END (MEMRCHR) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S -new file mode 100644 -index 000000000..7c485cf70 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wmemset-kbl.S -@@ -0,0 +1,140 @@ -+/* -+Copyright (C) 2019 The Android Open Source Project -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions -+are met: -+ * Redistributions of source code must retain the above copyright -+ notice, this list of conditions and the following disclaimer. -+ * Redistributions in binary form must reproduce the above copyright -+ notice, this list of conditions and the following disclaimer in -+ the documentation and/or other materials provided with the -+ distribution. 
-+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -+FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -+COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -+BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -+OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -+AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT -+OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+SUCH DAMAGE. -+*/ -+ -+/* ENTRY() and END() are used below but not defined in this file; they are expected to come from this header. */ -+#include <private/bionic_asm.h> -+ -+#ifndef WMEMSET -+ #define WMEMSET wmemset_avx2 -+#endif -+ -+ .section .text.avx2,"ax",@progbits -+ -+/* wmemset_avx2: %rdi = destination, %esi = 32-bit fill value, %rdx = number of 32-bit elements. Counts below 32 go straight to the scalar tail loop; larger counts are filled with 32-byte vector stores first. */ -+ENTRY (WMEMSET) -+# BB#0: -+ testq %rdx, %rdx -+ je .LBB0_14 -+# BB#1: -+ cmpq $32, %rdx -+ jae .LBB0_3 -+# BB#2: -+ xorl %r8d, %r8d -+ movq %rdi, %rax -+ jmp .LBB0_12 -+.LBB0_3: -+ movq %rdx, %r8 -+ andq $-32, %r8 -+ vmovd %esi, %xmm0 -+ vpbroadcastd %xmm0, %ymm0 -+ leaq -32(%r8), %rcx -+ movq %rcx, %rax -+ shrq $5, %rax -+ leal 1(%rax), %r9d -+ andl $7, %r9d -+ cmpq $224, %rcx -+ jae .LBB0_5 -+# BB#4: -+ xorl %eax, %eax -+ testq %r9, %r9 -+ jne .LBB0_8 -+ jmp .LBB0_10 -+.LBB0_5: -+ leaq 992(%rdi), %rcx -+ leaq -1(%r9), %r10 -+ subq %rax, %r10 -+ xorl %eax, %eax -+ .p2align 4, 0x90 -+.LBB0_6: # =>This Inner Loop Header: Depth=1 -+ vmovdqu %ymm0, -992(%rcx,%rax,4) -+ vmovdqu %ymm0, -960(%rcx,%rax,4) -+ vmovdqu %ymm0, -928(%rcx,%rax,4) -+ vmovdqu %ymm0, -896(%rcx,%rax,4) -+ vmovdqu %ymm0, -864(%rcx,%rax,4) -+ vmovdqu %ymm0, -832(%rcx,%rax,4) -+ vmovdqu %ymm0, -800(%rcx,%rax,4) -+ vmovdqu %ymm0, -768(%rcx,%rax,4) -+ vmovdqu %ymm0, -736(%rcx,%rax,4) -+ vmovdqu %ymm0, -704(%rcx,%rax,4) -+ vmovdqu %ymm0, -672(%rcx,%rax,4) -+ vmovdqu %ymm0, -640(%rcx,%rax,4) -+ vmovdqu %ymm0,
-608(%rcx,%rax,4) -+ vmovdqu %ymm0, -576(%rcx,%rax,4) -+ vmovdqu %ymm0, -544(%rcx,%rax,4) -+ vmovdqu %ymm0, -512(%rcx,%rax,4) -+ vmovdqu %ymm0, -480(%rcx,%rax,4) -+ vmovdqu %ymm0, -448(%rcx,%rax,4) -+ vmovdqu %ymm0, -416(%rcx,%rax,4) -+ vmovdqu %ymm0, -384(%rcx,%rax,4) -+ vmovdqu %ymm0, -352(%rcx,%rax,4) -+ vmovdqu %ymm0, -320(%rcx,%rax,4) -+ vmovdqu %ymm0, -288(%rcx,%rax,4) -+ vmovdqu %ymm0, -256(%rcx,%rax,4) -+ vmovdqu %ymm0, -224(%rcx,%rax,4) -+ vmovdqu %ymm0, -192(%rcx,%rax,4) -+ vmovdqu %ymm0, -160(%rcx,%rax,4) -+ vmovdqu %ymm0, -128(%rcx,%rax,4) -+ vmovdqu %ymm0, -96(%rcx,%rax,4) -+ vmovdqu %ymm0, -64(%rcx,%rax,4) -+ vmovdqu %ymm0, -32(%rcx,%rax,4) -+ vmovdqu %ymm0, (%rcx,%rax,4) -+ addq $256, %rax # imm = 0x100 -+ addq $8, %r10 -+ jne .LBB0_6 -+# BB#7: -+ testq %r9, %r9 -+ je .LBB0_10 -+.LBB0_8: -+ leaq (%rdi,%rax,4), %rax -+ addq $96, %rax -+ negq %r9 -+ .p2align 4, 0x90 -+.LBB0_9: # =>This Inner Loop Header: Depth=1 -+ vmovdqu %ymm0, -96(%rax) -+ vmovdqu %ymm0, -64(%rax) -+ vmovdqu %ymm0, -32(%rax) -+ vmovdqu %ymm0, (%rax) -+ subq $-128, %rax -+ addq $1, %r9 -+ jne .LBB0_9 -+.LBB0_10: -+ cmpq %rdx, %r8 -+ je .LBB0_14 -+# BB#11: -+ leaq (%rdi,%r8,4), %rax -+.LBB0_12: -+ subq %r8, %rdx -+ .p2align 4, 0x90 -+.LBB0_13: # =>This Inner Loop Header: Depth=1 -+ movl %esi, (%rax) -+ addq $4, %rax -+ addq $-1, %rdx -+ jne .LBB0_13 -+.LBB0_14: -+ movq %rdi, %rax -+ vzeroupper -+ retq -+END(WMEMSET) -diff --git a/libc/arch-x86_64/string/sse2-memmove-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -similarity index 99% -rename from libc/arch-x86_64/string/sse2-memmove-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -index 739502888..7024f4950 100644 ---- a/libc/arch-x86_64/string/sse2-memmove-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- #include "cache.h" - - #ifndef MEMMOVE --# define MEMMOVE memmove -+# define MEMMOVE memmove_generic - #endif - - #ifndef L -@@ -515,4 +515,4 @@ L(mm_large_page_loop_backward): - - END (MEMMOVE) - --ALIAS_SYMBOL(memcpy, MEMMOVE) -+//ALIAS_SYMBOL(memcpy, MEMMOVE) -diff --git a/libc/arch-x86_64/string/sse2-memset-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-memset-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-memset-slm.S -diff --git a/libc/arch-x86_64/string/sse2-stpcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-stpcpy-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S -diff --git a/libc/arch-x86_64/string/sse2-stpncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-stpncpy-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S -diff --git a/libc/arch-x86_64/string/sse2-strcat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-strcat-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S -diff --git a/libc/arch-x86_64/string/sse2-strcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-strcpy-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S -diff --git a/libc/arch-x86_64/string/sse2-strlen-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-strlen-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S -diff --git a/libc/arch-x86_64/string/sse2-strncat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-strncat-slm.S -rename to 
libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S -diff --git a/libc/arch-x86_64/string/sse2-strncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/sse2-strncpy-slm.S -rename to libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S -diff --git a/libc/arch-x86_64/string/sse4-memcmp-slm.S b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -similarity index 99% -rename from libc/arch-x86_64/string/sse4-memcmp-slm.S -rename to libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -index 8a8b180a2..6cfcd767f 100644 ---- a/libc/arch-x86_64/string/sse4-memcmp-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #include "cache.h" - - #ifndef MEMCMP --# define MEMCMP memcmp -+# define MEMCMP memcmp_generic - #endif - - #ifndef L -diff --git a/libc/arch-x86_64/string/ssse3-strcmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/ssse3-strcmp-slm.S -rename to libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S -diff --git a/libc/arch-x86_64/string/ssse3-strncmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S -similarity index 100% -rename from libc/arch-x86_64/string/ssse3-strncmp-slm.S -rename to libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S -diff --git a/libc/arch-x86_64/static_function_dispatch.S b/libc/arch-x86_64/static_function_dispatch.S -index 93ff5f2fc..979ce4f18 100644 ---- a/libc/arch-x86_64/static_function_dispatch.S -+++ b/libc/arch-x86_64/static_function_dispatch.S -@@ -35,3 +35,9 @@ END(name) - - FUNCTION_DELEGATE(memset, memset_generic) - FUNCTION_DELEGATE(__memset_chk, __memset_chk_generic) -+FUNCTION_DELEGATE(memcmp, memcmp_generic) -+FUNCTION_DELEGATE(memcpy, memmove_generic) -+FUNCTION_DELEGATE(memmove, memmove_generic) -+FUNCTION_DELEGATE(memchr, memchr_openbsd) 
-+FUNCTION_DELEGATE(memrchr, memrchr_openbsd) -+//FUNCTION_DELEGATE(wmemset, wmemset_freebsd) --- -2.25.1 - diff --git a/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch b/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch deleted file mode 100644 index 0432f627fd..0000000000 --- a/aosp_diff/preliminary/bionic/0004-Optimize-bionic-string-functions-with-avx-implementa.patch +++ /dev/null @@ -1,4169 +0,0 @@ -From b6a7f45aa68426f4e32a4bf51e71ec5453f25f8d Mon Sep 17 00:00:00 2001 -From: Ravi Kumar Soni -Date: Mon, 28 Oct 2024 15:08:14 +0530 -Subject: [PATCH 4/5] Optimize bionic string functions with avx implementation - -Following are the string functions that has been -optimized with avx2 implementation from glibc 2.32 version. - - strcmp, strncmp - - strlen, strnlen - - strchr, strrchr - - strcpy, strncpy - - stpcpy, stpncpy - - strcat, strncat - - wcscmp, wcsncmp - - wcslen, wcsnlen - - wcschr, wcsrchr - -Test done: Build and boot is fine, Run the benchmarks suite. 
- -Change-Id: I7f08a7507d25447ce886e9fde0482527c3f7a178 -Signed-off-by: ahs -Signed-off-by: Ravi Kumar Soni ---- - libc/Android.bp | 45 +- - .../arch-x86_64/dynamic_function_dispatch.cpp | 133 ++- - libc/arch-x86_64/generic/string/memchr.c | 2 +- - libc/arch-x86_64/generic/string/memrchr.c | 2 +- - libc/arch-x86_64/generic/string/strchr.cpp | 19 + - libc/arch-x86_64/generic/string/strnlen.cpp | 19 + - libc/arch-x86_64/generic/string/strrchr.cpp | 19 + - libc/arch-x86_64/generic/string/wcschr.c | 19 + - libc/arch-x86_64/generic/string/wcscmp.c | 19 + - libc/arch-x86_64/generic/string/wcslen.c | 19 + - libc/arch-x86_64/generic/string/wcsncmp.c | 19 + - libc/arch-x86_64/generic/string/wcsnlen.c | 19 + - libc/arch-x86_64/generic/string/wcsrchr.c | 19 + - libc/arch-x86_64/generic/string/wmemset.c | 2 +- - .../{ => kabylake}/string/avx2-memset-kbl.S | 0 - .../kabylake/string/avx2-stpcpy-kbl.S | 3 + - .../kabylake/string/avx2-stpncpy-kbl.S | 5 + - .../kabylake/string/avx2-strcat-kbl.S | 299 +++++ - .../kabylake/string/avx2-strchr-kbl.S | 277 +++++ - .../kabylake/string/avx2-strcmp-kbl.S | 885 ++++++++++++++ - .../kabylake/string/avx2-strcpy-kbl.S | 1046 +++++++++++++++++ - .../kabylake/string/avx2-strlen-kbl.S | 418 +++++++ - .../kabylake/string/avx2-strncat-kbl.S | 3 + - .../kabylake/string/avx2-strncmp-kbl.S | 4 + - .../kabylake/string/avx2-strncpy-kbl.S | 4 + - .../kabylake/string/avx2-strnlen-kbl.S | 4 + - .../kabylake/string/avx2-strrchr-kbl.S | 258 ++++ - .../kabylake/string/avx2-wcschr-kbl.S | 3 + - .../kabylake/string/avx2-wcscmp-kbl.S | 4 + - .../kabylake/string/avx2-wcslen-kbl.S | 4 + - .../kabylake/string/avx2-wcsncmp-kbl.S | 6 + - .../kabylake/string/avx2-wcsnlen-kbl.S | 6 + - .../kabylake/string/avx2-wcsrchr-kbl.S | 3 + - libc/arch-x86_64/kabylake/string/avx_regs.h | 26 + - .../{include => kabylake/string}/cache.h | 0 - libc/arch-x86_64/silvermont/string/cache.h | 36 + - .../silvermont/string/sse2-stpcpy-slm.S | 2 +- - .../silvermont/string/sse2-stpncpy-slm.S 
| 2 +- - .../silvermont/string/sse2-strcat-slm.S | 2 +- - .../silvermont/string/sse2-strcpy-slm.S | 2 +- - .../silvermont/string/sse2-strlen-slm.S | 2 +- - .../silvermont/string/sse2-strncat-slm.S | 2 +- - .../silvermont/string/sse2-strncpy-slm.S | 2 +- - .../silvermont/string/ssse3-strcmp-slm.S | 2 +- - .../silvermont/string/ssse3-strncmp-slm.S | 2 +- - libc/arch-x86_64/static_function_dispatch.S | 25 +- - 46 files changed, 3669 insertions(+), 23 deletions(-) - create mode 100644 libc/arch-x86_64/generic/string/strchr.cpp - create mode 100644 libc/arch-x86_64/generic/string/strnlen.cpp - create mode 100644 libc/arch-x86_64/generic/string/strrchr.cpp - create mode 100644 libc/arch-x86_64/generic/string/wcschr.c - create mode 100644 libc/arch-x86_64/generic/string/wcscmp.c - create mode 100644 libc/arch-x86_64/generic/string/wcslen.c - create mode 100644 libc/arch-x86_64/generic/string/wcsncmp.c - create mode 100644 libc/arch-x86_64/generic/string/wcsnlen.c - create mode 100644 libc/arch-x86_64/generic/string/wcsrchr.c - rename libc/arch-x86_64/{ => kabylake}/string/avx2-memset-kbl.S (100%) - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S - create mode 100644 
libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S - create mode 100644 libc/arch-x86_64/kabylake/string/avx_regs.h - rename libc/arch-x86_64/{include => kabylake/string}/cache.h (100%) - create mode 100644 libc/arch-x86_64/silvermont/string/cache.h - -diff --git a/libc/Android.bp b/libc/Android.bp -index 530ce9111..92483e833 100644 ---- a/libc/Android.bp -+++ b/libc/Android.bp -@@ -377,6 +377,17 @@ cc_library_static { - "upstream-freebsd/lib/libc/string/wmemcmp.c", - ], - }, -+ x86_64: { -+ exclude_srcs: [ -+ "upstream-freebsd/lib/libc/string/wcscmp.c", -+ "upstream-freebsd/lib/libc/string/wcsncmp.c", -+ "upstream-freebsd/lib/libc/string/wcslen.c", -+ "upstream-freebsd/lib/libc/string/wcsnlen.c", -+ "upstream-freebsd/lib/libc/string/wcschr.c", -+ "upstream-freebsd/lib/libc/string/wcsrchr.c", -+ -+ ], -+ }, - }, - - cflags: [ -@@ -1185,7 +1196,6 @@ cc_library_static { - ], - }, - x86_64: { -- include_dirs: ["bionic/libc/arch-x86_64/include"], - srcs: [ - "arch-x86_64/bionic/__bionic_clone.S", - "arch-x86_64/bionic/_exit_with_stack_teardown.S", -@@ -1194,7 +1204,7 @@ cc_library_static { - "arch-x86_64/bionic/syscall.S", - "arch-x86_64/bionic/vfork.S", - -- "arch-x86_64/string/avx2-memset-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-memset-kbl.S", - "arch-x86_64/silvermont/string/sse2-memmove-slm.S", - "arch-x86_64/silvermont/string/sse2-memset-slm.S", - "arch-x86_64/silvermont/string/sse2-stpcpy-slm.S", -@@ -1211,17 +1221,42 @@ cc_library_static { - //"arch-x86_64/generic/string/wmemset.c" - "arch-x86_64/generic/string/memchr.c", - "arch-x86_64/generic/string/memrchr.c", -+ 
"arch-x86_64/generic/string/strchr.cpp", -+ "arch-x86_64/generic/string/strrchr.cpp", -+ "arch-x86_64/generic/string/strnlen.cpp", -+ "arch-x86_64/generic/string/wcscmp.c", -+ "arch-x86_64/generic/string/wcsncmp.c", -+ "arch-x86_64/generic/string/wcslen.c", -+ "arch-x86_64/generic/string/wcsnlen.c", -+ "arch-x86_64/generic/string/wcschr.c", -+ "arch-x86_64/generic/string/wcsrchr.c", - - //"arch-x86_64/kabylake/string/avx2-wmemset-kbl.S" - "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", - "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", - "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strcmp-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strncmp-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strlen-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strnlen-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strchr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strrchr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strcpy-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strncpy-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strcat-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-strncat-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcslen-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcschr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S", - -- "bionic/strchr.cpp", - "bionic/strchrnul.cpp", -- "bionic/strnlen.cpp", -- "bionic/strrchr.cpp", - ], -+ - }, - }, - -diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp -index 43aaebb54..182eb4200 100644 ---- a/libc/arch-x86_64/dynamic_function_dispatch.cpp -+++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp -@@ -67,21 +67,148 @@ typedef void* memchr_func(const void* __s, int __ch, size_t 
__n); - DEFINE_IFUNC_FOR(memchr) { - __builtin_cpu_init(); - if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memchr_func, memchr_avx2); -- RETURN_FUNC(memchr_func, memchr_openbsd); -+ RETURN_FUNC(memchr_func, memchr_generic); - } - - typedef void* memrchr_func(const void* __s, int __ch, size_t __n); - DEFINE_IFUNC_FOR(memrchr) { - __builtin_cpu_init(); - if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memrchr_func, memrchr_avx2); -- RETURN_FUNC(memrchr_func, memrchr_openbsd); -+ RETURN_FUNC(memrchr_func, memrchr_generic); - } - - // typedef int wmemset_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n); - // DEFINE_IFUNC_FOR(wmemset) { - // __builtin_cpu_init(); - // if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wmemset_func, wmemset_avx2); --// RETURN_FUNC(wmemset_func, wmemset_freebsd); -+// RETURN_FUNC(wmemset_func, wmemset_generic); - // } - -+typedef int strcmp_func(const char* __lhs, const char* __rhs); -+DEFINE_IFUNC_FOR(strcmp) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcmp_func, strcmp_avx2); -+ RETURN_FUNC(strcmp_func, strcmp_generic); -+} -+ -+typedef int strncmp_func(const char* __lhs, const char* __rhs, size_t __n); -+DEFINE_IFUNC_FOR(strncmp) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncmp_func, strncmp_avx2); -+ RETURN_FUNC(strncmp_func, strncmp_generic); -+} -+ -+typedef char* strcpy_func(char* __dst, const char* __src); -+DEFINE_IFUNC_FOR(strcpy) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcpy_func, strcpy_avx2); -+ RETURN_FUNC(strcpy_func, strcpy_generic); -+} -+ -+typedef char* strncpy_func(char* __dst, const char* __src, size_t __n); -+DEFINE_IFUNC_FOR(strncpy) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncpy_func, strncpy_avx2); -+ RETURN_FUNC(strncpy_func, strncpy_generic); -+} -+ -+typedef char* stpcpy_func(char* __dst, const char* __src); -+DEFINE_IFUNC_FOR(stpcpy) { -+ 
__builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(stpcpy_func, stpcpy_avx2); -+ RETURN_FUNC(stpcpy_func, stpcpy_generic); -+} -+ -+typedef char* stpncpy_func(char* __dst, const char* __src, size_t __n); -+DEFINE_IFUNC_FOR(stpncpy) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(stpncpy_func, stpncpy_avx2); -+ RETURN_FUNC(stpncpy_func, stpncpy_generic); -+} -+ -+typedef size_t strlen_func(const char* __s); -+DEFINE_IFUNC_FOR(strlen) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strlen_func, strlen_avx2); -+ RETURN_FUNC(strlen_func, strlen_generic); -+} -+ -+ -+typedef size_t strnlen_func(const char* __s, size_t __n); -+DEFINE_IFUNC_FOR(strnlen) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strnlen_func, strnlen_avx2); -+ RETURN_FUNC(strnlen_func, strnlen_generic); -+} -+ -+typedef char* strchr_func(const char* __s, int __ch); -+DEFINE_IFUNC_FOR(strchr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strchr_func, strchr_avx2); -+ RETURN_FUNC(strchr_func, strchr_generic); -+} -+ -+typedef char* strrchr_func(const char* __s, int __ch); -+DEFINE_IFUNC_FOR(strrchr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strrchr_func, strrchr_avx2); -+ RETURN_FUNC(strrchr_func, strrchr_generic); -+} -+ -+typedef char* strcat_func(char* __dst, const char* __src); -+DEFINE_IFUNC_FOR(strcat) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strcat_func, strcat_avx2); -+ RETURN_FUNC(strcat_func, strcat_generic); -+} -+ -+typedef char* strncat_func(char* __dst, const char* __src, size_t __n); -+DEFINE_IFUNC_FOR(strncat) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(strncat_func, strncat_avx2); -+ RETURN_FUNC(strncat_func, strncat_generic); -+} -+ -+typedef int wcscmp_func(const wchar_t* __lhs, const wchar_t* __rhs); -+DEFINE_IFUNC_FOR(wcscmp) { -+ 
__builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcscmp_func, wcscmp_avx2); -+ RETURN_FUNC(wcscmp_func, wcscmp_generic); -+} -+ -+typedef int wcsncmp_func(const wchar_t* __lhs, const wchar_t* __rhs, size_t __n); -+DEFINE_IFUNC_FOR(wcsncmp) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsncmp_func, wcsncmp_avx2); -+ RETURN_FUNC(wcsncmp_func, wcsncmp_generic); -+} -+ -+typedef size_t wcslen_func(const wchar_t* __s); -+DEFINE_IFUNC_FOR(wcslen) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcslen_func, wcslen_avx2); -+ RETURN_FUNC(wcslen_func, wcslen_generic); -+} -+ -+typedef size_t wcsnlen_func(const wchar_t* __s, size_t __n); -+DEFINE_IFUNC_FOR(wcsnlen) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsnlen_func, wcsnlen_avx2); -+ RETURN_FUNC(wcsnlen_func, wcsnlen_generic); -+} -+ -+typedef wchar_t* wcschr_func(const wchar_t* __s, wchar_t __wc); -+DEFINE_IFUNC_FOR(wcschr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcschr_func, wcschr_avx2); -+ RETURN_FUNC(wcschr_func, wcschr_generic); -+} -+ -+typedef wchar_t* wcsrchr_func(const wchar_t* __s, wchar_t __wc); -+DEFINE_IFUNC_FOR(wcsrchr) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(wcsrchr_func, wcsrchr_avx2); -+ RETURN_FUNC(wcsrchr_func, wcsrchr_generic); -+} -+ - } // extern "C" -diff --git a/libc/arch-x86_64/generic/string/memchr.c b/libc/arch-x86_64/generic/string/memchr.c -index 86ee02e0b..e6fc3eb84 100644 ---- a/libc/arch-x86_64/generic/string/memchr.c -+++ b/libc/arch-x86_64/generic/string/memchr.c -@@ -15,6 +15,6 @@ - */ - - #include --#define memchr memchr_openbsd -+#define memchr memchr_generic - - #include -diff --git a/libc/arch-x86_64/generic/string/memrchr.c b/libc/arch-x86_64/generic/string/memrchr.c -index c803009f5..ee085e384 100644 ---- a/libc/arch-x86_64/generic/string/memrchr.c -+++ 
b/libc/arch-x86_64/generic/string/memrchr.c -@@ -15,6 +15,6 @@ - */ - - #include --#define memrchr memrchr_openbsd -+#define memrchr memrchr_generic - - #include -diff --git a/libc/arch-x86_64/generic/string/strchr.cpp b/libc/arch-x86_64/generic/string/strchr.cpp -new file mode 100644 -index 000000000..8a3d6d619 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/strchr.cpp -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define strchr strchr_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/strnlen.cpp b/libc/arch-x86_64/generic/string/strnlen.cpp -new file mode 100644 -index 000000000..f60348656 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/strnlen.cpp -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. 
-+*/ -+ -+#define strnlen strnlen_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/strrchr.cpp b/libc/arch-x86_64/generic/string/strrchr.cpp -new file mode 100644 -index 000000000..9f0f33fd2 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/strrchr.cpp -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define strrchr strrchr_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcschr.c b/libc/arch-x86_64/generic/string/wcschr.c -new file mode 100644 -index 000000000..d45e45d20 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcschr.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. 
-+*/ -+ -+#define wcschr wcschr_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcscmp.c b/libc/arch-x86_64/generic/string/wcscmp.c -new file mode 100644 -index 000000000..e55bab549 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcscmp.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define wcscmp wcscmp_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcslen.c b/libc/arch-x86_64/generic/string/wcslen.c -new file mode 100644 -index 000000000..5b873fc30 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcslen.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. 
-+*/ -+ -+#define wcslen wcslen_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcsncmp.c b/libc/arch-x86_64/generic/string/wcsncmp.c -new file mode 100644 -index 000000000..40b2ca2f3 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcsncmp.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define wcsncmp wcsncmp_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcsnlen.c b/libc/arch-x86_64/generic/string/wcsnlen.c -new file mode 100644 -index 000000000..91051cea7 ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcsnlen.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. 
-+*/ -+ -+#define wcsnlen wcsnlen_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wcsrchr.c b/libc/arch-x86_64/generic/string/wcsrchr.c -new file mode 100644 -index 000000000..73e8c25bc ---- /dev/null -+++ b/libc/arch-x86_64/generic/string/wcsrchr.c -@@ -0,0 +1,19 @@ -+/* -+ * Copyright (C) 2019 The Android Open Source Project -+ * -+ * Licensed under the Apache License, Version 2.0 (the "License"); -+ * you may not use this file except in compliance with the License. -+ * You may obtain a copy of the License at -+ * -+ * http://www.apache.org/licenses/LICENSE-2.0 -+ * -+ * Unless required by applicable law or agreed to in writing, software -+ * distributed under the License is distributed on an "AS IS" BASIS, -+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -+ * See the License for the specific language governing permissions and -+ * limitations under the License. -+*/ -+ -+#define wcsrchr wcsrchr_generic -+ -+#include -diff --git a/libc/arch-x86_64/generic/string/wmemset.c b/libc/arch-x86_64/generic/string/wmemset.c -index ac6bd7ec4..9675fe91f 100644 ---- a/libc/arch-x86_64/generic/string/wmemset.c -+++ b/libc/arch-x86_64/generic/string/wmemset.c -@@ -15,6 +15,6 @@ - */ - - #include --#define wmemset wmemset_freebsd -+#define wmemset wmemset_generic - - #include -diff --git a/libc/arch-x86_64/string/avx2-memset-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S -similarity index 100% -rename from libc/arch-x86_64/string/avx2-memset-kbl.S -rename to libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S -diff --git a/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S -new file mode 100644 -index 000000000..63f9ba25b ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-stpcpy-kbl.S -@@ -0,0 +1,3 @@ -+#define USE_AS_STPCPY -+#define STRCPY stpcpy_avx2 -+#include "avx2-strcpy-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S 
b/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S -new file mode 100644 -index 000000000..c1bbdb29e ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-stpncpy-kbl.S -@@ -0,0 +1,5 @@ -+#define USE_AS_STPCPY -+#define USE_AS_STRNCPY -+#define STRCPY stpncpy_avx2 -+#include "avx_regs.h" -+#include "avx2-strcpy-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S -new file mode 100644 -index 000000000..d1e9b4b38 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strcat-kbl.S -@@ -0,0 +1,299 @@ -+/* strcat with AVX2 -+ Copyright (C) 2011-2020 Free Software Foundation, Inc. -+ Contributed by Intel Corporation. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ <https://www.gnu.org/licenses/>.
*/ -+ -+ -+ -+# ifndef STRCAT -+# define STRCAT strcat_avx2 -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+# define USE_AS_STRCAT -+ -+/* Number of bytes in a vector register */ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRCAT) -+ mov %rdi, %r9 -+# ifdef USE_AS_STRNCAT -+ mov %rdx, %r8 -+# endif -+ -+ xor %eax, %eax -+ mov %edi, %ecx -+ and $((VEC_SIZE * 4) - 1), %ecx -+ vpxor %xmm6, %xmm6, %xmm6 -+ cmp $(VEC_SIZE * 3), %ecx -+ ja L(fourth_vector_boundary) -+ vpcmpeqb (%rdi), %ymm6, %ymm0 -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_first_vector) -+ mov %rdi, %rax -+ and $-VEC_SIZE, %rax -+ jmp L(align_vec_size_start) -+L(fourth_vector_boundary): -+ mov %rdi, %rax -+ and $-VEC_SIZE, %rax -+ vpcmpeqb (%rax), %ymm6, %ymm0 -+ mov $-1, %r10d -+ sub %rax, %rcx -+ shl %cl, %r10d -+ vpmovmskb %ymm0, %edx -+ and %r10d, %edx -+ jnz L(exit) -+ -+L(align_vec_size_start): -+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0 -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_second_vector) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 -+ vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_third_vector) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fourth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 -+ vpmovmskb %ymm3, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fifth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 -+ add $(VEC_SIZE * 4), %rax -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz 
L(exit_null_on_second_vector) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 -+ vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_third_vector) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fourth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 -+ vpmovmskb %ymm3, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fifth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 -+ add $(VEC_SIZE * 4), %rax -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_second_vector) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 -+ vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_third_vector) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fourth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 -+ vpmovmskb %ymm3, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fifth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 -+ add $(VEC_SIZE * 4), %rax -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_second_vector) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 -+ vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_third_vector) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fourth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 -+ vpmovmskb %ymm3, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fifth_vector) -+ -+ test $((VEC_SIZE * 4) - 1), %rax -+ jz L(align_four_vec_loop) -+ -+ vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0 -+ add $(VEC_SIZE * 5), %rax -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit) -+ -+ test $((VEC_SIZE * 4) - 1), %rax -+ jz L(align_four_vec_loop) -+ -+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1 -+ add $VEC_SIZE, %rax -+ vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit) -+ -+ test $((VEC_SIZE * 4) - 1), %rax -+ jz L(align_four_vec_loop) -+ -+ 
vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2 -+ add $VEC_SIZE, %rax -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit) -+ -+ test $((VEC_SIZE * 4) - 1), %rax -+ jz L(align_four_vec_loop) -+ -+ vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3 -+ add $VEC_SIZE, %rax -+ vpmovmskb %ymm3, %edx -+ test %edx, %edx -+ jnz L(exit) -+ -+ add $VEC_SIZE, %rax -+ -+ .p2align 4 -+L(align_four_vec_loop): -+ vmovaps (%rax), %ymm4 -+ vpminub VEC_SIZE(%rax), %ymm4, %ymm4 -+ vmovaps (VEC_SIZE * 2)(%rax), %ymm5 -+ vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5 -+ add $(VEC_SIZE * 4), %rax -+ vpminub %ymm4, %ymm5, %ymm5 -+ vpcmpeqb %ymm5, %ymm6, %ymm5 -+ vpmovmskb %ymm5, %edx -+ test %edx, %edx -+ jz L(align_four_vec_loop) -+ -+ vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0 -+ sub $(VEC_SIZE * 5), %rax -+ vpmovmskb %ymm0, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_second_vector) -+ -+ vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1 -+ vpmovmskb %ymm1, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_third_vector) -+ -+ vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2 -+ vpmovmskb %ymm2, %edx -+ test %edx, %edx -+ jnz L(exit_null_on_fourth_vector) -+ -+ vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3 -+ vpmovmskb %ymm3, %edx -+ sub %rdi, %rax -+ bsf %rdx, %rdx -+ add %rdx, %rax -+ add $(VEC_SIZE * 4), %rax -+ jmp L(StartStrcpyPart) -+ -+ .p2align 4 -+L(exit): -+ sub %rdi, %rax -+L(exit_null_on_first_vector): -+ bsf %rdx, %rdx -+ add %rdx, %rax -+ jmp L(StartStrcpyPart) -+ -+ .p2align 4 -+L(exit_null_on_second_vector): -+ sub %rdi, %rax -+ bsf %rdx, %rdx -+ add %rdx, %rax -+ add $VEC_SIZE, %rax -+ jmp L(StartStrcpyPart) -+ -+ .p2align 4 -+L(exit_null_on_third_vector): -+ sub %rdi, %rax -+ bsf %rdx, %rdx -+ add %rdx, %rax -+ add $(VEC_SIZE * 2), %rax -+ jmp L(StartStrcpyPart) -+ -+ .p2align 4 -+L(exit_null_on_fourth_vector): -+ sub %rdi, %rax -+ bsf %rdx, %rdx -+ add %rdx, %rax -+ add $(VEC_SIZE * 3), %rax -+ jmp L(StartStrcpyPart) -+ -+ .p2align 4 -+L(exit_null_on_fifth_vector): -+ sub %rdi, %rax -+ bsf %rdx, %rdx 
-+ add %rdx, %rax -+ add $(VEC_SIZE * 4), %rax -+ -+ .p2align 4 -+L(StartStrcpyPart): -+ lea (%r9, %rax), %rdi -+ mov %rsi, %rcx -+ mov %r9, %rax /* save result */ -+ -+# ifdef USE_AS_STRNCAT -+ test %r8, %r8 -+ jz L(ExitZero) -+# define USE_AS_STRNCPY -+# endif -+ -+# include "avx2-strcpy-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S -new file mode 100644 -index 000000000..7d8a44c81 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strchr-kbl.S -@@ -0,0 +1,277 @@ -+/* strchr/strchrnul optimized with AVX2. -+ Copyright (C) 2017-2020 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ <https://www.gnu.org/licenses/>.
*/ -+ -+ -+# ifndef STRCHR -+# define STRCHR strchr_avx2 -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+# ifdef USE_AS_WCSCHR -+# define VPBROADCAST vpbroadcastd -+# define VPCMPEQ vpcmpeqd -+# define CHAR_REG esi -+# else -+# define VPBROADCAST vpbroadcastb -+# define VPCMPEQ vpcmpeqb -+# define CHAR_REG sil -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRCHR) -+ movl %edi, %ecx -+ /* Broadcast CHAR to YMM0. */ -+ vmovd %esi, %xmm0 -+ vpxor %xmm9, %xmm9, %xmm9 -+ VPBROADCAST %xmm0, %ymm0 -+ /* Check if we may cross page boundary with one vector load. */ -+ andl $(2 * VEC_SIZE - 1), %ecx -+ cmpl $VEC_SIZE, %ecx -+ ja L(cros_page_boundary) -+ -+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the -+ null byte. */ -+ vmovdqu (%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ /* Align data for aligned loads in the loop. */ -+ addq $VEC_SIZE, %rdi -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ -+ jmp L(more_4x_vec) -+ -+ .p2align 4 -+L(cros_page_boundary): -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ vmovdqu (%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ /* Remove the leading bytes. */ -+ sarl %cl, %eax -+ testl %eax, %eax -+ jz L(aligned_more) -+ /* Found CHAR or the null byte. 
*/ -+ tzcntl %eax, %eax -+ addq %rcx, %rax -+# ifdef USE_AS_STRCHRNUL -+ addq %rdi, %rax -+# else -+ xorl %edx, %edx -+ leaq (%rdi, %rax), %rax -+ cmp (%rax), %CHAR_REG -+ cmovne %rdx, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(aligned_more): -+ addq $VEC_SIZE, %rdi -+ -+L(more_4x_vec): -+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time -+ since data is only aligned to VEC_SIZE. */ -+ vmovdqa (%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ vmovdqa VEC_SIZE(%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ -+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ -+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 -+ VPCMPEQ %ymm8, %ymm0, %ymm1 -+ VPCMPEQ %ymm8, %ymm9, %ymm2 -+ vpor %ymm1, %ymm2, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x3) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+ /* Align data to 4 * VEC_SIZE. */ -+ movq %rdi, %rcx -+ andl $(4 * VEC_SIZE - 1), %ecx -+ andq $-(4 * VEC_SIZE), %rdi -+ -+ .p2align 4 -+L(loop_4x_vec): -+ /* Compare 4 * VEC at a time forward. 
*/ -+ vmovdqa (%rdi), %ymm5 -+ vmovdqa VEC_SIZE(%rdi), %ymm6 -+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 -+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 -+ -+ VPCMPEQ %ymm5, %ymm0, %ymm1 -+ VPCMPEQ %ymm6, %ymm0, %ymm2 -+ VPCMPEQ %ymm7, %ymm0, %ymm3 -+ VPCMPEQ %ymm8, %ymm0, %ymm4 -+ -+ VPCMPEQ %ymm5, %ymm9, %ymm5 -+ VPCMPEQ %ymm6, %ymm9, %ymm6 -+ VPCMPEQ %ymm7, %ymm9, %ymm7 -+ VPCMPEQ %ymm8, %ymm9, %ymm8 -+ -+ vpor %ymm1, %ymm5, %ymm1 -+ vpor %ymm2, %ymm6, %ymm2 -+ vpor %ymm3, %ymm7, %ymm3 -+ vpor %ymm4, %ymm8, %ymm4 -+ -+ vpor %ymm1, %ymm2, %ymm5 -+ vpor %ymm3, %ymm4, %ymm6 -+ -+ vpor %ymm5, %ymm6, %ymm5 -+ -+ vpmovmskb %ymm5, %eax -+ testl %eax, %eax -+ jnz L(4x_vec_end) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+ jmp L(loop_4x_vec) -+ -+ .p2align 4 -+L(first_vec_x0): -+ /* Found CHAR or the null byte. */ -+ tzcntl %eax, %eax -+# ifdef USE_AS_STRCHRNUL -+ addq %rdi, %rax -+# else -+ xorl %edx, %edx -+ leaq (%rdi, %rax), %rax -+ cmp (%rax), %CHAR_REG -+ cmovne %rdx, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x1): -+ tzcntl %eax, %eax -+# ifdef USE_AS_STRCHRNUL -+ addq $VEC_SIZE, %rax -+ addq %rdi, %rax -+# else -+ xorl %edx, %edx -+ leaq VEC_SIZE(%rdi, %rax), %rax -+ cmp (%rax), %CHAR_REG -+ cmovne %rdx, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x2): -+ tzcntl %eax, %eax -+# ifdef USE_AS_STRCHRNUL -+ addq $(VEC_SIZE * 2), %rax -+ addq %rdi, %rax -+# else -+ xorl %edx, %edx -+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax -+ cmp (%rax), %CHAR_REG -+ cmovne %rdx, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(4x_vec_end): -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ vpmovmskb %ymm4, %eax -+ testl %eax, %eax -+L(first_vec_x3): -+ tzcntl %eax, %eax -+# ifdef USE_AS_STRCHRNUL -+ addq $(VEC_SIZE * 3), %rax -+ addq %rdi, %rax -+# else -+ xorl %edx, %edx -+ leaq (VEC_SIZE * 3)(%rdi, 
%rax), %rax -+ cmp (%rax), %CHAR_REG -+ cmovne %rdx, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+END (STRCHR) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S -new file mode 100644 -index 000000000..b241812d8 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strcmp-kbl.S -@@ -0,0 +1,885 @@ -+/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2. -+ Copyright (C) 2018-2020 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+ -+# ifndef STRCMP -+# define STRCMP strcmp_avx2 -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+# define PAGE_SIZE 4096 -+ -+/* VEC_SIZE = Number of bytes in a ymm register */ -+# define VEC_SIZE 32 -+ -+/* Shift for dividing by (VEC_SIZE * 4). 
*/ -+# define DIVIDE_BY_VEC_4_SHIFT 7 -+# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) -+# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) -+# endif -+ -+# ifdef USE_AS_WCSCMP -+/* Compare packed dwords. */ -+# define VPCMPEQ vpcmpeqd -+/* Compare packed dwords and store minimum. */ -+# define VPMINU vpminud -+/* 1 dword char == 4 bytes. */ -+# define SIZE_OF_CHAR 4 -+# else -+/* Compare packed bytes. */ -+# define VPCMPEQ vpcmpeqb -+/* Compare packed bytes and store minimum. */ -+# define VPMINU vpminub -+/* 1 byte char == 1 byte. */ -+# define SIZE_OF_CHAR 1 -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+/* Warning! -+ wcscmp/wcsncmp have to use SIGNED comparison for elements. -+ strcmp/strncmp have to use UNSIGNED comparison for elements. -+*/ -+ -+/* The main idea of the string comparison (byte or dword) using AVX2 -+ consists of comparing (VPCMPEQ) two ymm vectors. The latter can be on -+ either packed bytes or dwords depending on USE_AS_WCSCMP. In order -+ to check the null char, algorithm keeps the matched bytes/dwords, -+ requiring two more AVX2 instructions (VPMINU and VPCMPEQ). In general, -+ the costs of comparing VEC_SIZE bytes (32-bytes) are two VPCMPEQ and -+ one VPMINU instructions, together with movdqu and testl instructions. -+ Main loop (away from from page boundary) compares 4 vectors are a time, -+ effectively comparing 4 x VEC_SIZE bytes (128 bytes) on each loop. -+ -+ The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic -+ is the same as strcmp, except that an a maximum offset is tracked. If -+ the maximum offset is reached before a difference is found, zero is -+ returned. */ -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRCMP) -+# ifdef USE_AS_STRNCMP -+ /* Check for simple cases (0 or 1) in offset. */ -+ cmp $1, %RDX_LP -+ je L(char0) -+ jb L(zero) -+# ifdef USE_AS_WCSCMP -+ /* Convert units: from wide to byte char. 
*/ -+ shl $2, %RDX_LP -+# endif -+ /* Register %r11 tracks the maximum offset. */ -+ mov %RDX_LP, %R11_LP -+# endif -+ movl %edi, %eax -+ xorl %edx, %edx -+ /* Make %xmm7 (%ymm7) all zeros in this function. */ -+ vpxor %xmm7, %xmm7, %xmm7 -+ orl %esi, %eax -+ andl $(PAGE_SIZE - 1), %eax -+ cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax -+ jg L(cross_page) -+ /* Start comparing 4 vectors. */ -+ vmovdqu (%rdi), %ymm1 -+ VPCMPEQ (%rsi), %ymm1, %ymm0 -+ VPMINU %ymm1, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, %ymm0, %ymm0 -+ vpmovmskb %ymm0, %ecx -+ testl %ecx, %ecx -+ je L(next_3_vectors) -+ tzcntl %ecx, %edx -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the mismatched index (%rdx) is after the maximum -+ offset (%r11). */ -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi, %rdx), %ecx -+ cmpl (%rsi, %rdx), %ecx -+ je L(return) -+L(wcscmp_return): -+ setl %al -+ negl %eax -+ orl $1, %eax -+L(return): -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(return_vec_size): -+ tzcntl %ecx, %edx -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after -+ the maximum offset (%r11). */ -+ addq $VEC_SIZE, %rdx -+ cmpq %r11, %rdx -+ jae L(zero) -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi, %rdx), %ecx -+ cmpl (%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl VEC_SIZE(%rdi, %rdx), %ecx -+ cmpl VEC_SIZE(%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl VEC_SIZE(%rdi, %rdx), %eax -+ movzbl VEC_SIZE(%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(return_2_vec_size): -+ tzcntl %ecx, %edx -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is -+ after the maximum offset (%r11). 
*/ -+ addq $(VEC_SIZE * 2), %rdx -+ cmpq %r11, %rdx -+ jae L(zero) -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi, %rdx), %ecx -+ cmpl (%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx -+ cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax -+ movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(return_3_vec_size): -+ tzcntl %ecx, %edx -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is -+ after the maximum offset (%r11). */ -+ addq $(VEC_SIZE * 3), %rdx -+ cmpq %r11, %rdx -+ jae L(zero) -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi, %rdx), %ecx -+ cmpl (%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx -+ cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax -+ movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(next_3_vectors): -+ vmovdqu VEC_SIZE(%rdi), %ymm6 -+ VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 -+ VPMINU %ymm6, %ymm3, %ymm3 -+ VPCMPEQ %ymm7, %ymm3, %ymm3 -+ vpmovmskb %ymm3, %ecx -+ testl %ecx, %ecx -+ jne L(return_vec_size) -+ vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 -+ vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 -+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 -+ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 -+ VPMINU %ymm5, %ymm2, %ymm2 -+ VPCMPEQ %ymm4, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, %ymm2, %ymm2 -+ vpmovmskb %ymm2, %ecx -+ testl %ecx, %ecx -+ jne L(return_2_vec_size) -+ VPMINU %ymm4, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, 
%ymm0, %ymm0 -+ vpmovmskb %ymm0, %ecx -+ testl %ecx, %ecx -+ jne L(return_3_vec_size) -+L(main_loop_header): -+ leaq (VEC_SIZE * 4)(%rdi), %rdx -+ movl $PAGE_SIZE, %ecx -+ /* Align load via RAX. */ -+ andq $-(VEC_SIZE * 4), %rdx -+ subq %rdi, %rdx -+ leaq (%rdi, %rdx), %rax -+# ifdef USE_AS_STRNCMP -+ /* Starting from this point, the maximum offset, or simply the -+ 'offset', DECREASES by the same amount when base pointers are -+ moved forward. Return 0 when: -+ 1) On match: offset <= the matched vector index. -+ 2) On mistmach, offset is before the mistmatched index. -+ */ -+ subq %rdx, %r11 -+ jbe L(zero) -+# endif -+ addq %rsi, %rdx -+ movq %rdx, %rsi -+ andl $(PAGE_SIZE - 1), %esi -+ /* Number of bytes before page crossing. */ -+ subq %rsi, %rcx -+ /* Number of VEC_SIZE * 4 blocks before page crossing. */ -+ shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx -+ /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ -+ movl %ecx, %esi -+ jmp L(loop_start) -+ -+ .p2align 4 -+L(loop): -+# ifdef USE_AS_STRNCMP -+ /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease -+ the maximum offset (%r11) by the same amount. */ -+ subq $(VEC_SIZE * 4), %r11 -+ jbe L(zero) -+# endif -+ addq $(VEC_SIZE * 4), %rax -+ addq $(VEC_SIZE * 4), %rdx -+L(loop_start): -+ testl %esi, %esi -+ leal -1(%esi), %esi -+ je L(loop_cross_page) -+L(back_to_loop): -+ /* Main loop, comparing 4 vectors are a time. 
*/ -+ vmovdqa (%rax), %ymm0 -+ vmovdqa VEC_SIZE(%rax), %ymm3 -+ VPCMPEQ (%rdx), %ymm0, %ymm4 -+ VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 -+ VPMINU %ymm0, %ymm4, %ymm4 -+ VPMINU %ymm3, %ymm1, %ymm1 -+ vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 -+ VPMINU %ymm1, %ymm4, %ymm0 -+ vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 -+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 -+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 -+ VPMINU %ymm2, %ymm5, %ymm5 -+ VPMINU %ymm3, %ymm6, %ymm6 -+ VPMINU %ymm5, %ymm0, %ymm0 -+ VPMINU %ymm6, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, %ymm0, %ymm0 -+ -+ /* Test each mask (32 bits) individually because for VEC_SIZE -+ == 32 is not possible to OR the four masks and keep all bits -+ in a 64-bit integer register, differing from SSE2 strcmp -+ where ORing is possible. */ -+ vpmovmskb %ymm0, %ecx -+ testl %ecx, %ecx -+ je L(loop) -+ VPCMPEQ %ymm7, %ymm4, %ymm0 -+ vpmovmskb %ymm0, %edi -+ testl %edi, %edi -+ je L(test_vec) -+ tzcntl %edi, %ecx -+# ifdef USE_AS_STRNCMP -+ cmpq %rcx, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %edi -+ cmpl (%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %edi -+ cmpl (%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(test_vec): -+# ifdef USE_AS_STRNCMP -+ /* The first vector matched. Return 0 if the maximum offset -+ (%r11) <= VEC_SIZE. 
*/ -+ cmpq $VEC_SIZE, %r11 -+ jbe L(zero) -+# endif -+ VPCMPEQ %ymm7, %ymm1, %ymm1 -+ vpmovmskb %ymm1, %ecx -+ testl %ecx, %ecx -+ je L(test_2_vec) -+ tzcntl %ecx, %edi -+# ifdef USE_AS_STRNCMP -+ addq $VEC_SIZE, %rdi -+ cmpq %rdi, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rdi), %ecx -+ cmpl (%rdx, %rdi), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rdi), %eax -+ movzbl (%rdx, %rdi), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl VEC_SIZE(%rsi, %rdi), %ecx -+ cmpl VEC_SIZE(%rdx, %rdi), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl VEC_SIZE(%rax, %rdi), %eax -+ movzbl VEC_SIZE(%rdx, %rdi), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(test_2_vec): -+# ifdef USE_AS_STRNCMP -+ /* The first 2 vectors matched. Return 0 if the maximum offset -+ (%r11) <= 2 * VEC_SIZE. */ -+ cmpq $(VEC_SIZE * 2), %r11 -+ jbe L(zero) -+# endif -+ VPCMPEQ %ymm7, %ymm5, %ymm5 -+ vpmovmskb %ymm5, %ecx -+ testl %ecx, %ecx -+ je L(test_3_vec) -+ tzcntl %ecx, %edi -+# ifdef USE_AS_STRNCMP -+ addq $(VEC_SIZE * 2), %rdi -+ cmpq %rdi, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rdi), %ecx -+ cmpl (%rdx, %rdi), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rdi), %eax -+ movzbl (%rdx, %rdi), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx -+ cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax -+ movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(test_3_vec): -+# ifdef USE_AS_STRNCMP -+ /* The first 3 vectors matched. Return 0 if the maximum offset -+ (%r11) <= 3 * VEC_SIZE. 
*/ -+ cmpq $(VEC_SIZE * 3), %r11 -+ jbe L(zero) -+# endif -+ VPCMPEQ %ymm7, %ymm6, %ymm6 -+ vpmovmskb %ymm6, %esi -+ tzcntl %esi, %ecx -+# ifdef USE_AS_STRNCMP -+ addq $(VEC_SIZE * 3), %rcx -+ cmpq %rcx, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %esi -+ cmpl (%rdx, %rcx), %esi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (VEC_SIZE * 3)(%rsi, %rcx), %esi -+ cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi -+ jne L(wcscmp_return) -+# else -+ movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax -+ movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(loop_cross_page): -+ xorl %r10d, %r10d -+ movq %rdx, %rcx -+ /* Align load via RDX. We load the extra ECX bytes which should -+ be ignored. */ -+ andl $((VEC_SIZE * 4) - 1), %ecx -+ /* R10 is -RCX. */ -+ subq %rcx, %r10 -+ -+ /* This works only if VEC_SIZE * 2 == 64. */ -+# if (VEC_SIZE * 2) != 64 -+# error (VEC_SIZE * 2) != 64 -+# endif -+ -+ /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ -+ cmpl $(VEC_SIZE * 2), %ecx -+ jge L(loop_cross_page_2_vec) -+ -+ vmovdqu (%rax, %r10), %ymm2 -+ vmovdqu VEC_SIZE(%rax, %r10), %ymm3 -+ VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 -+ VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 -+ VPMINU %ymm2, %ymm0, %ymm0 -+ VPMINU %ymm3, %ymm1, %ymm1 -+ VPCMPEQ %ymm7, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, %ymm1, %ymm1 -+ -+ vpmovmskb %ymm0, %edi -+ vpmovmskb %ymm1, %esi -+ -+ salq $32, %rsi -+ xorq %rsi, %rdi -+ -+ /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. 
*/ -+ shrq %cl, %rdi -+ -+ testq %rdi, %rdi -+ je L(loop_cross_page_2_vec) -+ tzcntq %rdi, %rcx -+# ifdef USE_AS_STRNCMP -+ cmpq %rcx, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %edi -+ cmpl (%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %edi -+ cmpl (%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(loop_cross_page_2_vec): -+ /* The first VEC_SIZE * 2 bytes match or are ignored. */ -+ vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 -+ vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 -+ VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 -+ VPMINU %ymm2, %ymm5, %ymm5 -+ VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 -+ VPCMPEQ %ymm7, %ymm5, %ymm5 -+ VPMINU %ymm3, %ymm6, %ymm6 -+ VPCMPEQ %ymm7, %ymm6, %ymm6 -+ -+ vpmovmskb %ymm5, %edi -+ vpmovmskb %ymm6, %esi -+ -+ salq $32, %rsi -+ xorq %rsi, %rdi -+ -+ xorl %r8d, %r8d -+ /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ -+ subl $(VEC_SIZE * 2), %ecx -+ jle 1f -+ /* Skip ECX bytes. */ -+ shrq %cl, %rdi -+ /* R8 has number of bytes skipped. */ -+ movl %ecx, %r8d -+1: -+ /* Before jumping back to the loop, set ESI to the number of -+ VEC_SIZE * 4 blocks before page crossing. */ -+ movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi -+ -+ testq %rdi, %rdi -+# ifdef USE_AS_STRNCMP -+ /* At this point, if %rdi value is 0, it already tested -+ VEC_SIZE*4+%r10 byte starting from %rax. This label -+ checks whether strncmp maximum offset reached or not. */ -+ je L(string_nbyte_offset_check) -+# else -+ je L(back_to_loop) -+# endif -+ tzcntq %rdi, %rcx -+ addq %r10, %rcx -+ /* Adjust for number of bytes skipped. 
*/ -+ addq %r8, %rcx -+# ifdef USE_AS_STRNCMP -+ addq $(VEC_SIZE * 2), %rcx -+ subq %rcx, %r11 -+ jbe L(zero) -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (%rsi, %rcx), %edi -+ cmpl (%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (%rax, %rcx), %eax -+ movzbl (%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# else -+# ifdef USE_AS_WCSCMP -+ movq %rax, %rsi -+ xorl %eax, %eax -+ movl (VEC_SIZE * 2)(%rsi, %rcx), %edi -+ cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi -+ jne L(wcscmp_return) -+# else -+ movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax -+ movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx -+ subl %edx, %eax -+# endif -+# endif -+ VZEROUPPER -+ ret -+ -+# ifdef USE_AS_STRNCMP -+L(string_nbyte_offset_check): -+ leaq (VEC_SIZE * 4)(%r10), %r10 -+ cmpq %r10, %r11 -+ jbe L(zero) -+ jmp L(back_to_loop) -+# endif -+ -+ .p2align 4 -+L(cross_page_loop): -+ /* Check one byte/dword at a time. */ -+# ifdef USE_AS_WCSCMP -+ cmpl %ecx, %eax -+# else -+ subl %ecx, %eax -+# endif -+ jne L(different) -+ addl $SIZE_OF_CHAR, %edx -+ cmpl $(VEC_SIZE * 4), %edx -+ je L(main_loop_header) -+# ifdef USE_AS_STRNCMP -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+# ifdef USE_AS_WCSCMP -+ movl (%rdi, %rdx), %eax -+ movl (%rsi, %rdx), %ecx -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %ecx -+# endif -+ /* Check null char. */ -+ testl %eax, %eax -+ jne L(cross_page_loop) -+ /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED -+ comparisons. */ -+ subl %ecx, %eax -+# ifndef USE_AS_WCSCMP -+L(different): -+# endif -+ VZEROUPPER -+ ret -+ -+# ifdef USE_AS_WCSCMP -+ .p2align 4 -+L(different): -+ /* Use movl to avoid modifying EFLAGS. 
*/ -+ movl $0, %eax -+ setl %al -+ negl %eax -+ orl $1, %eax -+ VZEROUPPER -+ ret -+# endif -+ -+# ifdef USE_AS_STRNCMP -+ .p2align 4 -+L(zero): -+ xorl %eax, %eax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(char0): -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi), %ecx -+ cmpl (%rsi), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rsi), %ecx -+ movzbl (%rdi), %eax -+ subl %ecx, %eax -+# endif -+ VZEROUPPER -+ ret -+# endif -+ -+ .p2align 4 -+L(last_vector): -+ addq %rdx, %rdi -+ addq %rdx, %rsi -+# ifdef USE_AS_STRNCMP -+ subq %rdx, %r11 -+# endif -+ tzcntl %ecx, %edx -+# ifdef USE_AS_STRNCMP -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+# ifdef USE_AS_WCSCMP -+ xorl %eax, %eax -+ movl (%rdi, %rdx), %ecx -+ cmpl (%rsi, %rdx), %ecx -+ jne L(wcscmp_return) -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %edx -+ subl %edx, %eax -+# endif -+ VZEROUPPER -+ ret -+ -+ /* Comparing on page boundary region requires special treatment: -+ It must done one vector at the time, starting with the wider -+ ymm vector if possible, if not, with xmm. If fetching 16 bytes -+ (xmm) still passes the boundary, byte comparison must be done. -+ */ -+ .p2align 4 -+L(cross_page): -+ /* Try one ymm vector at a time. */ -+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax -+ jg L(cross_page_1_vector) -+L(loop_1_vector): -+ vmovdqu (%rdi, %rdx), %ymm1 -+ VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 -+ VPMINU %ymm1, %ymm0, %ymm0 -+ VPCMPEQ %ymm7, %ymm0, %ymm0 -+ vpmovmskb %ymm0, %ecx -+ testl %ecx, %ecx -+ jne L(last_vector) -+ -+ addl $VEC_SIZE, %edx -+ -+ addl $VEC_SIZE, %eax -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the current offset (%rdx) >= the maximum offset -+ (%r11). */ -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax -+ jle L(loop_1_vector) -+L(cross_page_1_vector): -+ /* Less than 32 bytes to check, try one xmm vector. 
*/ -+ cmpl $(PAGE_SIZE - 16), %eax -+ jg L(cross_page_1_xmm) -+ vmovdqu (%rdi, %rdx), %xmm1 -+ VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 -+ VPMINU %xmm1, %xmm0, %xmm0 -+ VPCMPEQ %xmm7, %xmm0, %xmm0 -+ vpmovmskb %xmm0, %ecx -+ testl %ecx, %ecx -+ jne L(last_vector) -+ -+ addl $16, %edx -+# ifndef USE_AS_WCSCMP -+ addl $16, %eax -+# endif -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the current offset (%rdx) >= the maximum offset -+ (%r11). */ -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+ -+L(cross_page_1_xmm): -+# ifndef USE_AS_WCSCMP -+ /* Less than 16 bytes to check, try 8 byte vector. NB: No need -+ for wcscmp nor wcsncmp since wide char is 4 bytes. */ -+ cmpl $(PAGE_SIZE - 8), %eax -+ jg L(cross_page_8bytes) -+ vmovq (%rdi, %rdx), %xmm1 -+ vmovq (%rsi, %rdx), %xmm0 -+ VPCMPEQ %xmm0, %xmm1, %xmm0 -+ VPMINU %xmm1, %xmm0, %xmm0 -+ VPCMPEQ %xmm7, %xmm0, %xmm0 -+ vpmovmskb %xmm0, %ecx -+ /* Only last 8 bits are valid. */ -+ andl $0xff, %ecx -+ testl %ecx, %ecx -+ jne L(last_vector) -+ -+ addl $8, %edx -+ addl $8, %eax -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the current offset (%rdx) >= the maximum offset -+ (%r11). */ -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+ -+L(cross_page_8bytes): -+ /* Less than 8 bytes to check, try 4 byte vector. */ -+ cmpl $(PAGE_SIZE - 4), %eax -+ jg L(cross_page_4bytes) -+ vmovd (%rdi, %rdx), %xmm1 -+ vmovd (%rsi, %rdx), %xmm0 -+ VPCMPEQ %xmm0, %xmm1, %xmm0 -+ VPMINU %xmm1, %xmm0, %xmm0 -+ VPCMPEQ %xmm7, %xmm0, %xmm0 -+ vpmovmskb %xmm0, %ecx -+ /* Only last 4 bits are valid. */ -+ andl $0xf, %ecx -+ testl %ecx, %ecx -+ jne L(last_vector) -+ -+ addl $4, %edx -+# ifdef USE_AS_STRNCMP -+ /* Return 0 if the current offset (%rdx) >= the maximum offset -+ (%r11). */ -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+ -+L(cross_page_4bytes): -+# endif -+ /* Less than 4 bytes to check, try one byte/dword at a time. 
*/ -+# ifdef USE_AS_STRNCMP -+ cmpq %r11, %rdx -+ jae L(zero) -+# endif -+# ifdef USE_AS_WCSCMP -+ movl (%rdi, %rdx), %eax -+ movl (%rsi, %rdx), %ecx -+# else -+ movzbl (%rdi, %rdx), %eax -+ movzbl (%rsi, %rdx), %ecx -+# endif -+ testl %eax, %eax -+ jne L(cross_page_loop) -+ subl %ecx, %eax -+ VZEROUPPER -+ ret -+END (STRCMP) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S -new file mode 100644 -index 000000000..809a9ac00 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strcpy-kbl.S -@@ -0,0 +1,1046 @@ -+/* strcpy with AVX2 -+ Copyright (C) 2011-2020 Free Software Foundation, Inc. -+ Contributed by Intel Corporation. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . 
*/ -+ -+ -+# ifndef USE_AS_STRCAT -+ -+# ifndef STRCPY -+# define STRCPY strcpy_avx2 -+# endif -+ -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+/* Number of bytes in a vector register */ -+# ifndef VEC_SIZE -+# define VEC_SIZE 32 -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+/* zero register */ -+#define xmmZ xmm0 -+#define ymmZ ymm0 -+ -+/* mask register */ -+#define ymmM ymm1 -+ -+# ifndef USE_AS_STRCAT -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRCPY) -+# ifdef USE_AS_STRNCPY -+ mov %RDX_LP, %R8_LP -+ test %R8_LP, %R8_LP -+ jz L(ExitZero) -+# endif -+ mov %rsi, %rcx -+# ifndef USE_AS_STPCPY -+ mov %rdi, %rax /* save result */ -+# endif -+ -+# endif -+ -+ vpxor %xmmZ, %xmmZ, %xmmZ -+ -+ and $((VEC_SIZE * 4) - 1), %ecx -+ cmp $(VEC_SIZE * 2), %ecx -+ jbe L(SourceStringAlignmentLessTwoVecSize) -+ -+ and $-VEC_SIZE, %rsi -+ and $(VEC_SIZE - 1), %ecx -+ -+ vpcmpeqb (%rsi), %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ shr %cl, %rdx -+ -+# ifdef USE_AS_STRNCPY -+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT -+ mov $VEC_SIZE, %r10 -+ sub %rcx, %r10 -+ cmp %r10, %r8 -+# else -+ mov $(VEC_SIZE + 1), %r10 -+ sub %rcx, %r10 -+ cmp %r10, %r8 -+# endif -+ jbe L(CopyVecSizeTailCase2OrCase3) -+# endif -+ test %edx, %edx -+ jnz L(CopyVecSizeTail) -+ -+ vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2 -+ vpmovmskb %ymm2, %edx -+ -+# ifdef USE_AS_STRNCPY -+ add $VEC_SIZE, %r10 -+ cmp %r10, %r8 -+ jbe L(CopyTwoVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+ jnz L(CopyTwoVecSize) -+ -+ vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */ 
-+ vmovdqu %ymm2, (%rdi) -+ -+/* If source address alignment != destination address alignment */ -+ .p2align 4 -+L(UnalignVecSizeBoth): -+ sub %rcx, %rdi -+# ifdef USE_AS_STRNCPY -+ add %rcx, %r8 -+ sbb %rcx, %rcx -+ or %rcx, %r8 -+# endif -+ mov $VEC_SIZE, %rcx -+ vmovdqa (%rsi, %rcx), %ymm2 -+ vmovdqu %ymm2, (%rdi, %rcx) -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 -+ vpcmpeqb %ymm2, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $(VEC_SIZE * 3), %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec2) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqu %ymm2, (%rdi, %rcx) -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 -+ vpcmpeqb %ymm3, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec3) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqu %ymm3, (%rdi, %rcx) -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4 -+ vpcmpeqb %ymm4, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec4) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqu %ymm4, (%rdi, %rcx) -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 -+ vpcmpeqb %ymm2, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec2) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqu %ymm2, (%rdi, %rcx) -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2 -+ vpcmpeqb %ymm2, %ymmZ, %ymmM -+ vpmovmskb 
%ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec2) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3 -+ vmovdqu %ymm2, (%rdi, %rcx) -+ vpcmpeqb %ymm3, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add $VEC_SIZE, %rcx -+# ifdef USE_AS_STRNCPY -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+# endif -+ test %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec3) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vmovdqu %ymm3, (%rdi, %rcx) -+ mov %rsi, %rdx -+ lea VEC_SIZE(%rsi, %rcx), %rsi -+ and $-(VEC_SIZE * 4), %rsi -+ sub %rsi, %rdx -+ sub %rdx, %rdi -+# ifdef USE_AS_STRNCPY -+ lea (VEC_SIZE * 8)(%r8, %rdx), %r8 -+# endif -+L(UnalignedFourVecSizeLoop): -+ vmovdqa (%rsi), %ymm4 -+ vmovdqa VEC_SIZE(%rsi), %ymm5 -+ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 -+ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 -+ vpminub %ymm5, %ymm4, %ymm2 -+ vpminub %ymm7, %ymm6, %ymm3 -+ vpminub %ymm2, %ymm3, %ymm3 -+ vpcmpeqb %ymmM, %ymm3, %ymm3 -+ vpmovmskb %ymm3, %edx -+# ifdef USE_AS_STRNCPY -+ sub $(VEC_SIZE * 4), %r8 -+ jbe L(UnalignedLeaveCase2OrCase3) -+# endif -+ test %edx, %edx -+ jnz L(UnalignedFourVecSizeLeave) -+ -+L(UnalignedFourVecSizeLoop_start): -+ add $(VEC_SIZE * 4), %rdi -+ add $(VEC_SIZE * 4), %rsi -+ vmovdqu %ymm4, -(VEC_SIZE * 4)(%rdi) -+ vmovdqa (%rsi), %ymm4 -+ vmovdqu %ymm5, -(VEC_SIZE * 3)(%rdi) -+ vmovdqa VEC_SIZE(%rsi), %ymm5 -+ vpminub %ymm5, %ymm4, %ymm2 -+ vmovdqu %ymm6, -(VEC_SIZE * 2)(%rdi) -+ vmovdqa (VEC_SIZE * 2)(%rsi), %ymm6 -+ vmovdqu %ymm7, -VEC_SIZE(%rdi) -+ vmovdqa (VEC_SIZE * 3)(%rsi), %ymm7 -+ vpminub %ymm7, %ymm6, %ymm3 -+ vpminub %ymm2, %ymm3, %ymm3 -+ vpcmpeqb %ymmM, %ymm3, %ymm3 -+ vpmovmskb %ymm3, %edx -+# ifdef USE_AS_STRNCPY -+ sub $(VEC_SIZE * 4), %r8 -+ jbe L(UnalignedLeaveCase2OrCase3) 
-+# endif -+ test %edx, %edx -+ jz L(UnalignedFourVecSizeLoop_start) -+ -+L(UnalignedFourVecSizeLeave): -+ vpcmpeqb %ymm4, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ test %edx, %edx -+ jnz L(CopyVecSizeUnaligned_0) -+ -+ vpcmpeqb %ymm5, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %ecx -+ test %ecx, %ecx -+ jnz L(CopyVecSizeUnaligned_16) -+ -+ vpcmpeqb %ymm6, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ test %edx, %edx -+ jnz L(CopyVecSizeUnaligned_32) -+ -+ vpcmpeqb %ymm7, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %ecx -+ bsf %ecx, %edx -+ vmovdqu %ymm4, (%rdi) -+ vmovdqu %ymm5, VEC_SIZE(%rdi) -+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+# ifdef USE_AS_STPCPY -+ lea (VEC_SIZE * 3)(%rdi, %rdx), %rax -+# endif -+ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) -+ add $(VEC_SIZE - 1), %r8 -+ sub %rdx, %r8 -+ lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi -+ jmp L(StrncpyFillTailWithZero) -+# else -+ add $(VEC_SIZE * 3), %rsi -+ add $(VEC_SIZE * 3), %rdi -+ jmp L(CopyVecSizeExit) -+# endif -+ -+/* If source address alignment == destination address alignment */ -+ -+L(SourceStringAlignmentLessTwoVecSize): -+ vmovdqu (%rsi), %ymm3 -+ vmovdqu VEC_SIZE(%rsi), %ymm2 -+ vpcmpeqb %ymm3, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ -+# ifdef USE_AS_STRNCPY -+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT -+ cmp $VEC_SIZE, %r8 -+# else -+ cmp $(VEC_SIZE + 1), %r8 -+# endif -+ jbe L(CopyVecSizeTail1Case2OrCase3) -+# endif -+ test %edx, %edx -+ jnz L(CopyVecSizeTail1) -+ -+ vmovdqu %ymm3, (%rdi) -+ vpcmpeqb %ymm2, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ -+# ifdef USE_AS_STRNCPY -+# if defined USE_AS_STPCPY || defined USE_AS_STRCAT -+ cmp $(VEC_SIZE * 2), %r8 -+# else -+ cmp $((VEC_SIZE * 2) + 1), %r8 -+# endif -+ jbe L(CopyTwoVecSize1Case2OrCase3) -+# endif -+ test %edx, %edx -+ jnz L(CopyTwoVecSize1) -+ -+ and $-VEC_SIZE, %rsi -+ and $(VEC_SIZE - 1), %ecx -+ jmp L(UnalignVecSizeBoth) -+ -+/*------End of main part with loops---------------------*/ -+ -+/* Case1 */ -+ 
-+# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) -+ .p2align 4 -+L(CopyVecSize): -+ add %rcx, %rdi -+# endif -+L(CopyVecSizeTail): -+ add %rcx, %rsi -+L(CopyVecSizeTail1): -+ bsf %edx, %edx -+L(CopyVecSizeExit): -+ cmp $32, %edx -+ jae L(Exit32_63) -+ cmp $16, %edx -+ jae L(Exit16_31) -+ cmp $8, %edx -+ jae L(Exit8_15) -+ cmp $4, %edx -+ jae L(Exit4_7) -+ cmp $3, %edx -+ je L(Exit3) -+ cmp $1, %edx -+ ja L(Exit2) -+ je L(Exit1) -+ movb $0, (%rdi) -+# ifdef USE_AS_STPCPY -+ lea (%rdi), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub $1, %r8 -+ lea 1(%rdi), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(CopyTwoVecSize1): -+ add $VEC_SIZE, %rsi -+ add $VEC_SIZE, %rdi -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub $VEC_SIZE, %r8 -+# endif -+ jmp L(CopyVecSizeTail1) -+ -+ .p2align 4 -+L(CopyTwoVecSize): -+ bsf %edx, %edx -+ add %rcx, %rsi -+ add $VEC_SIZE, %edx -+ sub %ecx, %edx -+ jmp L(CopyVecSizeExit) -+ -+ .p2align 4 -+L(CopyVecSizeUnaligned_0): -+ bsf %edx, %edx -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), %rax -+# endif -+ vmovdqu %ymm4, (%rdi) -+ add $((VEC_SIZE * 4) - 1), %r8 -+ sub %rdx, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ jmp L(StrncpyFillTailWithZero) -+# else -+ jmp L(CopyVecSizeExit) -+# endif -+ -+ .p2align 4 -+L(CopyVecSizeUnaligned_16): -+ bsf %ecx, %edx -+ vmovdqu %ymm4, (%rdi) -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+# ifdef USE_AS_STPCPY -+ lea VEC_SIZE(%rdi, %rdx), %rax -+# endif -+ vmovdqu %ymm5, VEC_SIZE(%rdi) -+ add $((VEC_SIZE * 3) - 1), %r8 -+ sub %rdx, %r8 -+ lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi -+ jmp L(StrncpyFillTailWithZero) -+# else -+ add $VEC_SIZE, %rsi -+ add $VEC_SIZE, %rdi -+ jmp L(CopyVecSizeExit) -+# endif -+ -+ .p2align 4 -+L(CopyVecSizeUnaligned_32): -+ bsf %edx, %edx -+ vmovdqu %ymm4, (%rdi) -+ vmovdqu %ymm5, VEC_SIZE(%rdi) -+# if defined USE_AS_STRNCPY && 
!defined USE_AS_STRCAT -+# ifdef USE_AS_STPCPY -+ lea (VEC_SIZE * 2)(%rdi, %rdx), %rax -+# endif -+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) -+ add $((VEC_SIZE * 2) - 1), %r8 -+ sub %rdx, %r8 -+ lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi -+ jmp L(StrncpyFillTailWithZero) -+# else -+ add $(VEC_SIZE * 2), %rsi -+ add $(VEC_SIZE * 2), %rdi -+ jmp L(CopyVecSizeExit) -+# endif -+ -+# ifdef USE_AS_STRNCPY -+# ifndef USE_AS_STRCAT -+ .p2align 4 -+L(CopyVecSizeUnalignedVec6): -+ vmovdqu %ymm6, (%rdi, %rcx) -+ jmp L(CopyVecSizeVecExit) -+ -+ .p2align 4 -+L(CopyVecSizeUnalignedVec5): -+ vmovdqu %ymm5, (%rdi, %rcx) -+ jmp L(CopyVecSizeVecExit) -+ -+ .p2align 4 -+L(CopyVecSizeUnalignedVec4): -+ vmovdqu %ymm4, (%rdi, %rcx) -+ jmp L(CopyVecSizeVecExit) -+ -+ .p2align 4 -+L(CopyVecSizeUnalignedVec3): -+ vmovdqu %ymm3, (%rdi, %rcx) -+ jmp L(CopyVecSizeVecExit) -+# endif -+ -+/* Case2 */ -+ -+ .p2align 4 -+L(CopyVecSizeCase2): -+ add $VEC_SIZE, %r8 -+ add %rcx, %rdi -+ add %rcx, %rsi -+ bsf %edx, %edx -+ cmp %r8d, %edx -+ jb L(CopyVecSizeExit) -+ jmp L(StrncpyExit) -+ -+ .p2align 4 -+L(CopyTwoVecSizeCase2): -+ add %rcx, %rsi -+ bsf %edx, %edx -+ add $VEC_SIZE, %edx -+ sub %ecx, %edx -+ cmp %r8d, %edx -+ jb L(CopyVecSizeExit) -+ jmp L(StrncpyExit) -+ -+L(CopyVecSizeTailCase2): -+ add %rcx, %rsi -+ bsf %edx, %edx -+ cmp %r8d, %edx -+ jb L(CopyVecSizeExit) -+ jmp L(StrncpyExit) -+ -+L(CopyVecSizeTail1Case2): -+ bsf %edx, %edx -+ cmp %r8d, %edx -+ jb L(CopyVecSizeExit) -+ jmp L(StrncpyExit) -+ -+/* Case2 or Case3, Case3 */ -+ -+ .p2align 4 -+L(CopyVecSizeCase2OrCase3): -+ test %rdx, %rdx -+ jnz L(CopyVecSizeCase2) -+L(CopyVecSizeCase3): -+ add $VEC_SIZE, %r8 -+ add %rcx, %rdi -+ add %rcx, %rsi -+ jmp L(StrncpyExit) -+ -+ .p2align 4 -+L(CopyTwoVecSizeCase2OrCase3): -+ test %rdx, %rdx -+ jnz L(CopyTwoVecSizeCase2) -+ add %rcx, %rsi -+ jmp L(StrncpyExit) -+ -+ .p2align 4 -+L(CopyVecSizeTailCase2OrCase3): -+ test %rdx, %rdx -+ jnz L(CopyVecSizeTailCase2) -+ add %rcx, %rsi -+ jmp L(StrncpyExit) 
-+ -+ .p2align 4 -+L(CopyTwoVecSize1Case2OrCase3): -+ add $VEC_SIZE, %rdi -+ add $VEC_SIZE, %rsi -+ sub $VEC_SIZE, %r8 -+L(CopyVecSizeTail1Case2OrCase3): -+ test %rdx, %rdx -+ jnz L(CopyVecSizeTail1Case2) -+ jmp L(StrncpyExit) -+# endif -+ -+/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ -+ -+ .p2align 4 -+L(Exit1): -+ movzwl (%rsi), %edx -+ mov %dx, (%rdi) -+# ifdef USE_AS_STPCPY -+ lea 1(%rdi), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub $2, %r8 -+ lea 2(%rdi), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit2): -+ movzwl (%rsi), %ecx -+ mov %cx, (%rdi) -+ movb $0, 2(%rdi) -+# ifdef USE_AS_STPCPY -+ lea 2(%rdi), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub $3, %r8 -+ lea 3(%rdi), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit3): -+ mov (%rsi), %edx -+ mov %edx, (%rdi) -+# ifdef USE_AS_STPCPY -+ lea 3(%rdi), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub $4, %r8 -+ lea 4(%rdi), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit4_7): -+ mov (%rsi), %ecx -+ mov %ecx, (%rdi) -+ mov -3(%rsi, %rdx), %ecx -+ mov %ecx, -3(%rdi, %rdx) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub %rdx, %r8 -+ sub $1, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit8_15): -+ mov (%rsi), %rcx -+ mov -7(%rsi, %rdx), %r9 -+ mov %rcx, (%rdi) -+ mov %r9, -7(%rdi, %rdx) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub %rdx, %r8 -+ sub $1, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit16_31): -+ vmovdqu (%rsi), %xmm2 -+ 
vmovdqu -15(%rsi, %rdx), %xmm3 -+ vmovdqu %xmm2, (%rdi) -+ vmovdqu %xmm3, -15(%rdi, %rdx) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub %rdx, %r8 -+ sub $1, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Exit32_63): -+ vmovdqu (%rsi), %ymm2 -+ vmovdqu -31(%rsi, %rdx), %ymm3 -+ vmovdqu %ymm2, (%rdi) -+ vmovdqu %ymm3, -31(%rdi, %rdx) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), %rax -+# endif -+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT -+ sub %rdx, %r8 -+ sub $1, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ jnz L(StrncpyFillTailWithZero) -+# endif -+ VZEROUPPER -+ ret -+ -+# ifdef USE_AS_STRNCPY -+ -+ .p2align 4 -+L(StrncpyExit1): -+ movzbl (%rsi), %edx -+ mov %dl, (%rdi) -+# ifdef USE_AS_STPCPY -+ lea 1(%rdi), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, 1(%rdi) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit2): -+ movzwl (%rsi), %edx -+ mov %dx, (%rdi) -+# ifdef USE_AS_STPCPY -+ lea 2(%rdi), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, 2(%rdi) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit3_4): -+ movzwl (%rsi), %ecx -+ movzwl -2(%rsi, %r8), %edx -+ mov %cx, (%rdi) -+ mov %dx, -2(%rdi, %r8) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %r8), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi, %r8) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit5_8): -+ mov (%rsi), %ecx -+ mov -4(%rsi, %r8), %edx -+ mov %ecx, (%rdi) -+ mov %edx, -4(%rdi, %r8) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %r8), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi, %r8) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit9_16): -+ mov (%rsi), %rcx -+ mov -8(%rsi, %r8), %rdx -+ mov %rcx, (%rdi) -+ mov %rdx, -8(%rdi, %r8) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %r8), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi, %r8) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 
-+L(StrncpyExit17_32): -+ vmovdqu (%rsi), %xmm2 -+ vmovdqu -16(%rsi, %r8), %xmm3 -+ vmovdqu %xmm2, (%rdi) -+ vmovdqu %xmm3, -16(%rdi, %r8) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %r8), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi, %r8) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit33_64): -+ /* 0/32, 31/16 */ -+ vmovdqu (%rsi), %ymm2 -+ vmovdqu -VEC_SIZE(%rsi, %r8), %ymm3 -+ vmovdqu %ymm2, (%rdi) -+ vmovdqu %ymm3, -VEC_SIZE(%rdi, %r8) -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %r8), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi, %r8) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(StrncpyExit65): -+ /* 0/32, 32/32, 64/1 */ -+ vmovdqu (%rsi), %ymm2 -+ vmovdqu 32(%rsi), %ymm3 -+ mov 64(%rsi), %cl -+ vmovdqu %ymm2, (%rdi) -+ vmovdqu %ymm3, 32(%rdi) -+ mov %cl, 64(%rdi) -+# ifdef USE_AS_STPCPY -+ lea 65(%rdi), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, 65(%rdi) -+# endif -+ VZEROUPPER -+ ret -+ -+# ifndef USE_AS_STRCAT -+ -+ .p2align 4 -+L(Fill1): -+ mov %dl, (%rdi) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Fill2): -+ mov %dx, (%rdi) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Fill3_4): -+ mov %dx, (%rdi) -+ mov %dx, -2(%rdi, %r8) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Fill5_8): -+ mov %edx, (%rdi) -+ mov %edx, -4(%rdi, %r8) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Fill9_16): -+ mov %rdx, (%rdi) -+ mov %rdx, -8(%rdi, %r8) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(Fill17_32): -+ vmovdqu %xmmZ, (%rdi) -+ vmovdqu %xmmZ, -16(%rdi, %r8) -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(CopyVecSizeUnalignedVec2): -+ vmovdqu %ymm2, (%rdi, %rcx) -+ -+ .p2align 4 -+L(CopyVecSizeVecExit): -+ bsf %edx, %edx -+ add $(VEC_SIZE - 1), %r8 -+ add %rcx, %rdi -+# ifdef USE_AS_STPCPY -+ lea (%rdi, %rdx), %rax -+# endif -+ sub %rdx, %r8 -+ lea 1(%rdi, %rdx), %rdi -+ -+ .p2align 4 -+L(StrncpyFillTailWithZero): -+ xor %edx, %edx -+ sub $VEC_SIZE, %r8 -+ jbe L(StrncpyFillExit) -+ -+ vmovdqu %ymmZ, (%rdi) -+ add $VEC_SIZE, %rdi -+ -+ mov %rdi, %rsi -+ 
and $(VEC_SIZE - 1), %esi -+ sub %rsi, %rdi -+ add %rsi, %r8 -+ sub $(VEC_SIZE * 4), %r8 -+ jb L(StrncpyFillLessFourVecSize) -+ -+L(StrncpyFillLoopVmovdqa): -+ vmovdqa %ymmZ, (%rdi) -+ vmovdqa %ymmZ, VEC_SIZE(%rdi) -+ vmovdqa %ymmZ, (VEC_SIZE * 2)(%rdi) -+ vmovdqa %ymmZ, (VEC_SIZE * 3)(%rdi) -+ add $(VEC_SIZE * 4), %rdi -+ sub $(VEC_SIZE * 4), %r8 -+ jae L(StrncpyFillLoopVmovdqa) -+ -+L(StrncpyFillLessFourVecSize): -+ add $(VEC_SIZE * 2), %r8 -+ jl L(StrncpyFillLessTwoVecSize) -+ vmovdqa %ymmZ, (%rdi) -+ vmovdqa %ymmZ, VEC_SIZE(%rdi) -+ add $(VEC_SIZE * 2), %rdi -+ sub $VEC_SIZE, %r8 -+ jl L(StrncpyFillExit) -+ vmovdqa %ymmZ, (%rdi) -+ add $VEC_SIZE, %rdi -+ jmp L(Fill) -+ -+ .p2align 4 -+L(StrncpyFillLessTwoVecSize): -+ add $VEC_SIZE, %r8 -+ jl L(StrncpyFillExit) -+ vmovdqa %ymmZ, (%rdi) -+ add $VEC_SIZE, %rdi -+ jmp L(Fill) -+ -+ .p2align 4 -+L(StrncpyFillExit): -+ add $VEC_SIZE, %r8 -+L(Fill): -+ cmp $17, %r8d -+ jae L(Fill17_32) -+ cmp $9, %r8d -+ jae L(Fill9_16) -+ cmp $5, %r8d -+ jae L(Fill5_8) -+ cmp $3, %r8d -+ jae L(Fill3_4) -+ cmp $1, %r8d -+ ja L(Fill2) -+ je L(Fill1) -+ VZEROUPPER -+ ret -+ -+/* end of ifndef USE_AS_STRCAT */ -+# endif -+ -+ .p2align 4 -+L(UnalignedLeaveCase2OrCase3): -+ test %rdx, %rdx -+ jnz L(UnalignedFourVecSizeLeaveCase2) -+L(UnalignedFourVecSizeLeaveCase3): -+ lea (VEC_SIZE * 4)(%r8), %rcx -+ and $-VEC_SIZE, %rcx -+ add $(VEC_SIZE * 3), %r8 -+ jl L(CopyVecSizeCase3) -+ vmovdqu %ymm4, (%rdi) -+ sub $VEC_SIZE, %r8 -+ jb L(CopyVecSizeCase3) -+ vmovdqu %ymm5, VEC_SIZE(%rdi) -+ sub $VEC_SIZE, %r8 -+ jb L(CopyVecSizeCase3) -+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) -+ sub $VEC_SIZE, %r8 -+ jb L(CopyVecSizeCase3) -+ vmovdqu %ymm7, (VEC_SIZE * 3)(%rdi) -+# ifdef USE_AS_STPCPY -+ lea (VEC_SIZE * 4)(%rdi), %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (VEC_SIZE * 4)(%rdi) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(UnalignedFourVecSizeLeaveCase2): -+ xor %ecx, %ecx -+ vpcmpeqb %ymm4, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ add 
$(VEC_SIZE * 3), %r8 -+ jle L(CopyVecSizeCase2OrCase3) -+ test %edx, %edx -+# ifndef USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec4) -+# else -+ jnz L(CopyVecSize) -+# endif -+ vpcmpeqb %ymm5, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ vmovdqu %ymm4, (%rdi) -+ add $VEC_SIZE, %rcx -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+ test %edx, %edx -+# ifndef USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec5) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vpcmpeqb %ymm6, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ vmovdqu %ymm5, VEC_SIZE(%rdi) -+ add $VEC_SIZE, %rcx -+ sub $VEC_SIZE, %r8 -+ jbe L(CopyVecSizeCase2OrCase3) -+ test %edx, %edx -+# ifndef USE_AS_STRCAT -+ jnz L(CopyVecSizeUnalignedVec6) -+# else -+ jnz L(CopyVecSize) -+# endif -+ -+ vpcmpeqb %ymm7, %ymmZ, %ymmM -+ vpmovmskb %ymmM, %edx -+ vmovdqu %ymm6, (VEC_SIZE * 2)(%rdi) -+ lea VEC_SIZE(%rdi, %rcx), %rdi -+ lea VEC_SIZE(%rsi, %rcx), %rsi -+ bsf %edx, %edx -+ cmp %r8d, %edx -+ jb L(CopyVecSizeExit) -+L(StrncpyExit): -+ cmp $65, %r8d -+ je L(StrncpyExit65) -+ cmp $33, %r8d -+ jae L(StrncpyExit33_64) -+ cmp $17, %r8d -+ jae L(StrncpyExit17_32) -+ cmp $9, %r8d -+ jae L(StrncpyExit9_16) -+ cmp $5, %r8d -+ jae L(StrncpyExit5_8) -+ cmp $3, %r8d -+ jae L(StrncpyExit3_4) -+ cmp $1, %r8d -+ ja L(StrncpyExit2) -+ je L(StrncpyExit1) -+# ifdef USE_AS_STPCPY -+ mov %rdi, %rax -+# endif -+# ifdef USE_AS_STRCAT -+ movb $0, (%rdi) -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(ExitZero): -+# ifndef USE_AS_STRCAT -+ mov %rdi, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+# endif -+ -+# ifndef USE_AS_STRCAT -+END (STRCPY) -+# else -+END (STRCAT) -+# endif -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S -new file mode 100644 -index 000000000..912d771b4 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strlen-kbl.S -@@ -0,0 +1,418 @@ -+/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2. 
-+ Copyright (C) 2017-2020 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+# ifndef STRLEN -+# define STRLEN strlen_avx2 -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+# ifdef USE_AS_WCSLEN -+# define VPCMPEQ vpcmpeqd -+# define VPMINU vpminud -+# else -+# define VPCMPEQ vpcmpeqb -+# define VPMINU vpminub -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRLEN) -+# ifdef USE_AS_STRNLEN -+ /* Check for zero length. */ -+ test %RSI_LP, %RSI_LP -+ jz L(zero) -+# ifdef USE_AS_WCSLEN -+ shl $2, %RSI_LP -+# elif defined __ILP32__ -+ /* Clear the upper 32 bits. */ -+ movl %esi, %esi -+# endif -+ mov %RSI_LP, %R8_LP -+# endif -+ movl %edi, %ecx -+ movq %rdi, %rdx -+ vpxor %xmm0, %xmm0, %xmm0 -+ -+ /* Check if we may cross page boundary with one vector load. 
*/ -+ andl $(2 * VEC_SIZE - 1), %ecx -+ cmpl $VEC_SIZE, %ecx -+ ja L(cros_page_boundary) -+ -+ /* Check the first VEC_SIZE bytes. */ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+# ifdef USE_AS_STRNLEN -+ jnz L(first_vec_x0_check) -+ /* Adjust length and check the end of data. */ -+ subq $VEC_SIZE, %rsi -+ jbe L(max) -+# else -+ jnz L(first_vec_x0) -+# endif -+ -+ /* Align data for aligned loads in the loop. */ -+ addq $VEC_SIZE, %rdi -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ -+# ifdef USE_AS_STRNLEN -+ /* Adjust length. */ -+ addq %rcx, %rsi -+ -+ subq $(VEC_SIZE * 4), %rsi -+ jbe L(last_4x_vec_or_less) -+# endif -+ jmp L(more_4x_vec) -+ -+ .p2align 4 -+L(cros_page_boundary): -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ /* Remove the leading bytes. */ -+ sarl %cl, %eax -+ testl %eax, %eax -+ jz L(aligned_more) -+ tzcntl %eax, %eax -+# ifdef USE_AS_STRNLEN -+ /* Check the end of data. */ -+ cmpq %rax, %rsi -+ jbe L(max) -+# endif -+ addq %rdi, %rax -+ addq %rcx, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(aligned_more): -+# ifdef USE_AS_STRNLEN -+ /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" -+ with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" -+ to void possible addition overflow. */ -+ negq %rcx -+ addq $VEC_SIZE, %rcx -+ -+ /* Check the end of data. */ -+ subq %rcx, %rsi -+ jbe L(max) -+# endif -+ -+ addq $VEC_SIZE, %rdi -+ -+# ifdef USE_AS_STRNLEN -+ subq $(VEC_SIZE * 4), %rsi -+ jbe L(last_4x_vec_or_less) -+# endif -+ -+L(more_4x_vec): -+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time -+ since data is only aligned to VEC_SIZE. 
*/ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x3) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+# ifdef USE_AS_STRNLEN -+ subq $(VEC_SIZE * 4), %rsi -+ jbe L(last_4x_vec_or_less) -+# endif -+ -+ /* Align data to 4 * VEC_SIZE. */ -+ movq %rdi, %rcx -+ andl $(4 * VEC_SIZE - 1), %ecx -+ andq $-(4 * VEC_SIZE), %rdi -+ -+# ifdef USE_AS_STRNLEN -+ /* Adjust length. */ -+ addq %rcx, %rsi -+# endif -+ -+ .p2align 4 -+L(loop_4x_vec): -+ /* Compare 4 * VEC at a time forward. */ -+ vmovdqa (%rdi), %ymm1 -+ vmovdqa VEC_SIZE(%rdi), %ymm2 -+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 -+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 -+ VPMINU %ymm1, %ymm2, %ymm5 -+ VPMINU %ymm3, %ymm4, %ymm6 -+ VPMINU %ymm5, %ymm6, %ymm5 -+ -+ VPCMPEQ %ymm5, %ymm0, %ymm5 -+ vpmovmskb %ymm5, %eax -+ testl %eax, %eax -+ jnz L(4x_vec_end) -+ -+ addq $(VEC_SIZE * 4), %rdi -+ -+# ifndef USE_AS_STRNLEN -+ jmp L(loop_4x_vec) -+# else -+ subq $(VEC_SIZE * 4), %rsi -+ ja L(loop_4x_vec) -+ -+L(last_4x_vec_or_less): -+ /* Less than 4 * VEC and aligned to VEC_SIZE. 
*/ -+ addl $(VEC_SIZE * 2), %esi -+ jle L(last_2x_vec) -+ -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ -+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x2_check) -+ subl $VEC_SIZE, %esi -+ jle L(max) -+ -+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x3_check) -+ movq %r8, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(last_2x_vec): -+ addl $(VEC_SIZE * 2), %esi -+ VPCMPEQ (%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ -+ jnz L(first_vec_x0_check) -+ subl $VEC_SIZE, %esi -+ jle L(max) -+ -+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1_check) -+ movq %r8, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x0_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rsi -+ jbe L(max) -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x1_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rsi -+ jbe L(max) -+ addq $VEC_SIZE, %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x2_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. */ -+ cmpq %rax, %rsi -+ jbe L(max) -+ addq $(VEC_SIZE * 2), %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x3_check): -+ tzcntl %eax, %eax -+ /* Check the end of data. 
*/ -+ cmpq %rax, %rsi -+ jbe L(max) -+ addq $(VEC_SIZE * 3), %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(max): -+ movq %r8, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(zero): -+ xorl %eax, %eax -+ ret -+# endif -+ -+ .p2align 4 -+L(first_vec_x0): -+ tzcntl %eax, %eax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x1): -+ tzcntl %eax, %eax -+ addq $VEC_SIZE, %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(first_vec_x2): -+ tzcntl %eax, %eax -+ addq $(VEC_SIZE * 2), %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(4x_vec_end): -+ VPCMPEQ %ymm1, %ymm0, %ymm1 -+ vpmovmskb %ymm1, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x0) -+ VPCMPEQ %ymm2, %ymm0, %ymm2 -+ vpmovmskb %ymm2, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x1) -+ VPCMPEQ %ymm3, %ymm0, %ymm3 -+ vpmovmskb %ymm3, %eax -+ testl %eax, %eax -+ jnz L(first_vec_x2) -+ VPCMPEQ %ymm4, %ymm0, %ymm4 -+ vpmovmskb %ymm4, %eax -+L(first_vec_x3): -+ tzcntl %eax, %eax -+ addq $(VEC_SIZE * 3), %rax -+ addq %rdi, %rax -+ subq %rdx, %rax -+# ifdef USE_AS_WCSLEN -+ shrq $2, %rax -+# endif -+ VZEROUPPER -+ ret -+ -+END (STRLEN) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S -new file mode 100644 -index 000000000..71e1a46c2 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strncat-kbl.S -@@ -0,0 +1,3 @@ -+#define USE_AS_STRNCAT -+#define STRCAT strncat_avx2 -+#include "avx2-strcat-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S -new file mode 100644 -index 
000000000..b21a19134 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strncmp-kbl.S -@@ -0,0 +1,4 @@ -+#define STRCMP strncmp_avx2 -+#define USE_AS_STRNCMP 1 -+#include "avx_regs.h" -+#include "avx2-strcmp-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S -new file mode 100644 -index 000000000..7ad840667 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strncpy-kbl.S -@@ -0,0 +1,4 @@ -+#define USE_AS_STRNCPY -+#define STRCPY strncpy_avx2 -+#include "avx_regs.h" -+#include "avx2-strcpy-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S -new file mode 100644 -index 000000000..22cc5c527 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strnlen-kbl.S -@@ -0,0 +1,4 @@ -+#define STRLEN strnlen_avx2 -+#define USE_AS_STRNLEN 1 -+#include "avx_regs.h" -+#include "avx2-strlen-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S -new file mode 100644 -index 000000000..b3a65fbc6 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-strrchr-kbl.S -@@ -0,0 +1,258 @@ -+/* strrchr/wcsrchr optimized with AVX2. -+ Copyright (C) 2017-2020 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. 
-+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+ -+# ifndef STRRCHR -+# define STRRCHR strrchr_avx2 -+# endif -+ -+# ifndef L -+# define L(label) .L##label -+# endif -+ -+# ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+# endif -+ -+# ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+# endif -+ -+# ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+# endif -+ -+# ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+# endif -+ -+# ifdef USE_AS_WCSRCHR -+# define VPBROADCAST vpbroadcastd -+# define VPCMPEQ vpcmpeqd -+# else -+# define VPBROADCAST vpbroadcastb -+# define VPCMPEQ vpcmpeqb -+# endif -+ -+# ifndef VZEROUPPER -+# define VZEROUPPER vzeroupper -+# endif -+ -+# define VEC_SIZE 32 -+ -+ .section .text.avx,"ax",@progbits -+ENTRY (STRRCHR) -+ movd %esi, %xmm4 -+ movl %edi, %ecx -+ /* Broadcast CHAR to YMM4. */ -+ VPBROADCAST %xmm4, %ymm4 -+ vpxor %xmm0, %xmm0, %xmm0 -+ -+ /* Check if we may cross page boundary with one vector load. */ -+ andl $(2 * VEC_SIZE - 1), %ecx -+ cmpl $VEC_SIZE, %ecx -+ ja L(cros_page_boundary) -+ -+ vmovdqu (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %ecx -+ vpmovmskb %ymm3, %eax -+ addq $VEC_SIZE, %rdi -+ -+ testl %eax, %eax -+ jnz L(first_vec) -+ -+ testl %ecx, %ecx -+ jnz L(return_null) -+ -+ andq $-VEC_SIZE, %rdi -+ xorl %edx, %edx -+ jmp L(aligned_loop) -+ -+ .p2align 4 -+L(first_vec): -+ /* Check if there is a nul CHAR. */ -+ testl %ecx, %ecx -+ jnz L(char_and_nul_in_first_vec) -+ -+ /* Remember the match and keep searching. 
*/ -+ movl %eax, %edx -+ movq %rdi, %rsi -+ andq $-VEC_SIZE, %rdi -+ jmp L(aligned_loop) -+ -+ .p2align 4 -+L(cros_page_boundary): -+ andl $(VEC_SIZE - 1), %ecx -+ andq $-VEC_SIZE, %rdi -+ vmovdqa (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %edx -+ vpmovmskb %ymm3, %eax -+ shrl %cl, %edx -+ shrl %cl, %eax -+ addq $VEC_SIZE, %rdi -+ -+ /* Check if there is a CHAR. */ -+ testl %eax, %eax -+ jnz L(found_char) -+ -+ testl %edx, %edx -+ jnz L(return_null) -+ -+ jmp L(aligned_loop) -+ -+ .p2align 4 -+L(found_char): -+ testl %edx, %edx -+ jnz L(char_and_nul) -+ -+ /* Remember the match and keep searching. */ -+ movl %eax, %edx -+ leaq (%rdi, %rcx), %rsi -+ -+ .p2align 4 -+L(aligned_loop): -+ vmovdqa (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ addq $VEC_SIZE, %rdi -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %ecx -+ vpmovmskb %ymm3, %eax -+ orl %eax, %ecx -+ jnz L(char_nor_null) -+ -+ vmovdqa (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ add $VEC_SIZE, %rdi -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %ecx -+ vpmovmskb %ymm3, %eax -+ orl %eax, %ecx -+ jnz L(char_nor_null) -+ -+ vmovdqa (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ addq $VEC_SIZE, %rdi -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %ecx -+ vpmovmskb %ymm3, %eax -+ orl %eax, %ecx -+ jnz L(char_nor_null) -+ -+ vmovdqa (%rdi), %ymm1 -+ VPCMPEQ %ymm1, %ymm0, %ymm2 -+ addq $VEC_SIZE, %rdi -+ VPCMPEQ %ymm1, %ymm4, %ymm3 -+ vpmovmskb %ymm2, %ecx -+ vpmovmskb %ymm3, %eax -+ orl %eax, %ecx -+ jz L(aligned_loop) -+ -+ .p2align 4 -+L(char_nor_null): -+ /* Find a CHAR or a nul CHAR in a loop. */ -+ testl %eax, %eax -+ jnz L(match) -+L(return_value): -+ testl %edx, %edx -+ jz L(return_null) -+ movl %edx, %eax -+ movq %rsi, %rdi -+ -+# ifdef USE_AS_WCSRCHR -+ /* Keep the first bit for each matching CHAR for bsr. 
*/ -+ andl $0x11111111, %eax -+# endif -+ bsrl %eax, %eax -+ leaq -VEC_SIZE(%rdi, %rax), %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(match): -+ /* Find a CHAR. Check if there is a nul CHAR. */ -+ vpmovmskb %ymm2, %ecx -+ testl %ecx, %ecx -+ jnz L(find_nul) -+ -+ /* Remember the match and keep searching. */ -+ movl %eax, %edx -+ movq %rdi, %rsi -+ jmp L(aligned_loop) -+ -+ .p2align 4 -+L(find_nul): -+# ifdef USE_AS_WCSRCHR -+ /* Keep the first bit for each matching CHAR for bsr. */ -+ andl $0x11111111, %ecx -+ andl $0x11111111, %eax -+# endif -+ /* Mask out any matching bits after the nul CHAR. */ -+ movl %ecx, %r8d -+ subl $1, %r8d -+ xorl %ecx, %r8d -+ andl %r8d, %eax -+ testl %eax, %eax -+ /* If there is no CHAR here, return the remembered one. */ -+ jz L(return_value) -+ bsrl %eax, %eax -+ leaq -VEC_SIZE(%rdi, %rax), %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(char_and_nul): -+ /* Find both a CHAR and a nul CHAR. */ -+ addq %rcx, %rdi -+ movl %edx, %ecx -+L(char_and_nul_in_first_vec): -+# ifdef USE_AS_WCSRCHR -+ /* Keep the first bit for each matching CHAR for bsr. */ -+ andl $0x11111111, %ecx -+ andl $0x11111111, %eax -+# endif -+ /* Mask out any matching bits after the nul CHAR. */ -+ movl %ecx, %r8d -+ subl $1, %r8d -+ xorl %ecx, %r8d -+ andl %r8d, %eax -+ testl %eax, %eax -+ /* Return null pointer if the nul CHAR comes first. 
*/ -+ jz L(return_null) -+ bsrl %eax, %eax -+ leaq -VEC_SIZE(%rdi, %rax), %rax -+ VZEROUPPER -+ ret -+ -+ .p2align 4 -+L(return_null): -+ xorl %eax, %eax -+ VZEROUPPER -+ ret -+ -+END (STRRCHR) -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S -new file mode 100644 -index 000000000..b03124767 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcschr-kbl.S -@@ -0,0 +1,3 @@ -+#define STRCHR wcschr_avx2 -+#define USE_AS_WCSCHR 1 -+#include "avx2-strchr-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S -new file mode 100644 -index 000000000..bcbcd4ce7 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcscmp-kbl.S -@@ -0,0 +1,4 @@ -+#define STRCMP wcscmp_avx2 -+#define USE_AS_WCSCMP 1 -+ -+#include "avx2-strcmp-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S -new file mode 100644 -index 000000000..f1b973572 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcslen-kbl.S -@@ -0,0 +1,4 @@ -+#define STRLEN wcslen_avx2 -+#define USE_AS_WCSLEN 1 -+ -+#include "avx2-strlen-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S -new file mode 100644 -index 000000000..7603169c1 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcsncmp-kbl.S -@@ -0,0 +1,6 @@ -+#define STRCMP wcsncmp_avx2 -+#define USE_AS_STRNCMP 1 -+#define USE_AS_WCSCMP 1 -+ -+#include "avx_regs.h" -+#include "avx2-strcmp-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S -new file mode 100644 -index 000000000..2095cd8e0 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcsnlen-kbl.S -@@ -0,0 +1,6 @@ -+#define STRLEN wcsnlen_avx2 -+#define USE_AS_WCSLEN 1 -+#define USE_AS_STRNLEN 1 -+ -+#include 
"avx_regs.h" -+#include "avx2-strlen-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S -new file mode 100644 -index 000000000..fbec1286c ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-wcsrchr-kbl.S -@@ -0,0 +1,3 @@ -+#define STRRCHR wcsrchr_avx2 -+#define USE_AS_WCSRCHR 1 -+#include "avx2-strrchr-kbl.S" -diff --git a/libc/arch-x86_64/kabylake/string/avx_regs.h b/libc/arch-x86_64/kabylake/string/avx_regs.h -new file mode 100644 -index 000000000..223d97e3e ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx_regs.h -@@ -0,0 +1,26 @@ -+/* Long and pointer size in bytes. */ -+#define LP_SIZE 8 -+ -+/* Instruction to operate on long and pointer. */ -+#define LP_OP(insn) insn##q -+ -+/* Assembler address directive. */ -+#define ASM_ADDR .quad -+ -+/* Registers to hold long and pointer. */ -+#define RAX_LP rax -+#define RBP_LP rbp -+#define RBX_LP rbx -+#define RCX_LP rcx -+#define RDI_LP rdi -+#define RDX_LP rdx -+#define RSI_LP rsi -+#define RSP_LP rsp -+#define R8_LP r8 -+#define R9_LP r9 -+#define R10_LP r10 -+#define R11_LP r11 -+#define R12_LP r12 -+#define R13_LP r13 -+#define R14_LP r14 -+#define R15_LP r15 -diff --git a/libc/arch-x86_64/include/cache.h b/libc/arch-x86_64/kabylake/string/cache.h -similarity index 100% -rename from libc/arch-x86_64/include/cache.h -rename to libc/arch-x86_64/kabylake/string/cache.h -diff --git a/libc/arch-x86_64/silvermont/string/cache.h b/libc/arch-x86_64/silvermont/string/cache.h -new file mode 100644 -index 000000000..3606d2a1a ---- /dev/null -+++ b/libc/arch-x86_64/silvermont/string/cache.h -@@ -0,0 +1,36 @@ -+/* -+Copyright (c) 2014, Intel Corporation -+All rights reserved. 
-+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ -+ * Redistributions of source code must retain the above copyright notice, -+ * this list of conditions and the following disclaimer. -+ -+ * Redistributions in binary form must reproduce the above copyright notice, -+ * this list of conditions and the following disclaimer in the documentation -+ * and/or other materials provided with the distribution. -+ -+ * Neither the name of Intel Corporation nor the names of its contributors -+ * may be used to endorse or promote products derived from this software -+ * without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*/ -+ -+/* Values are optimized for Silvermont */ -+#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */ -+#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */ -+ -+#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) -+#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) -diff --git a/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S -index 0ad2d44cf..ce15cdf1c 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-stpcpy-slm.S -@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - #define USE_AS_STPCPY --#define STRCPY stpcpy -+#define STRCPY stpcpy_generic - #include "sse2-strcpy-slm.S" -diff --git a/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S -index 30666850b..02b4df02d 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-stpncpy-slm.S -@@ -30,5 +30,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - #define USE_AS_STRNCPY - #define USE_AS_STPCPY --#define STRCPY stpncpy -+#define STRCPY stpncpy_generic - #include "sse2-strcpy-slm.S" -diff --git a/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S -index dd8207ff5..007adfe95 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-strcat-slm.S -@@ -29,7 +29,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - - #ifndef STRCAT --# define STRCAT strcat -+# define STRCAT strcat_generic - #endif - - #ifndef L -diff --git a/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S -index 3e146bfbc..ade9eac4f 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-strcpy-slm.S -@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #ifndef USE_AS_STRCAT - - # ifndef STRCPY --# define STRCPY strcpy -+# define STRCPY strcpy_generic - # endif - - # ifndef L -diff --git a/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S -index 3772fe770..df24f9de2 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-strlen-slm.S -@@ -31,7 +31,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #ifndef USE_AS_STRCAT - - #ifndef STRLEN --# define STRLEN strlen -+# define STRLEN strlen_generic - #endif - - #ifndef L -diff --git a/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S -index 6b4a43084..c5394f9d5 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-strncat-slm.S -@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - #define USE_AS_STRNCAT --#define STRCAT strncat -+#define STRCAT strncat_generic - #include "sse2-strcat-slm.S" -diff --git a/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S -index 594e78f74..2e8d68d12 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-strncpy-slm.S -@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - - #define USE_AS_STRNCPY --#define STRCPY strncpy -+#define STRCPY strncpy_generic - #include "sse2-strcpy-slm.S" -diff --git a/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S -index e8acd5ba4..fa2542f00 100644 ---- a/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S -+++ b/libc/arch-x86_64/silvermont/string/ssse3-strcmp-slm.S -@@ -43,7 +43,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #else - #define UPDATE_STRNCMP_COUNTER - #ifndef STRCMP --#define STRCMP strcmp -+#define STRCMP strcmp_generic - #endif - #endif - -diff --git a/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S -index 0e4077517..5d20a483f 100644 ---- a/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S -+++ b/libc/arch-x86_64/silvermont/string/ssse3-strncmp-slm.S -@@ -29,5 +29,5 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - #define USE_AS_STRNCMP --#define STRCMP strncmp -+#define STRCMP strncmp_generic - #include "ssse3-strcmp-slm.S" -diff --git a/libc/arch-x86_64/static_function_dispatch.S b/libc/arch-x86_64/static_function_dispatch.S -index 979ce4f18..5c0f1f2ba 100644 ---- a/libc/arch-x86_64/static_function_dispatch.S -+++ b/libc/arch-x86_64/static_function_dispatch.S -@@ -38,6 +38,25 @@ FUNCTION_DELEGATE(__memset_chk, __memset_chk_generic) - FUNCTION_DELEGATE(memcmp, memcmp_generic) - FUNCTION_DELEGATE(memcpy, memmove_generic) - FUNCTION_DELEGATE(memmove, memmove_generic) --FUNCTION_DELEGATE(memchr, memchr_openbsd) --FUNCTION_DELEGATE(memrchr, memrchr_openbsd) --//FUNCTION_DELEGATE(wmemset, wmemset_freebsd) -+FUNCTION_DELEGATE(memchr, memchr_generic) -+FUNCTION_DELEGATE(memrchr, memrchr_generic) -+//FUNCTION_DELEGATE(wmemset, wmemset_generic) -+FUNCTION_DELEGATE(strcmp, strcmp_generic) -+FUNCTION_DELEGATE(strncmp, strncmp_generic) -+FUNCTION_DELEGATE(strcpy, strcpy_generic) -+FUNCTION_DELEGATE(strncpy, 
strncpy_generic) -+FUNCTION_DELEGATE(stpcpy, stpcpy_generic) -+FUNCTION_DELEGATE(stpncpy, stpncpy_generic) -+FUNCTION_DELEGATE(strlen, strlen_generic) -+FUNCTION_DELEGATE(strnlen, strnlen_generic) -+FUNCTION_DELEGATE(strchr, strchr_generic) -+FUNCTION_DELEGATE(strrchr, strrchr_generic) -+FUNCTION_DELEGATE(strcat, strcat_generic) -+FUNCTION_DELEGATE(strncat, strncat_generic) -+FUNCTION_DELEGATE(wcscmp, wcscmp_generic) -+FUNCTION_DELEGATE(wcsncmp, wcsncmp_generic) -+FUNCTION_DELEGATE(wcslen, wcslen_generic) -+FUNCTION_DELEGATE(wcsnlen, wcsnlen_generic) -+FUNCTION_DELEGATE(wcschr, wcschr_generic) -+FUNCTION_DELEGATE(wcsrchr, wcsrchr_generic) -+ --- -2.25.1 - diff --git a/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch b/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch deleted file mode 100644 index 6f47b3414b..0000000000 --- a/aosp_diff/preliminary/bionic/0005-avx2-implementation-for-memmove-api.patch +++ /dev/null @@ -1,645 +0,0 @@ -From 05ace70e6407263d0bef91800005942a079058d6 Mon Sep 17 00:00:00 2001 -From: "Reddy, Alavala Srinivasa" -Date: Wed, 1 Nov 2023 18:43:18 +0530 -Subject: [PATCH 5/5] avx2 implementation for memmove api - -This patch includes handwritten avx2 assembly -implementation for memmove 64-bit. - -Test done: Build and boot is fine, Run the benchmarks suite. 
- -Signed-off-by: ahs ---- - libc/Android.bp | 1 + - .../arch-x86_64/dynamic_function_dispatch.cpp | 2 + - .../kabylake/string/avx2-memmove-kbl.S | 593 ++++++++++++++++++ - 3 files changed, 596 insertions(+) - create mode 100644 libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S - -diff --git a/libc/Android.bp b/libc/Android.bp -index 92483e833..5deb88b48 100644 ---- a/libc/Android.bp -+++ b/libc/Android.bp -@@ -1235,6 +1235,7 @@ cc_library_static { - "arch-x86_64/kabylake/string/avx2-memcmp-kbl.S", - "arch-x86_64/kabylake/string/avx2-memchr-kbl.S", - "arch-x86_64/kabylake/string/avx2-memrchr-kbl.S", -+ "arch-x86_64/kabylake/string/avx2-memmove-kbl.S", - "arch-x86_64/kabylake/string/avx2-strcmp-kbl.S", - "arch-x86_64/kabylake/string/avx2-strncmp-kbl.S", - "arch-x86_64/kabylake/string/avx2-strlen-kbl.S", -diff --git a/libc/arch-x86_64/dynamic_function_dispatch.cpp b/libc/arch-x86_64/dynamic_function_dispatch.cpp -index 182eb4200..5bcf63e4c 100644 ---- a/libc/arch-x86_64/dynamic_function_dispatch.cpp -+++ b/libc/arch-x86_64/dynamic_function_dispatch.cpp -@@ -55,6 +55,8 @@ DEFINE_IFUNC_FOR(memcmp) { - - typedef void* memmove_func(void* __dst, const void* __src, size_t __n); - DEFINE_IFUNC_FOR(memmove) { -+ __builtin_cpu_init(); -+ if (__builtin_cpu_supports("avx2")) RETURN_FUNC(memmove_func, memmove_avx2); - RETURN_FUNC(memmove_func, memmove_generic); - } - -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S -new file mode 100644 -index 000000000..02e9ec1d2 ---- /dev/null -+++ b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S -@@ -0,0 +1,593 @@ -+/* -+Copyright (c) 2014, Intel Corporation -+All rights reserved. -+ -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are met: -+ -+ * Redistributions of source code must retain the above copyright notice, -+ * this list of conditions and the following disclaimer. 
-+ -+ * Redistributions in binary form must reproduce the above copyright notice, -+ * this list of conditions and the following disclaimer in the documentation -+ * and/or other materials provided with the distribution. -+ -+ * Neither the name of Intel Corporation nor the names of its contributors -+ * may be used to endorse or promote products derived from this software -+ * without specific prior written permission. -+ -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-+*/ -+ -+#include "cache.h" -+ -+#ifndef MEMMOVE -+# define MEMMOVE memmove_avx2 -+#endif -+ -+#ifndef L -+# define L(label) .L##label -+#endif -+ -+#ifndef cfi_startproc -+# define cfi_startproc .cfi_startproc -+#endif -+ -+#ifndef cfi_endproc -+# define cfi_endproc .cfi_endproc -+#endif -+ -+#ifndef cfi_rel_offset -+# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off -+#endif -+ -+#ifndef cfi_restore -+# define cfi_restore(reg) .cfi_restore reg -+#endif -+ -+#ifndef cfi_adjust_cfa_offset -+# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off -+#endif -+ -+#ifndef ENTRY -+# define ENTRY(name) \ -+ .type name, @function; \ -+ .globl name; \ -+ .p2align 4; \ -+name: \ -+ cfi_startproc -+#endif -+ -+#ifndef ALIAS_SYMBOL -+# define ALIAS_SYMBOL(alias, original) \ -+ .globl alias; \ -+ .equ alias, original -+#endif -+ -+#ifndef END -+# define END(name) \ -+ cfi_endproc; \ -+ .size name, .-name -+#endif -+ -+#define CFI_PUSH(REG) \ -+ cfi_adjust_cfa_offset (4); \ -+ cfi_rel_offset (REG, 0) -+ -+#define CFI_POP(REG) \ -+ cfi_adjust_cfa_offset (-4); \ -+ cfi_restore (REG) -+ -+#define PUSH(REG) push REG; -+#define POP(REG) pop REG; -+ -+#define ENTRANCE PUSH (%rbx); -+#define RETURN_END POP (%rbx); ret -+#define RETURN RETURN_END; -+ -+ .section .text.avx2,"ax",@progbits -+ENTRY (MEMMOVE) -+ ENTRANCE -+ mov %rdi, %rax -+ -+/* Check whether we should copy backward or forward. */ -+ cmp %rsi, %rdi -+ je L(mm_return) -+ jg L(mm_len_0_or_more_backward) -+ -+/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] -+ separately. */ -+ cmp $16, %rdx -+ jbe L(mm_len_0_16_bytes_forward) -+ -+ cmp $32, %rdx -+ ja L(mm_len_32_or_more_forward) -+ -+/* Copy [0..32] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu -16(%rsi, %rdx), %xmm1 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_32_or_more_forward): -+ cmp $64, %rdx -+ ja L(mm_len_64_or_more_forward) -+ -+/* Copy [0..64] and return. 
*/ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu -16(%rsi, %rdx), %xmm2 -+ movdqu -32(%rsi, %rdx), %xmm3 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, -16(%rdi, %rdx) -+ movdqu %xmm3, -32(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_64_or_more_forward): -+ cmp $128, %rdx -+ ja L(mm_len_128_or_more_forward) -+ -+/* Copy [0..128] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu 32(%rsi), %xmm2 -+ movdqu 48(%rsi), %xmm3 -+ movdqu -64(%rsi, %rdx), %xmm4 -+ movdqu -48(%rsi, %rdx), %xmm5 -+ movdqu -32(%rsi, %rdx), %xmm6 -+ movdqu -16(%rsi, %rdx), %xmm7 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, 32(%rdi) -+ movdqu %xmm3, 48(%rdi) -+ movdqu %xmm4, -64(%rdi, %rdx) -+ movdqu %xmm5, -48(%rdi, %rdx) -+ movdqu %xmm6, -32(%rdi, %rdx) -+ movdqu %xmm7, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_128_or_more_forward): -+ cmp $256, %rdx -+ ja L(mm_len_256_or_more_forward) -+ -+/* Copy [0..256] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu 32(%rsi), %xmm2 -+ movdqu 48(%rsi), %xmm3 -+ movdqu 64(%rsi), %xmm4 -+ movdqu 80(%rsi), %xmm5 -+ movdqu 96(%rsi), %xmm6 -+ movdqu 112(%rsi), %xmm7 -+ movdqu -128(%rsi, %rdx), %xmm8 -+ movdqu -112(%rsi, %rdx), %xmm9 -+ movdqu -96(%rsi, %rdx), %xmm10 -+ movdqu -80(%rsi, %rdx), %xmm11 -+ movdqu -64(%rsi, %rdx), %xmm12 -+ movdqu -48(%rsi, %rdx), %xmm13 -+ movdqu -32(%rsi, %rdx), %xmm14 -+ movdqu -16(%rsi, %rdx), %xmm15 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, 32(%rdi) -+ movdqu %xmm3, 48(%rdi) -+ movdqu %xmm4, 64(%rdi) -+ movdqu %xmm5, 80(%rdi) -+ movdqu %xmm6, 96(%rdi) -+ movdqu %xmm7, 112(%rdi) -+ movdqu %xmm8, -128(%rdi, %rdx) -+ movdqu %xmm9, -112(%rdi, %rdx) -+ movdqu %xmm10, -96(%rdi, %rdx) -+ movdqu %xmm11, -80(%rdi, %rdx) -+ movdqu %xmm12, -64(%rdi, %rdx) -+ movdqu %xmm13, -48(%rdi, %rdx) -+ movdqu %xmm14, -32(%rdi, %rdx) -+ movdqu %xmm15, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ 
-+L(mm_len_256_or_more_forward): -+/* Aligning the address of destination. */ -+/* save first unaligned 128 bytes */ -+ vmovdqu (%rsi), %ymm0 -+ vmovdqu 32(%rsi), %ymm1 -+ vmovdqu 64(%rsi), %ymm2 -+ vmovdqu 96(%rsi), %ymm3 -+ -+ lea 128(%rdi), %r8 -+ and $-128, %r8 /* r8 now aligned to next 128 byte boundary */ -+ sub %rdi, %rsi /* rsi = src - dst = diff */ -+ -+ vmovdqu (%r8, %rsi), %ymm4 -+ vmovdqu 32(%r8, %rsi), %ymm5 -+ vmovdqu 64(%r8, %rsi), %ymm6 -+ vmovdqu 96(%r8, %rsi), %ymm7 -+ -+ vmovdqu %ymm0, (%rdi) -+ vmovdqu %ymm1, 32(%rdi) -+ vmovdqu %ymm2, 64(%rdi) -+ vmovdqu %ymm3, 96(%rdi) -+ vmovdqa %ymm4, (%r8) -+ vmovaps %ymm5, 32(%r8) -+ vmovaps %ymm6, 64(%r8) -+ vmovaps %ymm7, 96(%r8) -+ add $128, %r8 -+ -+ lea (%rdi, %rdx), %rbx -+ and $-128, %rbx -+ cmp %r8, %rbx -+ jbe L(mm_copy_remaining_forward) -+ -+ cmp $SHARED_CACHE_SIZE_HALF, %rdx -+ jae L(mm_large_page_loop_forward) -+ -+ .p2align 4 -+L(mm_main_loop_forward): -+ prefetcht0 128(%r8, %rsi) -+ vmovdqu (%r8, %rsi), %ymm0 -+ vmovdqu 32(%r8, %rsi), %ymm1 -+ vmovdqa %ymm0, (%r8) -+ vmovaps %ymm1, 32(%r8) -+ lea 64(%r8), %r8 -+ cmp %r8, %rbx -+ ja L(mm_main_loop_forward) -+ -+L(mm_copy_remaining_forward): -+ add %rdi, %rdx -+ sub %r8, %rdx -+/* We copied all up till %rdi position in the dst. -+ In %rdx now is how many bytes are left to copy. -+ Now we need to advance %r8. 
*/ -+ lea (%r8, %rsi), %r9 -+ -+L(mm_remaining_0_128_bytes_forward): -+ cmp $64, %rdx -+ ja L(mm_remaining_65_128_bytes_forward) -+ cmp $32, %rdx -+ ja L(mm_remaining_33_64_bytes_forward) -+ vzeroupper -+ cmp $16, %rdx -+ ja L(mm_remaining_17_32_bytes_forward) -+ test %rdx, %rdx -+ .p2align 4,,2 -+ je L(mm_return) -+ -+ cmpb $8, %dl -+ ja L(mm_remaining_9_16_bytes_forward) -+ cmpb $4, %dl -+ .p2align 4,,5 -+ ja L(mm_remaining_5_8_bytes_forward) -+ cmpb $2, %dl -+ .p2align 4,,1 -+ ja L(mm_remaining_3_4_bytes_forward) -+ movzbl -1(%r9,%rdx), %esi -+ movzbl (%r9), %ebx -+ movb %sil, -1(%r8,%rdx) -+ movb %bl, (%r8) -+ jmp L(mm_return) -+ -+L(mm_remaining_65_128_bytes_forward): -+ vmovdqu (%r9), %ymm0 -+ vmovdqu 32(%r9), %ymm1 -+ vmovdqu -64(%r9, %rdx), %ymm2 -+ vmovdqu -32(%r9, %rdx), %ymm3 -+ vmovdqu %ymm0, (%r8) -+ vmovdqu %ymm1, 32(%r8) -+ vmovdqu %ymm2, -64(%r8, %rdx) -+ vmovdqu %ymm3, -32(%r8, %rdx) -+ jmp L(mm_return) -+ -+L(mm_remaining_33_64_bytes_forward): -+ vmovdqu (%r9), %ymm0 -+ vmovdqu -32(%r9, %rdx), %ymm1 -+ vmovdqu %ymm0, (%r8) -+ vmovdqu %ymm1, -32(%r8, %rdx) -+ jmp L(mm_return) -+ -+L(mm_remaining_17_32_bytes_forward): -+ movdqu (%r9), %xmm0 -+ movdqu -16(%r9, %rdx), %xmm1 -+ movdqu %xmm0, (%r8) -+ movdqu %xmm1, -16(%r8, %rdx) -+ jmp L(mm_return) -+ -+L(mm_remaining_5_8_bytes_forward): -+ movl (%r9), %esi -+ movl -4(%r9,%rdx), %ebx -+ movl %esi, (%r8) -+ movl %ebx, -4(%r8,%rdx) -+ jmp L(mm_return) -+ -+L(mm_remaining_9_16_bytes_forward): -+ mov (%r9), %rsi -+ mov -8(%r9, %rdx), %rbx -+ mov %rsi, (%r8) -+ mov %rbx, -8(%r8, %rdx) -+ jmp L(mm_return) -+ -+L(mm_remaining_3_4_bytes_forward): -+ movzwl -2(%r9,%rdx), %esi -+ movzwl (%r9), %ebx -+ movw %si, -2(%r8,%rdx) -+ movw %bx, (%r8) -+ jmp L(mm_return) -+ -+L(mm_len_0_16_bytes_forward): -+ testb $24, %dl -+ jne L(mm_len_9_16_bytes_forward) -+ testb $4, %dl -+ .p2align 4,,5 -+ jne L(mm_len_5_8_bytes_forward) -+ test %rdx, %rdx -+ .p2align 4,,2 -+ je L(mm_return) -+ testb $2, %dl -+ .p2align 4,,1 -+ jne 
L(mm_len_2_4_bytes_forward) -+ movzbl -1(%rsi,%rdx), %ebx -+ movzbl (%rsi), %esi -+ movb %bl, -1(%rdi,%rdx) -+ movb %sil, (%rdi) -+ jmp L(mm_return) -+ -+L(mm_len_2_4_bytes_forward): -+ movzwl -2(%rsi,%rdx), %ebx -+ movzwl (%rsi), %esi -+ movw %bx, -2(%rdi,%rdx) -+ movw %si, (%rdi) -+ jmp L(mm_return) -+ -+L(mm_len_5_8_bytes_forward): -+ movl (%rsi), %ebx -+ movl -4(%rsi,%rdx), %esi -+ movl %ebx, (%rdi) -+ movl %esi, -4(%rdi,%rdx) -+ jmp L(mm_return) -+ -+L(mm_len_9_16_bytes_forward): -+ mov (%rsi), %rbx -+ mov -8(%rsi, %rdx), %rsi -+ mov %rbx, (%rdi) -+ mov %rsi, -8(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_recalc_len): -+/* Compute in %rdx how many bytes are left to copy after -+ the main loop stops. */ -+ vzeroupper -+ mov %rbx, %rdx -+ sub %rdi, %rdx -+/* The code for copying backwards. */ -+L(mm_len_0_or_more_backward): -+ -+/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128] -+ separately. */ -+ cmp $16, %rdx -+ jbe L(mm_len_0_16_bytes_backward) -+ -+ cmp $32, %rdx -+ ja L(mm_len_32_or_more_backward) -+ -+/* Copy [0..32] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu -16(%rsi, %rdx), %xmm1 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_32_or_more_backward): -+ cmp $64, %rdx -+ ja L(mm_len_64_or_more_backward) -+ -+/* Copy [0..64] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu -16(%rsi, %rdx), %xmm2 -+ movdqu -32(%rsi, %rdx), %xmm3 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, -16(%rdi, %rdx) -+ movdqu %xmm3, -32(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_64_or_more_backward): -+ cmp $128, %rdx -+ ja L(mm_len_128_or_more_backward) -+ -+/* Copy [0..128] and return. 
*/ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu 32(%rsi), %xmm2 -+ movdqu 48(%rsi), %xmm3 -+ movdqu -64(%rsi, %rdx), %xmm4 -+ movdqu -48(%rsi, %rdx), %xmm5 -+ movdqu -32(%rsi, %rdx), %xmm6 -+ movdqu -16(%rsi, %rdx), %xmm7 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, 32(%rdi) -+ movdqu %xmm3, 48(%rdi) -+ movdqu %xmm4, -64(%rdi, %rdx) -+ movdqu %xmm5, -48(%rdi, %rdx) -+ movdqu %xmm6, -32(%rdi, %rdx) -+ movdqu %xmm7, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_128_or_more_backward): -+ cmp $256, %rdx -+ ja L(mm_len_256_or_more_backward) -+ -+/* Copy [0..256] and return. */ -+ movdqu (%rsi), %xmm0 -+ movdqu 16(%rsi), %xmm1 -+ movdqu 32(%rsi), %xmm2 -+ movdqu 48(%rsi), %xmm3 -+ movdqu 64(%rsi), %xmm4 -+ movdqu 80(%rsi), %xmm5 -+ movdqu 96(%rsi), %xmm6 -+ movdqu 112(%rsi), %xmm7 -+ movdqu -128(%rsi, %rdx), %xmm8 -+ movdqu -112(%rsi, %rdx), %xmm9 -+ movdqu -96(%rsi, %rdx), %xmm10 -+ movdqu -80(%rsi, %rdx), %xmm11 -+ movdqu -64(%rsi, %rdx), %xmm12 -+ movdqu -48(%rsi, %rdx), %xmm13 -+ movdqu -32(%rsi, %rdx), %xmm14 -+ movdqu -16(%rsi, %rdx), %xmm15 -+ movdqu %xmm0, (%rdi) -+ movdqu %xmm1, 16(%rdi) -+ movdqu %xmm2, 32(%rdi) -+ movdqu %xmm3, 48(%rdi) -+ movdqu %xmm4, 64(%rdi) -+ movdqu %xmm5, 80(%rdi) -+ movdqu %xmm6, 96(%rdi) -+ movdqu %xmm7, 112(%rdi) -+ movdqu %xmm8, -128(%rdi, %rdx) -+ movdqu %xmm9, -112(%rdi, %rdx) -+ movdqu %xmm10, -96(%rdi, %rdx) -+ movdqu %xmm11, -80(%rdi, %rdx) -+ movdqu %xmm12, -64(%rdi, %rdx) -+ movdqu %xmm13, -48(%rdi, %rdx) -+ movdqu %xmm14, -32(%rdi, %rdx) -+ movdqu %xmm15, -16(%rdi, %rdx) -+ jmp L(mm_return) -+ -+L(mm_len_256_or_more_backward): -+/* Aligning the address of destination. We need to save -+ 128 bytes from the source in order not to overwrite them. 
*/ -+ vmovdqu -32(%rsi, %rdx), %ymm0 -+ vmovdqu -64(%rsi, %rdx), %ymm1 -+ vmovdqu -96(%rsi, %rdx), %ymm2 -+ vmovdqu -128(%rsi, %rdx), %ymm3 -+ -+ lea (%rdi, %rdx), %r9 -+ and $-128, %r9 /* r9 = aligned dst */ -+ -+ mov %rsi, %r8 -+ sub %rdi, %r8 /* r8 = src - dst, diff */ -+ -+ vmovdqu -32(%r9, %r8), %ymm4 -+ vmovdqu -64(%r9, %r8), %ymm5 -+ vmovdqu -96(%r9, %r8), %ymm6 -+ vmovdqu -128(%r9, %r8), %ymm7 -+ -+ vmovdqu %ymm0, -32(%rdi, %rdx) -+ vmovdqu %ymm1, -64(%rdi, %rdx) -+ vmovdqu %ymm2, -96(%rdi, %rdx) -+ vmovdqu %ymm3, -128(%rdi, %rdx) -+ vmovdqa %ymm4, -32(%r9) -+ vmovdqa %ymm5, -64(%r9) -+ vmovdqa %ymm6, -96(%r9) -+ vmovdqa %ymm7, -128(%r9) -+ lea -128(%r9), %r9 -+ -+ lea 128(%rdi), %rbx -+ and $-128, %rbx -+ -+ cmp %r9, %rbx -+ jae L(mm_recalc_len) -+ -+ cmp $SHARED_CACHE_SIZE_HALF, %rdx -+ jae L(mm_large_page_loop_backward) -+ -+ .p2align 4 -+L(mm_main_loop_backward): -+ prefetcht0 -128(%r9, %r8) -+ -+ vmovdqu -64(%r9, %r8), %ymm0 -+ vmovdqu -32(%r9, %r8), %ymm1 -+ vmovdqa %ymm0, -64(%r9) -+ vmovaps %ymm1, -32(%r9) -+ lea -64(%r9), %r9 -+ cmp %r9, %rbx -+ jb L(mm_main_loop_backward) -+ jmp L(mm_recalc_len) -+ -+/* Copy [0..16] and return. 
*/ -+L(mm_len_0_16_bytes_backward): -+ testb $24, %dl -+ jnz L(mm_len_9_16_bytes_backward) -+ testb $4, %dl -+ .p2align 4,,5 -+ jnz L(mm_len_5_8_bytes_backward) -+ test %rdx, %rdx -+ .p2align 4,,2 -+ je L(mm_return) -+ testb $2, %dl -+ .p2align 4,,1 -+ jne L(mm_len_3_4_bytes_backward) -+ movzbl -1(%rsi,%rdx), %ebx -+ movzbl (%rsi), %ecx -+ movb %bl, -1(%rdi,%rdx) -+ movb %cl, (%rdi) -+ jmp L(mm_return) -+ -+L(mm_len_3_4_bytes_backward): -+ movzwl -2(%rsi,%rdx), %ebx -+ movzwl (%rsi), %ecx -+ movw %bx, -2(%rdi,%rdx) -+ movw %cx, (%rdi) -+ jmp L(mm_return) -+ -+L(mm_len_9_16_bytes_backward): -+ movl -4(%rsi,%rdx), %ebx -+ movl -8(%rsi,%rdx), %ecx -+ movl %ebx, -4(%rdi,%rdx) -+ movl %ecx, -8(%rdi,%rdx) -+ sub $8, %rdx -+ jmp L(mm_len_0_16_bytes_backward) -+ -+L(mm_len_5_8_bytes_backward): -+ movl (%rsi), %ebx -+ movl -4(%rsi,%rdx), %ecx -+ movl %ebx, (%rdi) -+ movl %ecx, -4(%rdi,%rdx) -+ -+L(mm_return): -+ vzeroupper -+ RETURN -+ -+/* Big length copy forward part. */ -+ -+ .p2align 4 -+L(mm_large_page_loop_forward): -+ vmovdqu (%r8, %rsi), %ymm0 -+ vmovdqu 32(%r8, %rsi), %ymm1 -+ vmovdqu 64(%r8, %rsi), %ymm2 -+ vmovdqu 96(%r8, %rsi), %ymm3 -+ vmovntdq %ymm0, (%r8) -+ vmovntdq %ymm1, 32(%r8) -+ vmovntdq %ymm2, 64(%r8) -+ vmovntdq %ymm3, 96(%r8) -+ lea 128(%r8), %r8 -+ cmp %r8, %rbx -+ ja L(mm_large_page_loop_forward) -+ sfence -+ jmp L(mm_copy_remaining_forward) -+ -+/* Big length copy backward part. 
*/ -+ .p2align 4 -+L(mm_large_page_loop_backward): -+ vmovdqu -64(%r9, %r8), %ymm0 -+ vmovdqu -32(%r9, %r8), %ymm1 -+ vmovntdq %ymm0, -64(%r9) -+ vmovntdq %ymm1, -32(%r9) -+ lea -64(%r9), %r9 -+ cmp %r9, %rbx -+ jb L(mm_large_page_loop_backward) -+ sfence -+ jmp L(mm_recalc_len) -+ -+END (MEMMOVE) -+ -+//ALIAS_SYMBOL(memcpy, MEMMOVE) --- -2.25.1 - diff --git a/aosp_diff/preliminary/bionic/0006-Obtain-x86-cache-info-from-CPU.patch b/aosp_diff/preliminary/bionic/0006-Obtain-x86-cache-info-from-CPU.patch deleted file mode 100644 index 204371f263..0000000000 --- a/aosp_diff/preliminary/bionic/0006-Obtain-x86-cache-info-from-CPU.patch +++ /dev/null @@ -1,594 +0,0 @@ -From e4ddc78e40f68994a1822c2e126e517c8f4060c5 Mon Sep 17 00:00:00 2001 -From: Elliott Hughes -Date: Fri, 19 Jul 2024 12:00:17 +0000 -Subject: [PATCH] Obtain x86 cache info from CPU - -The cache info today is hardcoded in cache.h -May not be optimal across various uarchs/SKUs -Leverage bionic sysconf to get the underlying cache. - -Improvements seen on RPL, for various sizes -memmove_non_overlapping -1.25M - 31% -1.5M - 30% -1.75M - 28% - -memcpy -1.25M - 31% -1.5M - 31% -1.75M - 30% - -The bionic benchmarks (which only go up to 128KiB) show no change, as -you'd expect. 
- -Test: bionic/tests/run-on-host.sh 64 && bionic/tests/run-on-host.sh 32 -Bug: 202102347 -Change-Id: I4bbad51794758873744149d0f58b86bb92ee307f -Signed-off-by: Vinay Prasad Kompella -Signed-off-by: Soni, Ravi Kumar ---- - libc/arch-x86/string/cache.h | 41 ------------------- - libc/arch-x86/string/sse2-memmove-slm.S | 19 +++++++-- - libc/arch-x86/string/sse2-memset-atom.S | 13 ++++-- - libc/arch-x86/string/sse2-memset-slm.S | 11 +++-- - libc/arch-x86/string/ssse3-memcpy-atom.S | 1 - - .../kabylake/string/avx2-memmove-kbl.S | 26 +++++++++--- - .../kabylake/string/avx2-memset-kbl.S | 8 +--- - libc/arch-x86_64/kabylake/string/cache.h | 36 ---------------- - libc/arch-x86_64/silvermont/string/cache.h | 36 ---------------- - .../silvermont/string/sse2-memmove-slm.S | 26 +++++++++--- - .../silvermont/string/sse2-memset-slm.S | 8 +--- - .../silvermont/string/sse4-memcmp-slm.S | 13 +----- - libc/bionic/libc_init_common.cpp | 26 ++++++++++++ - 13 files changed, 108 insertions(+), 156 deletions(-) - delete mode 100644 libc/arch-x86/string/cache.h - delete mode 100644 libc/arch-x86_64/kabylake/string/cache.h - delete mode 100644 libc/arch-x86_64/silvermont/string/cache.h - -diff --git a/libc/arch-x86/string/cache.h b/libc/arch-x86/string/cache.h -deleted file mode 100644 -index 33719a0cb..000000000 ---- a/libc/arch-x86/string/cache.h -+++ /dev/null -@@ -1,41 +0,0 @@ --/* --Copyright (c) 2010, Intel Corporation --All rights reserved. -- --Redistribution and use in source and binary forms, with or without --modification, are permitted provided that the following conditions are met: -- -- * Redistributions of source code must retain the above copyright notice, -- * this list of conditions and the following disclaimer. -- -- * Redistributions in binary form must reproduce the above copyright notice, -- * this list of conditions and the following disclaimer in the documentation -- * and/or other materials provided with the distribution. 
-- -- * Neither the name of Intel Corporation nor the names of its contributors -- * may be used to endorse or promote products derived from this software -- * without specific prior written permission. -- --THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND --ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED --WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE --DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR --ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES --(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; --LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON --ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT --(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS --SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --*/ -- --#ifdef FOR_ATOM --#define SHARED_CACHE_SIZE (512 * 1024) /* Atom L2 Cache */ --#endif --#ifdef FOR_SILVERMONT --#define SHARED_CACHE_SIZE (1024 * 1024) /* Silvermont L2 Cache */ --#endif -- --#define DATA_CACHE_SIZE (24 * 1024) /* Atom and Silvermont L1 Data Cache */ -- --#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) --#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) -diff --git a/libc/arch-x86/string/sse2-memmove-slm.S b/libc/arch-x86/string/sse2-memmove-slm.S -index 79b5d1b7e..7f4237486 100644 ---- a/libc/arch-x86/string/sse2-memmove-slm.S -+++ b/libc/arch-x86/string/sse2-memmove-slm.S -@@ -29,7 +29,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - - #define FOR_SILVERMONT --#include "cache.h" - - #ifndef MEMMOVE - # define MEMMOVE memmove_generic -@@ -94,6 +93,8 @@ name: \ - #define RETURN_END POP (%ebx); ret - #define RETURN RETURN_END; CFI_PUSH (%ebx) - -+#define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x -+ - .section .text.sse2,"ax",@progbits - ENTRY (MEMMOVE) - ENTRANCE -@@ -193,7 +194,13 @@ L(mm_len_128_or_more_forward): - cmp %edi, %ebx - jbe L(mm_copy_remaining_forward) - -- cmp $SHARED_CACHE_SIZE_HALF, %ecx -+ PUSH(%ebx) -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx -+ /* Restore ebx. We can place a pop before jump as it doesn't affect any flags. */ -+ POP(%ebx) -+ - jae L(mm_large_page_loop_forward) - - .p2align 4 -@@ -424,7 +431,13 @@ L(mm_len_128_or_more_backward): - cmp %edi, %ebx - jae L(mm_main_loop_backward_end) - -- cmp $SHARED_CACHE_SIZE_HALF, %ecx -+ PUSH(%ebx) -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx -+ /* Restore ebx. We can place a pop before jump as it doesn't affect any flags. */ -+ POP(%ebx) -+ - jae L(mm_large_page_loop_backward) - - .p2align 4 -diff --git a/libc/arch-x86/string/sse2-memset-atom.S b/libc/arch-x86/string/sse2-memset-atom.S -index 320afec11..e43ead0d1 100644 ---- a/libc/arch-x86/string/sse2-memset-atom.S -+++ b/libc/arch-x86/string/sse2-memset-atom.S -@@ -31,7 +31,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #include - - #define FOR_ATOM --#include "cache.h" - - #ifndef L - # define L(label) .L##label -@@ -64,6 +63,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #define RETURN RETURN_END; CFI_PUSH(%ebx) - #define JMPTBL(I, B) I - B - -+#define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x -+ - /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. 
*/ - # define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ -@@ -256,14 +257,20 @@ L(aligned_16_less128bytes): - ALIGN(4) - L(128bytesormore): - PUSH(%ebx) -- mov $SHARED_CACHE_SIZE, %ebx -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx - cmp %ebx, %ecx - jae L(128bytesormore_nt_start) - - - POP(%ebx) - # define RESTORE_EBX_STATE CFI_PUSH(%ebx) -- cmp $DATA_CACHE_SIZE, %ecx -+ PUSH(%ebx) -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx -+ POP(%ebx) - - jae L(128bytes_L2_normal) - subl $128, %ecx -diff --git a/libc/arch-x86/string/sse2-memset-slm.S b/libc/arch-x86/string/sse2-memset-slm.S -index 5cff141ad..e4c8fa108 100644 ---- a/libc/arch-x86/string/sse2-memset-slm.S -+++ b/libc/arch-x86/string/sse2-memset-slm.S -@@ -31,7 +31,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - #include - - #define FOR_SILVERMONT --#include "cache.h" - - #ifndef L - # define L(label) .L##label -@@ -64,6 +63,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - # define RETURN RETURN_END; CFI_PUSH(%ebx) - # define JMPTBL(I, B) I - B - -+#define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x -+ - /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. 
*/ - # define BRANCH_TO_JMPTBL_ENTRY(TABLE) \ -@@ -177,14 +178,18 @@ L(aligned_16_less128bytes): - ALIGN(4) - L(128bytesormore): - PUSH(%ebx) -- mov $SHARED_CACHE_SIZE, %ebx -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ mov __x86_shared_cache_size@GOTOFF(%ebx), %ebx - cmp %ebx, %ecx - jae L(128bytesormore_nt_start) - - POP(%ebx) - - PUSH(%ebx) -- mov $DATA_CACHE_SIZE, %ebx -+ SETUP_PIC_REG(bx) -+ add $_GLOBAL_OFFSET_TABLE_, %ebx -+ mov __x86_data_cache_size@GOTOFF(%ebx), %ebx - - cmp %ebx, %ecx - jae L(128bytes_L2_normal) -diff --git a/libc/arch-x86/string/ssse3-memcpy-atom.S b/libc/arch-x86/string/ssse3-memcpy-atom.S -index fe3082ee7..83e198504 100644 ---- a/libc/arch-x86/string/ssse3-memcpy-atom.S -+++ b/libc/arch-x86/string/ssse3-memcpy-atom.S -@@ -29,7 +29,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - - #define FOR_ATOM --#include "cache.h" - - #ifndef MEMCPY - # define MEMCPY memcpy_atom -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S -index 02e9ec1d2..77d628eb0 100644 ---- a/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S -+++ b/libc/arch-x86_64/kabylake/string/avx2-memmove-kbl.S -@@ -28,7 +28,7 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - --#include "cache.h" -+ - - #ifndef MEMMOVE - # define MEMMOVE memmove_avx2 -@@ -228,8 +228,9 @@ L(mm_len_256_or_more_forward): - cmp %r8, %rbx - jbe L(mm_copy_remaining_forward) - -- cmp $SHARED_CACHE_SIZE_HALF, %rdx -- jae L(mm_large_page_loop_forward) -+ cmp __x86_shared_cache_size_half(%rip), %rdx -+ -+ ja L(mm_overlapping_check_forward) - - .p2align 4 - L(mm_main_loop_forward): -@@ -497,8 +498,10 @@ L(mm_len_256_or_more_backward): - cmp %r9, %rbx - jae L(mm_recalc_len) - -- cmp $SHARED_CACHE_SIZE_HALF, %rdx -- jae L(mm_large_page_loop_backward) -+ cmp __x86_shared_cache_size_half(%rip), %rdx -+ -+ ja L(mm_overlapping_check_backward) -+ - - .p2align 4 - L(mm_main_loop_backward): -@@ -560,6 +563,12 @@ L(mm_return): - /* Big length copy forward part. */ - - .p2align 4 -+L(mm_overlapping_check_forward): -+ mov %rsi, %r9 -+ add %rdx, %r9 -+ cmp __x86_shared_cache_size(%rip), %r9 -+ jbe L(mm_main_loop_forward) -+ - L(mm_large_page_loop_forward): - vmovdqu (%r8, %rsi), %ymm0 - vmovdqu 32(%r8, %rsi), %ymm1 -@@ -577,6 +586,13 @@ L(mm_large_page_loop_forward): - - /* Big length copy backward part. */ - .p2align 4 -+L(mm_overlapping_check_backward): -+ mov %rdi, %r11 -+ sub %rsi, %r11 /* r11 = dst - src, diff */ -+ add %rdx, %r11 -+ cmp __x86_shared_cache_size(%rip), %r11 -+ jbe L(mm_main_loop_backward) -+ - L(mm_large_page_loop_backward): - vmovdqu -64(%r9, %r8), %ymm0 - vmovdqu -32(%r9, %r8), %ymm1 -diff --git a/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S b/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S -index ca62a9f8c..35d682a5d 100644 ---- a/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S -+++ b/libc/arch-x86_64/kabylake/string/avx2-memset-kbl.S -@@ -30,7 +30,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - #include - --#include "cache.h" - - #ifndef L - # define L(label) .L##label -@@ -117,11 +116,8 @@ L(16bytesormore): - cmpq %rcx, %rdx - je L(done) - --#ifdef SHARED_CACHE_SIZE -- cmp $SHARED_CACHE_SIZE, %r8 --#else -- cmp __x86_64_shared_cache_size(%rip), %r8 --#endif -+ cmp __x86_shared_cache_size(%rip), %r8 -+ - ja L(non_temporal_loop) - - ALIGN (4) -diff --git a/libc/arch-x86_64/kabylake/string/cache.h b/libc/arch-x86_64/kabylake/string/cache.h -deleted file mode 100644 -index 4131509fb..000000000 ---- a/libc/arch-x86_64/kabylake/string/cache.h -+++ /dev/null -@@ -1,36 +0,0 @@ --/* --Copyright (c) 2014, Intel Corporation --All rights reserved. -- --Redistribution and use in source and binary forms, with or without --modification, are permitted provided that the following conditions are met: -- -- * Redistributions of source code must retain the above copyright notice, -- * this list of conditions and the following disclaimer. -- -- * Redistributions in binary form must reproduce the above copyright notice, -- * this list of conditions and the following disclaimer in the documentation -- * and/or other materials provided with the distribution. -- -- * Neither the name of Intel Corporation nor the names of its contributors -- * may be used to endorse or promote products derived from this software -- * without specific prior written permission. -- --THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND --ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED --WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE --DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR --ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES --(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; --LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON --ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT --(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS --SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --*/ -- --/* Values are optimized for Core Architecture */ --#define SHARED_CACHE_SIZE (4096*1024) /* Core Architecture L2 Cache */ --#define DATA_CACHE_SIZE (24*1024) /* Core Architecture L1 Data Cache */ -- --#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) --#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) -diff --git a/libc/arch-x86_64/silvermont/string/cache.h b/libc/arch-x86_64/silvermont/string/cache.h -deleted file mode 100644 -index 3606d2a1a..000000000 ---- a/libc/arch-x86_64/silvermont/string/cache.h -+++ /dev/null -@@ -1,36 +0,0 @@ --/* --Copyright (c) 2014, Intel Corporation --All rights reserved. -- --Redistribution and use in source and binary forms, with or without --modification, are permitted provided that the following conditions are met: -- -- * Redistributions of source code must retain the above copyright notice, -- * this list of conditions and the following disclaimer. -- -- * Redistributions in binary form must reproduce the above copyright notice, -- * this list of conditions and the following disclaimer in the documentation -- * and/or other materials provided with the distribution. -- -- * Neither the name of Intel Corporation nor the names of its contributors -- * may be used to endorse or promote products derived from this software -- * without specific prior written permission. 
-- --THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND --ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED --WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE --DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR --ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES --(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; --LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON --ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT --(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS --SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --*/ -- --/* Values are optimized for Silvermont */ --#define SHARED_CACHE_SIZE (1024*1024) /* Silvermont L2 Cache */ --#define DATA_CACHE_SIZE (24*1024) /* Silvermont L1 Data Cache */ -- --#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2) --#define DATA_CACHE_SIZE_HALF (DATA_CACHE_SIZE / 2) -diff --git a/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -index 7024f4950..0530a6f59 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-memmove-slm.S -@@ -28,7 +28,6 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - --#include "cache.h" - - #ifndef MEMMOVE - # define MEMMOVE memmove_generic -@@ -189,8 +188,9 @@ L(mm_len_128_or_more_forward): - cmp %r8, %rbx - jbe L(mm_copy_remaining_forward) - -- cmp $SHARED_CACHE_SIZE_HALF, %rdx -- jae L(mm_large_page_loop_forward) -+ cmp __x86_shared_cache_size_half(%rip), %rdx -+ -+ ja L(mm_overlapping_check_forward) - - .p2align 4 - L(mm_main_loop_forward): -@@ -414,8 +414,9 @@ L(mm_len_128_or_more_backward): - cmp %r9, %rbx - jae L(mm_recalc_len) - -- cmp $SHARED_CACHE_SIZE_HALF, %rdx -- jae L(mm_large_page_loop_backward) -+ cmp __x86_shared_cache_size_half(%rip), %rdx -+ -+ ja L(mm_overlapping_check_backward) - - .p2align 4 - L(mm_main_loop_backward): -@@ -481,6 +482,13 @@ L(mm_return): - /* Big length copy forward part. */ - - .p2align 4 -+ -+L(mm_overlapping_check_forward): -+ mov %rsi, %r9 -+ add %rdx, %r9 -+ cmp __x86_shared_cache_size(%rip), %r9 -+ jbe L(mm_main_loop_forward) -+ - L(mm_large_page_loop_forward): - movdqu (%r8, %rsi), %xmm0 - movdqu 16(%r8, %rsi), %xmm1 -@@ -498,6 +506,14 @@ L(mm_large_page_loop_forward): - - /* Big length copy backward part. */ - .p2align 4 -+ -+L(mm_overlapping_check_backward): -+ mov %rdi, %r11 -+ sub %rsi, %r11 /* r11 = dst - src, diff */ -+ add %rdx, %r11 -+ cmp __x86_shared_cache_size(%rip), %r11 -+ jbe L(mm_main_loop_backward) -+ - L(mm_large_page_loop_backward): - movdqu -64(%r9, %r8), %xmm0 - movdqu -48(%r9, %r8), %xmm1 -diff --git a/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S b/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S -index cceadd297..84ab327c9 100644 ---- a/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse2-memset-slm.S -@@ -30,7 +30,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - #include - --#include "cache.h" - - #ifndef L - # define L(label) .L##label -@@ -116,11 +115,8 @@ L(128bytesmore): - cmpq %rcx, %rdx - je L(return) - --#ifdef SHARED_CACHE_SIZE -- cmp $SHARED_CACHE_SIZE, %r8 --#else -- cmp __x86_64_shared_cache_size(%rip), %r8 --#endif -+ cmp __x86_shared_cache_size(%rip), %r8 -+ - ja L(128bytesmore_nt) - - ALIGN (4) -diff --git a/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -index 6cfcd767f..c5980d431 100644 ---- a/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -+++ b/libc/arch-x86_64/silvermont/string/sse4-memcmp-slm.S -@@ -28,7 +28,6 @@ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - --#include "cache.h" - - #ifndef MEMCMP - # define MEMCMP memcmp_generic -@@ -353,11 +352,7 @@ L(less32bytesin256): - - ALIGN (4) - L(512bytesormore): --#ifdef DATA_CACHE_SIZE_HALF -- mov $DATA_CACHE_SIZE_HALF, %r8 --#else -- mov __x86_64_data_cache_size_half(%rip), %r8 --#endif -+ mov __x86_data_cache_size_half(%rip), %r8 - mov %r8, %r9 - shr $1, %r8 - add %r9, %r8 -@@ -669,11 +664,7 @@ L(less32bytesin256in2alinged): - - ALIGN (4) - L(512bytesormorein2aligned): --#ifdef DATA_CACHE_SIZE_HALF -- mov $DATA_CACHE_SIZE_HALF, %r8 --#else -- mov __x86_64_data_cache_size_half(%rip), %r8 --#endif -+ mov __x86_data_cache_size_half(%rip), %r8 - mov %r8, %r9 - shr $1, %r8 - add %r9, %r8 -diff --git a/libc/bionic/libc_init_common.cpp b/libc/bionic/libc_init_common.cpp -index c82c52e9d..6e4213a90 100644 ---- a/libc/bionic/libc_init_common.cpp -+++ b/libc/bionic/libc_init_common.cpp -@@ -63,6 +63,28 @@ __LIBC_HIDDEN__ constinit _Atomic(bool) __libc_memtag_stack; - __BIONIC_WEAK_VARIABLE_FOR_NATIVE_BRIDGE - const char* __progname; - -+#if defined(__i386__) || defined(__x86_64__) -+// Default sizes based on the old hard-coded values for Atom/Silvermont (x86) and Core 2 (x86-64)... 
-+size_t __x86_data_cache_size = 24 * 1024; -+size_t __x86_data_cache_size_half = __x86_data_cache_size / 2; -+size_t __x86_shared_cache_size = sizeof(long) == 8 ? 4096 * 1024 : 1024 * 1024; -+size_t __x86_shared_cache_size_half = __x86_shared_cache_size / 2; -+// ...overwritten at runtime based on the cpu's reported cache sizes. -+static void __libc_init_x86_cache_info() { -+ // Handle the case where during early boot /sys fs may not yet be ready, -+ // resulting in sysconf() returning 0, leading to crashes. -+ // In that case (basically just init), we keep the defaults. -+ if (sysconf(_SC_LEVEL1_DCACHE_SIZE) != 0) { -+ __x86_data_cache_size = sysconf(_SC_LEVEL1_DCACHE_SIZE); -+ __x86_data_cache_size_half = __x86_data_cache_size / 2; -+ } -+ if (sysconf(_SC_LEVEL2_CACHE_SIZE) != 0) { -+ __x86_shared_cache_size = sysconf(_SC_LEVEL2_CACHE_SIZE); -+ __x86_shared_cache_size_half = __x86_shared_cache_size / 2; -+ } -+} -+#endif -+ - void __libc_init_globals() { - // Initialize libc globals that are needed in both the linker and in libc. - // In dynamic binaries, this is run at least twice for different copies of the -@@ -172,6 +194,10 @@ void __libc_init_common() { - __system_properties_init(); // Requires 'environ'. - __libc_init_fdsan(); // Requires system properties (for debug.fdsan). - __libc_init_fdtrack(); -+ -+#if defined(__i386__) || defined(__x86_64__) -+ __libc_init_x86_cache_info(); -+#endif - } - - void __libc_init_fork_handler() { --- -2.34.1 - diff --git a/vendorsetup.sh b/vendorsetup.sh index 51f9c94771..6c328b8203 100755 --- a/vendorsetup.sh +++ b/vendorsetup.sh @@ -35,12 +35,8 @@ function lunch } # Get the exact value of a build variable. -function _get_build_var_cached() +function get_build_var() { - # Set the TARGET_RELEASE variable to the release_config for - # which we want to build CELADON. 
It should be one among - # $(TOP)/build/release/release_configs/* - TARGET_RELEASE=ap3a if [ "$1" = "COMMON_LUNCH_CHOICES" ] then valid_targets=`mixinup -t` @@ -50,8 +46,7 @@ function _get_build_var_cached() array=(${t/-/ }) target=${array[0]} if [[ "${valid_targets}" =~ "$target" ]]; then - tgt=$target-$TARGET_RELEASE-${array[1]} - LUNCH_MENU_CHOICES+=($tgt) + LUNCH_MENU_CHOICES+=($t) fi done echo ${LUNCH_MENU_CHOICES[@]}