arm64: lib: Optimize memcmp
Rewrite memcmp to improve performance. On small and medium inputs
performance is typically 25% better. Large inputs use a SIMD loop
processing 64 bytes per iteration, which is 50% faster than the
previous version.
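
The detection idea behind that SIMD loop can be sketched in C (an illustrative analogue only, not the kernel code; the helper name and the 8-byte chunking are ours, where the assembly uses 16-byte q registers): XOR corresponding chunks and merge the results, so a single compare against zero covers the whole block, and the exact differing byte is located only after a mismatch is known to exist.

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/* Nonzero iff the two n-byte blocks differ (n a multiple of 8). */
	static int block_differs(const unsigned char *a, const unsigned char *b,
				 size_t n)
	{
		uint64_t acc = 0;
		size_t i;

		for (i = 0; i < n; i += 8) {
			uint64_t x, y;

			memcpy(&x, a + i, 8);	/* unaligned load, like the ldr q0/q1 pairs */
			memcpy(&y, b + i, 8);
			acc |= x ^ y;		/* like the eor steps: nonzero iff any byte differs */
		}
		return acc != 0;		/* like the umaxp reduction + fmov/ccmp */
	}

The assembly folds the four per-iteration XOR results with umaxp so that one fmov plus a flag-setting compare decides an entire 64-byte iteration.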

Change-Id: I5fb12c6368e1a690695c8b1190ef8ab608516de5
Wilco1 authored and gotenksIN committed Jul 14, 2024
1 parent e983bdb commit cb693da
Showing 1 changed file with 142 additions and 89 deletions.
231 changes: 142 additions & 89 deletions arch/arm64/lib/memcmp.S
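
A second idea carries most of the win on short inputs: for 8-16 bytes the new code compares the first and last 8 bytes with overlapping unaligned loads, covering every byte without a loop. A C sketch of the trick (our illustration with a hypothetical helper name, not the kernel code):

	#include <stdint.h>
	#include <string.h>

	/* Nonzero iff a and b differ in the first n bytes, for 8 <= n <= 16.
	   The windows [0,8) and [n-8,n) overlap and together cover all n bytes. */
	static int differs_8_to_16(const unsigned char *a, const unsigned char *b,
				   size_t n)
	{
		uint64_t a0, b0, a1, b1;

		memcpy(&a0, a, 8);		/* like ldr data1, [src1] */
		memcpy(&b0, b, 8);
		memcpy(&a1, a + n - 8, 8);	/* like ldr data3, [src1end, -8] */
		memcpy(&b1, b + n - 8, 8);
		return ((a0 ^ b0) | (a1 ^ b1)) != 0;
	}

The same pattern recurs at 4-byte granularity in L(less8) and at 16-byte granularity in L(last_bytes) in the diff below.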
@@ -3,7 +3,7 @@
  * Copyright (c) 2013-2021, Arm Limited.
  *
  * Adapted from the original at:
- * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/memcmp.S
+ * https://github.com/ARM-software/optimized-routines/blob/7a9fd1603e/string/aarch64/memcmp.S
  */
 
 #include <linux/linkage.h>
@@ -16,90 +16,70 @@
 
 #define L(label) .L ## label
 
-/* Parameters and result.  */
-#define src1		x0
-#define src2		x1
-#define limit		x2
-#define result		w0
-
-/* Internal variables.  */
-#define data1		x3
-#define data1w		w3
-#define data1h		x4
-#define data2		x5
-#define data2w		w5
-#define data2h		x6
-#define tmp1		x7
-#define tmp2		x8
+#define src1		x0
+#define src2		x1
+#define limit		x2
+#define result		w0
+
+#define data1		x3
+#define data1w		w3
+#define data2		x4
+#define data2w		w4
+#define data3		x5
+#define data3w		w5
+#define data4		x6
+#define data4w		w6
+#define tmp		x6
+#define src1end	x7
+#define src2end	x8
 
 SYM_FUNC_START_WEAK_PI(memcmp)
-	subs	limit, limit, 8
-	b.lo	L(less8)
-
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	cmp	data1, data2
-	b.ne	L(return)
-
-	subs	limit, limit, 8
-	b.gt	L(more16)
-
-	ldr	data1, [src1, limit]
-	ldr	data2, [src2, limit]
-	b	L(return)
-
-L(more16):
-	ldr	data1, [src1], 8
-	ldr	data2, [src2], 8
-	cmp	data1, data2
-	bne	L(return)
-
-	/* Jump directly to comparing the last 16 bytes for 32 byte (or less)
-	   strings.  */
-	subs	limit, limit, 16
-	b.lo	L(last_bytes)
+	cmp	limit, 16
+	b.lo	L(less16)
+	ldp	data1, data3, [src1]
+	ldp	data2, data4, [src2]
+	ccmp	data1, data2, 0, ne
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
 
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	cmp	limit, 32
+	b.ls	L(last_bytes)
+	cmp	limit, 160
+	b.hs	L(loop_align)
+	sub	limit, limit, 32
 
-	/* We overlap loads between 0-32 bytes at either side of SRC1 when we
-	   try to align, so limit it only to strings larger than 128 bytes.  */
-	cmp	limit, 96
-	b.ls	L(loop16)
-
-	/* Align src1 and adjust src2 with bytes not yet done.  */
-	and	tmp1, src1, 15
-	add	limit, limit, tmp1
-	sub	src1, src1, tmp1
-	sub	src2, src2, tmp1
-
-	/* Loop performing 16 bytes per iteration using aligned src1.
-	   Limit is pre-decremented by 16 and must be larger than zero.
-	   Exit if <= 16 bytes left to do or if the data is not equal.  */
 	.p2align 4
-L(loop16):
-	ldp	data1, data1h, [src1], 16
-	ldp	data2, data2h, [src2], 16
-	subs	limit, limit, 16
-	ccmp	data1, data2, 0, hi
-	ccmp	data1h, data2h, 0, eq
-	b.eq	L(loop16)
 
+L(loop32):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
 	cmp	data1, data2
-	bne	L(return)
-	mov	data1, data1h
-	mov	data2, data2h
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	cmp	limit, 16
+	b.ls	L(last_bytes)
+
+	ldp	data1, data3, [src1, 32]
+	ldp	data2, data4, [src2, 32]
 	cmp	data1, data2
-	bne	L(return)
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+	add	src1, src1, 32
+	add	src2, src2, 32
+L(last64):
+	subs	limit, limit, 32
+	b.hi	L(loop32)
 
 	/* Compare last 1-16 bytes using unaligned access.  */
 L(last_bytes):
-	add	src1, src1, limit
-	add	src2, src2, limit
-	ldp	data1, data1h, [src1]
-	ldp	data2, data2h, [src2]
-	cmp	data1, data2
-	bne	L(return)
-	mov	data1, data1h
-	mov	data2, data2h
+	ldp	data1, data3, [src1end, -16]
+	ldp	data2, data4, [src2end, -16]
+L(return2):
 	cmp	data1, data2
+	csel	data1, data1, data3, ne
+	csel	data2, data2, data4, ne
 
 	/* Compare data bytes and set return value to 0, -1 or 1.  */
 L(return):
@@ -108,31 +88,104 @@ L(return):
 	rev	data2, data2
 #endif
 	cmp	data1, data2
-L(ret_eq):
 	cset	result, ne
 	cneg	result, result, lo
 	ret
 
 	.p2align 4
-	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+L(less16):
+	add	src1end, src1, limit
+	add	src2end, src2, limit
+	tbz	limit, 3, L(less8)
+	ldr	data1, [src1]
+	ldr	data2, [src2]
+	ldr	data3, [src1end, -8]
+	ldr	data4, [src2end, -8]
+	b	L(return2)
+
+	.p2align 4
 L(less8):
-	adds	limit, limit, 4
-	b.lo	L(less4)
-	ldr	data1w, [src1], 4
-	ldr	data2w, [src2], 4
+	tbz	limit, 2, L(less4)
+	ldr	data1w, [src1]
+	ldr	data2w, [src2]
+	ldr	data3w, [src1end, -4]
+	ldr	data4w, [src2end, -4]
+	b	L(return2)
+
+L(less4):
+	tbz	limit, 1, L(less2)
+	ldrh	data1w, [src1]
+	ldrh	data2w, [src2]
 	cmp	data1w, data2w
 	b.ne	L(return)
-	sub	limit, limit, 4
-L(less4):
-	adds	limit, limit, 4
-	beq	L(ret_eq)
-L(byte_loop):
-	ldrb	data1w, [src1], 1
-	ldrb	data2w, [src2], 1
-	subs	limit, limit, 1
-	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
-	b.eq	L(byte_loop)
+L(less2):
+	mov	result, 0
+	tbz	limit, 0, L(return_zero)
+	ldrb	data1w, [src1end, -1]
+	ldrb	data2w, [src2end, -1]
 	sub	result, data1w, data2w
+L(return_zero):
 	ret
 
+L(loop_align):
+	ldp	data1, data3, [src1, 16]
+	ldp	data2, data4, [src2, 16]
+	cmp	data1, data2
+	ccmp	data3, data4, 0, eq
+	b.ne	L(return2)
+
+	/* Align src2 and adjust src1, src2 and limit.  */
+	and	tmp, src2, 15
+	sub	tmp, tmp, 16
+	sub	src2, src2, tmp
+	add	limit, limit, tmp
+	sub	src1, src1, tmp
+	sub	limit, limit, 64 + 16
+
+	.p2align 4
+L(loop64):
+	ldr	q0, [src1, 16]
+	ldr	q1, [src2, 16]
+	subs	limit, limit, 64
+	ldr	q2, [src1, 32]
+	ldr	q3, [src2, 32]
+	eor	v0.16b, v0.16b, v1.16b
+	eor	v1.16b, v2.16b, v3.16b
+	ldr	q2, [src1, 48]
+	ldr	q3, [src2, 48]
+	umaxp	v0.16b, v0.16b, v1.16b
+	ldr	q4, [src1, 64]!
+	ldr	q5, [src2, 64]!
+	eor	v1.16b, v2.16b, v3.16b
+	eor	v2.16b, v4.16b, v5.16b
+	umaxp	v1.16b, v1.16b, v2.16b
+	umaxp	v0.16b, v0.16b, v1.16b
+	umaxp	v0.16b, v0.16b, v0.16b
+	fmov	tmp, d0
+	ccmp	tmp, 0, 0, hi
+	b.eq	L(loop64)
+
+	/* If equal, process last 1-64 bytes using scalar loop.  */
+	add	limit, limit, 64 + 16
+	cbz	tmp, L(last64)
+
+	/* Determine the 8-byte aligned offset of the first difference.  */
+#ifdef __AARCH64EB__
+	rev16	tmp, tmp
+#endif
+	rev	tmp, tmp
+	clz	tmp, tmp
+	bic	tmp, tmp, 7
+	sub	tmp, tmp, 48
+	ldr	data1, [src1, tmp]
+	ldr	data2, [src2, tmp]
+#ifndef __AARCH64EB__
+	rev	data1, data1
+	rev	data2, data2
+#endif
+	mov	result, 1
+	cmp	data1, data2
+	cneg	result, result, lo
+	ret
+
 SYM_FUNC_END_PI(memcmp)
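
One detail worth spelling out: L(return) turns a 64-bit word difference into memcmp's signed result by byte-reversing both words on little-endian, so the first differing byte becomes the most significant one, at which point an unsigned compare gives the right ordering. A rough C analogue (a sketch under that one assumption; __builtin_bswap64 is the GCC/Clang builtin standing in for rev):

	#include <stdint.h>

	/* Order two 8-byte chunks loaded from memory on a little-endian machine. */
	static int order_chunks(uint64_t d1, uint64_t d2)
	{
		d1 = __builtin_bswap64(d1);	/* the "rev data1, data1" step */
		d2 = __builtin_bswap64(d2);
		if (d1 == d2)
			return 0;		/* the cset/cneg pair encodes this branchlessly */
		return d1 < d2 ? -1 : 1;
	}

On big-endian the reversal is unnecessary, which is exactly what the __AARCH64EB__ conditionals express.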
