From 2889ba7dfeed811e9dda1c38e998d18ae0085bcf Mon Sep 17 00:00:00 2001
From: Alan Kelly
Date: Wed, 18 Dec 2024 01:01:10 -0800
Subject: [PATCH] Better Arm64 input register loading and incrementation.

Believe it or not, these small changes make kernels about 3% faster: the
Arm64 input and weight loads now use post-index addressing, and the k
counter counts down from kc with subs/bne, so the separate pointer
increments and the cmp drop out of every inner-loop iteration.

PiperOrigin-RevId: 707440293
---
 gemm_compiler/aarch64_template.py | 14 +--
 gemm_compiler/avx512f_template.py | 4 +-
 gemm_compiler/base_architecture.py | 4 +-
 gemm_compiler/generate.py | 4 +-
 gemm_compiler/neondot_template.py | 20 ----
 gemm_compiler/neonfma_template.py | 30 ++---
 gemm_compiler/x64_template.py | 14 +--
 ...10x16-minmax-asm-amd64-avx512f-broadcast.S | 102 ++++++++--------
 ...10x32-minmax-asm-amd64-avx512f-broadcast.S | 102 ++++++++--------
 ...11x16-minmax-asm-amd64-avx512f-broadcast.S | 112 +++++++++---------
 ...11x32-minmax-asm-amd64-avx512f-broadcast.S | 112 +++++++++---------
 ...emm-1x16-minmax-asm-aarch64-neonfma-ld32.S | 30 ++---
 ...-1x16-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...-1x32-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...-1x64-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...gemm-1x8-minmax-asm-aarch64-neonfma-ld32.S | 23 ++--
 ...emm-2x16-minmax-asm-aarch64-neonfma-ld32.S | 47 +++-----
 ...-2x16-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...-2x32-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...-2x64-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...gemm-2x8-minmax-asm-aarch64-neonfma-ld32.S | 35 +++---
 ...emm-3x16-minmax-asm-aarch64-neonfma-ld32.S | 64 ++++------
 ...-3x16-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...-3x32-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...-3x64-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...gemm-3x8-minmax-asm-aarch64-neonfma-ld32.S | 47 +++-----
 ...emm-4x16-minmax-asm-aarch64-neonfma-ld32.S | 81 ++++++-------
 ...-4x16-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...-4x32-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...-4x64-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...gemm-4x8-minmax-asm-aarch64-neonfma-ld32.S | 59 ++++-----
 ...emm-5x16-minmax-asm-aarch64-neonfma-ld32.S | 98 +++++++--------
 ...-5x16-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...-5x32-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...-5x64-minmax-asm-amd64-avx512f-broadcast.S | 2 +-
 ...gemm-5x8-minmax-asm-aarch64-neonfma-ld32.S | 71 +++++------
 ...-6x16-minmax-asm-amd64-avx512f-broadcast.S | 62 +++++-----
 ...-6x32-minmax-asm-amd64-avx512f-broadcast.S | 62 +++++-----
 ...-7x16-minmax-asm-amd64-avx512f-broadcast.S | 72 +++++------
 ...-7x32-minmax-asm-amd64-avx512f-broadcast.S | 72 +++++------
 ...-8x16-minmax-asm-amd64-avx512f-broadcast.S | 82 ++++++-------
 ...-8x32-minmax-asm-amd64-avx512f-broadcast.S | 82 ++++++-------
 ...-9x16-minmax-asm-amd64-avx512f-broadcast.S | 92 +++++++-------
 ...-9x32-minmax-asm-amd64-avx512f-broadcast.S | 92 +++++++-------
 ...w-gemm-10x16-minmax-asm-amd64-avx512vnni.S | 102 ++++++++--------
 ...w-gemm-10x32-minmax-asm-amd64-avx512vnni.S | 102 ++++++++--------
 ...w-gemm-11x16-minmax-asm-amd64-avx512vnni.S | 112 +++++++++---------
 ...w-gemm-11x32-minmax-asm-amd64-avx512vnni.S | 112 +++++++++---------
 ...emm-1x16-minmax-asm-aarch64-neondot-ld32.S | 30 ++---
 ...8w-gemm-1x16-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...8w-gemm-1x32-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...8w-gemm-1x64-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...gemm-1x8-minmax-asm-aarch64-neondot-ld32.S | 23 ++--
 ...emm-2x16-minmax-asm-aarch64-neondot-ld32.S | 47 +++-----
 ...8w-gemm-2x16-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...8w-gemm-2x32-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...8w-gemm-2x64-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...gemm-2x8-minmax-asm-aarch64-neondot-ld32.S | 35 +++---
 ...emm-3x16-minmax-asm-aarch64-neondot-ld32.S | 64 ++++------
 ...8w-gemm-3x16-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...8w-gemm-3x32-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...8w-gemm-3x64-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...gemm-3x8-minmax-asm-aarch64-neondot-ld32.S | 47 +++-----
 ...emm-4x16-minmax-asm-aarch64-neondot-ld32.S | 81 ++++++-------
 ...8w-gemm-4x16-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...8w-gemm-4x32-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...8w-gemm-4x64-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...gemm-4x8-minmax-asm-aarch64-neondot-ld32.S | 59 ++++-----
 ...8w-gemm-5x16-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...8w-gemm-5x32-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...8w-gemm-5x64-minmax-asm-amd64-avx512vnni.S | 2 +-
 ...8w-gemm-6x16-minmax-asm-amd64-avx512vnni.S | 62 +++++-----
 ...8w-gemm-6x32-minmax-asm-amd64-avx512vnni.S | 62 +++++-----
 ...8w-gemm-7x16-minmax-asm-amd64-avx512vnni.S | 72 +++++------
 ...8w-gemm-7x32-minmax-asm-amd64-avx512vnni.S | 72 +++++------
 ...8w-gemm-8x16-minmax-asm-amd64-avx512vnni.S | 82 ++++++-------
 ...8w-gemm-8x32-minmax-asm-amd64-avx512vnni.S | 82 ++++++-------
 ...8w-gemm-9x16-minmax-asm-amd64-avx512vnni.S | 92 +++++++-------
 ...8w-gemm-9x32-minmax-asm-amd64-avx512vnni.S | 92 +++++++-------
 79 files changed, 1485 insertions(+), 1694 deletions(-)

diff --git a/gemm_compiler/aarch64_template.py b/gemm_compiler/aarch64_template.py
index fd7361fcfb8c..67cac7e5fcda 100644
--- a/gemm_compiler/aarch64_template.py
+++ b/gemm_compiler/aarch64_template.py
@@ -165,7 +165,7 @@ def inner_loop(self, M, N):
             w_step=self.register_bytes() * N_COUNT,
         )
       for l in self.weights_asm()['loop']:
-        if N_COUNT % 2 == 0:
+        if N_COUNT % 2 != 0:
           asm_string += l.format(
               W_ptr=self.w_ptr_register(),
               W=self.w_registers()[nr],
@@ -276,17 +276,15 @@ def clamp_inputs_and_outputs(
   def increment_ptr(self, ptr, step):
     return f'add {ptr}, {ptr}, {step}\n'

-  def zero_gp_register(self, reg):
-    return f'eor {reg}, {reg}, {reg}\n'
+  def initialize_k_register(self, reg):
+    kc_register = self.kc_register()
+    return f'mov {reg}, {kc_register}\n'

   def cmp_k_and_jump_if_less(self, label):
     kc_register = self.kc_register()
     k_register = self.k_register()
-    return """add {k_register}, {k_register}, 4
-      cmp {kc_register}, {k_register}
-      bne {label}\n""".format(
-        label=label, k_register=k_register, kc_register=kc_register
-    )
+    return f"""subs {k_register}, {k_register}, 4
+      bne {label}\n"""

   def epilogue(self, M, N, isa):
     restore_stack = """
diff --git a/gemm_compiler/avx512f_template.py b/gemm_compiler/avx512f_template.py
index af33712eb31c..f8f941b32ec5 100644
--- a/gemm_compiler/avx512f_template.py
+++ b/gemm_compiler/avx512f_template.py
@@ -181,7 +181,7 @@ def store(
     if pop_c:
       asm_string += '\n' + '# Pop output pointers from the stack.\n'
       c_reg_offset = 0
-      POP_C = 'mov {C_REG}, [rsp + {offset}]\n'
+      POP_C = 'mov {C_REG}, [rsp - {offset}]\n'
       for mr in range(0, M):
         sp_offset = 128 + (mr) * 16 + 8
         asm_string += POP_C.format(C_REG=cm_registers[mr], offset=sp_offset)
@@ -208,7 +208,7 @@ def store(
       )
     if pop_c:
       asm_string += '\n' + '# Write output pointers to the stack.\n'
-      POP_C = 'mov [rsp + {offset}], {C_REG}\n'
+      POP_C = 'mov [rsp - {offset}], {C_REG}\n'
       for mr in range(0, M):
         sp_offset = 128 + (mr) * 16 + 8
         asm_string += POP_C.format(C_REG=cm_registers[mr], offset=sp_offset)
diff --git a/gemm_compiler/base_architecture.py b/gemm_compiler/base_architecture.py
index 3caccac4c8a8..9133ef58eb54 100644
--- a/gemm_compiler/base_architecture.py
+++ b/gemm_compiler/base_architecture.py
@@ -143,8 +143,8 @@ def increment_ptr(self, ptr, step):
     raise NotImplementedError

   @abstractmethod
-  def zero_gp_register(self, reg):
-    """Zero the given general purpose register."""
+  def initialize_k_register(self, reg):
+    """Initialize the given general purpose register for inner loop control."""
     raise NotImplementedError

   @abstractmethod
diff --git a/gemm_compiler/generate.py b/gemm_compiler/generate.py
index 097121c0034f..d5a99898f994 100644
--- a/gemm_compiler/generate.py
+++ b/gemm_compiler/generate.py
@@ -36,8 +36,8 @@ def generate_gemm_microkernel(

   # the outer loop label
   asm_string += '\nouter_loop:\n'
-  asm_string += '# Zero k counter.\n'
-  asm_string += isa.zero_gp_register(k_register)
+  asm_string += '# Initialize k counter.\n'
+  asm_string += isa.initialize_k_register(k_register)

   # Read a registers from the stack if required
   asm_string += isa.read_a_registers(M=M)
diff --git a/gemm_compiler/neondot_template.py b/gemm_compiler/neondot_template.py
index e9b2bf3f37a9..70400d9740a7 100644
--- a/gemm_compiler/neondot_template.py
+++ b/gemm_compiler/neondot_template.py
@@ -74,26 +74,6 @@ def quantization_params(self):
   def quantization_params_register(self):
     return 'x24'

-  def input_asm(self):
-    in_asm = {
-        'loop': [
-            'ldr d{AM}, [{AM_ptr}, {a_offset}]\n',
-        ]
-    }
-    return in_asm
-
-  def weights_asm(self):
-    w_asm = {
-        'loop': [
-            'ldr q{W}, [{W_ptr}, {offset}]\n',
-        ],
-        'loop_2': [
-            'ldp q{W}, q{W_1}, [{W_ptr}, {offset}]\n',
-        ],
-        'after': 'add {W}, {W}, {w_step}\n',
-    }
-    return w_asm
-
   def compute_asm(self):
     c_asm = {
         'loop': ['sdot v{ACC}.4s, v{W}.16b, v{A}.4b[0]\n'],
diff --git a/gemm_compiler/neonfma_template.py b/gemm_compiler/neonfma_template.py
index 2a18e459a9fb..55e28e830fdf 100644
--- a/gemm_compiler/neonfma_template.py
+++ b/gemm_compiler/neonfma_template.py
@@ -60,7 +60,7 @@ def w_registers(self):
   def input_asm(self):
     in_asm = {
         'loop': [
-            'ldr d{AM}, [{AM_ptr}, {a_offset}]\n',
+            'ldr s{AM}, [{AM_ptr}], 4\n',
         ]
     }
     return in_asm
@@ -68,12 +68,11 @@ def input_asm(self):
   def weights_asm(self):
     w_asm = {
         'loop': [
-            'ldr q{W}, [{W_ptr}, {offset}]\n',
+            'ldr q{W}, [{W_ptr}], 16\n',
         ],
         'loop_2': [
-            'ldp q{W}, q{W_1}, [{W_ptr}, {offset}]\n',
+            'ldp q{W}, q{W_1}, [{W_ptr}], 32\n',
         ],
-        'after': 'add {W}, {W}, {w_step}\n',
     }
     return w_asm
@@ -139,22 +138,21 @@ def store(
           cmp {nc}, {n_step}
           b.lo tail_{N_2}\n""".format(n_step=N, N_2=N // 2, nc=nc_reg)
     for mr in range(0, M):
-      asm_string += 'stp q{ACC}, q{ACC_1}, [{c_reg}]\n'.format(
+      asm_string += 'stp q{ACC}, q{ACC_1}, [{c_reg}], 32\n'.format(
           ACC=accumulators[mr],
          ACC_1=accumulators[M + mr],
           c_reg=cm_registers[mr],
       )
       for nr in range(2, N_COUNT, 2):
-        asm_string += 'stp q{ACC}, q{ACC_1}, [{c_reg}, {offset}]\n'.format(
+        asm_string += 'stp q{ACC}, q{ACC_1}, [{c_reg}], 32\n'.format(
            ACC=accumulators[M * 2 + mr],
            ACC_1=accumulators[M * 3 + mr],
            c_reg=cm_registers[mr],
-           offset=self.register_bytes() * nr,
        )
     for mr in range(0, M):
-      asm_string += 'add {cm}, {cm}, {cn_stride}\n'.format(
-          cn_stride=N_COUNT * 16, cm=cm_registers[mr]
-      )
+      AM_PTR = self.am_registers()[mr]
+      kc_register = self.kc_register()
+      asm_string += f'sub {AM_PTR}, {AM_PTR}, {kc_register}\n'
     CHECK = """
       sub {nc}, {nc}, {n_step}
       b.ne outer_loop
@@ -167,7 +165,7 @@ def store(
 \ntail_8:
       tbz {nc_lo}, 3, tail_4\n""".format(nc_lo=nc_lo)
     for mr in range(0, M):
-      asm_string += 'stp q{ACC}, q{ACC_1}, [{c_reg}]\n'.format(
+      asm_string += 'stp q{ACC}, q{ACC_1}, [{c_reg}], 32\n'.format(
          ACC=accumulators[mr],
          ACC_1=accumulators[mr + M],
          c_reg=cm_registers[mr],
@@ -179,30 +177,24 @@ def store(
       asm_string += 'mov v{ACC0}.16b, v{ACC1}.16b\n'.format(
           ACC0=accumulators[mr + M], ACC1=accumulators[mr + 3 * M]
       )
-    for mr in range(0, M):
-      asm_string += 'add {cm}, {cm}, 32\n'.format(cm=cm_registers[mr])
     asm_string += """
 \ntail_4:
       tbz {nc_lo}, 2, tail_2\n""".format(nc_lo=nc_lo)
     for mr in range(0, M):
-      asm_string += 'str q{ACC}, [{c_reg}]\n'.format(
+      asm_string += 'str q{ACC}, [{c_reg}], 16\n'.format(
          ACC=accumulators[mr], c_reg=cm_registers[mr]
       )
     for mr in range(0, M):
       asm_string += 'mov v{ACC0}.16b, v{ACC1}.16b\n'.format(
           ACC0=accumulators[mr], ACC1=accumulators[mr + M]
       )
-    for mr in range(0, M):
-      asm_string += 'add {cm}, {cm}, 16\n'.format(cm=cm_registers[mr])
     asm_string += """
 \ntail_2:
       tbz {nc_lo}, 1, tail_1\n""".format(nc_lo=nc_lo)
     for mr in range(0, M):
-      asm_string += 'str d{ACC}, [{c_reg}]\n'.format(
+      asm_string += 'str d{ACC}, [{c_reg}], 8\n'.format(
          ACC=accumulators[mr], c_reg=cm_registers[mr]
       )
-    for mr in range(0, M):
-      asm_string += 'add {c_reg}, {c_reg}, 8\n'.format(c_reg=cm_registers[mr])
     for mr in range(0, M):
       asm_string += 'dup d{ACC}, v{ACC}.d[1]\n'.format(ACC=accumulators[mr])
     asm_string += """
diff --git a/gemm_compiler/x64_template.py b/gemm_compiler/x64_template.py
index f5f43eb471b9..637cd565beb6 100644
--- a/gemm_compiler/x64_template.py
+++ b/gemm_compiler/x64_template.py
@@ -196,8 +196,8 @@ def input_output_register_setup(self, M):
       cmovle {aM}, {aM_1}
       cmovle {cM}, {cM_1}\n"""
     INPUT_OUTPUT_REGISTER_PUSH = """
-      mov [rsp + {a_rsp_offset}], {aM}
-      mov [rsp + {c_rsp_offset}], {cM}\n"""
+      mov [rsp - {a_rsp_offset}], {aM}
+      mov [rsp - {c_rsp_offset}], {cM}\n"""
     ret = ''
     if self.stack_size(M) != 0:
       ret += """sub rsp, {stack_size}\n""".format(
@@ -208,11 +208,11 @@ def input_output_register_setup(self, M):
       ret += (
           '# Write rsi (a pointer) to the stack as we need the register.\n'
       )
-      ret += 'mov [rsp + 128], rsi\n'
+      ret += 'mov [rsp - 128], rsi\n'
       ret += (
           '# Write r10 (c pointer) to the stack as we need the register.\n'
       )
-      ret += 'mov [rsp + 136], r10\n'
+      ret += 'mov [rsp - 136], r10\n'
       for mr in range(1, M):
         # cycle size of 2 if required
         if M > self.max_M_before_spilling():
@@ -262,7 +262,7 @@ def read_a_registers(self, M):
     if M <= self.max_M_before_spilling():
       return ''
     ret = '# Read a pointers from stack into GP registers.\n'
-    POP_A = 'mov {aM}, [rsp + {a_rsp_offset}]\n'
+    POP_A = 'mov {aM}, [rsp - {a_rsp_offset}]\n'
     for mr in range(0, M):
       a_rsp_offset = 128 + mr * 16
       ret += POP_A.format(aM=registers[mr], a_rsp_offset=a_rsp_offset)
@@ -272,7 +272,7 @@ def read_a_registers(self, M):
   def increment_ptr(self, ptr, step):
     return f'add {ptr}, {step}\n'

-  def zero_gp_register(self, reg):
+  def initialize_k_register(self, reg):
     return f'mov {reg}, 0\n'

   def cmp_k_and_jump_if_less(self, label):
@@ -287,7 +287,7 @@ def cmp_k_and_jump_if_less(self, label):

   def load_from_stack(self, reg, offset):
     """Load 8 bytes from the given offset from the stack pointer to reg."""
-    return f'mov {reg}, [rsp + {offset}]\n'
+    return f'mov {reg}, [rsp - {offset}]\n'

   def epilogue(self, M, N, isa):
     restore_stack = '\nreturn:\n'
diff --git a/src/f32-gemm/gen/f32-gemm-10x16-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-10x16-minmax-asm-amd64-avx512f-broadcast.S
index f7cc3e8dd2ed..b2da35107d20 100644
--- a/src/f32-gemm/gen/f32-gemm-10x16-minmax-asm-amd64-avx512f-broadcast.S
+++ b/src/f32-gemm/gen/f32-gemm-10x16-minmax-asm-amd64-avx512f-broadcast.S
@@ -27,9 +27,9 @@ BEGIN_FUNCTION
xnn_f32_gemm_minmax_ukernel_10x16__asm_amd64_avx512f_broadcast # Load cm_stride. mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,8 +88,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -100,8 +100,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -112,8 +112,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Clamp a & c pointers if mr <= 8 mov rsi, rax @@ -124,8 +124,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 256], rsi - mov [rsp + 264], r10 + mov [rsp - 256], rsi + mov [rsp - 264], r10 # Clamp a & c pointers if mr <= 9 mov rax, rsi @@ -136,23 +136,23 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 272], rax - mov [rsp + 280], r13 + mov [rsp - 272], rax + mov [rsp - 280], r13 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] - mov rbp, [rsp + 256] - mov r8, [rsp + 272] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] + mov rbp, [rsp - 256] + mov r8, [rsp - 272] # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] @@ -217,16 +217,16 @@ inner_loop: vmaxps zmm20, zmm0, zmm20 # Pop output pointers from the stack. 
- mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] - mov rbp, [rsp + 264] - mov r8, [rsp + 280] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] + mov rbp, [rsp - 264] + mov r8, [rsp - 280] # Check whether full or partial store. cmp rcx, 16 @@ -254,16 +254,16 @@ inner_loop: add r8, 64 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx - mov [rsp + 264], rbp - mov [rsp + 280], r8 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx + mov [rsp - 264], rbp + mov [rsp - 280], r8 sub rcx, 16 jne outer_loop diff --git a/src/f32-gemm/gen/f32-gemm-10x32-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-10x32-minmax-asm-amd64-avx512f-broadcast.S index 2035e9e324c8..efd156a9ab43 100644 --- a/src/f32-gemm/gen/f32-gemm-10x32-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-10x32-minmax-asm-amd64-avx512f-broadcast.S @@ -27,9 +27,9 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x32__asm_amd64_avx512f_broadcast # Load cm_stride. mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,8 +88,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -100,8 +100,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -112,8 +112,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - 
mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Clamp a & c pointers if mr <= 8 mov rsi, rax @@ -124,8 +124,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 256], rsi - mov [rsp + 264], r10 + mov [rsp - 256], rsi + mov [rsp - 264], r10 # Clamp a & c pointers if mr <= 9 mov rax, rsi @@ -136,23 +136,23 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_10x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 272], rax - mov [rsp + 280], r13 + mov [rsp - 272], rax + mov [rsp - 280], r13 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] - mov rbp, [rsp + 256] - mov r8, [rsp + 272] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] + mov rbp, [rsp - 256] + mov r8, [rsp - 272] # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] @@ -258,16 +258,16 @@ inner_loop: vmaxps zmm30, zmm0, zmm30 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] - mov rbp, [rsp + 264] - mov r8, [rsp + 280] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] + mov rbp, [rsp - 264] + mov r8, [rsp - 280] # Check whether full or partial store. cmp rcx, 32 @@ -305,16 +305,16 @@ inner_loop: add r8, 128 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx - mov [rsp + 264], rbp - mov [rsp + 280], r8 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx + mov [rsp - 264], rbp + mov [rsp - 280], r8 sub rcx, 32 jne outer_loop diff --git a/src/f32-gemm/gen/f32-gemm-11x16-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-11x16-minmax-asm-amd64-avx512f-broadcast.S index a4b89bdcbf24..c8c4c288e71e 100644 --- a/src/f32-gemm/gen/f32-gemm-11x16-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-11x16-minmax-asm-amd64-avx512f-broadcast.S @@ -27,9 +27,9 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16__asm_amd64_avx512f_broadcast # Load cm_stride. mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. 
- mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,8 +88,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -100,8 +100,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -112,8 +112,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Clamp a & c pointers if mr <= 8 mov rsi, rax @@ -124,8 +124,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 256], rsi - mov [rsp + 264], r10 + mov [rsp - 256], rsi + mov [rsp - 264], r10 # Clamp a & c pointers if mr <= 9 mov rax, rsi @@ -136,8 +136,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 272], rax - mov [rsp + 280], r13 + mov [rsp - 272], rax + mov [rsp - 280], r13 # Clamp a & c pointers if mr <= 10 mov rsi, rax @@ -148,24 +148,24 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 288], rsi - mov [rsp + 296], r10 + mov [rsp - 288], rsi + mov [rsp - 296], r10 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] - mov rbp, [rsp + 256] - mov r8, [rsp + 272] - mov rdi, [rsp + 288] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] + mov rbp, [rsp - 256] + mov r8, [rsp - 272] + mov rdi, [rsp - 288] # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] @@ -235,17 +235,17 @@ inner_loop: vmaxps zmm21, zmm0, zmm21 # Pop output pointers from the stack. 
- mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] - mov rbp, [rsp + 264] - mov r8, [rsp + 280] - mov rdi, [rsp + 296] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] + mov rbp, [rsp - 264] + mov r8, [rsp - 280] + mov rdi, [rsp - 296] # Check whether full or partial store. cmp rcx, 16 @@ -275,17 +275,17 @@ inner_loop: add rdi, 64 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx - mov [rsp + 264], rbp - mov [rsp + 280], r8 - mov [rsp + 296], rdi + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx + mov [rsp - 264], rbp + mov [rsp - 280], r8 + mov [rsp - 296], rdi sub rcx, 16 jne outer_loop diff --git a/src/f32-gemm/gen/f32-gemm-11x32-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-11x32-minmax-asm-amd64-avx512f-broadcast.S index 0051ffe86988..8e090a0b1822 100644 --- a/src/f32-gemm/gen/f32-gemm-11x32-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-11x32-minmax-asm-amd64-avx512f-broadcast.S @@ -27,9 +27,9 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x32__asm_amd64_avx512f_broadcast # Load cm_stride. mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. 
- mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,8 +88,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -100,8 +100,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -112,8 +112,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Clamp a & c pointers if mr <= 8 mov rsi, rax @@ -124,8 +124,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 256], rsi - mov [rsp + 264], r10 + mov [rsp - 256], rsi + mov [rsp - 264], r10 # Clamp a & c pointers if mr <= 9 mov rax, rsi @@ -136,8 +136,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 272], rax - mov [rsp + 280], r13 + mov [rsp - 272], rax + mov [rsp - 280], r13 # Clamp a & c pointers if mr <= 10 mov rsi, rax @@ -148,24 +148,24 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_11x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 288], rsi - mov [rsp + 296], r10 + mov [rsp - 288], rsi + mov [rsp - 296], r10 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] - mov rbp, [rsp + 256] - mov r8, [rsp + 272] - mov rdi, [rsp + 288] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] + mov rbp, [rsp - 256] + mov r8, [rsp - 272] + mov rdi, [rsp - 288] # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] @@ -280,17 +280,17 @@ inner_loop: vmaxps zmm13, zmm0, zmm13 # Pop output pointers from the stack. 
- mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] - mov rbp, [rsp + 264] - mov r8, [rsp + 280] - mov rdi, [rsp + 296] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] + mov rbp, [rsp - 264] + mov r8, [rsp - 280] + mov rdi, [rsp - 296] # Check whether full or partial store. cmp rcx, 32 @@ -331,17 +331,17 @@ inner_loop: add rdi, 128 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx - mov [rsp + 264], rbp - mov [rsp + 280], r8 - mov [rsp + 296], rdi + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx + mov [rsp - 264], rbp + mov [rsp - 280], r8 + mov [rsp - 296], rdi sub rcx, 32 jne outer_loop diff --git a/src/f32-gemm/gen/f32-gemm-1x16-minmax-asm-aarch64-neonfma-ld32.S b/src/f32-gemm/gen/f32-gemm-1x16-minmax-asm-aarch64-neonfma-ld32.S index 2df9b3ff08ca..c6827b2b6698 100644 --- a/src/f32-gemm/gen/f32-gemm-1x16-minmax-asm-aarch64-neonfma-ld32.S +++ b/src/f32-gemm/gen/f32-gemm-1x16-minmax-asm-aarch64-neonfma-ld32.S @@ -20,25 +20,22 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x16__asm_aarch64_neonfma_lane ld2r {v0.4s, v1.4s}, [x13] outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with the biases. ldp q11, q12, [x5, 0] ldp q13, q14, [x5, 32] add x5, x5, 64 inner_loop: - ldr d2, [x3, x20] - ldp q7, q8, [x5, 0] - ldp q9, q10, [x5, 32] - ldr q9, [x5, 32] - add x5, x5, 64 + ldr s2, [x3], 4 + ldp q7, q8, [x5], 32 + ldp q9, q10, [x5], 32 fmla v11.4s, v7.4s, v2.s[0] fmla v12.4s, v8.4s, v2.s[0] fmla v13.4s, v9.4s, v2.s[0] fmla v14.4s, v10.4s, v2.s[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Min/max clamping.. fmin v11.4s, v1.4s, v11.4s @@ -53,9 +50,9 @@ inner_loop: # Check whether full or partial store. cmp x1, 16 b.lo tail_8 - stp q11, q12, [x6] - stp q13, q14, [x6, 32] - add x6, x6, 64 + stp q11, q12, [x6], 32 + stp q13, q14, [x6], 32 + sub x3, x3, x2 sub x1, x1, 16 b.ne outer_loop @@ -63,23 +60,20 @@ inner_loop: tail_8: tbz x1, 3, tail_4 - stp q11, q12, [x6] + stp q11, q12, [x6], 32 mov v11.16b, v13.16b mov v12.16b, v14.16b - add x6, x6, 32 tail_4: tbz x1, 2, tail_2 - str q11, [x6] + str q11, [x6], 16 mov v11.16b, v12.16b - add x6, x6, 16 tail_2: tbz x1, 1, tail_1 - str d11, [x6] - add x6, x6, 8 + str d11, [x6], 8 dup d11, v11.d[1] diff --git a/src/f32-gemm/gen/f32-gemm-1x16-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-1x16-minmax-asm-amd64-avx512f-broadcast.S index 1dfd045b8228..de9feca9f79a 100644 --- a/src/f32-gemm/gen/f32-gemm-1x16-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-1x16-minmax-asm-amd64-avx512f-broadcast.S @@ -28,7 +28,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x16__asm_amd64_avx512f_broadcast mov r11, [rsp + 64] outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. 
vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-1x32-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-1x32-minmax-asm-amd64-avx512f-broadcast.S index 85b0958e4d38..8bf1ff26b55e 100644 --- a/src/f32-gemm/gen/f32-gemm-1x32-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-1x32-minmax-asm-amd64-avx512f-broadcast.S @@ -28,7 +28,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x32__asm_amd64_avx512f_broadcast mov r11, [rsp + 64] outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-1x64-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-1x64-minmax-asm-amd64-avx512f-broadcast.S index 9b3b2acd90a4..f0e2e17c3d89 100644 --- a/src/f32-gemm/gen/f32-gemm-1x64-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-1x64-minmax-asm-amd64-avx512f-broadcast.S @@ -28,7 +28,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x64__asm_amd64_avx512f_broadcast mov r11, [rsp + 64] outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld32.S b/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld32.S index cda2f5c9d03c..d6aec57123e2 100644 --- a/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld32.S +++ b/src/f32-gemm/gen/f32-gemm-1x8-minmax-asm-aarch64-neonfma-ld32.S @@ -20,21 +20,18 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_1x8__asm_aarch64_neonfma_lane ld2r {v0.4s, v1.4s}, [x13] outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with the biases. ldp q11, q12, [x5, 0] add x5, x5, 32 inner_loop: - ldr d2, [x3, x20] - ldp q7, q8, [x5, 0] - ldr q7, [x5, 0] - add x5, x5, 32 + ldr s2, [x3], 4 + ldp q7, q8, [x5], 32 fmla v11.4s, v7.4s, v2.s[0] fmla v12.4s, v8.4s, v2.s[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Min/max clamping.. fmin v11.4s, v1.4s, v11.4s @@ -45,8 +42,8 @@ inner_loop: # Check whether full or partial store. cmp x1, 8 b.lo tail_4 - stp q11, q12, [x6] - add x6, x6, 32 + stp q11, q12, [x6], 32 + sub x3, x3, x2 sub x1, x1, 8 b.ne outer_loop @@ -54,15 +51,13 @@ inner_loop: tail_4: tbz x1, 2, tail_2 - str q11, [x6] + str q11, [x6], 16 mov v11.16b, v12.16b - add x6, x6, 16 tail_2: tbz x1, 1, tail_1 - str d11, [x6] - add x6, x6, 8 + str d11, [x6], 8 dup d11, v11.d[1] diff --git a/src/f32-gemm/gen/f32-gemm-2x16-minmax-asm-aarch64-neonfma-ld32.S b/src/f32-gemm/gen/f32-gemm-2x16-minmax-asm-aarch64-neonfma-ld32.S index bc692d94999c..dd3cb73756ed 100644 --- a/src/f32-gemm/gen/f32-gemm-2x16-minmax-asm-aarch64-neonfma-ld32.S +++ b/src/f32-gemm/gen/f32-gemm-2x16-minmax-asm-aarch64-neonfma-ld32.S @@ -27,8 +27,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x16__asm_aarch64_neonfma_lane csel x13, x6, x13, LO outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with the biases. 
ldp q11, q13, [x5, 0] ldp q15, q17, [x5, 32] @@ -39,12 +39,10 @@ outer_loop: add x5, x5, 64 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldp q7, q8, [x5, 0] - ldp q9, q10, [x5, 32] - ldr q9, [x5, 32] - add x5, x5, 64 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldp q7, q8, [x5], 32 + ldp q9, q10, [x5], 32 fmla v11.4s, v7.4s, v2.s[0] fmla v12.4s, v7.4s, v3.s[0] fmla v13.4s, v8.4s, v2.s[0] @@ -53,8 +51,7 @@ inner_loop: fmla v16.4s, v9.4s, v3.s[0] fmla v17.4s, v10.4s, v2.s[0] fmla v18.4s, v10.4s, v3.s[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Min/max clamping.. fmin v11.4s, v1.4s, v11.4s @@ -77,12 +74,12 @@ inner_loop: # Check whether full or partial store. cmp x1, 16 b.lo tail_8 - stp q11, q13, [x6] - stp q15, q17, [x6, 32] - stp q12, q14, [x13] - stp q16, q18, [x13, 32] - add x6, x6, 64 - add x13, x13, 64 + stp q11, q13, [x6], 32 + stp q15, q17, [x6], 32 + stp q12, q14, [x13], 32 + stp q16, q18, [x13], 32 + sub x3, x3, x2 + sub x9, x9, x2 sub x1, x1, 16 b.ne outer_loop @@ -90,32 +87,26 @@ inner_loop: tail_8: tbz x1, 3, tail_4 - stp q11, q13, [x6] - stp q12, q14, [x13] + stp q11, q13, [x6], 32 + stp q12, q14, [x13], 32 mov v11.16b, v15.16b mov v13.16b, v17.16b mov v12.16b, v16.16b mov v14.16b, v18.16b - add x6, x6, 32 - add x13, x13, 32 tail_4: tbz x1, 2, tail_2 - str q11, [x6] - str q12, [x13] + str q11, [x6], 16 + str q12, [x13], 16 mov v11.16b, v13.16b mov v12.16b, v14.16b - add x6, x6, 16 - add x13, x13, 16 tail_2: tbz x1, 1, tail_1 - str d11, [x6] - str d12, [x13] - add x6, x6, 8 - add x13, x13, 8 + str d11, [x6], 8 + str d12, [x13], 8 dup d11, v11.d[1] dup d12, v12.d[1] diff --git a/src/f32-gemm/gen/f32-gemm-2x16-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-2x16-minmax-asm-amd64-avx512f-broadcast.S index f429e9631c6f..36236780321b 100644 --- a/src/f32-gemm/gen/f32-gemm-2x16-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-2x16-minmax-asm-amd64-avx512f-broadcast.S @@ -37,7 +37,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x16__asm_amd64_avx512f_broadcast cmovle r13, r10 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-2x32-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-2x32-minmax-asm-amd64-avx512f-broadcast.S index 212db9528b5f..f80ba7448b11 100644 --- a/src/f32-gemm/gen/f32-gemm-2x32-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-2x32-minmax-asm-amd64-avx512f-broadcast.S @@ -37,7 +37,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x32__asm_amd64_avx512f_broadcast cmovle r13, r10 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-2x64-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-2x64-minmax-asm-amd64-avx512f-broadcast.S index d79b807966c0..105cf33322eb 100644 --- a/src/f32-gemm/gen/f32-gemm-2x64-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-2x64-minmax-asm-amd64-avx512f-broadcast.S @@ -37,7 +37,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x64__asm_amd64_avx512f_broadcast cmovle r13, r10 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. 
vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-2x8-minmax-asm-aarch64-neonfma-ld32.S b/src/f32-gemm/gen/f32-gemm-2x8-minmax-asm-aarch64-neonfma-ld32.S index 729194114329..bba3de14cf77 100644 --- a/src/f32-gemm/gen/f32-gemm-2x8-minmax-asm-aarch64-neonfma-ld32.S +++ b/src/f32-gemm/gen/f32-gemm-2x8-minmax-asm-aarch64-neonfma-ld32.S @@ -27,8 +27,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_2x8__asm_aarch64_neonfma_lane csel x13, x6, x13, LO outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with the biases. ldp q11, q13, [x5, 0] mov v12.16b, v11.16b @@ -36,17 +36,14 @@ outer_loop: add x5, x5, 32 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldp q7, q8, [x5, 0] - ldr q7, [x5, 0] - add x5, x5, 32 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldp q7, q8, [x5], 32 fmla v11.4s, v7.4s, v2.s[0] fmla v12.4s, v7.4s, v3.s[0] fmla v13.4s, v8.4s, v2.s[0] fmla v14.4s, v8.4s, v3.s[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Min/max clamping.. fmin v11.4s, v1.4s, v11.4s @@ -61,10 +58,10 @@ inner_loop: # Check whether full or partial store. cmp x1, 8 b.lo tail_4 - stp q11, q13, [x6] - stp q12, q14, [x13] - add x6, x6, 32 - add x13, x13, 32 + stp q11, q13, [x6], 32 + stp q12, q14, [x13], 32 + sub x3, x3, x2 + sub x9, x9, x2 sub x1, x1, 8 b.ne outer_loop @@ -72,20 +69,16 @@ inner_loop: tail_4: tbz x1, 2, tail_2 - str q11, [x6] - str q12, [x13] + str q11, [x6], 16 + str q12, [x13], 16 mov v11.16b, v13.16b mov v12.16b, v14.16b - add x6, x6, 16 - add x13, x13, 16 tail_2: tbz x1, 1, tail_1 - str d11, [x6] - str d12, [x13] - add x6, x6, 8 - add x13, x13, 8 + str d11, [x6], 8 + str d12, [x13], 8 dup d11, v11.d[1] dup d12, v12.d[1] diff --git a/src/f32-gemm/gen/f32-gemm-3x16-minmax-asm-aarch64-neonfma-ld32.S b/src/f32-gemm/gen/f32-gemm-3x16-minmax-asm-aarch64-neonfma-ld32.S index 8b470591443e..9b2af6912f4f 100644 --- a/src/f32-gemm/gen/f32-gemm-3x16-minmax-asm-aarch64-neonfma-ld32.S +++ b/src/f32-gemm/gen/f32-gemm-3x16-minmax-asm-aarch64-neonfma-ld32.S @@ -31,8 +31,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x16__asm_aarch64_neonfma_lane csel x14, x13, x14, LS outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with the biases. ldp q11, q14, [x5, 0] ldp q17, q20, [x5, 32] @@ -47,13 +47,11 @@ outer_loop: add x5, x5, 64 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldr d4, [x10, x20] - ldp q7, q8, [x5, 0] - ldp q9, q10, [x5, 32] - ldr q9, [x5, 32] - add x5, x5, 64 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldr s4, [x10], 4 + ldp q7, q8, [x5], 32 + ldp q9, q10, [x5], 32 fmla v11.4s, v7.4s, v2.s[0] fmla v12.4s, v7.4s, v3.s[0] fmla v13.4s, v7.4s, v4.s[0] @@ -66,8 +64,7 @@ inner_loop: fmla v20.4s, v10.4s, v2.s[0] fmla v21.4s, v10.4s, v3.s[0] fmla v22.4s, v10.4s, v4.s[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Min/max clamping.. fmin v11.4s, v1.4s, v11.4s @@ -98,15 +95,15 @@ inner_loop: # Check whether full or partial store. 
cmp x1, 16 b.lo tail_8 - stp q11, q14, [x6] - stp q17, q20, [x6, 32] - stp q12, q15, [x13] - stp q18, q21, [x13, 32] - stp q13, q16, [x14] - stp q19, q22, [x14, 32] - add x6, x6, 64 - add x13, x13, 64 - add x14, x14, 64 + stp q11, q14, [x6], 32 + stp q17, q20, [x6], 32 + stp q12, q15, [x13], 32 + stp q18, q21, [x13], 32 + stp q13, q16, [x14], 32 + stp q19, q22, [x14], 32 + sub x3, x3, x2 + sub x9, x9, x2 + sub x10, x10, x2 sub x1, x1, 16 b.ne outer_loop @@ -114,41 +111,32 @@ inner_loop: tail_8: tbz x1, 3, tail_4 - stp q11, q14, [x6] - stp q12, q15, [x13] - stp q13, q16, [x14] + stp q11, q14, [x6], 32 + stp q12, q15, [x13], 32 + stp q13, q16, [x14], 32 mov v11.16b, v17.16b mov v14.16b, v20.16b mov v12.16b, v18.16b mov v15.16b, v21.16b mov v13.16b, v19.16b mov v16.16b, v22.16b - add x6, x6, 32 - add x13, x13, 32 - add x14, x14, 32 tail_4: tbz x1, 2, tail_2 - str q11, [x6] - str q12, [x13] - str q13, [x14] + str q11, [x6], 16 + str q12, [x13], 16 + str q13, [x14], 16 mov v11.16b, v14.16b mov v12.16b, v15.16b mov v13.16b, v16.16b - add x6, x6, 16 - add x13, x13, 16 - add x14, x14, 16 tail_2: tbz x1, 1, tail_1 - str d11, [x6] - str d12, [x13] - str d13, [x14] - add x6, x6, 8 - add x13, x13, 8 - add x14, x14, 8 + str d11, [x6], 8 + str d12, [x13], 8 + str d13, [x14], 8 dup d11, v11.d[1] dup d12, v12.d[1] dup d13, v13.d[1] diff --git a/src/f32-gemm/gen/f32-gemm-3x16-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-3x16-minmax-asm-amd64-avx512f-broadcast.S index dc65d2c61afe..53ca4d7700ee 100644 --- a/src/f32-gemm/gen/f32-gemm-3x16-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-3x16-minmax-asm-amd64-avx512f-broadcast.S @@ -46,7 +46,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x16__asm_amd64_avx512f_broadcast cmovle rbx, r13 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-3x32-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-3x32-minmax-asm-amd64-avx512f-broadcast.S index 92db444e85ac..5510c79e6df7 100644 --- a/src/f32-gemm/gen/f32-gemm-3x32-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-3x32-minmax-asm-amd64-avx512f-broadcast.S @@ -46,7 +46,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x32__asm_amd64_avx512f_broadcast cmovle rbx, r13 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-3x64-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-3x64-minmax-asm-amd64-avx512f-broadcast.S index ca88fdb48066..bd1641c2879a 100644 --- a/src/f32-gemm/gen/f32-gemm-3x64-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-3x64-minmax-asm-amd64-avx512f-broadcast.S @@ -46,7 +46,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x64__asm_amd64_avx512f_broadcast cmovle rbx, r13 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. 
vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-3x8-minmax-asm-aarch64-neonfma-ld32.S b/src/f32-gemm/gen/f32-gemm-3x8-minmax-asm-aarch64-neonfma-ld32.S index fede5c5452f9..26bebc208e7b 100644 --- a/src/f32-gemm/gen/f32-gemm-3x8-minmax-asm-aarch64-neonfma-ld32.S +++ b/src/f32-gemm/gen/f32-gemm-3x8-minmax-asm-aarch64-neonfma-ld32.S @@ -31,8 +31,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_3x8__asm_aarch64_neonfma_lane csel x14, x13, x14, LS outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with the biases. ldp q11, q14, [x5, 0] mov v12.16b, v11.16b @@ -42,20 +42,17 @@ outer_loop: add x5, x5, 32 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldr d4, [x10, x20] - ldp q7, q8, [x5, 0] - ldr q7, [x5, 0] - add x5, x5, 32 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldr s4, [x10], 4 + ldp q7, q8, [x5], 32 fmla v11.4s, v7.4s, v2.s[0] fmla v12.4s, v7.4s, v3.s[0] fmla v13.4s, v7.4s, v4.s[0] fmla v14.4s, v8.4s, v2.s[0] fmla v15.4s, v8.4s, v3.s[0] fmla v16.4s, v8.4s, v4.s[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Min/max clamping.. fmin v11.4s, v1.4s, v11.4s @@ -74,12 +71,12 @@ inner_loop: # Check whether full or partial store. cmp x1, 8 b.lo tail_4 - stp q11, q14, [x6] - stp q12, q15, [x13] - stp q13, q16, [x14] - add x6, x6, 32 - add x13, x13, 32 - add x14, x14, 32 + stp q11, q14, [x6], 32 + stp q12, q15, [x13], 32 + stp q13, q16, [x14], 32 + sub x3, x3, x2 + sub x9, x9, x2 + sub x10, x10, x2 sub x1, x1, 8 b.ne outer_loop @@ -87,25 +84,19 @@ inner_loop: tail_4: tbz x1, 2, tail_2 - str q11, [x6] - str q12, [x13] - str q13, [x14] + str q11, [x6], 16 + str q12, [x13], 16 + str q13, [x14], 16 mov v11.16b, v14.16b mov v12.16b, v15.16b mov v13.16b, v16.16b - add x6, x6, 16 - add x13, x13, 16 - add x14, x14, 16 tail_2: tbz x1, 1, tail_1 - str d11, [x6] - str d12, [x13] - str d13, [x14] - add x6, x6, 8 - add x13, x13, 8 - add x14, x14, 8 + str d11, [x6], 8 + str d12, [x13], 8 + str d13, [x14], 8 dup d11, v11.d[1] dup d12, v12.d[1] dup d13, v13.d[1] diff --git a/src/f32-gemm/gen/f32-gemm-4x16-minmax-asm-aarch64-neonfma-ld32.S b/src/f32-gemm/gen/f32-gemm-4x16-minmax-asm-aarch64-neonfma-ld32.S index d2a0eb0586a7..fb40896a6e4a 100644 --- a/src/f32-gemm/gen/f32-gemm-4x16-minmax-asm-aarch64-neonfma-ld32.S +++ b/src/f32-gemm/gen/f32-gemm-4x16-minmax-asm-aarch64-neonfma-ld32.S @@ -37,8 +37,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x16__asm_aarch64_neonfma_lane csel x15, x14, x15, LO outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with the biases. ldp q11, q15, [x5, 0] ldp q19, q23, [x5, 32] @@ -57,14 +57,12 @@ outer_loop: add x5, x5, 64 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldr d4, [x10, x20] - ldr d5, [x11, x20] - ldp q7, q8, [x5, 0] - ldp q9, q10, [x5, 32] - ldr q9, [x5, 32] - add x5, x5, 64 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldr s4, [x10], 4 + ldr s5, [x11], 4 + ldp q7, q8, [x5], 32 + ldp q9, q10, [x5], 32 fmla v11.4s, v7.4s, v2.s[0] fmla v12.4s, v7.4s, v3.s[0] fmla v13.4s, v7.4s, v4.s[0] @@ -81,8 +79,7 @@ inner_loop: fmla v24.4s, v10.4s, v3.s[0] fmla v25.4s, v10.4s, v4.s[0] fmla v26.4s, v10.4s, v5.s[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Min/max clamping.. fmin v11.4s, v1.4s, v11.4s @@ -121,18 +118,18 @@ inner_loop: # Check whether full or partial store. 
cmp x1, 16 b.lo tail_8 - stp q11, q15, [x6] - stp q19, q23, [x6, 32] - stp q12, q16, [x13] - stp q20, q24, [x13, 32] - stp q13, q17, [x14] - stp q21, q25, [x14, 32] - stp q14, q18, [x15] - stp q22, q26, [x15, 32] - add x6, x6, 64 - add x13, x13, 64 - add x14, x14, 64 - add x15, x15, 64 + stp q11, q15, [x6], 32 + stp q19, q23, [x6], 32 + stp q12, q16, [x13], 32 + stp q20, q24, [x13], 32 + stp q13, q17, [x14], 32 + stp q21, q25, [x14], 32 + stp q14, q18, [x15], 32 + stp q22, q26, [x15], 32 + sub x3, x3, x2 + sub x9, x9, x2 + sub x10, x10, x2 + sub x11, x11, x2 sub x1, x1, 16 b.ne outer_loop @@ -140,10 +137,10 @@ inner_loop: tail_8: tbz x1, 3, tail_4 - stp q11, q15, [x6] - stp q12, q16, [x13] - stp q13, q17, [x14] - stp q14, q18, [x15] + stp q11, q15, [x6], 32 + stp q12, q16, [x13], 32 + stp q13, q17, [x14], 32 + stp q14, q18, [x15], 32 mov v11.16b, v19.16b mov v15.16b, v23.16b mov v12.16b, v20.16b @@ -152,38 +149,26 @@ tail_8: mov v17.16b, v25.16b mov v14.16b, v22.16b mov v18.16b, v26.16b - add x6, x6, 32 - add x13, x13, 32 - add x14, x14, 32 - add x15, x15, 32 tail_4: tbz x1, 2, tail_2 - str q11, [x6] - str q12, [x13] - str q13, [x14] - str q14, [x15] + str q11, [x6], 16 + str q12, [x13], 16 + str q13, [x14], 16 + str q14, [x15], 16 mov v11.16b, v15.16b mov v12.16b, v16.16b mov v13.16b, v17.16b mov v14.16b, v18.16b - add x6, x6, 16 - add x13, x13, 16 - add x14, x14, 16 - add x15, x15, 16 tail_2: tbz x1, 1, tail_1 - str d11, [x6] - str d12, [x13] - str d13, [x14] - str d14, [x15] - add x6, x6, 8 - add x13, x13, 8 - add x14, x14, 8 - add x15, x15, 8 + str d11, [x6], 8 + str d12, [x13], 8 + str d13, [x14], 8 + str d14, [x15], 8 dup d11, v11.d[1] dup d12, v12.d[1] dup d13, v13.d[1] diff --git a/src/f32-gemm/gen/f32-gemm-4x16-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-4x16-minmax-asm-amd64-avx512f-broadcast.S index 71d442f0da79..d0938f5ae743 100644 --- a/src/f32-gemm/gen/f32-gemm-4x16-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-4x16-minmax-asm-amd64-avx512f-broadcast.S @@ -55,7 +55,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x16__asm_amd64_avx512f_broadcast cmovle rbp, rbx outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-4x32-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-4x32-minmax-asm-amd64-avx512f-broadcast.S index 7f117407c4ad..75764d9495ad 100644 --- a/src/f32-gemm/gen/f32-gemm-4x32-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-4x32-minmax-asm-amd64-avx512f-broadcast.S @@ -55,7 +55,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x32__asm_amd64_avx512f_broadcast cmovle rbp, rbx outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-4x64-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-4x64-minmax-asm-amd64-avx512f-broadcast.S index e862b8a36de9..4492f3fabef6 100644 --- a/src/f32-gemm/gen/f32-gemm-4x64-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-4x64-minmax-asm-amd64-avx512f-broadcast.S @@ -55,7 +55,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x64__asm_amd64_avx512f_broadcast cmovle rbp, rbx outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. 
vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld32.S b/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld32.S index 061b5026d24c..70f9d83b2f50 100644 --- a/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld32.S +++ b/src/f32-gemm/gen/f32-gemm-4x8-minmax-asm-aarch64-neonfma-ld32.S @@ -37,8 +37,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_4x8__asm_aarch64_neonfma_lane csel x15, x14, x15, LO outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with the biases. ldp q11, q15, [x5, 0] mov v12.16b, v11.16b @@ -50,13 +50,11 @@ outer_loop: add x5, x5, 32 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldr d4, [x10, x20] - ldr d5, [x11, x20] - ldp q7, q8, [x5, 0] - ldr q7, [x5, 0] - add x5, x5, 32 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldr s4, [x10], 4 + ldr s5, [x11], 4 + ldp q7, q8, [x5], 32 fmla v11.4s, v7.4s, v2.s[0] fmla v12.4s, v7.4s, v3.s[0] fmla v13.4s, v7.4s, v4.s[0] @@ -65,8 +63,7 @@ inner_loop: fmla v16.4s, v8.4s, v3.s[0] fmla v17.4s, v8.4s, v4.s[0] fmla v18.4s, v8.4s, v5.s[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Min/max clamping.. fmin v11.4s, v1.4s, v11.4s @@ -89,14 +86,14 @@ inner_loop: # Check whether full or partial store. cmp x1, 8 b.lo tail_4 - stp q11, q15, [x6] - stp q12, q16, [x13] - stp q13, q17, [x14] - stp q14, q18, [x15] - add x6, x6, 32 - add x13, x13, 32 - add x14, x14, 32 - add x15, x15, 32 + stp q11, q15, [x6], 32 + stp q12, q16, [x13], 32 + stp q13, q17, [x14], 32 + stp q14, q18, [x15], 32 + sub x3, x3, x2 + sub x9, x9, x2 + sub x10, x10, x2 + sub x11, x11, x2 sub x1, x1, 8 b.ne outer_loop @@ -104,30 +101,22 @@ inner_loop: tail_4: tbz x1, 2, tail_2 - str q11, [x6] - str q12, [x13] - str q13, [x14] - str q14, [x15] + str q11, [x6], 16 + str q12, [x13], 16 + str q13, [x14], 16 + str q14, [x15], 16 mov v11.16b, v15.16b mov v12.16b, v16.16b mov v13.16b, v17.16b mov v14.16b, v18.16b - add x6, x6, 16 - add x13, x13, 16 - add x14, x14, 16 - add x15, x15, 16 tail_2: tbz x1, 1, tail_1 - str d11, [x6] - str d12, [x13] - str d13, [x14] - str d14, [x15] - add x6, x6, 8 - add x13, x13, 8 - add x14, x14, 8 - add x15, x15, 8 + str d11, [x6], 8 + str d12, [x13], 8 + str d13, [x14], 8 + str d14, [x15], 8 dup d11, v11.d[1] dup d12, v12.d[1] dup d13, v13.d[1] diff --git a/src/f32-gemm/gen/f32-gemm-5x16-minmax-asm-aarch64-neonfma-ld32.S b/src/f32-gemm/gen/f32-gemm-5x16-minmax-asm-aarch64-neonfma-ld32.S index 5b8d65dd9d48..ffd980a64486 100644 --- a/src/f32-gemm/gen/f32-gemm-5x16-minmax-asm-aarch64-neonfma-ld32.S +++ b/src/f32-gemm/gen/f32-gemm-5x16-minmax-asm-aarch64-neonfma-ld32.S @@ -41,8 +41,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x16__asm_aarch64_neonfma_lane csel x19, x15, x19, LS outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with the biases. 
ldp q11, q16, [x5, 0] ldp q21, q26, [x5, 32] @@ -65,15 +65,13 @@ outer_loop: add x5, x5, 64 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldr d4, [x10, x20] - ldr d5, [x11, x20] - ldr d6, [x12, x20] - ldp q7, q8, [x5, 0] - ldp q9, q10, [x5, 32] - ldr q9, [x5, 32] - add x5, x5, 64 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldr s4, [x10], 4 + ldr s5, [x11], 4 + ldr s6, [x12], 4 + ldp q7, q8, [x5], 32 + ldp q9, q10, [x5], 32 fmla v11.4s, v7.4s, v2.s[0] fmla v12.4s, v7.4s, v3.s[0] fmla v13.4s, v7.4s, v4.s[0] @@ -94,8 +92,7 @@ inner_loop: fmla v28.4s, v10.4s, v4.s[0] fmla v29.4s, v10.4s, v5.s[0] fmla v30.4s, v10.4s, v6.s[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Min/max clamping.. fmin v11.4s, v1.4s, v11.4s @@ -142,21 +139,21 @@ inner_loop: # Check whether full or partial store. cmp x1, 16 b.lo tail_8 - stp q11, q16, [x6] - stp q21, q26, [x6, 32] - stp q12, q17, [x13] - stp q22, q27, [x13, 32] - stp q13, q18, [x14] - stp q23, q28, [x14, 32] - stp q14, q19, [x15] - stp q24, q29, [x15, 32] - stp q15, q20, [x19] - stp q25, q30, [x19, 32] - add x6, x6, 64 - add x13, x13, 64 - add x14, x14, 64 - add x15, x15, 64 - add x19, x19, 64 + stp q11, q16, [x6], 32 + stp q21, q26, [x6], 32 + stp q12, q17, [x13], 32 + stp q22, q27, [x13], 32 + stp q13, q18, [x14], 32 + stp q23, q28, [x14], 32 + stp q14, q19, [x15], 32 + stp q24, q29, [x15], 32 + stp q15, q20, [x19], 32 + stp q25, q30, [x19], 32 + sub x3, x3, x2 + sub x9, x9, x2 + sub x10, x10, x2 + sub x11, x11, x2 + sub x12, x12, x2 sub x1, x1, 16 b.ne outer_loop @@ -164,11 +161,11 @@ inner_loop: tail_8: tbz x1, 3, tail_4 - stp q11, q16, [x6] - stp q12, q17, [x13] - stp q13, q18, [x14] - stp q14, q19, [x15] - stp q15, q20, [x19] + stp q11, q16, [x6], 32 + stp q12, q17, [x13], 32 + stp q13, q18, [x14], 32 + stp q14, q19, [x15], 32 + stp q15, q20, [x19], 32 mov v11.16b, v21.16b mov v16.16b, v26.16b mov v12.16b, v22.16b @@ -179,44 +176,29 @@ tail_8: mov v19.16b, v29.16b mov v15.16b, v25.16b mov v20.16b, v30.16b - add x6, x6, 32 - add x13, x13, 32 - add x14, x14, 32 - add x15, x15, 32 - add x19, x19, 32 tail_4: tbz x1, 2, tail_2 - str q11, [x6] - str q12, [x13] - str q13, [x14] - str q14, [x15] - str q15, [x19] + str q11, [x6], 16 + str q12, [x13], 16 + str q13, [x14], 16 + str q14, [x15], 16 + str q15, [x19], 16 mov v11.16b, v16.16b mov v12.16b, v17.16b mov v13.16b, v18.16b mov v14.16b, v19.16b mov v15.16b, v20.16b - add x6, x6, 16 - add x13, x13, 16 - add x14, x14, 16 - add x15, x15, 16 - add x19, x19, 16 tail_2: tbz x1, 1, tail_1 - str d11, [x6] - str d12, [x13] - str d13, [x14] - str d14, [x15] - str d15, [x19] - add x6, x6, 8 - add x13, x13, 8 - add x14, x14, 8 - add x15, x15, 8 - add x19, x19, 8 + str d11, [x6], 8 + str d12, [x13], 8 + str d13, [x14], 8 + str d14, [x15], 8 + str d15, [x19], 8 dup d11, v11.d[1] dup d12, v12.d[1] dup d13, v13.d[1] diff --git a/src/f32-gemm/gen/f32-gemm-5x16-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-5x16-minmax-asm-amd64-avx512f-broadcast.S index b6743cce4672..ea0997bc8250 100644 --- a/src/f32-gemm/gen/f32-gemm-5x16-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-5x16-minmax-asm-amd64-avx512f-broadcast.S @@ -64,7 +64,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x16__asm_amd64_avx512f_broadcast cmovle r8, rbp outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. 
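The k counter also flips direction. The old loops zeroed x20 with `eor x20, x20, x20`, indexed A as `[x3, x20]`, and spent an `add` plus a `cmp` per iteration; the new loops start x20 at k (`mov x20, x2`) and count down, so a single flag-setting `subs` replaces the add/cmp pair. Since the A pointers now advance through the inner loop instead of being indexed, each one is rewound by k bytes after the full store, once per outer iteration, which is what the new `sub x3, x3, x2` (and its siblings for the other rows) are for. Schematically:

        mov  x20, x2           # k counter starts at k (in bytes)
    inner_loop:
        ...                    # post-index loads and fmlas as above
        subs x20, x20, 4       # decrement and set flags in one instruction
        bne  inner_loop
        ...                    # clamping and full store
        sub  x3, x3, x2        # rewind each A row pointer by k for the next tile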
vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-5x32-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-5x32-minmax-asm-amd64-avx512f-broadcast.S index c0273cc2f80d..7d76f7794f58 100644 --- a/src/f32-gemm/gen/f32-gemm-5x32-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-5x32-minmax-asm-amd64-avx512f-broadcast.S @@ -64,7 +64,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x32__asm_amd64_avx512f_broadcast cmovle r8, rbp outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-5x64-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-5x64-minmax-asm-amd64-avx512f-broadcast.S index 8a2a511a2474..8adfd1c1ca58 100644 --- a/src/f32-gemm/gen/f32-gemm-5x64-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-5x64-minmax-asm-amd64-avx512f-broadcast.S @@ -64,7 +64,7 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x64__asm_amd64_avx512f_broadcast cmovle r8, rbp outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] diff --git a/src/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-ld32.S b/src/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-ld32.S index a98c88464451..1205fdbff755 100644 --- a/src/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-ld32.S +++ b/src/f32-gemm/gen/f32-gemm-5x8-minmax-asm-aarch64-neonfma-ld32.S @@ -41,8 +41,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_5x8__asm_aarch64_neonfma_lane csel x19, x15, x19, LS outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with the biases. ldp q11, q16, [x5, 0] mov v12.16b, v11.16b @@ -56,14 +56,12 @@ outer_loop: add x5, x5, 32 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldr d4, [x10, x20] - ldr d5, [x11, x20] - ldr d6, [x12, x20] - ldp q7, q8, [x5, 0] - ldr q7, [x5, 0] - add x5, x5, 32 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldr s4, [x10], 4 + ldr s5, [x11], 4 + ldr s6, [x12], 4 + ldp q7, q8, [x5], 32 fmla v11.4s, v7.4s, v2.s[0] fmla v12.4s, v7.4s, v3.s[0] fmla v13.4s, v7.4s, v4.s[0] @@ -74,8 +72,7 @@ inner_loop: fmla v18.4s, v8.4s, v4.s[0] fmla v19.4s, v8.4s, v5.s[0] fmla v20.4s, v8.4s, v6.s[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Min/max clamping.. fmin v11.4s, v1.4s, v11.4s @@ -102,16 +99,16 @@ inner_loop: # Check whether full or partial store. 
cmp x1, 8 b.lo tail_4 - stp q11, q16, [x6] - stp q12, q17, [x13] - stp q13, q18, [x14] - stp q14, q19, [x15] - stp q15, q20, [x19] - add x6, x6, 32 - add x13, x13, 32 - add x14, x14, 32 - add x15, x15, 32 - add x19, x19, 32 + stp q11, q16, [x6], 32 + stp q12, q17, [x13], 32 + stp q13, q18, [x14], 32 + stp q14, q19, [x15], 32 + stp q15, q20, [x19], 32 + sub x3, x3, x2 + sub x9, x9, x2 + sub x10, x10, x2 + sub x11, x11, x2 + sub x12, x12, x2 sub x1, x1, 8 b.ne outer_loop @@ -119,35 +116,25 @@ inner_loop: tail_4: tbz x1, 2, tail_2 - str q11, [x6] - str q12, [x13] - str q13, [x14] - str q14, [x15] - str q15, [x19] + str q11, [x6], 16 + str q12, [x13], 16 + str q13, [x14], 16 + str q14, [x15], 16 + str q15, [x19], 16 mov v11.16b, v16.16b mov v12.16b, v17.16b mov v13.16b, v18.16b mov v14.16b, v19.16b mov v15.16b, v20.16b - add x6, x6, 16 - add x13, x13, 16 - add x14, x14, 16 - add x15, x15, 16 - add x19, x19, 16 tail_2: tbz x1, 1, tail_1 - str d11, [x6] - str d12, [x13] - str d13, [x14] - str d14, [x15] - str d15, [x19] - add x6, x6, 8 - add x13, x13, 8 - add x14, x14, 8 - add x15, x15, 8 - add x19, x19, 8 + str d11, [x6], 8 + str d12, [x13], 8 + str d13, [x14], 8 + str d14, [x15], 8 + str d15, [x19], 8 dup d11, v11.d[1] dup d12, v12.d[1] dup d13, v13.d[1] diff --git a/src/f32-gemm/gen/f32-gemm-6x16-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-6x16-minmax-asm-amd64-avx512f-broadcast.S index a5345db1f288..d44a42ccb3ad 100644 --- a/src/f32-gemm/gen/f32-gemm-6x16-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-6x16-minmax-asm-amd64-avx512f-broadcast.S @@ -27,9 +27,9 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16__asm_amd64_avx512f_broadcast # Load cm_stride. mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,19 +88,19 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. 
- mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] @@ -145,12 +145,12 @@ inner_loop: vmaxps zmm16, zmm0, zmm16 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] # Check whether full or partial store. cmp rcx, 16 @@ -170,12 +170,12 @@ inner_loop: add r10, 64 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 sub rcx, 16 jne outer_loop diff --git a/src/f32-gemm/gen/f32-gemm-6x32-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-6x32-minmax-asm-amd64-avx512f-broadcast.S index c3bb1d298a5b..9a8a091a14c9 100644 --- a/src/f32-gemm/gen/f32-gemm-6x32-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-6x32-minmax-asm-amd64-avx512f-broadcast.S @@ -27,9 +27,9 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x32__asm_amd64_avx512f_broadcast # Load cm_stride. mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,19 +88,19 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_6x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. 
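The amd64 kernels pick up two incidental edits. The `# Zero k counter.` comment becomes `# Initialize k counter.`, matching the aarch64 wording now that the Arm counter genuinely starts at k rather than zero (r11 itself is still zeroed here), and the spill slots holding the clamped a/c pointers move from positive to negative offsets off rsp. The patch leaves the prologue's stack handling alone, so this appears to relocate the pointer scratch area to the other side of the stack pointer while keeping the register assignments and slot layout intact:

    # Before: pointer spill slots above rsp.
    mov [rsp + 128], rsi
    mov rsi, [rsp + 128]

    # After: the same slots addressed below rsp.
    mov [rsp - 128], rsi
    mov rsi, [rsp - 128]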
- mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] @@ -170,12 +170,12 @@ inner_loop: vmaxps zmm22, zmm0, zmm22 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] # Check whether full or partial store. cmp rcx, 32 @@ -201,12 +201,12 @@ inner_loop: add r10, 128 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 sub rcx, 32 jne outer_loop diff --git a/src/f32-gemm/gen/f32-gemm-7x16-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-7x16-minmax-asm-amd64-avx512f-broadcast.S index e6a8f904d5cc..361c5d1ab3a9 100644 --- a/src/f32-gemm/gen/f32-gemm-7x16-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-7x16-minmax-asm-amd64-avx512f-broadcast.S @@ -27,9 +27,9 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16__asm_amd64_avx512f_broadcast # Load cm_stride. mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,8 +88,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -100,20 +100,20 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 outer_loop: - # Zero k counter. + # Initialize k counter. 
mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] @@ -163,13 +163,13 @@ inner_loop: vmaxps zmm17, zmm0, zmm17 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] # Check whether full or partial store. cmp rcx, 16 @@ -191,13 +191,13 @@ inner_loop: add r13, 64 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 sub rcx, 16 jne outer_loop diff --git a/src/f32-gemm/gen/f32-gemm-7x32-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-7x32-minmax-asm-amd64-avx512f-broadcast.S index 6229ffb4a2d6..9354a90d8af1 100644 --- a/src/f32-gemm/gen/f32-gemm-7x32-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-7x32-minmax-asm-amd64-avx512f-broadcast.S @@ -27,9 +27,9 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x32__asm_amd64_avx512f_broadcast # Load cm_stride. mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. 
- mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,8 +88,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -100,20 +100,20 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_7x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] @@ -192,13 +192,13 @@ inner_loop: vmaxps zmm24, zmm0, zmm24 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] # Check whether full or partial store. cmp rcx, 32 @@ -227,13 +227,13 @@ inner_loop: add r13, 128 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 sub rcx, 32 jne outer_loop diff --git a/src/f32-gemm/gen/f32-gemm-8x16-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-8x16-minmax-asm-amd64-avx512f-broadcast.S index a5a0b9d84759..7233c24350f2 100644 --- a/src/f32-gemm/gen/f32-gemm-8x16-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-8x16-minmax-asm-amd64-avx512f-broadcast.S @@ -27,9 +27,9 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16__asm_amd64_avx512f_broadcast # Load cm_stride. 
mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,8 +88,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -100,8 +100,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -112,21 +112,21 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] @@ -181,14 +181,14 @@ inner_loop: vmaxps zmm18, zmm0, zmm18 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] # Check whether full or partial store. cmp rcx, 16 @@ -212,14 +212,14 @@ inner_loop: add rbx, 64 # Write output pointers to the stack. 
- mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx sub rcx, 16 jne outer_loop diff --git a/src/f32-gemm/gen/f32-gemm-8x32-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-8x32-minmax-asm-amd64-avx512f-broadcast.S index a9beafe042ff..e4633f2fcfe1 100644 --- a/src/f32-gemm/gen/f32-gemm-8x32-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-8x32-minmax-asm-amd64-avx512f-broadcast.S @@ -27,9 +27,9 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x32__asm_amd64_avx512f_broadcast # Load cm_stride. mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,8 +88,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -100,8 +100,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -112,21 +112,21 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_8x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] # Initialize accumulators with the biases. 
vmovaps zmm7, [r9 + 0] @@ -214,14 +214,14 @@ inner_loop: vmaxps zmm26, zmm0, zmm26 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] # Check whether full or partial store. cmp rcx, 32 @@ -253,14 +253,14 @@ inner_loop: add rbx, 128 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx sub rcx, 32 jne outer_loop diff --git a/src/f32-gemm/gen/f32-gemm-9x16-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-9x16-minmax-asm-amd64-avx512f-broadcast.S index 8de336f988c2..c239863a9c7f 100644 --- a/src/f32-gemm/gen/f32-gemm-9x16-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-9x16-minmax-asm-amd64-avx512f-broadcast.S @@ -27,9 +27,9 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16__asm_amd64_avx512f_broadcast # Load cm_stride. mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,8 +88,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -100,8 +100,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -112,8 +112,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + 
mov [rsp - 248], r13 # Clamp a & c pointers if mr <= 8 mov rsi, rax @@ -124,22 +124,22 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x16__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 256], rsi - mov [rsp + 264], r10 + mov [rsp - 256], rsi + mov [rsp - 264], r10 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] - mov rbp, [rsp + 256] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] + mov rbp, [rsp - 256] # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] @@ -199,15 +199,15 @@ inner_loop: vmaxps zmm19, zmm0, zmm19 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] - mov rbp, [rsp + 264] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] + mov rbp, [rsp - 264] # Check whether full or partial store. cmp rcx, 16 @@ -233,15 +233,15 @@ inner_loop: add rbp, 64 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx - mov [rsp + 264], rbp + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx + mov [rsp - 264], rbp sub rcx, 16 jne outer_loop diff --git a/src/f32-gemm/gen/f32-gemm-9x32-minmax-asm-amd64-avx512f-broadcast.S b/src/f32-gemm/gen/f32-gemm-9x32-minmax-asm-amd64-avx512f-broadcast.S index 7670f513dda0..c65ab67655fd 100644 --- a/src/f32-gemm/gen/f32-gemm-9x32-minmax-asm-amd64-avx512f-broadcast.S +++ b/src/f32-gemm/gen/f32-gemm-9x32-minmax-asm-amd64-avx512f-broadcast.S @@ -27,9 +27,9 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x32__asm_amd64_avx512f_broadcast # Load cm_stride. mov r11, [rsp + 64] # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. 
- mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -40,8 +40,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -52,8 +52,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -64,8 +64,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -76,8 +76,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -88,8 +88,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -100,8 +100,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -112,8 +112,8 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x32__asm_amd64_avx512f_broadcast cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Clamp a & c pointers if mr <= 8 mov rsi, rax @@ -124,22 +124,22 @@ BEGIN_FUNCTION xnn_f32_gemm_minmax_ukernel_9x32__asm_amd64_avx512f_broadcast cmovle rsi, rax cmovle r10, r13 - mov [rsp + 256], rsi - mov [rsp + 264], r10 + mov [rsp - 256], rsi + mov [rsp - 264], r10 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] - mov rbp, [rsp + 256] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] + mov rbp, [rsp - 256] # Initialize accumulators with the biases. vmovaps zmm7, [r9 + 0] @@ -236,15 +236,15 @@ inner_loop: vmaxps zmm28, zmm0, zmm28 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] - mov rbp, [rsp + 264] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] + mov rbp, [rsp - 264] # Check whether full or partial store. cmp rcx, 32 @@ -279,15 +279,15 @@ inner_loop: add rbp, 128 # Write output pointers to the stack. 
- mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx - mov [rsp + 264], rbp + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx + mov [rsp - 264], rbp sub rcx, 32 jne outer_loop diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-10x16-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-10x16-minmax-asm-amd64-avx512vnni.S index 908ce5a876e7..21ca0f84253b 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-10x16-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-10x16-minmax-asm-amd64-avx512vnni.S @@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__asm_amd64_avx512vnn and rdx, -4 sub rsp, 1104 # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -104,8 +104,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -116,8 +116,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Clamp a & c pointers if mr <= 8 mov rsi, rax @@ -128,8 +128,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 256], rsi - mov [rsp + 264], r10 + mov [rsp - 256], rsi + mov [rsp - 264], r10 # Clamp a & c pointers if mr <= 9 mov rax, rsi @@ -140,8 +140,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 272], rax - mov [rsp + 
280], r13 + mov [rsp - 272], rax + mov [rsp - 280], r13 # Load quantization params pointer from stack mov r11, [rsp + 1192] @@ -177,19 +177,19 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x16c4__asm_amd64_avx512vnn vmovups zmmword ptr [rsp + 1040], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] - mov rbp, [rsp + 256] - mov r8, [rsp + 272] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] + mov rbp, [rsp - 256] + mov r8, [rsp - 272] # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] @@ -293,16 +293,16 @@ inner_loop: vmaxps zmm21, zmm0, zmm21 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] - mov rbp, [rsp + 264] - mov r8, [rsp + 280] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] + mov rbp, [rsp - 264] + mov r8, [rsp - 280] # Check whether full or partial store. cmp rcx, 16 @@ -330,16 +330,16 @@ inner_loop: add r8, 64 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx - mov [rsp + 264], rbp - mov [rsp + 280], r8 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx + mov [rsp - 264], rbp + mov [rsp - 280], r8 sub rcx, 16 jne outer_loop diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-10x32-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-10x32-minmax-asm-amd64-avx512vnni.S index 0f577bfa8718..a6ae6619c078 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-10x32-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-10x32-minmax-asm-amd64-avx512vnni.S @@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x32c4__asm_amd64_avx512vnn and rdx, -4 sub rsp, 1104 # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. 
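Note that in the avx512vnni kernels only the clamped-pointer slots flip sign: the 1104-byte frame is still reserved, and the zmm spills and stack-passed arguments keep their large positive offsets inside and above it. Side by side, from the 10x16 kernel above:

    sub rsp, 1104                            # frame reserved as before
    mov [rsp - 128], rsi                     # clamped a/c pointers: now below rsp
    vmovups zmmword ptr [rsp + 1040], zmm6   # zmm spill: still inside the frame
    mov r11, [rsp + 1192]                    # stack argument: unchanged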
- mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x32c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x32c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x32c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x32c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x32c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -104,8 +104,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x32c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -116,8 +116,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x32c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Clamp a & c pointers if mr <= 8 mov rsi, rax @@ -128,8 +128,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x32c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 256], rsi - mov [rsp + 264], r10 + mov [rsp - 256], rsi + mov [rsp - 264], r10 # Clamp a & c pointers if mr <= 9 mov rax, rsi @@ -140,8 +140,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x32c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 272], rax - mov [rsp + 280], r13 + mov [rsp - 272], rax + mov [rsp - 280], r13 # Load quantization params pointer from stack mov r11, [rsp + 1192] @@ -177,19 +177,19 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_10x32c4__asm_amd64_avx512vnn vmovups zmmword ptr [rsp + 1040], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] - mov rbp, [rsp + 256] - mov r8, [rsp + 272] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] + mov rbp, [rsp - 256] + mov r8, [rsp - 272] # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] @@ -367,16 +367,16 @@ inner_loop: vmaxps zmm4, zmm0, zmm4 # Pop output pointers from the stack. 
- mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] - mov rbp, [rsp + 264] - mov r8, [rsp + 280] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] + mov rbp, [rsp - 264] + mov r8, [rsp - 280] # Check whether full or partial store. cmp rcx, 32 @@ -414,16 +414,16 @@ inner_loop: add r8, 128 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx - mov [rsp + 264], rbp - mov [rsp + 280], r8 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx + mov [rsp - 264], rbp + mov [rsp - 280], r8 sub rcx, 32 jne outer_loop diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-11x16-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-11x16-minmax-asm-amd64-avx512vnni.S index 95bfd56c2748..4d66ebfc7d28 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-11x16-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-11x16-minmax-asm-amd64-avx512vnni.S @@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn and rdx, -4 sub rsp, 1168 # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -104,8 +104,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -116,8 +116,8 @@ BEGIN_FUNCTION 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Clamp a & c pointers if mr <= 8 mov rsi, rax @@ -128,8 +128,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 256], rsi - mov [rsp + 264], r10 + mov [rsp - 256], rsi + mov [rsp - 264], r10 # Clamp a & c pointers if mr <= 9 mov rax, rsi @@ -140,8 +140,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 272], rax - mov [rsp + 280], r13 + mov [rsp - 272], rax + mov [rsp - 280], r13 # Clamp a & c pointers if mr <= 10 mov rsi, rax @@ -152,8 +152,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 288], rsi - mov [rsp + 296], r10 + mov [rsp - 288], rsi + mov [rsp - 296], r10 # Load quantization params pointer from stack mov r11, [rsp + 1256] @@ -192,20 +192,20 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x16c4__asm_amd64_avx512vnn vmovups zmmword ptr [rsp + 1104], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] - mov rbp, [rsp + 256] - mov r8, [rsp + 272] - mov rdi, [rsp + 288] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] + mov rbp, [rsp - 256] + mov r8, [rsp - 272] + mov rdi, [rsp - 288] # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] @@ -317,17 +317,17 @@ inner_loop: vmaxps zmm22, zmm0, zmm22 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] - mov rbp, [rsp + 264] - mov r8, [rsp + 280] - mov rdi, [rsp + 296] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] + mov rbp, [rsp - 264] + mov r8, [rsp - 280] + mov rdi, [rsp - 296] # Check whether full or partial store. cmp rcx, 16 @@ -357,17 +357,17 @@ inner_loop: add rdi, 64 # Write output pointers to the stack. 
- mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx - mov [rsp + 264], rbp - mov [rsp + 280], r8 - mov [rsp + 296], rdi + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx + mov [rsp - 264], rbp + mov [rsp - 280], r8 + mov [rsp - 296], rdi sub rcx, 16 jne outer_loop diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-11x32-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-11x32-minmax-asm-amd64-avx512vnni.S index 21abd629b22d..a8bf41586bc7 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-11x32-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-11x32-minmax-asm-amd64-avx512vnni.S @@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn and rdx, -4 sub rsp, 1168 # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -104,8 +104,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -116,8 +116,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Clamp a & c pointers if mr <= 8 mov rsi, rax @@ -128,8 +128,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 256], rsi - mov [rsp + 264], r10 + mov [rsp - 256], rsi + mov [rsp - 264], r10 # Clamp a & c pointers if mr <= 9 mov rax, rsi @@ -140,8 +140,8 @@ BEGIN_FUNCTION 
xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn cmovle rax, rsi cmovle r13, r10 - mov [rsp + 272], rax - mov [rsp + 280], r13 + mov [rsp - 272], rax + mov [rsp - 280], r13 # Clamp a & c pointers if mr <= 10 mov rsi, rax @@ -152,8 +152,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn cmovle rsi, rax cmovle r10, r13 - mov [rsp + 288], rsi - mov [rsp + 296], r10 + mov [rsp - 288], rsi + mov [rsp - 296], r10 # Load quantization params pointer from stack mov r11, [rsp + 1256] @@ -192,20 +192,20 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_11x32c4__asm_amd64_avx512vnn vmovups zmmword ptr [rsp + 1104], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] - mov rbp, [rsp + 256] - mov r8, [rsp + 272] - mov rdi, [rsp + 288] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] + mov rbp, [rsp - 256] + mov r8, [rsp - 272] + mov rdi, [rsp - 288] # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] @@ -398,17 +398,17 @@ inner_loop: vmaxps zmm9, zmm0, zmm9 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] - mov rbp, [rsp + 264] - mov r8, [rsp + 280] - mov rdi, [rsp + 296] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] + mov rbp, [rsp - 264] + mov r8, [rsp - 280] + mov rdi, [rsp - 296] # Check whether full or partial store. cmp rcx, 32 @@ -449,17 +449,17 @@ inner_loop: add rdi, 128 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx - mov [rsp + 264], rbp - mov [rsp + 280], r8 - mov [rsp + 296], rdi + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx + mov [rsp - 264], rbp + mov [rsp - 280], r8 + mov [rsp - 296], rdi sub rcx, 32 jne outer_loop diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16-minmax-asm-aarch64-neondot-ld32.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16-minmax-asm-aarch64-neondot-ld32.S index d387eb8baba2..85a7c8a52be1 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16-minmax-asm-aarch64-neondot-ld32.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16-minmax-asm-aarch64-neondot-ld32.S @@ -25,8 +25,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__asm_aarch64_neondot_ outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with k_sum * input zero point. 
ldr q10, [x24] ldp q2, q3, [x5, 0] @@ -38,17 +38,14 @@ outer_loop: add x5, x5, 64 inner_loop: - ldr d2, [x3, x20] - ldp q6, q7, [x5, 0] - ldp q8, q9, [x5, 32] - ldr q8, [x5, 32] - add x5, x5, 64 + ldr s2, [x3], 4 + ldp q6, q7, [x5], 32 + ldp q8, q9, [x5], 32 sdot v12.4s, v6.16b, v2.4b[0] sdot v13.4s, v7.16b, v2.4b[0] sdot v14.4s, v8.16b, v2.4b[0] sdot v15.4s, v9.16b, v2.4b[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Convert from int32 to float. @@ -92,9 +89,9 @@ inner_loop: # Check whether full or partial store. cmp x1, 16 b.lo tail_8 - stp q12, q13, [x6] - stp q14, q15, [x6, 32] - add x6, x6, 64 + stp q12, q13, [x6], 32 + stp q14, q15, [x6], 32 + sub x3, x3, x2 sub x1, x1, 16 b.ne outer_loop @@ -102,23 +99,20 @@ inner_loop: tail_8: tbz x1, 3, tail_4 - stp q12, q13, [x6] + stp q12, q13, [x6], 32 mov v12.16b, v14.16b mov v13.16b, v15.16b - add x6, x6, 32 tail_4: tbz x1, 2, tail_2 - str q12, [x6] + str q12, [x6], 16 mov v12.16b, v13.16b - add x6, x6, 16 tail_2: tbz x1, 1, tail_1 - str d12, [x6] - add x6, x6, 8 + str d12, [x6], 8 dup d12, v12.d[1] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16-minmax-asm-amd64-avx512vnni.S index 6d8dc3d5e2de..ac491a5dde15 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x16-minmax-asm-amd64-avx512vnni.S @@ -38,7 +38,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 464], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x32-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x32-minmax-asm-amd64-avx512vnni.S index 40f098232ed0..0bef4bbf8834 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x32-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x32-minmax-asm-amd64-avx512vnni.S @@ -38,7 +38,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x32c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 464], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x64-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x64-minmax-asm-amd64-avx512vnni.S index 864746630bd4..93737fb82075 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x64-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x64-minmax-asm-amd64-avx512vnni.S @@ -38,7 +38,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x64c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 464], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. 
vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8-minmax-asm-aarch64-neondot-ld32.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8-minmax-asm-aarch64-neondot-ld32.S index 6b5abae3f7cb..f74e5b726255 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8-minmax-asm-aarch64-neondot-ld32.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-1x8-minmax-asm-aarch64-neondot-ld32.S @@ -25,8 +25,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c4__asm_aarch64_neondot_l outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with k_sum * input zero point. ldr q10, [x24] ldp q2, q3, [x5, 0] @@ -35,14 +35,11 @@ outer_loop: add x5, x5, 32 inner_loop: - ldr d2, [x3, x20] - ldp q6, q7, [x5, 0] - ldr q6, [x5, 0] - add x5, x5, 32 + ldr s2, [x3], 4 + ldp q6, q7, [x5], 32 sdot v12.4s, v6.16b, v2.4b[0] sdot v13.4s, v7.16b, v2.4b[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Convert from int32 to float. @@ -72,8 +69,8 @@ inner_loop: # Check whether full or partial store. cmp x1, 8 b.lo tail_4 - stp q12, q13, [x6] - add x6, x6, 32 + stp q12, q13, [x6], 32 + sub x3, x3, x2 sub x1, x1, 8 b.ne outer_loop @@ -81,15 +78,13 @@ inner_loop: tail_4: tbz x1, 2, tail_2 - str q12, [x6] + str q12, [x6], 16 mov v12.16b, v13.16b - add x6, x6, 16 tail_2: tbz x1, 1, tail_1 - str d12, [x6] - add x6, x6, 8 + str d12, [x6], 8 dup d12, v12.d[1] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16-minmax-asm-aarch64-neondot-ld32.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16-minmax-asm-aarch64-neondot-ld32.S index fbc5c40efad1..466642b8f274 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16-minmax-asm-aarch64-neondot-ld32.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16-minmax-asm-aarch64-neondot-ld32.S @@ -32,8 +32,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x16c4__asm_aarch64_neondot_ csel x13, x6, x13, LO outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with k_sum * input zero point. ldp q10, q11, [x24] ldp q2, q3, [x5, 0] @@ -49,12 +49,10 @@ outer_loop: add x5, x5, 64 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldp q6, q7, [x5, 0] - ldp q8, q9, [x5, 32] - ldr q8, [x5, 32] - add x5, x5, 64 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldp q6, q7, [x5], 32 + ldp q8, q9, [x5], 32 sdot v12.4s, v6.16b, v2.4b[0] sdot v13.4s, v6.16b, v3.4b[0] sdot v14.4s, v7.16b, v2.4b[0] @@ -63,8 +61,7 @@ inner_loop: sdot v17.4s, v8.16b, v3.4b[0] sdot v18.4s, v9.16b, v2.4b[0] sdot v19.4s, v9.16b, v3.4b[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Convert from int32 to float. @@ -132,12 +129,12 @@ inner_loop: # Check whether full or partial store. 
cmp x1, 16 b.lo tail_8 - stp q12, q14, [x6] - stp q16, q18, [x6, 32] - stp q13, q15, [x13] - stp q17, q19, [x13, 32] - add x6, x6, 64 - add x13, x13, 64 + stp q12, q14, [x6], 32 + stp q16, q18, [x6], 32 + stp q13, q15, [x13], 32 + stp q17, q19, [x13], 32 + sub x3, x3, x2 + sub x9, x9, x2 sub x1, x1, 16 b.ne outer_loop @@ -145,32 +142,26 @@ inner_loop: tail_8: tbz x1, 3, tail_4 - stp q12, q14, [x6] - stp q13, q15, [x13] + stp q12, q14, [x6], 32 + stp q13, q15, [x13], 32 mov v12.16b, v16.16b mov v14.16b, v18.16b mov v13.16b, v17.16b mov v15.16b, v19.16b - add x6, x6, 32 - add x13, x13, 32 tail_4: tbz x1, 2, tail_2 - str q12, [x6] - str q13, [x13] + str q12, [x6], 16 + str q13, [x13], 16 mov v12.16b, v14.16b mov v13.16b, v15.16b - add x6, x6, 16 - add x13, x13, 16 tail_2: tbz x1, 1, tail_1 - str d12, [x6] - str d13, [x13] - add x6, x6, 8 - add x13, x13, 8 + str d12, [x6], 8 + str d13, [x13], 8 dup d12, v12.d[1] dup d13, v13.d[1] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16-minmax-asm-amd64-avx512vnni.S index 34b63a18d390..a1d36fcaa02a 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x16-minmax-asm-amd64-avx512vnni.S @@ -50,7 +50,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x16c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 528], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x32-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x32-minmax-asm-amd64-avx512vnni.S index 955f2135499f..e925e817701d 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x32-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x32-minmax-asm-amd64-avx512vnni.S @@ -50,7 +50,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x32c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 528], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x64-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x64-minmax-asm-amd64-avx512vnni.S index 9dc74973e555..c7f7e099b888 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x64-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x64-minmax-asm-amd64-avx512vnni.S @@ -50,7 +50,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x64c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 528], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8-minmax-asm-aarch64-neondot-ld32.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8-minmax-asm-aarch64-neondot-ld32.S index 9992a9e82733..43b2d05e26e4 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8-minmax-asm-aarch64-neondot-ld32.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-2x8-minmax-asm-aarch64-neondot-ld32.S @@ -32,8 +32,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_2x8c4__asm_aarch64_neondot_l csel x13, x6, x13, LO outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. 
+ mov x20, x2 # Initialize accumulators with k_sum * input zero point. ldp q10, q11, [x24] ldp q2, q3, [x5, 0] @@ -44,17 +44,14 @@ outer_loop: add x5, x5, 32 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldp q6, q7, [x5, 0] - ldr q6, [x5, 0] - add x5, x5, 32 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldp q6, q7, [x5], 32 sdot v12.4s, v6.16b, v2.4b[0] sdot v13.4s, v6.16b, v3.4b[0] sdot v14.4s, v7.16b, v2.4b[0] sdot v15.4s, v7.16b, v3.4b[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Convert from int32 to float. @@ -96,10 +93,10 @@ inner_loop: # Check whether full or partial store. cmp x1, 8 b.lo tail_4 - stp q12, q14, [x6] - stp q13, q15, [x13] - add x6, x6, 32 - add x13, x13, 32 + stp q12, q14, [x6], 32 + stp q13, q15, [x13], 32 + sub x3, x3, x2 + sub x9, x9, x2 sub x1, x1, 8 b.ne outer_loop @@ -107,20 +104,16 @@ inner_loop: tail_4: tbz x1, 2, tail_2 - str q12, [x6] - str q13, [x13] + str q12, [x6], 16 + str q13, [x13], 16 mov v12.16b, v14.16b mov v13.16b, v15.16b - add x6, x6, 16 - add x13, x13, 16 tail_2: tbz x1, 1, tail_1 - str d12, [x6] - str d13, [x13] - add x6, x6, 8 - add x13, x13, 8 + str d12, [x6], 8 + str d13, [x13], 8 dup d12, v12.d[1] dup d13, v13.d[1] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16-minmax-asm-aarch64-neondot-ld32.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16-minmax-asm-aarch64-neondot-ld32.S index 0489c52ca7f5..e0df35013602 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16-minmax-asm-aarch64-neondot-ld32.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16-minmax-asm-aarch64-neondot-ld32.S @@ -36,8 +36,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x16c4__asm_aarch64_neondot_ csel x14, x13, x14, LS outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with k_sum * input zero point. ldp q10, q11, [x24] ldr q10, [x24] @@ -58,13 +58,11 @@ outer_loop: add x5, x5, 64 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldr d4, [x10, x20] - ldp q6, q7, [x5, 0] - ldp q8, q9, [x5, 32] - ldr q8, [x5, 32] - add x5, x5, 64 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldr s4, [x10], 4 + ldp q6, q7, [x5], 32 + ldp q8, q9, [x5], 32 sdot v12.4s, v6.16b, v2.4b[0] sdot v13.4s, v6.16b, v3.4b[0] sdot v14.4s, v6.16b, v4.4b[0] @@ -77,8 +75,7 @@ inner_loop: sdot v21.4s, v9.16b, v2.4b[0] sdot v22.4s, v9.16b, v3.4b[0] sdot v23.4s, v9.16b, v4.4b[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Convert from int32 to float. @@ -170,15 +167,15 @@ inner_loop: # Check whether full or partial store. 
cmp x1, 16 b.lo tail_8 - stp q12, q15, [x6] - stp q18, q21, [x6, 32] - stp q13, q16, [x13] - stp q19, q22, [x13, 32] - stp q14, q17, [x14] - stp q20, q23, [x14, 32] - add x6, x6, 64 - add x13, x13, 64 - add x14, x14, 64 + stp q12, q15, [x6], 32 + stp q18, q21, [x6], 32 + stp q13, q16, [x13], 32 + stp q19, q22, [x13], 32 + stp q14, q17, [x14], 32 + stp q20, q23, [x14], 32 + sub x3, x3, x2 + sub x9, x9, x2 + sub x10, x10, x2 sub x1, x1, 16 b.ne outer_loop @@ -186,41 +183,32 @@ inner_loop: tail_8: tbz x1, 3, tail_4 - stp q12, q15, [x6] - stp q13, q16, [x13] - stp q14, q17, [x14] + stp q12, q15, [x6], 32 + stp q13, q16, [x13], 32 + stp q14, q17, [x14], 32 mov v12.16b, v18.16b mov v15.16b, v21.16b mov v13.16b, v19.16b mov v16.16b, v22.16b mov v14.16b, v20.16b mov v17.16b, v23.16b - add x6, x6, 32 - add x13, x13, 32 - add x14, x14, 32 tail_4: tbz x1, 2, tail_2 - str q12, [x6] - str q13, [x13] - str q14, [x14] + str q12, [x6], 16 + str q13, [x13], 16 + str q14, [x14], 16 mov v12.16b, v15.16b mov v13.16b, v16.16b mov v14.16b, v17.16b - add x6, x6, 16 - add x13, x13, 16 - add x14, x14, 16 tail_2: tbz x1, 1, tail_1 - str d12, [x6] - str d13, [x13] - str d14, [x14] - add x6, x6, 8 - add x13, x13, 8 - add x14, x14, 8 + str d12, [x6], 8 + str d13, [x13], 8 + str d14, [x14], 8 dup d12, v12.d[1] dup d13, v13.d[1] dup d14, v14.d[1] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16-minmax-asm-amd64-avx512vnni.S index 7d65da61eb9f..ff0bedd1102f 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x16-minmax-asm-amd64-avx512vnni.S @@ -62,7 +62,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x16c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 592], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x32-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x32-minmax-asm-amd64-avx512vnni.S index 836ea2aaa287..1231fb19cd4e 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x32-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x32-minmax-asm-amd64-avx512vnni.S @@ -62,7 +62,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x32c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 592], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x64-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x64-minmax-asm-amd64-avx512vnni.S index 0382f92d2209..6518bd7b0c92 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x64-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x64-minmax-asm-amd64-avx512vnni.S @@ -62,7 +62,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x64c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 592], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. 
vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8-minmax-asm-aarch64-neondot-ld32.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8-minmax-asm-aarch64-neondot-ld32.S index 1a6057975222..dc270f1d7d74 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8-minmax-asm-aarch64-neondot-ld32.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-3x8-minmax-asm-aarch64-neondot-ld32.S @@ -36,8 +36,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c4__asm_aarch64_neondot_l csel x14, x13, x14, LS outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with k_sum * input zero point. ldp q10, q11, [x24] ldr q10, [x24] @@ -51,20 +51,17 @@ outer_loop: add x5, x5, 32 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldr d4, [x10, x20] - ldp q6, q7, [x5, 0] - ldr q6, [x5, 0] - add x5, x5, 32 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldr s4, [x10], 4 + ldp q6, q7, [x5], 32 sdot v12.4s, v6.16b, v2.4b[0] sdot v13.4s, v6.16b, v3.4b[0] sdot v14.4s, v6.16b, v4.4b[0] sdot v15.4s, v7.16b, v2.4b[0] sdot v16.4s, v7.16b, v3.4b[0] sdot v17.4s, v7.16b, v4.4b[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Convert from int32 to float. @@ -118,12 +115,12 @@ inner_loop: # Check whether full or partial store. cmp x1, 8 b.lo tail_4 - stp q12, q15, [x6] - stp q13, q16, [x13] - stp q14, q17, [x14] - add x6, x6, 32 - add x13, x13, 32 - add x14, x14, 32 + stp q12, q15, [x6], 32 + stp q13, q16, [x13], 32 + stp q14, q17, [x14], 32 + sub x3, x3, x2 + sub x9, x9, x2 + sub x10, x10, x2 sub x1, x1, 8 b.ne outer_loop @@ -131,25 +128,19 @@ inner_loop: tail_4: tbz x1, 2, tail_2 - str q12, [x6] - str q13, [x13] - str q14, [x14] + str q12, [x6], 16 + str q13, [x13], 16 + str q14, [x14], 16 mov v12.16b, v15.16b mov v13.16b, v16.16b mov v14.16b, v17.16b - add x6, x6, 16 - add x13, x13, 16 - add x14, x14, 16 tail_2: tbz x1, 1, tail_1 - str d12, [x6] - str d13, [x13] - str d14, [x14] - add x6, x6, 8 - add x13, x13, 8 - add x14, x14, 8 + str d12, [x6], 8 + str d13, [x13], 8 + str d14, [x14], 8 dup d12, v12.d[1] dup d13, v13.d[1] dup d14, v14.d[1] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16-minmax-asm-aarch64-neondot-ld32.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16-minmax-asm-aarch64-neondot-ld32.S index 724cb906f710..6150aafa1556 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16-minmax-asm-aarch64-neondot-ld32.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16-minmax-asm-aarch64-neondot-ld32.S @@ -42,8 +42,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__asm_aarch64_neondot_ csel x15, x14, x15, LO outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with k_sum * input zero point. ldp q10, q11, [x24] ldp q2, q3, [x5, 0] @@ -67,14 +67,12 @@ outer_loop: add x5, x5, 64 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldr d4, [x10, x20] - ldr d5, [x11, x20] - ldp q6, q7, [x5, 0] - ldp q8, q9, [x5, 32] - ldr q8, [x5, 32] - add x5, x5, 64 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldr s4, [x10], 4 + ldr s5, [x11], 4 + ldp q6, q7, [x5], 32 + ldp q8, q9, [x5], 32 sdot v12.4s, v6.16b, v2.4b[0] sdot v13.4s, v6.16b, v3.4b[0] sdot v14.4s, v6.16b, v4.4b[0] @@ -91,8 +89,7 @@ inner_loop: sdot v25.4s, v9.16b, v3.4b[0] sdot v26.4s, v9.16b, v4.4b[0] sdot v27.4s, v9.16b, v5.4b[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Convert from int32 to float. 
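The aarch64 hunks above all make the same loop-shape change, so a consolidated
before/after sketch may be easier to read than the per-kernel diffs. Register
roles follow the neondot kernels above (x3 = input a, x5 = packed weights,
x2 = kc, x20 = k counter); the body is trimmed to one row and two dot products.

    # Before: count-up byte offset, indexed loads, explicit pointer bump.
        eor x20, x20, x20                # k = 0
    inner_loop:
        ldr d2, [x3, x20]                # input base never moves; index by k
        ldp q6, q7, [x5, 0]              # 32 bytes of weights
        add x5, x5, 32                   # separate pointer increment
        sdot v12.4s, v6.16b, v2.4b[0]
        sdot v13.4s, v7.16b, v2.4b[0]
        add x20, x20, 4                  # k += 4
        cmp x2, x20                      # compare against kc
        bne inner_loop

    # After: count-down counter, post-indexed loads.
        mov x20, x2                      # k starts at kc and counts down
    inner_loop:
        ldr s2, [x3], 4                  # load 4 input bytes, then a += 4
        ldp q6, q7, [x5], 32             # load 32 weight bytes, then w += 32
        sdot v12.4s, v6.16b, v2.4b[0]
        sdot v13.4s, v7.16b, v2.4b[0]
        subs x20, x20, 4                 # k -= 4 and set flags in one instruction
        bne inner_loop

Post-indexed addressing folds each pointer increment into its load, and subs
replaces the add/cmp pair, so every iteration sheds a few integer instructions.
The input load also narrows from d2 (8 bytes) to s2 (4 bytes): the sdot
instructions only ever consume v2.4b[0], so the wider load was wasted bandwidth.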
@@ -208,18 +205,18 @@ inner_loop: # Check whether full or partial store. cmp x1, 16 b.lo tail_8 - stp q12, q16, [x6] - stp q20, q24, [x6, 32] - stp q13, q17, [x13] - stp q21, q25, [x13, 32] - stp q14, q18, [x14] - stp q22, q26, [x14, 32] - stp q15, q19, [x15] - stp q23, q27, [x15, 32] - add x6, x6, 64 - add x13, x13, 64 - add x14, x14, 64 - add x15, x15, 64 + stp q12, q16, [x6], 32 + stp q20, q24, [x6], 32 + stp q13, q17, [x13], 32 + stp q21, q25, [x13], 32 + stp q14, q18, [x14], 32 + stp q22, q26, [x14], 32 + stp q15, q19, [x15], 32 + stp q23, q27, [x15], 32 + sub x3, x3, x2 + sub x9, x9, x2 + sub x10, x10, x2 + sub x11, x11, x2 sub x1, x1, 16 b.ne outer_loop @@ -227,10 +224,10 @@ inner_loop: tail_8: tbz x1, 3, tail_4 - stp q12, q16, [x6] - stp q13, q17, [x13] - stp q14, q18, [x14] - stp q15, q19, [x15] + stp q12, q16, [x6], 32 + stp q13, q17, [x13], 32 + stp q14, q18, [x14], 32 + stp q15, q19, [x15], 32 mov v12.16b, v20.16b mov v16.16b, v24.16b mov v13.16b, v21.16b @@ -239,38 +236,26 @@ tail_8: mov v18.16b, v26.16b mov v15.16b, v23.16b mov v19.16b, v27.16b - add x6, x6, 32 - add x13, x13, 32 - add x14, x14, 32 - add x15, x15, 32 tail_4: tbz x1, 2, tail_2 - str q12, [x6] - str q13, [x13] - str q14, [x14] - str q15, [x15] + str q12, [x6], 16 + str q13, [x13], 16 + str q14, [x14], 16 + str q15, [x15], 16 mov v12.16b, v16.16b mov v13.16b, v17.16b mov v14.16b, v18.16b mov v15.16b, v19.16b - add x6, x6, 16 - add x13, x13, 16 - add x14, x14, 16 - add x15, x15, 16 tail_2: tbz x1, 1, tail_1 - str d12, [x6] - str d13, [x13] - str d14, [x14] - str d15, [x15] - add x6, x6, 8 - add x13, x13, 8 - add x14, x14, 8 - add x15, x15, 8 + str d12, [x6], 8 + str d13, [x13], 8 + str d14, [x14], 8 + str d15, [x15], 8 dup d12, v12.d[1] dup d13, v13.d[1] dup d14, v14.d[1] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16-minmax-asm-amd64-avx512vnni.S index 02e801e11dcc..088e175bf915 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x16-minmax-asm-amd64-avx512vnni.S @@ -74,7 +74,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x16c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 656], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x32-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x32-minmax-asm-amd64-avx512vnni.S index 3e7674a87fea..eed325b907c7 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x32-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x32-minmax-asm-amd64-avx512vnni.S @@ -74,7 +74,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x32c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 656], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. 
vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x64-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x64-minmax-asm-amd64-avx512vnni.S index 09d697c5aebb..709c05c9e3d9 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x64-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x64-minmax-asm-amd64-avx512vnni.S @@ -74,7 +74,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x64c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 656], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8-minmax-asm-aarch64-neondot-ld32.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8-minmax-asm-aarch64-neondot-ld32.S index 449fdb0cc152..4150079a1668 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8-minmax-asm-aarch64-neondot-ld32.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-4x8-minmax-asm-aarch64-neondot-ld32.S @@ -42,8 +42,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c4__asm_aarch64_neondot_l csel x15, x14, x15, LO outer_loop: - # Zero k counter. - eor x20, x20, x20 + # Initialize k counter. + mov x20, x2 # Initialize accumulators with k_sum * input zero point. ldp q10, q11, [x24] ldp q2, q3, [x5, 0] @@ -58,13 +58,11 @@ outer_loop: add x5, x5, 32 inner_loop: - ldr d2, [x3, x20] - ldr d3, [x9, x20] - ldr d4, [x10, x20] - ldr d5, [x11, x20] - ldp q6, q7, [x5, 0] - ldr q6, [x5, 0] - add x5, x5, 32 + ldr s2, [x3], 4 + ldr s3, [x9], 4 + ldr s4, [x10], 4 + ldr s5, [x11], 4 + ldp q6, q7, [x5], 32 sdot v12.4s, v6.16b, v2.4b[0] sdot v13.4s, v6.16b, v3.4b[0] sdot v14.4s, v6.16b, v4.4b[0] @@ -73,8 +71,7 @@ inner_loop: sdot v17.4s, v7.16b, v3.4b[0] sdot v18.4s, v7.16b, v4.4b[0] sdot v19.4s, v7.16b, v5.4b[0] - add x20, x20, 4 - cmp x2, x20 + subs x20, x20, 4 bne inner_loop # Convert from int32 to float. @@ -140,14 +137,14 @@ inner_loop: # Check whether full or partial store. 
cmp x1, 8 b.lo tail_4 - stp q12, q16, [x6] - stp q13, q17, [x13] - stp q14, q18, [x14] - stp q15, q19, [x15] - add x6, x6, 32 - add x13, x13, 32 - add x14, x14, 32 - add x15, x15, 32 + stp q12, q16, [x6], 32 + stp q13, q17, [x13], 32 + stp q14, q18, [x14], 32 + stp q15, q19, [x15], 32 + sub x3, x3, x2 + sub x9, x9, x2 + sub x10, x10, x2 + sub x11, x11, x2 sub x1, x1, 8 b.ne outer_loop @@ -155,30 +152,22 @@ inner_loop: tail_4: tbz x1, 2, tail_2 - str q12, [x6] - str q13, [x13] - str q14, [x14] - str q15, [x15] + str q12, [x6], 16 + str q13, [x13], 16 + str q14, [x14], 16 + str q15, [x15], 16 mov v12.16b, v16.16b mov v13.16b, v17.16b mov v14.16b, v18.16b mov v15.16b, v19.16b - add x6, x6, 16 - add x13, x13, 16 - add x14, x14, 16 - add x15, x15, 16 tail_2: tbz x1, 1, tail_1 - str d12, [x6] - str d13, [x13] - str d14, [x14] - str d15, [x15] - add x6, x6, 8 - add x13, x13, 8 - add x14, x14, 8 - add x15, x15, 8 + str d12, [x6], 8 + str d13, [x13], 8 + str d14, [x14], 8 + str d15, [x15], 8 dup d12, v12.d[1] dup d13, v13.d[1] dup d14, v14.d[1] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x16-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x16-minmax-asm-amd64-avx512vnni.S index 7e6dfb52b2bd..87080cb6b9c0 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x16-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x16-minmax-asm-amd64-avx512vnni.S @@ -86,7 +86,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x16c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 720], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x32-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x32-minmax-asm-amd64-avx512vnni.S index 505d355c8ab5..1fe64f5ee563 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x32-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x32-minmax-asm-amd64-avx512vnni.S @@ -86,7 +86,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x32c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 720], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x64-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x64-minmax-asm-amd64-avx512vnni.S index 68e56d0a1bae..0744f7740160 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x64-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-5x64-minmax-asm-amd64-avx512vnni.S @@ -86,7 +86,7 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_5x64c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 720], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Initialize accumulators with k_sum * input zero point. 
vmovaps zmm6, [r9 + 0] diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x16-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x16-minmax-asm-amd64-avx512vnni.S index 3fd6919e6d4e..161931d21684 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x16-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x16-minmax-asm-amd64-avx512vnni.S @@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16c4__asm_amd64_avx512vnni and rdx, -4 sub rsp, 848 # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Load quantization params pointer from stack mov r11, [rsp + 936] @@ -117,15 +117,15 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x16c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 784], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] @@ -197,12 +197,12 @@ inner_loop: vmaxps zmm17, zmm0, zmm17 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] # Check whether full or partial store. cmp rcx, 16 @@ -222,12 +222,12 @@ inner_loop: add r10, 64 # Write output pointers to the stack. 
- mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 sub rcx, 16 jne outer_loop diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x32-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x32-minmax-asm-amd64-avx512vnni.S index 176fb27f7bcf..392f5e81e14c 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x32-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-6x32-minmax-asm-amd64-avx512vnni.S @@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x32c4__asm_amd64_avx512vnni and rdx, -4 sub rsp, 848 # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x32c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x32c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x32c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x32c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x32c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Load quantization params pointer from stack mov r11, [rsp + 936] @@ -117,15 +117,15 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_6x32c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 784], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] @@ -243,12 +243,12 @@ inner_loop: vmaxps zmm23, zmm0, zmm23 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] # Check whether full or partial store. 
cmp rcx, 32 @@ -274,12 +274,12 @@ inner_loop: add r10, 128 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 sub rcx, 32 jne outer_loop diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x16-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x16-minmax-asm-amd64-avx512vnni.S index 7611b47eb4f7..522dd0d1ab46 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x16-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x16-minmax-asm-amd64-avx512vnni.S @@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__asm_amd64_avx512vnni and rdx, -4 sub rsp, 912 # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -104,8 +104,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Load quantization params pointer from stack mov r11, [rsp + 1000] @@ -132,16 +132,16 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 848], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] # Initialize accumulators with k_sum * input zero point. 
vmovaps zmm6, [r9 + 0] @@ -221,13 +221,13 @@ inner_loop: vmaxps zmm18, zmm0, zmm18 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] # Check whether full or partial store. cmp rcx, 16 @@ -249,13 +249,13 @@ inner_loop: add r13, 64 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 sub rcx, 16 jne outer_loop diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x32-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x32-minmax-asm-amd64-avx512vnni.S index c878899f599d..dc51096698d7 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x32-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-7x32-minmax-asm-amd64-avx512vnni.S @@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x32c4__asm_amd64_avx512vnni and rdx, -4 sub rsp, 912 # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x32c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x32c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x32c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x32c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x32c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -104,8 +104,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x32c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Load quantization params pointer from stack mov r11, [rsp + 1000] @@ -132,16 +132,16 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x32c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 848], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. 
mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] @@ -274,13 +274,13 @@ inner_loop: vmaxps zmm25, zmm0, zmm25 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] # Check whether full or partial store. cmp rcx, 32 @@ -309,13 +309,13 @@ inner_loop: add r13, 128 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 sub rcx, 32 jne outer_loop diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x16-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x16-minmax-asm-amd64-avx512vnni.S index 036cea89f4a5..0f27cae0e2ef 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x16-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x16-minmax-asm-amd64-avx512vnni.S @@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__asm_amd64_avx512vnni and rdx, -4 sub rsp, 976 # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. 
- mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -104,8 +104,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -116,8 +116,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Load quantization params pointer from stack mov r11, [rsp + 1064] @@ -147,17 +147,17 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x16c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 912], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] @@ -245,14 +245,14 @@ inner_loop: vmaxps zmm19, zmm0, zmm19 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] # Check whether full or partial store. cmp rcx, 16 @@ -276,14 +276,14 @@ inner_loop: add rbx, 64 # Write output pointers to the stack. 
- mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx sub rcx, 16 jne outer_loop diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x32-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x32-minmax-asm-amd64-avx512vnni.S index ed763c61c829..80eea889ec2c 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x32-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-8x32-minmax-asm-amd64-avx512vnni.S @@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x32c4__asm_amd64_avx512vnni and rdx, -4 sub rsp, 976 # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. - mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x32c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x32c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x32c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x32c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x32c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -104,8 +104,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x32c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -116,8 +116,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x32c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Load quantization params pointer from stack mov r11, [rsp + 1064] @@ -147,17 +147,17 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_8x32c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 912], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. 
- mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] @@ -305,14 +305,14 @@ inner_loop: vmaxps zmm27, zmm0, zmm27 # Pop output pointers from the stack. - mov rsi, [rsp + 136] - mov rax, [rsp + 152] - mov r15, [rsp + 168] - mov r14, [rsp + 184] - mov r12, [rsp + 200] - mov r10, [rsp + 216] - mov r13, [rsp + 232] - mov rbx, [rsp + 248] + mov rsi, [rsp - 136] + mov rax, [rsp - 152] + mov r15, [rsp - 168] + mov r14, [rsp - 184] + mov r12, [rsp - 200] + mov r10, [rsp - 216] + mov r13, [rsp - 232] + mov rbx, [rsp - 248] # Check whether full or partial store. cmp rcx, 32 @@ -344,14 +344,14 @@ inner_loop: add rbx, 128 # Write output pointers to the stack. - mov [rsp + 136], rsi - mov [rsp + 152], rax - mov [rsp + 168], r15 - mov [rsp + 184], r14 - mov [rsp + 200], r12 - mov [rsp + 216], r10 - mov [rsp + 232], r13 - mov [rsp + 248], rbx + mov [rsp - 136], rsi + mov [rsp - 152], rax + mov [rsp - 168], r15 + mov [rsp - 184], r14 + mov [rsp - 200], r12 + mov [rsp - 216], r10 + mov [rsp - 232], r13 + mov [rsp - 248], rbx sub rcx, 32 jne outer_loop diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-9x16-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-9x16-minmax-asm-amd64-avx512vnni.S index 89142cc0ed71..023f5837e7ef 100644 --- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-9x16-minmax-asm-amd64-avx512vnni.S +++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-9x16-minmax-asm-amd64-avx512vnni.S @@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__asm_amd64_avx512vnni and rdx, -4 sub rsp, 1040 # Write rsi (a pointer) to the stack as we need the register. - mov [rsp + 128], rsi + mov [rsp - 128], rsi # Write r10 (c pointer) to the stack as we need the register. 
- mov [rsp + 136], r10 + mov [rsp - 136], r10 # Clamp a & c pointers if mr <= 1 mov rax, rsi @@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 144], rax - mov [rsp + 152], r13 + mov [rsp - 144], rax + mov [rsp - 152], r13 # Clamp a & c pointers if mr <= 2 mov rsi, rax @@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 160], rsi - mov [rsp + 168], r10 + mov [rsp - 160], rsi + mov [rsp - 168], r10 # Clamp a & c pointers if mr <= 3 mov rax, rsi @@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 176], rax - mov [rsp + 184], r13 + mov [rsp - 176], rax + mov [rsp - 184], r13 # Clamp a & c pointers if mr <= 4 mov rsi, rax @@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 192], rsi - mov [rsp + 200], r10 + mov [rsp - 192], rsi + mov [rsp - 200], r10 # Clamp a & c pointers if mr <= 5 mov rax, rsi @@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 208], rax - mov [rsp + 216], r13 + mov [rsp - 208], rax + mov [rsp - 216], r13 # Clamp a & c pointers if mr <= 6 mov rsi, rax @@ -104,8 +104,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 224], rsi - mov [rsp + 232], r10 + mov [rsp - 224], rsi + mov [rsp - 232], r10 # Clamp a & c pointers if mr <= 7 mov rax, rsi @@ -116,8 +116,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__asm_amd64_avx512vnni cmovle rax, rsi cmovle r13, r10 - mov [rsp + 240], rax - mov [rsp + 248], r13 + mov [rsp - 240], rax + mov [rsp - 248], r13 # Clamp a & c pointers if mr <= 8 mov rsi, rax @@ -128,8 +128,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__asm_amd64_avx512vnni cmovle rsi, rax cmovle r10, r13 - mov [rsp + 256], rsi - mov [rsp + 264], r10 + mov [rsp - 256], rsi + mov [rsp - 264], r10 # Load quantization params pointer from stack mov r11, [rsp + 1128] @@ -162,18 +162,18 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x16c4__asm_amd64_avx512vnni vmovups zmmword ptr [rsp + 976], zmm6 outer_loop: - # Zero k counter. + # Initialize k counter. mov r11, 0 # Read a pointers from stack into GP registers. - mov rsi, [rsp + 128] - mov rax, [rsp + 144] - mov r15, [rsp + 160] - mov r14, [rsp + 176] - mov r12, [rsp + 192] - mov r10, [rsp + 208] - mov r13, [rsp + 224] - mov rbx, [rsp + 240] - mov rbp, [rsp + 256] + mov rsi, [rsp - 128] + mov rax, [rsp - 144] + mov r15, [rsp - 160] + mov r14, [rsp - 176] + mov r12, [rsp - 192] + mov r10, [rsp - 208] + mov r13, [rsp - 224] + mov rbx, [rsp - 240] + mov rbp, [rsp - 256] # Initialize accumulators with k_sum * input zero point. vmovaps zmm6, [r9 + 0] @@ -269,15 +269,15 @@ inner_loop: vmaxps zmm20, zmm0, zmm20 # Pop output pointers from the stack. 
-      mov rsi, [rsp + 136]
-      mov rax, [rsp + 152]
-      mov r15, [rsp + 168]
-      mov r14, [rsp + 184]
-      mov r12, [rsp + 200]
-      mov r10, [rsp + 216]
-      mov r13, [rsp + 232]
-      mov rbx, [rsp + 248]
-      mov rbp, [rsp + 264]
+      mov rsi, [rsp - 136]
+      mov rax, [rsp - 152]
+      mov r15, [rsp - 168]
+      mov r14, [rsp - 184]
+      mov r12, [rsp - 200]
+      mov r10, [rsp - 216]
+      mov r13, [rsp - 232]
+      mov rbx, [rsp - 248]
+      mov rbp, [rsp - 264]

       # Check whether full or partial store.
       cmp rcx, 16
@@ -303,15 +303,15 @@ inner_loop:
       add rbp, 64

       # Write output pointers to the stack.
-      mov [rsp + 136], rsi
-      mov [rsp + 152], rax
-      mov [rsp + 168], r15
-      mov [rsp + 184], r14
-      mov [rsp + 200], r12
-      mov [rsp + 216], r10
-      mov [rsp + 232], r13
-      mov [rsp + 248], rbx
-      mov [rsp + 264], rbp
+      mov [rsp - 136], rsi
+      mov [rsp - 152], rax
+      mov [rsp - 168], r15
+      mov [rsp - 184], r14
+      mov [rsp - 200], r12
+      mov [rsp - 216], r10
+      mov [rsp - 232], r13
+      mov [rsp - 248], rbx
+      mov [rsp - 264], rbp

       sub rcx, 16
       jne outer_loop
diff --git a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-9x32-minmax-asm-amd64-avx512vnni.S b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-9x32-minmax-asm-amd64-avx512vnni.S
index dbae58d69b35..217f56b415e5 100644
--- a/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-9x32-minmax-asm-amd64-avx512vnni.S
+++ b/src/qd8-f32-qc8w-gemm/gen/qd8-f32-qc8w-gemm-9x32-minmax-asm-amd64-avx512vnni.S
@@ -31,9 +31,9 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x32c4__asm_amd64_avx512vnni
       and rdx, -4
       sub rsp, 1040
       # Write rsi (a pointer) to the stack as we need the register.
-      mov [rsp + 128], rsi
+      mov [rsp - 128], rsi
       # Write r10 (c pointer) to the stack as we need the register.
-      mov [rsp + 136], r10
+      mov [rsp - 136], r10

       # Clamp a & c pointers if mr <= 1
       mov rax, rsi
@@ -44,8 +44,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x32c4__asm_amd64_avx512vnni
       cmovle rax, rsi
       cmovle r13, r10

-      mov [rsp + 144], rax
-      mov [rsp + 152], r13
+      mov [rsp - 144], rax
+      mov [rsp - 152], r13

       # Clamp a & c pointers if mr <= 2
       mov rsi, rax
@@ -56,8 +56,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x32c4__asm_amd64_avx512vnni
       cmovle rsi, rax
       cmovle r10, r13

-      mov [rsp + 160], rsi
-      mov [rsp + 168], r10
+      mov [rsp - 160], rsi
+      mov [rsp - 168], r10

       # Clamp a & c pointers if mr <= 3
       mov rax, rsi
@@ -68,8 +68,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x32c4__asm_amd64_avx512vnni
       cmovle rax, rsi
       cmovle r13, r10

-      mov [rsp + 176], rax
-      mov [rsp + 184], r13
+      mov [rsp - 176], rax
+      mov [rsp - 184], r13

       # Clamp a & c pointers if mr <= 4
       mov rsi, rax
@@ -80,8 +80,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x32c4__asm_amd64_avx512vnni
       cmovle rsi, rax
       cmovle r10, r13

-      mov [rsp + 192], rsi
-      mov [rsp + 200], r10
+      mov [rsp - 192], rsi
+      mov [rsp - 200], r10

       # Clamp a & c pointers if mr <= 5
       mov rax, rsi
@@ -92,8 +92,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x32c4__asm_amd64_avx512vnni
       cmovle rax, rsi
       cmovle r13, r10

-      mov [rsp + 208], rax
-      mov [rsp + 216], r13
+      mov [rsp - 208], rax
+      mov [rsp - 216], r13

       # Clamp a & c pointers if mr <= 6
       mov rsi, rax
@@ -104,8 +104,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x32c4__asm_amd64_avx512vnni
       cmovle rsi, rax
       cmovle r10, r13

-      mov [rsp + 224], rsi
-      mov [rsp + 232], r10
+      mov [rsp - 224], rsi
+      mov [rsp - 232], r10

       # Clamp a & c pointers if mr <= 7
       mov rax, rsi
@@ -116,8 +116,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x32c4__asm_amd64_avx512vnni
       cmovle rax, rsi
       cmovle r13, r10

-      mov [rsp + 240], rax
-      mov [rsp + 248], r13
+      mov [rsp - 240], rax
+      mov [rsp - 248], r13

       # Clamp a & c pointers if mr <= 8
       mov rsi, rax
@@ -128,8 +128,8 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x32c4__asm_amd64_avx512vnni
       cmovle rsi, rax
       cmovle r10, r13

-      mov [rsp + 256], rsi
-      mov [rsp + 264], r10
+      mov [rsp - 256], rsi
+      mov [rsp - 264], r10

       # Load quantization params pointer from stack
       mov r11, [rsp + 1128]
@@ -162,18 +162,18 @@ BEGIN_FUNCTION xnn_qd8_f32_qc8w_gemm_minmax_ukernel_9x32c4__asm_amd64_avx512vnni
       vmovups zmmword ptr [rsp + 976], zmm6

 outer_loop:
-      # Zero k counter.
+      # Initialize k counter.
       mov r11, 0
       # Read a pointers from stack into GP registers.
-      mov rsi, [rsp + 128]
-      mov rax, [rsp + 144]
-      mov r15, [rsp + 160]
-      mov r14, [rsp + 176]
-      mov r12, [rsp + 192]
-      mov r10, [rsp + 208]
-      mov r13, [rsp + 224]
-      mov rbx, [rsp + 240]
-      mov rbp, [rsp + 256]
+      mov rsi, [rsp - 128]
+      mov rax, [rsp - 144]
+      mov r15, [rsp - 160]
+      mov r14, [rsp - 176]
+      mov r12, [rsp - 192]
+      mov r10, [rsp - 208]
+      mov r13, [rsp - 224]
+      mov rbx, [rsp - 240]
+      mov rbp, [rsp - 256]

       # Initialize accumulators with k_sum * input zero point.
       vmovaps zmm6, [r9 + 0]
@@ -336,15 +336,15 @@ inner_loop:
       vmaxps zmm29, zmm0, zmm29

       # Pop output pointers from the stack.
-      mov rsi, [rsp + 136]
-      mov rax, [rsp + 152]
-      mov r15, [rsp + 168]
-      mov r14, [rsp + 184]
-      mov r12, [rsp + 200]
-      mov r10, [rsp + 216]
-      mov r13, [rsp + 232]
-      mov rbx, [rsp + 248]
-      mov rbp, [rsp + 264]
+      mov rsi, [rsp - 136]
+      mov rax, [rsp - 152]
+      mov r15, [rsp - 168]
+      mov r14, [rsp - 184]
+      mov r12, [rsp - 200]
+      mov r10, [rsp - 216]
+      mov r13, [rsp - 232]
+      mov rbx, [rsp - 248]
+      mov rbp, [rsp - 264]

       # Check whether full or partial store.
       cmp rcx, 32
@@ -379,15 +379,15 @@ inner_loop:
       add rbp, 128

       # Write output pointers to the stack.
-      mov [rsp + 136], rsi
-      mov [rsp + 152], rax
-      mov [rsp + 168], r15
-      mov [rsp + 184], r14
-      mov [rsp + 200], r12
-      mov [rsp + 216], r10
-      mov [rsp + 232], r13
-      mov [rsp + 248], rbx
-      mov [rsp + 264], rbp
+      mov [rsp - 136], rsi
+      mov [rsp - 152], rax
+      mov [rsp - 168], r15
+      mov [rsp - 184], r14
+      mov [rsp - 200], r12
+      mov [rsp - 216], r10
+      mov [rsp - 232], r13
+      mov [rsp - 248], rbx
+      mov [rsp - 264], rbp

       sub rcx, 32
       jne outer_loop
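
Note for readers tracing the renumbering through these kernels: every pointer
spill slot written in the prologue and re-read in outer_loop moves in lockstep
from [rsp + N] to [rsp - N], while loads of true stack arguments (mov r11,
[rsp + 1128]) and the spilled vector constants (vmovups zmmword ptr
[rsp + 976], zmm6) keep their positive offsets. The slots being renumbered are
fed by the per-row pointer clamp that repeats above for mr <= 1 through
mr <= 8. A minimal sketch of one clamp step follows; the add/cmp lines and the
roles of r8 (a_stride), r11 (cm_stride), and rdi (mr) are outside the quoted
hunks and are assumptions here, not verbatim kernel code:

      .intel_syntax noprefix
      # Row 1's a/c pointers are row 0's plus a stride; when mr <= 1 they
      # are clamped back to row 0, so out-of-range rows alias the last
      # valid row instead of running past the input/output buffers.
      mov rax, rsi          # candidate a pointer for row 1
      add rax, r8           # r8 assumed to hold a_stride
      mov r13, r10          # candidate c pointer for row 1
      add r13, r11          # r11 assumed to hold cm_stride
      cmp rdi, 1            # rdi assumed to hold mr
      cmovle rax, rsi       # mr <= 1: keep row 0's a pointer
      cmovle r13, r10       # mr <= 1: keep row 0's c pointer
      mov [rsp - 144], rax  # spill to the slots reloaded in outer_loop
      mov [rsp - 152], r13

Because each out-of-range row aliases a valid one, outer_loop can reload all
nine pointers and compute all nine rows unconditionally; the duplicate rows
simply rewrite the same output.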