From cf0920b16f51009e9a198b3874de0a50447890a3 Mon Sep 17 00:00:00 2001 From: Tiago Oliveira Date: Wed, 26 Jun 2024 07:43:26 +0100 Subject: [PATCH 1/7] sct: crypto_hash/sha256/amd64/ref --- src/crypto_hash/sha256/amd64/ref/hash.jazz | 3 +++ src/crypto_hash/sha256/amd64/ref/sha256.jinc | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/crypto_hash/sha256/amd64/ref/hash.jazz b/src/crypto_hash/sha256/amd64/ref/hash.jazz index bed68245..2a04a350 100644 --- a/src/crypto_hash/sha256/amd64/ref/hash.jazz +++ b/src/crypto_hash/sha256/amd64/ref/hash.jazz @@ -3,6 +3,9 @@ require "sha256.jinc" export fn jade_hash_sha256_amd64_ref(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha256_ref(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha256/amd64/ref/sha256.jinc b/src/crypto_hash/sha256/amd64/ref/sha256.jinc index fa7497e4..c5a83de4 100644 --- a/src/crypto_hash/sha256/amd64/ref/sha256.jinc +++ b/src/crypto_hash/sha256/amd64/ref/sha256.jinc @@ -192,7 +192,7 @@ fn _blocks_0_ref(reg ptr u32[8] _H, reg u64 in inlen) -> reg ptr u32[8], reg u64 stack ptr u32[8] Hp; reg ptr u32[8] H; reg u64 tr; - stack u64 in_s; + #mmx reg u64 in_s; Kp = SHA256_K; Hp = _H; @@ -275,9 +275,9 @@ fn _blocks_1_ref(reg ptr u32[8] _H, reg ptr u32[32] sblocks, reg u64 nblocks) -> reg ptr u32[64] Kp; stack ptr u32[8] Hp; reg ptr u32[8] H; - stack ptr u32[32] s_sblocks; + #mmx reg ptr u32[32] s_sblocks; reg u64 i oblocks tr; - stack u64 s_i; + #mmx reg u64 s_i; Kp = SHA256_K; Hp = _H; @@ -395,7 +395,7 @@ inline fn __lastblocks_ref(reg u64 in inlen bits) -> stack u32[32], reg u64 inline fn __sha256_ref(reg u64 out in inlen) { reg u64 bits nblocks; - stack u64 s_out s_bits; + #mmx reg u64 s_out s_bits; stack u32[8] H; reg ptr u32[8] Hp; stack u32[32] sblocks; From 36658f181bbfc51d08c2d30ca73af96b743eda62 Mon Sep 17 00:00:00 2001 From: Tiago Oliveira Date: Wed, 26 Jun 2024 07:45:17 +0100 Subject: [PATCH 2/7] sct: crypto_hash/sha512/amd64/ref --- src/crypto_hash/sha512/amd64/ref/hash.jazz | 3 +++ src/crypto_hash/sha512/amd64/ref/sha512.jinc | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/crypto_hash/sha512/amd64/ref/hash.jazz b/src/crypto_hash/sha512/amd64/ref/hash.jazz index 76212246..9990d2dd 100644 --- a/src/crypto_hash/sha512/amd64/ref/hash.jazz +++ b/src/crypto_hash/sha512/amd64/ref/hash.jazz @@ -3,6 +3,9 @@ require "sha512.jinc" export fn jade_hash_sha512_amd64_ref(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha512_ref(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha512/amd64/ref/sha512.jinc b/src/crypto_hash/sha512/amd64/ref/sha512.jinc index 184af39b..15c49814 100644 --- a/src/crypto_hash/sha512/amd64/ref/sha512.jinc +++ b/src/crypto_hash/sha512/amd64/ref/sha512.jinc @@ -192,7 +192,7 @@ fn _blocks_0_ref(reg ptr u64[8] _H, reg u64 in inlen) -> reg ptr u64[8], reg u64 stack ptr u64[8] Hp; reg ptr u64[8] H; reg u64 tr; - stack u64 in_s; + #mmx reg u64 in_s; Kp = SHA512_K; Hp = _H; @@ -275,9 +275,9 @@ fn _blocks_1_ref(reg ptr u64[8] _H, reg ptr u64[32] sblocks, reg u64 nblocks) -> reg ptr u64[80] Kp; stack ptr u64[8] Hp; reg ptr u64[8] H; - stack ptr u64[32] s_sblocks; + #mmx reg ptr u64[32] s_sblocks; reg u64 i oblocks tr; - stack u64 s_i; + #mmx reg u64 s_i; Kp = SHA512_K; Hp = _H; @@ -395,7 +395,7 @@ inline fn __lastblocks_ref(reg u64 in inlen bits) -> stack u64[32], reg u64 inline fn __sha512_ref(reg u64 out in inlen) { reg u64 bits nblocks; - stack u64 s_out s_bits; + #mmx reg u64 s_out s_bits; stack u64[8] H; reg ptr u64[8] Hp; stack u64[32] sblocks; From 9268d1ca24059bf0d41fb69401aa728d7bb80165 Mon Sep 17 00:00:00 2001 From: Tiago Oliveira Date: Wed, 26 Jun 2024 08:15:47 +0100 Subject: [PATCH 3/7] sct: crypto_hash sha3-* ref1 --- .../keccak/keccak1600/amd64/ref1/keccak1600.jinc | 12 ++++++------ src/crypto_hash/sha3-224/amd64/ref1/hash.jazz | 3 +++ src/crypto_hash/sha3-256/amd64/ref1/hash.jazz | 3 +++ src/crypto_hash/sha3-384/amd64/ref1/hash.jazz | 3 +++ src/crypto_hash/sha3-512/amd64/ref1/hash.jazz | 3 +++ 5 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/common/keccak/keccak1600/amd64/ref1/keccak1600.jinc b/src/common/keccak/keccak1600/amd64/ref1/keccak1600.jinc index c6dcf710..6d18b83e 100644 --- a/src/common/keccak/keccak1600/amd64/ref1/keccak1600.jinc +++ b/src/common/keccak/keccak1600/amd64/ref1/keccak1600.jinc @@ -87,7 +87,7 @@ inline fn __absorb_ref1( reg u64 rate // rate already in bytes -- it is returned bc of spills ) -> reg ptr u64[25], reg u64 { - stack u64 s_in s_inlen s_rate; + #mmx reg u64 s_in s_inlen s_rate; reg u8 trail_byte; // intermediate blocks @@ -173,13 +173,13 @@ inline fn __xtr_bytes_ref1( inline fn __squeeze_ref1( reg mut ptr u64[25] state, - stack u64 s_out, - reg u64 outlen, - reg u64 rate + #mmx reg u64 s_out, + reg u64 outlen, + reg u64 rate ) { reg u64 out; - stack u64 s_outlen s_rate; + #mmx reg u64 s_outlen s_rate; // intermediate blocks while ( outlen > rate ) @@ -212,7 +212,7 @@ inline fn __keccak1600_ref1(reg u64 out outlen in inlen, reg u8 trail_byte, reg { stack u64[25] _state; reg ptr u64[25] state; - stack u64 s_out s_outlen; + #mmx reg u64 s_out s_outlen; stack u8 s_trail_byte; s_out = out; diff --git a/src/crypto_hash/sha3-224/amd64/ref1/hash.jazz b/src/crypto_hash/sha3-224/amd64/ref1/hash.jazz index df9387c2..6411abba 100644 --- a/src/crypto_hash/sha3-224/amd64/ref1/hash.jazz +++ b/src/crypto_hash/sha3-224/amd64/ref1/hash.jazz @@ -3,6 +3,9 @@ require "sha3-224.jinc" export fn jade_hash_sha3_224_amd64_ref1(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_224_ref1(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha3-256/amd64/ref1/hash.jazz b/src/crypto_hash/sha3-256/amd64/ref1/hash.jazz index e8a10bf8..0538261b 100644 --- a/src/crypto_hash/sha3-256/amd64/ref1/hash.jazz +++ b/src/crypto_hash/sha3-256/amd64/ref1/hash.jazz @@ -3,6 +3,9 @@ require "sha3-256.jinc" export fn jade_hash_sha3_256_amd64_ref1(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_256_ref1(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha3-384/amd64/ref1/hash.jazz b/src/crypto_hash/sha3-384/amd64/ref1/hash.jazz index 166e9a76..53a0e0cd 100644 --- a/src/crypto_hash/sha3-384/amd64/ref1/hash.jazz +++ b/src/crypto_hash/sha3-384/amd64/ref1/hash.jazz @@ -3,6 +3,9 @@ require "sha3-384.jinc" export fn jade_hash_sha3_384_amd64_ref1(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_384_ref1(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha3-512/amd64/ref1/hash.jazz b/src/crypto_hash/sha3-512/amd64/ref1/hash.jazz index 453a96a8..26126522 100644 --- a/src/crypto_hash/sha3-512/amd64/ref1/hash.jazz +++ b/src/crypto_hash/sha3-512/amd64/ref1/hash.jazz @@ -3,6 +3,9 @@ require "sha3-512.jinc" export fn jade_hash_sha3_512_amd64_ref1(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_512_ref1(hash, input, input_length); ?{}, r = #set0(); return r; From b639b734557358cc5e5e81388c9b608a6c7ad926 Mon Sep 17 00:00:00 2001 From: Tiago Oliveira Date: Wed, 26 Jun 2024 08:20:38 +0100 Subject: [PATCH 4/7] sct: crypto_hash sha3-* avx2 --- .../keccak1600/amd64/avx2/keccak1600.jinc | 119 +++++++++++++----- .../keccak1600/amd64/avx2/keccakf1600.jinc | 8 +- src/crypto_hash/sha3-224/amd64/avx2/hash.jazz | 7 +- .../sha3-224/amd64/avx2/sha3-224.jinc | 4 +- src/crypto_hash/sha3-256/amd64/avx2/hash.jazz | 7 +- .../sha3-256/amd64/avx2/sha3-256.jinc | 4 +- src/crypto_hash/sha3-384/amd64/avx2/hash.jazz | 7 +- .../sha3-384/amd64/avx2/sha3-384.jinc | 4 +- src/crypto_hash/sha3-512/amd64/avx2/hash.jazz | 7 +- .../sha3-512/amd64/avx2/sha3-512.jinc | 4 +- 10 files changed, 122 insertions(+), 49 deletions(-) diff --git a/src/common/keccak/keccak1600/amd64/avx2/keccak1600.jinc b/src/common/keccak/keccak1600/amd64/avx2/keccak1600.jinc index 4403e5bb..3cbacb8a 100644 --- a/src/common/keccak/keccak1600/amd64/avx2/keccak1600.jinc +++ b/src/common/keccak/keccak1600/amd64/avx2/keccak1600.jinc @@ -33,23 +33,32 @@ inline fn __add_full_block_avx2( stack u64[28] s_state, reg ptr u64[25] a_jagged_p, reg u64 in inlen, - reg u64 rate -) -> reg u256[7], stack u64[28], reg u64, reg u64 + reg u64 rate, + #msf reg u64 ms +) -> reg u256[7], stack u64[28], reg u64, reg u64, #msf reg u64 { inline int i; reg u64 j l t rate8; + reg bool loop_condition; rate8 = rate; rate8 >>= 3; j = 0; - while ( j < rate8 ) + while { loop_condition = ( j < rate8 ); } ( loop_condition ) { + ms = #update_msf(loop_condition, ms); + t = [in + 8*j]; + l = a_jagged_p[(int) j]; + l = #protect(l, ms); + s_state[(int) l] = t; j += 1; + } + ms = #update_msf(!loop_condition, ms); //TODO: check & change to #VPBROADCAST_4u64 t = s_state[0]; @@ -63,7 +72,7 @@ inline fn __add_full_block_avx2( in += rate; inlen -= rate; - return state, s_state, in, inlen; + return state, s_state, in, inlen, ms; } @@ -74,42 +83,56 @@ inline fn __add_final_block_avx2( reg ptr u64[25] a_jagged_p, reg u64 in inlen, reg u8 trail_byte, - reg u64 rate -) -> reg u256[7] + reg u64 rate, + #msf reg u64 ms +) -> reg u256[7], #msf reg u64 { inline int i; reg u64 j l t inlen8; reg u8 c; + reg bool loop_condition; s_state = __init_s_state_avx2(); inlen8 = inlen; inlen8 >>= 3; j = 0; - while ( j < inlen8 ) + + while { loop_condition = (j < inlen8); } ( loop_condition ) { + ms = #update_msf(loop_condition, ms); + t = [in + 8*j]; l = a_jagged_p[(int) j]; + l = #protect(l, ms); + s_state[(int) l] = t; j += 1; } + ms = #update_msf(!loop_condition, ms); + l = a_jagged_p[(int) j]; + l = #protect(l, ms); + l <<= 3; j <<= 3; - while ( j < inlen ) + while { loop_condition = ( j < inlen ); } ( loop_condition ) { + ms = #update_msf(loop_condition, ms); c = (u8)[in + j]; s_state[u8 (int) l] = c; j += 1; l += 1; } + ms = #update_msf(!loop_condition, ms); s_state[u8 (int) l] = trail_byte; // j = (rate-1) >> 3; j = rate; j -= 1; j >>= 3; l = a_jagged_p[(int) j]; + l = #protect(l, ms); l <<= 3; // l += ((rate-1) & 0x7) j = rate; j -= 1; j &= 0x7; @@ -125,7 +148,7 @@ inline fn __add_final_block_avx2( for i = 0 to 7 { state[i] ^= s_state[u256 i]; } - return state; + return state, ms; } @@ -134,12 +157,14 @@ inline fn __xtr_full_block_avx2( reg u256[7] state, reg ptr u64[25] a_jagged_p, reg u64 out, - reg u64 len -) -> reg u64 + reg u64 len, + #msf reg u64 ms +) -> reg u64, #msf reg u64 { inline int i; stack u64[28] s_state; reg u64 j l t len8; + reg bool loop_condition; for i = 0 to 7 { s_state[u256 i] = state[i]; } @@ -147,17 +172,22 @@ inline fn __xtr_full_block_avx2( len8 = len; len8 >>= 3; j = 0; - while ( j < len8 ) + while { loop_condition = ( j < len8 ); } ( loop_condition ) { + ms = #update_msf(loop_condition, ms); + l = a_jagged_p[(int) j]; + l = #protect(l, ms); + t = s_state[(int) l]; [out + 8*j] = t; j += 1; } + ms = #update_msf(!loop_condition, ms); out += len; - return out; + return out, ms; } @@ -166,13 +196,15 @@ inline fn __xtr_bytes_avx2( reg u256[7] state, reg ptr u64[25] a_jagged_p, reg u64 out, - reg u64 len + reg u64 len, + #msf reg u64 ms ) -> reg u64 { inline int i; stack u64[28] s_state; reg u64 j l t len8; reg u8 c; + reg bool loop_condition; for i = 0 to 7 { s_state[u256 i] = state[i]; } @@ -180,13 +212,22 @@ inline fn __xtr_bytes_avx2( len8 = len; len8 >>= 3; j = 0; - while ( j < len8 ) - { l = a_jagged_p[(int) j]; + while { loop_condition = ( j < len8 ); } ( loop_condition ) + { + ms = #update_msf(loop_condition, ms); + + l = a_jagged_p[(int) j]; + l = #protect(l, ms); + t = s_state[(int) l]; [out + 8*j] = t; j += 1; } + ms = #update_msf(!loop_condition, ms); + l = a_jagged_p[(int)j]; + l = #protect(l, ms); + j <<= 3; l <<= 3; @@ -208,65 +249,75 @@ inline fn __absorb_avx2( reg u256[7] state, reg u64 in inlen, reg u8 trail_byte, - reg u64 rate -) -> reg u256[7] + reg u64 rate, + #msf reg u64 ms +) -> reg u256[7], #msf reg u64 { stack u64[28] s_state; reg ptr u64[25] a_jagged_p; + reg bool loop_condition; a_jagged_p = KECCAK_A_JAGGED; s_state = __init_s_state_avx2(); // intermediate blocks - while ( inlen >= rate ) + while { loop_condition = (inlen >= rate); } (loop_condition) { - state, s_state, in, inlen = __add_full_block_avx2(state, s_state, a_jagged_p, in, inlen, rate); - state = __keccakf1600_avx2(state); + ms = #update_msf(loop_condition, ms); + + state, s_state, in, inlen, ms = __add_full_block_avx2(state, s_state, a_jagged_p, in, inlen, rate, ms); + + state, ms = __keccakf1600_avx2(state, ms); } + ms = #update_msf(!loop_condition, ms); // final block - state = __add_final_block_avx2(state, s_state, a_jagged_p, in, inlen, trail_byte, rate); + state, ms = __add_final_block_avx2(state, s_state, a_jagged_p, in, inlen, trail_byte, rate, ms); - return state; + return state, ms; } -inline fn __squeeze_avx2(reg u256[7] state, reg u64 out outlen rate) +inline fn __squeeze_avx2(reg u256[7] state, reg u64 out outlen rate, #msf reg u64 ms) { reg ptr u64[25] a_jagged_p; + reg bool loop_condition; a_jagged_p = KECCAK_A_JAGGED; // intermediate blocks - while ( outlen > rate ) + while { loop_condition = (outlen > rate); } ( loop_condition ) { - state = __keccakf1600_avx2(state); - out = __xtr_full_block_avx2(state, a_jagged_p, out, rate); + ms = #update_msf(loop_condition, ms); + + state, ms = __keccakf1600_avx2(state, ms); + out, ms = __xtr_full_block_avx2(state, a_jagged_p, out, rate, ms); outlen -= rate; } + ms = #update_msf(!loop_condition, ms); - state = __keccakf1600_avx2(state); - out = __xtr_bytes_avx2(state, a_jagged_p, out, outlen); + state, ms = __keccakf1600_avx2(state, ms); + out = __xtr_bytes_avx2(state, a_jagged_p, out, outlen, ms); } -inline fn __keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate) +inline fn __keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate, #msf reg u64 ms) { reg u256[7] state; state = __keccak_init_avx2(); // absorb - state = __absorb_avx2(state, in, inlen, trail_byte, rate); + state, ms = __absorb_avx2(state, in, inlen, trail_byte, rate, ms); // squeeze - __squeeze_avx2(state, out, outlen, rate); + __squeeze_avx2(state, out, outlen, rate, ms); } -fn _keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate) +fn _keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate, #msf reg u64 ms) { - __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate); + __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms); } diff --git a/src/common/keccak/keccak1600/amd64/avx2/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/avx2/keccakf1600.jinc index 6ca9dda6..907981ee 100644 --- a/src/common/keccak/keccak1600/amd64/avx2/keccakf1600.jinc +++ b/src/common/keccak/keccak1600/amd64/avx2/keccakf1600.jinc @@ -59,7 +59,7 @@ u64[25] KECCAK_A_JAGGED = }; -inline fn __keccakf1600_avx2(reg u256[7] state) -> reg u256[7] +inline fn __keccakf1600_avx2(reg u256[7] state, #msf reg u64 ms) -> reg u256[7], #msf reg u64 { reg u256[9] t; reg u256 c00 c14 d00 d14; @@ -194,9 +194,11 @@ inline fn __keccakf1600_avx2(reg u256[7] state) -> reg u256[7] iotas_o += 32; _,_,_,zf,r = #DEC_64(r); - }(!zf) + }(!zf) { ms = #update_msf(!zf, ms); } - return state; + ms = #update_msf(zf, ms); + + return state, ms; } diff --git a/src/crypto_hash/sha3-224/amd64/avx2/hash.jazz b/src/crypto_hash/sha3-224/amd64/avx2/hash.jazz index 77ae780a..97d4822a 100644 --- a/src/crypto_hash/sha3-224/amd64/avx2/hash.jazz +++ b/src/crypto_hash/sha3-224/amd64/avx2/hash.jazz @@ -3,7 +3,12 @@ require "sha3-224.jinc" export fn jade_hash_sha3_224_amd64_avx2(reg u64 hash input input_length) -> reg u64 { reg u64 r; - __sha3_224_avx2(hash, input, input_length); + #msf reg u64 ms; + + ms = #init_msf(); + + __sha3_224_avx2(hash, input, input_length, ms); + ?{}, r = #set0(); return r; } diff --git a/src/crypto_hash/sha3-224/amd64/avx2/sha3-224.jinc b/src/crypto_hash/sha3-224/amd64/avx2/sha3-224.jinc index 10f0d31b..42e20d46 100644 --- a/src/crypto_hash/sha3-224/amd64/avx2/sha3-224.jinc +++ b/src/crypto_hash/sha3-224/amd64/avx2/sha3-224.jinc @@ -1,6 +1,6 @@ from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc" -inline fn __sha3_224_avx2(reg u64 out in inlen) +inline fn __sha3_224_avx2(reg u64 out in inlen, #msf reg u64 ms) { reg u64 outlen rate; reg u8 trail_byte; @@ -9,7 +9,7 @@ inline fn __sha3_224_avx2(reg u64 out in inlen) trail_byte = 0x6; rate = (1152/8); - _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate); + _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms); } diff --git a/src/crypto_hash/sha3-256/amd64/avx2/hash.jazz b/src/crypto_hash/sha3-256/amd64/avx2/hash.jazz index 462c1c0b..88f6b8ff 100644 --- a/src/crypto_hash/sha3-256/amd64/avx2/hash.jazz +++ b/src/crypto_hash/sha3-256/amd64/avx2/hash.jazz @@ -3,7 +3,12 @@ require "sha3-256.jinc" export fn jade_hash_sha3_256_amd64_avx2(reg u64 hash input input_length) -> reg u64 { reg u64 r; - __sha3_256_avx2(hash, input, input_length); + #msf reg u64 ms; + + ms = #init_msf(); + + __sha3_256_avx2(hash, input, input_length, ms); + ?{}, r = #set0(); return r; } diff --git a/src/crypto_hash/sha3-256/amd64/avx2/sha3-256.jinc b/src/crypto_hash/sha3-256/amd64/avx2/sha3-256.jinc index ee575bb5..6a808935 100644 --- a/src/crypto_hash/sha3-256/amd64/avx2/sha3-256.jinc +++ b/src/crypto_hash/sha3-256/amd64/avx2/sha3-256.jinc @@ -1,6 +1,6 @@ from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc" -inline fn __sha3_256_avx2(reg u64 out in inlen) +inline fn __sha3_256_avx2(reg u64 out in inlen, #msf reg u64 ms) { reg u64 outlen rate; reg u8 trail_byte; @@ -9,7 +9,7 @@ inline fn __sha3_256_avx2(reg u64 out in inlen) trail_byte = 0x6; rate = (1088/8); - _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate); + _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms); } diff --git a/src/crypto_hash/sha3-384/amd64/avx2/hash.jazz b/src/crypto_hash/sha3-384/amd64/avx2/hash.jazz index 0be82db3..75e61f6c 100644 --- a/src/crypto_hash/sha3-384/amd64/avx2/hash.jazz +++ b/src/crypto_hash/sha3-384/amd64/avx2/hash.jazz @@ -3,7 +3,12 @@ require "sha3-384.jinc" export fn jade_hash_sha3_384_amd64_avx2(reg u64 hash input input_length) -> reg u64 { reg u64 r; - __sha3_384_avx2(hash, input, input_length); + #msf reg u64 ms; + + ms = #init_msf(); + + __sha3_384_avx2(hash, input, input_length, ms); + ?{}, r = #set0(); return r; } diff --git a/src/crypto_hash/sha3-384/amd64/avx2/sha3-384.jinc b/src/crypto_hash/sha3-384/amd64/avx2/sha3-384.jinc index db29845f..4737c251 100644 --- a/src/crypto_hash/sha3-384/amd64/avx2/sha3-384.jinc +++ b/src/crypto_hash/sha3-384/amd64/avx2/sha3-384.jinc @@ -1,6 +1,6 @@ from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc" -inline fn __sha3_384_avx2(reg u64 out in inlen) +inline fn __sha3_384_avx2(reg u64 out in inlen, #msf reg u64 ms) { reg u64 outlen rate; reg u8 trail_byte; @@ -9,7 +9,7 @@ inline fn __sha3_384_avx2(reg u64 out in inlen) trail_byte = 0x6; rate = (832/8); - _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate); + _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms); } diff --git a/src/crypto_hash/sha3-512/amd64/avx2/hash.jazz b/src/crypto_hash/sha3-512/amd64/avx2/hash.jazz index 49335d0d..50070315 100644 --- a/src/crypto_hash/sha3-512/amd64/avx2/hash.jazz +++ b/src/crypto_hash/sha3-512/amd64/avx2/hash.jazz @@ -3,7 +3,12 @@ require "sha3-512.jinc" export fn jade_hash_sha3_512_amd64_avx2(reg u64 hash input input_length) -> reg u64 { reg u64 r; - __sha3_512_avx2(hash, input, input_length); + #msf reg u64 ms; + + ms = #init_msf(); + + __sha3_512_avx2(hash, input, input_length, ms); + ?{}, r = #set0(); return r; } diff --git a/src/crypto_hash/sha3-512/amd64/avx2/sha3-512.jinc b/src/crypto_hash/sha3-512/amd64/avx2/sha3-512.jinc index 17ce4c24..0a9da967 100644 --- a/src/crypto_hash/sha3-512/amd64/avx2/sha3-512.jinc +++ b/src/crypto_hash/sha3-512/amd64/avx2/sha3-512.jinc @@ -1,6 +1,6 @@ from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc" -inline fn __sha3_512_avx2(reg u64 out in inlen) +inline fn __sha3_512_avx2(reg u64 out in inlen, #msf reg u64 ms) { reg u64 outlen rate; reg u8 trail_byte; @@ -9,7 +9,7 @@ inline fn __sha3_512_avx2(reg u64 out in inlen) trail_byte = 0x6; rate = (576/8); - _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate); + _keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms); } From 43fd681a17c3bd28c40f5a24bdd78a47d025e241 Mon Sep 17 00:00:00 2001 From: Tiago Oliveira Date: Wed, 26 Jun 2024 08:34:56 +0100 Subject: [PATCH 5/7] sct: crypto_hash sha3-* ref and bmi1 --- .../keccak/keccak1600/amd64/bmi1/keccak1600.jinc | 12 ++++++------ .../keccak/keccak1600/amd64/bmi1/keccakf1600.jinc | 2 +- .../keccak/keccak1600/amd64/ref/keccak1600.jinc | 8 ++++---- src/crypto_hash/sha3-224/amd64/bmi1/hash.jazz | 3 +++ src/crypto_hash/sha3-224/amd64/ref/hash.jazz | 3 +++ src/crypto_hash/sha3-256/amd64/bmi1/hash.jazz | 3 +++ src/crypto_hash/sha3-256/amd64/ref/hash.jazz | 3 +++ src/crypto_hash/sha3-384/amd64/bmi1/hash.jazz | 3 +++ src/crypto_hash/sha3-384/amd64/ref/hash.jazz | 3 +++ src/crypto_hash/sha3-512/amd64/bmi1/hash.jazz | 3 +++ src/crypto_hash/sha3-512/amd64/ref/hash.jazz | 3 +++ 11 files changed, 35 insertions(+), 11 deletions(-) diff --git a/src/common/keccak/keccak1600/amd64/bmi1/keccak1600.jinc b/src/common/keccak/keccak1600/amd64/bmi1/keccak1600.jinc index fa81ca75..9e427d46 100644 --- a/src/common/keccak/keccak1600/amd64/bmi1/keccak1600.jinc +++ b/src/common/keccak/keccak1600/amd64/bmi1/keccak1600.jinc @@ -88,7 +88,7 @@ inline fn __absorb_bmi1( reg u64 rate // rate already in bytes -- it is returned bc of spills ) -> reg ptr u64[25], reg u64 { - stack u64 s_in s_inlen s_rate; + #mmx reg u64 s_in s_inlen s_rate; reg u8 trail_byte; // intermediate blocks @@ -174,13 +174,13 @@ inline fn __xtr_bytes_bmi1( inline fn __squeeze_bmi1( reg mut ptr u64[25] state, - stack u64 s_out, - reg u64 outlen, - reg u64 rate + #mmx reg u64 s_out, + reg u64 outlen, + reg u64 rate ) { reg u64 out; - stack u64 s_outlen s_rate; + #mmx reg u64 s_outlen s_rate; // intermediate blocks while ( outlen > rate ) @@ -213,7 +213,7 @@ inline fn __keccak1600_bmi1(reg u64 out outlen in inlen, reg u8 trail_byte, reg { stack u64[25] _state; reg ptr u64[25] state; - stack u64 s_out s_outlen; + #mmx reg u64 s_out s_outlen; stack u8 s_trail_byte; s_out = out; diff --git a/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc b/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc index 565c69ae..40754c55 100644 --- a/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc +++ b/src/common/keccak/keccak1600/amd64/bmi1/keccakf1600.jinc @@ -129,7 +129,7 @@ inline fn __round_bmi1(reg ptr u64[25] e a, reg u64 rc) -> reg ptr u64[25] inline fn __keccakf1600_bmi1(reg ptr u64[25] a) -> reg ptr u64[25] { reg ptr u64[24] RC; - stack ptr u64[24] s_RC; + #mmx reg ptr u64[24] s_RC; stack u64[25] s_e; reg ptr u64[25] e; reg u64 c rc; diff --git a/src/common/keccak/keccak1600/amd64/ref/keccak1600.jinc b/src/common/keccak/keccak1600/amd64/ref/keccak1600.jinc index cd718735..f903e7cb 100644 --- a/src/common/keccak/keccak1600/amd64/ref/keccak1600.jinc +++ b/src/common/keccak/keccak1600/amd64/ref/keccak1600.jinc @@ -87,7 +87,7 @@ inline fn __absorb_ref( reg u64 rate // rate already in bytes -- it is returned bc of spills ) -> stack u64[25], reg u64 { - stack u64 s_in s_inlen s_rate; + #mmx reg u64 s_in s_inlen s_rate; reg u8 trail_byte; // intermediate blocks @@ -171,13 +171,13 @@ inline fn __xtr_bytes_ref( inline fn __squeeze_ref( stack u64[25] state, - stack u64 s_out, + #mmx reg u64 s_out, reg u64 outlen, reg u64 rate ) { reg u64 out; - stack u64 s_outlen s_rate; + #mmx reg u64 s_outlen s_rate; // intermediate blocks while ( outlen > rate ) @@ -205,7 +205,7 @@ inline fn __squeeze_ref( inline fn __keccak1600_ref(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate) { stack u64[25] state; - stack u64 s_out s_outlen; + #mmx reg u64 s_out s_outlen; stack u8 s_trail_byte; s_out = out; diff --git a/src/crypto_hash/sha3-224/amd64/bmi1/hash.jazz b/src/crypto_hash/sha3-224/amd64/bmi1/hash.jazz index 9703da0d..df52afb9 100644 --- a/src/crypto_hash/sha3-224/amd64/bmi1/hash.jazz +++ b/src/crypto_hash/sha3-224/amd64/bmi1/hash.jazz @@ -3,6 +3,9 @@ require "sha3-224.jinc" export fn jade_hash_sha3_224_amd64_bmi1(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_224_bmi1(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha3-224/amd64/ref/hash.jazz b/src/crypto_hash/sha3-224/amd64/ref/hash.jazz index 0bda7d05..9739444c 100644 --- a/src/crypto_hash/sha3-224/amd64/ref/hash.jazz +++ b/src/crypto_hash/sha3-224/amd64/ref/hash.jazz @@ -3,6 +3,9 @@ require "sha3-224.jinc" export fn jade_hash_sha3_224_amd64_ref(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_224_ref(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha3-256/amd64/bmi1/hash.jazz b/src/crypto_hash/sha3-256/amd64/bmi1/hash.jazz index f2e646cb..bba2e585 100644 --- a/src/crypto_hash/sha3-256/amd64/bmi1/hash.jazz +++ b/src/crypto_hash/sha3-256/amd64/bmi1/hash.jazz @@ -3,6 +3,9 @@ require "sha3-256.jinc" export fn jade_hash_sha3_256_amd64_bmi1(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_256_bmi1(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha3-256/amd64/ref/hash.jazz b/src/crypto_hash/sha3-256/amd64/ref/hash.jazz index 6c381cce..bfa36c72 100644 --- a/src/crypto_hash/sha3-256/amd64/ref/hash.jazz +++ b/src/crypto_hash/sha3-256/amd64/ref/hash.jazz @@ -3,6 +3,9 @@ require "sha3-256.jinc" export fn jade_hash_sha3_256_amd64_ref(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_256_ref(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha3-384/amd64/bmi1/hash.jazz b/src/crypto_hash/sha3-384/amd64/bmi1/hash.jazz index 6090b84e..48b124fb 100644 --- a/src/crypto_hash/sha3-384/amd64/bmi1/hash.jazz +++ b/src/crypto_hash/sha3-384/amd64/bmi1/hash.jazz @@ -3,6 +3,9 @@ require "sha3-384.jinc" export fn jade_hash_sha3_384_amd64_bmi1(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_384_bmi1(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha3-384/amd64/ref/hash.jazz b/src/crypto_hash/sha3-384/amd64/ref/hash.jazz index fb952862..65518d29 100644 --- a/src/crypto_hash/sha3-384/amd64/ref/hash.jazz +++ b/src/crypto_hash/sha3-384/amd64/ref/hash.jazz @@ -3,6 +3,9 @@ require "sha3-384.jinc" export fn jade_hash_sha3_384_amd64_ref(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_384_ref(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha3-512/amd64/bmi1/hash.jazz b/src/crypto_hash/sha3-512/amd64/bmi1/hash.jazz index 79a4f3ce..4ff72114 100644 --- a/src/crypto_hash/sha3-512/amd64/bmi1/hash.jazz +++ b/src/crypto_hash/sha3-512/amd64/bmi1/hash.jazz @@ -3,6 +3,9 @@ require "sha3-512.jinc" export fn jade_hash_sha3_512_amd64_bmi1(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_512_bmi1(hash, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_hash/sha3-512/amd64/ref/hash.jazz b/src/crypto_hash/sha3-512/amd64/ref/hash.jazz index aa265621..c127947c 100644 --- a/src/crypto_hash/sha3-512/amd64/ref/hash.jazz +++ b/src/crypto_hash/sha3-512/amd64/ref/hash.jazz @@ -3,6 +3,9 @@ require "sha3-512.jinc" export fn jade_hash_sha3_512_amd64_ref(reg u64 hash input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __sha3_512_ref(hash, input, input_length); ?{}, r = #set0(); return r; From a283669451be00372fffe73eccdb60e24b138216 Mon Sep 17 00:00:00 2001 From: Tiago Oliveira Date: Wed, 26 Jun 2024 08:50:25 +0100 Subject: [PATCH 6/7] sct: crypto_xof --- src/common/keccak/keccak1600/amd64/spec/keccak1600.jinc | 2 +- src/crypto_xof/shake128/amd64/avx2/shake128.jinc | 4 ++-- src/crypto_xof/shake128/amd64/avx2/xof.jazz | 6 +++++- src/crypto_xof/shake128/amd64/bmi1/xof.jazz | 3 +++ src/crypto_xof/shake128/amd64/ref/xof.jazz | 3 +++ src/crypto_xof/shake128/amd64/ref1/xof.jazz | 3 +++ src/crypto_xof/shake256/amd64/avx2/shake256.jinc | 4 ++-- src/crypto_xof/shake256/amd64/avx2/xof.jazz | 7 ++++++- src/crypto_xof/shake256/amd64/bmi1/xof.jazz | 3 +++ src/crypto_xof/shake256/amd64/ref/xof.jazz | 3 +++ src/crypto_xof/shake256/amd64/ref1/xof.jazz | 3 +++ src/crypto_xof/shake256/amd64/spec/xof.jazz | 3 +++ 12 files changed, 37 insertions(+), 7 deletions(-) diff --git a/src/common/keccak/keccak1600/amd64/spec/keccak1600.jinc b/src/common/keccak/keccak1600/amd64/spec/keccak1600.jinc index 7dd3b9e3..783813b2 100644 --- a/src/common/keccak/keccak1600/amd64/spec/keccak1600.jinc +++ b/src/common/keccak/keccak1600/amd64/spec/keccak1600.jinc @@ -136,7 +136,7 @@ inline fn __xtr_bytes_spec( inline fn __keccak1600_spec(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate) { stack u64[25] state; - stack u64 s_out s_outlen s_in s_inlen s_rate; + #mmx reg u64 s_out s_outlen s_in s_inlen s_rate; stack u8 s_trail_byte; s_out = out; diff --git a/src/crypto_xof/shake128/amd64/avx2/shake128.jinc b/src/crypto_xof/shake128/amd64/avx2/shake128.jinc index 187aac91..70875d0c 100644 --- a/src/crypto_xof/shake128/amd64/avx2/shake128.jinc +++ b/src/crypto_xof/shake128/amd64/avx2/shake128.jinc @@ -1,6 +1,6 @@ from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc" -inline fn __shake128_avx2(reg u64 out outlen in inlen) +inline fn __shake128_avx2(reg u64 out outlen in inlen, #msf reg u64 ms) { reg u64 rate; reg u8 trail_byte; @@ -8,7 +8,7 @@ inline fn __shake128_avx2(reg u64 out outlen in inlen) trail_byte = 0x1F; rate = (1344/8); - __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate); + __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms); } diff --git a/src/crypto_xof/shake128/amd64/avx2/xof.jazz b/src/crypto_xof/shake128/amd64/avx2/xof.jazz index 23dd3b45..6b41b262 100644 --- a/src/crypto_xof/shake128/amd64/avx2/xof.jazz +++ b/src/crypto_xof/shake128/amd64/avx2/xof.jazz @@ -3,7 +3,11 @@ require "shake128.jinc" export fn jade_xof_shake128_amd64_avx2(reg u64 output output_length input input_length) -> reg u64 { reg u64 r; - __shake128_avx2(output, output_length, input, input_length); + #msf reg u64 ms; + + ms = #init_msf(); + + __shake128_avx2(output, output_length, input, input_length, ms); ?{}, r = #set0(); return r; } diff --git a/src/crypto_xof/shake128/amd64/bmi1/xof.jazz b/src/crypto_xof/shake128/amd64/bmi1/xof.jazz index 19921991..4c32db8d 100644 --- a/src/crypto_xof/shake128/amd64/bmi1/xof.jazz +++ b/src/crypto_xof/shake128/amd64/bmi1/xof.jazz @@ -3,6 +3,9 @@ require "shake128.jinc" export fn jade_xof_shake128_amd64_bmi1(reg u64 output output_length input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __shake128_bmi1(output, output_length, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_xof/shake128/amd64/ref/xof.jazz b/src/crypto_xof/shake128/amd64/ref/xof.jazz index ad386786..3cb07b30 100644 --- a/src/crypto_xof/shake128/amd64/ref/xof.jazz +++ b/src/crypto_xof/shake128/amd64/ref/xof.jazz @@ -3,6 +3,9 @@ require "shake128.jinc" export fn jade_xof_shake128_amd64_ref(reg u64 output output_length input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __shake128_ref(output, output_length, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_xof/shake128/amd64/ref1/xof.jazz b/src/crypto_xof/shake128/amd64/ref1/xof.jazz index 28e571ea..063f9637 100644 --- a/src/crypto_xof/shake128/amd64/ref1/xof.jazz +++ b/src/crypto_xof/shake128/amd64/ref1/xof.jazz @@ -3,6 +3,9 @@ require "shake128.jinc" export fn jade_xof_shake128_amd64_ref1(reg u64 output output_length input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __shake128_ref1(output, output_length, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_xof/shake256/amd64/avx2/shake256.jinc b/src/crypto_xof/shake256/amd64/avx2/shake256.jinc index 37c02fef..17a24c27 100644 --- a/src/crypto_xof/shake256/amd64/avx2/shake256.jinc +++ b/src/crypto_xof/shake256/amd64/avx2/shake256.jinc @@ -1,6 +1,6 @@ from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc" -inline fn __shake256_avx2(reg u64 out outlen in inlen) +inline fn __shake256_avx2(reg u64 out outlen in inlen, #msf reg u64 ms) { reg u64 rate; reg u8 trail_byte; @@ -8,7 +8,7 @@ inline fn __shake256_avx2(reg u64 out outlen in inlen) trail_byte = 0x1F; rate = (1088/8); - __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate); + __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate, ms); } diff --git a/src/crypto_xof/shake256/amd64/avx2/xof.jazz b/src/crypto_xof/shake256/amd64/avx2/xof.jazz index 169f7701..80d3ae69 100644 --- a/src/crypto_xof/shake256/amd64/avx2/xof.jazz +++ b/src/crypto_xof/shake256/amd64/avx2/xof.jazz @@ -3,7 +3,12 @@ require "shake256.jinc" export fn jade_xof_shake256_amd64_avx2(reg u64 output output_length input input_length) -> reg u64 { reg u64 r; - __shake256_avx2(output, output_length, input, input_length); + #msf reg u64 ms; + + ms = #init_msf(); + + __shake256_avx2(output, output_length, input, input_length, ms); + ?{}, r = #set0(); return r; } diff --git a/src/crypto_xof/shake256/amd64/bmi1/xof.jazz b/src/crypto_xof/shake256/amd64/bmi1/xof.jazz index f0988dd9..0a01874f 100644 --- a/src/crypto_xof/shake256/amd64/bmi1/xof.jazz +++ b/src/crypto_xof/shake256/amd64/bmi1/xof.jazz @@ -3,6 +3,9 @@ require "shake256.jinc" export fn jade_xof_shake256_amd64_bmi1(reg u64 output output_length input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __shake256_bmi1(output, output_length, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_xof/shake256/amd64/ref/xof.jazz b/src/crypto_xof/shake256/amd64/ref/xof.jazz index 8eb4e643..c876881e 100644 --- a/src/crypto_xof/shake256/amd64/ref/xof.jazz +++ b/src/crypto_xof/shake256/amd64/ref/xof.jazz @@ -3,6 +3,9 @@ require "shake256.jinc" export fn jade_xof_shake256_amd64_ref(reg u64 output output_length input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __shake256_ref(output, output_length, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_xof/shake256/amd64/ref1/xof.jazz b/src/crypto_xof/shake256/amd64/ref1/xof.jazz index 2051d26f..23d811bb 100644 --- a/src/crypto_xof/shake256/amd64/ref1/xof.jazz +++ b/src/crypto_xof/shake256/amd64/ref1/xof.jazz @@ -3,6 +3,9 @@ require "shake256.jinc" export fn jade_xof_shake256_amd64_ref1(reg u64 output output_length input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __shake256_ref1(output, output_length, input, input_length); ?{}, r = #set0(); return r; diff --git a/src/crypto_xof/shake256/amd64/spec/xof.jazz b/src/crypto_xof/shake256/amd64/spec/xof.jazz index f7045070..04b30887 100644 --- a/src/crypto_xof/shake256/amd64/spec/xof.jazz +++ b/src/crypto_xof/shake256/amd64/spec/xof.jazz @@ -3,6 +3,9 @@ require "shake256.jinc" export fn jade_xof_shake256_amd64_spec(reg u64 output output_length input input_length) -> reg u64 { reg u64 r; + + _ = #init_msf(); + __shake256_spec(output, output_length, input, input_length); ?{}, r = #set0(); return r; From 11385db258b2b42d259babc3eb49897b6ca49d73 Mon Sep 17 00:00:00 2001 From: Tiago Oliveira Date: Thu, 27 Jun 2024 12:42:32 +0100 Subject: [PATCH 7/7] sct: fix compilation of kyber* (no sct yet); there will be a separate PR; --- src/common/keccak/common/fips202_DIRTY.jinc | 6 +- .../amd64/avx2/keccak1600_nomsf.jinc | 272 ++++++++++++++++++ .../amd64/avx2/keccakf1600_nomsf.jinc | 202 +++++++++++++ 3 files changed, 476 insertions(+), 4 deletions(-) create mode 100644 src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc create mode 100644 src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc diff --git a/src/common/keccak/common/fips202_DIRTY.jinc b/src/common/keccak/common/fips202_DIRTY.jinc index 92698c60..82f6c335 100644 --- a/src/common/keccak/common/fips202_DIRTY.jinc +++ b/src/common/keccak/common/fips202_DIRTY.jinc @@ -1,7 +1,5 @@ -param int KECCAK_ROUNDS=24; - -from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600.jinc" -from Jade require "common/keccak/keccak1600/amd64/avx2/keccakf1600.jinc" +from Jade require "common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc" +from Jade require "common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc" require "fips202_params.jinc" #[returnaddress="stack"] diff --git a/src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc b/src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc new file mode 100644 index 00000000..0f6ace84 --- /dev/null +++ b/src/common/keccak/keccak1600/amd64/avx2/keccak1600_nomsf.jinc @@ -0,0 +1,272 @@ +param int KECCAK_ROUNDS=24; + +require "keccakf1600_nomsf.jinc" + +inline fn __keccak_init_avx2() -> reg u256[7] +{ + inline int i; + reg u256[7] state; + + for i=0 to 7 + { state[i] = #set0_256(); } + + return state; +} + + +inline fn __init_s_state_avx2() -> stack u64[28] +{ + inline int i; + stack u64[28] s_state; + reg u256 zero; + + zero = #set0_256(); + for i=0 to 7 + { s_state[u256 i] = zero; } + + return s_state; +} + + +inline fn __add_full_block_avx2( + reg u256[7] state, + stack u64[28] s_state, + reg ptr u64[25] a_jagged_p, + reg u64 in inlen, + reg u64 rate +) -> reg u256[7], stack u64[28], reg u64, reg u64 +{ + + inline int i; + reg u64 j l t rate8; + + rate8 = rate; + rate8 >>= 3; + j = 0; + while ( j < rate8 ) + { + t = [in + 8*j]; + l = a_jagged_p[(int) j]; + s_state[(int) l] = t; + j += 1; + } + + //TODO: check & change to #VPBROADCAST_4u64 + t = s_state[0]; + s_state[1] = t; + s_state[2] = t; + s_state[3] = t; + + for i = 0 to 7 + { state[i] ^= s_state[u256 i]; } + + in += rate; + inlen -= rate; + + return state, s_state, in, inlen; +} + + +// TODO: refactor when this feature is available: https://github.com/haslab/libjbn/wiki/Feature-request-%231#procedural-parameters +inline fn __add_final_block_avx2( + reg u256[7] state, + stack u64[28] s_state, + reg ptr u64[25] a_jagged_p, + reg u64 in inlen, + reg u8 trail_byte, + reg u64 rate +) -> reg u256[7] +{ + inline int i; + reg u64 j l t inlen8; + reg u8 c; + + s_state = __init_s_state_avx2(); + + inlen8 = inlen; + inlen8 >>= 3; + j = 0; + while ( j < inlen8 ) + { + t = [in + 8*j]; + l = a_jagged_p[(int) j]; + s_state[(int) l] = t; + j += 1; + } + l = a_jagged_p[(int) j]; + l <<= 3; + j <<= 3; + + while ( j < inlen ) + { + c = (u8)[in + j]; + s_state[u8 (int) l] = c; + j += 1; + l += 1; + } + + s_state[u8 (int) l] = trail_byte; + + // j = (rate-1) >> 3; + j = rate; j -= 1; j >>= 3; + l = a_jagged_p[(int) j]; + l <<= 3; + // l += ((rate-1) & 0x7) + j = rate; j -= 1; j &= 0x7; + l += j; + + s_state[u8 (int) l] ^= 0x80; + + t = s_state[0]; + s_state[1] = t; + s_state[2] = t; + s_state[3] = t; + + for i = 0 to 7 + { state[i] ^= s_state[u256 i]; } + + return state; +} + + +// obs: @pre: len <= rate_in_bytes +inline fn __xtr_full_block_avx2( + reg u256[7] state, + reg ptr u64[25] a_jagged_p, + reg u64 out, + reg u64 len +) -> reg u64 +{ + inline int i; + stack u64[28] s_state; + reg u64 j l t len8; + + for i = 0 to 7 + { s_state[u256 i] = state[i]; } + + len8 = len; + len8 >>= 3; + j = 0; + while ( j < len8 ) + { + l = a_jagged_p[(int) j]; + t = s_state[(int) l]; + [out + 8*j] = t; + j += 1; + } + + out += len; + + return out; +} + + +// obs: @pre: len <= rate_in_bytes +inline fn __xtr_bytes_avx2( + reg u256[7] state, + reg ptr u64[25] a_jagged_p, + reg u64 out, + reg u64 len +) -> reg u64 +{ + inline int i; + stack u64[28] s_state; + reg u64 j l t len8; + reg u8 c; + + for i = 0 to 7 + { s_state[u256 i] = state[i]; } + + len8 = len; + len8 >>= 3; + j = 0; + while ( j < len8 ) + { l = a_jagged_p[(int) j]; + t = s_state[(int) l]; + [out + 8*j] = t; + j += 1; + } + l = a_jagged_p[(int)j]; + j <<= 3; + l <<= 3; + + while ( j < len ) + { + c = s_state[u8 (int) l]; + (u8)[out + j] = c; + j += 1; + l += 1; + } + + out += len; + + return out; +} + + +inline fn __absorb_avx2( + reg u256[7] state, + reg u64 in inlen, + reg u8 trail_byte, + reg u64 rate +) -> reg u256[7] +{ + stack u64[28] s_state; + reg ptr u64[25] a_jagged_p; + + a_jagged_p = KECCAK_A_JAGGED; + s_state = __init_s_state_avx2(); + + // intermediate blocks + while ( inlen >= rate ) + { + state, s_state, in, inlen = __add_full_block_avx2(state, s_state, a_jagged_p, in, inlen, rate); + state = __keccakf1600_avx2(state); + } + + // final block + state = __add_final_block_avx2(state, s_state, a_jagged_p, in, inlen, trail_byte, rate); + + return state; +} + + +inline fn __squeeze_avx2(reg u256[7] state, reg u64 out outlen rate) +{ + reg ptr u64[25] a_jagged_p; + + a_jagged_p = KECCAK_A_JAGGED; + + // intermediate blocks + while ( outlen > rate ) + { + state = __keccakf1600_avx2(state); + out = __xtr_full_block_avx2(state, a_jagged_p, out, rate); + outlen -= rate; + } + + state = __keccakf1600_avx2(state); + out = __xtr_bytes_avx2(state, a_jagged_p, out, outlen); +} + + +inline fn __keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate) +{ + reg u256[7] state; + + state = __keccak_init_avx2(); + + // absorb + state = __absorb_avx2(state, in, inlen, trail_byte, rate); + + // squeeze + __squeeze_avx2(state, out, outlen, rate); +} + + +fn _keccak1600_avx2(reg u64 out outlen in inlen, reg u8 trail_byte, reg u64 rate) +{ + __keccak1600_avx2(out, outlen, in, inlen, trail_byte, rate); +} + + diff --git a/src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc b/src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc new file mode 100644 index 00000000..6ca9dda6 --- /dev/null +++ b/src/common/keccak/keccak1600/amd64/avx2/keccakf1600_nomsf.jinc @@ -0,0 +1,202 @@ + +u256[24] KECCAK_IOTAS = +{ (4u64)[0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001] + ,(4u64)[0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082] + ,(4u64)[0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a] + ,(4u64)[0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000] + ,(4u64)[0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b] + ,(4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001] + ,(4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081] + ,(4u64)[0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009] + ,(4u64)[0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a] + ,(4u64)[0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088] + ,(4u64)[0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009] + ,(4u64)[0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a] + ,(4u64)[0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b] + ,(4u64)[0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b] + ,(4u64)[0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089] + ,(4u64)[0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003] + ,(4u64)[0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002] + ,(4u64)[0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080] + ,(4u64)[0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a] + ,(4u64)[0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a] + ,(4u64)[0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081] + ,(4u64)[0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080] + ,(4u64)[0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001] + ,(4u64)[0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008] +}; + + +u256[6] KECCAK_RHOTATES_LEFT = +{ + (4u64)[41, 36, 18, 3], + (4u64)[27, 28, 62, 1], + (4u64)[39, 56, 6, 45], + (4u64)[ 8, 55, 61, 10], + (4u64)[20, 25, 15, 2], + (4u64)[14, 21, 43, 44] +}; + + +u256[6] KECCAK_RHOTATES_RIGHT = +{ + (4u64)[64-41, 64-36, 64-18, 64- 3], + (4u64)[64-27, 64-28, 64-62, 64- 1], + (4u64)[64-39, 64-56, 64- 6, 64-45], + (4u64)[64- 8, 64-55, 64-61, 64-10], + (4u64)[64-20, 64-25, 64-15, 64- 2], + (4u64)[64-14, 64-21, 64-43, 64-44] +}; + + +u64[25] KECCAK_A_JAGGED = +{ + 0, 4, 5, 6, 7, + 10, 24, 13, 18, 23, + 8, 16, 25, 22, 15, + 11, 12, 21, 26, 19, + 9, 20, 17, 14, 27 +}; + + +inline fn __keccakf1600_avx2(reg u256[7] state) -> reg u256[7] +{ + reg u256[9] t; + reg u256 c00 c14 d00 d14; + + reg bool zf; + reg u64 r iotas_o; + + reg ptr u256[24] iotas_p; + reg ptr u256[6] rhotates_left_p; + reg ptr u256[6] rhotates_right_p; + + iotas_p = KECCAK_IOTAS; + iotas_o = 0; + rhotates_left_p = KECCAK_RHOTATES_LEFT; + rhotates_right_p = KECCAK_RHOTATES_RIGHT; + + r = KECCAK_ROUNDS; + while + { + //######################################## Theta + c00 = #VPSHUFD_256(state[2], (4u2)[1,0,3,2]); + c14 = state[5] ^ state[3]; + t[2] = state[4] ^ state[6]; + c14 = c14 ^ state[1]; + c14 = c14 ^ t[2]; + t[4] = #VPERMQ(c14, (4u2)[2,1,0,3]); + c00 = c00 ^ state[2]; + t[0] = #VPERMQ(c00, (4u2)[1,0,3,2]); + t[1] = c14 >>4u64 63; + t[2] = c14 +4u64 c14; + t[1] = t[1] | t[2]; + d14 = #VPERMQ(t[1], (4u2)[0,3,2,1]); + d00 = t[1] ^ t[4]; + d00 = #VPERMQ(d00, (4u2)[0,0,0,0]); + c00 = c00 ^ state[0]; + c00 = c00 ^ t[0]; + t[0] = c00 >>4u64 63; + t[1] = c00 +4u64 c00; + t[1] = t[1] | t[0]; + state[2] = state[2] ^ d00; + state[0] = state[0] ^ d00; + d14 = #VPBLEND_8u32(d14, t[1], (8u1)[1,1,0,0,0,0,0,0]); + t[4] = #VPBLEND_8u32(t[4], c00, (8u1)[0,0,0,0,0,0,1,1]); + d14 = d14 ^ t[4]; + + //######################################## Rho + Pi + pre-Chi shuffle + t[3] = #VPSLLV_4u64(state[2], rhotates_left_p[0] ); + state[2] = #VPSRLV_4u64(state[2], rhotates_right_p[0] ); + state[2] = state[2] | t[3]; + state[3] = state[3] ^ d14; + t[4] = #VPSLLV_4u64(state[3], rhotates_left_p[2] ); + state[3] = #VPSRLV_4u64(state[3], rhotates_right_p[2] ); + state[3] = state[3] | t[4]; + state[4] = state[4] ^ d14; + t[5] = #VPSLLV_4u64(state[4], rhotates_left_p[3] ); + state[4] = #VPSRLV_4u64(state[4], rhotates_right_p[3] ); + state[4] = state[4] | t[5]; + state[5] = state[5] ^ d14; + t[6] = #VPSLLV_4u64(state[5], rhotates_left_p[4] ); + state[5] = #VPSRLV_4u64(state[5], rhotates_right_p[4] ); + state[5] = state[5] | t[6]; + state[6] = state[6] ^ d14; + t[3] = #VPERMQ(state[2], (4u2)[2,0,3,1]); + t[4] = #VPERMQ(state[3], (4u2)[2,0,3,1]); + t[7] = #VPSLLV_4u64(state[6], rhotates_left_p[5] ); + t[1] = #VPSRLV_4u64(state[6], rhotates_right_p[5] ); + t[1] = t[1] | t[7]; + state[1] = state[1] ^ d14; + t[5] = #VPERMQ(state[4], (4u2)[0,1,2,3]); + t[6] = #VPERMQ(state[5], (4u2)[1,3,0,2]); + t[8] = #VPSLLV_4u64(state[1], rhotates_left_p[1] ); + t[2] = #VPSRLV_4u64(state[1], rhotates_right_p[1] ); + t[2] = t[2] | t[8]; + + //######################################## Chi + t[7] = #VPSRLDQ_256(t[1], 8); + t[0] = !t[1] & t[7]; + state[3] = #VPBLEND_8u32(t[2], t[6], (8u1)[0,0,0,0,1,1,0,0]); + t[8] = #VPBLEND_8u32(t[4], t[2], (8u1)[0,0,0,0,1,1,0,0]); + state[5] = #VPBLEND_8u32(t[3], t[4], (8u1)[0,0,0,0,1,1,0,0]); + t[7] = #VPBLEND_8u32(t[2], t[3], (8u1)[0,0,0,0,1,1,0,0]); + state[3] = #VPBLEND_8u32(state[3], t[4], (8u1)[0,0,1,1,0,0,0,0]); + t[8] = #VPBLEND_8u32(t[8], t[5], (8u1)[0,0,1,1,0,0,0,0]); + state[5] = #VPBLEND_8u32(state[5], t[2], (8u1)[0,0,1,1,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[6], (8u1)[0,0,1,1,0,0,0,0]); + state[3] = #VPBLEND_8u32(state[3], t[5], (8u1)[1,1,0,0,0,0,0,0]); + t[8] = #VPBLEND_8u32(t[8], t[6], (8u1)[1,1,0,0,0,0,0,0]); + state[5] = #VPBLEND_8u32(state[5], t[6], (8u1)[1,1,0,0,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[4], (8u1)[1,1,0,0,0,0,0,0]); + state[3] = !state[3] & t[8]; + state[5] = !state[5] & t[7]; + state[6] = #VPBLEND_8u32(t[5], t[2], (8u1)[0,0,0,0,1,1,0,0]); + t[8] = #VPBLEND_8u32(t[3], t[5], (8u1)[0,0,0,0,1,1,0,0]); + state[3] = state[3] ^ t[3]; + state[6] = #VPBLEND_8u32(state[6], t[3], (8u1)[0,0,1,1,0,0,0,0]); + t[8] = #VPBLEND_8u32(t[8], t[4], (8u1)[0,0,1,1,0,0,0,0]); + state[5] = state[5] ^ t[5]; + state[6] = #VPBLEND_8u32(state[6], t[4], (8u1)[1,1,0,0,0,0,0,0]); + t[8] = #VPBLEND_8u32(t[8], t[2], (8u1)[1,1,0,0,0,0,0,0]); + state[6] = !state[6] & t[8]; + state[6] = state[6] ^ t[6]; + state[4] = #VPERMQ(t[1], (4u2)[0,1,3,2]); + t[8] = #VPBLEND_8u32(state[4], state[0], (8u1)[0,0,1,1,0,0,0,0]); + state[1] = #VPERMQ(t[1], (4u2)[0,3,2,1]); + state[1] = #VPBLEND_8u32(state[1], state[0], (8u1)[1,1,0,0,0,0,0,0]); + state[1] = !state[1] & t[8]; + state[2] = #VPBLEND_8u32(t[4], t[5], (8u1)[0,0,0,0,1,1,0,0]); + t[7] = #VPBLEND_8u32(t[6], t[4], (8u1)[0,0,0,0,1,1,0,0]); + state[2] = #VPBLEND_8u32(state[2], t[6], (8u1)[0,0,1,1,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[3], (8u1)[0,0,1,1,0,0,0,0]); + state[2] = #VPBLEND_8u32(state[2], t[3], (8u1)[1,1,0,0,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[5], (8u1)[1,1,0,0,0,0,0,0]); + state[2] = !state[2] & t[7]; + state[2] = state[2] ^ t[2]; + t[0] = #VPERMQ(t[0], (4u2)[0,0,0,0]); + state[3] = #VPERMQ(state[3], (4u2)[0,1,2,3]); + state[5] = #VPERMQ(state[5], (4u2)[2,0,3,1]); + state[6] = #VPERMQ(state[6], (4u2)[1,3,0,2]); + state[4] = #VPBLEND_8u32(t[6], t[3], (8u1)[0,0,0,0,1,1,0,0]); + t[7] = #VPBLEND_8u32(t[5], t[6], (8u1)[0,0,0,0,1,1,0,0]); + state[4] = #VPBLEND_8u32(state[4], t[5], (8u1)[0,0,1,1,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[2], (8u1)[0,0,1,1,0,0,0,0]); + state[4] = #VPBLEND_8u32(state[4], t[2], (8u1)[1,1,0,0,0,0,0,0]); + t[7] = #VPBLEND_8u32(t[7], t[3], (8u1)[1,1,0,0,0,0,0,0]); + state[4] = !state[4] & t[7]; + state[0] = state[0] ^ t[0]; + state[1] = state[1] ^ t[1]; + state[4] = state[4] ^ t[4]; + + //######################################## Iota + state[0] = state[0] ^ iotas_p.[(int) iotas_o]; + iotas_o += 32; + + _,_,_,zf,r = #DEC_64(r); + }(!zf) + + return state; +} + +