From 44ddbd7b5344f3bcdc6c60e64d49fea8f6860797 Mon Sep 17 00:00:00 2001 From: "Thing-han, Lim" Date: Mon, 13 Nov 2023 15:30:27 +0800 Subject: [PATCH 01/14] feat(frodo): init frodo976shake and frodo640shake --- .../frodo/common/amd64/ref/encode.jinc | 58 +++ .../frodo/common/amd64/ref/matrix.jinc | 315 ++++++++++++++ .../frodo/common/amd64/ref/noise.jinc | 85 ++++ .../frodo/common/amd64/ref/pack.jinc | 325 ++++++++++++++ .../frodo/common/amd64/ref/shake128.jinc | 396 ++++++++++++++++++ .../frodo/common/amd64/ref/shake256.jinc | 338 +++++++++++++++ .../frodo/common/frodo640_params.jinc | 35 ++ .../frodo/common/frodo976_params.jinc | 33 ++ src/crypto_kem/frodo/frodo640shake/META.yml | 26 ++ .../frodo/frodo640shake/amd64/ref/Makefile | 2 + .../frodo640shake/amd64/ref/include/api.h | 36 ++ .../frodo/frodo640shake/amd64/ref/kem.jazz | 37 ++ .../frodo/frodo640shake/amd64/ref/kem.jinc | 383 +++++++++++++++++ src/crypto_kem/frodo/frodo976shake/META.yml | 26 ++ .../frodo/frodo976shake/amd64/ref/Makefile | 2 + .../frodo976shake/amd64/ref/include/api.h | 38 ++ .../frodo/frodo976shake/amd64/ref/kem.jazz | 37 ++ .../frodo/frodo976shake/amd64/ref/kem.jinc | 384 +++++++++++++++++ 18 files changed, 2556 insertions(+) create mode 100644 src/crypto_kem/frodo/common/amd64/ref/encode.jinc create mode 100644 src/crypto_kem/frodo/common/amd64/ref/matrix.jinc create mode 100644 src/crypto_kem/frodo/common/amd64/ref/noise.jinc create mode 100644 src/crypto_kem/frodo/common/amd64/ref/pack.jinc create mode 100644 src/crypto_kem/frodo/common/amd64/ref/shake128.jinc create mode 100644 src/crypto_kem/frodo/common/amd64/ref/shake256.jinc create mode 100644 src/crypto_kem/frodo/common/frodo640_params.jinc create mode 100644 src/crypto_kem/frodo/common/frodo976_params.jinc create mode 100644 src/crypto_kem/frodo/frodo640shake/META.yml create mode 100644 src/crypto_kem/frodo/frodo640shake/amd64/ref/Makefile create mode 100644 src/crypto_kem/frodo/frodo640shake/amd64/ref/include/api.h create mode 
100644 src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jazz create mode 100644 src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc create mode 100644 src/crypto_kem/frodo/frodo976shake/META.yml create mode 100644 src/crypto_kem/frodo/frodo976shake/amd64/ref/Makefile create mode 100644 src/crypto_kem/frodo/frodo976shake/amd64/ref/include/api.h create mode 100644 src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jazz create mode 100644 src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc diff --git a/src/crypto_kem/frodo/common/amd64/ref/encode.jinc b/src/crypto_kem/frodo/common/amd64/ref/encode.jinc new file mode 100644 index 00000000..7e2ec9cb --- /dev/null +++ b/src/crypto_kem/frodo/common/amd64/ref/encode.jinc @@ -0,0 +1,58 @@ +fn __encode(reg ptr u16[NBAR * NBAR]out, reg ptr u8[EXTRACTED_BITS * NBAR * NBAR / 8] in) -> stack u16[NBAR * NBAR] { + inline int i j; + reg u64 tmp tmp2 mask; + + mask = (1 << EXTRACTED_BITS) - 1; + + for i = 0 to NBAR { + tmp = 0; + + for j = 0 to EXTRACTED_BITS { + tmp2 = (64u)in[i * EXTRACTED_BITS + j]; + tmp2 <<= 8 * j; + tmp |= tmp2; + } + + for j = 0 to 8 { + out[i * NBAR + j] = tmp; + out[i * NBAR + j] &= mask; + out[i * NBAR + j] <<= D - EXTRACTED_BITS; + tmp >>= EXTRACTED_BITS; + } + } + + return out; +} + +fn __decode(reg ptr u8[EXTRACTED_BITS * NBAR] out, reg ptr u16[NBAR * NBAR] in) -> stack u8[EXTRACTED_BITS * NBAR] { + reg u32 tmplong tmp mask d; + inline int i j; + + d = 1 << (D - EXTRACTED_BITS - 1); + mask = (1 << EXTRACTED_BITS) - 1; + + out = out; + in = in; + + for i = 0 to NBAR { + tmplong = 0; + + for j = 0 to 8 { + tmp = (32u)in[(int) i * NBAR + j]; + tmp += d; + tmp >>= D - EXTRACTED_BITS; + tmp &= mask; + + tmp <<= EXTRACTED_BITS * j; + + tmplong |= tmp; + } + + for j = 0 to EXTRACTED_BITS { + out[i * EXTRACTED_BITS + j] = (8u)tmplong; + tmplong >>= 8; + } + } + + return out; +} diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc new file mode 
100644 index 00000000..4b6839ff --- /dev/null +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc @@ -0,0 +1,315 @@ +from Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc" + +#[returnaddress="stack"] +fn __AS_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[NNBAR] S E) -> stack u16[NNBAR] { + stack ptr u16[NNBAR] s_B; + stack u16[N] A_row; + stack ptr u8[BYTES_SEED_A] s_seedA; + stack ptr u16[NNBAR] s_S s_E; + stack u8[2 + BYTES_SEED_A] b; + + reg u64 i j k; stack u64 s_i s_j s_k; + reg u16 tmp ac; + + s_B = B; s_S = S; s_E = E; + + // copy seedA + i = 0; + while (i < BYTES_SEED_A) { + b[(int)i + 2] = seedA[(int)i]; + i += 1; + } + s_seedA = seedA; + + // first set B = E + B = s_B; E = s_E; + + i = 0; + while (i < NNBAR) { + B[(int)i] = E[(int)i]; + i += 1; + } + + s_B = B; s_E = E; s_i = i; + + // calculate A and B += A * S + b[u16 0] = 0; + k = 0; + + while (b[u16 0] < N) { + s_i = i; s_j = j; s_k = k; s_S = S; s_B = B; + A_row = __shake128_gen_A(A_row, b); + i = s_i; j = s_j; k = s_k; S = s_S; B = s_B; + + i = 0; + while (i < NNBAR) { + ac = 0; + j = 0; + + // A_row * S_T_row + while (j < N) { + tmp = A_row[(int)j]; + tmp *= S[(int)i]; + ac += tmp; + j += 1; + i += 1; + } + + B[(int)k] += ac; + k += 1; + } + + b[u16 0] += 1; + } + + return B; +} + +#[returnaddress="stack"] +fn __SA_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[NNBAR] S E) -> stack u16[NNBAR] { + stack ptr u16[NNBAR] s_B; + stack u16[N] A_row; + stack ptr u8[BYTES_SEED_A] s_seedA; + stack ptr u16[NNBAR] s_S s_E; + stack u8[2 + BYTES_SEED_A] b; + + reg u64 i j k; stack u64 s_i s_j s_k; + reg u16 tmp s; + + // copy seedA + i = 0; + while (i < BYTES_SEED_A) { + b[(int)i + 2] = seedA[(int)i]; + i += 1; + } + s_seedA = seedA; + + i = 0; + while (i < NNBAR) { + B[(int)i] = E[(int)i]; + i += 1; + } + s_B = B; s_S = S; s_E = E; + + // calculate A and B += S * A + b[u16 0] = 0; + + while (b[u16 0] < N) { + s_i = i; + A_row = 
__shake128_gen_A(A_row, b); + i = s_i; + + i = 0; + while (i < NNBAR) { + k = s_k; S = s_S; + + k = (64u)b[u16 0]; + k += i; + s = S[(int)k]; + + s_k = k; s_S = S; + + j = s_j; B = s_B; + j = 0; + while (j < N) { + tmp = A_row[(int)j]; + tmp *= s; + B[(int)i] += tmp; + + j += 1; + i += 1; + } + s_j = j; s_B = B; + } + + b[u16 0] += 1; + } + + return B; +} + +#[returnaddress="stack"] +fn __SB_plus_E(reg ptr u16[NBAR * NBAR] V, reg ptr u16[NNBAR] S B, reg ptr u16[NBAR * NBAR] E) -> stack u16[NBAR * NBAR] { + reg u64 i j k ti tj; + reg u16 tmp; + + i = 0; + while (i < NBAR * NBAR) { + V[(int)i] = E[(int)i]; + i += 1; + } + + i = 0; + while (i < NBAR) { + j = 0; + while (j < NBAR) { + ti = i * NBAR; + ti += j; + + k = 0; + while (k < N) { + tj = i * N; + tj += k; + tmp = S[(int)tj]; + tj = k * NBAR; + tj += j; + tmp *= B[(int)tj]; + + V[(int)ti] += tmp; + k += 1; + } + + V[(int)ti] &= (1 << D) - 1; + j += 1; + } + i += 1; + } + + return V; +} + +fn __matrix_add(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { + reg u64 i; + reg u16 tmp; + + i = 0; + while (i < NBAR * NBAR) { + tmp = a[(int)i]; + tmp += b[(int)i]; + tmp &= (1 << D) - 1; + a[(int)i] = tmp; + i += 1; + } + + return a; +} + +#[returnaddress="stack"] +// a = b - a +fn __matrix_sub(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { + reg u64 i; + reg u16 tmp; + + i = 0; + while (i < NBAR * NBAR) { + tmp = b[(int)i]; + tmp -= a[(int)i]; + tmp &= (1 << D) - 1; + a[(int)i] = tmp; + i += 1; + } + + return a; +} + +#[returnaddress="stack"] +fn __mul_BS(reg ptr u16[NBAR * NBAR] M, reg ptr u16[NNBAR]B S) -> stack u16[NBAR * NBAR] { + reg u64 i j k ti tj; + reg u16 tmp; + + i = 0; + while (i < NBAR) { + j = 0; + while (j < NBAR) { + ti = i * NBAR; + ti += j; + M[(int)ti] = 0; + + k = 0; + while (k < N) { + tj = i * N; + tj += k; + tmp = B[(int)tj]; + + tj = j * N; + tj += k; + tmp *= S[(int)tj]; + + M[(int)ti] += tmp; + + k += 1; + } + M[(int)ti] &= (1 << D) - 1; + j += 1; + } + + i += 1; + } + + 
return M; +} + +#[returnaddress="stack"] +fn __ct_verify_NNBAR(reg ptr u16[NNBAR] a b) -> stack u8 { + reg u64 i; + reg u16 ac tmp; + reg u8 r; + + i = 0; + ac = 0; + while (i < NNBAR) { + tmp = a[(int) i]; + tmp ^= b[(int)i]; + ac |= tmp; + i += 1; + } + + tmp = ac * -1; + ac |= tmp; + ac >>= 15; + ac *= (-1); + + r = (8u)ac; + + return r; +} + +#[returnaddress="stack"] +fn __ct_verify_NBAR2(reg ptr u16[NBAR * NBAR] a b) -> stack u8 { + reg u64 i; + reg u16 ac tmp; + reg u8 r; + + i = 0; + ac = 0; + while (i < NBAR * NBAR) { + tmp = a[(int) i]; + tmp ^= b[(int)i]; + ac |= tmp; + i += 1; + } + + tmp = ac * -1; + ac |= tmp; + ac >>= 15; + ac *= (-1); + + r = (8u) ac; + + return r; +} + +#[returnaddress="stack"] +fn __ct_select(reg ptr u8[BYTES_SEC] out a b, reg u8 selector) -> stack u8[BYTES_SEC] { + reg u64 i; + + reg u8 n_selector tmp; + + n_selector = selector; + n_selector ^= 0xFF; + + i = 0; + while (i < BYTES_SEC) { + tmp = a[(int)i]; + tmp &= n_selector; + out[(int)i] = tmp; + + tmp = b[(int)i]; + tmp &= selector; + out[(int)i] |= tmp; + + i += 1; + } + + return out; +} diff --git a/src/crypto_kem/frodo/common/amd64/ref/noise.jinc b/src/crypto_kem/frodo/common/amd64/ref/noise.jinc new file mode 100644 index 00000000..c39b352f --- /dev/null +++ b/src/crypto_kem/frodo/common/amd64/ref/noise.jinc @@ -0,0 +1,85 @@ +fn __sample_2NNBAR(reg ptr u16[2 * NNBAR] s) -> stack u16[2 * NNBAR] { + reg ptr u16[CDF_TABLE_LEN] cdftp; + cdftp = CDF_TABLE; + + reg u64 i; + i = 0; + while (i < 2 * NNBAR) { + reg u16 sample prnd sign; + + sample = 0; + + // prnd = s[i] >> 1 + prnd = s[(int)i]; + prnd >>= 1; + + // sign = s[(int)i] & 0x1 + sign = s[(int)i]; + sign &= 0x1; + + // no need to compare with the last value + inline int j; + for j = 0 to CDF_TABLE_LEN - 1 { + // sample += (CDF_TABLE[j] - prnd) >> 15 + + reg u16 tmp_sample; + tmp_sample = cdftp[j]; + tmp_sample -= prnd; + tmp_sample >>= 15; + sample += tmp_sample; + } + + // s[i] = ((-sign) ^ sample) + sign + s[(int)i] = 
0; + s[(int)i] -= sign; + s[(int)i] ^= sample; + s[(int)i] += sign; + + i += 1; + } + + return s; +} + +fn __sample_NBAR2(reg ptr u16[NBAR * NBAR] s) -> stack u16[NBAR * NBAR] { + reg ptr u16[CDF_TABLE_LEN] cdftp; + cdftp = CDF_TABLE; + + reg u64 i; + i = 0; + while (i < NBAR * NBAR) { + reg u16 sample prnd sign; + + sample = 0; + + // prnd = s[i] >> 1 + prnd = s[(int)i]; + prnd >>= 1; + + // sign = s[(int)i] & 0x1 + sign = s[(int)i]; + sign &= 0x1; + + // no need to compare with the last value + inline int j; + for j = 0 to CDF_TABLE_LEN - 1 { + // sample += (CDF_TABLE[j] - prnd) >> 15 + + reg u16 tmp_sample; + tmp_sample = cdftp[j]; + tmp_sample -= prnd; + tmp_sample >>= 15; + sample += tmp_sample; + } + + // s[i] = ((-sign) ^ sample) + sign + s[(int)i] = 0; + s[(int)i] -= sign; + s[(int)i] ^= sample; + s[(int)i] += sign; + + i += 1; + } + + return s; +} diff --git a/src/crypto_kem/frodo/common/amd64/ref/pack.jinc b/src/crypto_kem/frodo/common/amd64/ref/pack.jinc new file mode 100644 index 00000000..62510de2 --- /dev/null +++ b/src/crypto_kem/frodo/common/amd64/ref/pack.jinc @@ -0,0 +1,325 @@ +fn __pack_B(reg ptr u8[D * N]out, reg ptr u16[NNBAR] in) -> stack u8[D * N] { + reg u64 i j; + inline int k MID TERM Mask; + reg u64 ac tmp; + reg u16 acm tm; + + Mask = (1 << D) - 1; + TERM = (D - 1)/2; + MID = D - TERM * 2; + + i = 0; j = 0; + + // process 16 * 8 bits at a time + // process 2 parts: | 4 * D bits | 4 * D bits | + // separate into: | TERM bytes | 1st half of MID bytes | 2nd half of MID bytes | TERM bytes | + while (i < NNBAR) { + ac = 0; + acm = 0; + + // aggregate 1st half (16 * 4 bits) into 4 * D bits in ac + for k = 0 to 4 { + tmp = (64u)in[(int)i + k]; + tmp &= Mask; + ac <<= D; + ac |= tmp; + } + + // aggregate the 1st half of the MID in acm from ac + for k = 0 to MID { + tm = ac; + tm &= 0x0F; + tm <<= 4 * k; + acm |= tm; + ac >>= 4; + } + + // process the 1st TERM in ac + for k = 0 to TERM { + out[(int)j + TERM - 1 - k] = ac; + ac >>= 8; + } + + // 
aggregate 2nd half (16 * 4 bits) into 4 * D bits in ac + for k = 0 to 4 { + tmp = (64u)in[(int)i + 4 + k]; + tmp &= Mask; + ac <<= D; + ac |= tmp; + } + + // process the 2nd TERM in ac + for k = 0 to TERM { + out[(int)j + D - 1 - k] = ac; + ac >>= 8; + } + + // aggregate the 2nd half of the MID in acm from ac + acm <<= 4 * MID; + for k = 0 to MID { + tm = ac; + tm &= 0x0F; + tm <<= 4 * k; + acm |= tm; + ac >>= 4; + } + + for k = 0 to MID { + out[(int)j + TERM + MID - 1 - k] = acm; + acm >>= 8; + } + + i += 8; + j += D; + } + + return out; +} + +fn __unpack_B(reg ptr u16[NNBAR]out, reg ptr u8[D * N]in) -> stack u16[NNBAR] { + inline int k TERM MID MASK; + + reg u64 i j ac tmp; + reg u16 acm tm; + + // D = TERM + MID + TERM + TERM = (D - 1)/2; + MID = D - TERM * 2; + MASK = (1 << D) - 1; + + i = 0; j = 0; + + // process D bytes at a time + // separate processing into: | TERM bytes | MID bytes | TERM bytes | + // combined 1st half 4 * D bits: | TERM | 1st half of MID | + // combined 2nd half 4 * D bits: | 2nd half of MID | TERM | + while (i < D * N) { + ac = 0; + acm = 0; + + // accumulate the MID bytes (8/16 for D = 15/16) in acm (u16) + for k = 0 to MID { + tm = (16u)in[(int)i + TERM + k]; + acm <<= 8; + acm |= tm; + } + + // aggregate 2nd half of MID from acm in ac + for k = 0 to MID { + tmp = (64u)acm; + tmp &= 0x0F; + tmp <<= 4 * k; + ac |= tmp; + acm >>= 4; + } + + // accumulate 2nd TERM bytes in ac + for k = 0 to TERM { + tmp = (64u)in[(int)i + TERM + MID + k]; + ac <<= 8; + ac |= tmp; + } + + // result in 4 * D bits in 2nd half of output + for k = 0 to 4 { + tm = ac; + tm &= MASK; + out[(int)j + 7 - k] = tm; + ac >>= D; + } + + // accumulate 1st TERM bytes in ac + for k = 0 to TERM { + tmp = (64u)in[(int)i + k]; + ac <<= 8; + ac |= tmp; + } + + // aggregate 2nd half of MID from acm to ac + ac <<= 4 * MID; + for k = 0 to MID { + tmp = (64u)acm; + tmp &= 0x0F; + tmp <<= 4 * k; + ac |= tmp; + acm >>= 4; + } + + // result in 4 * D bits in 1st half of output + for 
k = 0 to 4 { + // disable implicit scaling for handling little endianess + tm = ac; + tm &= MASK; + out[(int)j + 3 - k] = tm; + ac >>= D; + } + + i += D; + j += 8; + } + + return out; +} + +fn __pack_C(reg ptr u8[D * NBAR]out, reg ptr u16[NBAR * NBAR] in) -> stack u8[D * NBAR] { + reg u64 i j; + inline int k MID TERM Mask; + reg u64 ac tmp; + reg u16 acm tm; + + Mask = (1 << D) - 1; + TERM = (D - 1)/2; + MID = D - TERM * 2; + + i = 0; j = 0; + + // process 16 * 8 bits at a time + // process 2 parts: | 4 * D bits | 4 * D bits | + // separate into: | TERM bytes | 1st half of MID bytes | 2nd half of MID bytes | TERM bytes | + while (i < NBAR * NBAR) { + ac = 0; + acm = 0; + + // aggregate 1st half (16 * 4 bits) into 4 * D bits in ac + for k = 0 to 4 { + tmp = (64u)in[(int)i + k]; + tmp &= Mask; + ac <<= D; + ac |= tmp; + } + + // aggregate the 1st half of the MID in acm from ac + for k = 0 to MID { + tm = ac; + tm &= 0x0F; + tm <<= 4 * k; + acm |= tm; + ac >>= 4; + } + + // process the 1st TERM in ac + for k = 0 to TERM { + out[(int)j + TERM - 1 - k] = ac; + ac >>= 8; + } + + // aggregate 2nd half (16 * 4 bits) into 4 * D bits in ac + for k = 0 to 4 { + tmp = (64u)in[(int)i + 4 + k]; + tmp &= Mask; + ac <<= D; + ac |= tmp; + } + + // process the 2nd TERM in ac + for k = 0 to TERM { + out[(int)j + D - 1 - k] = ac; + ac >>= 8; + } + + // aggregate the 2nd half of the MID in acm from ac + acm <<= 4 * MID; + for k = 0 to MID { + tm = ac; + tm &= 0x0F; + tm <<= 4 * k; + acm |= tm; + ac >>= 4; + } + + for k = 0 to MID { + out[(int)j + TERM + MID - 1 - k] = acm; + acm >>= 8; + } + + i += 8; + j += D; + } + + return out; +} + +fn __unpack_C(reg ptr u16[NBAR * NBAR]out, reg ptr u8[D * NBAR]in) -> stack u16[NBAR * NBAR] { + inline int k TERM MID MASK; + + reg u64 i j ac tmp; + reg u16 acm tm; + + // D = TERM + MID + TERM + TERM = (D - 1)/2; + MID = D - TERM * 2; + MASK = (1 << D) - 1; + + i = 0; j = 0; + + // process D bytes each time + // separate processing into: | TERM bytes 
| MID bytes | TERM bytes | + // combined 1st half 4 * D bits: | TERM | 1st half of MID | + // combined 2nd half 4 * D bits: | 2nd half of MID | TERM | + while (i < D * NBAR) { + ac = 0; + acm = 0; + + // accumulate the MID bytes (8/16 for D = 15/16) in acm (u16) + for k = 0 to MID { + tm = (16u)in[(int)i + TERM + k]; + acm <<= 8; + acm |= tm; + } + + // aggregate 2nd half of MID from acm in ac + for k = 0 to MID { + tmp = (64u)acm; + tmp &= 0x0F; + tmp <<= 4 * k; + ac |= tmp; + acm >>= 4; + } + + // accumulate 2nd TERM bytes in ac + for k = 0 to TERM { + tmp = (64u)in[(int)i + TERM + MID + k]; + ac <<= 8; + ac |= tmp; + } + + // result in 4 * D bits in 2nd half of output + for k = 0 to 4 { + tm = ac; + tm &= MASK; + out[(int)j + 7 - k] = tm; + ac >>= D; + } + + // accumulate 1st TERM bytes in ac + for k = 0 to TERM { + tmp = (64u)in[(int)i + k]; + ac <<= 8; + ac |= tmp; + } + + // aggregate 2nd half of MID from acm to ac + ac <<= 4 * MID; + for k = 0 to MID { + tmp = (64u)acm; + tmp &= 0x0F; + tmp <<= 4 * k; + ac |= tmp; + acm >>= 4; + } + + // result in 4 * D bits in 1st half of output + for k = 0 to 4 { + // disable implicit scaling for handling little endianess + tm = ac; + tm &= MASK; + out[(int)j + 3 - k] = tm; + ac >>= D; + } + + i += D; + j += 8; + } + + return out; +} diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake128.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake128.jinc new file mode 100644 index 00000000..498bcb85 --- /dev/null +++ b/src/crypto_kem/frodo/common/amd64/ref/shake128.jinc @@ -0,0 +1,396 @@ +from Jade require "common/keccak/keccak1600/amd64/ref1/keccak1600.jinc" + +param int SHAKE128_RATE = 168; + +#[returnaddress="stack"] +fn __shake128_gen_A(reg ptr u8[2 * N] out, reg const ptr u8[2 + BYTES_SEED_A] in) -> stack u8[2 * N] +{ + stack ptr u8[2 * N] s_out; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + inline int INLEN OUTLEN OUTRND; + reg u64 i j; stack u64 s_i s_j; + + INLEN = 2 + BYTES_SEED_A; + OUTLEN = 2 * N; 
+ OUTRND = OUTLEN / SHAKE128_RATE; + + state = s_state; + state = __keccak_init_ref1(state); + + i = 0; + while (i < INLEN) { + c = in[(int)i]; + state[u8 (int)i] ^= c; + i += 1; + } + state[u8 INLEN] ^= 0x1f; + state[u8 SHAKE128_RATE-1] ^= 0x80; + + i = 0; + while (i < OUTRND * SHAKE128_RATE) { + s_out = out; s_i = i; s_j = j; + + state = __keccakf1600_ref1(state); + + out = s_out; i = s_i; j = s_j; + + j = 0; + while (j < SHAKE128_RATE) { + out[(int)i] = state[u8 (int)j]; + i += 1; + j += 1; + } + } + + s_out = out; s_i = i; s_j = j; + + state = __keccakf1600_ref1(state); + + out = s_out; i = s_i; + + j = 0; + while (i < OUTLEN) { + out[(int)i] = state[u8 (int)j]; + i += 1; + j += 1; + } + + return out; +} + +#[returnaddress="stack"] +fn __shake128_seed_A(reg ptr u8[BYTES_SEED_A] out, reg const ptr u8[BYTES_SEED_A] in) -> stack u8[BYTES_SEED_A] +{ + stack ptr u8[BYTES_SEED_A] s_out; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + reg u64 i; stack u64 s_i; + inline int INLEN OUTLEN; + + INLEN = BYTES_SEED_A; OUTLEN = BYTES_SEED_A; + + s_out = out; + + state = s_state; + state = __keccak_init_ref1(state); + + i = 0; + while (i < INLEN) { + c = in[(int) i]; + state[u8 (int) i] ^= c; + i += 1; + } + state[u8 (int) i] ^= 0x1f; + state[u8 SHAKE128_RATE-1] ^= 0x80; + + s_i = i; + + state = __keccakf1600_ref1(state); + + out = s_out; + + i = s_i; + i = 0; + while (i < OUTLEN) { + out[(int) i] = state[u8 (int) i]; + i += 1; + } + + return out; +} + +#[returnaddress="stack"] +fn __shake128_r(reg ptr u8[4 * NNBAR] out, reg const ptr u8[1 + BYTES_SEED_SE] in) -> stack u8[4 * NNBAR] +{ + stack ptr u8[4 * NNBAR] s_out; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + inline int INLEN OUTLEN OUTRND; + reg u64 i j; + stack u64 s_i s_j; + + INLEN = 1 + BYTES_SEED_SE; + OUTLEN = 4 * NNBAR; + OUTRND = OUTLEN / SHAKE128_RATE; + + s_out = out; + + state = s_state; + state = __keccak_init_ref1(state); + + i = 0; + while (i < INLEN) { + c = in[(int) i]; + 
state[u8 (int) i] ^= c; + i += 1; + } + + state[u8 INLEN] ^= 0x1f; + state[u8 SHAKE128_RATE-1] ^= 0x80; + + i = 0; + while (i < OUTRND * SHAKE128_RATE) { + s_i = i; s_j = j; s_out = out; + + state = __keccakf1600_ref1(state); + + out = s_out; i = s_i; j = s_j; + j = 0; + + while (j < SHAKE128_RATE) { + out[(int)i] = state[u8 (int)j]; + j += 1; + i += 1; + } + } + + s_i = i; s_j = j; s_out = out; + + state = __keccakf1600_ref1(state); + + out = s_out; i = s_i; j = s_j; + j = 0; + + while (i < OUTLEN) { + out[(int) i] = state[u8 (int) j]; + j += 1; + i += 1; + } + + return out; +} + +#[returnaddress="stack"] +fn __shake128_pkh(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_PK] in) -> stack u8[BYTES_SEC] +{ + stack ptr u8[BYTES_SEC] s_out; + stack ptr u8[BYTES_PK] s_in; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + reg u64 i j; stack u64 s_i s_j; + inline int INLEN OUTLEN INRND; + + INLEN = BYTES_PK; OUTLEN = BYTES_SEC; + INRND = INLEN / SHAKE128_RATE; + s_out = out; + s_in = in; + + state = s_state; + state = __keccak_init_ref1(state); + + i = 0; + while (i < INRND * SHAKE128_RATE) { + in = s_in; + j = 0; + while (j < SHAKE128_RATE) { + c = in[(int)i]; + state[u8 (int)j] ^= c; + + i += 1; + j += 1; + } + + s_in = in; s_i = i; s_j = j; + state = __keccakf1600_ref1(state); + i = s_i; j = s_j; + } + + in = s_in; + j = 0; + while (i < INLEN) { + c = in[(int)i]; + state[u8 (int)j] ^= c; + i += 1; + j += 1; + } + + state[u8 INLEN - INRND * SHAKE128_RATE] ^= 0x1f; + state[u8 SHAKE128_RATE-1] ^= 0x80; + + s_in = in; s_i = i; s_j = j; + + state = __keccakf1600_ref1(state); + + out = s_out; i = s_i; + + i = 0; + while (i < OUTLEN) { + out[(int) i] = state[u8 (int) i]; + i += 1; + } + + return out; +} + +#[returnaddress="stack"] +fn __shake128_SE_k(reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out, reg const ptr u8[2 * BYTES_SEC + BYTES_SALT] in) -> stack u8[BYTES_SEED_SE + BYTES_SEC] { + stack ptr u8[BYTES_SEED_SE + BYTES_SEC] s_out; + stack ptr u8[2 * BYTES_SEC + 
BYTES_SALT] s_in; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + inline int INLEN OUTLEN; + reg u64 i; stack u64 s_i; + + INLEN = 2 * BYTES_SEC + BYTES_SALT; + OUTLEN = BYTES_SEED_SE + BYTES_SEC; + + s_out = out; s_in = in; + + state = s_state; + state = __keccak_init_ref1(state); + + in = s_in; + i = 0; + while (i < INLEN) { + c = in[(int) i]; + state[u8 (int) i] ^= c; + i += 1; + } + + state[u8 INLEN] ^= 0x1f; + state[u8 SHAKE128_RATE-1] ^= 0x80; + + s_in = in; s_i = i; + + state = __keccakf1600_ref1(state); + + out = s_out; i = s_i; + i = 0; + + while (i < OUTLEN) { + out[(int) i] = state[u8 (int) i]; + i += 1; + } + + return out; +} + + +#[returnaddress="stack"] +fn __shake128_encap_r(reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, reg const ptr u8[1 + BYTES_SEED_SE] in) -> stack u8[2 * (2 * NNBAR + NBAR * NBAR)] { + stack ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] s_out; + stack ptr u8[1 + BYTES_SEED_SE] s_in; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + inline int INLEN OUTLEN OUTRND; + reg u64 i j; stack u64 s_i s_j; + + INLEN = 1 + BYTES_SEED_SE; + OUTLEN = 2 * (2 * NNBAR + NBAR * NBAR); + OUTRND = OUTLEN / SHAKE128_RATE; + + s_out = out; + + state = s_state; + state = __keccak_init_ref1(state); + + i = 0; + while (i < INLEN) { + c = in[(int)i]; + state[u8 (int)i] ^= c; + i += 1; + } + + state[u8 INLEN] ^= 0x1f; + state[u8 SHAKE128_RATE-1] ^= 0x80; + + s_in = in; + + i = 0; + while (i < OUTRND * SHAKE128_RATE) { + s_out = out; s_i = i; s_j = j; + + state = __keccakf1600_ref1(state); + + out = s_out; i = s_i; j = s_j; + j = 0; + while (j < SHAKE128_RATE) { + out[(int)i] = state[u8 (int)j]; + i += 1; + j += 1; + } + } + + s_out = out; s_i = i; s_j = j; + state = __keccakf1600_ref1(state); + out = s_out; i = s_i; j = s_j; + + j = 0; + while (i < OUTLEN) { + out[(int) i] = state[u8 (int)j]; + i += 1; + j += 1; + } + + return out; +} + +#[returnaddress="stack"] +fn __shake128_ss(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_CT + 
BYTES_SEC] in) -> stack u8[BYTES_SEC] +{ + stack ptr u8[BYTES_SEC] s_out; + stack ptr u8[BYTES_CT + BYTES_SEC] s_in; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + reg u64 i j; stack u64 s_i s_j; + inline int INLEN OUTLEN INRND; + + INLEN = BYTES_CT + BYTES_SEC; + OUTLEN = BYTES_SEC; + INRND = INLEN / SHAKE128_RATE; + + s_out = out; s_in = in; + + state = s_state; + state = __keccak_init_ref1(state); + + i = 0; + while (i < INRND * SHAKE128_RATE) { + in = s_in; + j = 0; + while (j < SHAKE128_RATE) { + c = in[(int)i]; + state[u8 (int)j] ^= c; + + i += 1; + j += 1; + } + + s_in = in; s_i = i; s_j = j; + state = __keccakf1600_ref1(state); + i = s_i; j = s_j; + } + + in = s_in; + j = 0; + while (i < INLEN) { + c = in[(int)i]; + state[u8 (int)j] ^= c; + i += 1; + j += 1; + } + s_in = in; s_i = i; s_j = j; + + state[u8 INLEN - INRND * SHAKE128_RATE] ^= 0x1f; + state[u8 SHAKE128_RATE-1] ^= 0x80; + + state = __keccakf1600_ref1(state); + + out = s_out; i = s_i; + + i = 0; + while (i < OUTLEN) { + out[(int)i] = state[u8 (int)i]; + i += 1; + } + + return out; +} diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake256.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake256.jinc new file mode 100644 index 00000000..1ed5acf0 --- /dev/null +++ b/src/crypto_kem/frodo/common/amd64/ref/shake256.jinc @@ -0,0 +1,338 @@ +from Jade require "common/keccak/keccak1600/amd64/ref1/keccak1600.jinc" + +param int SHAKE256_RATE = 136; + +#[returnaddress="stack"] +fn __shake256_seed_A(reg ptr u8[BYTES_SEED_A] out, reg const ptr u8[BYTES_SEED_A] in) -> stack u8[BYTES_SEED_A] +{ + stack ptr u8[BYTES_SEED_A] s_out; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + reg u64 i; stack u64 s_i; + inline int INLEN OUTLEN; + + INLEN = BYTES_SEED_A; OUTLEN = BYTES_SEED_A; + + s_out = out; + + state = s_state; + state = __keccak_init_ref1(state); + + i = 0; + while (i < INLEN) { + c = in[(int) i]; + state[u8 (int) i] ^= c; + i += 1; + } + state[u8 (int) i] ^= 0x1f; + state[u8 
SHAKE256_RATE-1] ^= 0x80; + + s_i = i; + + state = _keccakf1600_ref1(state); + + out = s_out; + + i = s_i; + i = 0; + while (i < OUTLEN) { + out[(int) i] = state[u8 (int) i]; + i += 1; + } + + return out; +} + +#[returnaddress="stack"] +fn __shake256_r(reg ptr u8[4 * NNBAR] out, reg const ptr u8[1 + BYTES_SEED_SE] in) -> stack u8[4 * NNBAR] +{ + stack ptr u8[4 * NNBAR] s_out; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + inline int INLEN OUTLEN OUTRND; + reg u64 i j; + stack u64 s_i s_j; + + INLEN = 1 + BYTES_SEED_SE; + OUTLEN = 4 * NNBAR; + OUTRND = OUTLEN / SHAKE256_RATE; + + s_out = out; + + state = s_state; + state = __keccak_init_ref1(state); + + i = 0; + while (i < INLEN) { + c = in[(int) i]; + state[u8 (int) i] ^= c; + i += 1; + } + + state[u8 INLEN] ^= 0x1f; + state[u8 SHAKE256_RATE-1] ^= 0x80; + + i = 0; + while (i < OUTRND * SHAKE256_RATE) { + s_i = i; s_j = j; s_out = out; + + state = _keccakf1600_ref1(state); + + out = s_out; i = s_i; j = s_j; + j = 0; + + while (j < SHAKE256_RATE) { + out[(int)i] = state[u8 (int)j]; + j += 1; + i += 1; + } + } + + s_i = i; s_j = j; s_out = out; + + state = _keccakf1600_ref1(state); + + out = s_out; i = s_i; j = s_j; + j = 0; + + while (i < OUTLEN) { + out[(int) i] = state[u8 (int) j]; + j += 1; + i += 1; + } + + return out; +} + +#[returnaddress="stack"] +fn __shake256_pkh(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_PK] in) -> stack u8[BYTES_SEC] +{ + stack ptr u8[BYTES_SEC] s_out; + stack ptr u8[BYTES_PK] s_in; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + reg u64 i j; stack u64 s_i s_j; + inline int INLEN OUTLEN INRND; + + INLEN = BYTES_PK; OUTLEN = BYTES_SEC; + INRND = INLEN / SHAKE256_RATE; + s_out = out; + s_in = in; + + state = s_state; + state = __keccak_init_ref1(state); + + i = 0; + while (i < INRND * SHAKE256_RATE) { + in = s_in; + j = 0; + while (j < SHAKE256_RATE) { + c = in[(int)i]; + state[u8 (int)j] ^= c; + + i += 1; + j += 1; + } + + s_in = in; s_i = i; s_j = j; 
+ state = _keccakf1600_ref1(state); + i = s_i; j = s_j; + } + + in = s_in; + j = 0; + while (i < INLEN) { + c = in[(int)i]; + state[u8 (int)j] ^= c; + i += 1; + j += 1; + } + + state[u8 INLEN - INRND * SHAKE256_RATE] ^= 0x1f; + state[u8 SHAKE256_RATE-1] ^= 0x80; + + s_in = in; s_i = i; s_j = j; + + state = _keccakf1600_ref1(state); + + out = s_out; i = s_i; + + i = 0; + while (i < OUTLEN) { + out[(int) i] = state[u8 (int) i]; + i += 1; + } + + return out; +} + +#[returnaddress="stack"] +fn __shake256_SE_k(reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out, reg const ptr u8[2 * BYTES_SEC + BYTES_SALT] in) -> stack u8[BYTES_SEED_SE + BYTES_SEC] { + stack ptr u8[BYTES_SEED_SE + BYTES_SEC] s_out; + stack ptr u8[2 * BYTES_SEC + BYTES_SALT] s_in; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + inline int INLEN OUTLEN; + reg u64 i; stack u64 s_i; + + INLEN = 2 * BYTES_SEC + BYTES_SALT; + OUTLEN = BYTES_SEED_SE + BYTES_SEC; + + s_out = out; s_in = in; + + state = s_state; + state = __keccak_init_ref1(state); + + in = s_in; + i = 0; + while (i < INLEN) { + c = in[(int) i]; + state[u8 (int) i] ^= c; + i += 1; + } + + state[u8 INLEN] ^= 0x1f; + state[u8 SHAKE256_RATE-1] ^= 0x80; + + s_in = in; s_i = i; + + state = _keccakf1600_ref1(state); + + out = s_out; i = s_i; + i = 0; + + while (i < OUTLEN) { + out[(int) i] = state[u8 (int) i]; + i += 1; + } + + return out; +} + + +#[returnaddress="stack"] +fn __shake256_encap_r(reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, reg const ptr u8[1 + BYTES_SEED_SE] in) -> stack u8[2 * (2 * NNBAR + NBAR * NBAR)] { + stack ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] s_out; + stack ptr u8[1 + BYTES_SEED_SE] s_in; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + inline int INLEN OUTLEN OUTRND; + reg u64 i j; stack u64 s_i s_j; + + INLEN = 1 + BYTES_SEED_SE; + OUTLEN = 2 * (2 * NNBAR + NBAR * NBAR); + OUTRND = OUTLEN / SHAKE256_RATE; + + s_out = out; + + state = s_state; + state = __keccak_init_ref1(state); + + i = 0; + while (i < 
INLEN) { + c = in[(int)i]; + state[u8 (int)i] ^= c; + i += 1; + } + + state[u8 INLEN] ^= 0x1f; + state[u8 SHAKE256_RATE-1] ^= 0x80; + + s_in = in; + + i = 0; + while (i < OUTRND * SHAKE256_RATE) { + s_out = out; s_i = i; s_j = j; + + state = _keccakf1600_ref1(state); + + out = s_out; i = s_i; j = s_j; + j = 0; + while (j < SHAKE256_RATE) { + out[(int)i] = state[u8 (int)j]; + i += 1; + j += 1; + } + } + + s_out = out; s_i = i; s_j = j; + state = _keccakf1600_ref1(state); + out = s_out; i = s_i; j = s_j; + + j = 0; + while (i < OUTLEN) { + out[(int) i] = state[u8 (int)j]; + i += 1; + j += 1; + } + + return out; +} + +#[returnaddress="stack"] +fn __shake256_ss(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_CT + BYTES_SEC] in) -> stack u8[BYTES_SEC] +{ + stack ptr u8[BYTES_SEC] s_out; + stack ptr u8[BYTES_CT + BYTES_SEC] s_in; + stack u64[25] s_state; + reg ptr u64[25] state; + reg u8 c; + reg u64 i j; stack u64 s_i s_j; + inline int INLEN OUTLEN INRND; + + INLEN = BYTES_CT + BYTES_SEC; + OUTLEN = BYTES_SEC; + INRND = INLEN / SHAKE256_RATE; + + s_out = out; s_in = in; + + state = s_state; + state = __keccak_init_ref1(state); + + i = 0; + while (i < INRND * SHAKE256_RATE) { + in = s_in; + j = 0; + while (j < SHAKE256_RATE) { + c = in[(int)i]; + state[u8 (int)j] ^= c; + + i += 1; + j += 1; + } + + s_in = in; s_i = i; s_j = j; + state = _keccakf1600_ref1(state); + i = s_i; j = s_j; + } + + in = s_in; + j = 0; + while (i < INLEN) { + c = in[(int)i]; + state[u8 (int)j] ^= c; + i += 1; + j += 1; + } + s_in = in; s_i = i; s_j = j; + + state[u8 INLEN - INRND * SHAKE256_RATE] ^= 0x1f; + state[u8 SHAKE256_RATE-1] ^= 0x80; + + state = _keccakf1600_ref1(state); + + out = s_out; i = s_i; + + i = 0; + while (i < OUTLEN) { + out[(int)i] = state[u8 (int)i]; + i += 1; + } + + return out; +} diff --git a/src/crypto_kem/frodo/common/frodo640_params.jinc b/src/crypto_kem/frodo/common/frodo640_params.jinc new file mode 100644 index 00000000..2656cc7e --- /dev/null +++ 
b/src/crypto_kem/frodo/common/frodo640_params.jinc @@ -0,0 +1,35 @@ +// params of frodo 640 + +param int EXTRACTED_BITS = 2; +param int D = 15; +param int N = 640; +param int NBAR = 8; + +param int NNBAR = N * NBAR; + +param int BYTES_SEED_A = 16; +param int BYTES_SEC = 16; +param int BYTES_SEED_SE = 32; +param int BYTES_SALT = 32; + +param int BYTES_PK = BYTES_SEED_A + D * N; +param int BYTES_SK = BYTES_SEC + BYTES_PK + 2 * NNBAR + BYTES_SEC; +param int BYTES_CT = D * N + D * NBAR + BYTES_SALT; + +param int CDF_TABLE_LEN = 13; + +u16[CDF_TABLE_LEN] CDF_TABLE = { + 4643, + 13363, + 20579, + 25843, + 29227, + 31145, + 32103, + 32525, + 32689, + 32745, + 32762, + 32766, + 32767 +}; diff --git a/src/crypto_kem/frodo/common/frodo976_params.jinc b/src/crypto_kem/frodo/common/frodo976_params.jinc new file mode 100644 index 00000000..49c32cd5 --- /dev/null +++ b/src/crypto_kem/frodo/common/frodo976_params.jinc @@ -0,0 +1,33 @@ +// params of frodo 976 + +param int EXTRACTED_BITS = 3; +param int D = 16; +param int N = 976; +param int NBAR = 8; + +param int NNBAR = N * NBAR; + +param int BYTES_SEED_A = 16; +param int BYTES_SEC = 24; +param int BYTES_SEED_SE = 48; +param int BYTES_SALT = 48; + +param int BYTES_PK = BYTES_SEED_A + D * N; +param int BYTES_SK = BYTES_SEC + BYTES_PK + 2 * NNBAR + BYTES_SEC; +param int BYTES_CT = D * N + D * NBAR + BYTES_SALT; + +param int CDF_TABLE_LEN = 11; + +u16[CDF_TABLE_LEN] CDF_TABLE = { + 5638, + 15915, + 23689, + 28571, + 31116, + 32217, + 32613, + 32731, + 32760, + 32766, + 32767 +}; diff --git a/src/crypto_kem/frodo/frodo640shake/META.yml b/src/crypto_kem/frodo/frodo640shake/META.yml new file mode 100644 index 00000000..a3dbdce3 --- /dev/null +++ b/src/crypto_kem/frodo/frodo640shake/META.yml @@ -0,0 +1,26 @@ +name: frodo640shake +type: kem +checksumsmall: 24cf060c7263b4e138de49f07c70d0c3392e1d57b1295f25bff27eee7c88ebe5 +checksumbig: ac84a42c2750be4a04256b3fb4817a8483d3422d5f557e067e320491147f5401 +claimed-nist-level: 1 
+claimed-security: IND-CCA2 +length-public-key: 9616 +length-ciphertext: 9752 +length-secret-key: 19888 +length-shared-secret: 16 +principal-submitters: + - TODO +auxiliary-submitters: + - TODO +implementations: + - name: amd64/ref + version: TODO + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: # FIXME + - avx2 + - bmi2 + - popcnt diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/Makefile b/src/crypto_kem/frodo/frodo640shake/amd64/ref/Makefile new file mode 100644 index 00000000..29a80faa --- /dev/null +++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/Makefile @@ -0,0 +1,2 @@ +SRCS := kem.jazz +include ../../../../../Makefile.common diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/include/api.h b/src/crypto_kem/frodo/frodo640shake/amd64/ref/include/api.h new file mode 100644 index 00000000..3eebf599 --- /dev/null +++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/include/api.h @@ -0,0 +1,36 @@ +#ifndef JADE_KEM_frodo_frodo640shake_amd64_ref_API_H +#define JADE_KEM_frodo_frodo640shake_amd64_ref_API_H + +#include <stdint.h> + +#define JADE_KEM_frodo_frodo640shake_amd64_ref_SECRETKEYBYTES 19888 +#define JADE_KEM_frodo_frodo640shake_amd64_ref_PUBLICKEYBYTES 9616 +#define JADE_KEM_frodo_frodo640shake_amd64_ref_CIPHERTEXTBYTES 9752 +#define JADE_KEM_frodo_frodo640shake_amd64_ref_KEYPAIRCOINBYTES 64 +#define JADE_KEM_frodo_frodo640shake_amd64_ref_ENCCOINBYTES 48 +#define JADE_KEM_frodo_frodo640shake_amd64_ref_BYTES 16 + +#define JADE_KEM_frodo_frodo640shake_amd64_ref_ALGNAME "Frodo640" +#define JADE_KEM_frodo_frodo640shake_amd64_ref_ARCH "amd64" +#define JADE_KEM_frodo_frodo640shake_amd64_ref_IMPL "ref" + +// kem api +int jade_kem_frodo_frodo640shake_amd64_ref_keypair_derand( + uint8_t *public_key, uint8_t *secret_key, const uint8_t *coins); + +int jade_kem_frodo_frodo640shake_amd64_ref_keypair(uint8_t *public_key, + uint8_t *secret_key); + +int jade_kem_frodo_frodo640shake_amd64_ref_enc_derand( + uint8_t
*ciphertext, uint8_t *shared_secret, const uint8_t *public_key, + const uint8_t *coins); + +int jade_kem_frodo_frodo640shake_amd64_ref_enc(uint8_t *ciphertext, + uint8_t *shared_secret, + const uint8_t *public_key); + +int jade_kem_frodo_frodo640shake_amd64_ref_dec(uint8_t *shared_secret, + uint8_t *ciphertext, + uint8_t *secret_key); + +#endif diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jazz b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jazz new file mode 100644 index 00000000..b6dd27f1 --- /dev/null +++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jazz @@ -0,0 +1,37 @@ +from Jade require "crypto_kem/frodo/common/frodo640_params.jinc" +from Jade require "crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc" + +export fn jade_kem_frodo_frodo640shake_amd64_ref_keypair_derand(#public reg u64 pkp skp coinsp) -> #public reg u64 { + reg u64 r; + _frodo_amd64_ref_keypair_derand(pkp, skp, coinsp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_frodo_frodo640shake_amd64_ref_keypair(#public reg u64 pkp skp) -> #public reg u64 { + reg u64 r; + _frodo_amd64_ref_keypair(pkp, skp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_frodo_frodo640shake_amd64_ref_enc_derand(#public reg u64 ctp ssp pkp coinsp) -> #public reg u64 { + reg u64 r; + _frodo_amd64_ref_enc_derand(ctp, ssp, pkp, coinsp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_frodo_frodo640shake_amd64_ref_enc(#public reg u64 ctp ssp pkp) -> #public reg u64 { + reg u64 r; + _frodo_amd64_ref_enc(ctp, ssp, pkp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_frodo_frodo640shake_amd64_ref_dec(#public reg u64 ssp ctp skp) -> #public reg u64 { + reg u64 r; + _frodo_amd64_ref_dec(ssp, ctp, skp); + ?{}, r = #set0(); + return r; +} diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc new file mode 100644 index 00000000..9e840992 --- /dev/null +++ 
b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc @@ -0,0 +1,383 @@ +from Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/encode.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/noise.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/matrix.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/pack.jinc" + +#[returnaddress="stack"] +fn __gen_SE(reg ptr u16[2 * NNBAR] SE, reg ptr u8[BYTES_SEED_SE] r) -> stack u16[2 * NNBAR] { + stack u8[1 + BYTES_SEED_SE] seed_se; + reg u64 i; stack u64 s_i; + + r = r; + i = 0; + while (i < BYTES_SEED_SE) { + seed_se[(int)i + 1] = r[(int)i]; + i += 1; + } + s_i = i; + + // S || E + seed_se[0] = 0x5F; + + SE = __shake128_r(SE, seed_se); + SE = __sample_2NNBAR(SE); + + return SE; +} + +// coins = s || seed SE || z +fn __frodo_amd64_ref_keypair_derand(reg u64 pkp skp, reg ptr u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins) { + stack u16[2 * NNBAR] SE; + stack u16[NNBAR] B; + + stack u64 s_pkp s_skp; + reg u64 i; stack u64 s_i; + + s_pkp = pkp; + s_skp = skp; + coins = coins; + + // seedA || b + stack u8[BYTES_PK] pk; + + // s || seedA || b || S_T || pkh + stack u8[BYTES_SK] sk; + + // gen seedA + pk[0:BYTES_SEED_A] = __shake128_seed_A(pk[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]); + + // gen S || E + SE = __gen_SE(SE, coins[BYTES_SEC:BYTES_SEED_SE]); + + // B = A*S+E + B = __AS_plus_E(B, pk[0:BYTES_SEED_A], SE[0:NNBAR], SE[NNBAR:NNBAR]); + + // pack + pk[BYTES_SEED_A:D * N] = __pack_B(pk[BYTES_SEED_A:D * N], B); + + // + i = s_i; i = 0; + while (i < BYTES_SEC) { + sk[(int) i] = coins[(int)i]; + i += 1; + } + + i = 0; + while (i < BYTES_PK) { + sk[BYTES_SEC + (int)i] = pk[(int)i]; + i += 1; + } + + i = 0; + while (i < 2 * NNBAR) { + sk[BYTES_SEC + BYTES_PK + (int)i] = SE.[u8 (int)i]; + i += 1; + } + s_i = i; + + sk[BYTES_SEC + BYTES_PK + 2 * NNBAR : BYTES_SEC] = __shake128_pkh(sk[BYTES_SEC + BYTES_PK + 2 * 
NNBAR:BYTES_SEC], pk); + + pkp = s_pkp; + i = 0; + while (i < BYTES_PK) { + (u8)[pkp + i] = pk[(int) i]; + i += 1; + } + + skp = s_skp; + i = 0; + while (i < BYTES_SK) { + (u8)[skp + i] = sk[(int) i]; + i += 1; + } +} + +#[returnaddress="stack"] +fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTES_SALT] coins) { + reg u64 i j; stack u64 s_i s_j; + stack u64 s_ctp s_ssp; + + // seedA || b + #public stack u8[BYTES_PK] pk; + + // c1 || c2 || salt || k + stack u8[BYTES_CT + BYTES_SEC] ct_k; + + // pkh || u || salt + stack u8[BYTES_SEC * 2 + BYTES_SALT] pkh_u_salt; + // 0x96 || seedSE || k + stack u8[1 + BYTES_SEED_SE + BYTES_SEC] seedSE_k; + seedSE_k[0] = 0x96; + + // S' || E' || E'' + stack u16[2 * NNBAR + NBAR * NBAR] SEE; + + stack u16[NNBAR] B Bp; + stack u16[NBAR * NBAR] V C; + stack u8[BYTES_SEC] ss; + + pkp = pkp; + s_ctp = ctp; + s_ssp = ssp; + coins = coins; + s_j = j; + + // gen u || salt + i = 0; + while (i < BYTES_SEC + BYTES_SALT) { + pkh_u_salt[BYTES_SEC + (int)i] = coins[(int)i]; + i += 1; + } + + i = 0; + while (i < BYTES_SALT) { + ct_k[D * N + D * NBAR + (int)i] = pkh_u_salt[BYTES_SEC * 2 + (int)i]; + i += 1; + } + + // read pk + i = 0; + while (i < BYTES_PK) { + #declassify pk[(int)i] = (u8) [pkp + i]; + i += 1; + } + s_i = i; + + // pkh + pkh_u_salt[0:BYTES_SEC] = __shake128_pkh(pkh_u_salt[0:BYTES_SEC], pk); + + // seedSE || k + seedSE_k[1 : BYTES_SEED_SE + BYTES_SEC] = __shake128_SE_k(seedSE_k[1:BYTES_SEED_SE + BYTES_SEC], pkh_u_salt); + + // copy k + i = s_i; i = 0; + while (i < BYTES_SEC) { + ct_k[BYTES_CT + (int)i] = seedSE_k[1 + BYTES_SEED_SE + (int)i]; + i += 1; + } + s_i = i; + + // gen input bit string for sampling S and E + SEE = __shake128_encap_r(SEE, seedSE_k[0 : 1 + BYTES_SEED_SE]); + + // S' || E' + SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]); + // E'' + SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); + + // B' = S'A + E'' + Bp = __SA_plus_E(Bp, pk[0:BYTES_SEED_A], 
SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + + // c1 <- Pack(B') + ct_k[0:D * N] = __pack_B(ct_k[0:D * N], Bp); + + // B <- Unpack(b) + B = __unpack_B(B, pk[BYTES_SEED_A:D * N]); + + // V = S'B + E'' + V = __SB_plus_E(V, SEE[0:NNBAR], B, SEE[NNBAR * 2:NBAR * NBAR]); + + // C = V + Encode(u) + C = __encode(C, pkh_u_salt[BYTES_SEC:BYTES_SEC]); + C = __matrix_add(C, V); + + // c2 <- Pack(C) + ct_k[D * N: D * NBAR] = __pack_C(ct_k[D * N: D * NBAR], C); + + // ss <- shake(c1 || c2 || salt || k) + ss = __shake128_ss(ss, ct_k); + + i = s_i; i = 0; + ctp = s_ctp; + ssp = s_ssp; + while (i < BYTES_CT) { + (u8)[ctp + i] = ct_k[(int)i]; + i += 1; + } + + i = 0; + while (i < BYTES_SEC) { + (u8)[ssp + i] = ss[(int)i]; + i += 1; + } +} + +#[returnaddress="stack"] +fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { + #public stack u8[BYTES_PK] pk; + stack u8[2 * NNBAR] ST; + stack u8[BYTES_SEC] s; + stack u8[BYTES_CT + BYTES_SEC] ct_k; + stack u16[NNBAR] B Bp Bpp; + stack u16[NBAR * NBAR] M C Cp V; + stack u8[BYTES_SEC * 2 + BYTES_SALT] pkh_u_salt; + stack u8[1 + BYTES_SEED_SE + BYTES_SEC] seedSE_k; + stack u8[BYTES_SEC] ss; + + // S' || E' || E'' + stack u16[2 * NNBAR + NBAR * NBAR] SEE; + + stack u64 s_ssp s_ctp s_skp; + reg u8 s1 s2; + reg u64 i; stack u64 s_i; + + s_ssp = ssp; + ctp = ctp; + skp = skp; + + // read ct + i = 0; + while (i < BYTES_CT) { + ct_k[(int) i] = (u8)[ctp + i]; + i += 1; + } + s_ctp = ctp; + + i = 0; + while (i < BYTES_SEC) { + s[(int) i] = (u8)[skp + i]; + i += 1; + } + + i = 0; + while (i < BYTES_PK) { + #declassify pk[(int)i] = (u8)[skp + BYTES_SEC + i]; + i += 1; + } + + i = 0; + while (i < 2 * NNBAR) { + ST[(int)i] = (u8)[skp + BYTES_SEC + BYTES_PK + i]; + i += 1; + } + + // copy pkh + i = 0; + while (i < BYTES_SEC) { + pkh_u_salt[(int)i] = (u8)[skp + BYTES_SK - BYTES_SEC + i]; + i += 1; + } + s_skp = skp; + + // copy salt + i = 0; + while (i < BYTES_SALT) { + pkh_u_salt[BYTES_SEC * 2 + (int)i] = ct_k[BYTES_CT - BYTES_SALT + (int)i]; + i += 1; + } + s_i = i; + + 
// B' <- Unpack(c1) + Bp = __unpack_B(Bp, ct_k[0:D * N]); + // C <- Unpack(c2) + C = __unpack_C(C, ct_k[D * N:D * NBAR]); + + // M = C - B'S + M = __mul_BS(M, Bp, ST); + M = __matrix_sub(M, C); + + pkh_u_salt[BYTES_SEC:BYTES_SEC] = __decode(pkh_u_salt[BYTES_SEC:BYTES_SEC], M); + + seedSE_k[0] = 0x96; + seedSE_k[1:BYTES_SEED_SE + BYTES_SEC] = __shake128_SE_k(seedSE_k[1:BYTES_SEED_SE + BYTES_SEC], pkh_u_salt); + + SEE = __shake128_encap_r(SEE, seedSE_k[0: 1 + BYTES_SEED_SE]); + + // S' || E' + SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]); + // E'' + SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); + + // B'' = S'A + E' + Bpp = __SA_plus_E(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + + // B'' (mod q) + i = s_i; + i = 0; + while (i < NNBAR) { + Bpp[(int)i] &= (1 << D) - 1; + i += 1; + } + + // + B = __unpack_B(B, pk[BYTES_SEED_A:BYTES_PK - BYTES_SEED_A]); + + V = __SB_plus_E(V, SEE[0:NNBAR], B, SEE[NNBAR * 2:NBAR * NBAR]); + + Cp = __encode(Cp, pkh_u_salt[BYTES_SEC:BYTES_SEC]); + Cp = __matrix_add(Cp, V); + + s1 = __ct_verify_NNBAR(Bp, Bpp); + s2 = __ct_verify_NBAR2(C, Cp); + s1 |= s2; + + ct_k[BYTES_CT:BYTES_SEC] = __ct_select(ct_k[BYTES_CT:BYTES_SEC], seedSE_k[1+BYTES_SEED_SE:BYTES_SEC], s, s1); + + ss = __shake128_ss(ss, ct_k); + + ssp = s_ssp; + i = s_i; + i = 0; + while (i < BYTES_SEC) { + (u8)[ssp + i] = ss[(int)i]; + i += 1; + } +} + +fn _frodo_amd64_ref_keypair(reg u64 pkp skp) { + #public stack u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins; + + pkp = pkp; + skp = skp; + + #declassify coins = #randombytes(coins); + + __frodo_amd64_ref_keypair_derand(pkp, skp, coins); +} + +fn _frodo_amd64_ref_keypair_derand(reg u64 pkp skp coinsp) { + #public stack u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins; + reg u64 i; stack u64 s_i; + + pkp = pkp; + skp = skp; + + i = 0; + while (i < BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC) { + #declassify coins[(int)i] = (u8)[coinsp + i]; + i += 1; + } + + s_i = i; + 
__frodo_amd64_ref_keypair_derand(pkp, skp, coins); +} + +fn _frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp coinsp) { + stack u8[BYTES_SEC + BYTES_SALT] coins; + reg u64 i; stack u64 s_i; + + pkp = pkp; + ctp = ctp; + ssp = ssp; + + i = 0; + while (i < BYTES_SEC + BYTES_SALT) { + coins[(int)i] = (u8)[coinsp + i]; + i += 1; + } + s_i = i; + + __frodo_amd64_ref_enc_derand(ctp, ssp, pkp, coins); +} + +fn _frodo_amd64_ref_enc(reg u64 ctp ssp pkp) { + stack u8[BYTES_SEC + BYTES_SALT] coins; + pkp = pkp; + ctp = ctp; + ssp = ssp; + + coins = #randombytes(coins); + + __frodo_amd64_ref_enc_derand(ctp, ssp, pkp, coins); +} diff --git a/src/crypto_kem/frodo/frodo976shake/META.yml b/src/crypto_kem/frodo/frodo976shake/META.yml new file mode 100644 index 00000000..8179c4c3 --- /dev/null +++ b/src/crypto_kem/frodo/frodo976shake/META.yml @@ -0,0 +1,26 @@ +name: frodo976shake +type: kem +checksumsmall: 0fc98ee686f97225c0f831be7a72d45a7ad2848811adaa7cc4a9bc651c245fa8 +checksumbig: de7124ccc70e76c6058b517a55789fe7b53a2a668d4dfb5651242d4afcd2d53a +claimed-nist-level: 3 +claimed-security: IND-CCA2 +length-public-key: 15632 +length-ciphertext: 15792 +length-secret-key: 31296 +length-shared-secret: 24 +principal-submitters: + - TODO +auxiliary-submitters: + - TODO +implementations: + - name: amd64/ref + version: TODO + supported_platforms: + - architecture: x86_64 + operating_systems: + - Linux + - Darwin + required_flags: # FIXME + - avx2 + - bmi2 + - popcnt diff --git a/src/crypto_kem/frodo/frodo976shake/amd64/ref/Makefile b/src/crypto_kem/frodo/frodo976shake/amd64/ref/Makefile new file mode 100644 index 00000000..29a80faa --- /dev/null +++ b/src/crypto_kem/frodo/frodo976shake/amd64/ref/Makefile @@ -0,0 +1,2 @@ +SRCS := kem.jazz +include ../../../../../Makefile.common diff --git a/src/crypto_kem/frodo/frodo976shake/amd64/ref/include/api.h b/src/crypto_kem/frodo/frodo976shake/amd64/ref/include/api.h new file mode 100644 index 00000000..bb30c2ac --- /dev/null +++ 
b/src/crypto_kem/frodo/frodo976shake/amd64/ref/include/api.h @@ -0,0 +1,38 @@ +#ifndef JADE_KEM_frodo_frodo976shake_amd64_ref_API_H +#define JADE_KEM_frodo_frodo976shake_amd64_ref_API_H + +#include <stdint.h> + +#define JADE_KEM_frodo_frodo976shake_amd64_ref_SECRETKEYBYTES 31296 +#define JADE_KEM_frodo_frodo976shake_amd64_ref_PUBLICKEYBYTES 15632 +#define JADE_KEM_frodo_frodo976shake_amd64_ref_CIPHERTEXTBYTES 15792 +#define JADE_KEM_frodo_frodo976shake_amd64_ref_KEYPAIRCOINBYTES 88 +#define JADE_KEM_frodo_frodo976shake_amd64_ref_ENCCOINBYTES 72 +#define JADE_KEM_frodo_frodo976shake_amd64_ref_BYTES 24 + +#define JADE_KEM_frodo_frodo976shake_amd64_ref_ALGNAME "Frodo976shake" +#define JADE_KEM_frodo_frodo976shake_amd64_ref_ARCH "amd64" +#define JADE_KEM_frodo_frodo976shake_amd64_ref_IMPL "ref" + +// kem api +int jade_kem_frodo_frodo976shake_amd64_ref_keypair_derand(uint8_t *public_key, + uint8_t *secret_key, + uint8_t *coins); + +int jade_kem_frodo_frodo976shake_amd64_ref_keypair(uint8_t *public_key, + uint8_t *secret_key); + +int jade_kem_frodo_frodo976shake_amd64_ref_enc_derand(uint8_t *ciphertext, + uint8_t *shared_secret, + uint8_t *public_key, + uint8_t *coins); + +int jade_kem_frodo_frodo976shake_amd64_ref_enc(uint8_t *ciphertext, + uint8_t *shared_secret, + uint8_t *public_key); + +int jade_kem_frodo_frodo976shake_amd64_ref_dec(uint8_t *shared_secret, + uint8_t *ciphertext, + uint8_t *secret_key); + +#endif diff --git a/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jazz b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jazz new file mode 100644 index 00000000..c64c8260 --- /dev/null +++ b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jazz @@ -0,0 +1,37 @@ +from Jade require "crypto_kem/frodo/common/frodo976_params.jinc" +from Jade require "crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc" + +export fn jade_kem_frodo_frodo976shake_amd64_ref_keypair_derand(#public reg u64 pkp skp coinsp) -> #public reg u64 { + reg u64 r; + _frodo_amd64_ref_keypair_derand(pkp, skp,
coinsp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_frodo_frodo976shake_amd64_ref_keypair(#public reg u64 pkp skp) -> #public reg u64 { + reg u64 r; + _frodo_amd64_ref_keypair(pkp, skp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_frodo_frodo976shake_amd64_ref_enc_derand(#public reg u64 ctp ssp pkp coinsp) -> #public reg u64 { + reg u64 r; + _frodo_amd64_ref_enc_derand(ctp, ssp, pkp, coinsp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_frodo_frodo976shake_amd64_ref_enc(#public reg u64 ctp ssp pkp) -> #public reg u64 { + reg u64 r; + _frodo_amd64_ref_enc(ctp, ssp, pkp); + ?{}, r = #set0(); + return r; +} + +export fn jade_kem_frodo_frodo976shake_amd64_ref_dec(#public reg u64 ssp ctp skp) -> #public reg u64 { + reg u64 r; + _frodo_amd64_ref_dec(ssp, ctp, skp); + ?{}, r = #set0(); + return r; +} diff --git a/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc new file mode 100644 index 00000000..81b8f4ba --- /dev/null +++ b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc @@ -0,0 +1,384 @@ +from Jade require "crypto_kem/frodo/common/amd64/ref/shake256.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/encode.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/noise.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/matrix.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/pack.jinc" + +#[returnaddress="stack"] +fn __gen_SE(reg ptr u16[2 * NNBAR] SE, reg ptr u8[BYTES_SEED_SE] r) -> stack u16[2 * NNBAR] { + stack u8[1 + BYTES_SEED_SE] seed_se; + reg u64 i; stack u64 s_i; + + r = r; + i = 0; + while (i < BYTES_SEED_SE) { + seed_se[(int)i + 1] = r[(int)i]; + i += 1; + } + s_i = i; + + // S || E + seed_se[0] = 0x5F; + + SE = __shake256_r(SE, seed_se); + SE = __sample_2NNBAR(SE); + + return SE; +} + +// coins = s || seed SE || z +fn __frodo_amd64_ref_keypair_derand(reg 
u64 pkp skp, reg ptr u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins) { + stack u16[2 * NNBAR] SE; + stack u16[NNBAR] B; + + stack u64 s_pkp s_skp; + reg u64 i; stack u64 s_i; + + s_pkp = pkp; + s_skp = skp; + coins = coins; + + // seedA || b + stack u8[BYTES_PK] pk; + + // s || seedA || b || S_T || pkh + stack u8[BYTES_SK] sk; + + // gen seedA + pk[0:BYTES_SEED_A] = __shake256_seed_A(pk[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]); + + // gen S || E + SE = __gen_SE(SE, coins[BYTES_SEC:BYTES_SEED_SE]); + + // B = A*S+E + B = __AS_plus_E(B, pk[0:BYTES_SEED_A], SE[0:NNBAR], SE[NNBAR:NNBAR]); + + // pack + pk[BYTES_SEED_A:D * N] = __pack_B(pk[BYTES_SEED_A:D * N], B); + + // + i = s_i; i = 0; + while (i < BYTES_SEC) { + sk[(int) i] = coins[(int)i]; + i += 1; + } + + i = 0; + while (i < BYTES_PK) { + sk[BYTES_SEC + (int)i] = pk[(int)i]; + i += 1; + } + + i = 0; + while (i < 2 * NNBAR) { + sk[BYTES_SEC + BYTES_PK + (int)i] = SE.[u8 (int)i]; + i += 1; + } + s_i = i; + + sk[BYTES_SEC + BYTES_PK + 2 * NNBAR : BYTES_SEC] = __shake256_pkh(sk[BYTES_SEC + BYTES_PK + 2 * NNBAR:BYTES_SEC], pk); + + pkp = s_pkp; + i = 0; + while (i < BYTES_PK) { + (u8)[pkp + i] = pk[(int) i]; + i += 1; + } + + skp = s_skp; + i = 0; + while (i < BYTES_SK) { + (u8)[skp + i] = sk[(int) i]; + i += 1; + } +} + +#[returnaddress="stack"] +fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTES_SALT] coins) { + reg u64 i j; stack u64 s_i s_j; + stack u64 s_ctp s_ssp; + + // seedA || b + #public stack u8[BYTES_PK] pk; + + // c1 || c2 || salt || k + stack u8[BYTES_CT + BYTES_SEC] ct_k; + + // pkh || u || salt + stack u8[BYTES_SEC * 2 + BYTES_SALT] pkh_u_salt; + // 0x96 || seedSE || k + stack u8[1 + BYTES_SEED_SE + BYTES_SEC] seedSE_k; + seedSE_k[0] = 0x96; + + // S' || E' || E'' + stack u16[2 * NNBAR + NBAR * NBAR] SEE; + + stack u16[NNBAR] B Bp; + stack u16[NBAR * NBAR] V C; + stack u8[BYTES_SEC] ss; + + pkp = pkp; + s_ctp = ctp; + s_ssp = ssp; + coins = 
coins; + s_j = j; + + // gen u || salt + i = 0; + while (i < BYTES_SEC + BYTES_SALT) { + pkh_u_salt[BYTES_SEC + (int)i] = coins[(int)i]; + i += 1; + } + + i = 0; + while (i < BYTES_SALT) { + ct_k[D * N + D * NBAR + (int)i] = pkh_u_salt[BYTES_SEC * 2 + (int)i]; + i += 1; + } + + // read pk + i = 0; + while (i < BYTES_PK) { + #declassify pk[(int)i] = (u8) [pkp + i]; + i += 1; + } + s_i = i; + + // pkh + pkh_u_salt[0:BYTES_SEC] = __shake256_pkh(pkh_u_salt[0:BYTES_SEC], pk); + + // seedSE || k + seedSE_k[1 : BYTES_SEED_SE + BYTES_SEC] = __shake256_SE_k(seedSE_k[1:BYTES_SEED_SE + BYTES_SEC], pkh_u_salt); + + // copy k + i = s_i; i = 0; + while (i < BYTES_SEC) { + ct_k[BYTES_CT + (int)i] = seedSE_k[1 + BYTES_SEED_SE + (int)i]; + i += 1; + } + s_i = i; + + // gen input bit string for sampling S and E + SEE = __shake256_encap_r(SEE, seedSE_k[0 : 1 + BYTES_SEED_SE]); + + // S' || E' + SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]); + // E'' + SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); + + // B' = S'A + E'' + Bp = __SA_plus_E(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + + // c1 <- Pack(B') + ct_k[0:D * N] = __pack_B(ct_k[0:D * N], Bp); + + // B <- Unpack(b) + B = __unpack_B(B, pk[BYTES_SEED_A:D * N]); + + // V = S'B + E'' + V = __SB_plus_E(V, SEE[0:NNBAR], B, SEE[NNBAR * 2:NBAR * NBAR]); + + // C = V + Encode(u) + C = __encode(C, pkh_u_salt[BYTES_SEC:BYTES_SEC]); + C = __matrix_add(C, V); + + // c2 <- Pack(C) + ct_k[D * N: D * NBAR] = __pack_C(ct_k[D * N: D * NBAR], C); + + // ss <- shake(c1 || c2 || salt || k) + ss = __shake256_ss(ss, ct_k); + + i = s_i; i = 0; + ctp = s_ctp; + ssp = s_ssp; + while (i < BYTES_CT) { + (u8)[ctp + i] = ct_k[(int)i]; + i += 1; + } + + i = 0; + while (i < BYTES_SEC) { + (u8)[ssp + i] = ss[(int)i]; + i += 1; + } +} + +#[returnaddress="stack"] +fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { + #public stack u8[BYTES_PK] pk; + stack u8[2 * NNBAR] ST; + stack u8[BYTES_SEC] s; + stack u8[BYTES_CT + 
BYTES_SEC] ct_k; + stack u16[NNBAR] B Bp Bpp; + stack u16[NBAR * NBAR] M C Cp V; + stack u8[BYTES_SEC * 2 + BYTES_SALT] pkh_u_salt; + stack u8[1 + BYTES_SEED_SE + BYTES_SEC] seedSE_k; + stack u8[BYTES_SEC] ss; + + // S' || E' || E'' + stack u16[2 * NNBAR + NBAR * NBAR] SEE; + + stack u64 s_ssp s_ctp s_skp; + reg u8 s1 s2; + reg u64 i; stack u64 s_i; + + s_ssp = ssp; + ctp = ctp; + skp = skp; + + // read ct + i = 0; + while (i < BYTES_CT) { + ct_k[(int) i] = (u8)[ctp + i]; + i += 1; + } + s_ctp = ctp; + + i = 0; + while (i < BYTES_SEC) { + s[(int) i] = (u8)[skp + i]; + i += 1; + } + + i = 0; + while (i < BYTES_PK) { + #declassify pk[(int)i] = (u8)[skp + BYTES_SEC + i]; + i += 1; + } + + i = 0; + while (i < 2 * NNBAR) { + ST[(int)i] = (u8)[skp + BYTES_SEC + BYTES_PK + i]; + i += 1; + } + + // copy pkh + i = 0; + while (i < BYTES_SEC) { + pkh_u_salt[(int)i] = (u8)[skp + BYTES_SK - BYTES_SEC + i]; + i += 1; + } + s_skp = skp; + + // copy salt + i = 0; + while (i < BYTES_SALT) { + pkh_u_salt[BYTES_SEC * 2 + (int)i] = ct_k[BYTES_CT - BYTES_SALT + (int)i]; + i += 1; + } + s_i = i; + + // B' <- Unpack(c1) + Bp = __unpack_B(Bp, ct_k[0:D * N]); + // C <- Unpack(c2) + C = __unpack_C(C, ct_k[D * N:D * NBAR]); + + // M = C - B'S + M = __mul_BS(M, Bp, ST); + M = __matrix_sub(M, C); + + pkh_u_salt[BYTES_SEC:BYTES_SEC] = __decode(pkh_u_salt[BYTES_SEC:BYTES_SEC], M); + + seedSE_k[0] = 0x96; + seedSE_k[1:BYTES_SEED_SE + BYTES_SEC] = __shake256_SE_k(seedSE_k[1:BYTES_SEED_SE + BYTES_SEC], pkh_u_salt); + + SEE = __shake256_encap_r(SEE, seedSE_k[0: 1 + BYTES_SEED_SE]); + + // S' || E' + SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]); + // E'' + SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); + + // B'' = S'A + E' + Bpp = __SA_plus_E(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + + // B'' (mod q) + i = s_i; + i = 0; + while (i < NNBAR) { + Bpp[(int)i] &= (1 << D) - 1; + i += 1; + } + + // + B = __unpack_B(B ,pk[BYTES_SEED_A:BYTES_PK - 
BYTES_SEED_A]); + + V = __SB_plus_E(V, SEE[0:NNBAR], B, SEE[NNBAR * 2:NBAR * NBAR]); + + Cp = __encode(Cp, pkh_u_salt[BYTES_SEC:BYTES_SEC]); + Cp = __matrix_add(Cp, V); + + s1 = __ct_verify_NNBAR(Bp, Bpp); + s2 = __ct_verify_NBAR2(C, Cp); + s1 |= s2; + + ct_k[BYTES_CT:BYTES_SEC] = __ct_select(ct_k[BYTES_CT:BYTES_SEC], seedSE_k[1+BYTES_SEED_SE:BYTES_SEC], s, s1); + + ss = __shake256_ss(ss, ct_k); + + ssp = s_ssp; + i = s_i; + i = 0; + while (i < BYTES_SEC) { + (u8)[ssp + i] = ss[(int)i]; + i += 1; + } +} + +fn _frodo_amd64_ref_keypair(reg u64 pkp skp) { + #public stack u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins; + + pkp = pkp; + skp = skp; + + #declassify coins = #randombytes(coins); + + __frodo_amd64_ref_keypair_derand(pkp, skp, coins); +} + +fn _frodo_amd64_ref_keypair_derand(reg u64 pkp skp coinsp) { + #public stack u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins; + reg u64 i; stack u64 s_i; + + pkp = pkp; + skp = skp; + + i = 0; + while (i < BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC) { + #declassify coins[(int)i] = (u8)[coinsp + i]; + i += 1; + } + + s_i = i; + __frodo_amd64_ref_keypair_derand(pkp, skp, coins); +} + +fn _frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp coinsp) { + stack u8[BYTES_SEC + BYTES_SALT] coins; + reg u64 i; stack u64 s_i; + + pkp = pkp; + ctp = ctp; + ssp = ssp; + + i = 0; + while (i < BYTES_SEC + BYTES_SALT) { + coins[(int)i] = (u8)[coinsp + i]; + i += 1; + } + s_i = i; + + __frodo_amd64_ref_enc_derand(ctp, ssp, pkp, coins); +} + +fn _frodo_amd64_ref_enc(reg u64 ctp ssp pkp) { + stack u8[BYTES_SEC + BYTES_SALT] coins; + pkp = pkp; + ctp = ctp; + ssp = ssp; + + coins = #randombytes(coins); + + __frodo_amd64_ref_enc_derand(ctp, ssp, pkp, coins); +} From c53ff3e8c8dbe5ea8ddd5f215f94ce4825656c26 Mon Sep 17 00:00:00 2001 From: Tiago Oliveira Date: Thu, 22 Feb 2024 08:01:00 +0000 Subject: [PATCH 02/14] frodo: improve performance of sa_plus_e -- 11700K, m 101t, frodo640 keypair , 22337522 enc , 23211852 dec , 23409619 -- keypair , 
22505842 enc , 10833123 dec , 11027370 --- .../frodo/common/amd64/ref/matrix_opt.jinc | 208 ++++++++++++++++++ .../frodo/frodo640shake/amd64/ref/kem.jinc | 5 +- .../frodo/frodo976shake/amd64/ref/kem.jinc | 5 +- 3 files changed, 214 insertions(+), 4 deletions(-) create mode 100644 src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc new file mode 100644 index 00000000..6b9c06c2 --- /dev/null +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc @@ -0,0 +1,208 @@ + +// notes: "16" instead of BYTES_SEED_A on purpose +// compilation should fail if BYTES_SEED_A changes + +fn _shake128_gen_A_opt( + #spill_to_mmx reg ptr u8[2 * N * 8] out, // note: this implementation should work for N multiple of 4 + #spill_to_mmx reg u64 out_offset, // out_offset in bytes + #spill_to_mmx reg ptr u8[16+2+6] index_seed_padding) + -> + reg ptr u8[2 * N * 8], + reg u64 +{ + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 j t0 t1 t2 zero; + reg u8 v0; + + state = s_state; + + t0 = index_seed_padding[u64 0]; + t1 = index_seed_padding[u64 1]; + t2 = index_seed_padding[u64 2]; + ?{}, zero = #set0(); + + state[0] = t0; + state[1] = t1; + state[2] = t2; + + i = 3; + while (i < 25) + { state[i] = zero; + i += 1; + } + + state[u8 167] = 0x80; + + // out_offset in u64 words + out_offset >>= 3; + + // notes: + // - i is incremented in the inner loop + // - this function needs to output 2*N bytes + // - each iteration produces 168 bytes, but is incremented in "u64 mode" (by 21) + // - for N=640, 1280 bytes are needed, in 7 iterations, we produce 1176 bytes + // and i will be 147. 
+ i = 0; + while (i < (2*N)/168) + { + () = #spill(i, out, out_offset); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, out, out_offset); + + j = 0; + while (j < 21) + { t0 = state[j]; + out[u64 out_offset] = t0; + out_offset += 1; + j += 1; + } + + i += 1; + } + + () = #spill(i, out, out_offset); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, out, out_offset); + + out_offset <<= 3; // in bytes again + i *= 168; + j = 0; + while (i < 2*N) + { + v0 = state[u8 (int)j]; + out[(int)out_offset] = v0; + out_offset += 1; + i += 1; + j += 1; + } + + return out, out_offset; +} + +fn __SA_plus_E_opt( + #spill_to_mmx reg ptr u16[NNBAR] B, + reg ptr u8[16] seedA, + #spill_to_mmx reg ptr u16[NNBAR] S, + #spill_to_mmx reg ptr u16[NNBAR] E) + -> + reg ptr u16[NNBAR] +{ + inline int m; + stack u8[2+16+ 6] s_index_seed_padding; + reg ptr u8[2+16+ 6] index_seed_padding; + + stack u16[N * 8] s_A; + reg ptr u16[N * 8] A; + reg u64 A_offset B_offset S_offset; + + inline int p; + #spill_to_mmx reg u64 i j ij jN q; + reg u32[8] sp; + reg u32 sum mul; + reg u16 t; + reg u8 v; + + // setup "index || seed || padding" + s_index_seed_padding[u16 0] = 0; + s_index_seed_padding[u64 2] = 0; + i = 0; + j = 2; + while(i < 16) + { v = seedA[i]; + s_index_seed_padding[j] = v; + i += 1; + j += 1; + } + s_index_seed_padding[18] = 0x1f; + + + // TODO, avoid copy? 
+ i = 0; + while (i < NNBAR) + { B[i] = E[i]; + i += 1; + } + + A = s_A; + i = 0; + while( i < N ) + { + + () = #spill(B, S); + + // + A_offset = 0; + j = 0; + while( j < 8 ) + { + ij = #LEA(i + j); + + () = #spill(i, j); + + s_index_seed_padding[u16 0] = (16u) ij; + index_seed_padding = s_index_seed_padding; + A, A_offset = _shake128_gen_A_opt(A, A_offset, index_seed_padding); + + () = #unspill(i, j); + j += 1; + } + + () = #unspill(B, S); + + // + j = 0; + jN = 0; + while ( j < NBAR ) + { + () = #spill(j); + + // + S_offset = #LEA(jN + i); + for p=0 to 8 + { sp[p] = (32u) S[S_offset + p]; } + + () = #spill(S); + + // + q = 0; + B_offset = jN; + + () = #spill(jN); + + while( q < N ) + { + sum = (32u) B[B_offset]; + + for p=0 to 8 + { + mul = (32u) A[p*N + q]; + mul *= sp[p]; + sum += mul; + } + + B[B_offset] = (16u) sum; + + q += 1; + B_offset += 1; + } + + () = #unspill(j, S, jN); + + j += 1; + jN += N; + } + + i += 8; + } + + return B; +} + + diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc index 9e840992..4adc859e 100644 --- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc +++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc @@ -2,6 +2,7 @@ from Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/encode.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/noise.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/matrix.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/pack.jinc" #[returnaddress="stack"] @@ -167,7 +168,7 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); // B' = S'A + E'' - Bp = __SA_plus_E(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + Bp = __SA_plus_E_opt(Bp, pk[0:BYTES_SEED_A], 
SEE[0:NNBAR], SEE[NNBAR:NNBAR]); // c1 <- Pack(B') ct_k[0:D * N] = __pack_B(ct_k[0:D * N], Bp); @@ -290,7 +291,7 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); // B'' = S'A + E' - Bpp = __SA_plus_E(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + Bpp = __SA_plus_E_opt(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); // B'' (mod q) i = s_i; diff --git a/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc index 81b8f4ba..94508463 100644 --- a/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc +++ b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc @@ -3,6 +3,7 @@ from Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/encode.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/noise.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/matrix.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/pack.jinc" #[returnaddress="stack"] @@ -168,7 +169,7 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); // B' = S'A + E'' - Bp = __SA_plus_E(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + Bp = __SA_plus_E_opt(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); // c1 <- Pack(B') ct_k[0:D * N] = __pack_B(ct_k[0:D * N], Bp); @@ -291,7 +292,7 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); // B'' = S'A + E' - Bpp = __SA_plus_E(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + Bpp = __SA_plus_E_opt(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); // B'' (mod q) i = s_i; From 89af0d8591d956ddd481c3b14291f1041f2c48c5 Mon Sep 17 00:00:00 2001 From: "iThing-han, 
Lim" Date: Thu, 25 Jan 2024 17:34:00 +0800 Subject: [PATCH 03/14] refactor(frodo): unroll frodo640shake and common for ref implementation --- .../frodo/common/amd64/ref/matrix.jinc | 181 ++++++++--------- .../frodo/common/amd64/ref/noise.jinc | 119 +++++------ .../frodo/common/amd64/ref/shake128.jinc | 192 +++++++++--------- .../frodo/frodo640shake/amd64/ref/kem.jinc | 68 +++---- 4 files changed, 263 insertions(+), 297 deletions(-) diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc index 4b6839ff..cbca1de1 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc @@ -8,56 +8,55 @@ fn __AS_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[ stack ptr u16[NNBAR] s_S s_E; stack u8[2 + BYTES_SEED_A] b; - reg u64 i j k; stack u64 s_i s_j s_k; + reg u64 j k; stack u64 s_j s_k; reg u16 tmp ac; + inline int i l; s_B = B; s_S = S; s_E = E; // copy seedA - i = 0; - while (i < BYTES_SEED_A) { - b[(int)i + 2] = seedA[(int)i]; - i += 1; + for i = 0 to BYTES_SEED_A { + b[i + 2] = seedA[i]; } s_seedA = seedA; // first set B = E B = s_B; E = s_E; - i = 0; - while (i < NNBAR) { - B[(int)i] = E[(int)i]; - i += 1; + j = 0; + while (j < NNBAR) { + for l = 0 to 4 { + B[(int)j + l] = E[(int)j + l]; + } + j += 4; } - s_B = B; s_E = E; s_i = i; + s_B = B; s_E = E; // calculate A and B += A * S b[u16 0] = 0; k = 0; while (b[u16 0] < N) { - s_i = i; s_j = j; s_k = k; s_S = S; s_B = B; + s_j = j; s_k = k; s_S = S; s_B = B; A_row = __shake128_gen_A(A_row, b); - i = s_i; j = s_j; k = s_k; S = s_S; B = s_B; + j = s_j; k = s_k; S = s_S; B = s_B; - i = 0; - while (i < NNBAR) { + for i = 0 to NBAR { ac = 0; j = 0; // A_row * S_T_row while (j < N) { tmp = A_row[(int)j]; - tmp *= S[(int)i]; + tmp *= S[i * N + (int)j]; ac += tmp; j += 1; - i += 1; } - B[(int)k] += ac; - k += 1; + B[(int)k + i] += ac; } + k += NBAR; b[u16 0] += 1; } @@ -73,21 +72,22 @@ fn 
__SA_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[ stack ptr u16[NNBAR] s_S s_E; stack u8[2 + BYTES_SEED_A] b; - reg u64 i j k; stack u64 s_i s_j s_k; + reg u64 j k; stack u64 s_j s_k; reg u16 tmp s; + inline int l; // copy seedA - i = 0; - while (i < BYTES_SEED_A) { - b[(int)i + 2] = seedA[(int)i]; - i += 1; + for l = 0 to BYTES_SEED_A { + b[l + 2] = seedA[l]; } s_seedA = seedA; - i = 0; - while (i < NNBAR) { - B[(int)i] = E[(int)i]; - i += 1; + j = 0; + while (j < NNBAR) { + for l = 0 to 4 { + B[(int)j + l] = E[(int)j + l]; + } + j += 4; } s_B = B; s_S = S; s_E = E; @@ -95,17 +95,13 @@ fn __SA_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[ b[u16 0] = 0; while (b[u16 0] < N) { - s_i = i; A_row = __shake128_gen_A(A_row, b); - i = s_i; - i = 0; - while (i < NNBAR) { + for l = 0 to NBAR { k = s_k; S = s_S; k = (64u)b[u16 0]; - k += i; - s = S[(int)k]; + s = S[l * N + (int)k]; s_k = k; s_S = S; @@ -114,10 +110,9 @@ fn __SA_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[ while (j < N) { tmp = A_row[(int)j]; tmp *= s; - B[(int)i] += tmp; + B[l * N + (int)j] += tmp; j += 1; - i += 1; } s_j = j; s_B = B; } @@ -130,39 +125,36 @@ fn __SA_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[ #[returnaddress="stack"] fn __SB_plus_E(reg ptr u16[NBAR * NBAR] V, reg ptr u16[NNBAR] S B, reg ptr u16[NBAR * NBAR] E) -> stack u16[NBAR * NBAR] { - reg u64 i j k ti tj; - reg u16 tmp; + reg u64 k tj; + reg u16 tmp ac; + inline int i j l; - i = 0; - while (i < NBAR * NBAR) { - V[(int)i] = E[(int)i]; - i += 1; + k = 0; + while (k < NBAR * NBAR) { + for l = 0 to 4 { + V[(int)k + l] = E[(int)k + l]; + } + k += 4; } - i = 0; - while (i < NBAR) { - j = 0; - while (j < NBAR) { - ti = i * NBAR; - ti += j; - + for i = 0 to NBAR { + for j = 0 to NBAR { k = 0; + ac = 0; while (k < N) { - tj = i * N; - tj += k; - tmp = S[(int)tj]; - tj = k * NBAR; - tj += j; + tmp = S[i * N + (int)k]; + + // NOTE: why 
is this needed ? + tj = j + NBAR * k; tmp *= B[(int)tj]; - V[(int)ti] += tmp; + ac += tmp; k += 1; } - V[(int)ti] &= (1 << D) - 1; - j += 1; + V[i * NBAR + j] += ac; + V[i * NBAR + j] &= (1 << D) - 1; } - i += 1; } return V; @@ -171,14 +163,17 @@ fn __SB_plus_E(reg ptr u16[NBAR * NBAR] V, reg ptr u16[NNBAR] S B, reg ptr u16[N fn __matrix_add(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { reg u64 i; reg u16 tmp; + inline int k; i = 0; while (i < NBAR * NBAR) { - tmp = a[(int)i]; - tmp += b[(int)i]; - tmp &= (1 << D) - 1; - a[(int)i] = tmp; - i += 1; + for k = 0 to 2 { + tmp = a[(int)i + k]; + tmp += b[(int)i + k]; + tmp &= (1 << D) - 1; + a[(int)i + k] = tmp; + } + i += 2; } return a; @@ -189,14 +184,17 @@ fn __matrix_add(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { fn __matrix_sub(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { reg u64 i; reg u16 tmp; + inline int k; i = 0; while (i < NBAR * NBAR) { - tmp = b[(int)i]; - tmp -= a[(int)i]; - tmp &= (1 << D) - 1; - a[(int)i] = tmp; - i += 1; + for k = 0 to 2 { + tmp = b[(int)i + k]; + tmp -= a[(int)i + k]; + tmp &= (1 << D) - 1; + a[(int)i + k] = tmp; + } + i += 2; } return a; @@ -204,36 +202,31 @@ fn __matrix_sub(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { #[returnaddress="stack"] fn __mul_BS(reg ptr u16[NBAR * NBAR] M, reg ptr u16[NNBAR]B S) -> stack u16[NBAR * NBAR] { - reg u64 i j k ti tj; + reg u64 k ti tj; reg u16 tmp; + inline int i j l; - i = 0; - while (i < NBAR) { - j = 0; - while (j < NBAR) { - ti = i * NBAR; - ti += j; - M[(int)ti] = 0; + for i = 0 to NBAR { + for j = 0 to NBAR { + M[i * NBAR + j] = 0; k = 0; while (k < N) { - tj = i * N; - tj += k; - tmp = B[(int)tj]; + for l = 0 to 1 { + tmp = B[i * N + l + (int)k]; - tj = j * N; - tj += k; - tmp *= S[(int)tj]; + tj = j * N + l; + // NOTE: why is this needed ? 
register allocation, k and tj must be merged will be raised + tj += k; + tmp *= S[(int)tj]; - M[(int)ti] += tmp; + M[i * NBAR + j] += tmp; + } k += 1; } - M[(int)ti] &= (1 << D) - 1; - j += 1; + M[i * NBAR + j] &= (1 << D) - 1; } - - i += 1; } return M; @@ -244,6 +237,7 @@ fn __ct_verify_NNBAR(reg ptr u16[NNBAR] a b) -> stack u8 { reg u64 i; reg u16 ac tmp; reg u8 r; + inline int k; i = 0; ac = 0; @@ -291,24 +285,21 @@ fn __ct_verify_NBAR2(reg ptr u16[NBAR * NBAR] a b) -> stack u8 { #[returnaddress="stack"] fn __ct_select(reg ptr u8[BYTES_SEC] out a b, reg u8 selector) -> stack u8[BYTES_SEC] { - reg u64 i; + inline int i; reg u8 n_selector tmp; n_selector = selector; n_selector ^= 0xFF; - i = 0; - while (i < BYTES_SEC) { - tmp = a[(int)i]; + for i = 0 to BYTES_SEC { + tmp = a[i]; tmp &= n_selector; - out[(int)i] = tmp; + out[i] = tmp; - tmp = b[(int)i]; + tmp = b[i]; tmp &= selector; - out[(int)i] |= tmp; - - i += 1; + out[i] |= tmp; } return out; diff --git a/src/crypto_kem/frodo/common/amd64/ref/noise.jinc b/src/crypto_kem/frodo/common/amd64/ref/noise.jinc index c39b352f..0e712299 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/noise.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/noise.jinc @@ -3,39 +3,41 @@ fn __sample_2NNBAR(reg ptr u16[2 * NNBAR] s) -> stack u16[2 * NNBAR] { cdftp = CDF_TABLE; reg u64 i; + inline int j k; + reg u16 sample prnd sign; + i = 0; while (i < 2 * NNBAR) { - reg u16 sample prnd sign; - - sample = 0; - - // prnd = s[i] >> 1 - prnd = s[(int)i]; - prnd >>= 1; - - // sign = s[(int)i] & 0x1 - sign = s[(int)i]; - sign &= 0x1; - - // no need to compare with the last value - inline int j; - for j = 0 to CDF_TABLE_LEN - 1 { - // sample += (CDF_TABLE[j] - prnd) >> 15 - - reg u16 tmp_sample; - tmp_sample = cdftp[j]; - tmp_sample -= prnd; - tmp_sample >>= 15; - sample += tmp_sample; + for k = 0 to 2 { + sample = 0; + + // prnd = s[i] >> 1 + prnd = s[(int)i + k]; + prnd >>= 1; + + // sign = s[(int)i + k] & 0x1 + sign = s[(int)i + k]; + sign 
&= 0x1; + + // no need to compare with the last value + for j = 0 to CDF_TABLE_LEN - 1 { + // sample += (CDF_TABLE[j] - prnd) >> 15 + + reg u16 tmp_sample; + tmp_sample = cdftp[j]; + tmp_sample -= prnd; + tmp_sample >>= 15; + sample += tmp_sample; + } + + // s[i] = ((-sign) ^ sample) + sign + s[(int)i + k] = 0; + s[(int)i + k] -= sign; + s[(int)i + k] ^= sample; + s[(int)i + k] += sign; } - // s[i] = ((-sign) ^ sample) + sign - s[(int)i] = 0; - s[(int)i] -= sign; - s[(int)i] ^= sample; - s[(int)i] += sign; - - i += 1; + i += 2; } return s; @@ -46,39 +48,40 @@ fn __sample_NBAR2(reg ptr u16[NBAR * NBAR] s) -> stack u16[NBAR * NBAR] { cdftp = CDF_TABLE; reg u64 i; + reg u16 sample prnd sign; + inline int j k; i = 0; while (i < NBAR * NBAR) { - reg u16 sample prnd sign; - - sample = 0; - - // prnd = s[i] >> 1 - prnd = s[(int)i]; - prnd >>= 1; - - // sign = s[(int)i] & 0x1 - sign = s[(int)i]; - sign &= 0x1; - - // no need to compare with the last value - inline int j; - for j = 0 to CDF_TABLE_LEN - 1 { - // sample += (CDF_TABLE[j] - prnd) >> 15 - - reg u16 tmp_sample; - tmp_sample = cdftp[j]; - tmp_sample -= prnd; - tmp_sample >>= 15; - sample += tmp_sample; + for k = 0 to 2 { + sample = 0; + + // prnd = s[i] >> 1 + prnd = s[(int)i + k]; + prnd >>= 1; + + // sign = s[(int)i] & 0x1 + sign = s[(int)i + k]; + sign &= 0x1; + + // no need to compare with the last value + for j = 0 to CDF_TABLE_LEN - 1 { + // sample += (CDF_TABLE[j] - prnd) >> 15 + + reg u16 tmp_sample; + tmp_sample = cdftp[j]; + tmp_sample -= prnd; + tmp_sample >>= 15; + sample += tmp_sample; + } + + // s[i] = ((-sign) ^ sample) + sign + s[(int)i + k] = 0; + s[(int)i + k] -= sign; + s[(int)i + k] ^= sample; + s[(int)i + k] += sign; } - // s[i] = ((-sign) ^ sample) + sign - s[(int)i] = 0; - s[(int)i] -= sign; - s[(int)i] ^= sample; - s[(int)i] += sign; - - i += 1; + i += 2; } return s; diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake128.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake128.jinc index 
498bcb85..d287adb3 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/shake128.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/shake128.jinc @@ -9,8 +9,8 @@ fn __shake128_gen_A(reg ptr u8[2 * N] out, reg const ptr u8[2 + BYTES_SEED_A] in stack u64[25] s_state; reg ptr u64[25] state; reg u8 c; - inline int INLEN OUTLEN OUTRND; - reg u64 i j; stack u64 s_i s_j; + inline int INLEN OUTLEN OUTRND i k; + reg u64 j; stack u64 s_j; INLEN = 2 + BYTES_SEED_A; OUTLEN = 2 * N; @@ -19,42 +19,41 @@ fn __shake128_gen_A(reg ptr u8[2 * N] out, reg const ptr u8[2 + BYTES_SEED_A] in state = s_state; state = __keccak_init_ref1(state); - i = 0; - while (i < INLEN) { - c = in[(int)i]; - state[u8 (int)i] ^= c; - i += 1; + for i = 0 to INLEN { + c = in[i]; + state[u8 i] ^= c; } state[u8 INLEN] ^= 0x1f; state[u8 SHAKE128_RATE-1] ^= 0x80; - i = 0; - while (i < OUTRND * SHAKE128_RATE) { - s_out = out; s_i = i; s_j = j; + for i = 0 to OUTRND { + s_out = out; s_j = j; state = __keccakf1600_ref1(state); - out = s_out; i = s_i; j = s_j; + out = s_out; j = s_j; j = 0; while (j < SHAKE128_RATE) { - out[(int)i] = state[u8 (int)j]; - i += 1; - j += 1; + for k = 0 to 4 { + out[(int)j + k + i * SHAKE128_RATE] = state[u8 (int)j + k]; + } + j += 4; } } - s_out = out; s_i = i; s_j = j; + s_out = out; s_j = j; state = __keccakf1600_ref1(state); - out = s_out; i = s_i; + out = s_out; j = 0; - while (i < OUTLEN) { - out[(int)i] = state[u8 (int)j]; - i += 1; - j += 1; + while (j < OUTLEN % SHAKE128_RATE) { + for k = 0 to 4 { + out[(int)j + k + OUTRND * SHAKE128_RATE] = state[u8 (int)j + k]; + } + j += 4; } return out; @@ -67,8 +66,7 @@ fn __shake128_seed_A(reg ptr u8[BYTES_SEED_A] out, reg const ptr u8[BYTES_SEED_A stack u64[25] s_state; reg ptr u64[25] state; reg u8 c; - reg u64 i; stack u64 s_i; - inline int INLEN OUTLEN; + inline int INLEN OUTLEN i; INLEN = BYTES_SEED_A; OUTLEN = BYTES_SEED_A; @@ -77,26 +75,19 @@ fn __shake128_seed_A(reg ptr u8[BYTES_SEED_A] out, reg const ptr u8[BYTES_SEED_A state = 
s_state; state = __keccak_init_ref1(state); - i = 0; - while (i < INLEN) { - c = in[(int) i]; - state[u8 (int) i] ^= c; - i += 1; + for i = 0 to INLEN { + c = in[i]; + state[u8 i] ^= c; } - state[u8 (int) i] ^= 0x1f; + state[u8 INLEN] ^= 0x1f; state[u8 SHAKE128_RATE-1] ^= 0x80; - s_i = i; - state = __keccakf1600_ref1(state); out = s_out; - i = s_i; - i = 0; - while (i < OUTLEN) { - out[(int) i] = state[u8 (int) i]; - i += 1; + for i = 0 to OUTLEN { + out[i] = state[u8 i]; } return out; @@ -109,7 +100,7 @@ fn __shake128_r(reg ptr u8[4 * NNBAR] out, reg const ptr u8[1 + BYTES_SEED_SE] i stack u64[25] s_state; reg ptr u64[25] state; reg u8 c; - inline int INLEN OUTLEN OUTRND; + inline int INLEN OUTLEN OUTRND k; reg u64 i j; stack u64 s_i s_j; @@ -122,11 +113,9 @@ fn __shake128_r(reg ptr u8[4 * NNBAR] out, reg const ptr u8[1 + BYTES_SEED_SE] i state = s_state; state = __keccak_init_ref1(state); - i = 0; - while (i < INLEN) { - c = in[(int) i]; - state[u8 (int) i] ^= c; - i += 1; + for k = 0 to INLEN { + c = in[k]; + state[u8 k] ^= c; } state[u8 INLEN] ^= 0x1f; @@ -142,9 +131,11 @@ fn __shake128_r(reg ptr u8[4 * NNBAR] out, reg const ptr u8[1 + BYTES_SEED_SE] i j = 0; while (j < SHAKE128_RATE) { - out[(int)i] = state[u8 (int)j]; - j += 1; - i += 1; + for k = 0 to 4 { + out[(int)i + k] = state[u8 (int)j + k]; + } + j += 4; + i += 4; } } @@ -152,13 +143,14 @@ fn __shake128_r(reg ptr u8[4 * NNBAR] out, reg const ptr u8[1 + BYTES_SEED_SE] i state = __keccakf1600_ref1(state); - out = s_out; i = s_i; j = s_j; + out = s_out; j = s_j; j = 0; - while (i < OUTLEN) { - out[(int) i] = state[u8 (int) j]; - j += 1; - i += 1; + while (j < OUTLEN % SHAKE128_RATE) { + for k = 0 to 4 { + out[(int) j + k + OUTRND * SHAKE128_RATE] = state[u8 (int) j + k]; + } + j += 4; } return out; @@ -173,7 +165,7 @@ fn __shake128_pkh(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_PK] in) -> s reg ptr u64[25] state; reg u8 c; reg u64 i j; stack u64 s_i s_j; - inline int INLEN OUTLEN INRND; + inline 
int INLEN OUTLEN INRND k; INLEN = BYTES_PK; OUTLEN = BYTES_SEC; INRND = INLEN / SHAKE128_RATE; @@ -188,11 +180,13 @@ fn __shake128_pkh(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_PK] in) -> s in = s_in; j = 0; while (j < SHAKE128_RATE) { - c = in[(int)i]; - state[u8 (int)j] ^= c; + for k = 0 to 4 { + c = in[(int)i + k]; + state[u8 (int)j + k] ^= c; + } - i += 1; - j += 1; + i += 4; + j += 4; } s_in = in; s_i = i; s_j = j; @@ -202,26 +196,25 @@ fn __shake128_pkh(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_PK] in) -> s in = s_in; j = 0; - while (i < INLEN) { - c = in[(int)i]; - state[u8 (int)j] ^= c; - i += 1; - j += 1; + while (j < INLEN % SHAKE128_RATE) { + for k = 0 to 4 { + c = in[(int)j + INRND * SHAKE128_RATE + k]; + state[u8 (int)j + k] ^= c; + } + j += 4; } - state[u8 INLEN - INRND * SHAKE128_RATE] ^= 0x1f; + state[u8 INLEN % SHAKE128_RATE] ^= 0x1f; state[u8 SHAKE128_RATE-1] ^= 0x80; s_in = in; s_i = i; s_j = j; state = __keccakf1600_ref1(state); - out = s_out; i = s_i; + out = s_out; - i = 0; - while (i < OUTLEN) { - out[(int) i] = state[u8 (int) i]; - i += 1; + for k = 0 to OUTLEN { + out[k] = state[u8 k]; } return out; @@ -234,7 +227,7 @@ fn __shake128_SE_k(reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out, reg const ptr u8[2 stack u64[25] s_state; reg ptr u64[25] state; reg u8 c; - inline int INLEN OUTLEN; + inline int INLEN OUTLEN k; reg u64 i; stack u64 s_i; INLEN = 2 * BYTES_SEC + BYTES_SALT; @@ -246,11 +239,9 @@ fn __shake128_SE_k(reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out, reg const ptr u8[2 state = __keccak_init_ref1(state); in = s_in; - i = 0; - while (i < INLEN) { - c = in[(int) i]; - state[u8 (int) i] ^= c; - i += 1; + for k = 0 to INLEN { + c = in[k]; + state[u8 k] ^= c; } state[u8 INLEN] ^= 0x1f; @@ -263,9 +254,8 @@ fn __shake128_SE_k(reg ptr u8[BYTES_SEED_SE + BYTES_SEC] out, reg const ptr u8[2 out = s_out; i = s_i; i = 0; - while (i < OUTLEN) { - out[(int) i] = state[u8 (int) i]; - i += 1; + for k = 0 to OUTLEN { + out[k] = state[u8 k]; } 
return out; @@ -279,7 +269,7 @@ fn __shake128_encap_r(reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, reg const p stack u64[25] s_state; reg ptr u64[25] state; reg u8 c; - inline int INLEN OUTLEN OUTRND; + inline int INLEN OUTLEN OUTRND k; reg u64 i j; stack u64 s_i s_j; INLEN = 1 + BYTES_SEED_SE; @@ -291,11 +281,9 @@ fn __shake128_encap_r(reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, reg const p state = s_state; state = __keccak_init_ref1(state); - i = 0; - while (i < INLEN) { - c = in[(int)i]; - state[u8 (int)i] ^= c; - i += 1; + for k = 0 to INLEN { + c = in[k]; + state[u8 k] ^= c; } state[u8 INLEN] ^= 0x1f; @@ -312,9 +300,11 @@ fn __shake128_encap_r(reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, reg const p out = s_out; i = s_i; j = s_j; j = 0; while (j < SHAKE128_RATE) { - out[(int)i] = state[u8 (int)j]; - i += 1; - j += 1; + for k = 0 to 4 { + out[(int)i + k] = state[u8 (int)j + k]; + } + i += 4; + j += 4; } } @@ -323,10 +313,11 @@ fn __shake128_encap_r(reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, reg const p out = s_out; i = s_i; j = s_j; j = 0; - while (i < OUTLEN) { - out[(int) i] = state[u8 (int)j]; - i += 1; - j += 1; + while (j < OUTLEN % SHAKE128_RATE) { + for k = 0 to 4 { + out[(int) j + OUTRND * SHAKE128_RATE + k] = state[u8 (int)j + k]; + } + j += 4; } return out; @@ -341,7 +332,7 @@ fn __shake128_ss(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_CT + BYTES_SE reg ptr u64[25] state; reg u8 c; reg u64 i j; stack u64 s_i s_j; - inline int INLEN OUTLEN INRND; + inline int INLEN OUTLEN INRND k; INLEN = BYTES_CT + BYTES_SEC; OUTLEN = BYTES_SEC; @@ -357,11 +348,13 @@ fn __shake128_ss(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_CT + BYTES_SE in = s_in; j = 0; while (j < SHAKE128_RATE) { - c = in[(int)i]; - state[u8 (int)j] ^= c; + for k = 0 to 4 { + c = in[(int)i + k]; + state[u8 (int)j + k] ^= c; + } - i += 1; - j += 1; + i += 4; + j += 4; } s_in = in; s_i = i; s_j = j; @@ -370,16 +363,16 @@ fn __shake128_ss(reg ptr u8[BYTES_SEC] out, reg const 
ptr u8[BYTES_CT + BYTES_SE } in = s_in; + s_i = i; j = 0; - while (i < INLEN) { - c = in[(int)i]; + while (j < INLEN % SHAKE128_RATE) { + c = in[(int)j + INRND * SHAKE128_RATE]; state[u8 (int)j] ^= c; - i += 1; j += 1; } - s_in = in; s_i = i; s_j = j; + s_in = in; s_j = j; - state[u8 INLEN - INRND * SHAKE128_RATE] ^= 0x1f; + state[u8 INLEN % SHAKE128_RATE] ^= 0x1f; state[u8 SHAKE128_RATE-1] ^= 0x80; state = __keccakf1600_ref1(state); @@ -387,9 +380,8 @@ fn __shake128_ss(reg ptr u8[BYTES_SEC] out, reg const ptr u8[BYTES_CT + BYTES_SE out = s_out; i = s_i; i = 0; - while (i < OUTLEN) { - out[(int)i] = state[u8 (int)i]; - i += 1; + for k = 0 to OUTLEN { + out[k] = state[u8 k]; } return out; diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc index 4adc859e..0b436f22 100644 --- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc +++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc @@ -8,15 +8,12 @@ from Jade require "crypto_kem/frodo/common/amd64/ref/pack.jinc" #[returnaddress="stack"] fn __gen_SE(reg ptr u16[2 * NNBAR] SE, reg ptr u8[BYTES_SEED_SE] r) -> stack u16[2 * NNBAR] { stack u8[1 + BYTES_SEED_SE] seed_se; - reg u64 i; stack u64 s_i; + inline int i; r = r; - i = 0; - while (i < BYTES_SEED_SE) { - seed_se[(int)i + 1] = r[(int)i]; - i += 1; + for i = 0 to BYTES_SEED_SE { + seed_se[i + 1] = r[i]; } - s_i = i; // S || E seed_se[0] = 0x5F; @@ -33,6 +30,7 @@ fn __frodo_amd64_ref_keypair_derand(reg u64 pkp skp, reg ptr u8[BYTES_SEED_A + B stack u16[NNBAR] B; stack u64 s_pkp s_skp; + inline int k; reg u64 i; stack u64 s_i; s_pkp = pkp; @@ -59,9 +57,8 @@ fn __frodo_amd64_ref_keypair_derand(reg u64 pkp skp, reg ptr u8[BYTES_SEED_A + B // i = s_i; i = 0; - while (i < BYTES_SEC) { - sk[(int) i] = coins[(int)i]; - i += 1; + for k = 0 to BYTES_SEC { + sk[k] = coins[k]; } i = 0; @@ -97,6 +94,7 @@ fn __frodo_amd64_ref_keypair_derand(reg u64 pkp skp, reg ptr u8[BYTES_SEED_A + B 
#[returnaddress="stack"] fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTES_SALT] coins) { reg u64 i j; stack u64 s_i s_j; + inline int k; stack u64 s_ctp s_ssp; // seedA || b @@ -125,16 +123,12 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE s_j = j; // gen u || salt - i = 0; - while (i < BYTES_SEC + BYTES_SALT) { - pkh_u_salt[BYTES_SEC + (int)i] = coins[(int)i]; - i += 1; + for k = 0 to BYTES_SEC + BYTES_SALT { + pkh_u_salt[BYTES_SEC + k] = coins[k]; } - i = 0; - while (i < BYTES_SALT) { - ct_k[D * N + D * NBAR + (int)i] = pkh_u_salt[BYTES_SEC * 2 + (int)i]; - i += 1; + for k = 0 to BYTES_SALT { + ct_k[D * N + D * NBAR + k] = pkh_u_salt[BYTES_SEC * 2 + k]; } // read pk @@ -152,12 +146,9 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE seedSE_k[1 : BYTES_SEED_SE + BYTES_SEC] = __shake128_SE_k(seedSE_k[1:BYTES_SEED_SE + BYTES_SEC], pkh_u_salt); // copy k - i = s_i; i = 0; - while (i < BYTES_SEC) { - ct_k[BYTES_CT + (int)i] = seedSE_k[1 + BYTES_SEED_SE + (int)i]; - i += 1; + for k = 0 to BYTES_SEC { + ct_k[BYTES_CT + k] = seedSE_k[1 + BYTES_SEED_SE + k]; } - s_i = i; // gen input bit string for sampling S and E SEE = __shake128_encap_r(SEE, seedSE_k[0 : 1 + BYTES_SEED_SE]); @@ -197,10 +188,8 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE i += 1; } - i = 0; - while (i < BYTES_SEC) { - (u8)[ssp + i] = ss[(int)i]; - i += 1; + for k = 0 to BYTES_SEC { + (u8)[ssp + k] = ss[k]; } } @@ -222,6 +211,7 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { stack u64 s_ssp s_ctp s_skp; reg u8 s1 s2; reg u64 i; stack u64 s_i; + inline int k; s_ssp = ssp; ctp = ctp; @@ -235,10 +225,8 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { } s_ctp = ctp; - i = 0; - while (i < BYTES_SEC) { - s[(int) i] = (u8)[skp + i]; - i += 1; + for k = 0 to BYTES_SEC { + s[k] = (u8)[skp + k]; } i = 0; @@ -254,20 +242,15 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { 
} // copy pkh - i = 0; - while (i < BYTES_SEC) { - pkh_u_salt[(int)i] = (u8)[skp + BYTES_SK - BYTES_SEC + i]; - i += 1; + for k = 0 to BYTES_SEC { + pkh_u_salt[k] = (u8)[skp + BYTES_SK - BYTES_SEC + k]; } s_skp = skp; // copy salt - i = 0; - while (i < BYTES_SALT) { - pkh_u_salt[BYTES_SEC * 2 + (int)i] = ct_k[BYTES_CT - BYTES_SALT + (int)i]; - i += 1; + for k = 0 to BYTES_SALT { + pkh_u_salt[BYTES_SEC * 2 + k] = ct_k[BYTES_CT - BYTES_SALT + k]; } - s_i = i; // B' <- Unpack(c1) Bp = __unpack_B(Bp, ct_k[0:D * N]); @@ -318,11 +301,8 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { ss = __shake128_ss(ss, ct_k); ssp = s_ssp; - i = s_i; - i = 0; - while (i < BYTES_SEC) { - (u8)[ssp + i] = ss[(int)i]; - i += 1; + for k = 0 to BYTES_SEC { + (u8)[ssp + k] = ss[k]; } } From 22a35eb463c9e9082d23229f83d0b6fcb48d3a8f Mon Sep 17 00:00:00 2001 From: "iThing-han, Lim" Date: Mon, 26 Feb 2024 14:17:52 +0800 Subject: [PATCH 04/14] refactor(frodo): remove unnecessary for loop in matrix --- .../frodo/common/amd64/ref/matrix.jinc | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc index cbca1de1..2d31bde2 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc @@ -202,9 +202,9 @@ fn __matrix_sub(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { #[returnaddress="stack"] fn __mul_BS(reg ptr u16[NBAR * NBAR] M, reg ptr u16[NNBAR]B S) -> stack u16[NBAR * NBAR] { - reg u64 k ti tj; + reg u64 k tj; reg u16 tmp; - inline int i j l; + inline int i j; for i = 0 to NBAR { for j = 0 to NBAR { @@ -212,16 +212,14 @@ fn __mul_BS(reg ptr u16[NBAR * NBAR] M, reg ptr u16[NNBAR]B S) -> stack u16[NBAR k = 0; while (k < N) { - for l = 0 to 1 { - tmp = B[i * N + l + (int)k]; + tmp = B[i * N + (int)k]; - tj = j * N + l; - // NOTE: why is this needed ? 
register allocation, k and tj must be merged will be raised - tj += k; - tmp *= S[(int)tj]; + tj = j * N; + // NOTE: why is this needed ? register allocation, k and tj must be merged will be raised + tj += k; + tmp *= S[(int)tj]; - M[i * NBAR + j] += tmp; - } + M[i * NBAR + j] += tmp; k += 1; } @@ -237,7 +235,6 @@ fn __ct_verify_NNBAR(reg ptr u16[NNBAR] a b) -> stack u8 { reg u64 i; reg u16 ac tmp; reg u8 r; - inline int k; i = 0; ac = 0; From 3268be174ce887d18f8bf6dd0902b17c66aee088 Mon Sep 17 00:00:00 2001 From: "iThing-han, Lim" Date: Mon, 26 Feb 2024 14:20:37 +0800 Subject: [PATCH 05/14] refactor(frodo): optimizing AS+E --- .../frodo/common/amd64/ref/matrix_opt.jinc | 144 +++++++++++++++--- .../frodo/frodo640shake/amd64/ref/kem.jinc | 19 ++- 2 files changed, 135 insertions(+), 28 deletions(-) diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc index 6b9c06c2..d4c0d6e2 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc @@ -86,7 +86,29 @@ fn _shake128_gen_A_opt( return out, out_offset; } -fn __SA_plus_E_opt( +inline fn __pad_seedA(reg ptr u8[16] seedA) -> reg ptr u8[2 + 16 + 6] { + reg u64 i j; + stack u8[2+16+ 6] s_index_seed_padding; + reg u8 v; + + // setup "index || seed || padding" + s_index_seed_padding[u16 0] = 0; + s_index_seed_padding[u64 2] = 0; + + i = 0; j = 2; + while (i < 16) + { v = seedA[i]; + s_index_seed_padding[j] = v; + i += 1; + j += 1; + } + + s_index_seed_padding[18] = 0x1f; + + return s_index_seed_padding; +} + +fn __AS_plus_E_opt( #spill_to_mmx reg ptr u16[NNBAR] B, reg ptr u8[16] seedA, #spill_to_mmx reg ptr u16[NNBAR] S, @@ -94,8 +116,6 @@ fn __SA_plus_E_opt( -> reg ptr u16[NNBAR] { - inline int m; - stack u8[2+16+ 6] s_index_seed_padding; reg ptr u8[2+16+ 6] index_seed_padding; stack u16[N * 8] s_A; @@ -104,38 +124,118 @@ fn __SA_plus_E_opt( inline int p; #spill_to_mmx reg u64 i j ij jN q; - 
reg u32[8] sp; - reg u32 sum mul; - reg u16 t; - reg u8 v; + reg u32[8] sum; + reg u16 t16; + reg u32 mul sp; + + index_seed_padding = __pad_seedA(seedA); - // setup "index || seed || padding" - s_index_seed_padding[u16 0] = 0; - s_index_seed_padding[u64 2] = 0; i = 0; - j = 2; - while(i < 16) - { v = seedA[i]; - s_index_seed_padding[j] = v; + while (i < NNBAR) { + B[i] = E[i]; i += 1; - j += 1; } - s_index_seed_padding[18] = 0x1f; + () = #spill(E, index_seed_padding); - // TODO, avoid copy? + A = s_A; i = 0; - while (i < NNBAR) - { B[i] = E[i]; - i += 1; + B_offset = 0; + while( i < N ) { + () = #spill(B, S); + () = #unspill(index_seed_padding); + + A_offset = 0; + j = 0; + while( j < 8 ) + { + ij = #LEA(i + j); + + () = #spill(i, j); + + index_seed_padding[u16 0] = (16u) ij; + A, A_offset = _shake128_gen_A_opt(A, A_offset, index_seed_padding); + + () = #unspill(i, j); + j += 1; + } + + () = #unspill(B, S); + () = #spill(index_seed_padding); + + j = 0; + jN = 0; + while ( j < NBAR ) + { + B_offset = #LEA(i * NBAR + j); + + () = #spill(i, j, jN); + for p = 0 to 8 { sum[p] = (32u)B[B_offset + p*NBAR]; } + + q = 0; + while (q < N) { + () = #unspill(jN); + + S_offset = #LEA(jN + q); + sp = (32u)S[S_offset]; + + () = #spill(jN); + + for p = 0 to 8 + { + mul = (32u)A[p*N + q]; + mul *= sp; + sum[p] += mul; + } + + q += 1; + } + + for p = 0 to 8 { + B[B_offset + p*NBAR] = (16u)sum[p]; + } + + () = #unspill(i, j, jN); + + j += 1; + jN += N; + } + + i += 8; } + return B; +} + +fn __SA_plus_E_opt( + #spill_to_mmx reg ptr u16[NNBAR] B, // initial value is set to E's to avoid copy + reg ptr u8[16] seedA, + #spill_to_mmx reg ptr u16[NNBAR] S) + -> + reg ptr u16[NNBAR] +{ + reg ptr u8[2+16+ 6] index_seed_padding; + + stack u16[N * 8] s_A; + reg ptr u16[N * 8] A; + reg u64 A_offset B_offset S_offset; + + inline int p; + #spill_to_mmx reg u64 i j ij jN q; + reg u32[8] sp; + reg u32 sum mul; + + index_seed_padding = __pad_seedA(seedA); + + () = #spill(index_seed_padding); + A = 
s_A; i = 0; while( i < N ) { () = #spill(B, S); + () = #unspill(index_seed_padding); // A_offset = 0; @@ -146,8 +246,7 @@ fn __SA_plus_E_opt( () = #spill(i, j); - s_index_seed_padding[u16 0] = (16u) ij; - index_seed_padding = s_index_seed_padding; + index_seed_padding[u16 0] = (16u) ij; A, A_offset = _shake128_gen_A_opt(A, A_offset, index_seed_padding); () = #unspill(i, j); @@ -155,6 +254,7 @@ fn __SA_plus_E_opt( } () = #unspill(B, S); + () = #spill(index_seed_padding); // j = 0; diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc index 0b436f22..534f8165 100644 --- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc +++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc @@ -49,12 +49,15 @@ fn __frodo_amd64_ref_keypair_derand(reg u64 pkp skp, reg ptr u8[BYTES_SEED_A + B // gen S || E SE = __gen_SE(SE, coins[BYTES_SEC:BYTES_SEED_SE]); + () = #spill(coins); + // B = A*S+E - B = __AS_plus_E(B, pk[0:BYTES_SEED_A], SE[0:NNBAR], SE[NNBAR:NNBAR]); + B = __AS_plus_E_opt(B, pk[0:BYTES_SEED_A], SE[0:NNBAR], SE[NNBAR:NNBAR]); // pack pk[BYTES_SEED_A:D * N] = __pack_B(pk[BYTES_SEED_A:D * N], B); + () = #unspill(coins); // i = s_i; i = 0; for k = 0 to BYTES_SEC { @@ -112,8 +115,9 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE // S' || E' || E'' stack u16[2 * NNBAR + NBAR * NBAR] SEE; - stack u16[NNBAR] B Bp; - stack u16[NBAR * NBAR] V C; + stack u16[NNBAR] B; + reg ptr u16[NNBAR] Bp; + stack u16[NBAR * NBAR] C V; stack u8[BYTES_SEC] ss; pkp = pkp; @@ -159,7 +163,8 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); // B' = S'A + E'' - Bp = __SA_plus_E_opt(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + Bp = SEE[NNBAR:NNBAR]; + Bp = __SA_plus_E_opt(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]); // c1 <- Pack(B') ct_k[0:D * N] = __pack_B(ct_k[0:D * N], Bp); 
@@ -199,7 +204,8 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { stack u8[2 * NNBAR] ST; stack u8[BYTES_SEC] s; stack u8[BYTES_CT + BYTES_SEC] ct_k; - stack u16[NNBAR] B Bp Bpp; + stack u16[NNBAR] B Bp; + reg ptr u16[NNBAR] Bpp; stack u16[NBAR * NBAR] M C Cp V; stack u8[BYTES_SEC * 2 + BYTES_SALT] pkh_u_salt; stack u8[1 + BYTES_SEED_SE + BYTES_SEC] seedSE_k; @@ -274,7 +280,8 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); // B'' = S'A + E' - Bpp = __SA_plus_E_opt(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + Bpp = SEE[NNBAR:NNBAR]; + Bpp = __SA_plus_E_opt(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]); // B'' (mod q) i = s_i; From 43e3d43660424642495c08e6240d99d003e68209 Mon Sep 17 00:00:00 2001 From: "iThing-han, Lim" Date: Tue, 27 Feb 2024 12:10:36 +0800 Subject: [PATCH 06/14] refactor(frodo): optimize SB+E --- .../frodo/common/amd64/ref/matrix_opt.jinc | 29 +++++++++++++++++++ .../frodo/frodo640shake/amd64/ref/kem.jinc | 12 +++++--- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc index d4c0d6e2..7f0ca654 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc @@ -305,4 +305,33 @@ fn __SA_plus_E_opt( return B; } +fn __SB_plus_E_opt( + #spill_to_mmx reg ptr u16[NBAR * NBAR] V, // initial value is set to E's to avoid copy + #spill_to_mmx reg ptr u16[NNBAR] S B) +-> reg ptr u16[NBAR * NBAR] { + reg u64 k B_offset; + reg u32 sum mul t32; + inline int i j l; + + for i = 0 to NBAR { + for j = 0 to NBAR { + k = 0; + sum = (32u)V[i * NBAR + j]; + while (k < N) { + mul = (32u)S[i * N + k]; + + B_offset = #LEA(j + NBAR*k); + t32 = (32u)B[B_offset]; + mul *= t32; + + sum += mul; + k += 1; + } + + sum &= (1 << D) - 1; + V[i * NBAR + j] = (16u)sum; + } + } + return V; +} diff --git 
a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc index 534f8165..b9f1f8b7 100644 --- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc +++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc @@ -117,7 +117,8 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE stack u16[NNBAR] B; reg ptr u16[NNBAR] Bp; - stack u16[NBAR * NBAR] C V; + stack u16[NBAR * NBAR] C; + reg ptr u16[NBAR * NBAR] V; stack u8[BYTES_SEC] ss; pkp = pkp; @@ -173,7 +174,8 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE B = __unpack_B(B, pk[BYTES_SEED_A:D * N]); // V = S'B + E'' - V = __SB_plus_E(V, SEE[0:NNBAR], B, SEE[NNBAR * 2:NBAR * NBAR]); + V = SEE[NNBAR*2:NBAR*NBAR]; + V = __SB_plus_E_opt(V, SEE[0:NNBAR], B); // C = V + Encode(u) C = __encode(C, pkh_u_salt[BYTES_SEC:BYTES_SEC]); @@ -206,7 +208,8 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { stack u8[BYTES_CT + BYTES_SEC] ct_k; stack u16[NNBAR] B Bp; reg ptr u16[NNBAR] Bpp; - stack u16[NBAR * NBAR] M C Cp V; + stack u16[NBAR * NBAR] M C Cp; + reg ptr u16[NBAR * NBAR] V; stack u8[BYTES_SEC * 2 + BYTES_SALT] pkh_u_salt; stack u8[1 + BYTES_SEED_SE + BYTES_SEC] seedSE_k; stack u8[BYTES_SEC] ss; @@ -294,7 +297,8 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { // B = __unpack_B(B, pk[BYTES_SEED_A:BYTES_PK - BYTES_SEED_A]); - V = __SB_plus_E(V, SEE[0:NNBAR], B, SEE[NNBAR * 2:NBAR * NBAR]); + V = SEE[NNBAR*2:NBAR*NBAR]; + V = __SB_plus_E_opt(V, SEE[0:NNBAR], B); Cp = __encode(Cp, pkh_u_salt[BYTES_SEC:BYTES_SEC]); Cp = __matrix_add(Cp, V); From a3cdcc8440c410ae87a9f878231949f03a2afff7 Mon Sep 17 00:00:00 2001 From: "iThing-han, Lim" Date: Tue, 27 Feb 2024 17:51:01 +0800 Subject: [PATCH 07/14] refactor(frodo): optimize mul_BS --- .../frodo/common/amd64/ref/matrix_opt.jinc | 32 +++++++++++++++++++ .../frodo/frodo640shake/amd64/ref/kem.jinc | 2 +- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git 
a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc index 7f0ca654..998b7aec 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc @@ -335,3 +335,35 @@ fn __SB_plus_E_opt( return V; } + +fn __mul_BS_opt( + #spill_to_mmx reg ptr u16[NBAR * NBAR] M, + #spill_to_mmx reg ptr u16[NNBAR]B S) +-> reg ptr u16[NBAR * NBAR] { + reg u64 k S_offset; + reg u32 sum mul t32; + inline int i j; + + for i = 0 to NBAR { + for j = 0 to NBAR { + sum = 0; + + k = 0; + while (k < N) { + mul = (32u)B[i * N + k]; + + S_offset = #LEA(j*N+k); + t32 = (32u)S[S_offset]; + mul *= t32; + + sum += mul; + + k += 1; + } + sum &= (1 << D) - 1; + M[i * NBAR + j] = (16u)sum; + } + } + + return M; +} diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc index b9f1f8b7..d3c774ef 100644 --- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc +++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc @@ -267,7 +267,7 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { C = __unpack_C(C, ct_k[D * N:D * NBAR]); // M = C - B'S - M = __mul_BS(M, Bp, ST); + M = __mul_BS_opt(M, Bp, ST); M = __matrix_sub(M, C); pkh_u_salt[BYTES_SEC:BYTES_SEC] = __decode(pkh_u_salt[BYTES_SEC:BYTES_SEC], M); From 6b61fd835a09f8e4351fbf04fe7ce7393ee31546 Mon Sep 17 00:00:00 2001 From: "iThing-han, Lim" Date: Tue, 27 Feb 2024 17:54:17 +0800 Subject: [PATCH 08/14] refactor(frodo): spill/unspill instruction and read u64 instead u8 when copying --- .../frodo/frodo640shake/amd64/ref/kem.jinc | 159 +++++++++--------- 1 file changed, 81 insertions(+), 78 deletions(-) diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc index d3c774ef..4c5d014a 100644 --- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc +++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc 
@@ -25,17 +25,14 @@ fn __gen_SE(reg ptr u16[2 * NNBAR] SE, reg ptr u8[BYTES_SEED_SE] r) -> stack u16 } // coins = s || seed SE || z -fn __frodo_amd64_ref_keypair_derand(reg u64 pkp skp, reg ptr u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins) { +fn __frodo_amd64_ref_keypair_derand( + reg u64 pkp skp, + #spill_to_mmx reg ptr u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins) { stack u16[2 * NNBAR] SE; stack u16[NNBAR] B; - stack u64 s_pkp s_skp; inline int k; - reg u64 i; stack u64 s_i; - - s_pkp = pkp; - s_skp = skp; - coins = coins; + reg u64 i j; // seedA || b stack u8[BYTES_PK] pk; @@ -43,6 +40,12 @@ fn __frodo_amd64_ref_keypair_derand(reg u64 pkp skp, reg ptr u8[BYTES_SEED_A + B // s || seedA || b || S_T || pkh stack u8[BYTES_SK] sk; + () = #spill(i, j, pkp, skp); + + for k = 0 to BYTES_SEC/8 { + sk[u64 k] = coins[u64 k]; + } + // gen seedA pk[0:BYTES_SEED_A] = __shake128_seed_A(pk[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]); @@ -57,48 +60,44 @@ fn __frodo_amd64_ref_keypair_derand(reg u64 pkp skp, reg ptr u8[BYTES_SEED_A + B // pack pk[BYTES_SEED_A:D * N] = __pack_B(pk[BYTES_SEED_A:D * N], B); - () = #unspill(coins); - // - i = s_i; i = 0; - for k = 0 to BYTES_SEC { - sk[k] = coins[k]; - } - + () = #unspill(i); i = 0; - while (i < BYTES_PK) { - sk[BYTES_SEC + (int)i] = pk[(int)i]; + while (i < BYTES_PK/8) { + sk[u64 BYTES_SEC/8 + i] = pk[u64 i]; i += 1; } i = 0; - while (i < 2 * NNBAR) { - sk[BYTES_SEC + BYTES_PK + (int)i] = SE.[u8 (int)i]; + while (i < 2 * NNBAR / 8) { + sk[u64 BYTES_SEC/8 + BYTES_PK/8 + i] = SE[u64 i]; i += 1; } - s_i = i; + () = #spill(i); - sk[BYTES_SEC + BYTES_PK + 2 * NNBAR : BYTES_SEC] = __shake128_pkh(sk[BYTES_SEC + BYTES_PK + 2 * NNBAR:BYTES_SEC], pk); + sk[BYTES_SEC + BYTES_PK + 2 * NNBAR : BYTES_SEC] = __shake128_pkh_opt(sk[BYTES_SEC + BYTES_PK + 2 * NNBAR:BYTES_SEC], pk); - pkp = s_pkp; - i = 0; - while (i < BYTES_PK) { - (u8)[pkp + i] = pk[(int) i]; + () = #unspill(i, j, pkp, skp); + i = 0; j = 0; + while (i 
< BYTES_PK/8) { + [pkp + j] = pk[u64 i]; i += 1; + j += 8; } - skp = s_skp; - i = 0; - while (i < BYTES_SK) { - (u8)[skp + i] = sk[(int) i]; + i = 0; j = 0; + while (i < BYTES_SK/8) { + [skp + j] = sk[u64 i]; i += 1; + j += 8; } } #[returnaddress="stack"] -fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTES_SALT] coins) { - reg u64 i j; stack u64 s_i s_j; +fn __frodo_amd64_ref_enc_derand( + reg u64 ctp ssp pkp, + #spill_to_mmx reg ptr u8[BYTES_SEC + BYTES_SALT] coins) { + reg u64 i j; inline int k; - stack u64 s_ctp s_ssp; // seedA || b #public stack u8[BYTES_PK] pk; @@ -122,27 +121,26 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE stack u8[BYTES_SEC] ss; pkp = pkp; - s_ctp = ctp; - s_ssp = ssp; - coins = coins; - s_j = j; + () = #spill(ctp, ssp, i, j); // gen u || salt - for k = 0 to BYTES_SEC + BYTES_SALT { - pkh_u_salt[BYTES_SEC + k] = coins[k]; + for k = 0 to (BYTES_SEC + BYTES_SALT)/8 { + pkh_u_salt[u64 BYTES_SEC/8 + k] = coins[u64 k]; } - for k = 0 to BYTES_SALT { - ct_k[D * N + D * NBAR + k] = pkh_u_salt[BYTES_SEC * 2 + k]; + for k = 0 to BYTES_SALT/8 { + ct_k[u64 (D * N + D * NBAR)/8 + k] = pkh_u_salt[u64 (BYTES_SEC * 2)/8 + k]; } + () = #unspill(i, j); // read pk - i = 0; - while (i < BYTES_PK) { - #declassify pk[(int)i] = (u8) [pkp + i]; + i = 0; j = 0; + while (i < BYTES_PK/8) { + #declassify pk[u64 i] = [pkp + j]; i += 1; + j += 8; } - s_i = i; + () = #spill(i, j); // pkh pkh_u_salt[0:BYTES_SEC] = __shake128_pkh(pkh_u_salt[0:BYTES_SEC], pk); @@ -151,8 +149,8 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE seedSE_k[1 : BYTES_SEED_SE + BYTES_SEC] = __shake128_SE_k(seedSE_k[1:BYTES_SEED_SE + BYTES_SEC], pkh_u_salt); // copy k - for k = 0 to BYTES_SEC { - ct_k[BYTES_CT + k] = seedSE_k[1 + BYTES_SEED_SE + k]; + for k = 0 to BYTES_SEC/8 { + ct_k[u64 BYTES_CT/8 + k] = seedSE_k.[u64 1 + BYTES_SEED_SE + 8*k]; } // gen input bit string for sampling S and E @@ 
-187,16 +185,16 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE // ss <- shake(c1 || c2 || salt || k) ss = __shake128_ss(ss, ct_k); - i = s_i; i = 0; - ctp = s_ctp; - ssp = s_ssp; - while (i < BYTES_CT) { - (u8)[ctp + i] = ct_k[(int)i]; + () = #unspill(i, j, ctp, ssp); + i = 0; j = 0; + while (i < BYTES_CT/8) { + [ctp + j] = ct_k[u64 i]; i += 1; + j += 8; } - for k = 0 to BYTES_SEC { - (u8)[ssp + k] = ss[k]; + for k = 0 to BYTES_SEC/8 { + [ssp + 8*k] = ss[u64 k]; } } @@ -217,48 +215,53 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { // S' || E' || E'' stack u16[2 * NNBAR + NBAR * NBAR] SEE; - stack u64 s_ssp s_ctp s_skp; reg u8 s1 s2; - reg u64 i; stack u64 s_i; + reg u64 i j t; + stack u64 s_ssp s_skp; inline int k; - s_ssp = ssp; ctp = ctp; skp = skp; + s_ssp = ssp; + + // copy pkh + for k = 0 to BYTES_SEC/8 { + pkh_u_salt[u64 k] = [skp + BYTES_SK - BYTES_SEC + 8*k]; + } + s_skp = skp; // read ct - i = 0; - while (i < BYTES_CT) { - ct_k[(int) i] = (u8)[ctp + i]; + i = 0; j = 0; + while (i < BYTES_CT/8) { + t = [ctp + j]; + ct_k[u64 i] = t; i += 1; + j += 8; } - s_ctp = ctp; - for k = 0 to BYTES_SEC { - s[k] = (u8)[skp + k]; + for k = 0 to BYTES_SEC/8 { + s[u64 k] = [skp + 8*k]; } - i = 0; - while (i < BYTES_PK) { - #declassify pk[(int)i] = (u8)[skp + BYTES_SEC + i]; + i = 0; j = 0; + while (i < BYTES_PK/8) { + #declassify pk[u64 i] = [skp + BYTES_SEC + j]; i += 1; + j += 8; } - i = 0; - while (i < 2 * NNBAR) { - ST[(int)i] = (u8)[skp + BYTES_SEC + BYTES_PK + i]; + i = 0; j = 0; + while (i < 2 * NNBAR/8) { + ST[u64 i] = [skp + BYTES_SEC + BYTES_PK + j]; i += 1; + j += 8; } - // copy pkh - for k = 0 to BYTES_SEC { - pkh_u_salt[k] = (u8)[skp + BYTES_SK - BYTES_SEC + k]; - } - s_skp = skp; + () = #spill(i); // copy salt - for k = 0 to BYTES_SALT { - pkh_u_salt[BYTES_SEC * 2 + k] = ct_k[BYTES_CT - BYTES_SALT + k]; + for k = 0 to BYTES_SALT/8 { + pkh_u_salt[u64 (BYTES_SEC * 2)/8 + k] = ct_k[u64 (BYTES_CT - BYTES_SALT)/8 + k]; } // 
B' <- Unpack(c1) @@ -287,10 +290,10 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { Bpp = __SA_plus_E_opt(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]); // B'' (mod q) - i = s_i; + () = #unspill(i); i = 0; while (i < NNBAR) { - Bpp[(int)i] &= (1 << D) - 1; + Bpp[i] &= (1 << D) - 1; i += 1; } @@ -312,8 +315,8 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { ss = __shake128_ss(ss, ct_k); ssp = s_ssp; - for k = 0 to BYTES_SEC { - (u8)[ssp + k] = ss[k]; + for k = 0 to BYTES_SEC/8 { + [ssp + 8*k] = ss[u64 k]; } } From 04ed0225ca73795d80e25f3ac1948ca5e2b5a1f2 Mon Sep 17 00:00:00 2001 From: "iThing-han, Lim" Date: Tue, 27 Feb 2024 17:55:44 +0800 Subject: [PATCH 09/14] refactor(frodo): optimize shake128 --- .../frodo/common/amd64/ref/shake128_opt.jinc | 367 ++++++++++++++++++ .../frodo/frodo640shake/amd64/ref/kem.jinc | 39 +- 2 files changed, 378 insertions(+), 28 deletions(-) create mode 100644 src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc new file mode 100644 index 00000000..032f4dc3 --- /dev/null +++ b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc @@ -0,0 +1,367 @@ +from Jade require "common/keccak/keccak1600/amd64/ref1/keccak1600.jinc" + +param int SHAKE128_RATE = 168; + +fn __shake128_seed_A_opt( + #spill_to_mmx reg ptr u8[BYTES_SEED_A] out, + #spill_to_mmx reg const ptr u8[BYTES_SEED_A] in) + -> reg ptr u8[BYTES_SEED_A] { + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 t0 zero; + inline int k INLEN OUTLEN; + + INLEN = BYTES_SEED_A; + OUTLEN = BYTES_SEED_A; + + state = s_state; + + for k = 0 to INLEN/8 { + t0 = in[u64 k]; + state[k] = t0; + } + ?{}, zero = #set0(); + + i = INLEN/8; + while (i < 25) { + state[i] = zero; + i += 1; + } + + state[u8 INLEN] = 0x1f; + state[u8 SHAKE128_RATE-1] = 0x80; + + () = #spill(out); + + state = __keccakf1600_ref1(state); + + () = #unspill(out); + + for 
k = 0 to OUTLEN/8 { + t0 = state[u64 k]; + out[u64 k] = t0; + } + + return out; +} + +fn __shake128_r_opt( + #spill_to_mmx reg ptr u8[4 * NNBAR] out, + #spill_to_mmx reg const ptr u8[BYTES_SEED_SE] in) +-> reg ptr u8[4 * NNBAR] +{ + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 j t0 zero; + reg u8 t; + inline int k INLEN OUTLEN OUTRND; + + INLEN = 1 + BYTES_SEED_SE; + OUTLEN = 4 * NNBAR; + OUTRND = OUTLEN/SHAKE128_RATE; + + state = s_state; + + ?{}, zero = #set0(); + state[u8 0] = 0x5F; + state[INLEN/8] = zero; + + for k = 0 to INLEN/8 { + t0 = in[u64 k]; + state.[u64 1 + 8*k] = t0; + } + + i = INLEN/8 + 1; + while (i < 25) { + state[i] = zero; + i += 1; + } + + state[u8 INLEN] = 0x1f; + state[u8 SHAKE128_RATE-1] = 0x80; + + i = 0; + j = 0; + while (i < OUTRND) { + () = #spill(i, j, out); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out); + + for k = 0 to SHAKE128_RATE/8 { + t0 = state[u64 k]; + out[u64 j + k] = t0; + } + + i += 1; + j += SHAKE128_RATE/8; + } + + () = #spill(i, j, out); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out); + + for k = 0 to (OUTLEN % SHAKE128_RATE) / 8 { + t0 = state[u64 k]; + out[u64 OUTRND*SHAKE128_RATE/8 + k] = t0; + } + + return out; +} + +fn __shake128_pkh_opt( + #spill_to_mmx reg ptr u8[BYTES_SEC] out, + #spill_to_mmx reg const ptr u8[BYTES_PK] in) +-> reg ptr u8[BYTES_SEC] { + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 j t0 zero; + reg u8 t; + inline int k INLEN OUTLEN INRND; + + INLEN = BYTES_PK; + INRND = INLEN/SHAKE128_RATE; + OUTLEN = BYTES_SEC; + + state = s_state; + + ?{}, zero = #set0(); + + i = 0; + while (i < 25) { + state[i] = zero; + i += 1; + } + + () = #spill(out); + + i = 0; j = 0; + while (i < INRND) { + for k = 0 to SHAKE128_RATE/8 { + t0 = in[u64 j + k]; + state[u64 k] ^= t0; + } + + () = #spill(i, j); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j); + + i += 1; + j += 
SHAKE128_RATE/8; + } + + for k = 0 to (INLEN % SHAKE128_RATE)/8 { + t0 = in[u64 INRND * SHAKE128_RATE/8 + k]; + state[u64 k] ^= t0; + } + + state[u8 INLEN%SHAKE128_RATE] ^= 0x1f; + state[u8 SHAKE128_RATE-1] ^= 0x80; + + () = #spill(i, j); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out); + + for k = 0 to OUTLEN/8 { + out[u64 k] = state[u64 k]; + } + + return out; +} + +fn __shake128_SE_k_opt( + #spill_to_mmx reg ptr u8[1 + BYTES_SEED_SE + BYTES_SEC] out, + #spill_to_mmx reg const ptr u8[2 * BYTES_SEC + BYTES_SALT] in) +-> reg ptr u8[1 + BYTES_SEED_SE + BYTES_SEC] { + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 t0 zero; + inline int k INLEN OUTLEN; + + INLEN = 2 * BYTES_SEC + BYTES_SALT; + OUTLEN = BYTES_SEED_SE + BYTES_SEC; + + state = s_state; + + for k = 0 to INLEN/8 { + t0 = in.[u64 8*k]; + state[k] = t0; + } + ?{}, zero = #set0(); + + i = INLEN/8; + while (i < 25) { + state[i] = zero; + i += 1; + } + + state[u8 INLEN] = 0x1f; + state[u8 SHAKE128_RATE-1] = 0x80; + + () = #spill(out); + + state = __keccakf1600_ref1(state); + + () = #unspill(out); + + for k = 0 to OUTLEN/8 { + t0 = state[u64 k]; + out.[u64 1 + 8*k] = t0; + } + + return out; +} + +fn __shake128_encap_r_opt( + #spill_to_mmx reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, + #spill_to_mmx reg const ptr u8[1 + BYTES_SEED_SE] in) +-> reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] { + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 j t0 zero; + reg u8 t; + inline int k INLEN OUTLEN OUTRND; + + INLEN = 1 + BYTES_SEED_SE; + OUTLEN = 2 * (2 * NNBAR + NBAR * NBAR); + OUTRND = OUTLEN/SHAKE128_RATE; + + state = s_state; + + ?{}, zero = #set0(); + state[u8 0] = 0x5F; + state[INLEN/8] = zero; + + for k = 0 to INLEN/8 { + t0 = in[u64 k]; + state[u64 k] = t0; + } + + for k = 0 to INLEN%8 { + t = in[INLEN-INLEN%8 + k]; + state[u8 INLEN-INLEN%8 + k] = t; + } + + i = INLEN/8 + 1; + while (i < 25) { + state[i] = zero; + i += 
1; + } + + state[u8 INLEN] = 0x1f; + state[u8 SHAKE128_RATE-1] = 0x80; + + i = 0; + j = 0; + while (i < OUTRND) { + () = #spill(i, j, out); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out); + + for k = 0 to SHAKE128_RATE/8 { + t0 = state[u64 k]; + out[u64 j + k] = t0; + } + + i += 1; + j += SHAKE128_RATE/8; + } + + () = #spill(i, j, out); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out); + + for k = 0 to (OUTLEN % SHAKE128_RATE) / 8 { + t0 = state[u64 k]; + out[u64 OUTRND*SHAKE128_RATE/8 + k] = t0; + } + + return out; +} + +fn __shake128_ss_opt( + #spill_to_mmx reg ptr u8[BYTES_SEC] out, + #spill_to_mmx reg const ptr u8[BYTES_CT + BYTES_SEC] in) +-> reg ptr u8[BYTES_SEC] { + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 j t0 zero; + reg u8 t; + inline int k INLEN OUTLEN INRND; + + INLEN = BYTES_CT + BYTES_SEC; + INRND = INLEN/SHAKE128_RATE; + OUTLEN = BYTES_SEC; + + state = s_state; + + ?{}, zero = #set0(); + + i = 0; + while (i < 25) { + state[i] = zero; + i += 1; + } + + () = #spill(out); + + i = 0; j = 0; + while (i < INRND) { + for k = 0 to SHAKE128_RATE/8 { + t0 = in[u64 j + k]; + state[u64 k] ^= t0; + } + + () = #spill(i, j); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j); + + i += 1; + j += SHAKE128_RATE/8; + } + + for k = 0 to (INLEN % SHAKE128_RATE)/8 { + t0 = in[u64 INRND * SHAKE128_RATE/8 + k]; + state[u64 k] ^= t0; + } + + state[u8 INLEN%SHAKE128_RATE] ^= 0x1f; + state[u8 SHAKE128_RATE-1] ^= 0x80; + + () = #spill(i, j); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out); + + for k = 0 to OUTLEN/8 { + out[u64 k] = state[u64 k]; + } + + return out; +} diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc index 4c5d014a..c1d06e7b 100644 --- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc +++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc @@ -1,29 +1,11 @@ from 
Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/encode.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/noise.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/matrix.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/pack.jinc" -#[returnaddress="stack"] -fn __gen_SE(reg ptr u16[2 * NNBAR] SE, reg ptr u8[BYTES_SEED_SE] r) -> stack u16[2 * NNBAR] { - stack u8[1 + BYTES_SEED_SE] seed_se; - inline int i; - - r = r; - for i = 0 to BYTES_SEED_SE { - seed_se[i + 1] = r[i]; - } - - // S || E - seed_se[0] = 0x5F; - - SE = __shake128_r(SE, seed_se); - SE = __sample_2NNBAR(SE); - - return SE; -} - // coins = s || seed SE || z fn __frodo_amd64_ref_keypair_derand( reg u64 pkp skp, @@ -47,10 +29,11 @@ fn __frodo_amd64_ref_keypair_derand( } // gen seedA - pk[0:BYTES_SEED_A] = __shake128_seed_A(pk[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]); + pk[0:BYTES_SEED_A] = __shake128_seed_A_opt(pk[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]); // gen S || E - SE = __gen_SE(SE, coins[BYTES_SEC:BYTES_SEED_SE]); + SE = __shake128_r_opt(SE, coins[BYTES_SEC:BYTES_SEED_SE]); + SE = __sample_2NNBAR(SE); () = #spill(coins); @@ -143,10 +126,10 @@ fn __frodo_amd64_ref_enc_derand( () = #spill(i, j); // pkh - pkh_u_salt[0:BYTES_SEC] = __shake128_pkh(pkh_u_salt[0:BYTES_SEC], pk); + pkh_u_salt[0:BYTES_SEC] = __shake128_pkh_opt(pkh_u_salt[0:BYTES_SEC], pk); // seedSE || k - seedSE_k[1 : BYTES_SEED_SE + BYTES_SEC] = __shake128_SE_k(seedSE_k[1:BYTES_SEED_SE + BYTES_SEC], pkh_u_salt); + seedSE_k = __shake128_SE_k_opt(seedSE_k, pkh_u_salt); // copy k for k = 0 to BYTES_SEC/8 { @@ -154,7 +137,7 @@ fn __frodo_amd64_ref_enc_derand( } // gen input bit string for sampling S and E - SEE = __shake128_encap_r(SEE, seedSE_k[0 : 1 + 
BYTES_SEED_SE]); + SEE = __shake128_encap_r_opt(SEE, seedSE_k[0 : 1 + BYTES_SEED_SE]); // S' || E' SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]); @@ -183,7 +166,7 @@ fn __frodo_amd64_ref_enc_derand( ct_k[D * N: D * NBAR] = __pack_C(ct_k[D * N: D * NBAR], C); // ss <- shake(c1 || c2 || salt || k) - ss = __shake128_ss(ss, ct_k); + ss = __shake128_ss_opt(ss, ct_k); () = #unspill(i, j, ctp, ssp); i = 0; j = 0; @@ -276,9 +259,9 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { pkh_u_salt[BYTES_SEC:BYTES_SEC] = __decode(pkh_u_salt[BYTES_SEC:BYTES_SEC], M); seedSE_k[0] = 0x96; - seedSE_k[1:BYTES_SEED_SE + BYTES_SEC] = __shake128_SE_k(seedSE_k[1:BYTES_SEED_SE + BYTES_SEC], pkh_u_salt); + seedSE_k = __shake128_SE_k_opt(seedSE_k, pkh_u_salt); - SEE = __shake128_encap_r(SEE, seedSE_k[0: 1 + BYTES_SEED_SE]); + SEE = __shake128_encap_r_opt(SEE, seedSE_k[0: 1 + BYTES_SEED_SE]); // S' || E' SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]); @@ -312,7 +295,7 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { ct_k[BYTES_CT:BYTES_SEC] = __ct_select(ct_k[BYTES_CT:BYTES_SEC], seedSE_k[1+BYTES_SEED_SE:BYTES_SEC], s, s1); - ss = __shake128_ss(ss, ct_k); + ss = __shake128_ss_opt(ss, ct_k); ssp = s_ssp; for k = 0 to BYTES_SEC/8 { From 61b41b8f50fbe96f0e19ff4dbb16f1354f5588d2 Mon Sep 17 00:00:00 2001 From: "iThing-han, Lim" Date: Tue, 27 Feb 2024 17:56:06 +0800 Subject: [PATCH 10/14] refactor(frodo): optimize encode --- src/crypto_kem/frodo/common/amd64/ref/encode.jinc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/crypto_kem/frodo/common/amd64/ref/encode.jinc b/src/crypto_kem/frodo/common/amd64/ref/encode.jinc index 7e2ec9cb..6726e370 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/encode.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/encode.jinc @@ -1,6 +1,6 @@ fn __encode(reg ptr u16[NBAR * NBAR]out, reg ptr u8[EXTRACTED_BITS * NBAR * NBAR / 8] in) -> stack u16[NBAR * NBAR] { inline int i j; - reg u64 tmp tmp2 mask; + reg u64 t tmp tmp2 
mask; mask = (1 << EXTRACTED_BITS) - 1; @@ -14,9 +14,10 @@ fn __encode(reg ptr u16[NBAR * NBAR]out, reg ptr u8[EXTRACTED_BITS * NBAR * NBAR } for j = 0 to 8 { - out[i * NBAR + j] = tmp; - out[i * NBAR + j] &= mask; - out[i * NBAR + j] <<= D - EXTRACTED_BITS; + t = tmp; + t &= mask; + t <<= D - EXTRACTED_BITS; + out[i*NBAR + j] = (16u)t; tmp >>= EXTRACTED_BITS; } } From c8eec7b0d3792eb8f31ba6fc62204ad93385f0e0 Mon Sep 17 00:00:00 2001 From: "iThing-han, Lim" Date: Thu, 28 Mar 2024 17:50:21 +0800 Subject: [PATCH 11/14] refactor(frodo): optimize code size --- .../frodo/common/amd64/ref/encode.jinc | 47 +++-- .../frodo/common/amd64/ref/matrix.jinc | 33 ++-- .../frodo/common/amd64/ref/matrix_opt.jinc | 51 +++-- .../frodo/common/amd64/ref/noise.jinc | 120 ++++++------ .../frodo/common/amd64/ref/pack.jinc | 182 +++++++++++++----- .../frodo/common/amd64/ref/shake128_opt.jinc | 174 ++++++++++------- 6 files changed, 379 insertions(+), 228 deletions(-) diff --git a/src/crypto_kem/frodo/common/amd64/ref/encode.jinc b/src/crypto_kem/frodo/common/amd64/ref/encode.jinc index 6726e370..499b877c 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/encode.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/encode.jinc @@ -1,25 +1,32 @@ fn __encode(reg ptr u16[NBAR * NBAR]out, reg ptr u8[EXTRACTED_BITS * NBAR * NBAR / 8] in) -> stack u16[NBAR * NBAR] { - inline int i j; - reg u64 t tmp tmp2 mask; + inline int k; + reg u64 i j t tmp tmp2 mask offset; mask = (1 << EXTRACTED_BITS) - 1; - for i = 0 to NBAR { + i = 0; + while (i < NBAR) { tmp = 0; - for j = 0 to EXTRACTED_BITS { - tmp2 = (64u)in[i * EXTRACTED_BITS + j]; - tmp2 <<= 8 * j; + for k = 0 to EXTRACTED_BITS { + tmp2 = (64u)in[i * EXTRACTED_BITS + k]; + tmp2 <<= 8 * k; tmp |= tmp2; } - for j = 0 to 8 { + j = 0; + while (j < 8) { t = tmp; t &= mask; t <<= D - EXTRACTED_BITS; - out[i*NBAR + j] = (16u)t; + offset = #LEA(i*NBAR+j); + out[offset] = (16u)t; tmp >>= EXTRACTED_BITS; + + j += 1; } + + i += 1; } return out; @@ -27,7 +34,7 @@ 
fn __encode(reg ptr u16[NBAR * NBAR]out, reg ptr u8[EXTRACTED_BITS * NBAR * NBAR fn __decode(reg ptr u8[EXTRACTED_BITS * NBAR] out, reg ptr u16[NBAR * NBAR] in) -> stack u8[EXTRACTED_BITS * NBAR] { reg u32 tmplong tmp mask d; - inline int i j; + reg u64 i j offset; d = 1 << (D - EXTRACTED_BITS - 1); mask = (1 << EXTRACTED_BITS) - 1; @@ -35,24 +42,32 @@ fn __decode(reg ptr u8[EXTRACTED_BITS * NBAR] out, reg ptr u16[NBAR * NBAR] in) out = out; in = in; - for i = 0 to NBAR { + i = 0; + while (i < NBAR) { tmplong = 0; - for j = 0 to 8 { - tmp = (32u)in[(int) i * NBAR + j]; + j = 0; + while (j < 8) { + offset = #LEA(i * NBAR + 7); + offset -= j; + tmp = (32u)in[offset]; tmp += d; tmp >>= D - EXTRACTED_BITS; tmp &= mask; - tmp <<= EXTRACTED_BITS * j; - + tmplong <<= EXTRACTED_BITS; tmplong |= tmp; + j += 1; } - for j = 0 to EXTRACTED_BITS { - out[i * EXTRACTED_BITS + j] = (8u)tmplong; + j = 0; + while (j < EXTRACTED_BITS) { + offset = #LEA(i*EXTRACTED_BITS+j); + out[offset] = (8u)tmplong; tmplong >>= 8; + j += 1; } + i += 1; } return out; diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc index 2d31bde2..b93d57a2 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc @@ -163,17 +163,14 @@ fn __SB_plus_E(reg ptr u16[NBAR * NBAR] V, reg ptr u16[NNBAR] S B, reg ptr u16[N fn __matrix_add(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { reg u64 i; reg u16 tmp; - inline int k; i = 0; while (i < NBAR * NBAR) { - for k = 0 to 2 { - tmp = a[(int)i + k]; - tmp += b[(int)i + k]; - tmp &= (1 << D) - 1; - a[(int)i + k] = tmp; - } - i += 2; + tmp = a[i]; + tmp += b[i]; + tmp &= (1 << D) - 1; + a[i] = tmp; + i += 1; } return a; @@ -184,17 +181,14 @@ fn __matrix_add(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { fn __matrix_sub(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { reg u64 i; reg u16 tmp; - inline int k; i = 0; while (i < 
NBAR * NBAR) { - for k = 0 to 2 { - tmp = b[(int)i + k]; - tmp -= a[(int)i + k]; - tmp &= (1 << D) - 1; - a[(int)i + k] = tmp; - } - i += 2; + tmp = b[i]; + tmp -= a[i]; + tmp &= (1 << D) - 1; + a[i] = tmp; + i += 1; } return a; @@ -282,14 +276,15 @@ fn __ct_verify_NBAR2(reg ptr u16[NBAR * NBAR] a b) -> stack u8 { #[returnaddress="stack"] fn __ct_select(reg ptr u8[BYTES_SEC] out a b, reg u8 selector) -> stack u8[BYTES_SEC] { - inline int i; + reg u64 i; reg u8 n_selector tmp; n_selector = selector; n_selector ^= 0xFF; - for i = 0 to BYTES_SEC { + i = 0; + while (i < BYTES_SEC) { tmp = a[i]; tmp &= n_selector; out[i] = tmp; @@ -297,6 +292,8 @@ fn __ct_select(reg ptr u8[BYTES_SEC] out a b, reg u8 selector) -> stack u8[BYTES tmp = b[i]; tmp &= selector; out[i] |= tmp; + + i += 1; } return out; diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc index 998b7aec..d3aaadb8 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc @@ -125,7 +125,6 @@ fn __AS_plus_E_opt( inline int p; #spill_to_mmx reg u64 i j ij jN q; reg u32[8] sum; - reg u16 t16; reg u32 mul sp; index_seed_padding = __pad_seedA(seedA); @@ -309,19 +308,22 @@ fn __SB_plus_E_opt( #spill_to_mmx reg ptr u16[NBAR * NBAR] V, // initial value is set to E's to avoid copy #spill_to_mmx reg ptr u16[NNBAR] S B) -> reg ptr u16[NBAR * NBAR] { - reg u64 k B_offset; + reg u64 i j k iN V_offset offset; reg u32 sum mul t32; - inline int i j l; - for i = 0 to NBAR { - for j = 0 to NBAR { + i = 0; iN = 0; + while (i < NBAR) { + j = 0; + while (j < NBAR) { k = 0; - sum = (32u)V[i * NBAR + j]; + V_offset = #LEA(i*NBAR+j); + sum = (32u)V[V_offset]; while (k < N) { - mul = (32u)S[i * N + k]; + offset = #LEA(iN+k); + mul = (32u)S[offset]; - B_offset = #LEA(j + NBAR*k); - t32 = (32u)B[B_offset]; + offset = #LEA(j + NBAR*k); + t32 = (32u)B[offset]; mul *= t32; sum += mul; @@ -329,8 +331,12 @@ 
fn __SB_plus_E_opt( } sum &= (1 << D) - 1; - V[i * NBAR + j] = (16u)sum; + V[V_offset] = (16u)sum; + + j += 1; } + i += 1; + iN += N; } return V; @@ -340,20 +346,22 @@ fn __mul_BS_opt( #spill_to_mmx reg ptr u16[NBAR * NBAR] M, #spill_to_mmx reg ptr u16[NNBAR]B S) -> reg ptr u16[NBAR * NBAR] { - reg u64 k S_offset; + reg u64 i j k iN jN offset; reg u32 sum mul t32; - inline int i j; - for i = 0 to NBAR { - for j = 0 to NBAR { + i = 0; iN = 0; + while (i < NBAR) { + j = 0; jN = 0; + while (j < NBAR) { sum = 0; k = 0; while (k < N) { - mul = (32u)B[i * N + k]; + offset = #LEA(iN+k); + mul = (32u)B[offset]; - S_offset = #LEA(j*N+k); - t32 = (32u)S[S_offset]; + offset = #LEA(jN+k); + t32 = (32u)S[offset]; mul *= t32; sum += mul; @@ -361,8 +369,15 @@ fn __mul_BS_opt( k += 1; } sum &= (1 << D) - 1; - M[i * NBAR + j] = (16u)sum; + offset = #LEA(i*NBAR+j); + M[offset] = (16u)sum; + + j += 1; + jN += N; } + + i += 1; + iN += N; } return M; diff --git a/src/crypto_kem/frodo/common/amd64/ref/noise.jinc b/src/crypto_kem/frodo/common/amd64/ref/noise.jinc index 0e712299..313f28b9 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/noise.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/noise.jinc @@ -2,42 +2,42 @@ fn __sample_2NNBAR(reg ptr u16[2 * NNBAR] s) -> stack u16[2 * NNBAR] { reg ptr u16[CDF_TABLE_LEN] cdftp; cdftp = CDF_TABLE; - reg u64 i; - inline int j k; + reg u64 i j; reg u16 sample prnd sign; i = 0; while (i < 2 * NNBAR) { - for k = 0 to 2 { - sample = 0; - - // prnd = s[i] >> 1 - prnd = s[(int)i + k]; - prnd >>= 1; - - // sign = s[(int)i + k] & 0x1 - sign = s[(int)i + k]; - sign &= 0x1; - - // no need to compare with the last value - for j = 0 to CDF_TABLE_LEN - 1 { - // sample += (CDF_TABLE[j] - prnd) >> 15 - - reg u16 tmp_sample; - tmp_sample = cdftp[j]; - tmp_sample -= prnd; - tmp_sample >>= 15; - sample += tmp_sample; - } - - // s[i] = ((-sign) ^ sample) + sign - s[(int)i + k] = 0; - s[(int)i + k] -= sign; - s[(int)i + k] ^= sample; - s[(int)i + k] += sign; + 
sample = 0; + + // prnd = s[i] >> 1 + prnd = s[i]; + prnd >>= 1; + + // sign = s[i] & 0x1 + sign = s[i]; + sign &= 0x1; + + // no need to compare with the last value + j = 0; + while (j < CDF_TABLE_LEN - 1) { + // sample += (CDF_TABLE[j] - prnd) >> 15 + + reg u16 tmp_sample; + tmp_sample = cdftp[j]; + tmp_sample -= prnd; + tmp_sample >>= 15; + sample += tmp_sample; + + j += 1; } - i += 2; + // s[i] = ((-sign) ^ sample) + sign + s[i] = 0; + s[i] -= sign; + s[i] ^= sample; + s[i] += sign; + + i += 1; } return s; @@ -47,41 +47,41 @@ fn __sample_NBAR2(reg ptr u16[NBAR * NBAR] s) -> stack u16[NBAR * NBAR] { reg ptr u16[CDF_TABLE_LEN] cdftp; cdftp = CDF_TABLE; - reg u64 i; + reg u64 i j; reg u16 sample prnd sign; - inline int j k; i = 0; while (i < NBAR * NBAR) { - for k = 0 to 2 { - sample = 0; - - // prnd = s[i] >> 1 - prnd = s[(int)i + k]; - prnd >>= 1; - - // sign = s[(int)i] & 0x1 - sign = s[(int)i + k]; - sign &= 0x1; - - // no need to compare with the last value - for j = 0 to CDF_TABLE_LEN - 1 { - // sample += (CDF_TABLE[j] - prnd) >> 15 - - reg u16 tmp_sample; - tmp_sample = cdftp[j]; - tmp_sample -= prnd; - tmp_sample >>= 15; - sample += tmp_sample; - } - - // s[i] = ((-sign) ^ sample) + sign - s[(int)i + k] = 0; - s[(int)i + k] -= sign; - s[(int)i + k] ^= sample; - s[(int)i + k] += sign; + sample = 0; + + // prnd = s[i] >> 1 + prnd = s[i]; + prnd >>= 1; + + // sign = s[i] & 0x1 + sign = s[i]; + sign &= 0x1; + + // no need to compare with the last value + j = 0; + while (j < CDF_TABLE_LEN - 1) { + // sample += (CDF_TABLE[j] - prnd) >> 15 + + reg u16 tmp_sample; + tmp_sample = cdftp[j]; + tmp_sample -= prnd; + tmp_sample >>= 15; + sample += tmp_sample; + + j += 1; } - i += 2; + // s[i] = ((-sign) ^ sample) + sign + s[i] = 0; + s[i] -= sign; + s[i] ^= sample; + s[i] += sign; + + i += 1; } return s; diff --git a/src/crypto_kem/frodo/common/amd64/ref/pack.jinc b/src/crypto_kem/frodo/common/amd64/ref/pack.jinc index 62510de2..0880eb61 100644 --- 
a/src/crypto_kem/frodo/common/amd64/ref/pack.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/pack.jinc @@ -1,5 +1,5 @@ fn __pack_B(reg ptr u8[D * N]out, reg ptr u16[NNBAR] in) -> stack u8[D * N] { - reg u64 i j; + reg u64 i j l offset; inline int k MID TERM Mask; reg u64 ac tmp; reg u16 acm tm; @@ -18,11 +18,15 @@ fn __pack_B(reg ptr u8[D * N]out, reg ptr u16[NNBAR] in) -> stack u8[D * N] { acm = 0; // aggregate 1st half (16 * 4 bits) into 4 * D bits in ac - for k = 0 to 4 { - tmp = (64u)in[(int)i + k]; + l = 0; + while (l < 4) { + offset = #LEA(i + l); + tmp = (64u)in[offset]; tmp &= Mask; ac <<= D; ac |= tmp; + + l += 1; } // aggregate the 1st half of the MID in acm from ac @@ -35,23 +39,37 @@ fn __pack_B(reg ptr u8[D * N]out, reg ptr u16[NNBAR] in) -> stack u8[D * N] { } // process the 1st TERM in ac - for k = 0 to TERM { - out[(int)j + TERM - 1 - k] = ac; + l = 0; + while (l < TERM) { + offset = #LEA(j + TERM - 1); + offset -= l; + out[offset] = ac; ac >>= 8; + + l += 1; } // aggregate 2nd half (16 * 4 bits) into 4 * D bits in ac - for k = 0 to 4 { - tmp = (64u)in[(int)i + 4 + k]; + l = 0; + while (l < 4) { + offset = #LEA(i + 4 + l); + tmp = (64u)in[offset]; tmp &= Mask; ac <<= D; ac |= tmp; + + l += 1; } // process the 2nd TERM in ac - for k = 0 to TERM { - out[(int)j + D - 1 - k] = ac; + l = 0; + while (l < TERM) { + offset = #LEA(j + D - 1); + offset -= l; + out[offset] = ac; ac >>= 8; + + l += 1; } // aggregate the 2nd half of the MID in acm from ac @@ -64,9 +82,14 @@ fn __pack_B(reg ptr u8[D * N]out, reg ptr u16[NNBAR] in) -> stack u8[D * N] { ac >>= 4; } - for k = 0 to MID { - out[(int)j + TERM + MID - 1 - k] = acm; + l = 0; + while (l < MID) { + offset = #LEA(j + TERM + MID - 1); + offset -= l; + out[offset] = acm; acm >>= 8; + + l += 1; } i += 8; @@ -79,7 +102,7 @@ fn __pack_B(reg ptr u8[D * N]out, reg ptr u16[NNBAR] in) -> stack u8[D * N] { fn __unpack_B(reg ptr u16[NNBAR]out, reg ptr u8[D * N]in) -> stack u16[NNBAR] { inline int k TERM MID MASK; - 
reg u64 i j ac tmp; + reg u64 i j l ac tmp offset; reg u16 acm tm; // D = TERM + MID + TERM @@ -98,10 +121,14 @@ fn __unpack_B(reg ptr u16[NNBAR]out, reg ptr u8[D * N]in) -> stack u16[NNBAR] { acm = 0; // accumulate the MID bytes (8/16 for D = 15/16) in acm (u16) - for k = 0 to MID { - tm = (16u)in[(int)i + TERM + k]; + l = 0; + while (l < MID) { + offset = #LEA(i + TERM + l); + tm = (16u)in[offset]; acm <<= 8; acm |= tm; + + l += 1; } // aggregate 2nd half of MID from acm in ac @@ -114,25 +141,38 @@ fn __unpack_B(reg ptr u16[NNBAR]out, reg ptr u8[D * N]in) -> stack u16[NNBAR] { } // accumulate 2nd TERM bytes in ac - for k = 0 to TERM { - tmp = (64u)in[(int)i + TERM + MID + k]; + l = 0; + while (l < TERM) { + offset = #LEA(i + TERM + MID + l); + tmp = (64u)in[offset]; ac <<= 8; ac |= tmp; + + l += 1; } // result in 4 * D bits in 2nd half of output - for k = 0 to 4 { + l = 0; + while (l < 4) { tm = ac; tm &= MASK; - out[(int)j + 7 - k] = tm; + offset = #LEA(j + 7); + offset -= l; + out[offset] = tm; ac >>= D; + + l += 1; } // accumulate 1st TERM bytes in ac - for k = 0 to TERM { - tmp = (64u)in[(int)i + k]; + l = 0; + while (l < TERM) { + offset = #LEA(i + l); + tmp = (64u)in[offset]; ac <<= 8; ac |= tmp; + + l += 1; } // aggregate 2nd half of MID from acm to ac @@ -146,12 +186,17 @@ fn __unpack_B(reg ptr u16[NNBAR]out, reg ptr u8[D * N]in) -> stack u16[NNBAR] { } // result in 4 * D bits in 1st half of output - for k = 0 to 4 { - // disable implicit scaling for handling little endianess + l = 0; + while (l < 4) { + // disable implicit scaling for handling little endianness + offset = #LEA(j + 3); + offset -= l; tm = ac; tm &= MASK; - out[(int)j + 3 - k] = tm; + out[offset] = tm; ac >>= D; + + l += 1; } i += D; @@ -162,7 +207,7 @@ fn __unpack_B(reg ptr u16[NNBAR]out, reg ptr u8[D * N]in) -> stack u16[NNBAR] { } fn __pack_C(reg ptr u8[D * NBAR]out, reg ptr u16[NBAR * NBAR] in) -> stack u8[D * NBAR] { - reg u64 i j; + reg u64 i j l offset; inline int k MID TERM Mask; 
reg u64 ac tmp; reg u16 acm tm; @@ -181,11 +226,15 @@ fn __pack_C(reg ptr u8[D * NBAR]out, reg ptr u16[NBAR * NBAR] in) -> stack u8[D acm = 0; // aggregate 1st half (16 * 4 bits) into 4 * D bits in ac - for k = 0 to 4 { - tmp = (64u)in[(int)i + k]; + l = 0; + while (l < 4) { + offset = #LEA(i + l); + tmp = (64u)in[offset]; tmp &= Mask; ac <<= D; ac |= tmp; + + l += 1; } // aggregate the 1st half of the MID in acm from ac @@ -198,23 +247,37 @@ fn __pack_C(reg ptr u8[D * NBAR]out, reg ptr u16[NBAR * NBAR] in) -> stack u8[D } // process the 1st TERM in ac - for k = 0 to TERM { - out[(int)j + TERM - 1 - k] = ac; + l = 0; + while (l < TERM) { + offset = #LEA(j + TERM - 1); + offset -= l; + out[offset] = ac; ac >>= 8; + + l += 1; } // aggregate 2nd half (16 * 4 bits) into 4 * D bits in ac - for k = 0 to 4 { - tmp = (64u)in[(int)i + 4 + k]; + l = 0; + while (l < 4) { + offset = #LEA(i + 4 + l); + tmp = (64u)in[offset]; tmp &= Mask; ac <<= D; ac |= tmp; + + l += 1; } // process the 2nd TERM in ac - for k = 0 to TERM { - out[(int)j + D - 1 - k] = ac; + l = 0; + while (l < TERM) { + offset = #LEA(j + D - 1); + offset -= l; + out[offset] = ac; ac >>= 8; + + l += 1; } // aggregate the 2nd half of the MID in acm from ac @@ -227,9 +290,14 @@ fn __pack_C(reg ptr u8[D * NBAR]out, reg ptr u16[NBAR * NBAR] in) -> stack u8[D ac >>= 4; } - for k = 0 to MID { - out[(int)j + TERM + MID - 1 - k] = acm; + l = 0; + while (l < MID) { + offset = #LEA(j + TERM + MID - 1); + offset -= l; + out[offset] = acm; acm >>= 8; + + l += 1; } i += 8; @@ -242,7 +310,7 @@ fn __pack_C(reg ptr u8[D * NBAR]out, reg ptr u16[NBAR * NBAR] in) -> stack u8[D fn __unpack_C(reg ptr u16[NBAR * NBAR]out, reg ptr u8[D * NBAR]in) -> stack u16[NBAR * NBAR] { inline int k TERM MID MASK; - reg u64 i j ac tmp; + reg u64 i j l ac tmp offset; reg u16 acm tm; // D = TERM + MID + TERM @@ -261,10 +329,14 @@ fn __unpack_C(reg ptr u16[NBAR * NBAR]out, reg ptr u8[D * NBAR]in) -> stack u16[ acm = 0; // accumulate the MID bytes 
(8/16 for D = 15/16) in acm (u16) - for k = 0 to MID { - tm = (16u)in[(int)i + TERM + k]; + l = 0; + while (l < MID) { + offset = #LEA(i + TERM + l); + tm = (16u)in[offset]; acm <<= 8; acm |= tm; + + l += 1; } // aggregate 2nd half of MID from acm in ac @@ -277,25 +349,38 @@ fn __unpack_C(reg ptr u16[NBAR * NBAR]out, reg ptr u8[D * NBAR]in) -> stack u16[ } // accumulate 2nd TERM bytes in ac - for k = 0 to TERM { - tmp = (64u)in[(int)i + TERM + MID + k]; + l = 0; + while (l < TERM) { + offset = #LEA(i + TERM + MID + l); + tmp = (64u)in[offset]; ac <<= 8; ac |= tmp; + + l += 1; } // result in 4 * D bits in 2nd half of output - for k = 0 to 4 { + l = 0; + while (l < 4) { + offset = #LEA(j + 7); + offset -= l; tm = ac; tm &= MASK; - out[(int)j + 7 - k] = tm; + out[offset] = tm; ac >>= D; + + l += 1; } // accumulate 1st TERM bytes in ac - for k = 0 to TERM { - tmp = (64u)in[(int)i + k]; + l = 0; + while (l < TERM) { + offset = #LEA(i + l); + tmp = (64u)in[offset]; ac <<= 8; ac |= tmp; + + l += 1; } // aggregate 2nd half of MID from acm to ac @@ -309,12 +394,17 @@ fn __unpack_C(reg ptr u16[NBAR * NBAR]out, reg ptr u8[D * NBAR]in) -> stack u16[ } // result in 4 * D bits in 1st half of output - for k = 0 to 4 { - // disable implicit scaling for handling little endianess + l = 0; + while (l < 4) { + // disable implicit scaling for handling little endianness + offset = #LEA(j + 3); + offset -= l; tm = ac; tm &= MASK; - out[(int)j + 3 - k] = tm; + out[offset] = tm; ac >>= D; + + l += 1; } i += D; diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc index 032f4dc3..6a6fbff9 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc @@ -56,9 +56,8 @@ fn __shake128_r_opt( stack u64[25] s_state; reg ptr u64[25] state; - reg u64 j t0 zero; - reg u8 t; - inline int k INLEN OUTLEN OUTRND; + reg u64 j offset t0 zero; + inline int INLEN 
OUTLEN OUTRND; INLEN = 1 + BYTES_SEED_SE; OUTLEN = 4 * NNBAR; @@ -70,9 +69,13 @@ fn __shake128_r_opt( state[u8 0] = 0x5F; state[INLEN/8] = zero; - for k = 0 to INLEN/8 { - t0 = in[u64 k]; - state.[u64 1 + 8*k] = t0; + i = 0; + while (i < INLEN/8) { + t0 = in[u64 i]; + offset = #LEA(1+8*i); + state.[u64 offset] = t0; + + i += 1; } i = INLEN/8 + 1; @@ -85,21 +88,23 @@ fn __shake128_r_opt( state[u8 SHAKE128_RATE-1] = 0x80; i = 0; - j = 0; - while (i < OUTRND) { + while (i < OUTRND * SHAKE128_RATE/8) { () = #spill(i, j, out); state = __keccakf1600_ref1(state); () = #unspill(i, j, out); - for k = 0 to SHAKE128_RATE/8 { - t0 = state[u64 k]; - out[u64 j + k] = t0; + j = 0; + while (j < SHAKE128_RATE/8) { + t0 = state[u64 j]; + offset = #LEA(i+j); + out[u64 offset] = t0; + + j += 1; } - i += 1; - j += SHAKE128_RATE/8; + i += SHAKE128_RATE/8; } () = #spill(i, j, out); @@ -108,9 +113,11 @@ fn __shake128_r_opt( () = #unspill(i, j, out); - for k = 0 to (OUTLEN % SHAKE128_RATE) / 8 { - t0 = state[u64 k]; - out[u64 OUTRND*SHAKE128_RATE/8 + k] = t0; + i = 0; + while (i < (OUTLEN % SHAKE128_RATE) / 8) { + t0 = state[u64 i]; + out[u64 OUTRND*SHAKE128_RATE/8 + i] = t0; + i += 1; } return out; @@ -124,9 +131,8 @@ fn __shake128_pkh_opt( stack u64[25] s_state; reg ptr u64[25] state; - reg u64 j t0 zero; - reg u8 t; - inline int k INLEN OUTLEN INRND; + reg u64 j offset t0 zero; + inline int INLEN OUTLEN INRND; INLEN = BYTES_PK; INRND = INLEN/SHAKE128_RATE; @@ -144,11 +150,14 @@ fn __shake128_pkh_opt( () = #spill(out); - i = 0; j = 0; - while (i < INRND) { - for k = 0 to SHAKE128_RATE/8 { - t0 = in[u64 j + k]; - state[u64 k] ^= t0; + i = 0; + while (i < INRND * SHAKE128_RATE/8) { + j = 0; + while (j < SHAKE128_RATE/8) { + offset = #LEA(i+j); + t0 = in[u64 offset]; + state[u64 j] ^= t0; + j += 1; } () = #spill(i, j); @@ -157,13 +166,15 @@ fn __shake128_pkh_opt( () = #unspill(i, j); - i += 1; - j += SHAKE128_RATE/8; + i += SHAKE128_RATE/8; } - for k = 0 to (INLEN % SHAKE128_RATE)/8 { - t0 
= in[u64 INRND * SHAKE128_RATE/8 + k]; - state[u64 k] ^= t0; + i = 0; + while (i < (INLEN % SHAKE128_RATE)/8) { + t0 = in[u64 INRND * SHAKE128_RATE/8 + i]; + state[u64 i] ^= t0; + + i += 1; } state[u8 INLEN%SHAKE128_RATE] ^= 0x1f; @@ -175,8 +186,10 @@ fn __shake128_pkh_opt( () = #unspill(i, j, out); - for k = 0 to OUTLEN/8 { - out[u64 k] = state[u64 k]; + i = 0; + while (i < OUTLEN/8) { + out[u64 i] = state[u64 i]; + i += 1; } return out; @@ -190,17 +203,20 @@ fn __shake128_SE_k_opt( stack u64[25] s_state; reg ptr u64[25] state; - reg u64 t0 zero; - inline int k INLEN OUTLEN; + reg u64 offset t0 zero; + inline int INLEN OUTLEN; INLEN = 2 * BYTES_SEC + BYTES_SALT; OUTLEN = BYTES_SEED_SE + BYTES_SEC; state = s_state; - for k = 0 to INLEN/8 { - t0 = in.[u64 8*k]; - state[k] = t0; + i = 0; + while (i < INLEN/8) { + t0 = in[u64 i]; + state[i] = t0; + + i += 1; } ?{}, zero = #set0(); @@ -219,9 +235,13 @@ fn __shake128_SE_k_opt( () = #unspill(out); - for k = 0 to OUTLEN/8 { - t0 = state[u64 k]; - out.[u64 1 + 8*k] = t0; + i = 0; + while (i < OUTLEN/8) { + t0 = state[u64 i]; + offset = #LEA(1+8*i); + out.[u64 offset] = t0; + + i += 1; } return out; @@ -235,7 +255,7 @@ fn __shake128_encap_r_opt( stack u64[25] s_state; reg ptr u64[25] state; - reg u64 j t0 zero; + reg u64 j offset t0 zero; reg u8 t; inline int k INLEN OUTLEN OUTRND; @@ -249,9 +269,11 @@ fn __shake128_encap_r_opt( state[u8 0] = 0x5F; state[INLEN/8] = zero; - for k = 0 to INLEN/8 { - t0 = in[u64 k]; - state[u64 k] = t0; + i = 0; + while (i < INLEN/8) { + t0 = in[u64 i]; + state[u64 i] = t0; + i += 1; } for k = 0 to INLEN%8 { @@ -269,21 +291,23 @@ fn __shake128_encap_r_opt( state[u8 SHAKE128_RATE-1] = 0x80; i = 0; - j = 0; - while (i < OUTRND) { - () = #spill(i, j, out); + while (i < OUTRND * SHAKE128_RATE/8) { + () = #spill(i, out); state = __keccakf1600_ref1(state); - () = #unspill(i, j, out); + () = #unspill(i, out); + + j = 0; + while (j < SHAKE128_RATE/8) { + t0 = state[u64 j]; + offset = #LEA(i+j); + 
out[u64 offset] = t0; - for k = 0 to SHAKE128_RATE/8 { - t0 = state[u64 k]; - out[u64 j + k] = t0; + j += 1; } - i += 1; - j += SHAKE128_RATE/8; + i += SHAKE128_RATE/8; } () = #spill(i, j, out); @@ -292,9 +316,12 @@ fn __shake128_encap_r_opt( () = #unspill(i, j, out); - for k = 0 to (OUTLEN % SHAKE128_RATE) / 8 { - t0 = state[u64 k]; - out[u64 OUTRND*SHAKE128_RATE/8 + k] = t0; + i = 0; + while (i < (OUTLEN % SHAKE128_RATE) / 8) { + t0 = state[u64 i]; + out[u64 OUTRND*SHAKE128_RATE/8 + i] = t0; + + i += 1; } return out; @@ -308,9 +335,8 @@ fn __shake128_ss_opt( stack u64[25] s_state; reg ptr u64[25] state; - reg u64 j t0 zero; - reg u8 t; - inline int k INLEN OUTLEN INRND; + reg u64 j offset t0 zero; + inline int INLEN OUTLEN INRND; INLEN = BYTES_CT + BYTES_SEC; INRND = INLEN/SHAKE128_RATE; @@ -328,11 +354,15 @@ fn __shake128_ss_opt( () = #spill(out); - i = 0; j = 0; - while (i < INRND) { - for k = 0 to SHAKE128_RATE/8 { - t0 = in[u64 j + k]; - state[u64 k] ^= t0; + i = 0; + while (i < INRND*SHAKE128_RATE/8) { + j = 0; + while (j < SHAKE128_RATE/8) { + offset = #LEA(i+j); + t0 = in[u64 offset]; + state[u64 j] ^= t0; + + j += 1; } () = #spill(i, j); @@ -341,13 +371,15 @@ fn __shake128_ss_opt( () = #unspill(i, j); - i += 1; - j += SHAKE128_RATE/8; + i += SHAKE128_RATE/8; } - for k = 0 to (INLEN % SHAKE128_RATE)/8 { - t0 = in[u64 INRND * SHAKE128_RATE/8 + k]; - state[u64 k] ^= t0; + i = 0; + while (i < (INLEN % SHAKE128_RATE)/8) { + t0 = in[u64 INRND * SHAKE128_RATE/8 + i]; + state[u64 i] ^= t0; + + i += 1; } state[u8 INLEN%SHAKE128_RATE] ^= 0x1f; @@ -359,8 +391,10 @@ fn __shake128_ss_opt( () = #unspill(i, j, out); - for k = 0 to OUTLEN/8 { - out[u64 k] = state[u64 k]; + i = 0; + while (i < OUTLEN/8) { + out[u64 i] = state[u64 i]; + i += 1; } return out; From b598ceef884e424fc260592521021f289fc382c3 Mon Sep 17 00:00:00 2001 From: "iThing-han, Lim" Date: Thu, 28 Mar 2024 18:54:27 +0800 Subject: [PATCH 12/14] refactor(frodo): optimize shake128_gen_A_opt --- 
.../frodo/common/amd64/ref/matrix_opt.jinc | 88 +------------------ .../frodo/common/amd64/ref/shake128_opt.jinc | 74 ++++++++++++++++ 2 files changed, 76 insertions(+), 86 deletions(-) diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc index d3aaadb8..ecb4ba52 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc @@ -2,90 +2,6 @@ // notes: "16" instead of BYTES_SEED_A on purpose // compilation should fail if BYTES_SEED_A changes -fn _shake128_gen_A_opt( - #spill_to_mmx reg ptr u8[2 * N * 8] out, // note: this implementation should work for N multiple of 4 - #spill_to_mmx reg u64 out_offset, // out_offset in bytes - #spill_to_mmx reg ptr u8[16+2+6] index_seed_padding) - -> - reg ptr u8[2 * N * 8], - reg u64 -{ - #spill_to_mmx reg u64 i; - - stack u64[25] s_state; - reg ptr u64[25] state; - reg u64 j t0 t1 t2 zero; - reg u8 v0; - - state = s_state; - - t0 = index_seed_padding[u64 0]; - t1 = index_seed_padding[u64 1]; - t2 = index_seed_padding[u64 2]; - ?{}, zero = #set0(); - - state[0] = t0; - state[1] = t1; - state[2] = t2; - - i = 3; - while (i < 25) - { state[i] = zero; - i += 1; - } - - state[u8 167] = 0x80; - - // out_offset in u64 words - out_offset >>= 3; - - // notes: - // - i is incremented in the inner loop - // - this function needs to output 2*N bytes - // - each iteration produces 168 bytes, but is incremented in "u64 mode" (by 21) - // - for N=640, 1280 bytes are needed, in 7 iterations, we produce 1176 bytes - // and i will be 147. 
- i = 0; - while (i < (2*N)/168) - { - () = #spill(i, out, out_offset); - - state = __keccakf1600_ref1(state); - - () = #unspill(i, out, out_offset); - - j = 0; - while (j < 21) - { t0 = state[j]; - out[u64 out_offset] = t0; - out_offset += 1; - j += 1; - } - - i += 1; - } - - () = #spill(i, out, out_offset); - - state = __keccakf1600_ref1(state); - - () = #unspill(i, out, out_offset); - - out_offset <<= 3; // in bytes again - i *= 168; - j = 0; - while (i < 2*N) - { - v0 = state[u8 (int)j]; - out[(int)out_offset] = v0; - out_offset += 1; - i += 1; - j += 1; - } - - return out, out_offset; -} - inline fn __pad_seedA(reg ptr u8[16] seedA) -> reg ptr u8[2 + 16 + 6] { reg u64 i j; stack u8[2+16+ 6] s_index_seed_padding; @@ -153,7 +69,7 @@ fn __AS_plus_E_opt( () = #spill(i, j); index_seed_padding[u16 0] = (16u) ij; - A, A_offset = _shake128_gen_A_opt(A, A_offset, index_seed_padding); + A, A_offset = __shake128_gen_A_opt(A, A_offset, index_seed_padding); () = #unspill(i, j); j += 1; @@ -246,7 +162,7 @@ fn __SA_plus_E_opt( () = #spill(i, j); index_seed_padding[u16 0] = (16u) ij; - A, A_offset = _shake128_gen_A_opt(A, A_offset, index_seed_padding); + A, A_offset = __shake128_gen_A_opt(A, A_offset, index_seed_padding); () = #unspill(i, j); j += 1; diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc index 6a6fbff9..81b3ca1b 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc @@ -1,6 +1,80 @@ from Jade require "common/keccak/keccak1600/amd64/ref1/keccak1600.jinc" param int SHAKE128_RATE = 168; +fn __shake128_gen_A_opt( + #spill_to_mmx reg ptr u8[2 * N * 8] out, + #spill_to_mmx reg u64 out_offset, + #spill_to_mmx reg const ptr u8[BYTES_SEED_A + 2 + 6] in) +-> reg ptr u8[2 * N * 8], + reg u64 +{ + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 j offset t0 zero; + reg u16 t16; + inline 
int k INLEN OUTLEN OUTRND; + + INLEN = BYTES_SEED_A + 2; + OUTLEN = 2 * N; + OUTRND = OUTLEN/SHAKE128_RATE; + + state = s_state; + + ?{}, zero = #set0(); + state[u8 0] = 0x5F; + + for k = 0 to INLEN/8 + 1 { + t0 = in[u64 k]; + state[u64 k] = t0; + } + + i = INLEN/8 + 1; + while (i < 25) { + state[i] = zero; + i += 1; + } + + state[u8 INLEN] = 0x1f; + state[u8 SHAKE128_RATE-1] = 0x80; + + i = 0; + while (i < OUTRND * SHAKE128_RATE/8) { + () = #spill(i, j, out, out_offset); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out, out_offset); + + j = 0; + while (j < SHAKE128_RATE/8) { + t0 = state[u64 j]; + out[u64 out_offset] = t0; + out_offset += 1; + + j += 1; + } + + i += SHAKE128_RATE/8; + } + + () = #spill(i, j, out, out_offset); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out, out_offset); + + i = 0; + while (i < (OUTLEN % SHAKE128_RATE) / 8) { + t0 = state[u64 i]; + out[u64 out_offset] = t0; + out_offset += 1; + i += 1; + } + + return out, out_offset; +} fn __shake128_seed_A_opt( #spill_to_mmx reg ptr u8[BYTES_SEED_A] out, From ddcfc09b07bf7688dd0fb87b41920516a8b213d0 Mon Sep 17 00:00:00 2001 From: "iThing-han, Lim" Date: Fri, 19 Apr 2024 14:25:28 +0800 Subject: [PATCH 13/14] refactor(frodo): cleanup frodo640 and common --- .../frodo/common/amd64/ref/matrix.jinc | 192 ------------------ .../frodo/common/amd64/ref/matrix_mul.jinc | 191 +++++++++++++++++ .../{matrix_opt.jinc => matrix_mul_opt.jinc} | 1 - .../frodo/common/amd64/ref/shake128_opt.jinc | 3 +- .../frodo/frodo640shake/amd64/ref/kem.jinc | 3 +- 5 files changed, 193 insertions(+), 197 deletions(-) create mode 100644 src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc rename src/crypto_kem/frodo/common/amd64/ref/{matrix_opt.jinc => matrix_mul_opt.jinc} (99%) diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc index b93d57a2..535c338a 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc +++ 
b/src/crypto_kem/frodo/common/amd64/ref/matrix.jinc @@ -1,165 +1,3 @@ -from Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc" - -#[returnaddress="stack"] -fn __AS_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[NNBAR] S E) -> stack u16[NNBAR] { - stack ptr u16[NNBAR] s_B; - stack u16[N] A_row; - stack ptr u8[BYTES_SEED_A] s_seedA; - stack ptr u16[NNBAR] s_S s_E; - stack u8[2 + BYTES_SEED_A] b; - - reg u64 j k; stack u64 s_j s_k; - reg u16 tmp ac; - inline int i l; - - s_B = B; s_S = S; s_E = E; - - // copy seedA - for i = 0 to BYTES_SEED_A { - b[i + 2] = seedA[i]; - } - s_seedA = seedA; - - // first set B = E - B = s_B; E = s_E; - - j = 0; - while (j < NNBAR) { - for l = 0 to 4 { - B[(int)j + l] = E[(int)j + l]; - } - j += 4; - } - - s_B = B; s_E = E; - - // calculate A and B += A * S - b[u16 0] = 0; - k = 0; - - while (b[u16 0] < N) { - s_j = j; s_k = k; s_S = S; s_B = B; - A_row = __shake128_gen_A(A_row, b); - j = s_j; k = s_k; S = s_S; B = s_B; - - for i = 0 to NBAR { - ac = 0; - j = 0; - - // A_row * S_T_row - while (j < N) { - tmp = A_row[(int)j]; - tmp *= S[i * N + (int)j]; - ac += tmp; - j += 1; - } - - B[(int)k + i] += ac; - } - k += NBAR; - - b[u16 0] += 1; - } - - return B; -} - -#[returnaddress="stack"] -fn __SA_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[NNBAR] S E) -> stack u16[NNBAR] { - stack ptr u16[NNBAR] s_B; - stack u16[N] A_row; - stack ptr u8[BYTES_SEED_A] s_seedA; - stack ptr u16[NNBAR] s_S s_E; - stack u8[2 + BYTES_SEED_A] b; - - reg u64 j k; stack u64 s_j s_k; - reg u16 tmp s; - inline int l; - - // copy seedA - for l = 0 to BYTES_SEED_A { - b[l + 2] = seedA[l]; - } - s_seedA = seedA; - - j = 0; - while (j < NNBAR) { - for l = 0 to 4 { - B[(int)j + l] = E[(int)j + l]; - } - j += 4; - } - s_B = B; s_S = S; s_E = E; - - // calculate A and B += S * A - b[u16 0] = 0; - - while (b[u16 0] < N) { - A_row = __shake128_gen_A(A_row, b); - - for l = 0 to NBAR { - k = s_k; S = s_S; - - k = 
(64u)b[u16 0]; - s = S[l * N + (int)k]; - - s_k = k; s_S = S; - - j = s_j; B = s_B; - j = 0; - while (j < N) { - tmp = A_row[(int)j]; - tmp *= s; - B[l * N + (int)j] += tmp; - - j += 1; - } - s_j = j; s_B = B; - } - - b[u16 0] += 1; - } - - return B; -} - -#[returnaddress="stack"] -fn __SB_plus_E(reg ptr u16[NBAR * NBAR] V, reg ptr u16[NNBAR] S B, reg ptr u16[NBAR * NBAR] E) -> stack u16[NBAR * NBAR] { - reg u64 k tj; - reg u16 tmp ac; - inline int i j l; - - k = 0; - while (k < NBAR * NBAR) { - for l = 0 to 4 { - V[(int)k + l] = E[(int)k + l]; - } - k += 4; - } - - for i = 0 to NBAR { - for j = 0 to NBAR { - k = 0; - ac = 0; - while (k < N) { - tmp = S[i * N + (int)k]; - - // NOTE: why is this needed ? - tj = j + NBAR * k; - tmp *= B[(int)tj]; - - ac += tmp; - k += 1; - } - - V[i * NBAR + j] += ac; - V[i * NBAR + j] &= (1 << D) - 1; - } - } - - return V; -} - fn __matrix_add(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { reg u64 i; reg u16 tmp; @@ -194,36 +32,6 @@ fn __matrix_sub(reg ptr u16[NBAR * NBAR] a b) -> stack u16[NBAR * NBAR] { return a; } -#[returnaddress="stack"] -fn __mul_BS(reg ptr u16[NBAR * NBAR] M, reg ptr u16[NNBAR]B S) -> stack u16[NBAR * NBAR] { - reg u64 k tj; - reg u16 tmp; - inline int i j; - - for i = 0 to NBAR { - for j = 0 to NBAR { - M[i * NBAR + j] = 0; - - k = 0; - while (k < N) { - tmp = B[i * N + (int)k]; - - tj = j * N; - // NOTE: why is this needed ? 
register allocation, k and tj must be merged will be raised - tj += k; - tmp *= S[(int)tj]; - - M[i * NBAR + j] += tmp; - - k += 1; - } - M[i * NBAR + j] &= (1 << D) - 1; - } - } - - return M; -} - #[returnaddress="stack"] fn __ct_verify_NNBAR(reg ptr u16[NNBAR] a b) -> stack u8 { reg u64 i; diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc new file mode 100644 index 00000000..0008277a --- /dev/null +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix_mul.jinc @@ -0,0 +1,191 @@ +from Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc" + +#[returnaddress="stack"] +fn __AS_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[NNBAR] S E) -> stack u16[NNBAR] { + stack ptr u16[NNBAR] s_B; + stack u16[N] A_row; + stack ptr u8[BYTES_SEED_A] s_seedA; + stack ptr u16[NNBAR] s_S s_E; + stack u8[2 + BYTES_SEED_A] b; + + reg u64 j k; stack u64 s_j s_k; + reg u16 tmp ac; + inline int i l; + + s_B = B; s_S = S; s_E = E; + + // copy seedA + for i = 0 to BYTES_SEED_A { + b[i + 2] = seedA[i]; + } + s_seedA = seedA; + + // first set B = E + B = s_B; E = s_E; + + j = 0; + while (j < NNBAR) { + for l = 0 to 4 { + B[(int)j + l] = E[(int)j + l]; + } + j += 4; + } + + s_B = B; s_E = E; + + // calculate A and B += A * S + b[u16 0] = 0; + k = 0; + + while (b[u16 0] < N) { + s_j = j; s_k = k; s_S = S; s_B = B; + A_row = __shake128_gen_A(A_row, b); + j = s_j; k = s_k; S = s_S; B = s_B; + + for i = 0 to NBAR { + ac = 0; + j = 0; + + // A_row * S_T_row + while (j < N) { + tmp = A_row[(int)j]; + tmp *= S[i * N + (int)j]; + ac += tmp; + j += 1; + } + + B[(int)k + i] += ac; + } + k += NBAR; + + b[u16 0] += 1; + } + + return B; +} + +#[returnaddress="stack"] +fn __SA_plus_E(reg ptr u16[NNBAR] B, reg ptr u8[BYTES_SEED_A]seedA, reg ptr u16[NNBAR] S E) -> stack u16[NNBAR] { + stack ptr u16[NNBAR] s_B; + stack u16[N] A_row; + stack ptr u8[BYTES_SEED_A] s_seedA; + stack ptr u16[NNBAR] s_S s_E; + stack 
u8[2 + BYTES_SEED_A] b; + + reg u64 j k; stack u64 s_j s_k; + reg u16 tmp s; + inline int l; + + // copy seedA + for l = 0 to BYTES_SEED_A { + b[l + 2] = seedA[l]; + } + s_seedA = seedA; + + j = 0; + while (j < NNBAR) { + for l = 0 to 4 { + B[(int)j + l] = E[(int)j + l]; + } + j += 4; + } + s_B = B; s_S = S; s_E = E; + + // calculate A and B += S * A + b[u16 0] = 0; + + while (b[u16 0] < N) { + A_row = __shake128_gen_A(A_row, b); + + for l = 0 to NBAR { + k = s_k; S = s_S; + + k = (64u)b[u16 0]; + s = S[l * N + (int)k]; + + s_k = k; s_S = S; + + j = s_j; B = s_B; + j = 0; + while (j < N) { + tmp = A_row[(int)j]; + tmp *= s; + B[l * N + (int)j] += tmp; + + j += 1; + } + s_j = j; s_B = B; + } + + b[u16 0] += 1; + } + + return B; +} + +#[returnaddress="stack"] +fn __SB_plus_E(reg ptr u16[NBAR * NBAR] V, reg ptr u16[NNBAR] S B, reg ptr u16[NBAR * NBAR] E) -> stack u16[NBAR * NBAR] { + reg u64 k tj; + reg u16 tmp ac; + inline int i j l; + + k = 0; + while (k < NBAR * NBAR) { + for l = 0 to 4 { + V[(int)k + l] = E[(int)k + l]; + } + k += 4; + } + + for i = 0 to NBAR { + for j = 0 to NBAR { + k = 0; + ac = 0; + while (k < N) { + tmp = S[i * N + (int)k]; + + // NOTE: why is this needed ? + tj = j + NBAR * k; + tmp *= B[(int)tj]; + + ac += tmp; + k += 1; + } + + V[i * NBAR + j] += ac; + V[i * NBAR + j] &= (1 << D) - 1; + } + } + + return V; +} + +#[returnaddress="stack"] +fn __mul_BS(reg ptr u16[NBAR * NBAR] M, reg ptr u16[NNBAR]B S) -> stack u16[NBAR * NBAR] { + reg u64 k tj; + reg u16 tmp; + inline int i j; + + for i = 0 to NBAR { + for j = 0 to NBAR { + M[i * NBAR + j] = 0; + + k = 0; + while (k < N) { + tmp = B[i * N + (int)k]; + + tj = j * N; + // NOTE: why is this needed ? 
register allocation, k and tj must be merged will be raised + tj += k; + tmp *= S[(int)tj]; + + M[i * NBAR + j] += tmp; + + k += 1; + } + M[i * NBAR + j] &= (1 << D) - 1; + } + } + + return M; +} diff --git a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/matrix_mul_opt.jinc similarity index 99% rename from src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc rename to src/crypto_kem/frodo/common/amd64/ref/matrix_mul_opt.jinc index ecb4ba52..deca2d84 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/matrix_mul_opt.jinc @@ -1,4 +1,3 @@ - // notes: "16" instead of BYTES_SEED_A on purpose // compilation should fail if BYTES_SEED_A changes diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc index 81b3ca1b..320c29af 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc @@ -12,8 +12,7 @@ fn __shake128_gen_A_opt( stack u64[25] s_state; reg ptr u64[25] state; - reg u64 j offset t0 zero; - reg u16 t16; + reg u64 j t0 zero; inline int k INLEN OUTLEN OUTRND; INLEN = BYTES_SEED_A + 2; diff --git a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc index c1d06e7b..922c5a50 100644 --- a/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc +++ b/src/crypto_kem/frodo/frodo640shake/amd64/ref/kem.jinc @@ -1,9 +1,8 @@ -from Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/encode.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/noise.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/matrix.jinc" -from Jade require "crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc" +from Jade require 
"crypto_kem/frodo/common/amd64/ref/matrix_mul_opt.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/pack.jinc" // coins = s || seed SE || z From e7e9e783f6945ca62a3f9d583d39f05ee0803792 Mon Sep 17 00:00:00 2001 From: "iThing-han, Lim" Date: Fri, 19 Apr 2024 14:55:35 +0800 Subject: [PATCH 14/14] refactor(frodo): optimize frodo976shake --- .../frodo/common/amd64/ref/encode.jinc | 6 +- .../frodo/common/amd64/ref/shake256_opt.jinc | 401 ++++++++++++++++++ .../frodo/frodo976shake/amd64/ref/kem.jinc | 260 ++++++------ 3 files changed, 523 insertions(+), 144 deletions(-) create mode 100644 src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc diff --git a/src/crypto_kem/frodo/common/amd64/ref/encode.jinc b/src/crypto_kem/frodo/common/amd64/ref/encode.jinc index 499b877c..28975366 100644 --- a/src/crypto_kem/frodo/common/amd64/ref/encode.jinc +++ b/src/crypto_kem/frodo/common/amd64/ref/encode.jinc @@ -8,8 +8,9 @@ fn __encode(reg ptr u16[NBAR * NBAR]out, reg ptr u8[EXTRACTED_BITS * NBAR * NBAR while (i < NBAR) { tmp = 0; + offset = i * EXTRACTED_BITS; for k = 0 to EXTRACTED_BITS { - tmp2 = (64u)in[i * EXTRACTED_BITS + k]; + tmp2 = (64u)in[offset + k]; tmp2 <<= 8 * k; tmp |= tmp2; } @@ -61,11 +62,12 @@ fn __decode(reg ptr u8[EXTRACTED_BITS * NBAR] out, reg ptr u16[NBAR * NBAR] in) } j = 0; + offset = i*EXTRACTED_BITS; while (j < EXTRACTED_BITS) { - offset = #LEA(i*EXTRACTED_BITS+j); out[offset] = (8u)tmplong; tmplong >>= 8; j += 1; + offset += 1; } i += 1; } diff --git a/src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc b/src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc new file mode 100644 index 00000000..177ee356 --- /dev/null +++ b/src/crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc @@ -0,0 +1,401 @@ +from Jade require "common/keccak/keccak1600/amd64/ref1/keccak1600.jinc" + +param int SHAKE256_RATE = 136; + +fn __shake256_seed_A_opt( + #spill_to_mmx reg ptr u8[BYTES_SEED_A] out, + #spill_to_mmx reg const ptr u8[BYTES_SEED_A] in) + -> reg ptr 
u8[BYTES_SEED_A] { + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 t0 zero; + inline int k INLEN OUTLEN; + + INLEN = BYTES_SEED_A; + OUTLEN = BYTES_SEED_A; + + state = s_state; + + for k = 0 to INLEN/8 { + t0 = in[u64 k]; + state[k] = t0; + } + ?{}, zero = #set0(); + + i = INLEN/8; + while (i < 25) { + state[i] = zero; + i += 1; + } + + state[u8 INLEN] = 0x1f; + state[u8 SHAKE256_RATE-1] = 0x80; + + () = #spill(out); + + state = __keccakf1600_ref1(state); + + () = #unspill(out); + + for k = 0 to OUTLEN/8 { + t0 = state[u64 k]; + out[u64 k] = t0; + } + + return out; +} + +fn __shake256_r_opt( + #spill_to_mmx reg ptr u8[4 * NNBAR] out, + #spill_to_mmx reg const ptr u8[BYTES_SEED_SE] in) +-> reg ptr u8[4 * NNBAR] +{ + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 j offset t0 zero; + inline int INLEN OUTLEN OUTRND; + + INLEN = 1 + BYTES_SEED_SE; + OUTLEN = 4 * NNBAR; + OUTRND = OUTLEN/SHAKE256_RATE; + + state = s_state; + + ?{}, zero = #set0(); + state[u8 0] = 0x5F; + state[INLEN/8] = zero; + + i = 0; + while (i < INLEN/8) { + t0 = in[u64 i]; + offset = #LEA(1+8*i); + state.[u64 offset] = t0; + + i += 1; + } + + i = INLEN/8 + 1; + while (i < 25) { + state[i] = zero; + i += 1; + } + + state[u8 INLEN] = 0x1f; + state[u8 SHAKE256_RATE-1] = 0x80; + + i = 0; + while (i < OUTRND * SHAKE256_RATE/8) { + () = #spill(i, j, out); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out); + + j = 0; + while (j < SHAKE256_RATE/8) { + t0 = state[u64 j]; + offset = #LEA(i+j); + out[u64 offset] = t0; + + j += 1; + } + + i += SHAKE256_RATE/8; + } + + () = #spill(i, j, out); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out); + + i = 0; + while (i < (OUTLEN % SHAKE256_RATE) / 8) { + t0 = state[u64 i]; + out[u64 OUTRND*SHAKE256_RATE/8 + i] = t0; + i += 1; + } + + return out; +} + +fn __shake256_pkh_opt( + #spill_to_mmx reg ptr u8[BYTES_SEC] out, + #spill_to_mmx reg const ptr 
u8[BYTES_PK] in) +-> reg ptr u8[BYTES_SEC] { + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 j offset t0 zero; + inline int INLEN OUTLEN INRND; + + INLEN = BYTES_PK; + INRND = INLEN/SHAKE256_RATE; + OUTLEN = BYTES_SEC; + + state = s_state; + + ?{}, zero = #set0(); + + i = 0; + while (i < 25) { + state[i] = zero; + i += 1; + } + + () = #spill(out); + + i = 0; + while (i < INRND * SHAKE256_RATE/8) { + j = 0; + while (j < SHAKE256_RATE/8) { + offset = #LEA(i+j); + t0 = in[u64 offset]; + state[u64 j] ^= t0; + j += 1; + } + + () = #spill(i, j); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j); + + i += SHAKE256_RATE/8; + } + + i = 0; + while (i < (INLEN % SHAKE256_RATE)/8) { + t0 = in[u64 INRND * SHAKE256_RATE/8 + i]; + state[u64 i] ^= t0; + + i += 1; + } + + state[u8 INLEN%SHAKE256_RATE] ^= 0x1f; + state[u8 SHAKE256_RATE-1] ^= 0x80; + + () = #spill(i, j); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out); + + i = 0; + while (i < OUTLEN/8) { + out[u64 i] = state[u64 i]; + i += 1; + } + + return out; +} + +fn __shake256_SE_k_opt( + #spill_to_mmx reg ptr u8[1 + BYTES_SEED_SE + BYTES_SEC] out, + #spill_to_mmx reg const ptr u8[2 * BYTES_SEC + BYTES_SALT] in) +-> reg ptr u8[1 + BYTES_SEED_SE + BYTES_SEC] { + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 offset t0 zero; + inline int INLEN OUTLEN; + + INLEN = 2 * BYTES_SEC + BYTES_SALT; + OUTLEN = BYTES_SEED_SE + BYTES_SEC; + + state = s_state; + + i = 0; + while (i < INLEN/8) { + t0 = in[u64 i]; + state[i] = t0; + + i += 1; + } + ?{}, zero = #set0(); + + i = INLEN/8; + while (i < 25) { + state[i] = zero; + i += 1; + } + + state[u8 INLEN] = 0x1f; + state[u8 SHAKE256_RATE-1] = 0x80; + + () = #spill(out); + + state = __keccakf1600_ref1(state); + + () = #unspill(out); + + i = 0; + while (i < OUTLEN/8) { + t0 = state[u64 i]; + offset = #LEA(1+8*i); + out.[u64 offset] = t0; + + i += 1; + } + + return out; +} + +fn 
__shake256_encap_r_opt( + #spill_to_mmx reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] out, + #spill_to_mmx reg const ptr u8[1 + BYTES_SEED_SE] in) +-> reg ptr u8[2 * (2 * NNBAR + NBAR * NBAR)] { + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 j offset t0 zero; + reg u8 t; + inline int k INLEN OUTLEN OUTRND; + + INLEN = 1 + BYTES_SEED_SE; + OUTLEN = 2 * (2 * NNBAR + NBAR * NBAR); + OUTRND = OUTLEN/SHAKE256_RATE; + + state = s_state; + + ?{}, zero = #set0(); + state[u8 0] = 0x5F; + state[INLEN/8] = zero; + + i = 0; + while (i < INLEN/8) { + t0 = in[u64 i]; + state[u64 i] = t0; + i += 1; + } + + for k = 0 to INLEN%8 { + t = in[INLEN-INLEN%8 + k]; + state[u8 INLEN-INLEN%8 + k] = t; + } + + i = INLEN/8 + 1; + while (i < 25) { + state[i] = zero; + i += 1; + } + + state[u8 INLEN] = 0x1f; + state[u8 SHAKE256_RATE-1] = 0x80; + + i = 0; + while (i < OUTRND * SHAKE256_RATE/8) { + () = #spill(i, out); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, out); + + j = 0; + while (j < SHAKE256_RATE/8) { + t0 = state[u64 j]; + offset = #LEA(i+j); + out[u64 offset] = t0; + + j += 1; + } + + i += SHAKE256_RATE/8; + } + + () = #spill(i, j, out); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out); + + i = 0; + while (i < (OUTLEN % SHAKE256_RATE) / 8) { + t0 = state[u64 i]; + out[u64 OUTRND*SHAKE256_RATE/8 + i] = t0; + + i += 1; + } + + return out; +} + +fn __shake256_ss_opt( + #spill_to_mmx reg ptr u8[BYTES_SEC] out, + #spill_to_mmx reg const ptr u8[BYTES_CT + BYTES_SEC] in) +-> reg ptr u8[BYTES_SEC] { + #spill_to_mmx reg u64 i; + + stack u64[25] s_state; + reg ptr u64[25] state; + reg u64 j offset t0 zero; + inline int INLEN OUTLEN INRND; + + INLEN = BYTES_CT + BYTES_SEC; + INRND = INLEN/SHAKE256_RATE; + OUTLEN = BYTES_SEC; + + state = s_state; + + ?{}, zero = #set0(); + + i = 0; + while (i < 25) { + state[i] = zero; + i += 1; + } + + () = #spill(out); + + i = 0; + while (i < INRND*SHAKE256_RATE/8) { + j = 0; + while (j < 
SHAKE256_RATE/8) { + offset = #LEA(i+j); + t0 = in[u64 offset]; + state[u64 j] ^= t0; + + j += 1; + } + + () = #spill(i, j); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j); + + i += SHAKE256_RATE/8; + } + + i = 0; + while (i < (INLEN % SHAKE256_RATE)/8) { + t0 = in[u64 INRND * SHAKE256_RATE/8 + i]; + state[u64 i] ^= t0; + + i += 1; + } + + state[u8 INLEN%SHAKE256_RATE] ^= 0x1f; + state[u8 SHAKE256_RATE-1] ^= 0x80; + + () = #spill(i, j); + + state = __keccakf1600_ref1(state); + + () = #unspill(i, j, out); + + i = 0; + while (i < OUTLEN/8) { + out[u64 i] = state[u64 i]; + i += 1; + } + + return out; +} diff --git a/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc index 94508463..c1ed48cd 100644 --- a/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc +++ b/src/crypto_kem/frodo/frodo976shake/amd64/ref/kem.jinc @@ -1,44 +1,20 @@ -from Jade require "crypto_kem/frodo/common/amd64/ref/shake256.jinc" -from Jade require "crypto_kem/frodo/common/amd64/ref/shake128.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/shake128_opt.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/shake256_opt.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/encode.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/noise.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/matrix.jinc" -from Jade require "crypto_kem/frodo/common/amd64/ref/matrix_opt.jinc" +from Jade require "crypto_kem/frodo/common/amd64/ref/matrix_mul_opt.jinc" from Jade require "crypto_kem/frodo/common/amd64/ref/pack.jinc" -#[returnaddress="stack"] -fn __gen_SE(reg ptr u16[2 * NNBAR] SE, reg ptr u8[BYTES_SEED_SE] r) -> stack u16[2 * NNBAR] { - stack u8[1 + BYTES_SEED_SE] seed_se; - reg u64 i; stack u64 s_i; - - r = r; - i = 0; - while (i < BYTES_SEED_SE) { - seed_se[(int)i + 1] = r[(int)i]; - i += 1; - } - s_i = i; - - // S || E - seed_se[0] = 0x5F; - - SE = __shake256_r(SE, seed_se); - SE = 
__sample_2NNBAR(SE); - - return SE; -} - // coins = s || seed SE || z -fn __frodo_amd64_ref_keypair_derand(reg u64 pkp skp, reg ptr u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins) { +fn __frodo_amd64_ref_keypair_derand( + reg u64 pkp skp, + #spill_to_mmx reg ptr u8[BYTES_SEED_A + BYTES_SEED_SE + BYTES_SEC] coins) { stack u16[2 * NNBAR] SE; stack u16[NNBAR] B; - stack u64 s_pkp s_skp; - reg u64 i; stack u64 s_i; - - s_pkp = pkp; - s_skp = skp; - coins = coins; + inline int k; + reg u64 i j; // seedA || b stack u8[BYTES_PK] pk; @@ -46,59 +22,65 @@ fn __frodo_amd64_ref_keypair_derand(reg u64 pkp skp, reg ptr u8[BYTES_SEED_A + B // s || seedA || b || S_T || pkh stack u8[BYTES_SK] sk; + () = #spill(i, j, pkp, skp); + + for k = 0 to BYTES_SEC/8 { + sk[u64 k] = coins[u64 k]; + } + // gen seedA - pk[0:BYTES_SEED_A] = __shake256_seed_A(pk[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]); + pk[0:BYTES_SEED_A] = __shake256_seed_A_opt(pk[0:BYTES_SEED_A], coins[BYTES_SEC + BYTES_SEED_SE:BYTES_SEED_A]); // gen S || E - SE = __gen_SE(SE, coins[BYTES_SEC:BYTES_SEED_SE]); + SE = __shake256_r_opt(SE, coins[BYTES_SEC:BYTES_SEED_SE]); + SE = __sample_2NNBAR(SE); + + () = #spill(coins); // B = A*S+E - B = __AS_plus_E(B, pk[0:BYTES_SEED_A], SE[0:NNBAR], SE[NNBAR:NNBAR]); + B = __AS_plus_E_opt(B, pk[0:BYTES_SEED_A], SE[0:NNBAR], SE[NNBAR:NNBAR]); // pack pk[BYTES_SEED_A:D * N] = __pack_B(pk[BYTES_SEED_A:D * N], B); - // - i = s_i; i = 0; - while (i < BYTES_SEC) { - sk[(int) i] = coins[(int)i]; - i += 1; - } - + () = #unspill(i); i = 0; - while (i < BYTES_PK) { - sk[BYTES_SEC + (int)i] = pk[(int)i]; + while (i < BYTES_PK/8) { + sk[u64 BYTES_SEC/8 + i] = pk[u64 i]; i += 1; } i = 0; - while (i < 2 * NNBAR) { - sk[BYTES_SEC + BYTES_PK + (int)i] = SE.[u8 (int)i]; + while (i < 2 * NNBAR / 8) { + sk[u64 BYTES_SEC/8 + BYTES_PK/8 + i] = SE[u64 i]; i += 1; } - s_i = i; + () = #spill(i); - sk[BYTES_SEC + BYTES_PK + 2 * NNBAR : BYTES_SEC] = __shake256_pkh(sk[BYTES_SEC + BYTES_PK + 
2 * NNBAR:BYTES_SEC], pk); + sk[BYTES_SEC + BYTES_PK + 2 * NNBAR : BYTES_SEC] = __shake256_pkh_opt(sk[BYTES_SEC + BYTES_PK + 2 * NNBAR:BYTES_SEC], pk); - pkp = s_pkp; - i = 0; - while (i < BYTES_PK) { - (u8)[pkp + i] = pk[(int) i]; + () = #unspill(i, j, pkp, skp); + i = 0; j = 0; + while (i < BYTES_PK/8) { + [pkp + j] = pk[u64 i]; i += 1; + j += 8; } - skp = s_skp; - i = 0; - while (i < BYTES_SK) { - (u8)[skp + i] = sk[(int) i]; + i = 0; j = 0; + while (i < BYTES_SK/8) { + [skp + j] = sk[u64 i]; i += 1; + j += 8; } } #[returnaddress="stack"] -fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTES_SALT] coins) { - reg u64 i j; stack u64 s_i s_j; - stack u64 s_ctp s_ssp; +fn __frodo_amd64_ref_enc_derand( + reg u64 ctp ssp pkp, + #spill_to_mmx reg ptr u8[BYTES_SEC + BYTES_SALT] coins) { + reg u64 i j; + inline int k; // seedA || b #public stack u8[BYTES_PK] pk; @@ -115,53 +97,47 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE // S' || E' || E'' stack u16[2 * NNBAR + NBAR * NBAR] SEE; - stack u16[NNBAR] B Bp; - stack u16[NBAR * NBAR] V C; + stack u16[NNBAR] B; + reg ptr u16[NNBAR] Bp; + stack u16[NBAR * NBAR] C; + reg ptr u16[NBAR * NBAR] V; stack u8[BYTES_SEC] ss; pkp = pkp; - s_ctp = ctp; - s_ssp = ssp; - coins = coins; - s_j = j; + () = #spill(ctp, ssp, i, j); // gen u || salt - i = 0; - while (i < BYTES_SEC + BYTES_SALT) { - pkh_u_salt[BYTES_SEC + (int)i] = coins[(int)i]; - i += 1; + for k = 0 to (BYTES_SEC + BYTES_SALT)/8 { + pkh_u_salt[u64 BYTES_SEC/8 + k] = coins[u64 k]; } - i = 0; - while (i < BYTES_SALT) { - ct_k[D * N + D * NBAR + (int)i] = pkh_u_salt[BYTES_SEC * 2 + (int)i]; - i += 1; + for k = 0 to BYTES_SALT/8 { + ct_k[u64 (D * N + D * NBAR)/8 + k] = pkh_u_salt[u64 (BYTES_SEC * 2)/8 + k]; } + () = #unspill(i, j); // read pk - i = 0; - while (i < BYTES_PK) { - #declassify pk[(int)i] = (u8) [pkp + i]; + i = 0; j = 0; + while (i < BYTES_PK/8) { + #declassify pk[u64 i] = [pkp + j]; i += 1; + j += 
8; } - s_i = i; + () = #spill(i, j); // pkh - pkh_u_salt[0:BYTES_SEC] = __shake256_pkh(pkh_u_salt[0:BYTES_SEC], pk); + pkh_u_salt[0:BYTES_SEC] = __shake256_pkh_opt(pkh_u_salt[0:BYTES_SEC], pk); // seedSE || k - seedSE_k[1 : BYTES_SEED_SE + BYTES_SEC] = __shake256_SE_k(seedSE_k[1:BYTES_SEED_SE + BYTES_SEC], pkh_u_salt); + seedSE_k = __shake256_SE_k_opt(seedSE_k, pkh_u_salt); // copy k - i = s_i; i = 0; - while (i < BYTES_SEC) { - ct_k[BYTES_CT + (int)i] = seedSE_k[1 + BYTES_SEED_SE + (int)i]; - i += 1; + for k = 0 to BYTES_SEC/8 { + ct_k[u64 BYTES_CT/8 + k] = seedSE_k.[u64 1 + BYTES_SEED_SE + 8*k]; } - s_i = i; // gen input bit string for sampling S and E - SEE = __shake256_encap_r(SEE, seedSE_k[0 : 1 + BYTES_SEED_SE]); + SEE = __shake256_encap_r_opt(SEE, seedSE_k[0 : 1 + BYTES_SEED_SE]); // S' || E' SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]); @@ -169,7 +145,8 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); // B' = S'A + E'' - Bp = __SA_plus_E_opt(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + Bp = SEE[NNBAR:NNBAR]; + Bp = __SA_plus_E_opt(Bp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]); // c1 <- Pack(B') ct_k[0:D * N] = __pack_B(ct_k[0:D * N], Bp); @@ -178,7 +155,8 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE B = __unpack_B(B, pk[BYTES_SEED_A:D * N]); // V = S'B + E'' - V = __SB_plus_E(V, SEE[0:NNBAR], B, SEE[NNBAR * 2:NBAR * NBAR]); + V = SEE[NNBAR*2:NBAR*NBAR]; + V = __SB_plus_E_opt(V, SEE[0:NNBAR], B); // C = V + Encode(u) C = __encode(C, pkh_u_salt[BYTES_SEC:BYTES_SEC]); @@ -188,20 +166,18 @@ fn __frodo_amd64_ref_enc_derand(reg u64 ctp ssp pkp, reg ptr u8[BYTES_SEC + BYTE ct_k[D * N: D * NBAR] = __pack_C(ct_k[D * N: D * NBAR], C); // ss <- shake(c1 || c2 || salt || k) - ss = __shake256_ss(ss, ct_k); + ss = __shake256_ss_opt(ss, ct_k); - i = s_i; i = 0; - ctp = s_ctp; - ssp = s_ssp; - while (i < BYTES_CT) 
{ - (u8)[ctp + i] = ct_k[(int)i]; + () = #unspill(i, j, ctp, ssp); + i = 0; j = 0; + while (i < BYTES_CT/8) { + [ctp + j] = ct_k[u64 i]; i += 1; + j += 8; } - i = 0; - while (i < BYTES_SEC) { - (u8)[ssp + i] = ss[(int)i]; - i += 1; + for k = 0 to BYTES_SEC/8 { + [ssp + 8*k] = ss[u64 k]; } } @@ -211,8 +187,10 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { stack u8[2 * NNBAR] ST; stack u8[BYTES_SEC] s; stack u8[BYTES_CT + BYTES_SEC] ct_k; - stack u16[NNBAR] B Bp Bpp; - stack u16[NBAR * NBAR] M C Cp V; + stack u16[NNBAR] B Bp; + reg ptr u16[NNBAR] Bpp; + stack u16[NBAR * NBAR] M C Cp; + reg ptr u16[NBAR * NBAR] V; stack u8[BYTES_SEC * 2 + BYTES_SALT] pkh_u_salt; stack u8[1 + BYTES_SEED_SE + BYTES_SEC] seedSE_k; stack u8[BYTES_SEC] ss; @@ -220,55 +198,54 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { // S' || E' || E'' stack u16[2 * NNBAR + NBAR * NBAR] SEE; - stack u64 s_ssp s_ctp s_skp; reg u8 s1 s2; - reg u64 i; stack u64 s_i; + reg u64 i j t; + stack u64 s_ssp s_skp; + inline int k; - s_ssp = ssp; ctp = ctp; skp = skp; + s_ssp = ssp; - // read ct - i = 0; - while (i < BYTES_CT) { - ct_k[(int) i] = (u8)[ctp + i]; - i += 1; + // copy pkh + for k = 0 to BYTES_SEC/8 { + pkh_u_salt[u64 k] = [skp + BYTES_SK - BYTES_SEC + 8*k]; } - s_ctp = ctp; + s_skp = skp; - i = 0; - while (i < BYTES_SEC) { - s[(int) i] = (u8)[skp + i]; + // read ct + i = 0; j = 0; + while (i < BYTES_CT/8) { + t = [ctp + j]; + ct_k[u64 i] = t; i += 1; + j += 8; } - i = 0; - while (i < BYTES_PK) { - #declassify pk[(int)i] = (u8)[skp + BYTES_SEC + i]; - i += 1; + for k = 0 to BYTES_SEC/8 { + s[u64 k] = [skp + 8*k]; } - i = 0; - while (i < 2 * NNBAR) { - ST[(int)i] = (u8)[skp + BYTES_SEC + BYTES_PK + i]; + i = 0; j = 0; + while (i < BYTES_PK/8) { + #declassify pk[u64 i] = [skp + BYTES_SEC + j]; i += 1; + j += 8; } - // copy pkh - i = 0; - while (i < BYTES_SEC) { - pkh_u_salt[(int)i] = (u8)[skp + BYTES_SK - BYTES_SEC + i]; + i = 0; j = 0; + while (i < 2 * NNBAR/8) { + ST[u64 i] = [skp + BYTES_SEC + 
BYTES_PK + j]; i += 1; + j += 8; } - s_skp = skp; + + () = #spill(i); // copy salt - i = 0; - while (i < BYTES_SALT) { - pkh_u_salt[BYTES_SEC * 2 + (int)i] = ct_k[BYTES_CT - BYTES_SALT + (int)i]; - i += 1; + for k = 0 to BYTES_SALT/8 { + pkh_u_salt[u64 (BYTES_SEC * 2)/8 + k] = ct_k[u64 (BYTES_CT - BYTES_SALT)/8 + k]; } - s_i = i; // B' <- Unpack(c1) Bp = __unpack_B(Bp, ct_k[0:D * N]); @@ -276,15 +253,15 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { C = __unpack_C(C, ct_k[D * N:D * NBAR]); // M = C - B'S - M = __mul_BS(M, Bp, ST); + M = __mul_BS_opt(M, Bp, ST); M = __matrix_sub(M, C); pkh_u_salt[BYTES_SEC:BYTES_SEC] = __decode(pkh_u_salt[BYTES_SEC:BYTES_SEC], M); seedSE_k[0] = 0x96; - seedSE_k[1:BYTES_SEED_SE + BYTES_SEC] = __shake256_SE_k(seedSE_k[1:BYTES_SEED_SE + BYTES_SEC], pkh_u_salt); + seedSE_k = __shake256_SE_k_opt(seedSE_k, pkh_u_salt); - SEE = __shake256_encap_r(SEE, seedSE_k[0: 1 + BYTES_SEED_SE]); + SEE = __shake256_encap_r_opt(SEE, seedSE_k[0: 1 + BYTES_SEED_SE]); // S' || E' SEE[0:2 * NNBAR] = __sample_2NNBAR(SEE[0:2 * NNBAR]); @@ -292,20 +269,22 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { SEE[NNBAR * 2:NBAR * NBAR] = __sample_NBAR2(SEE[NNBAR * 2:NBAR * NBAR]); // B'' = S'A + E' - Bpp = __SA_plus_E_opt(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR], SEE[NNBAR:NNBAR]); + Bpp = SEE[NNBAR:NNBAR]; + Bpp = __SA_plus_E_opt(Bpp, pk[0:BYTES_SEED_A], SEE[0:NNBAR]); // B'' (mod q) - i = s_i; + () = #unspill(i); i = 0; while (i < NNBAR) { - Bpp[(int)i] &= (1 << D) - 1; + Bpp[i] &= (1 << D) - 1; i += 1; } // - B = __unpack_B(B ,pk[BYTES_SEED_A:BYTES_PK - BYTES_SEED_A]); + B = __unpack_B(B, pk[BYTES_SEED_A:BYTES_PK - BYTES_SEED_A]); - V = __SB_plus_E(V, SEE[0:NNBAR], B, SEE[NNBAR * 2:NBAR * NBAR]); + V = SEE[NNBAR*2:NBAR*NBAR]; + V = __SB_plus_E_opt(V, SEE[0:NNBAR], B); Cp = __encode(Cp, pkh_u_salt[BYTES_SEC:BYTES_SEC]); Cp = __matrix_add(Cp, V); @@ -316,14 +295,11 @@ fn _frodo_amd64_ref_dec(reg u64 ssp ctp skp) { ct_k[BYTES_CT:BYTES_SEC] = 
__ct_select(ct_k[BYTES_CT:BYTES_SEC], seedSE_k[1+BYTES_SEED_SE:BYTES_SEC], s, s1); - ss = __shake256_ss(ss, ct_k); + ss = __shake256_ss_opt(ss, ct_k); ssp = s_ssp; - i = s_i; - i = 0; - while (i < BYTES_SEC) { - (u8)[ssp + i] = ss[(int)i]; - i += 1; + for k = 0 to BYTES_SEC/8 { + [ssp + 8*k] = ss[u64 k]; } }