From cccf187f3152b5776c8485ad5a86dfd6bc4b57cf Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Thu, 13 Sep 2018 15:28:53 +0200 Subject: [PATCH 01/13] Move OOR_MARK definition to property.h --- src/fec_base.h | 2 -- src/property.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fec_base.h b/src/fec_base.h index 13ac78d9..741d480f 100644 --- a/src/fec_base.h +++ b/src/fec_base.h @@ -74,8 +74,6 @@ static inline uint64_t hrtime_usec(timeval begin) return 1000000 * (tv.tv_sec - begin.tv_sec) + tv.tv_usec - begin.tv_usec; } -#define OOR_MARK 1 - enum class FecType { /** Systematic code * diff --git a/src/property.h b/src/property.h index 766acea7..2fcaa61c 100644 --- a/src/property.h +++ b/src/property.h @@ -40,6 +40,8 @@ namespace quadiron { +static constexpr unsigned OOR_MARK = 1; + /** Ancillary data attached to values. * * A property carries extra-information (whose interpretation is left to the From af5455287bf7883651f8ea22fb20a796731cfb91 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 15 Oct 2018 11:37:09 +0200 Subject: [PATCH 02/13] FFT2n: specialize butterfly operations --- src/fft_2n.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/fft_2n.h b/src/fft_2n.h index f3da7174..8a63bbb2 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -602,6 +602,65 @@ void Radix2::ifft(vec::Buffers& output, vec::Buffers& input) this->gf->mul_vec_to_vecp(*(this->vec_inv_n), output, output); } +#ifdef QUADIRON_USE_SIMD + +/* Operations are vectorized by SIMD */ +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m); +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step); + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m); +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step); + +#endif // #ifdef QUADIRON_USE_SIMD + } // namespace fft } // namespace quadiron From bf559a6ff102e5c461767c4c0fa8089c4bc10f5d Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:45:34 +0200 Subject: [PATCH 03/13] CMakeLists: add fft_2n.cpp file --- src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aff9c341..3ff590af 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,6 +31,7 @@ include(GNUInstallDirs) set(LIB_SRC ${SOURCE_DIR}/core.cpp ${SOURCE_DIR}/fec_vectorisation.cpp + ${SOURCE_DIR}/fft_2n.cpp ${SOURCE_DIR}/misc.cpp ${SOURCE_DIR}/gf_nf4.cpp ${SOURCE_DIR}/gf_ring.cpp From 5409360aa3c61994612171ffb41fdee9e20f02d2 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:41:40 +0200 Subject: [PATCH 04/13] FFT2n.cpp: implement specialized operations --- src/fec_vectorisation.cpp | 2 +- src/fft_2n.cpp | 272 ++++++++++++++++++++++++++++++++++++++ 
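/*
 * Illustrative sketch (hypothetical names, not part of the patch) of the
 * mechanism patches 02-04 rely on: the header keeps the generic template and
 * only declares the explicit specializations, so other translation units do
 * not instantiate the generic butterflies; the definitions live in a single
 * .cpp file (fft_2n.cpp) compiled with the SIMD flags.
 */
#include <cstdint>

template <typename T>
struct Transform {
    void butterfly(); // generic version, defined inline below
};

template <typename T>
void Transform<T>::butterfly()
{
    // portable scalar fallback
}

// Declarations only: every includer learns these exist elsewhere.
template <>
void Transform<uint16_t>::butterfly();
template <>
void Transform<uint32_t>::butterfly();

// In a dedicated .cpp file, compiled once:
//   template <>
//   void Transform<uint16_t>::butterfly() { /* SIMD version */ }
//   template <>
//   void Transform<uint32_t>::butterfly() { /* SIMD version */ }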
2 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 src/fft_2n.cpp diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp index 2564ed7d..8684e1ab 100644 --- a/src/fec_vectorisation.cpp +++ b/src/fec_vectorisation.cpp @@ -32,7 +32,7 @@ #include "fec_rs_fnt.h" /* - * The file includes vectorized operations used by FEC classes + * The file includes specialized operations used by FEC classes */ #ifdef QUADIRON_USE_SIMD diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp new file mode 100644 index 00000000..f3d8847f --- /dev/null +++ b/src/fft_2n.cpp @@ -0,0 +1,272 @@ +/* -*- mode: c++ -*- */ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
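/*
 * Illustrative sketch (hypothetical helper names, not from the patch) of the
 * split that every specialization defined below applies to a packet: process
 * as many whole SIMD vectors as fit, then finish the remaining elements with
 * the scalar "_slow" butterfly.
 */
#include <cstddef>

template <typename T, typename SimdOp, typename ScalarOp>
void process_packet(T* data, size_t pkt_size, size_t lanes, SimdOp simd_op, ScalarOp scalar_op)
{
    const size_t vec_len = pkt_size / lanes;            // number of full SIMD vectors
    const size_t last_len = pkt_size - vec_len * lanes; // leftover elements

    simd_op(data, vec_len);                             // vectorized body
    if (last_len > 0) {
        scalar_op(data + vec_len * lanes, last_len);    // scalar tail
    }
}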
+ */ + +#include "fft_2n.h" + +/* + * The file includes vectorized operations used by Radix2 classes + */ + +#ifdef QUADIRON_USE_SIMD + +#include "simd.h" + +namespace quadiron { +namespace fft { + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + const unsigned coefIndex = start * this->n / m / 2; + const uint16_t r1 = vec_W[coefIndex]; + const uint16_t r2 = vec_W[coefIndex / 2]; + const uint16_t r3 = vec_W[coefIndex / 2 + this->n / 4]; + + // perform vector operations + simd::butterfly_ct_two_layers_step( + buf, r1, r2, r3, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + const unsigned step = m << 2; + size_t offset = vec_len * ratio; + // --------- + // First layer + // --------- + const uint16_t r1 = W->get(start * this->n / m / 2); + // first pair + butterfly_ct_step_slow(buf, r1, start, m, step, offset); + // second pair + butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); + // --------- + // Second layer + // --------- + // first pair + const uint16_t r2 = W->get(start * this->n / m / 4); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); + // second pair + const uint16_t r3 = W->get((start + m) * this->n / m / 4); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + } +} + +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_ct_step_slow(buf, r, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_slow(buf, coef, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * 
ratio; + const unsigned coefIndex = start * this->n / m / 2; + const uint32_t r1 = vec_W[coefIndex]; + const uint32_t r2 = vec_W[coefIndex / 2]; + const uint32_t r3 = vec_W[coefIndex / 2 + this->n / 4]; + + // perform vector operations + simd::butterfly_ct_two_layers_step( + buf, r1, r2, r3, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + const unsigned step = m << 2; + size_t offset = vec_len * ratio; + // --------- + // First layer + // --------- + const uint32_t r1 = W->get(start * this->n / m / 2); + // first pair + butterfly_ct_step_slow(buf, r1, start, m, step, offset); + // second pair + butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); + // --------- + // Second layer + // --------- + // first pair + const uint32_t r2 = W->get(start * this->n / m / 4); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); + // second pair + const uint32_t r3 = W->get((start + m) * this->n / m / 4); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + } +} + +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_ct_step_slow(buf, r, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_slow(buf, coef, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + } +} + +} // namespace fft +} // namespace quadiron + +#endif // #ifdef QUADIRON_USE_SIMD From 0d11e1a9ee47f02c3c2b412c88d5feb355e2e1c1 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:43:47 +0200 Subject: [PATCH 05/13] SIMD: remove useless files The SIMD parts will be re-implemented in next commits --- src/simd_128_u16.h | 358 ---------------------------------- src/simd_128_u32.h | 442 ------------------------------------------ src/simd_256_u16.h | 354 ---------------------------------- src/simd_256_u32.h | 466 --------------------------------------------- 4 files changed, 1620 deletions(-) delete mode 100644 src/simd_128_u16.h delete mode 100644 src/simd_128_u32.h delete mode 100644 src/simd_256_u16.h delete mode 100644 
src/simd_256_u32.h diff --git a/src/simd_128_u16.h b/src/simd_128_u16.h deleted file mode 100644 index a177cfb3..00000000 --- a/src/simd_128_u16.h +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_128_U16_H__ -#define __QUAD_SIMD_128_U16_H__ - -#include - -#include "property.h" -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m128i mod_after_add(m128i a, aint16 card) -{ - const m128i _card = _mm_set1_epi16(card); - const m128i _card_minus_1 = _mm_set1_epi16(card - 1); - - m128i cmp = _mm_cmpgt_epi16(a, _card_minus_1); - m128i b = _mm_sub_epi16(a, _mm_and_si128(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m128i add(m128i a, m128i b, aint16 card) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - m128i c = _mm_add_epi16(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m128i sub(m128i a, m128i b, aint16 card) -{ - const m128i _card = _mm_set1_epi16(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i cmp = _mm_cmpgt_epi16(_b, _a); - m128i _a1 = _mm_add_epi16(_a, _mm_and_si128(_card, cmp)); - - return _mm_sub_epi16(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m128i neg(m128i a, aint16 card = F3) -{ - const m128i _card = _mm_set1_epi16(card); - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_setzero_si128(); - - m128i cmp = _mm_cmpgt_epi16(_a, _b); - - return _mm_sub_epi16(_mm_and_si128(cmp, _card), _a); -} - -inline m128i mod_after_multiply(m128i a) -{ - const m128i mask = _mm_set1_epi16(F3 - 2); - - m128i lo = 
_mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 1); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi16(hi, lo); - m128i _lo = _mm_add_epi16(lo, _mm_and_si128(F3_m128i_u16, cmp)); - - return _mm_sub_epi16(_lo, hi); -} - -inline m128i mul(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi16(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi16(_a, F3minus1_m128i_u16), - _mm_cmpeq_epi16(_b, F3minus1_m128i_u16)); - - const m128i one = _mm_set1_epi16(1); - c = _mm_add_epi16(c, _mm_and_si128(one, cmp)); - - // Modulo - return mod_after_multiply(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. card = Fx with x < 5 - * Currently, it supports only for F3 - */ -inline m128i mul(m128i a, m128i b, aint16 card) -{ - // FIXME: generalize card - assert(card == F3); - return mul(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint16* buf, aint16 card = F3) -{ - m128i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint16 a, - aint16* src, - aint16* dest, - size_t len, - aint16 card = F3) -{ - const m128i coef = _mm_set1_epi16(a); - - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint32_t coef_doubled = (uint32_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint16)((coef_doubled * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint16 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint16* bufa, - aint16* bufb, - aint16* res, - size_t len, - aint16 card = F3) -{ - m128i* _bufa = reinterpret_cast(bufa); - m128i* _bufb = reinterpret_cast(bufb); - m128i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], F3); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - // dest[i] = uint32_t(src[i]) * uint32_t(dest[i]) % card; - dest[i] = uint16_t((uint32_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint32_t card = F3) -{ - const m128i _coef = _mm_set1_epi16(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m128i _coef = _mm_set1_epi16(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = _buf1[i]; - m128i b = _buf2[i]; - m128i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint16_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m128i _threshold = _mm_set1_epi16(threshold); - uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1); - const m128i mask_hi = _mm_set1_epi16(max); - const unsigned element_size = sizeof(uint16_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint16_t* chunk = output.get(frag_id); - m128i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m128i a = _mm_load_si128(&(buf[vec_id])); - const m128i b = _mm_cmpeq_epi16(_threshold, a); - const m128i c = _mm_and_si128(mask_hi, b); - uint16_t d = _mm_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_128_u32.h b/src/simd_128_u32.h deleted file mode 100644 index 
5039f0f8..00000000 --- a/src/simd_128_u32.h +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_128_U32_H__ -#define __QUAD_SIMD_128_U32_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m128i mod_after_add(m128i a, aint32 card) -{ - const m128i _card = _mm_set1_epi32(card); - const m128i _card_minus_1 = _mm_set1_epi32(card - 1); - - m128i cmp = _mm_cmpgt_epi32(a, _card_minus_1); - m128i b = _mm_sub_epi32(a, _mm_and_si128(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m128i add(m128i a, m128i b, aint32 card) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - m128i c = _mm_add_epi32(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m128i sub(m128i a, m128i b, aint32 card) -{ - const m128i _card = _mm_set1_epi32(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i cmp = _mm_cmpgt_epi32(_b, _a); - m128i _a1 = _mm_add_epi32(_a, _mm_and_si128(_card, cmp)); - - return _mm_sub_epi32(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m128i neg(m128i a, aint32 card = F4) -{ - const m128i _card = _mm_set1_epi32(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_setzero_si128(); - m128i cmp = _mm_cmpgt_epi32(_a, _b); - - return _mm_sub_epi32(_mm_and_si128(cmp, _card), _a); -} - -/** Perform a%card where a is a multiplication of two numbers whose elements are - * symbols of GF(F4) - * - * We find v in a = u * card + v - * a is expressed also as: a = hi * (card-1) + lo - * where hi and lo is 16-bit for F4 (or 
8-bit for F3) high and low parts of a - * hence, v = (lo - hi) % F4 - * v = lo - hi, if lo >= hi - * or - * F4 + lo - hi, otherwise - */ -inline m128i mod_after_multiply_f4(m128i a) -{ - const m128i mask = _mm_set1_epi32(F4 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 2); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi32(hi, lo); - m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F4_m128i, cmp)); - - return _mm_sub_epi32(_lo, hi); -} - -inline m128i mod_after_multiply_f3(m128i a) -{ - const m128i mask = _mm_set1_epi32(F3 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 1); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi32(hi, lo); - m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F3_m128i, cmp)); - - return _mm_sub_epi32(_lo, hi); -} - -inline m128i mul_f4(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi32(_a, F4minus1_m128i), - _mm_cmpeq_epi32(_b, F4minus1_m128i)); - - const m128i one = _mm_set1_epi32(1); - c = _mm_add_epi32(c, _mm_and_si128(one, cmp)); - - // Modulo - return mod_after_multiply_f4(c); -} - -inline m128i mul_f3(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi32(_a, F3minus1_m128i), - _mm_cmpeq_epi32(_b, F3minus1_m128i)); - - c = _mm_xor_si128(c, _mm_and_si128(F4_m128i, cmp)); - - // Modulo - return mod_after_multiply_f3(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 and F4 - */ -inline m128i mul(m128i a, m128i b, aint32 card) -{ - assert(card == F4 || card == F3); - if (card == F4) - return mul_f4(a, b); - return mul_f3(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint32* buf, aint32 card = F4) -{ - m128i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i] > 0) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint32 a, - aint32* src, - aint32* dest, - size_t len, - aint32 card = F4) -{ - const m128i coef = _mm_set1_epi32(a); - - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint64_t coef_64 = (uint64_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint32)((coef_64 * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint32 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint32* bufa, - aint32* bufb, - aint32* res, - size_t len, - aint32 card = F4) -{ - m128i* _bufa = reinterpret_cast(bufa); - m128i* _bufb = reinterpret_cast(bufb); - m128i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m128i _coef = _mm_set1_epi32(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m128i _coef = _mm_set1_epi32(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = _buf1[i]; - m128i b = _buf2[i]; - m128i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint32_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m128i _threshold = _mm_set1_epi32(threshold); - const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1); - const m128i mask_hi = _mm_set1_epi32(max); - const unsigned element_size = sizeof(uint32_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint32_t* chunk = output.get(frag_id); - m128i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m128i a = _mm_load_si128(&(buf[vec_id])); - const m128i b = _mm_cmpeq_epi32(_threshold, a); - const m128i c = _mm_and_si128(mask_hi, b); - uint16_t d = _mm_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -/* ==================== Operations for NF4 =================== */ - -/** Return aint128 integer from a _m128i register */ -static inline aint128 m128i_to_uint128(m128i v) -{ - aint128 i; - 
_mm_store_si128((m128i*)&i, v); - - return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} - -inline __uint128_t add(__uint128_t a, __uint128_t b) -{ - m128i res = add((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline __uint128_t sub(__uint128_t a, __uint128_t b) -{ - m128i res = sub((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline __uint128_t mul(__uint128_t a, __uint128_t b) -{ - m128i res = mul((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline void hadamard_mul(int n, aint128* _x, aint128* _y) -{ - int i; - m128i* x = reinterpret_cast(_x); - m128i* y = reinterpret_cast(_y); - - // multiply y to `x` - for (i = 0; i < n; i++) { - x[i] = mul(x[i], y[i], F4); - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_256_u16.h b/src/simd_256_u16.h deleted file mode 100644 index 974e136e..00000000 --- a/src/simd_256_u16.h +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
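/*
 * Minimal scalar sketch (illustrative only, not from the patch) of the
 * Fermat-prime reduction that the mul helpers implement lane-wise: for
 * card = F4 = 2^16 + 1 we have 2^16 = -1 (mod F4), so a = hi * 2^16 + lo
 * reduces to (lo - hi) mod F4. Example: a = 0x12345678 gives lo = 22136,
 * hi = 4660, and a % 65537 = 22136 - 4660 = 17476.
 */
#include <cstdint>

static inline uint32_t mod_f4(uint32_t a)
{
    const uint32_t f4 = 65537;
    const uint32_t lo = a & 0xFFFF; // low 16 bits
    const uint32_t hi = a >> 16;    // high 16 bits
    return (lo >= hi) ? lo - hi : f4 + lo - hi;
}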
- */ - -#ifndef __QUAD_SIMD_256_U16_H__ -#define __QUAD_SIMD_256_U16_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m256i mod_after_add(m256i a, aint16 card) -{ - const m256i _card = _mm256_set1_epi16(card); - const m256i _card_minus_1 = _mm256_set1_epi16(card - 1); - - m256i cmp = _mm256_cmpgt_epi16(a, _card_minus_1); - m256i b = _mm256_sub_epi16(a, _mm256_and_si256(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m256i add(m256i a, m256i b, aint16 card = F3) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - m256i c = _mm256_add_epi16(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m256i sub(m256i a, m256i b, aint16 card) -{ - const m256i _card = _mm256_set1_epi16(card); - - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i cmp = _mm256_cmpgt_epi16(_b, _a); - m256i _a1 = _mm256_add_epi16(_a, _mm256_and_si256(_card, cmp)); - - return _mm256_sub_epi16(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m256i neg(m256i a, aint16 card = F3) -{ - const m256i _card = _mm256_set1_epi16(card); - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_setzero_si256(); - - m256i cmp = _mm256_cmpgt_epi16(_a, _b); - - return _mm256_sub_epi16(_mm256_and_si256(cmp, _card), _a); -} - -inline m256i mod_after_multiply(m256i a) -{ - const m256i mask = _mm256_set1_epi16(F3 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 1); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi16(hi, lo); - m256i _lo = _mm256_add_epi16(lo, _mm256_and_si256(F3_m256i_u16, cmp)); - - return _mm256_sub_epi16(_lo, hi); -} - -inline m256i mul(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi16(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi16(_a, F3minus1_m256i_u16), - _mm256_cmpeq_epi16(_b, F3minus1_m256i_u16)); - - const m256i one = _mm256_set1_epi16(1); - c = _mm256_add_epi16(c, _mm256_and_si256(one, cmp)); - - // Modulo - return mod_after_multiply(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 - */ -inline m256i mul(m256i a, m256i b, aint16 card) -{ - // FIXME: generalize card - assert(card == F3); - return mul(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint16* buf, aint16 card = F3) -{ - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint16 a, - aint16* src, - aint16* dest, - size_t len, - aint16 card = F3) -{ - const m256i coef = _mm256_set1_epi16(a); - - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint32_t coef_doubled = (uint32_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint16)((coef_doubled * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint16 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint16* bufa, - aint16* bufb, - aint16* res, - size_t len, - aint16 card = F3) -{ - m256i* _bufa = reinterpret_cast(bufa); - m256i* _bufb = reinterpret_cast(bufb); - m256i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = (uint32_t(src[i]) * dest[i]) % card; - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m256i _coef = _mm256_set1_epi16(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m256i _coef = _mm256_set1_epi16(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = _buf1[i]; - m256i b = _buf2[i]; - m256i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint16_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m256i _threshold = _mm256_set1_epi16(threshold); - uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1); - const m256i mask_hi = _mm256_set1_epi16(max); - const unsigned element_size = sizeof(uint16_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint16_t* chunk = output.get(frag_id); - m256i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m256i a = _mm256_load_si256(&(buf[vec_id])); - const m256i b = _mm256_cmpeq_epi16(_threshold, a); - const m256i c = _mm256_and_si256(mask_hi, b); - uint32_t d = _mm256_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_256_u32.h b/src/simd_256_u32.h deleted file mode 100644 index 9c76c89b..00000000 --- a/src/simd_256_u32.h +++ /dev/null 
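/*
 * Scalar reference (illustrative only, not from the patch) of the
 * Cooley-Tukey and Gentleman-Sande butterflies that the SIMD helpers above
 * compute lane-wise over GF(card), following the formulas in their comments.
 */
#include <cstddef>
#include <cstdint>

static inline void butterfly_ct_scalar(
    uint32_t coef, uint32_t* x, uint32_t* y, size_t len, uint32_t card)
{
    for (size_t i = 0; i < len; ++i) {
        const uint32_t a = static_cast<uint64_t>(coef) * y[i] % card;
        const uint32_t u = x[i];
        x[i] = (u + a) % card;                  // x <- x + coef * y
        y[i] = (u >= a) ? u - a : card + u - a; // y <- x - coef * y
    }
}

static inline void butterfly_gs_scalar(
    uint32_t coef, uint32_t* x, uint32_t* y, size_t len, uint32_t card)
{
    for (size_t i = 0; i < len; ++i) {
        const uint32_t u = x[i];
        const uint32_t v = y[i];
        const uint32_t d = (u >= v) ? u - v : card + u - v;
        x[i] = (u + v) % card;                           // x <- x + y
        y[i] = static_cast<uint64_t>(coef) * d % card;   // y <- coef * (x - y)
    }
}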
@@ -1,466 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_256_U32_H__ -#define __QUAD_SIMD_256_U32_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m256i mod_after_add(m256i a, aint32 card) -{ - const m256i _card = _mm256_set1_epi32(card); - const m256i _card_minus_1 = _mm256_set1_epi32(card - 1); - - m256i cmp = _mm256_cmpgt_epi32(a, _card_minus_1); - m256i b = _mm256_sub_epi32(a, _mm256_and_si256(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m256i add(m256i a, m256i b, aint32 card) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - m256i c = _mm256_add_epi32(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m256i sub(m256i a, m256i b, aint32 card) -{ - const m256i _card = _mm256_set1_epi32(card); - - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i cmp = _mm256_cmpgt_epi32(_b, _a); - m256i _a1 = _mm256_add_epi32(_a, _mm256_and_si256(_card, cmp)); - - return _mm256_sub_epi32(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m256i neg(m256i a, aint32 card = F4) -{ - const m256i _card = _mm256_set1_epi32(card); - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_setzero_si256(); - - m256i cmp = _mm256_cmpgt_epi32(_a, _b); - - return _mm256_sub_epi32(_mm256_and_si256(cmp, _card), _a); -} - -/** Perform a%card where a is a multiplication of two numbers whose elements are - * symbols of GF(F4) - * - * We find v in a = u * card + v - * a is expressed also as: a = hi * (card-1) + lo - * where hi and lo is 16-bit for F4 
(or 8-bit for F3) high and low parts of a - * hence, v = (lo - hi) % F4 - * v = lo - hi, if lo >= hi - * or - * F4 + lo - hi, otherwise - */ -inline m256i mod_after_multiply_f4(m256i a) -{ - const m256i mask = _mm256_set1_epi32(F4 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 2); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi32(hi, lo); - m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F4_m256i, cmp)); - - return _mm256_sub_epi32(_lo, hi); -} - -inline m256i mod_after_multiply_f3(m256i a) -{ - const m256i mask = _mm256_set1_epi32(F3 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 1); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi32(hi, lo); - m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F3_m256i, cmp)); - - return _mm256_sub_epi32(_lo, hi); -} - -inline m256i mul_f4(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi32(_a, F4minus1_m256i), - _mm256_cmpeq_epi32(_b, F4minus1_m256i)); - - const m256i one = _mm256_set1_epi32(1); - c = _mm256_add_epi32(c, _mm256_and_si256(one, cmp)); - - // Modulo - return mod_after_multiply_f4(c); -} - -inline m256i mul_f3(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi32(_a, F3minus1_m256i), - _mm256_cmpeq_epi32(_b, F3minus1_m256i)); - - c = _mm256_xor_si256(c, _mm256_and_si256(F4_m256i, cmp)); - - // Modulo - return mod_after_multiply_f3(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 and F4 - */ -inline m256i mul(m256i a, m256i b, aint32 card) -{ - assert(card == F4 || card == F3); - if (card == F4) - return mul_f4(a, b); - return mul_f3(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint32* buf, aint32 card = F4) -{ - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint32 a, - aint32* src, - aint32* dest, - size_t len, - aint32 card = F4) -{ - const m256i coef = _mm256_set1_epi32(a); - - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint64_t coef_64 = (uint64_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint32)((coef_64 * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint32 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint32* bufa, - aint32* bufb, - aint32* res, - size_t len, - aint32 card = F4) -{ - m256i* _bufa = reinterpret_cast(bufa); - m256i* _bufb = reinterpret_cast(bufb); - m256i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m256i _coef = _mm256_set1_epi32(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m256i _coef = _mm256_set1_epi32(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = add(_buf1[i], _buf2[i], card); - _buf2[i] = mul(_coef, sub(_buf1[i], _buf2[i], card), card); - _buf1[i] = a; - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint32_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m256i _threshold = _mm256_set1_epi32(threshold); - const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1); - const m256i mask_hi = _mm256_set1_epi32(max); - const unsigned element_size = sizeof(uint32_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint32_t* chunk = output.get(frag_id); - m256i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m256i a = _mm256_load_si256(&(buf[vec_id])); - const m256i b = _mm256_cmpeq_epi32(_threshold, a); - const m256i c = _mm256_and_si256(mask_hi, b); - uint32_t d = _mm256_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -/* ==================== Operations for NF4 =================== */ -typedef __m128i m128i; - -/** Return aint128 integer from a _m128i register */ -inline aint128 m256i_to_uint128(m256i v) -{ - 
aint128 hi, lo; - _mm256_storeu2_m128i((m128i*)&hi, (m128i*)&lo, v); - return lo; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} - -inline __uint128_t add(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = add(_a, _b, F4); - return m256i_to_uint128(res); -} - -inline __uint128_t sub(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = sub(_a, _b, F4); - return m256i_to_uint128(res); -} - -inline __uint128_t mul(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = mul(_a, _b, F4); - return m256i_to_uint128(res); -} - -/** Store low 128-bit part of `reg` to memory */ -inline void store_low(aint128* address, m256i reg) -{ - _mm_store_si128((m128i*)address, _mm256_castsi256_si128(reg)); -} - -inline void hadamard_mul(int n, aint128* _x, aint128* _y) -{ - int i; - m256i* x = reinterpret_cast(_x); - m256i* y = reinterpret_cast(_y); - - const unsigned ratio = sizeof(*x) / sizeof(*_x); - const int len_256 = n / ratio; - const int last_len = n - len_256 * ratio; - - // multiply y to the first half of `x` - for (i = 0; i < len_256; i++) { - x[i] = mul(x[i], y[i], F4); - } - - if (last_len > 0) { - // add last _y[] to x - for (i = len_256 * ratio; i < n; i++) { - m256i _x_p = _mm256_castsi128_si256((m128i)_x[i]); - m256i _y_p = _mm256_castsi128_si256((m128i)_y[i]); - - store_low(_x + i, mul(_x_p, _y_p, F4)); - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif From 120016fe548da96b4815661d73589a839e555672 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:03 +0200 Subject: [PATCH 06/13] SIMD Main file including necessary files 1. Essential operations - simd_128.h contains essential wrappers of SIMD operations on SSE - simd_256.h contains essential wrappers of SIMD operations on AVX 2. Basic operations - simd_basic.h contain basic operations used in following cases, and also operations for RingModN 3. 
Vectorized operations - simd_fnt.h contains vectorized operations dedicated for FNT - simd_nf4.h contains vectorized operations dedicated for nf4 --- src/simd.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/simd.h b/src/simd.h index 8309bfff..41e4935e 100644 --- a/src/simd.h +++ b/src/simd.h @@ -39,17 +39,11 @@ const unsigned F4 = 65537; const unsigned F3 = 257; -typedef uint8_t aint8 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint16_t aint16 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint32_t aint32 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint64_t aint64 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef __uint128_t aint128 __attribute__((aligned(quadiron::simd::ALIGNMENT))); - namespace quadiron { -/** The namespace simd contains functions for GF-NF4 that are accelerated by - * using SIMD operations over 128bits +/** The namespace simd contains functions accelerated by + * using SIMD operations over 128bits and 256bits * - * It supports operations on 32-bit numbers + * It supports operations on 16-bit and 32-bit numbers */ namespace simd { @@ -58,12 +52,20 @@ namespace simd { } // namespace simd } // namespace quadiron +// Include essential operations that use SIMD functions #if defined(__AVX2__) #include "simd_256.h" #elif defined(__SSE4_1__) #include "simd_128.h" #endif +// Include basic operations +#include "simd_basic.h" + +// Include accelerated operations dedicated for FNT +#include "simd_fnt.h" + +// Include accelerated operations dedicated for NF4 #include "simd_nf4.h" #endif // #ifdef QUADIRON_USE_SIMD From cfc241b4c2f26f082d752c99f46be4e1ba29b87e Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:10 +0200 Subject: [PATCH 07/13] SIMD 128: essential operations for SSE --- src/simd_128.h | 160 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 149 insertions(+), 11 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 0f6b8857..856588ec 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -33,19 +33,157 @@ #include -typedef __m128i m128i; +namespace quadiron { +namespace simd { -// Disable `cert-err58-cpp` on these: AFAIK they cannot throw. -// (probably a false positive present in Clang 5 and fixed in Clang 6). 
-const m128i F4_m128i = _mm_set1_epi32(65537); // NOLINT(cert-err58-cpp) -const m128i F4minus1_m128i = _mm_set1_epi32(65536); // NOLINT(cert-err58-cpp) -const m128i F3_m128i = _mm_set1_epi32(257); // NOLINT(cert-err58-cpp) -const m128i F3minus1_m128i = _mm_set1_epi32(256); // NOLINT(cert-err58-cpp) +typedef __m128i VecType; -const m128i F3_m128i_u16 = _mm_set1_epi16(257); // NOLINT(cert-err58-cpp) -const m128i F3minus1_m128i_u16 = _mm_set1_epi16(256); // NOLINT(cert-err58-cpp) +/* ============= Constant variable ============ */ -#include "simd_128_u16.h" -#include "simd_128_u32.h" +// @note: using const leads to an lint error of initialization of 'variable' +// with static storage duration may throw an exception that cannot be caught + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F4_U32 = _mm_set1_epi32(65537); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F4_MINUS_ONE_U32 = _mm_set1_epi32(65536); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_U32 = _mm_set1_epi32(257); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_MINUS_ONE_U32 = _mm_set1_epi32(256); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_U16 = _mm_set1_epi16(257); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_MINUS_ONE_U16 = _mm_set1_epi16(256); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ZERO = _mm_setzero_si128(); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ONE_U16 = _mm_set1_epi16(1); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ONE_U32 = _mm_set1_epi32(1); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType MASK8_LO = _mm_set1_epi16(0x80); + +/* ============= Essential Operations for SSE w/ both u16 & u32 ============ */ + +inline VecType load_to_reg(VecType* address) +{ + return _mm_load_si128(address); +} +inline void store_to_mem(VecType* address, VecType reg) +{ + _mm_store_si128(address, reg); +} + +inline VecType bit_and(VecType x, VecType y) +{ + return _mm_and_si128(x, y); +} +inline VecType bit_xor(VecType x, VecType y) +{ + return _mm_xor_si128(x, y); +} +inline uint16_t msb8_mask(VecType x) +{ + return _mm_movemask_epi8(x); +} +inline bool and_is_zero(VecType x, VecType y) +{ + return _mm_testz_si128(x, y); +} +inline bool is_zero(VecType x) +{ + return _mm_testc_si128(ZERO, x); +} + +#define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8)) +#define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8)) + +/* ================= Essential Operations for SSE ================= */ + +template +inline VecType set_one(T val); +template <> +inline VecType set_one(uint32_t val) +{ + return _mm_set1_epi32(val); +} +template <> +inline VecType set_one(uint16_t val) +{ + return _mm_set1_epi16(val); +} + +template +inline VecType add(VecType x, VecType y); +template <> +inline VecType add(VecType x, VecType y) +{ + return _mm_add_epi32(x, y); +} +template <> +inline VecType add(VecType x, VecType y) +{ + return _mm_add_epi16(x, y); +} + +template +inline VecType sub(VecType x, VecType y); +template <> +inline VecType sub(VecType x, VecType y) +{ + return _mm_sub_epi32(x, y); +} +template <> +inline VecType sub(VecType x, VecType y) +{ + return _mm_sub_epi16(x, y); +} + +template +inline VecType mul(VecType x, VecType y); +template <> +inline VecType mul(VecType x, VecType y) +{ + return _mm_mullo_epi32(x, y); +} +template <> +inline VecType mul(VecType x, VecType y) +{ + return _mm_mullo_epi16(x, y); +} + +template +inline VecType compare_eq(VecType x, VecType y); +template <> +inline VecType compare_eq(VecType x, VecType y) +{ 
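/* Editorial note (sketch, not part of the patch): simd_128.h and simd_256.h
 * deliberately expose the same wrapper names (load_to_reg, store_to_mem,
 * set_one<T>, add<T>, sub<T>, mul<T>, compare_eq<T>, min<T>, ...) over their
 * own VecType, so the generic layers simd_basic.h and simd_fnt.h compile
 * unchanged for SSE4.1 and AVX2; only the register width differs. A typical
 * call chain, assuming a buffer `p`, `q` of element type T and modulus `card`:
 *     VecType x = load_to_reg(p);
 *     VecType y = load_to_reg(q);
 *     store_to_mem(p, mod_add<T>(x, y, card));   // mod_add from simd_basic.h
 */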
+ return _mm_cmpeq_epi32(x, y); +} +template <> +inline VecType compare_eq(VecType x, VecType y) +{ + return _mm_cmpeq_epi16(x, y); +} + +template +inline VecType min(VecType x, VecType y); +template <> +inline VecType min(VecType x, VecType y) +{ + return _mm_min_epu32(x, y); +} +template <> +inline VecType min(VecType x, VecType y) +{ + return _mm_min_epu16(x, y); +} + +} // namespace simd +} // namespace quadiron #endif From a6cd1f4ab8ce8cd1fb66bcf7cd71c5cec9b787ca Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:16 +0200 Subject: [PATCH 08/13] SIMD 256: essential operations for AVX --- src/simd_256.h | 168 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 153 insertions(+), 15 deletions(-) diff --git a/src/simd_256.h b/src/simd_256.h index 2dc49cc4..8084d95e 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -33,19 +33,6 @@ #include -typedef __m256i m256i; - -// Disable `cert-err58-cpp` on these: AFAIK they cannot throw. -// (probably a false positive present in Clang 5 and fixed in Clang 6). -const m256i F4_m256i = _mm256_set1_epi32(65537); // NOLINT(cert-err58-cpp) -const m256i F4minus1_m256i = _mm256_set1_epi32(65536); // NOLINT(cert-err58-cpp) -const m256i F3_m256i = _mm256_set1_epi32(257); // NOLINT(cert-err58-cpp) -const m256i F3minus1_m256i = _mm256_set1_epi32(256); // NOLINT(cert-err58-cpp) - -const m256i F3_m256i_u16 = _mm256_set1_epi16(257); // NOLINT(cert-err58-cpp) -// NOLINTNEXTLINE(cert-err58-cpp) -const m256i F3minus1_m256i_u16 = _mm256_set1_epi16(256); - /* GCC doesn't include the split store intrinsics so define them here. */ #if defined(__GNUC__) && !defined(__clang__) @@ -58,7 +45,158 @@ _mm256_storeu2_m128i(__m128i* const hi, __m128i* const lo, const __m256i a) #endif /* defined(__GNUC__) */ -#include "simd_256_u16.h" -#include "simd_256_u32.h" +namespace quadiron { +namespace simd { + +typedef __m256i VecType; +typedef __m128i HalfVecType; + +/* ============= Constant variable ============ */ + +// @note: using const leads to an lint error of initialization of 'variable' +// with static storage duration may throw an exception that cannot be caught + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F4_U32 = _mm256_set1_epi32(65537); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F4_MINUS_ONE_U32 = _mm256_set1_epi32(65536); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_U32 = _mm256_set1_epi32(257); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_MINUS_ONE_U32 = _mm256_set1_epi32(256); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_U16 = _mm256_set1_epi16(257); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_MINUS_ONE_U16 = _mm256_set1_epi16(256); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ZERO = _mm256_setzero_si256(); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ONE_U16 = _mm256_set1_epi16(1); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ONE_U32 = _mm256_set1_epi32(1); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType MASK8_LO = _mm256_set1_epi16(0x80); + +/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */ + +inline VecType load_to_reg(VecType* address) +{ + return _mm256_load_si256(address); +} +inline void store_to_mem(VecType* address, VecType reg) +{ + _mm256_store_si256(address, reg); +} + +inline VecType bit_and(VecType x, VecType y) +{ + return _mm256_and_si256(x, y); +} +inline VecType bit_xor(VecType x, VecType y) +{ + return _mm256_xor_si256(x, y); +} +inline uint32_t msb8_mask(VecType x) +{ + return _mm256_movemask_epi8(x); +} +inline 
bool and_is_zero(VecType x, VecType y) +{ + return _mm256_testz_si256(x, y); +} +inline bool is_zero(VecType x) +{ + return _mm256_testc_si256(ZERO, x); +} + +#define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8)) +#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8)) + +/* ================= Essential Operations for AVX2 ================= */ + +template +inline VecType set_one(T val); +template <> +inline VecType set_one(uint32_t val) +{ + return _mm256_set1_epi32(val); +} +template <> +inline VecType set_one(uint16_t val) +{ + return _mm256_set1_epi16(val); +} + +template +inline VecType add(VecType x, VecType y); +template <> +inline VecType add(VecType x, VecType y) +{ + return _mm256_add_epi32(x, y); +} +template <> +inline VecType add(VecType x, VecType y) +{ + return _mm256_add_epi16(x, y); +} + +template +inline VecType sub(VecType x, VecType y); +template <> +inline VecType sub(VecType x, VecType y) +{ + return _mm256_sub_epi32(x, y); +} +template <> +inline VecType sub(VecType x, VecType y) +{ + return _mm256_sub_epi16(x, y); +} + +template +inline VecType mul(VecType x, VecType y); +template <> +inline VecType mul(VecType x, VecType y) +{ + return _mm256_mullo_epi32(x, y); +} +template <> +inline VecType mul(VecType x, VecType y) +{ + return _mm256_mullo_epi16(x, y); +} + +template +inline VecType compare_eq(VecType x, VecType y); +template <> +inline VecType compare_eq(VecType x, VecType y) +{ + return _mm256_cmpeq_epi32(x, y); +} +template <> +inline VecType compare_eq(VecType x, VecType y) +{ + return _mm256_cmpeq_epi16(x, y); +} + +template +inline VecType min(VecType x, VecType y); +template <> +inline VecType min(VecType x, VecType y) +{ + return _mm256_min_epu32(x, y); +} +template <> +inline VecType min(VecType x, VecType y) +{ + return _mm256_min_epu16(x, y); +} + +} // namespace simd +} // namespace quadiron #endif From 40379409bf93825936d9fd36988a7d68510a907e Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:21 +0200 Subject: [PATCH 09/13] SIMD Basic: includes basic Operations It implements basic operations that will be used everywhere. It includes also operations for RingModN --- src/simd_basic.h | 334 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 334 insertions(+) create mode 100644 src/simd_basic.h diff --git a/src/simd_basic.h b/src/simd_basic.h new file mode 100644 index 00000000..4fed13ca --- /dev/null +++ b/src/simd_basic.h @@ -0,0 +1,334 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __QUAD_SIMD_BASIC_H__ +#define __QUAD_SIMD_BASIC_H__ + +#include + +namespace quadiron { +namespace simd { + +template +inline VecType card(T q); +template <> +inline VecType card(uint16_t q) +{ + return F3_U16; +} +template <> +inline VecType card(uint32_t q) +{ + return (q == F3) ? F3_U32 : F4_U32; +} + +template +inline VecType card_minus_one(T q); +template <> +inline VecType card_minus_one(uint16_t q) +{ + return F3_MINUS_ONE_U16; +} +template <> +inline VecType card_minus_one(uint32_t q) +{ + return (q == F3) ? F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32; +} + +template +inline VecType get_low_half(VecType x, T q) +{ + return (q == F3) ? BLEND8(ZERO, x, MASK8_LO) : BLEND16(ZERO, x, 0x55); +} + +template +inline VecType get_high_half(VecType x, T q) +{ + return (q == F3) ? BLEND8(ZERO, SHIFTR(x, 1), MASK8_LO) + : BLEND16(ZERO, SHIFTR(x, 2), 0x55); +} + +/* ================= Basic Operations ================= */ + +/** + * Modular addition + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x + y) mod q + */ +template +inline VecType mod_add(VecType x, VecType y, T q) +{ + const VecType res = add(x, y); + return min(res, sub(res, card(q))); +} + +/** + * Modular subtraction for packed unsigned 32-bit integers + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x - y) mod q + */ +template +inline VecType mod_sub(VecType x, VecType y, T q) +{ + const VecType res = sub(x, y); + return min(res, add(res, card(q))); +} + +/** + * Modular negation for packed unsigned 32-bit integers + * + * @param x input register + * @param q modulo + * @return (-x) mod q + */ +template +inline VecType mod_neg(VecType x, T q) +{ + const VecType res = sub(card(q), x); + return min(res, sub(res, card(q))); +} + +/** + * Modular multiplication for packed unsigned 32-bit integers + * + * @note We assume that at least `x` or `y` is less than `q-1` so it's + * not necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +template +inline VecType mod_mul(VecType x, VecType y, T q) +{ + const VecType res = mul(x, y); + const VecType lo = get_low_half(res, q); + const VecType hi = get_high_half(res, q); + return mod_sub(lo, hi, q); +} + +/** + * Modular general multiplication for packed unsigned 32-bit integers + * + * @note It's necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +template +inline VecType mod_mul_safe(VecType x, VecType y, T q) +{ + const VecType res = mod_mul(x, y, q); + + // filter elements of both of a & b = card-1 + const VecType cmp = bit_and( + compare_eq(x, card_minus_one(q)), + compare_eq(y, card_minus_one(q))); + + if (is_zero(cmp)) { + return res; + } + return (q == F3) ? 
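/* Editorial note (sketch, not part of the patch): mod_mul exploits the
 * Fermat-prime structure of the modulus. For q = F4 = 2^16 + 1 we have
 * 2^16 == -1 (mod q), so a 32-bit lane product t reduces as
 *     t mod q == (t & 0xffff) - (t >> 16)   (mod q)
 * which is exactly what get_low_half/get_high_half + mod_sub compute (the
 * F3 = 2^8 + 1 case folds on bytes instead). The one input mod_mul cannot
 * handle is x == y == q - 1: the lane product wraps to 0, so the correction
 * in this return adds 1 (F4) or xors in 1 per lane (F3) exactly in the lanes
 * selected by `cmp`, giving the expected (q-1)^2 mod q == 1.
 */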
bit_xor(res, bit_and(F4_U32, cmp)) + : add(res, bit_and(ONE_U32, cmp)); +} + +/** + * Update property for a given register for packed unsigned 32-bit integers + * + * @param props properties bound to fragments + * @param threshold register storing max value in its elements + * @param mask a specific mask + * @param symb input register + * @param offset offset in the data fragments + * @param max a dummy variable + */ +template +inline void add_props( + Properties& props, + VecType threshold, + VecType mask, + VecType symb, + off_t offset, + T max) +{ + const VecType b = compare_eq(threshold, symb); + const VecType c = bit_and(mask, b); + auto d = msb8_mask(c); + const unsigned element_size = sizeof(T); + while (d > 0) { + const unsigned byte_idx = __builtin_ctz(d); + const size_t _offset = offset + byte_idx / element_size; + props.add(_offset, OOR_MARK); + d ^= 1 << byte_idx; + } +} + +/* ==================== Operations for RingModN =================== */ +/** Perform a multiplication of a coefficient `a` to each element of `src` and + * add result to correspondent element of `dest` + * + * @note: 1 < `a` < card - 1 + */ +template +inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card) +{ + const VecType coef = set_one(a); + + VecType* _src = reinterpret_cast(src); + VecType* _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i = 0; + const size_t end = (_len > 3) ? _len - 3 : 0; + for (; i < end; i += 4) { + _dest[i] = mod_mul(coef, _src[i], card); + _dest[i + 1] = mod_mul(coef, _src[i + 1], card); + _dest[i + 2] = mod_mul(coef, _src[i + 2], card); + _dest[i + 3] = mod_mul(coef, _src[i + 3], card); + } + for (; i < _len; ++i) { + _dest[i] = mod_mul(coef, _src[i], card); + } + + if (_last_len > 0) { + const DoubleSizeVal coef_double = DoubleSizeVal(a); + for (size_t i = _len * ratio; i < len; i++) { + dest[i] = (T)((coef_double * src[i]) % card); + } + } +} + +template +inline void add_two_bufs(T* src, T* dest, size_t len, T card) +{ + VecType* _src = reinterpret_cast(src); + VecType* _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + _dest[i] = mod_add(_src[i], _dest[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + const T tmp = src[i] + dest[i]; + dest[i] = (tmp >= card) ? 
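/* Editorial note (sketch, not part of the patch): all RingModN buffer
 * routines here follow the same split:
 *     ratio    = sizeof(VecType) / sizeof(T);   // elements per register
 *     _len     = len / ratio;                   // full registers, SIMD path
 *     _last_len = len - _len * ratio;           // leftover, scalar path
 * The scalar tail (as in this ternary) mirrors the modular arithmetic of the
 * vectorized body, so results do not depend on the buffer length.
 */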
(tmp - card) : tmp; + } + } +} + +template +inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card) +{ + VecType* _bufa = reinterpret_cast(bufa); + VecType* _bufb = reinterpret_cast(bufb); + VecType* _res = reinterpret_cast(res); + const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + // perform subtraction + _res[i] = mod_sub(_bufa[i], _bufb[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + // perform subtraction + if (bufa[i] >= bufb[i]) { + res[i] = bufa[i] - bufb[i]; + } else { + res[i] = card - (bufb[i] - bufa[i]); + } + } + } +} + +template +inline void mul_two_bufs(T* src, T* dest, size_t len, T card) +{ + VecType* _src = reinterpret_cast(src); + VecType* _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + // perform multiplicaton + _dest[i] = mod_mul_safe(_src[i], _dest[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + // perform multiplicaton + dest[i] = T((DoubleSizeVal(src[i]) * dest[i]) % card); + } + } +} + +/** Apply an element-wise negation to a buffer + */ +template +inline void neg(size_t len, T* buf, T card) +{ + VecType* _buf = reinterpret_cast(buf); + const unsigned ratio = sizeof(*_buf) / sizeof(*buf); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + _buf[i] = mod_neg(_buf[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + if (buf[i]) + buf[i] = card - buf[i]; + } + } +} + +} // namespace simd +} // namespace quadiron + +#endif From 07bf74d688a88901bb5ca0df55096d3e7f9fca5d Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:27 +0200 Subject: [PATCH 10/13] SIMD NF4 contains vectorized operations for NF4 --- src/simd_nf4.h | 314 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 263 insertions(+), 51 deletions(-) diff --git a/src/simd_nf4.h b/src/simd_nf4.h index 7d517430..79c62907 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -38,37 +38,35 @@ namespace quadiron { namespace simd { -#ifdef __AVX2__ -typedef __m128i m128i; +typedef uint32_t aint32 __attribute__((aligned(ALIGNMENT))); -/** Return aint128 integer from a _m128i register */ -static inline aint128 m128i_to_uint128(m128i v) +/** Return __uint128_t integer from a _m128i register */ +static inline __uint128_t m128i_to_uint128(__m128i v) { - aint128 i; - _mm_store_si128((m128i*)&i, v); + __uint128_t i; + _mm_store_si128((__m128i*)&i, v); return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) } -#endif // #ifdef __AVX2__ -inline aint128 expand16(uint16_t* arr, int n) +inline __uint128_t expand16(uint16_t* arr, int n) { // since n <= 4 uint16_t _arr[4] __attribute__((aligned(ALIGNMENT))) = {0, 0, 0, 0}; std::copy_n(arr, n, _arr); - m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]); + __m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]); return m128i_to_uint128(b); } -inline aint128 expand32(uint32_t* arr, int n) +inline __uint128_t expand32(uint32_t* arr, int n) { // since n <= 4 uint32_t _arr[4] __attribute__((aligned(simd::ALIGNMENT))) = {0, 0, 0, 0}; std::copy_n(arr, n, _arr); - m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]); + __m128i 
b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]); return m128i_to_uint128(b); } @@ -76,23 +74,23 @@ inline aint128 expand32(uint32_t* arr, int n) inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) { uint16_t ai[8]; - aint128 values; - - m128i _a = _mm_loadu_si128((m128i*)&a); - ai[0] = _mm_extract_epi16(_a, 0); - ai[1] = _mm_extract_epi16(_a, 1); - ai[2] = _mm_extract_epi16(_a, 2); - ai[3] = _mm_extract_epi16(_a, 3); - ai[4] = _mm_extract_epi16(_a, 4); - ai[5] = _mm_extract_epi16(_a, 5); - ai[6] = _mm_extract_epi16(_a, 6); - ai[7] = _mm_extract_epi16(_a, 7); + __uint128_t values; + + __m128i vec_a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&a)); + ai[0] = _mm_extract_epi16(vec_a, 0); + ai[1] = _mm_extract_epi16(vec_a, 1); + ai[2] = _mm_extract_epi16(vec_a, 2); + ai[3] = _mm_extract_epi16(vec_a, 3); + ai[4] = _mm_extract_epi16(vec_a, 4); + ai[5] = _mm_extract_epi16(vec_a, 5); + ai[6] = _mm_extract_epi16(vec_a, 6); + ai[7] = _mm_extract_epi16(vec_a, 7); const uint32_t flag = ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); - m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); - _mm_store_si128((m128i*)&values, val); + __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); + _mm_store_si128((__m128i*)&values, val); GroupedValues<__uint128_t> b = {values, flag}; @@ -102,70 +100,284 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n) { uint16_t ai[8]; - aint128 values; - - m128i _a = _mm_loadu_si128((m128i*)&a); - ai[0] = _mm_extract_epi16(_a, 0); - ai[1] = _mm_extract_epi16(_a, 1); - ai[2] = _mm_extract_epi16(_a, 2); - ai[3] = _mm_extract_epi16(_a, 3); - ai[4] = _mm_extract_epi16(_a, 4); - ai[5] = _mm_extract_epi16(_a, 5); - ai[6] = _mm_extract_epi16(_a, 6); - ai[7] = _mm_extract_epi16(_a, 7); + __uint128_t values; + + __m128i vec_a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&a)); + ai[0] = _mm_extract_epi16(vec_a, 0); + ai[1] = _mm_extract_epi16(vec_a, 1); + ai[2] = _mm_extract_epi16(vec_a, 2); + ai[3] = _mm_extract_epi16(vec_a, 3); + ai[4] = _mm_extract_epi16(vec_a, 4); + ai[5] = _mm_extract_epi16(vec_a, 5); + ai[6] = _mm_extract_epi16(vec_a, 6); + ai[7] = _mm_extract_epi16(vec_a, 7); const uint32_t flag = ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); - m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); - _mm_store_si128((m128i*)&values, val); + __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); + _mm_store_si128((__m128i*)&values, val); b.flag = flag; b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign) } -inline aint128 pack(__uint128_t a) +inline __uint128_t pack(__uint128_t a) { - m128i _a = _mm_loadu_si128((m128i*)&a); - m128i b = _mm_set_epi32( - _mm_extract_epi16(_a, 3), - _mm_extract_epi16(_a, 2), - _mm_extract_epi16(_a, 1), - _mm_extract_epi16(_a, 0)); + __m128i vec_a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&a)); + __m128i b = _mm_set_epi32( + _mm_extract_epi16(vec_a, 3), + _mm_extract_epi16(vec_a, 2), + _mm_extract_epi16(vec_a, 1), + _mm_extract_epi16(vec_a, 0)); return m128i_to_uint128(b); } -inline aint128 pack(__uint128_t a, uint32_t flag) +inline __uint128_t pack(__uint128_t a, uint32_t flag) { aint32 b0, b1, b2, b3; - m128i _a = _mm_loadu_si128((m128i*)&a); + __m128i vec_a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&a)); if (flag & 1) b0 = 65536; else - b0 = _mm_extract_epi16(_a, 0); + b0 = _mm_extract_epi16(vec_a, 0); flag >>= 1; if (flag & 1) b1 = 65536; 
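/* Editorial note (sketch from reading the code, not part of the patch): NF4
 * keeps four residues mod F4 = 65537 in the 32-bit lanes of a 128-bit word.
 * The single value that does not fit in 16 bits is 65536 (= q - 1); unpack()
 * records such lanes in a 4-bit `flag`, and this pack() overload writes
 * 65536 back into exactly those lanes while the others are rebuilt from
 * their low 16 bits.
 */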
else - b1 = _mm_extract_epi16(_a, 1); + b1 = _mm_extract_epi16(vec_a, 1); flag >>= 1; if (flag & 1) b2 = 65536; else - b2 = _mm_extract_epi16(_a, 2); + b2 = _mm_extract_epi16(vec_a, 2); flag >>= 1; if (flag & 1) b3 = 65536; else - b3 = _mm_extract_epi16(_a, 3); + b3 = _mm_extract_epi16(vec_a, 3); - m128i b = _mm_set_epi32(b3, b2, b1, b0); + __m128i b = _mm_set_epi32(b3, b2, b1, b0); return m128i_to_uint128(b); } +/* ================= Basic operations for NF4 ================= */ + +#if defined(__AVX2__) + +inline VecType load_to_reg(HalfVecType x) +{ + return _mm256_castsi128_si256(_mm_load_si128(&x)); +} + +inline VecType load_to_reg(__uint128_t x) +{ + const HalfVecType* _x = reinterpret_cast(&x); + return load_to_reg(*_x); +} + +inline void store_low_half_to_mem(HalfVecType* address, VecType reg) +{ + _mm_store_si128(address, _mm256_castsi256_si128(reg)); +} + +inline __uint128_t add(__uint128_t a, __uint128_t b) +{ + HalfVecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_low_half_to_mem(&res, mod_add(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t sub(__uint128_t a, __uint128_t b) +{ + HalfVecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_low_half_to_mem(&res, mod_sub(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t mul(__uint128_t a, __uint128_t b) +{ + HalfVecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_low_half_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline void add_buf_to_two_bufs_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // add last _y[] to x and x_next + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _x_half = reinterpret_cast(x_half); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = load_to_reg(_x[i]); + VecType _x_next_p = load_to_reg(_x_half[i]); + VecType _y_p = load_to_reg(_y[i]); + + store_low_half_to_mem(_x + i, mod_add(_x_p, _y_p, F4)); + store_low_half_to_mem(_x_half + i, mod_add(_x_next_p, _y_p, F4)); + } +} + +inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) +{ + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = load_to_reg(_x[i]); + VecType _y_p = load_to_reg(_y[i]); + + store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4)); + } +} + +inline void hadamard_mul_doubled_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _x_half = reinterpret_cast(x_half); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = load_to_reg(_x[i]); + VecType _x_next_p = load_to_reg(_x_half[i]); + VecType _y_p = load_to_reg(_y[i]); + + store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4)); + store_low_half_to_mem(_x_half + i, mod_mul_safe(_x_next_p, _y_p, F4)); + } +} + +#elif defined(__SSE4_1__) + +inline VecType load_to_reg(__uint128_t x) +{ + const VecType* _x = reinterpret_cast(&x); + return _mm_load_si128(_x); +} + +inline __uint128_t add(__uint128_t a, __uint128_t b) +{ + VecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_to_mem(&res, mod_add(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t sub(__uint128_t a, __uint128_t 
b) +{ + VecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_to_mem(&res, mod_sub(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t mul(__uint128_t a, __uint128_t b) +{ + VecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline void add_buf_to_two_bufs_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // do nothing +} + +inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) +{ + // do nothing +} + +inline void hadamard_mul_doubled_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // do nothing +} + +#endif + +/* ==================== Operations for NF4 =================== */ + +/** Add buffer `y` to two halves of `x`. `x` is of length `n` */ +inline void add_buf_to_two_bufs(unsigned n, __uint128_t* _x, __uint128_t* _y) +{ + unsigned i; + VecType* x = reinterpret_cast(_x); + VecType* y = reinterpret_cast(_y); + + const unsigned ratio = sizeof(*x) / sizeof(*_x); + const unsigned half_len = n / 2; + const unsigned vec_len = half_len / ratio; + const unsigned num_len = vec_len * ratio; + const unsigned rem_len = half_len - num_len; + + __uint128_t* x_half = _x + half_len; + VecType* x_next = reinterpret_cast(x_half); + + // add y to the first half of `x` + for (i = 0; i < vec_len; ++i) { + x[i] = mod_add(x[i], y[i], F4); + } + + // add y to the second half of `x` + for (i = 0; i < vec_len; ++i) { + x_next[i] = mod_add(x_next[i], y[i], F4); + } + + if (rem_len > 0) { + add_buf_to_two_bufs_rem( + rem_len, _x + num_len, x_half + num_len, _y + num_len); + } +} + +inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y) +{ + unsigned i; + VecType* x = reinterpret_cast(_x); + VecType* y = reinterpret_cast(_y); + + const unsigned ratio = sizeof(*x) / sizeof(*_x); + const unsigned vec_len = n / ratio; + const unsigned num_len = vec_len * ratio; + const unsigned rem_len = n - num_len; + + // multiply y to the first half of `x` + for (i = 0; i < vec_len; ++i) { + x[i] = mod_mul_safe(x[i], y[i], F4); + } + + if (rem_len > 0) { + // add last _y[] to x + hadamard_mul_rem(rem_len, _x + num_len, _y + num_len); + } +} + } // namespace simd } // namespace quadiron From fb0ea0b53cba0ffd5b9ccb49bf7d060cc75ce020 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:34 +0200 Subject: [PATCH 11/13] SIMD FNT: vectorised operations for FNT --- src/fft_2n.h | 2 +- src/simd_fnt.h | 529 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 530 insertions(+), 1 deletion(-) create mode 100644 src/simd_fnt.h diff --git a/src/fft_2n.h b/src/fft_2n.h index 8a63bbb2..4ded5bb4 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -394,7 +394,7 @@ void Radix2::butterfly_ct_step( } /** - * Butterly CT on two-layers at a time + * Butterfly CT on two-layers at a time * * For each quadruple * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) diff --git a/src/simd_fnt.h b/src/simd_fnt.h new file mode 100644 index 00000000..8e23e1dd --- /dev/null +++ b/src/simd_fnt.h @@ -0,0 +1,529 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __QUAD_SIMD_FNT_H__ +#define __QUAD_SIMD_FNT_H__ + +#include + +namespace quadiron { +namespace simd { + +/* ================= Vectorized Operations ================= */ + +/** + * Butterfly Cooley-Tukey operation + * + * x <- x + r * y + * y <- x - r * y + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param y working register + * @param q modular + */ +template +inline void butterfly_ct(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + VecType z = (rp1 == 2) ? *y : mod_mul(c, *y, q); + if (rp1 < q) { + *y = mod_sub(*x, z, q); + *x = mod_add(*x, z, q); + } else { // i.e. r == q - 1 + *y = mod_add(*x, z, q); + *x = mod_sub(*x, z, q); + } +} + +/** + * Butterfly Genteleman-Sande operation + * + * x <- x + y + * y <- r * (x - y) + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param y working register + * @param q modular + */ +template +inline void butterfly_gs(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + VecType add = mod_add(*x, *y, q); + if (rp1 == 2) { + *y = mod_sub(*x, *y, q); + } else if (rp1 < q) { + VecType sub = mod_sub(*x, *y, q); + *y = mod_mul(c, sub, q); + } else { // i.e. r == q - 1 + *y = mod_sub(*y, *x, q); + } + *x = add; +} + +/** + * Butterfly Genteleman-Sande simple operation where y = 0 + * + * x <- x, i.e. 
no operation + * y <- r * x + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param q modular + * @return r * x + */ +template +inline VecType butterfly_simple_gs(T rp1, VecType c, VecType x, T q) +{ + if (rp1 == 2) { + return x; + } else if (rp1 < q) { + return mod_mul(c, x, q); + } else { + return mod_neg(x, q); + } +} + +/** + * Vectorized butterfly CT step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + r * Q + * Q = P - r * Q + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param step - next loop + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + unsigned step, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const T rp1 = r + 1; + VecType c = set_one(r); + + const size_t end = (len > 1) ? len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* p = reinterpret_cast(mem[i]); + VecType* q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 2) { + x1 = load_to_reg(p + j); + y1 = load_to_reg(q + j); + + butterfly_ct(rp1, c, &x1, &y1, card); + + x2 = load_to_reg(p + j + 1); + y2 = load_to_reg(q + j + 1); + + butterfly_ct(rp1, c, &x2, &y2, card); + + // Store back to memory + store_to_mem(p + j, x1); + store_to_mem(p + j + 1, x2); + store_to_mem(q + j, y1); + store_to_mem(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = load_to_reg(p + j); + y1 = load_to_reg(q + j); + + butterfly_ct(rp1, c, &x1, &y1, card); + + // Store back to memory + store_to_mem(p + j, x1); + store_to_mem(q + j, y1); + } + } +} + +template +inline static void do_butterfly_ct_2_layers( + const std::vector& mem, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + const T r1p1 = r1 + 1; + const T r2p1 = r2 + 1; + const T r3p1 = r3 + 1; + + VecType c1 = set_one(r1); + VecType c2 = set_one(r2); + VecType c3 = set_one(r3); + + VecType* p = reinterpret_cast(mem[start]); + VecType* q = reinterpret_cast(mem[start + m]); + VecType* r = reinterpret_cast(mem[start + 2 * m]); + VecType* s = reinterpret_cast(mem[start + 3 * m]); + + size_t j = 0; + const size_t end = (len > 1) ? 
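/* Editorial note (sketch, not part of the patch): per element this step is
 * the radix-2 Cooley-Tukey butterfly
 *     P' = (P + r * Q) mod card
 *     Q' = (P - r * Q) mod card
 * applied to every pair (P, Q) = (buf[i], buf[i + m]); the r == 1 and
 * r == card - 1 fast paths are handled inside butterfly_ct(). The loop below
 * is unrolled two registers per iteration; `end` is rounded down so the
 * unrolled body never reads past `len`, and the second loop takes the odd
 * remaining register.
 */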
len - 1 : 0; + while (j < end) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = load_to_reg(p); + VecType x2 = load_to_reg(p + 1); + VecType y1 = load_to_reg(q); + VecType y2 = load_to_reg(q + 1); + + butterfly_ct(r1p1, c1, &x1, &y1, card); + butterfly_ct(r1p1, c1, &x2, &y2, card); + + VecType u1 = load_to_reg(r); + VecType u2 = load_to_reg(r + 1); + VecType v1 = load_to_reg(s); + VecType v2 = load_to_reg(s + 1); + + butterfly_ct(r1p1, c1, &u1, &v1, card); + butterfly_ct(r1p1, c1, &u2, &v2, card); + + // Second layer (c2, x, u) & (c3, y, v) + butterfly_ct(r2p1, c2, &x1, &u1, card); + butterfly_ct(r2p1, c2, &x2, &u2, card); + + butterfly_ct(r3p1, c3, &y1, &v1, card); + butterfly_ct(r3p1, c3, &y2, &v2, card); + + // Store back to memory + store_to_mem(p, x1); + store_to_mem(p + 1, x2); + store_to_mem(q, y1); + store_to_mem(q + 1, y2); + + store_to_mem(r, u1); + store_to_mem(r + 1, u2); + store_to_mem(s, v1); + store_to_mem(s + 1, v2); + p = p + 2; + q = q + 2; + r = r + 2; + s = s + 2; + j = j + 2; + }; + + for (; j < len; ++j) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = load_to_reg(p + j); + VecType y1 = load_to_reg(q + j); + VecType u1 = load_to_reg(r + j); + VecType v1 = load_to_reg(s + j); + + // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card); + butterfly_ct(r1p1, c1, &x1, &y1, card); + butterfly_ct(r1p1, c1, &u1, &v1, card); + butterfly_ct(r2p1, c2, &x1, &u1, card); + butterfly_ct(r3p1, c3, &y1, &v1, card); + + // Store back to memory + store_to_mem(p + j, x1); + store_to_mem(q + j, y1); + store_to_mem(r + j, u1); + store_to_mem(s + j, v1); + } +} + +/** + * Vectorized butterfly CT on two-layers at a time + * + * For each quadruple + * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) + * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m + * coef r1 = W[start * n / (2 * m)] + * P = P + r1 * Q + * Q = P - r1 * Q + * R = R + r1 * S + * S = R - r1 * S + * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m + * coef r2 = W[start * n / (4 * m)] + * coef r3 = W[(start + m) * n / (4 * m)] + * P = P + r2 * R + * R = P - r2 * R + * Q = Q + r3 * S + * S = Q - r3 * S + * + * @param buf - working buffers + * @param r1 - coefficient for the 1st layer + * @param r2 - 1st coefficient for the 2nd layer + * @param r3 - 2nd coefficient for the 2nd layer + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_two_layers_step( + vec::Buffers& buf, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 2; + const unsigned bufs_nb = buf.get_n(); + + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + do_butterfly_ct_2_layers(mem, r1, r2, r3, i, m, len, card); + } +} + +/** + * Vectorized butterfly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + Q + * Q = r * (P - Q) + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c 
= set_one(r); + + const size_t end = (len > 3) ? len - 3 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, x2, x3, x4; + VecType y1, y2, y3, y4; + VecType* p = reinterpret_cast(mem[i]); + VecType* q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 4) { + x1 = load_to_reg(p + j); + x2 = load_to_reg(p + j + 1); + x3 = load_to_reg(p + j + 2); + x4 = load_to_reg(p + j + 3); + y1 = load_to_reg(q + j); + y2 = load_to_reg(q + j + 1); + y3 = load_to_reg(q + j + 2); + y4 = load_to_reg(q + j + 3); + + butterfly_gs(rp1, c, &x1, &y1, card); + butterfly_gs(rp1, c, &x2, &y2, card); + butterfly_gs(rp1, c, &x3, &y3, card); + butterfly_gs(rp1, c, &x4, &y4, card); + + // Store back to memory + store_to_mem(p + j, x1); + store_to_mem(p + j + 1, x2); + store_to_mem(p + j + 2, x3); + store_to_mem(p + j + 3, x4); + store_to_mem(q + j, y1); + store_to_mem(q + j + 1, y2); + store_to_mem(q + j + 2, y3); + store_to_mem(q + j + 3, y4); + } + for (; j < len; ++j) { + x1 = load_to_reg(p + j); + y1 = load_to_reg(q + j); + + butterfly_gs(rp1, c, &x1, &y1, card); + + // Store back to memory + store_to_mem(p + j, x1); + store_to_mem(q + j, y1); + } + } +} + +/** + * Vectorized butterfly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * Q = r * P + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step_simple( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c = set_one(r); + + const size_t end = (len > 1) ? len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* p = reinterpret_cast(mem[i]); + VecType* q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 2) { + x1 = load_to_reg(p + j); + x2 = load_to_reg(p + j + 1); + + y1 = butterfly_simple_gs(rp1, c, x1, card); + y2 = butterfly_simple_gs(rp1, c, x2, card); + + // Store back to memory + store_to_mem(q + j, y1); + store_to_mem(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = load_to_reg(p + j); + + y1 = butterfly_simple_gs(rp1, c, x1, card); + + // Store back to memory + store_to_mem(q + j, y1); + } + } +} + +template +inline void encode_post_process( + vec::Buffers& output, + std::vector& props, + off_t offset, + unsigned code_len, + T threshold, + size_t vecs_nb) +{ + const unsigned element_size = sizeof(T); + const unsigned vec_size = countof(); + const T max = 1 << (element_size * 8 - 1); + const VecType _threshold = set_one(threshold); + const VecType mask_hi = set_one(max); + + const std::vector& mem = output.get_mem(); + for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { + VecType* buf = reinterpret_cast(mem[frag_id]); + + size_t vec_id = 0; + size_t end = (vecs_nb > 3) ? 
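/* Editorial note (sketch, not part of the patch): FNT symbols lie in
 * [0, q - 1] but only q - 1 distinct values fit in the output word, so an
 * element equal to `threshold` (= q - 1) cannot be stored as-is. The loop
 * below scans each fragment register by register; and_is_zero() cheaply
 * skips registers containing no such element, and add_props() records the
 * position of each hit in the fragment's Properties with OOR_MARK so the
 * decoder can restore the original value.
 */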
vecs_nb - 3 : 0; + for (; vec_id < end; vec_id += 4) { + VecType a1 = load_to_reg(buf + vec_id); + VecType a2 = load_to_reg(buf + vec_id + 1); + VecType a3 = load_to_reg(buf + vec_id + 2); + VecType a4 = load_to_reg(buf + vec_id + 3); + + if (!and_is_zero(a1, _threshold)) { + const off_t curr_offset = offset + vec_id * vec_size; + add_props( + props[frag_id], _threshold, mask_hi, a1, curr_offset, max); + } + if (!and_is_zero(a2, _threshold)) { + const off_t curr_offset = offset + (vec_id + 1) * vec_size; + add_props( + props[frag_id], _threshold, mask_hi, a2, curr_offset, max); + } + if (!and_is_zero(a3, _threshold)) { + const off_t curr_offset = offset + (vec_id + 2) * vec_size; + add_props( + props[frag_id], _threshold, mask_hi, a3, curr_offset, max); + } + if (!and_is_zero(a4, _threshold)) { + const off_t curr_offset = offset + (vec_id + 3) * vec_size; + add_props( + props[frag_id], _threshold, mask_hi, a4, curr_offset, max); + } + } + for (; vec_id < vecs_nb; ++vec_id) { + VecType a = load_to_reg(buf + vec_id); + if (!and_is_zero(a, _threshold)) { + const off_t curr_offset = offset + vec_id * vec_size; + add_props( + props[frag_id], _threshold, mask_hi, a, curr_offset, max); + } + } + } +} + +} // namespace simd +} // namespace quadiron + +#endif From bf2b414a2c150354e8d192c1003636a478fa4a70 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:04:14 +0100 Subject: [PATCH 12/13] FFT_2n: compute indices for SIMD - Indices for SIMD parts are computed once in FFT function - Define butterfly_ct_two_layers_step_slow for non-vectorized functions --- src/fft_2n.cpp | 128 ++++++++++--------------------------------------- src/fft_2n.h | 34 +++++++++++-- 2 files changed, 54 insertions(+), 108 deletions(-) diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp index f3d8847f..f7d91468 100644 --- a/src/fft_2n.cpp +++ b/src/fft_2n.cpp @@ -48,10 +48,6 @@ void Radix2::butterfly_ct_two_layers_step( unsigned start, unsigned m) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; const unsigned coefIndex = start * this->n / m / 2; const uint16_t r1 = vec_W[coefIndex]; const uint16_t r2 = vec_W[coefIndex / 2]; @@ -59,29 +55,11 @@ void Radix2::butterfly_ct_two_layers_step( // perform vector operations simd::butterfly_ct_two_layers_step( - buf, r1, r2, r3, start, m, vec_len, card); + buf, r1, r2, r3, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - const unsigned step = m << 2; - size_t offset = vec_len * ratio; - // --------- - // First layer - // --------- - const uint16_t r1 = W->get(start * this->n / m / 2); - // first pair - butterfly_ct_step_slow(buf, r1, start, m, step, offset); - // second pair - butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); - // --------- - // Second layer - // --------- - // first pair - const uint16_t r2 = W->get(start * this->n / m / 4); - butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); - // second pair - const uint16_t r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + if (simd_trailing_len > 0) { + butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset); } } @@ -93,18 +71,12 @@ void Radix2::butterfly_ct_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // 
perform vector operations - simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_ct_step_slow(buf, r, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_ct_step_slow(buf, r, start, m, step, simd_offset); } } @@ -116,18 +88,12 @@ void Radix2::butterfly_gs_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset); } } @@ -139,18 +105,12 @@ void Radix2::butterfly_gs_step_simple( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset); } } @@ -160,10 +120,6 @@ void Radix2::butterfly_ct_two_layers_step( unsigned start, unsigned m) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; const unsigned coefIndex = start * this->n / m / 2; const uint32_t r1 = vec_W[coefIndex]; const uint32_t r2 = vec_W[coefIndex / 2]; @@ -171,29 +127,11 @@ void Radix2::butterfly_ct_two_layers_step( // perform vector operations simd::butterfly_ct_two_layers_step( - buf, r1, r2, r3, start, m, vec_len, card); + buf, r1, r2, r3, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - const unsigned step = m << 2; - size_t offset = vec_len * ratio; - // --------- - // First layer - // --------- - const uint32_t r1 = W->get(start * this->n / m / 2); - // first pair - butterfly_ct_step_slow(buf, r1, start, m, step, offset); - // second pair - butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); - // --------- - // Second layer - // --------- - // first pair - const uint32_t r2 = W->get(start * this->n / m / 4); - butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); - // second pair - const uint32_t r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + if (simd_trailing_len > 0) { + butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset); } } @@ -205,18 +143,12 @@ void Radix2::butterfly_ct_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - 
simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_ct_step_slow(buf, r, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_ct_step_slow(buf, r, start, m, step, simd_offset); } } @@ -228,18 +160,12 @@ void Radix2::butterfly_gs_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset); } } @@ -251,18 +177,12 @@ void Radix2::butterfly_gs_step_simple( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset); } } diff --git a/src/fft_2n.h b/src/fft_2n.h index 4ded5bb4..2e371e9a 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -112,6 +112,11 @@ class Radix2 : public FourierTransform { unsigned step); // Only used for non-vectorized elements + void butterfly_ct_two_layers_step_slow( + vec::Buffers& buf, + unsigned start, + unsigned m, + size_t offset = 0); void butterfly_ct_step_slow( vec::Buffers& buf, T coef, @@ -142,6 +147,11 @@ class Radix2 : public FourierTransform { size_t pkt_size; size_t buf_size; + // Indices used for accelerated functions + size_t simd_vec_len; + size_t simd_trailing_len; + size_t simd_offset; + std::unique_ptr rev = nullptr; std::unique_ptr> W = nullptr; std::unique_ptr> inv_W = nullptr; @@ -182,6 +192,12 @@ Radix2::Radix2(const gf::Field& gf, int n, int data_len, size_t pkt_size) rev = std::unique_ptr(new T[n]); init_bitrev(); + + // Indices used for accelerated functions + const unsigned ratio = simd::countof(); + simd_vec_len = this->pkt_size / ratio; + simd_trailing_len = this->pkt_size - simd_vec_len * ratio; + simd_offset = simd_vec_len * ratio; } template @@ -421,6 +437,16 @@ void Radix2::butterfly_ct_two_layers_step( vec::Buffers& buf, unsigned start, unsigned m) +{ + butterfly_ct_two_layers_step_slow(buf, start, m); +} + +template +void Radix2::butterfly_ct_two_layers_step_slow( + vec::Buffers& buf, + unsigned start, + unsigned m, + size_t offset) { const unsigned step = m << 2; // --------- @@ -428,18 +454,18 @@ void Radix2::butterfly_ct_two_layers_step( // --------- const T r1 = W->get(start * this->n / m / 2); // first pair - butterfly_ct_step(buf, r1, start, m, step); + butterfly_ct_step_slow(buf, r1, start, m, step, offset); // second pair - butterfly_ct_step(buf, r1, start + 2 * m, m, step); + 
butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); // --------- // Second layer // --------- // first pair const T r2 = W->get(start * this->n / m / 4); - butterfly_ct_step(buf, r2, start, 2 * m, step); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); // second pair const T r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step(buf, r3, start + m, 2 * m, step); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); } template From 9c47e189e03e6282af41c147ebd2b11977e5ebc2 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:16:37 +0100 Subject: [PATCH 13/13] RS-FNT: simd indices as member variables --- src/fec_rs_fnt.h | 11 +++++++++++ src/fec_vectorisation.cpp | 26 ++++++-------------------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/fec_rs_fnt.h b/src/fec_rs_fnt.h index 4bdd5475..55ffb487 100644 --- a/src/fec_rs_fnt.h +++ b/src/fec_rs_fnt.h @@ -60,6 +60,11 @@ class RsFnt : public FecCode { // decoding context used in encoding of systematic FNT std::unique_ptr> enc_context; + // Indices used for accelerated functions + size_t simd_vec_len; + size_t simd_trailing_len; + size_t simd_offset; + public: RsFnt( FecType type, @@ -70,6 +75,12 @@ class RsFnt : public FecCode { : FecCode(type, word_size, n_data, n_parities, pkt_size) { this->fec_init(); + + // Indices used for accelerated functions + const unsigned ratio = simd::countof(); + simd_vec_len = this->pkt_size / ratio; + simd_trailing_len = this->pkt_size - simd_vec_len * ratio; + simd_offset = simd_vec_len * ratio; } inline void check_params() override diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp index 8684e1ab..ed82fab8 100644 --- a/src/fec_vectorisation.cpp +++ b/src/fec_vectorisation.cpp @@ -53,20 +53,13 @@ void RsFnt::encode_post_process( uint16_t threshold = this->gf->card_minus_one(); unsigned code_len = this->n_outputs; - // number of elements per vector register - unsigned vec_size = simd::countof(); - // number of vector registers per fragment packet - size_t vecs_nb = size / vec_size; - // odd number of elements not vectorized - size_t last_len = size - vecs_nb * vec_size; - simd::encode_post_process( - output, props, offset, code_len, threshold, vecs_nb); + output, props, offset, code_len, threshold, simd_vec_len); - if (last_len > 0) { + if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { uint16_t* chunk = output.get(i); - for (size_t j = vecs_nb * vec_size; j < size; ++j) { + for (size_t j = simd_offset; j < size; ++j) { if (chunk[j] == threshold) { props[i].add(offset + j, OOR_MARK); } @@ -85,20 +78,13 @@ void RsFnt::encode_post_process( const uint32_t threshold = this->gf->card_minus_one(); const unsigned code_len = this->n_outputs; - // number of elements per vector register - const unsigned vec_size = simd::countof(); - // number of vector registers per fragment packet - const size_t vecs_nb = size / vec_size; - // odd number of elements not vectorized - const size_t last_len = size - vecs_nb * vec_size; - simd::encode_post_process( - output, props, offset, code_len, threshold, vecs_nb); + output, props, offset, code_len, threshold, simd_vec_len); - if (last_len > 0) { + if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { uint32_t* chunk = output.get(i); - for (size_t j = vecs_nb * vec_size; j < size; ++j) { + for (size_t j = simd_offset; j < size; ++j) { if (chunk[j] == threshold) { props[i].add(offset + j, OOR_MARK); }
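/* Editorial note (sketch, not part of the patch series): patches 12 and 13
 * hoist the per-call SIMD geometry into members computed once at
 * construction. With ratio = simd::countof<T>():
 *     simd_vec_len      = pkt_size / ratio;                  // full registers
 *     simd_trailing_len = pkt_size - simd_vec_len * ratio;   // scalar tail
 *     simd_offset       = simd_vec_len * ratio;              // first scalar index
 * so every butterfly and post-process call reuses these instead of
 * re-deriving vec_len / last_len / offset on each invocation.
 */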