From cccf187f3152b5776c8485ad5a86dfd6bc4b57cf Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Thu, 13 Sep 2018 15:28:53 +0200 Subject: [PATCH 01/13] Move OOR_MARK definition to property.h --- src/fec_base.h | 2 -- src/property.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fec_base.h b/src/fec_base.h index 13ac78d9..741d480f 100644 --- a/src/fec_base.h +++ b/src/fec_base.h @@ -74,8 +74,6 @@ static inline uint64_t hrtime_usec(timeval begin) return 1000000 * (tv.tv_sec - begin.tv_sec) + tv.tv_usec - begin.tv_usec; } -#define OOR_MARK 1 - enum class FecType { /** Systematic code * diff --git a/src/property.h b/src/property.h index 766acea7..2fcaa61c 100644 --- a/src/property.h +++ b/src/property.h @@ -40,6 +40,8 @@ namespace quadiron { +static constexpr unsigned OOR_MARK = 1; + /** Ancillary data attached to values. * * A property carries extra-information (whose interpretation is left to the From af5455287bf7883651f8ea22fb20a796731cfb91 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Mon, 15 Oct 2018 11:37:09 +0200 Subject: [PATCH 02/13] FFT2n: specialize butterfly operations --- src/fft_2n.h | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/src/fft_2n.h b/src/fft_2n.h index f3da7174..8a63bbb2 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -602,6 +602,65 @@ void Radix2::ifft(vec::Buffers& output, vec::Buffers& input) this->gf->mul_vec_to_vecp(*(this->vec_inv_n), output, output); } +#ifdef QUADIRON_USE_SIMD + +/* Operations are vectorized by SIMD */ +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m); +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step); + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m); +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step); +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step); + +#endif // #ifdef QUADIRON_USE_SIMD + } // namespace fft } // namespace quadiron From bf559a6ff102e5c461767c4c0fa8089c4bc10f5d Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:45:34 +0200 Subject: [PATCH 03/13] CMakeLists: add fft_2n.cpp file --- src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index aff9c341..3ff590af 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -31,6 +31,7 @@ include(GNUInstallDirs) set(LIB_SRC ${SOURCE_DIR}/core.cpp ${SOURCE_DIR}/fec_vectorisation.cpp + ${SOURCE_DIR}/fft_2n.cpp ${SOURCE_DIR}/misc.cpp ${SOURCE_DIR}/gf_nf4.cpp ${SOURCE_DIR}/gf_ring.cpp From 5409360aa3c61994612171ffb41fdee9e20f02d2 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:41:40 +0200 Subject: [PATCH 04/13] FFT2n.cpp: implement specialized operations --- src/fec_vectorisation.cpp | 2 +- src/fft_2n.cpp | 272 ++++++++++++++++++++++++++++++++++++++ 
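/*
 * Illustrative sketch (hypothetical names, not part of the patch) of the
 * mechanism patches 02-04 rely on: the header keeps the generic template and
 * only declares the explicit specializations, so other translation units do
 * not instantiate the generic butterflies; the definitions live in a single
 * .cpp file (fft_2n.cpp) compiled with the SIMD flags.
 */
#include <cstdint>

template <typename T>
struct Transform {
    void butterfly(); // generic version, defined inline below
};

template <typename T>
void Transform<T>::butterfly()
{
    // portable scalar fallback
}

// Declarations only: every includer learns these exist elsewhere.
template <>
void Transform<uint16_t>::butterfly();
template <>
void Transform<uint32_t>::butterfly();

// In a dedicated .cpp file, compiled once:
//   template <>
//   void Transform<uint16_t>::butterfly() { /* SIMD version */ }
//   template <>
//   void Transform<uint32_t>::butterfly() { /* SIMD version */ }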
2 files changed, 273 insertions(+), 1 deletion(-) create mode 100644 src/fft_2n.cpp diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp index 2564ed7d..8684e1ab 100644 --- a/src/fec_vectorisation.cpp +++ b/src/fec_vectorisation.cpp @@ -32,7 +32,7 @@ #include "fec_rs_fnt.h" /* - * The file includes vectorized operations used by FEC classes + * The file includes specialized operations used by FEC classes */ #ifdef QUADIRON_USE_SIMD diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp new file mode 100644 index 00000000..f3d8847f --- /dev/null +++ b/src/fft_2n.cpp @@ -0,0 +1,272 @@ +/* -*- mode: c++ -*- */ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
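/*
 * Illustrative sketch (hypothetical helper names, not from the patch) of the
 * split that every specialization defined below applies to a packet: process
 * as many whole SIMD vectors as fit, then finish the remaining elements with
 * the scalar "_slow" butterfly.
 */
#include <cstddef>

template <typename T, typename SimdOp, typename ScalarOp>
void process_packet(T* data, size_t pkt_size, size_t lanes, SimdOp simd_op, ScalarOp scalar_op)
{
    const size_t vec_len = pkt_size / lanes;            // number of full SIMD vectors
    const size_t last_len = pkt_size - vec_len * lanes; // leftover elements

    simd_op(data, vec_len);                             // vectorized body
    if (last_len > 0) {
        scalar_op(data + vec_len * lanes, last_len);    // scalar tail
    }
}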
+ */ + +#include "fft_2n.h" + +/* + * The file includes vectorized operations used by Radix2 classes + */ + +#ifdef QUADIRON_USE_SIMD + +#include "simd.h" + +namespace quadiron { +namespace fft { + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + const unsigned coefIndex = start * this->n / m / 2; + const uint16_t r1 = vec_W[coefIndex]; + const uint16_t r2 = vec_W[coefIndex / 2]; + const uint16_t r3 = vec_W[coefIndex / 2 + this->n / 4]; + + // perform vector operations + simd::butterfly_ct_two_layers_step( + buf, r1, r2, r3, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + const unsigned step = m << 2; + size_t offset = vec_len * ratio; + // --------- + // First layer + // --------- + const uint16_t r1 = W->get(start * this->n / m / 2); + // first pair + butterfly_ct_step_slow(buf, r1, start, m, step, offset); + // second pair + butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); + // --------- + // Second layer + // --------- + // first pair + const uint16_t r2 = W->get(start * this->n / m / 4); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); + // second pair + const uint16_t r3 = W->get((start + m) * this->n / m / 4); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + } +} + +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint16_t r, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_ct_step_slow(buf, r, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_slow(buf, coef, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint16_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_ct_two_layers_step( + vec::Buffers& buf, + unsigned start, + unsigned m) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * 
ratio; + const unsigned coefIndex = start * this->n / m / 2; + const uint32_t r1 = vec_W[coefIndex]; + const uint32_t r2 = vec_W[coefIndex / 2]; + const uint32_t r3 = vec_W[coefIndex / 2 + this->n / 4]; + + // perform vector operations + simd::butterfly_ct_two_layers_step( + buf, r1, r2, r3, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + const unsigned step = m << 2; + size_t offset = vec_len * ratio; + // --------- + // First layer + // --------- + const uint32_t r1 = W->get(start * this->n / m / 2); + // first pair + butterfly_ct_step_slow(buf, r1, start, m, step, offset); + // second pair + butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); + // --------- + // Second layer + // --------- + // first pair + const uint32_t r2 = W->get(start * this->n / m / 4); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); + // second pair + const uint32_t r3 = W->get((start + m) * this->n / m / 4); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + } +} + +template <> +void Radix2::butterfly_ct_step( + vec::Buffers& buf, + uint32_t r, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_ct_step_slow(buf, r, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_slow(buf, coef, start, m, step, offset); + } +} + +template <> +void Radix2::butterfly_gs_step_simple( + vec::Buffers& buf, + uint32_t coef, + unsigned start, + unsigned m, + unsigned step) +{ + const unsigned ratio = simd::countof(); + const size_t len = this->pkt_size; + const size_t vec_len = len / ratio; + const size_t last_len = len - vec_len * ratio; + + // perform vector operations + simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + + // for last elements, perform as non-SIMD method + if (last_len > 0) { + size_t offset = vec_len * ratio; + butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + } +} + +} // namespace fft +} // namespace quadiron + +#endif // #ifdef QUADIRON_USE_SIMD From 0d11e1a9ee47f02c3c2b412c88d5feb355e2e1c1 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:43:47 +0200 Subject: [PATCH 05/13] SIMD: remove useless files The SIMD parts will be re-implemented in next commits --- src/simd_128_u16.h | 358 ---------------------------------- src/simd_128_u32.h | 442 ------------------------------------------ src/simd_256_u16.h | 354 ---------------------------------- src/simd_256_u32.h | 466 --------------------------------------------- 4 files changed, 1620 deletions(-) delete mode 100644 src/simd_128_u16.h delete mode 100644 src/simd_128_u32.h delete mode 100644 src/simd_256_u16.h delete mode 100644 
src/simd_256_u32.h diff --git a/src/simd_128_u16.h b/src/simd_128_u16.h deleted file mode 100644 index a177cfb3..00000000 --- a/src/simd_128_u16.h +++ /dev/null @@ -1,358 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_128_U16_H__ -#define __QUAD_SIMD_128_U16_H__ - -#include - -#include "property.h" -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m128i mod_after_add(m128i a, aint16 card) -{ - const m128i _card = _mm_set1_epi16(card); - const m128i _card_minus_1 = _mm_set1_epi16(card - 1); - - m128i cmp = _mm_cmpgt_epi16(a, _card_minus_1); - m128i b = _mm_sub_epi16(a, _mm_and_si128(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m128i add(m128i a, m128i b, aint16 card) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - m128i c = _mm_add_epi16(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m128i sub(m128i a, m128i b, aint16 card) -{ - const m128i _card = _mm_set1_epi16(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i cmp = _mm_cmpgt_epi16(_b, _a); - m128i _a1 = _mm_add_epi16(_a, _mm_and_si128(_card, cmp)); - - return _mm_sub_epi16(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m128i neg(m128i a, aint16 card = F3) -{ - const m128i _card = _mm_set1_epi16(card); - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_setzero_si128(); - - m128i cmp = _mm_cmpgt_epi16(_a, _b); - - return _mm_sub_epi16(_mm_and_si128(cmp, _card), _a); -} - -inline m128i mod_after_multiply(m128i a) -{ - const m128i mask = _mm_set1_epi16(F3 - 2); - - m128i lo = 
_mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 1); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi16(hi, lo); - m128i _lo = _mm_add_epi16(lo, _mm_and_si128(F3_m128i_u16, cmp)); - - return _mm_sub_epi16(_lo, hi); -} - -inline m128i mul(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi16(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi16(_a, F3minus1_m128i_u16), - _mm_cmpeq_epi16(_b, F3minus1_m128i_u16)); - - const m128i one = _mm_set1_epi16(1); - c = _mm_add_epi16(c, _mm_and_si128(one, cmp)); - - // Modulo - return mod_after_multiply(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. card = Fx with x < 5 - * Currently, it supports only for F3 - */ -inline m128i mul(m128i a, m128i b, aint16 card) -{ - // FIXME: generalize card - assert(card == F3); - return mul(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint16* buf, aint16 card = F3) -{ - m128i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint16 a, - aint16* src, - aint16* dest, - size_t len, - aint16 card = F3) -{ - const m128i coef = _mm_set1_epi16(a); - - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint32_t coef_doubled = (uint32_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint16)((coef_doubled * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint16 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint16* bufa, - aint16* bufb, - aint16* res, - size_t len, - aint16 card = F3) -{ - m128i* _bufa = reinterpret_cast(bufa); - m128i* _bufb = reinterpret_cast(bufb); - m128i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], F3); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - // dest[i] = uint32_t(src[i]) * uint32_t(dest[i]) % card; - dest[i] = uint16_t((uint32_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint32_t card = F3) -{ - const m128i _coef = _mm_set1_epi16(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m128i _coef = _mm_set1_epi16(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = _buf1[i]; - m128i b = _buf2[i]; - m128i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint16_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m128i _threshold = _mm_set1_epi16(threshold); - uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1); - const m128i mask_hi = _mm_set1_epi16(max); - const unsigned element_size = sizeof(uint16_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint16_t* chunk = output.get(frag_id); - m128i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m128i a = _mm_load_si128(&(buf[vec_id])); - const m128i b = _mm_cmpeq_epi16(_threshold, a); - const m128i c = _mm_and_si128(mask_hi, b); - uint16_t d = _mm_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_128_u32.h b/src/simd_128_u32.h deleted file mode 100644 index 
5039f0f8..00000000 --- a/src/simd_128_u32.h +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_128_U32_H__ -#define __QUAD_SIMD_128_U32_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m128i mod_after_add(m128i a, aint32 card) -{ - const m128i _card = _mm_set1_epi32(card); - const m128i _card_minus_1 = _mm_set1_epi32(card - 1); - - m128i cmp = _mm_cmpgt_epi32(a, _card_minus_1); - m128i b = _mm_sub_epi32(a, _mm_and_si128(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m128i add(m128i a, m128i b, aint32 card) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - m128i c = _mm_add_epi32(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m128i sub(m128i a, m128i b, aint32 card) -{ - const m128i _card = _mm_set1_epi32(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i cmp = _mm_cmpgt_epi32(_b, _a); - m128i _a1 = _mm_add_epi32(_a, _mm_and_si128(_card, cmp)); - - return _mm_sub_epi32(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m128i neg(m128i a, aint32 card = F4) -{ - const m128i _card = _mm_set1_epi32(card); - - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_setzero_si128(); - m128i cmp = _mm_cmpgt_epi32(_a, _b); - - return _mm_sub_epi32(_mm_and_si128(cmp, _card), _a); -} - -/** Perform a%card where a is a multiplication of two numbers whose elements are - * symbols of GF(F4) - * - * We find v in a = u * card + v - * a is expressed also as: a = hi * (card-1) + lo - * where hi and lo is 16-bit for F4 (or 
8-bit for F3) high and low parts of a - * hence, v = (lo - hi) % F4 - * v = lo - hi, if lo >= hi - * or - * F4 + lo - hi, otherwise - */ -inline m128i mod_after_multiply_f4(m128i a) -{ - const m128i mask = _mm_set1_epi32(F4 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 2); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi32(hi, lo); - m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F4_m128i, cmp)); - - return _mm_sub_epi32(_lo, hi); -} - -inline m128i mod_after_multiply_f3(m128i a) -{ - const m128i mask = _mm_set1_epi32(F3 - 2); - - m128i lo = _mm_and_si128(a, mask); - - m128i a_shift = _mm_srli_si128(a, 1); - m128i hi = _mm_and_si128(a_shift, mask); - - m128i cmp = _mm_cmpgt_epi32(hi, lo); - m128i _lo = _mm_add_epi32(lo, _mm_and_si128(F3_m128i, cmp)); - - return _mm_sub_epi32(_lo, hi); -} - -inline m128i mul_f4(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi32(_a, F4minus1_m128i), - _mm_cmpeq_epi32(_b, F4minus1_m128i)); - - const m128i one = _mm_set1_epi32(1); - c = _mm_add_epi32(c, _mm_and_si128(one, cmp)); - - // Modulo - return mod_after_multiply_f4(c); -} - -inline m128i mul_f3(m128i a, m128i b) -{ - m128i _a = _mm_load_si128(&a); - m128i _b = _mm_load_si128(&b); - - m128i c = _mm_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m128i cmp = _mm_and_si128( - _mm_cmpeq_epi32(_a, F3minus1_m128i), - _mm_cmpeq_epi32(_b, F3minus1_m128i)); - - c = _mm_xor_si128(c, _mm_and_si128(F4_m128i, cmp)); - - // Modulo - return mod_after_multiply_f3(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 and F4 - */ -inline m128i mul(m128i a, m128i b, aint32 card) -{ - assert(card == F4 || card == F3); - if (card == F4) - return mul_f4(a, b); - return mul_f3(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint32* buf, aint32 card = F4) -{ - m128i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i] > 0) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint32 a, - aint32* src, - aint32* dest, - size_t len, - aint32 card = F4) -{ - const m128i coef = _mm_set1_epi32(a); - - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint64_t coef_64 = (uint64_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint32)((coef_64 * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint32 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint32* bufa, - aint32* bufb, - aint32* res, - size_t len, - aint32 card = F4) -{ - m128i* _bufa = reinterpret_cast(bufa); - m128i* _bufb = reinterpret_cast(bufb); - m128i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m128i* _src = reinterpret_cast(src); - m128i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m128i _coef = _mm_set1_epi32(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m128i _coef = _mm_set1_epi32(coef); - m128i* _buf1 = reinterpret_cast(buf1); - m128i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m128i a = _buf1[i]; - m128i b = _buf2[i]; - m128i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint32_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m128i _threshold = _mm_set1_epi32(threshold); - const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1); - const m128i mask_hi = _mm_set1_epi32(max); - const unsigned element_size = sizeof(uint32_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint32_t* chunk = output.get(frag_id); - m128i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m128i a = _mm_load_si128(&(buf[vec_id])); - const m128i b = _mm_cmpeq_epi32(_threshold, a); - const m128i c = _mm_and_si128(mask_hi, b); - uint16_t d = _mm_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -/* ==================== Operations for NF4 =================== */ - -/** Return aint128 integer from a _m128i register */ -static inline aint128 m128i_to_uint128(m128i v) -{ - aint128 i; - 
_mm_store_si128((m128i*)&i, v); - - return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} - -inline __uint128_t add(__uint128_t a, __uint128_t b) -{ - m128i res = add((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline __uint128_t sub(__uint128_t a, __uint128_t b) -{ - m128i res = sub((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline __uint128_t mul(__uint128_t a, __uint128_t b) -{ - m128i res = mul((m128i)a, (m128i)b, F4); - return m128i_to_uint128(res); -} - -inline void hadamard_mul(int n, aint128* _x, aint128* _y) -{ - int i; - m128i* x = reinterpret_cast(_x); - m128i* y = reinterpret_cast(_y); - - // multiply y to `x` - for (i = 0; i < n; i++) { - x[i] = mul(x[i], y[i], F4); - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_256_u16.h b/src/simd_256_u16.h deleted file mode 100644 index 974e136e..00000000 --- a/src/simd_256_u16.h +++ /dev/null @@ -1,354 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. 
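/*
 * Minimal scalar sketch (illustrative only, not from the patch) of the
 * Fermat-prime reduction that the mul helpers implement lane-wise: for
 * card = F4 = 2^16 + 1 we have 2^16 = -1 (mod F4), so a = hi * 2^16 + lo
 * reduces to (lo - hi) mod F4. Example: a = 0x12345678 gives lo = 22136,
 * hi = 4660, and a % 65537 = 22136 - 4660 = 17476.
 */
#include <cstdint>

static inline uint32_t mod_f4(uint32_t a)
{
    const uint32_t f4 = 65537;
    const uint32_t lo = a & 0xFFFF; // low 16 bits
    const uint32_t hi = a >> 16;    // high 16 bits
    return (lo >= hi) ? lo - hi : f4 + lo - hi;
}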
- */ - -#ifndef __QUAD_SIMD_256_U16_H__ -#define __QUAD_SIMD_256_U16_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m256i mod_after_add(m256i a, aint16 card) -{ - const m256i _card = _mm256_set1_epi16(card); - const m256i _card_minus_1 = _mm256_set1_epi16(card - 1); - - m256i cmp = _mm256_cmpgt_epi16(a, _card_minus_1); - m256i b = _mm256_sub_epi16(a, _mm256_and_si256(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m256i add(m256i a, m256i b, aint16 card = F3) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - m256i c = _mm256_add_epi16(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m256i sub(m256i a, m256i b, aint16 card) -{ - const m256i _card = _mm256_set1_epi16(card); - - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i cmp = _mm256_cmpgt_epi16(_b, _a); - m256i _a1 = _mm256_add_epi16(_a, _mm256_and_si256(_card, cmp)); - - return _mm256_sub_epi16(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m256i neg(m256i a, aint16 card = F3) -{ - const m256i _card = _mm256_set1_epi16(card); - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_setzero_si256(); - - m256i cmp = _mm256_cmpgt_epi16(_a, _b); - - return _mm256_sub_epi16(_mm256_and_si256(cmp, _card), _a); -} - -inline m256i mod_after_multiply(m256i a) -{ - const m256i mask = _mm256_set1_epi16(F3 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 1); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi16(hi, lo); - m256i _lo = _mm256_add_epi16(lo, _mm256_and_si256(F3_m256i_u16, cmp)); - - return _mm256_sub_epi16(_lo, hi); -} - -inline m256i mul(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi16(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi16(_a, F3minus1_m256i_u16), - _mm256_cmpeq_epi16(_b, F3minus1_m256i_u16)); - - const m256i one = _mm256_set1_epi16(1); - c = _mm256_add_epi16(c, _mm256_and_si256(one, cmp)); - - // Modulo - return mod_after_multiply(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 - */ -inline m256i mul(m256i a, m256i b, aint16 card) -{ - // FIXME: generalize card - assert(card == F3); - return mul(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint16* buf, aint16 card = F3) -{ - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint16 a, - aint16* src, - aint16* dest, - size_t len, - aint16 card = F3) -{ - const m256i coef = _mm256_set1_epi16(a); - - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint32_t coef_doubled = (uint32_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint16)((coef_doubled * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint16 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint16* bufa, - aint16* bufb, - aint16* res, - size_t len, - aint16 card = F3) -{ - m256i* _bufa = reinterpret_cast(bufa); - m256i* _bufb = reinterpret_cast(bufb); - m256i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint16* src, aint16* dest, size_t len, aint16 card = F3) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = (uint32_t(src[i]) * dest[i]) % card; - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m256i _coef = _mm256_set1_epi16(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint16_t coef, - aint16* buf1, - aint16* buf2, - size_t len, - uint16_t card = F3) -{ - const m256i _coef = _mm256_set1_epi16(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = _buf1[i]; - m256i b = _buf2[i]; - m256i c = sub(a, b, card); - _buf1[i] = add(a, b, card); - _buf2[i] = mul(_coef, c, card); - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint16_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m256i _threshold = _mm256_set1_epi16(threshold); - uint16_t max = 1 << (sizeof(uint16_t) * 8 - 1); - const m256i mask_hi = _mm256_set1_epi16(max); - const unsigned element_size = sizeof(uint16_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint16_t* chunk = output.get(frag_id); - m256i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m256i a = _mm256_load_si256(&(buf[vec_id])); - const m256i b = _mm256_cmpeq_epi16(_threshold, a); - const m256i c = _mm256_and_si256(mask_hi, b); - uint32_t d = _mm256_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif diff --git a/src/simd_256_u32.h b/src/simd_256_u32.h deleted file mode 100644 index 9c76c89b..00000000 --- a/src/simd_256_u32.h +++ /dev/null 
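/*
 * Scalar reference (illustrative only, not from the patch) of the
 * Cooley-Tukey and Gentleman-Sande butterflies that the SIMD helpers above
 * compute lane-wise over GF(card), following the formulas in their comments.
 */
#include <cstddef>
#include <cstdint>

static inline void butterfly_ct_scalar(
    uint32_t coef, uint32_t* x, uint32_t* y, size_t len, uint32_t card)
{
    for (size_t i = 0; i < len; ++i) {
        const uint32_t a = static_cast<uint64_t>(coef) * y[i] % card;
        const uint32_t u = x[i];
        x[i] = (u + a) % card;                  // x <- x + coef * y
        y[i] = (u >= a) ? u - a : card + u - a; // y <- x - coef * y
    }
}

static inline void butterfly_gs_scalar(
    uint32_t coef, uint32_t* x, uint32_t* y, size_t len, uint32_t card)
{
    for (size_t i = 0; i < len; ++i) {
        const uint32_t u = x[i];
        const uint32_t v = y[i];
        const uint32_t d = (u >= v) ? u - v : card + u - v;
        x[i] = (u + v) % card;                           // x <- x + y
        y[i] = static_cast<uint64_t>(coef) * d % card;   // y <- coef * (x - y)
    }
}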
@@ -1,466 +0,0 @@ -/* - * Copyright 2017-2018 Scality - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright notice, - * this list of conditions and the following disclaimer in the documentation - * and/or other materials provided with the distribution. - * - * 3. Neither the name of the copyright holder nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE - * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __QUAD_SIMD_256_U32_H__ -#define __QUAD_SIMD_256_U32_H__ - -#include - -#include "simd/simd.h" - -namespace quadiron { -namespace simd { - -/* ==================== Essential Operations =================== */ - -/** Perform a%card where a is a addition of two numbers whose elements are - * symbols of GF(card) */ -inline m256i mod_after_add(m256i a, aint32 card) -{ - const m256i _card = _mm256_set1_epi32(card); - const m256i _card_minus_1 = _mm256_set1_epi32(card - 1); - - m256i cmp = _mm256_cmpgt_epi32(a, _card_minus_1); - m256i b = _mm256_sub_epi32(a, _mm256_and_si256(_card, cmp)); - - return b; -} - -/** Perform addition of two numbers a, b whose elements are of GF(card) */ -inline m256i add(m256i a, m256i b, aint32 card) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - m256i c = _mm256_add_epi32(_a, _b); - - // Modulo - return mod_after_add(c, card); -} - -/** Perform subtraction of a by b where a, b whose elements are symbols of - * GF(card) - * sub(a, b) = a - b if a >= b, or - * card + a - b, otherwise - */ -inline m256i sub(m256i a, m256i b, aint32 card) -{ - const m256i _card = _mm256_set1_epi32(card); - - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i cmp = _mm256_cmpgt_epi32(_b, _a); - m256i _a1 = _mm256_add_epi32(_a, _mm256_and_si256(_card, cmp)); - - return _mm256_sub_epi32(_a1, _b); -} - -/** Negate `a` - * @return 0 if (a == 0), else card - a - */ -inline m256i neg(m256i a, aint32 card = F4) -{ - const m256i _card = _mm256_set1_epi32(card); - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_setzero_si256(); - - m256i cmp = _mm256_cmpgt_epi32(_a, _b); - - return _mm256_sub_epi32(_mm256_and_si256(cmp, _card), _a); -} - -/** Perform a%card where a is a multiplication of two numbers whose elements are - * symbols of GF(F4) - * - * We find v in a = u * card + v - * a is expressed also as: a = hi * (card-1) + lo - * where hi and lo is 16-bit for F4 
(or 8-bit for F3) high and low parts of a - * hence, v = (lo - hi) % F4 - * v = lo - hi, if lo >= hi - * or - * F4 + lo - hi, otherwise - */ -inline m256i mod_after_multiply_f4(m256i a) -{ - const m256i mask = _mm256_set1_epi32(F4 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 2); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi32(hi, lo); - m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F4_m256i, cmp)); - - return _mm256_sub_epi32(_lo, hi); -} - -inline m256i mod_after_multiply_f3(m256i a) -{ - const m256i mask = _mm256_set1_epi32(F3 - 2); - - m256i lo = _mm256_and_si256(a, mask); - - m256i a_shift = _mm256_srli_si256(a, 1); - m256i hi = _mm256_and_si256(a_shift, mask); - - m256i cmp = _mm256_cmpgt_epi32(hi, lo); - m256i _lo = _mm256_add_epi32(lo, _mm256_and_si256(F3_m256i, cmp)); - - return _mm256_sub_epi32(_lo, hi); -} - -inline m256i mul_f4(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi32(_a, F4minus1_m256i), - _mm256_cmpeq_epi32(_b, F4minus1_m256i)); - - const m256i one = _mm256_set1_epi32(1); - c = _mm256_add_epi32(c, _mm256_and_si256(one, cmp)); - - // Modulo - return mod_after_multiply_f4(c); -} - -inline m256i mul_f3(m256i a, m256i b) -{ - m256i _a = _mm256_load_si256(&a); - m256i _b = _mm256_load_si256(&b); - - m256i c = _mm256_mullo_epi32(_a, _b); - - // filter elements of both of a & b = card-1 - m256i cmp = _mm256_and_si256( - _mm256_cmpeq_epi32(_a, F3minus1_m256i), - _mm256_cmpeq_epi32(_b, F3minus1_m256i)); - - c = _mm256_xor_si256(c, _mm256_and_si256(F4_m256i, cmp)); - - // Modulo - return mod_after_multiply_f3(c); -} - -/** Perform multiplication of two numbers a, b whose elements are of GF(card) - * where `card` is a prime Fermat number, i.e. 
card = Fx with x < 5 - * Currently, it supports only for F3 and F4 - */ -inline m256i mul(m256i a, m256i b, aint32 card) -{ - assert(card == F4 || card == F3); - if (card == F4) - return mul_f4(a, b); - return mul_f3(a, b); -} - -/** Apply an element-wise negation to a buffer - */ -inline void neg(size_t len, aint32* buf, aint32 card = F4) -{ - m256i* _buf = reinterpret_cast(buf); - unsigned ratio = sizeof(*_buf) / sizeof(*buf); - size_t _len = len / ratio; - size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - _buf[i] = neg(_buf[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - if (buf[i]) - buf[i] = card - buf[i]; - } - } -} - -/** Perform a multiplication of a coefficient `a` to each element of `src` and - * add result to correspondent element of `dest` - */ -inline void mul_coef_to_buf( - const aint32 a, - aint32* src, - aint32* dest, - size_t len, - aint32 card = F4) -{ - const m256i coef = _mm256_set1_epi32(a); - - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplication - _dest[i] = mul(coef, _src[i], card); - } - if (_last_len > 0) { - uint64_t coef_64 = (uint64_t)a; - for (i = _len * ratio; i < len; i++) { - // perform multiplication - dest[i] = (aint32)((coef_64 * src[i]) % card); - } - } -} - -inline void -add_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform addition - _dest[i] = add(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform addition - aint32 tmp = src[i] + dest[i]; - dest[i] = (tmp >= card) ? 
(tmp - card) : tmp; - } - } -} - -inline void sub_two_bufs( - aint32* bufa, - aint32* bufb, - aint32* res, - size_t len, - aint32 card = F4) -{ - m256i* _bufa = reinterpret_cast(bufa); - m256i* _bufb = reinterpret_cast(bufb); - m256i* _res = reinterpret_cast(res); - const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform subtraction - _res[i] = sub(_bufa[i], _bufb[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform subtraction - if (bufa[i] >= bufb[i]) - res[i] = bufa[i] - bufb[i]; - else - res[i] = card - (bufb[i] - bufa[i]); - } - } -} - -inline void -mul_two_bufs(aint32* src, aint32* dest, size_t len, aint32 card = F4) -{ - m256i* _src = reinterpret_cast(src); - m256i* _dest = reinterpret_cast(dest); - const unsigned ratio = sizeof(*_src) / sizeof(*src); - const size_t _len = len / ratio; - const size_t _last_len = len - _len * ratio; - - size_t i; - for (i = 0; i < _len; i++) { - // perform multiplicaton - _dest[i] = mul(_src[i], _dest[i], card); - } - if (_last_len > 0) { - for (i = _len * ratio; i < len; i++) { - // perform multiplicaton - dest[i] = uint32_t((uint64_t(src[i]) * dest[i]) % card); - } - } -} - -/* - * buf1[i] = buf1[i] + coef * buf2[i] - * buf2[i] = buf1[i] - coef * buf2[i] - */ -inline void butterfly_ct( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m256i _coef = _mm256_set1_epi32(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = mul(_coef, _buf2[i], card); - _buf2[i] = sub(_buf1[i], a, card); - _buf1[i] = add(_buf1[i], a, card); - } -} - -/* - * buf1[i] = buf1[i] + buf2[i] - * buf2[i] = coef * (buf1[i] - buf2[i]) - */ -inline void butterfly_gs( - uint32_t coef, - aint32* buf1, - aint32* buf2, - size_t len, - uint32_t card = F4) -{ - const m256i _coef = _mm256_set1_epi32(coef); - m256i* _buf1 = reinterpret_cast(buf1); - m256i* _buf2 = reinterpret_cast(buf2); - - for (size_t i = 0; i < len; ++i) { - m256i a = add(_buf1[i], _buf2[i], card); - _buf2[i] = mul(_coef, sub(_buf1[i], _buf2[i], card), card); - _buf1[i] = a; - } -} - -inline void encode_post_process( - vec::Buffers& output, - std::vector& props, - off_t offset, - unsigned code_len, - uint32_t threshold, - size_t vecs_nb) -{ - const unsigned vec_size = simd::countof(); - - const m256i _threshold = _mm256_set1_epi32(threshold); - const uint32_t max = 1 << (sizeof(uint32_t) * 8 - 1); - const m256i mask_hi = _mm256_set1_epi32(max); - const unsigned element_size = sizeof(uint32_t); - - for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { - uint32_t* chunk = output.get(frag_id); - m256i* buf = reinterpret_cast(chunk); - for (unsigned vec_id = 0; vec_id < vecs_nb; ++vec_id) { - const m256i a = _mm256_load_si256(&(buf[vec_id])); - const m256i b = _mm256_cmpeq_epi32(_threshold, a); - const m256i c = _mm256_and_si256(mask_hi, b); - uint32_t d = _mm256_movemask_epi8(c); - - while (d > 0) { - unsigned byte_idx = __builtin_ctz(d); - unsigned element_idx = byte_idx / element_size; - off_t _offset = offset + vec_id * vec_size + element_idx; - props[frag_id].add(_offset, 1); - d ^= 1 << byte_idx; - } - } - } -} - -/* ==================== Operations for NF4 =================== */ -typedef __m128i m128i; - -/** Return aint128 integer from a _m128i register */ -inline aint128 m256i_to_uint128(m256i v) -{ - 
aint128 hi, lo; - _mm256_storeu2_m128i((m128i*)&hi, (m128i*)&lo, v); - return lo; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) -} - -inline __uint128_t add(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = add(_a, _b, F4); - return m256i_to_uint128(res); -} - -inline __uint128_t sub(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = sub(_a, _b, F4); - return m256i_to_uint128(res); -} - -inline __uint128_t mul(__uint128_t a, __uint128_t b) -{ - m256i _a = _mm256_castsi128_si256((m128i)a); - m256i _b = _mm256_castsi128_si256((m128i)b); - m256i res = mul(_a, _b, F4); - return m256i_to_uint128(res); -} - -/** Store low 128-bit part of `reg` to memory */ -inline void store_low(aint128* address, m256i reg) -{ - _mm_store_si128((m128i*)address, _mm256_castsi256_si128(reg)); -} - -inline void hadamard_mul(int n, aint128* _x, aint128* _y) -{ - int i; - m256i* x = reinterpret_cast(_x); - m256i* y = reinterpret_cast(_y); - - const unsigned ratio = sizeof(*x) / sizeof(*_x); - const int len_256 = n / ratio; - const int last_len = n - len_256 * ratio; - - // multiply y to the first half of `x` - for (i = 0; i < len_256; i++) { - x[i] = mul(x[i], y[i], F4); - } - - if (last_len > 0) { - // add last _y[] to x - for (i = len_256 * ratio; i < n; i++) { - m256i _x_p = _mm256_castsi128_si256((m128i)_x[i]); - m256i _y_p = _mm256_castsi128_si256((m128i)_y[i]); - - store_low(_x + i, mul(_x_p, _y_p, F4)); - } - } -} - -} // namespace simd -} // namespace quadiron - -#endif From 120016fe548da96b4815661d73589a839e555672 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:03 +0200 Subject: [PATCH 06/13] SIMD Main file including necessary files 1. Essential operations - simd_128.h contains essential wrappers of SIMD operations on SSE - simd_256.h contains essential wrappers of SIMD operations on AVX 2. Basic operations - simd_basic.h contain basic operations used in following cases, and also operations for RingModN 3. 
Vectorized operations - simd_fnt.h contains vectorized operations dedicated for FNT - simd_nf4.h contains vectorized operations dedicated for nf4 --- src/simd.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/simd.h b/src/simd.h index 8309bfff..41e4935e 100644 --- a/src/simd.h +++ b/src/simd.h @@ -39,17 +39,11 @@ const unsigned F4 = 65537; const unsigned F3 = 257; -typedef uint8_t aint8 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint16_t aint16 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint32_t aint32 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef uint64_t aint64 __attribute__((aligned(quadiron::simd::ALIGNMENT))); -typedef __uint128_t aint128 __attribute__((aligned(quadiron::simd::ALIGNMENT))); - namespace quadiron { -/** The namespace simd contains functions for GF-NF4 that are accelerated by - * using SIMD operations over 128bits +/** The namespace simd contains functions accelerated by + * using SIMD operations over 128bits and 256bits * - * It supports operations on 32-bit numbers + * It supports operations on 16-bit and 32-bit numbers */ namespace simd { @@ -58,12 +52,20 @@ namespace simd { } // namespace simd } // namespace quadiron +// Include essential operations that use SIMD functions #if defined(__AVX2__) #include "simd_256.h" #elif defined(__SSE4_1__) #include "simd_128.h" #endif +// Include basic operations +#include "simd_basic.h" + +// Include accelerated operations dedicated for FNT +#include "simd_fnt.h" + +// Include accelerated operations dedicated for NF4 #include "simd_nf4.h" #endif // #ifdef QUADIRON_USE_SIMD From cfc241b4c2f26f082d752c99f46be4e1ba29b87e Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:10 +0200 Subject: [PATCH 07/13] SIMD 128: essential operations for SSE --- src/simd_128.h | 160 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 149 insertions(+), 11 deletions(-) diff --git a/src/simd_128.h b/src/simd_128.h index 0f6b8857..856588ec 100644 --- a/src/simd_128.h +++ b/src/simd_128.h @@ -33,19 +33,157 @@ #include -typedef __m128i m128i; +namespace quadiron { +namespace simd { -// Disable `cert-err58-cpp` on these: AFAIK they cannot throw. -// (probably a false positive present in Clang 5 and fixed in Clang 6). 
-const m128i F4_m128i = _mm_set1_epi32(65537); // NOLINT(cert-err58-cpp) -const m128i F4minus1_m128i = _mm_set1_epi32(65536); // NOLINT(cert-err58-cpp) -const m128i F3_m128i = _mm_set1_epi32(257); // NOLINT(cert-err58-cpp) -const m128i F3minus1_m128i = _mm_set1_epi32(256); // NOLINT(cert-err58-cpp) +typedef __m128i VecType; -const m128i F3_m128i_u16 = _mm_set1_epi16(257); // NOLINT(cert-err58-cpp) -const m128i F3minus1_m128i_u16 = _mm_set1_epi16(256); // NOLINT(cert-err58-cpp) +/* ============= Constant variable ============ */ -#include "simd_128_u16.h" -#include "simd_128_u32.h" +// @note: using const leads to an lint error of initialization of 'variable' +// with static storage duration may throw an exception that cannot be caught + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F4_U32 = _mm_set1_epi32(65537); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F4_MINUS_ONE_U32 = _mm_set1_epi32(65536); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_U32 = _mm_set1_epi32(257); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_MINUS_ONE_U32 = _mm_set1_epi32(256); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_U16 = _mm_set1_epi16(257); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_MINUS_ONE_U16 = _mm_set1_epi16(256); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ZERO = _mm_setzero_si128(); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ONE_U16 = _mm_set1_epi16(1); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ONE_U32 = _mm_set1_epi32(1); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType MASK8_LO = _mm_set1_epi16(0x80); + +/* ============= Essential Operations for SSE w/ both u16 & u32 ============ */ + +inline VecType load_to_reg(VecType* address) +{ + return _mm_load_si128(address); +} +inline void store_to_mem(VecType* address, VecType reg) +{ + _mm_store_si128(address, reg); +} + +inline VecType bit_and(VecType x, VecType y) +{ + return _mm_and_si128(x, y); +} +inline VecType bit_xor(VecType x, VecType y) +{ + return _mm_xor_si128(x, y); +} +inline uint16_t msb8_mask(VecType x) +{ + return _mm_movemask_epi8(x); +} +inline bool and_is_zero(VecType x, VecType y) +{ + return _mm_testz_si128(x, y); +} +inline bool is_zero(VecType x) +{ + return _mm_testc_si128(ZERO, x); +} + +#define SHIFTR(x, imm8) (_mm_srli_si128(x, imm8)) +#define BLEND8(x, y, mask) (_mm_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm_blend_epi16(x, y, imm8)) + +/* ================= Essential Operations for SSE ================= */ + +template +inline VecType set_one(T val); +template <> +inline VecType set_one(uint32_t val) +{ + return _mm_set1_epi32(val); +} +template <> +inline VecType set_one(uint16_t val) +{ + return _mm_set1_epi16(val); +} + +template +inline VecType add(VecType x, VecType y); +template <> +inline VecType add(VecType x, VecType y) +{ + return _mm_add_epi32(x, y); +} +template <> +inline VecType add(VecType x, VecType y) +{ + return _mm_add_epi16(x, y); +} + +template +inline VecType sub(VecType x, VecType y); +template <> +inline VecType sub(VecType x, VecType y) +{ + return _mm_sub_epi32(x, y); +} +template <> +inline VecType sub(VecType x, VecType y) +{ + return _mm_sub_epi16(x, y); +} + +template +inline VecType mul(VecType x, VecType y); +template <> +inline VecType mul(VecType x, VecType y) +{ + return _mm_mullo_epi32(x, y); +} +template <> +inline VecType mul(VecType x, VecType y) +{ + return _mm_mullo_epi16(x, y); +} + +template +inline VecType compare_eq(VecType x, VecType y); +template <> +inline VecType compare_eq(VecType x, VecType y) +{ 
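/* Editorial note (sketch, not part of the patch): simd_128.h and simd_256.h
 * deliberately expose the same wrapper names (load_to_reg, store_to_mem,
 * set_one<T>, add<T>, sub<T>, mul<T>, compare_eq<T>, min<T>, ...) over their
 * own VecType, so the generic layers simd_basic.h and simd_fnt.h compile
 * unchanged for SSE4.1 and AVX2; only the register width differs. A typical
 * call chain, assuming a buffer `p`, `q` of element type T and modulus `card`:
 *     VecType x = load_to_reg(p);
 *     VecType y = load_to_reg(q);
 *     store_to_mem(p, mod_add<T>(x, y, card));   // mod_add from simd_basic.h
 */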
+ return _mm_cmpeq_epi32(x, y); +} +template <> +inline VecType compare_eq(VecType x, VecType y) +{ + return _mm_cmpeq_epi16(x, y); +} + +template +inline VecType min(VecType x, VecType y); +template <> +inline VecType min(VecType x, VecType y) +{ + return _mm_min_epu32(x, y); +} +template <> +inline VecType min(VecType x, VecType y) +{ + return _mm_min_epu16(x, y); +} + +} // namespace simd +} // namespace quadiron #endif From a6cd1f4ab8ce8cd1fb66bcf7cd71c5cec9b787ca Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:16 +0200 Subject: [PATCH 08/13] SIMD 256: essential operations for AVX --- src/simd_256.h | 168 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 153 insertions(+), 15 deletions(-) diff --git a/src/simd_256.h b/src/simd_256.h index 2dc49cc4..8084d95e 100644 --- a/src/simd_256.h +++ b/src/simd_256.h @@ -33,19 +33,6 @@ #include -typedef __m256i m256i; - -// Disable `cert-err58-cpp` on these: AFAIK they cannot throw. -// (probably a false positive present in Clang 5 and fixed in Clang 6). -const m256i F4_m256i = _mm256_set1_epi32(65537); // NOLINT(cert-err58-cpp) -const m256i F4minus1_m256i = _mm256_set1_epi32(65536); // NOLINT(cert-err58-cpp) -const m256i F3_m256i = _mm256_set1_epi32(257); // NOLINT(cert-err58-cpp) -const m256i F3minus1_m256i = _mm256_set1_epi32(256); // NOLINT(cert-err58-cpp) - -const m256i F3_m256i_u16 = _mm256_set1_epi16(257); // NOLINT(cert-err58-cpp) -// NOLINTNEXTLINE(cert-err58-cpp) -const m256i F3minus1_m256i_u16 = _mm256_set1_epi16(256); - /* GCC doesn't include the split store intrinsics so define them here. */ #if defined(__GNUC__) && !defined(__clang__) @@ -58,7 +45,158 @@ _mm256_storeu2_m128i(__m128i* const hi, __m128i* const lo, const __m256i a) #endif /* defined(__GNUC__) */ -#include "simd_256_u16.h" -#include "simd_256_u32.h" +namespace quadiron { +namespace simd { + +typedef __m256i VecType; +typedef __m128i HalfVecType; + +/* ============= Constant variable ============ */ + +// @note: using const leads to an lint error of initialization of 'variable' +// with static storage duration may throw an exception that cannot be caught + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F4_U32 = _mm256_set1_epi32(65537); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F4_MINUS_ONE_U32 = _mm256_set1_epi32(65536); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_U32 = _mm256_set1_epi32(257); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_MINUS_ONE_U32 = _mm256_set1_epi32(256); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_U16 = _mm256_set1_epi16(257); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType F3_MINUS_ONE_U16 = _mm256_set1_epi16(256); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ZERO = _mm256_setzero_si256(); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ONE_U16 = _mm256_set1_epi16(1); +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType ONE_U32 = _mm256_set1_epi32(1); + +// NOLINTNEXTLINE(cert-err58-cpp) +const VecType MASK8_LO = _mm256_set1_epi16(0x80); + +/* ============= Essential Operations for AVX2 w/ both u16 & u32 ============ */ + +inline VecType load_to_reg(VecType* address) +{ + return _mm256_load_si256(address); +} +inline void store_to_mem(VecType* address, VecType reg) +{ + _mm256_store_si256(address, reg); +} + +inline VecType bit_and(VecType x, VecType y) +{ + return _mm256_and_si256(x, y); +} +inline VecType bit_xor(VecType x, VecType y) +{ + return _mm256_xor_si256(x, y); +} +inline uint32_t msb8_mask(VecType x) +{ + return _mm256_movemask_epi8(x); +} +inline 
bool and_is_zero(VecType x, VecType y) +{ + return _mm256_testz_si256(x, y); +} +inline bool is_zero(VecType x) +{ + return _mm256_testc_si256(ZERO, x); +} + +#define SHIFTR(x, imm8) (_mm256_srli_si256(x, imm8)) +#define BLEND8(x, y, mask) (_mm256_blendv_epi8(x, y, mask)) +#define BLEND16(x, y, imm8) (_mm256_blend_epi16(x, y, imm8)) + +/* ================= Essential Operations for AVX2 ================= */ + +template +inline VecType set_one(T val); +template <> +inline VecType set_one(uint32_t val) +{ + return _mm256_set1_epi32(val); +} +template <> +inline VecType set_one(uint16_t val) +{ + return _mm256_set1_epi16(val); +} + +template +inline VecType add(VecType x, VecType y); +template <> +inline VecType add(VecType x, VecType y) +{ + return _mm256_add_epi32(x, y); +} +template <> +inline VecType add(VecType x, VecType y) +{ + return _mm256_add_epi16(x, y); +} + +template +inline VecType sub(VecType x, VecType y); +template <> +inline VecType sub(VecType x, VecType y) +{ + return _mm256_sub_epi32(x, y); +} +template <> +inline VecType sub(VecType x, VecType y) +{ + return _mm256_sub_epi16(x, y); +} + +template +inline VecType mul(VecType x, VecType y); +template <> +inline VecType mul(VecType x, VecType y) +{ + return _mm256_mullo_epi32(x, y); +} +template <> +inline VecType mul(VecType x, VecType y) +{ + return _mm256_mullo_epi16(x, y); +} + +template +inline VecType compare_eq(VecType x, VecType y); +template <> +inline VecType compare_eq(VecType x, VecType y) +{ + return _mm256_cmpeq_epi32(x, y); +} +template <> +inline VecType compare_eq(VecType x, VecType y) +{ + return _mm256_cmpeq_epi16(x, y); +} + +template +inline VecType min(VecType x, VecType y); +template <> +inline VecType min(VecType x, VecType y) +{ + return _mm256_min_epu32(x, y); +} +template <> +inline VecType min(VecType x, VecType y) +{ + return _mm256_min_epu16(x, y); +} + +} // namespace simd +} // namespace quadiron #endif From 40379409bf93825936d9fd36988a7d68510a907e Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:21 +0200 Subject: [PATCH 09/13] SIMD Basic: includes basic Operations It implements basic operations that will be used everywhere. It includes also operations for RingModN --- src/simd_basic.h | 334 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 334 insertions(+) create mode 100644 src/simd_basic.h diff --git a/src/simd_basic.h b/src/simd_basic.h new file mode 100644 index 00000000..4fed13ca --- /dev/null +++ b/src/simd_basic.h @@ -0,0 +1,334 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __QUAD_SIMD_BASIC_H__ +#define __QUAD_SIMD_BASIC_H__ + +#include + +namespace quadiron { +namespace simd { + +template +inline VecType card(T q); +template <> +inline VecType card(uint16_t q) +{ + return F3_U16; +} +template <> +inline VecType card(uint32_t q) +{ + return (q == F3) ? F3_U32 : F4_U32; +} + +template +inline VecType card_minus_one(T q); +template <> +inline VecType card_minus_one(uint16_t q) +{ + return F3_MINUS_ONE_U16; +} +template <> +inline VecType card_minus_one(uint32_t q) +{ + return (q == F3) ? F3_MINUS_ONE_U32 : F4_MINUS_ONE_U32; +} + +template +inline VecType get_low_half(VecType x, T q) +{ + return (q == F3) ? BLEND8(ZERO, x, MASK8_LO) : BLEND16(ZERO, x, 0x55); +} + +template +inline VecType get_high_half(VecType x, T q) +{ + return (q == F3) ? BLEND8(ZERO, SHIFTR(x, 1), MASK8_LO) + : BLEND16(ZERO, SHIFTR(x, 2), 0x55); +} + +/* ================= Basic Operations ================= */ + +/** + * Modular addition + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x + y) mod q + */ +template +inline VecType mod_add(VecType x, VecType y, T q) +{ + const VecType res = add(x, y); + return min(res, sub(res, card(q))); +} + +/** + * Modular subtraction for packed unsigned 32-bit integers + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x - y) mod q + */ +template +inline VecType mod_sub(VecType x, VecType y, T q) +{ + const VecType res = sub(x, y); + return min(res, add(res, card(q))); +} + +/** + * Modular negation for packed unsigned 32-bit integers + * + * @param x input register + * @param q modulo + * @return (-x) mod q + */ +template +inline VecType mod_neg(VecType x, T q) +{ + const VecType res = sub(card(q), x); + return min(res, sub(res, card(q))); +} + +/** + * Modular multiplication for packed unsigned 32-bit integers + * + * @note We assume that at least `x` or `y` is less than `q-1` so it's + * not necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +template +inline VecType mod_mul(VecType x, VecType y, T q) +{ + const VecType res = mul(x, y); + const VecType lo = get_low_half(res, q); + const VecType hi = get_high_half(res, q); + return mod_sub(lo, hi, q); +} + +/** + * Modular general multiplication for packed unsigned 32-bit integers + * + * @note It's necessary to verify overflow on multiplying elements + * + * @param x input register + * @param y input register + * @param q modulo + * @return (x * y) mod q + */ +template +inline VecType mod_mul_safe(VecType x, VecType y, T q) +{ + const VecType res = mod_mul(x, y, q); + + // filter elements of both of a & b = card-1 + const VecType cmp = bit_and( + compare_eq(x, card_minus_one(q)), + compare_eq(y, card_minus_one(q))); + + if (is_zero(cmp)) { + return res; + } + return (q == F3) ? 
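/* Editorial note (sketch, not part of the patch): mod_mul exploits the
 * Fermat-prime structure of the modulus. For q = F4 = 2^16 + 1 we have
 * 2^16 == -1 (mod q), so a 32-bit lane product t reduces as
 *     t mod q == (t & 0xffff) - (t >> 16)   (mod q)
 * which is exactly what get_low_half/get_high_half + mod_sub compute (the
 * F3 = 2^8 + 1 case folds on bytes instead). The one input mod_mul cannot
 * handle is x == y == q - 1: the lane product wraps to 0, so the correction
 * in this return adds 1 (F4) or xors in 1 per lane (F3) exactly in the lanes
 * selected by `cmp`, giving the expected (q-1)^2 mod q == 1.
 */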
bit_xor(res, bit_and(F4_U32, cmp)) + : add(res, bit_and(ONE_U32, cmp)); +} + +/** + * Update property for a given register for packed unsigned 32-bit integers + * + * @param props properties bound to fragments + * @param threshold register storing max value in its elements + * @param mask a specific mask + * @param symb input register + * @param offset offset in the data fragments + * @param max a dummy variable + */ +template +inline void add_props( + Properties& props, + VecType threshold, + VecType mask, + VecType symb, + off_t offset, + T max) +{ + const VecType b = compare_eq(threshold, symb); + const VecType c = bit_and(mask, b); + auto d = msb8_mask(c); + const unsigned element_size = sizeof(T); + while (d > 0) { + const unsigned byte_idx = __builtin_ctz(d); + const size_t _offset = offset + byte_idx / element_size; + props.add(_offset, OOR_MARK); + d ^= 1 << byte_idx; + } +} + +/* ==================== Operations for RingModN =================== */ +/** Perform a multiplication of a coefficient `a` to each element of `src` and + * add result to correspondent element of `dest` + * + * @note: 1 < `a` < card - 1 + */ +template +inline void mul_coef_to_buf(const T a, T* src, T* dest, size_t len, T card) +{ + const VecType coef = set_one(a); + + VecType* _src = reinterpret_cast(src); + VecType* _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i = 0; + const size_t end = (_len > 3) ? _len - 3 : 0; + for (; i < end; i += 4) { + _dest[i] = mod_mul(coef, _src[i], card); + _dest[i + 1] = mod_mul(coef, _src[i + 1], card); + _dest[i + 2] = mod_mul(coef, _src[i + 2], card); + _dest[i + 3] = mod_mul(coef, _src[i + 3], card); + } + for (; i < _len; ++i) { + _dest[i] = mod_mul(coef, _src[i], card); + } + + if (_last_len > 0) { + const DoubleSizeVal coef_double = DoubleSizeVal(a); + for (size_t i = _len * ratio; i < len; i++) { + dest[i] = (T)((coef_double * src[i]) % card); + } + } +} + +template +inline void add_two_bufs(T* src, T* dest, size_t len, T card) +{ + VecType* _src = reinterpret_cast(src); + VecType* _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + _dest[i] = mod_add(_src[i], _dest[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + const T tmp = src[i] + dest[i]; + dest[i] = (tmp >= card) ? 
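/* Editorial note (sketch, not part of the patch): all RingModN buffer
 * routines here follow the same split:
 *     ratio    = sizeof(VecType) / sizeof(T);   // elements per register
 *     _len     = len / ratio;                   // full registers, SIMD path
 *     _last_len = len - _len * ratio;           // leftover, scalar path
 * The scalar tail (as in this ternary) mirrors the modular arithmetic of the
 * vectorized body, so results do not depend on the buffer length.
 */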
(tmp - card) : tmp; + } + } +} + +template +inline void sub_two_bufs(T* bufa, T* bufb, T* res, size_t len, T card) +{ + VecType* _bufa = reinterpret_cast(bufa); + VecType* _bufb = reinterpret_cast(bufb); + VecType* _res = reinterpret_cast(res); + const unsigned ratio = sizeof(*_bufa) / sizeof(*bufa); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + // perform subtraction + _res[i] = mod_sub(_bufa[i], _bufb[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + // perform subtraction + if (bufa[i] >= bufb[i]) { + res[i] = bufa[i] - bufb[i]; + } else { + res[i] = card - (bufb[i] - bufa[i]); + } + } + } +} + +template +inline void mul_two_bufs(T* src, T* dest, size_t len, T card) +{ + VecType* _src = reinterpret_cast(src); + VecType* _dest = reinterpret_cast(dest); + const unsigned ratio = sizeof(*_src) / sizeof(*src); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + // perform multiplicaton + _dest[i] = mod_mul_safe(_src[i], _dest[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + // perform multiplicaton + dest[i] = T((DoubleSizeVal(src[i]) * dest[i]) % card); + } + } +} + +/** Apply an element-wise negation to a buffer + */ +template +inline void neg(size_t len, T* buf, T card) +{ + VecType* _buf = reinterpret_cast(buf); + const unsigned ratio = sizeof(*_buf) / sizeof(*buf); + const size_t _len = len / ratio; + const size_t _last_len = len - _len * ratio; + + size_t i; + for (i = 0; i < _len; i++) { + _buf[i] = mod_neg(_buf[i], card); + } + if (_last_len > 0) { + for (i = _len * ratio; i < len; i++) { + if (buf[i]) + buf[i] = card - buf[i]; + } + } +} + +} // namespace simd +} // namespace quadiron + +#endif From 07bf74d688a88901bb5ca0df55096d3e7f9fca5d Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:27 +0200 Subject: [PATCH 10/13] SIMD NF4 contains vectorized operations for NF4 --- src/simd_nf4.h | 314 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 263 insertions(+), 51 deletions(-) diff --git a/src/simd_nf4.h b/src/simd_nf4.h index 7d517430..79c62907 100644 --- a/src/simd_nf4.h +++ b/src/simd_nf4.h @@ -38,37 +38,35 @@ namespace quadiron { namespace simd { -#ifdef __AVX2__ -typedef __m128i m128i; +typedef uint32_t aint32 __attribute__((aligned(ALIGNMENT))); -/** Return aint128 integer from a _m128i register */ -static inline aint128 m128i_to_uint128(m128i v) +/** Return __uint128_t integer from a _m128i register */ +static inline __uint128_t m128i_to_uint128(__m128i v) { - aint128 i; - _mm_store_si128((m128i*)&i, v); + __uint128_t i; + _mm_store_si128((__m128i*)&i, v); return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn) } -#endif // #ifdef __AVX2__ -inline aint128 expand16(uint16_t* arr, int n) +inline __uint128_t expand16(uint16_t* arr, int n) { // since n <= 4 uint16_t _arr[4] __attribute__((aligned(ALIGNMENT))) = {0, 0, 0, 0}; std::copy_n(arr, n, _arr); - m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]); + __m128i b = _mm_set_epi16(0, 0, 0, 0, _arr[3], _arr[2], _arr[1], _arr[0]); return m128i_to_uint128(b); } -inline aint128 expand32(uint32_t* arr, int n) +inline __uint128_t expand32(uint32_t* arr, int n) { // since n <= 4 uint32_t _arr[4] __attribute__((aligned(simd::ALIGNMENT))) = {0, 0, 0, 0}; std::copy_n(arr, n, _arr); - m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]); + __m128i 
b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]); return m128i_to_uint128(b); } @@ -76,23 +74,23 @@ inline aint128 expand32(uint32_t* arr, int n) inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) { uint16_t ai[8]; - aint128 values; - - m128i _a = _mm_loadu_si128((m128i*)&a); - ai[0] = _mm_extract_epi16(_a, 0); - ai[1] = _mm_extract_epi16(_a, 1); - ai[2] = _mm_extract_epi16(_a, 2); - ai[3] = _mm_extract_epi16(_a, 3); - ai[4] = _mm_extract_epi16(_a, 4); - ai[5] = _mm_extract_epi16(_a, 5); - ai[6] = _mm_extract_epi16(_a, 6); - ai[7] = _mm_extract_epi16(_a, 7); + __uint128_t values; + + __m128i vec_a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&a)); + ai[0] = _mm_extract_epi16(vec_a, 0); + ai[1] = _mm_extract_epi16(vec_a, 1); + ai[2] = _mm_extract_epi16(vec_a, 2); + ai[3] = _mm_extract_epi16(vec_a, 3); + ai[4] = _mm_extract_epi16(vec_a, 4); + ai[5] = _mm_extract_epi16(vec_a, 5); + ai[6] = _mm_extract_epi16(vec_a, 6); + ai[7] = _mm_extract_epi16(vec_a, 7); const uint32_t flag = ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); - m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); - _mm_store_si128((m128i*)&values, val); + __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); + _mm_store_si128((__m128i*)&values, val); GroupedValues<__uint128_t> b = {values, flag}; @@ -102,70 +100,284 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n) { uint16_t ai[8]; - aint128 values; - - m128i _a = _mm_loadu_si128((m128i*)&a); - ai[0] = _mm_extract_epi16(_a, 0); - ai[1] = _mm_extract_epi16(_a, 1); - ai[2] = _mm_extract_epi16(_a, 2); - ai[3] = _mm_extract_epi16(_a, 3); - ai[4] = _mm_extract_epi16(_a, 4); - ai[5] = _mm_extract_epi16(_a, 5); - ai[6] = _mm_extract_epi16(_a, 6); - ai[7] = _mm_extract_epi16(_a, 7); + __uint128_t values; + + __m128i vec_a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&a)); + ai[0] = _mm_extract_epi16(vec_a, 0); + ai[1] = _mm_extract_epi16(vec_a, 1); + ai[2] = _mm_extract_epi16(vec_a, 2); + ai[3] = _mm_extract_epi16(vec_a, 3); + ai[4] = _mm_extract_epi16(vec_a, 4); + ai[5] = _mm_extract_epi16(vec_a, 5); + ai[6] = _mm_extract_epi16(vec_a, 6); + ai[7] = _mm_extract_epi16(vec_a, 7); const uint32_t flag = ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u); - m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); - _mm_store_si128((m128i*)&values, val); + __m128i val = _mm_set_epi16(0, 0, 0, 0, ai[6], ai[4], ai[2], ai[0]); + _mm_store_si128((__m128i*)&values, val); b.flag = flag; b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign) } -inline aint128 pack(__uint128_t a) +inline __uint128_t pack(__uint128_t a) { - m128i _a = _mm_loadu_si128((m128i*)&a); - m128i b = _mm_set_epi32( - _mm_extract_epi16(_a, 3), - _mm_extract_epi16(_a, 2), - _mm_extract_epi16(_a, 1), - _mm_extract_epi16(_a, 0)); + __m128i vec_a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&a)); + __m128i b = _mm_set_epi32( + _mm_extract_epi16(vec_a, 3), + _mm_extract_epi16(vec_a, 2), + _mm_extract_epi16(vec_a, 1), + _mm_extract_epi16(vec_a, 0)); return m128i_to_uint128(b); } -inline aint128 pack(__uint128_t a, uint32_t flag) +inline __uint128_t pack(__uint128_t a, uint32_t flag) { aint32 b0, b1, b2, b3; - m128i _a = _mm_loadu_si128((m128i*)&a); + __m128i vec_a = _mm_loadu_si128(reinterpret_cast<__m128i*>(&a)); if (flag & 1) b0 = 65536; else - b0 = _mm_extract_epi16(_a, 0); + b0 = _mm_extract_epi16(vec_a, 0); flag >>= 1; if (flag & 1) b1 = 65536; 
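/* Editorial note (sketch from reading the code, not part of the patch): NF4
 * keeps four residues mod F4 = 65537 in the 32-bit lanes of a 128-bit word.
 * The single value that does not fit in 16 bits is 65536 (= q - 1); unpack()
 * records such lanes in a 4-bit `flag`, and this pack() overload writes
 * 65536 back into exactly those lanes while the others are rebuilt from
 * their low 16 bits.
 */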
else - b1 = _mm_extract_epi16(_a, 1); + b1 = _mm_extract_epi16(vec_a, 1); flag >>= 1; if (flag & 1) b2 = 65536; else - b2 = _mm_extract_epi16(_a, 2); + b2 = _mm_extract_epi16(vec_a, 2); flag >>= 1; if (flag & 1) b3 = 65536; else - b3 = _mm_extract_epi16(_a, 3); + b3 = _mm_extract_epi16(vec_a, 3); - m128i b = _mm_set_epi32(b3, b2, b1, b0); + __m128i b = _mm_set_epi32(b3, b2, b1, b0); return m128i_to_uint128(b); } +/* ================= Basic operations for NF4 ================= */ + +#if defined(__AVX2__) + +inline VecType load_to_reg(HalfVecType x) +{ + return _mm256_castsi128_si256(_mm_load_si128(&x)); +} + +inline VecType load_to_reg(__uint128_t x) +{ + const HalfVecType* _x = reinterpret_cast(&x); + return load_to_reg(*_x); +} + +inline void store_low_half_to_mem(HalfVecType* address, VecType reg) +{ + _mm_store_si128(address, _mm256_castsi256_si128(reg)); +} + +inline __uint128_t add(__uint128_t a, __uint128_t b) +{ + HalfVecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_low_half_to_mem(&res, mod_add(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t sub(__uint128_t a, __uint128_t b) +{ + HalfVecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_low_half_to_mem(&res, mod_sub(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t mul(__uint128_t a, __uint128_t b) +{ + HalfVecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_low_half_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline void add_buf_to_two_bufs_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // add last _y[] to x and x_next + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _x_half = reinterpret_cast(x_half); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = load_to_reg(_x[i]); + VecType _x_next_p = load_to_reg(_x_half[i]); + VecType _y_p = load_to_reg(_y[i]); + + store_low_half_to_mem(_x + i, mod_add(_x_p, _y_p, F4)); + store_low_half_to_mem(_x_half + i, mod_add(_x_next_p, _y_p, F4)); + } +} + +inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) +{ + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = load_to_reg(_x[i]); + VecType _y_p = load_to_reg(_y[i]); + + store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4)); + } +} + +inline void hadamard_mul_doubled_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + HalfVecType* _x = reinterpret_cast(x); + HalfVecType* _x_half = reinterpret_cast(x_half); + HalfVecType* _y = reinterpret_cast(y); + for (unsigned i = 0; i < n; ++i) { + VecType _x_p = load_to_reg(_x[i]); + VecType _x_next_p = load_to_reg(_x_half[i]); + VecType _y_p = load_to_reg(_y[i]); + + store_low_half_to_mem(_x + i, mod_mul_safe(_x_p, _y_p, F4)); + store_low_half_to_mem(_x_half + i, mod_mul_safe(_x_next_p, _y_p, F4)); + } +} + +#elif defined(__SSE4_1__) + +inline VecType load_to_reg(__uint128_t x) +{ + const VecType* _x = reinterpret_cast(&x); + return _mm_load_si128(_x); +} + +inline __uint128_t add(__uint128_t a, __uint128_t b) +{ + VecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_to_mem(&res, mod_add(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t sub(__uint128_t a, __uint128_t 
b) +{ + VecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_to_mem(&res, mod_sub(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline __uint128_t mul(__uint128_t a, __uint128_t b) +{ + VecType res; + VecType vec_a = load_to_reg(a); + VecType vec_b = load_to_reg(b); + store_to_mem(&res, mod_mul_safe(vec_a, vec_b, F4)); + return reinterpret_cast<__uint128_t>(res); +} + +inline void add_buf_to_two_bufs_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // do nothing +} + +inline void hadamard_mul_rem(unsigned n, __uint128_t* x, __uint128_t* y) +{ + // do nothing +} + +inline void hadamard_mul_doubled_rem( + unsigned n, + __uint128_t* x, + __uint128_t* x_half, + __uint128_t* y) +{ + // do nothing +} + +#endif + +/* ==================== Operations for NF4 =================== */ + +/** Add buffer `y` to two halves of `x`. `x` is of length `n` */ +inline void add_buf_to_two_bufs(unsigned n, __uint128_t* _x, __uint128_t* _y) +{ + unsigned i; + VecType* x = reinterpret_cast(_x); + VecType* y = reinterpret_cast(_y); + + const unsigned ratio = sizeof(*x) / sizeof(*_x); + const unsigned half_len = n / 2; + const unsigned vec_len = half_len / ratio; + const unsigned num_len = vec_len * ratio; + const unsigned rem_len = half_len - num_len; + + __uint128_t* x_half = _x + half_len; + VecType* x_next = reinterpret_cast(x_half); + + // add y to the first half of `x` + for (i = 0; i < vec_len; ++i) { + x[i] = mod_add(x[i], y[i], F4); + } + + // add y to the second half of `x` + for (i = 0; i < vec_len; ++i) { + x_next[i] = mod_add(x_next[i], y[i], F4); + } + + if (rem_len > 0) { + add_buf_to_two_bufs_rem( + rem_len, _x + num_len, x_half + num_len, _y + num_len); + } +} + +inline void hadamard_mul(unsigned n, __uint128_t* _x, __uint128_t* _y) +{ + unsigned i; + VecType* x = reinterpret_cast(_x); + VecType* y = reinterpret_cast(_y); + + const unsigned ratio = sizeof(*x) / sizeof(*_x); + const unsigned vec_len = n / ratio; + const unsigned num_len = vec_len * ratio; + const unsigned rem_len = n - num_len; + + // multiply y to the first half of `x` + for (i = 0; i < vec_len; ++i) { + x[i] = mod_mul_safe(x[i], y[i], F4); + } + + if (rem_len > 0) { + // add last _y[] to x + hadamard_mul_rem(rem_len, _x + num_len, _y + num_len); + } +} + } // namespace simd } // namespace quadiron From fb0ea0b53cba0ffd5b9ccb49bf7d060cc75ce020 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Fri, 5 Oct 2018 13:44:34 +0200 Subject: [PATCH 11/13] SIMD FNT: vectorised operations for FNT --- src/fft_2n.h | 2 +- src/simd_fnt.h | 529 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 530 insertions(+), 1 deletion(-) create mode 100644 src/simd_fnt.h diff --git a/src/fft_2n.h b/src/fft_2n.h index 8a63bbb2..4ded5bb4 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -394,7 +394,7 @@ void Radix2::butterfly_ct_step( } /** - * Butterly CT on two-layers at a time + * Butterfly CT on two-layers at a time * * For each quadruple * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) diff --git a/src/simd_fnt.h b/src/simd_fnt.h new file mode 100644 index 00000000..8e23e1dd --- /dev/null +++ b/src/simd_fnt.h @@ -0,0 +1,529 @@ +/* + * Copyright 2017-2018 Scality + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __QUAD_SIMD_FNT_H__ +#define __QUAD_SIMD_FNT_H__ + +#include + +namespace quadiron { +namespace simd { + +/* ================= Vectorized Operations ================= */ + +/** + * Butterfly Cooley-Tukey operation + * + * x <- x + r * y + * y <- x - r * y + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param y working register + * @param q modular + */ +template +inline void butterfly_ct(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + VecType z = (rp1 == 2) ? *y : mod_mul(c, *y, q); + if (rp1 < q) { + *y = mod_sub(*x, z, q); + *x = mod_add(*x, z, q); + } else { // i.e. r == q - 1 + *y = mod_add(*x, z, q); + *x = mod_sub(*x, z, q); + } +} + +/** + * Butterfly Genteleman-Sande operation + * + * x <- x + y + * y <- r * (x - y) + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param y working register + * @param q modular + */ +template +inline void butterfly_gs(T rp1, VecType c, VecType* x, VecType* y, T q) +{ + VecType add = mod_add(*x, *y, q); + if (rp1 == 2) { + *y = mod_sub(*x, *y, q); + } else if (rp1 < q) { + VecType sub = mod_sub(*x, *y, q); + *y = mod_mul(c, sub, q); + } else { // i.e. r == q - 1 + *y = mod_sub(*y, *x, q); + } + *x = add; +} + +/** + * Butterfly Genteleman-Sande simple operation where y = 0 + * + * x <- x, i.e. 
no operation + * y <- r * x + * + * @param rp1 coefficient `r` plus one + * @param c a register stores coefficient `r` + * @param x working register + * @param q modular + * @return r * x + */ +template +inline VecType butterfly_simple_gs(T rp1, VecType c, VecType x, T q) +{ + if (rp1 == 2) { + return x; + } else if (rp1 < q) { + return mod_mul(c, x, q); + } else { + return mod_neg(x, q); + } +} + +/** + * Vectorized butterfly CT step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + r * Q + * Q = P - r * Q + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param step - next loop + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + unsigned step, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const T rp1 = r + 1; + VecType c = set_one(r); + + const size_t end = (len > 1) ? len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* p = reinterpret_cast(mem[i]); + VecType* q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 2) { + x1 = load_to_reg(p + j); + y1 = load_to_reg(q + j); + + butterfly_ct(rp1, c, &x1, &y1, card); + + x2 = load_to_reg(p + j + 1); + y2 = load_to_reg(q + j + 1); + + butterfly_ct(rp1, c, &x2, &y2, card); + + // Store back to memory + store_to_mem(p + j, x1); + store_to_mem(p + j + 1, x2); + store_to_mem(q + j, y1); + store_to_mem(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = load_to_reg(p + j); + y1 = load_to_reg(q + j); + + butterfly_ct(rp1, c, &x1, &y1, card); + + // Store back to memory + store_to_mem(p + j, x1); + store_to_mem(q + j, y1); + } + } +} + +template +inline static void do_butterfly_ct_2_layers( + const std::vector& mem, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + const T r1p1 = r1 + 1; + const T r2p1 = r2 + 1; + const T r3p1 = r3 + 1; + + VecType c1 = set_one(r1); + VecType c2 = set_one(r2); + VecType c3 = set_one(r3); + + VecType* p = reinterpret_cast(mem[start]); + VecType* q = reinterpret_cast(mem[start + m]); + VecType* r = reinterpret_cast(mem[start + 2 * m]); + VecType* s = reinterpret_cast(mem[start + 3 * m]); + + size_t j = 0; + const size_t end = (len > 1) ? 
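/* Editorial note (sketch, not part of the patch): per element this step is
 * the radix-2 Cooley-Tukey butterfly
 *     P' = (P + r * Q) mod card
 *     Q' = (P - r * Q) mod card
 * applied to every pair (P, Q) = (buf[i], buf[i + m]); the r == 1 and
 * r == card - 1 fast paths are handled inside butterfly_ct(). The loop below
 * is unrolled two registers per iteration; `end` is rounded down so the
 * unrolled body never reads past `len`, and the second loop takes the odd
 * remaining register.
 */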
len - 1 : 0; + while (j < end) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = load_to_reg(p); + VecType x2 = load_to_reg(p + 1); + VecType y1 = load_to_reg(q); + VecType y2 = load_to_reg(q + 1); + + butterfly_ct(r1p1, c1, &x1, &y1, card); + butterfly_ct(r1p1, c1, &x2, &y2, card); + + VecType u1 = load_to_reg(r); + VecType u2 = load_to_reg(r + 1); + VecType v1 = load_to_reg(s); + VecType v2 = load_to_reg(s + 1); + + butterfly_ct(r1p1, c1, &u1, &v1, card); + butterfly_ct(r1p1, c1, &u2, &v2, card); + + // Second layer (c2, x, u) & (c3, y, v) + butterfly_ct(r2p1, c2, &x1, &u1, card); + butterfly_ct(r2p1, c2, &x2, &u2, card); + + butterfly_ct(r3p1, c3, &y1, &v1, card); + butterfly_ct(r3p1, c3, &y2, &v2, card); + + // Store back to memory + store_to_mem(p, x1); + store_to_mem(p + 1, x2); + store_to_mem(q, y1); + store_to_mem(q + 1, y2); + + store_to_mem(r, u1); + store_to_mem(r + 1, u2); + store_to_mem(s, v1); + store_to_mem(s + 1, v2); + p = p + 2; + q = q + 2; + r = r + 2; + s = s + 2; + j = j + 2; + }; + + for (; j < len; ++j) { + // First layer (c1, x, y) & (c1, u, v) + VecType x1 = load_to_reg(p + j); + VecType y1 = load_to_reg(q + j); + VecType u1 = load_to_reg(r + j); + VecType v1 = load_to_reg(s + j); + + // BUTTERFLY_3_test(c1, &x1, &y1, &u1, &v1, card); + butterfly_ct(r1p1, c1, &x1, &y1, card); + butterfly_ct(r1p1, c1, &u1, &v1, card); + butterfly_ct(r2p1, c2, &x1, &u1, card); + butterfly_ct(r3p1, c3, &y1, &v1, card); + + // Store back to memory + store_to_mem(p + j, x1); + store_to_mem(q + j, y1); + store_to_mem(r + j, u1); + store_to_mem(s + j, v1); + } +} + +/** + * Vectorized butterfly CT on two-layers at a time + * + * For each quadruple + * (P, Q, R, S) = (buf[i], buf[i + m], buf[i + 2 * m], buf[i + 3 * m]) + * First layer: butterfly on (P, Q) and (R, S) for step = 2 * m + * coef r1 = W[start * n / (2 * m)] + * P = P + r1 * Q + * Q = P - r1 * Q + * R = R + r1 * S + * S = R - r1 * S + * Second layer: butterfly on (P, R) and (Q, S) for step = 4 * m + * coef r2 = W[start * n / (4 * m)] + * coef r3 = W[(start + m) * n / (4 * m)] + * P = P + r2 * R + * R = P - r2 * R + * Q = Q + r3 * S + * S = Q - r3 * S + * + * @param buf - working buffers + * @param r1 - coefficient for the 1st layer + * @param r2 - 1st coefficient for the 2nd layer + * @param r3 - 2nd coefficient for the 2nd layer + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_ct_two_layers_step( + vec::Buffers& buf, + T r1, + T r2, + T r3, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 2; + const unsigned bufs_nb = buf.get_n(); + + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + do_butterfly_ct_2_layers(mem, r1, r2, r3, i, m, len, card); + } +} + +/** + * Vectorized butterfly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * P = P + Q + * Q = r * (P - Q) + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c 
= set_one(r); + + const size_t end = (len > 3) ? len - 3 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, x2, x3, x4; + VecType y1, y2, y3, y4; + VecType* p = reinterpret_cast(mem[i]); + VecType* q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 4) { + x1 = load_to_reg(p + j); + x2 = load_to_reg(p + j + 1); + x3 = load_to_reg(p + j + 2); + x4 = load_to_reg(p + j + 3); + y1 = load_to_reg(q + j); + y2 = load_to_reg(q + j + 1); + y3 = load_to_reg(q + j + 2); + y4 = load_to_reg(q + j + 3); + + butterfly_gs(rp1, c, &x1, &y1, card); + butterfly_gs(rp1, c, &x2, &y2, card); + butterfly_gs(rp1, c, &x3, &y3, card); + butterfly_gs(rp1, c, &x4, &y4, card); + + // Store back to memory + store_to_mem(p + j, x1); + store_to_mem(p + j + 1, x2); + store_to_mem(p + j + 2, x3); + store_to_mem(p + j + 3, x4); + store_to_mem(q + j, y1); + store_to_mem(q + j + 1, y2); + store_to_mem(q + j + 2, y3); + store_to_mem(q + j + 3, y4); + } + for (; j < len; ++j) { + x1 = load_to_reg(p + j); + y1 = load_to_reg(q + j); + + butterfly_gs(rp1, c, &x1, &y1, card); + + // Store back to memory + store_to_mem(p + j, x1); + store_to_mem(q + j, y1); + } + } +} + +/** + * Vectorized butterfly GS step + * + * For each pair (P, Q) = (buf[i], buf[i + m]) for step = 2 * m and coef `r` + * Q = r * P + * + * @param buf - working buffers + * @param r - coefficient + * @param start - index of buffer among `m` ones + * @param m - current group size + * @param len - number of vectors per buffer + * @param card - modulo cardinal + */ +template +inline void butterfly_gs_step_simple( + vec::Buffers& buf, + T r, + unsigned start, + unsigned m, + size_t len, + T card) +{ + if (len == 0) { + return; + } + const unsigned step = m << 1; + const T rp1 = r + 1; + VecType c = set_one(r); + + const size_t end = (len > 1) ? len - 1 : 0; + const unsigned bufs_nb = buf.get_n(); + const std::vector& mem = buf.get_mem(); + for (unsigned i = start; i < bufs_nb; i += step) { + VecType x1, y1; + VecType x2, y2; + VecType* p = reinterpret_cast(mem[i]); + VecType* q = reinterpret_cast(mem[i + m]); + + size_t j = 0; + for (; j < end; j += 2) { + x1 = load_to_reg(p + j); + x2 = load_to_reg(p + j + 1); + + y1 = butterfly_simple_gs(rp1, c, x1, card); + y2 = butterfly_simple_gs(rp1, c, x2, card); + + // Store back to memory + store_to_mem(q + j, y1); + store_to_mem(q + j + 1, y2); + } + for (; j < len; ++j) { + x1 = load_to_reg(p + j); + + y1 = butterfly_simple_gs(rp1, c, x1, card); + + // Store back to memory + store_to_mem(q + j, y1); + } + } +} + +template +inline void encode_post_process( + vec::Buffers& output, + std::vector& props, + off_t offset, + unsigned code_len, + T threshold, + size_t vecs_nb) +{ + const unsigned element_size = sizeof(T); + const unsigned vec_size = countof(); + const T max = 1 << (element_size * 8 - 1); + const VecType _threshold = set_one(threshold); + const VecType mask_hi = set_one(max); + + const std::vector& mem = output.get_mem(); + for (unsigned frag_id = 0; frag_id < code_len; ++frag_id) { + VecType* buf = reinterpret_cast(mem[frag_id]); + + size_t vec_id = 0; + size_t end = (vecs_nb > 3) ? 
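/* Editorial note (sketch, not part of the patch): FNT symbols lie in
 * [0, q - 1] but only q - 1 distinct values fit in the output word, so an
 * element equal to `threshold` (= q - 1) cannot be stored as-is. The loop
 * below scans each fragment register by register; and_is_zero() cheaply
 * skips registers containing no such element, and add_props() records the
 * position of each hit in the fragment's Properties with OOR_MARK so the
 * decoder can restore the original value.
 */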
vecs_nb - 3 : 0; + for (; vec_id < end; vec_id += 4) { + VecType a1 = load_to_reg(buf + vec_id); + VecType a2 = load_to_reg(buf + vec_id + 1); + VecType a3 = load_to_reg(buf + vec_id + 2); + VecType a4 = load_to_reg(buf + vec_id + 3); + + if (!and_is_zero(a1, _threshold)) { + const off_t curr_offset = offset + vec_id * vec_size; + add_props( + props[frag_id], _threshold, mask_hi, a1, curr_offset, max); + } + if (!and_is_zero(a2, _threshold)) { + const off_t curr_offset = offset + (vec_id + 1) * vec_size; + add_props( + props[frag_id], _threshold, mask_hi, a2, curr_offset, max); + } + if (!and_is_zero(a3, _threshold)) { + const off_t curr_offset = offset + (vec_id + 2) * vec_size; + add_props( + props[frag_id], _threshold, mask_hi, a3, curr_offset, max); + } + if (!and_is_zero(a4, _threshold)) { + const off_t curr_offset = offset + (vec_id + 3) * vec_size; + add_props( + props[frag_id], _threshold, mask_hi, a4, curr_offset, max); + } + } + for (; vec_id < vecs_nb; ++vec_id) { + VecType a = load_to_reg(buf + vec_id); + if (!and_is_zero(a, _threshold)) { + const off_t curr_offset = offset + vec_id * vec_size; + add_props( + props[frag_id], _threshold, mask_hi, a, curr_offset, max); + } + } + } +} + +} // namespace simd +} // namespace quadiron + +#endif From bf2b414a2c150354e8d192c1003636a478fa4a70 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:04:14 +0100 Subject: [PATCH 12/13] FFT_2n: compute indices for SIMD - Indices for SIMD parts are computed once in FFT function - Define butterfly_ct_two_layers_step_slow for non-vectorized functions --- src/fft_2n.cpp | 128 ++++++++++--------------------------------------- src/fft_2n.h | 34 +++++++++++-- 2 files changed, 54 insertions(+), 108 deletions(-) diff --git a/src/fft_2n.cpp b/src/fft_2n.cpp index f3d8847f..f7d91468 100644 --- a/src/fft_2n.cpp +++ b/src/fft_2n.cpp @@ -48,10 +48,6 @@ void Radix2::butterfly_ct_two_layers_step( unsigned start, unsigned m) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; const unsigned coefIndex = start * this->n / m / 2; const uint16_t r1 = vec_W[coefIndex]; const uint16_t r2 = vec_W[coefIndex / 2]; @@ -59,29 +55,11 @@ void Radix2::butterfly_ct_two_layers_step( // perform vector operations simd::butterfly_ct_two_layers_step( - buf, r1, r2, r3, start, m, vec_len, card); + buf, r1, r2, r3, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - const unsigned step = m << 2; - size_t offset = vec_len * ratio; - // --------- - // First layer - // --------- - const uint16_t r1 = W->get(start * this->n / m / 2); - // first pair - butterfly_ct_step_slow(buf, r1, start, m, step, offset); - // second pair - butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); - // --------- - // Second layer - // --------- - // first pair - const uint16_t r2 = W->get(start * this->n / m / 4); - butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); - // second pair - const uint16_t r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + if (simd_trailing_len > 0) { + butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset); } } @@ -93,18 +71,12 @@ void Radix2::butterfly_ct_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // 
perform vector operations - simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_ct_step_slow(buf, r, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_ct_step_slow(buf, r, start, m, step, simd_offset); } } @@ -116,18 +88,12 @@ void Radix2::butterfly_gs_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset); } } @@ -139,18 +105,12 @@ void Radix2::butterfly_gs_step_simple( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset); } } @@ -160,10 +120,6 @@ void Radix2::butterfly_ct_two_layers_step( unsigned start, unsigned m) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; const unsigned coefIndex = start * this->n / m / 2; const uint32_t r1 = vec_W[coefIndex]; const uint32_t r2 = vec_W[coefIndex / 2]; @@ -171,29 +127,11 @@ void Radix2::butterfly_ct_two_layers_step( // perform vector operations simd::butterfly_ct_two_layers_step( - buf, r1, r2, r3, start, m, vec_len, card); + buf, r1, r2, r3, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - const unsigned step = m << 2; - size_t offset = vec_len * ratio; - // --------- - // First layer - // --------- - const uint32_t r1 = W->get(start * this->n / m / 2); - // first pair - butterfly_ct_step_slow(buf, r1, start, m, step, offset); - // second pair - butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); - // --------- - // Second layer - // --------- - // first pair - const uint32_t r2 = W->get(start * this->n / m / 4); - butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); - // second pair - const uint32_t r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); + if (simd_trailing_len > 0) { + butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset); } } @@ -205,18 +143,12 @@ void Radix2::butterfly_ct_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - 
simd::butterfly_ct_step(buf, r, start, m, step, vec_len, card); + simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_ct_step_slow(buf, r, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_ct_step_slow(buf, r, start, m, step, simd_offset); } } @@ -228,18 +160,12 @@ void Radix2::butterfly_gs_step( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_gs_step(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset); } } @@ -251,18 +177,12 @@ void Radix2::butterfly_gs_step_simple( unsigned m, unsigned step) { - const unsigned ratio = simd::countof(); - const size_t len = this->pkt_size; - const size_t vec_len = len / ratio; - const size_t last_len = len - vec_len * ratio; - // perform vector operations - simd::butterfly_gs_step_simple(buf, coef, start, m, vec_len, card); + simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card); // for last elements, perform as non-SIMD method - if (last_len > 0) { - size_t offset = vec_len * ratio; - butterfly_gs_step_simple_slow(buf, coef, start, m, step, offset); + if (simd_trailing_len > 0) { + butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset); } } diff --git a/src/fft_2n.h b/src/fft_2n.h index 4ded5bb4..2e371e9a 100644 --- a/src/fft_2n.h +++ b/src/fft_2n.h @@ -112,6 +112,11 @@ class Radix2 : public FourierTransform { unsigned step); // Only used for non-vectorized elements + void butterfly_ct_two_layers_step_slow( + vec::Buffers& buf, + unsigned start, + unsigned m, + size_t offset = 0); void butterfly_ct_step_slow( vec::Buffers& buf, T coef, @@ -142,6 +147,11 @@ class Radix2 : public FourierTransform { size_t pkt_size; size_t buf_size; + // Indices used for accelerated functions + size_t simd_vec_len; + size_t simd_trailing_len; + size_t simd_offset; + std::unique_ptr rev = nullptr; std::unique_ptr> W = nullptr; std::unique_ptr> inv_W = nullptr; @@ -182,6 +192,12 @@ Radix2::Radix2(const gf::Field& gf, int n, int data_len, size_t pkt_size) rev = std::unique_ptr(new T[n]); init_bitrev(); + + // Indices used for accelerated functions + const unsigned ratio = simd::countof(); + simd_vec_len = this->pkt_size / ratio; + simd_trailing_len = this->pkt_size - simd_vec_len * ratio; + simd_offset = simd_vec_len * ratio; } template @@ -421,6 +437,16 @@ void Radix2::butterfly_ct_two_layers_step( vec::Buffers& buf, unsigned start, unsigned m) +{ + butterfly_ct_two_layers_step_slow(buf, start, m); +} + +template +void Radix2::butterfly_ct_two_layers_step_slow( + vec::Buffers& buf, + unsigned start, + unsigned m, + size_t offset) { const unsigned step = m << 2; // --------- @@ -428,18 +454,18 @@ void Radix2::butterfly_ct_two_layers_step( // --------- const T r1 = W->get(start * this->n / m / 2); // first pair - butterfly_ct_step(buf, r1, start, m, step); + butterfly_ct_step_slow(buf, r1, start, m, step, offset); // second pair - butterfly_ct_step(buf, r1, start + 2 * m, m, step); + 
butterfly_ct_step_slow(buf, r1, start + 2 * m, m, step, offset); // --------- // Second layer // --------- // first pair const T r2 = W->get(start * this->n / m / 4); - butterfly_ct_step(buf, r2, start, 2 * m, step); + butterfly_ct_step_slow(buf, r2, start, 2 * m, step, offset); // second pair const T r3 = W->get((start + m) * this->n / m / 4); - butterfly_ct_step(buf, r3, start + m, 2 * m, step); + butterfly_ct_step_slow(buf, r3, start + m, 2 * m, step, offset); } template From 9c47e189e03e6282af41c147ebd2b11977e5ebc2 Mon Sep 17 00:00:00 2001 From: Lam Pham-Sy Date: Tue, 30 Oct 2018 11:16:37 +0100 Subject: [PATCH 13/13] RS-FNT: simd indices as member variables --- src/fec_rs_fnt.h | 11 +++++++++++ src/fec_vectorisation.cpp | 26 ++++++-------------------- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/fec_rs_fnt.h b/src/fec_rs_fnt.h index 4bdd5475..55ffb487 100644 --- a/src/fec_rs_fnt.h +++ b/src/fec_rs_fnt.h @@ -60,6 +60,11 @@ class RsFnt : public FecCode { // decoding context used in encoding of systematic FNT std::unique_ptr> enc_context; + // Indices used for accelerated functions + size_t simd_vec_len; + size_t simd_trailing_len; + size_t simd_offset; + public: RsFnt( FecType type, @@ -70,6 +75,12 @@ class RsFnt : public FecCode { : FecCode(type, word_size, n_data, n_parities, pkt_size) { this->fec_init(); + + // Indices used for accelerated functions + const unsigned ratio = simd::countof(); + simd_vec_len = this->pkt_size / ratio; + simd_trailing_len = this->pkt_size - simd_vec_len * ratio; + simd_offset = simd_vec_len * ratio; } inline void check_params() override diff --git a/src/fec_vectorisation.cpp b/src/fec_vectorisation.cpp index 8684e1ab..ed82fab8 100644 --- a/src/fec_vectorisation.cpp +++ b/src/fec_vectorisation.cpp @@ -53,20 +53,13 @@ void RsFnt::encode_post_process( uint16_t threshold = this->gf->card_minus_one(); unsigned code_len = this->n_outputs; - // number of elements per vector register - unsigned vec_size = simd::countof(); - // number of vector registers per fragment packet - size_t vecs_nb = size / vec_size; - // odd number of elements not vectorized - size_t last_len = size - vecs_nb * vec_size; - simd::encode_post_process( - output, props, offset, code_len, threshold, vecs_nb); + output, props, offset, code_len, threshold, simd_vec_len); - if (last_len > 0) { + if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { uint16_t* chunk = output.get(i); - for (size_t j = vecs_nb * vec_size; j < size; ++j) { + for (size_t j = simd_offset; j < size; ++j) { if (chunk[j] == threshold) { props[i].add(offset + j, OOR_MARK); } @@ -85,20 +78,13 @@ void RsFnt::encode_post_process( const uint32_t threshold = this->gf->card_minus_one(); const unsigned code_len = this->n_outputs; - // number of elements per vector register - const unsigned vec_size = simd::countof(); - // number of vector registers per fragment packet - const size_t vecs_nb = size / vec_size; - // odd number of elements not vectorized - const size_t last_len = size - vecs_nb * vec_size; - simd::encode_post_process( - output, props, offset, code_len, threshold, vecs_nb); + output, props, offset, code_len, threshold, simd_vec_len); - if (last_len > 0) { + if (simd_trailing_len > 0) { for (unsigned i = 0; i < code_len; ++i) { uint32_t* chunk = output.get(i); - for (size_t j = vecs_nb * vec_size; j < size; ++j) { + for (size_t j = simd_offset; j < size; ++j) { if (chunk[j] == threshold) { props[i].add(offset + j, OOR_MARK); }
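/* Editorial note (sketch, not part of the patch series): patches 12 and 13
 * hoist the per-call SIMD geometry into members computed once at
 * construction. With ratio = simd::countof<T>():
 *     simd_vec_len      = pkt_size / ratio;                  // full registers
 *     simd_trailing_len = pkt_size - simd_vec_len * ratio;   // scalar tail
 *     simd_offset       = simd_vec_len * ratio;              // first scalar index
 * so every butterfly and post-process call reuses these instead of
 * re-deriving vec_len / last_len / offset on each invocation.
 */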