SIMD NF4 contains vectorized operations for NF4
lamphamsy committed Oct 15, 2018
1 parent 1bed85d commit a4c9b86
265 changes: 240 additions & 25 deletions src/simd_nf4.h

@@ -36,48 +36,47 @@
namespace quadiron {
namespace simd {

typedef uint32_t aint32 __attribute__((aligned(ALIGN_SIZE)));
typedef __uint128_t NF4Type;

/** Return an NF4Type integer from a __m128i register. */
static inline NF4Type m128i_to_uint128(__m128i v)
{
NF4Type i;
_mm_store_si128((__m128i*)&i, v);

return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn)
}

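/** Spread up to `n` (n <= 4) 16-bit words into the four low 16-bit lanes of a 128-bit value; missing entries are zero. */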
inline NF4Type expand16(uint16_t* arr, int n)
{
// since n <= 4
uint16_t _arr[4] __attribute__((aligned(ALIGN_SIZE))) = {0, 0, 0, 0};
std::copy_n(arr, n, _arr);

__m128i b = _mm_set_epi64(
_mm_setzero_si64(), _mm_set_pi16(_arr[3], _arr[2], _arr[1], _arr[0]));

return m128i_to_uint128(b);
}

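/** Spread up to `n` (n <= 4) 32-bit words into the four 32-bit lanes of a 128-bit value; missing entries are zero. */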
inline NF4Type expand32(uint32_t* arr, int n)
{
// since n <= 4
uint32_t _arr[4] __attribute__((aligned(ALIGN_SIZE))) = {0, 0, 0, 0};
std::copy_n(arr, n, _arr);

__m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]);

return m128i_to_uint128(b);
}

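/** Split a packed word: `values` gathers the low 16 bits of each 32-bit lane into the low 64 bits, and `flag` gets one bit per lane whose upper 16 bits are non-zero (i.e. the lane held 65536). */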
inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
{
uint16_t ai[8];
NF4Type values;

__m128i _a = _mm_loadu_si128((__m128i*)&a);
ai[0] = _mm_extract_epi16(_a, 0);
ai[1] = _mm_extract_epi16(_a, 1);
ai[2] = _mm_extract_epi16(_a, 2);
@@ -90,9 +89,9 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
const uint32_t flag =
ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u);

__m128i val = _mm_set_epi64(
_mm_setzero_si64(), _mm_set_pi16(ai[6], ai[4], ai[2], ai[0]));
_mm_store_si128((__m128i*)&values, val);

GroupedValues<__uint128_t> b = {values, flag};

@@ -102,9 +101,9 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
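/** Same as unpack() above, but writes the result into `b` instead of returning it. */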
inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n)
{
uint16_t ai[8];
NF4Type values;

__m128i _a = _mm_loadu_si128((__m128i*)&a);
ai[0] = _mm_extract_epi16(_a, 0);
ai[1] = _mm_extract_epi16(_a, 1);
ai[2] = _mm_extract_epi16(_a, 2);
@@ -117,18 +116,18 @@ inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n)
const uint32_t flag =
ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u);

__m128i val = _mm_set_epi64(
_mm_setzero_si64(), _mm_set_pi16(ai[6], ai[4], ai[2], ai[0]));
_mm_store_si128((__m128i*)&values, val);

b.flag = flag;
b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign)
}

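/** Widen the four 16-bit values held in the low 64 bits of `a` back into four 32-bit lanes. */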
inline NF4Type pack(__uint128_t a)
{
__m128i _a = _mm_loadu_si128((__m128i*)&a);
__m128i b = _mm_set_epi32(
_mm_extract_epi16(_a, 3),
_mm_extract_epi16(_a, 2),
_mm_extract_epi16(_a, 1),
@@ -137,10 +136,10 @@ inline aint128 pack(__uint128_t a)
return m128i_to_uint128(b);
}

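/** As pack() above, but every lane whose bit is set in `flag` is restored to 65536 instead of being taken from `a`. */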
inline NF4Type pack(__uint128_t a, uint32_t flag)
{
aint32 b0, b1, b2, b3;
__m128i _a = _mm_loadu_si128((__m128i*)&a);

if (flag & 1)
b0 = 65536;
@@ -162,11 +161,227 @@ inline aint128 pack(__uint128_t a, uint32_t flag)
else
b3 = _mm_extract_epi16(_a, 3);

__m128i b = _mm_set_epi32(b3, b2, b1, b0);

return m128i_to_uint128(b);
}

/* ================= Basic operations for NF4 ================= */

#if defined(__AVX2__)
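/* Single-element NF4 arithmetic reuses the 256-bit ADD_MOD/SUB_MOD/MULFULL_MOD helpers: the 128-bit operand is viewed as the low 128 bits of a 256-bit register (CAST_TO_DOUBLE) and only the low 128 bits of the result are written back (STORE_LOW). */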

inline VecType CAST_TO_DOUBLE(HalfVecType x)
{
return _mm256_castsi128_si256(x);
}

inline void STORE_LOW(HalfVecType* address, VecType reg)
{
_mm_store_si128(address, _mm256_castsi256_si128(reg));
}

inline NF4Type add(NF4Type a, NF4Type b)
{
HalfVecType res;
VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
STORE_LOW(&res, ADD_MOD(_a, _b, F4));
return (NF4Type)res;
}

inline NF4Type sub(NF4Type a, NF4Type b)
{
HalfVecType res;
VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
STORE_LOW(&res, SUB_MOD(_a, _b, F4));
return (NF4Type)res;
}

inline NF4Type mul(NF4Type a, NF4Type b)
{
HalfVecType res;
VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
STORE_LOW(&res, MULFULL_MOD(_a, _b, F4));
return (NF4Type)res;
}

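/* The *_rem helpers below handle the tail of a buffer one 128-bit NF4 word at a time, for the elements left over after the full-vector loops further down. */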
inline void
add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
{
// add the remaining `y` elements to both `x` and `x_half`
HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
for (unsigned i = 0; i < n; ++i) {
VecType _x_p = CAST_TO_DOUBLE(_x[i]);
VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]);
VecType _y_p = CAST_TO_DOUBLE(_y[i]);

STORE_LOW(_x + i, ADD_MOD(_x_p, _y_p, F4));
STORE_LOW(_x_half + i, ADD_MOD(_x_next_p, _y_p, F4));
}
}

inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y)
{
HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
for (unsigned i = 0; i < n; ++i) {
VecType _x_p = CAST_TO_DOUBLE(_x[i]);
VecType _y_p = CAST_TO_DOUBLE(_y[i]);

STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
}
}

inline void
hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
{
HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
for (unsigned i = 0; i < n; ++i) {
VecType _x_p = CAST_TO_DOUBLE(_x[i]);
VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]);
VecType _y_p = CAST_TO_DOUBLE(_y[i]);

STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
STORE_LOW(_x_half + i, MULFULL_MOD(_x_next_p, _y_p, F4));
}
}

#elif defined(__SSE4_1__)
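/* With 128-bit vectors a VecType holds exactly one NF4 word, so the full-vector loops below cover every element and these tail helpers have nothing left to do. */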

inline NF4Type add(NF4Type a, NF4Type b)
{
VecType res;
STORE(&res, ADD_MOD((VecType)a, (VecType)b, F4));
return (NF4Type)res;
}

inline NF4Type sub(NF4Type a, NF4Type b)
{
VecType res;
STORE(&res, SUB_MOD((VecType)a, (VecType)b, F4));
return (NF4Type)res;
}

inline NF4Type mul(NF4Type a, NF4Type b)
{
VecType res;
STORE(&res, MULFULL_MOD((VecType)a, (VecType)b, F4));
return (NF4Type)res;
}

inline void
add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
{
// do nothing
}

inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y)
{
// do nothing
}

inline void
hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
{
// do nothing
}

#endif

/* ==================== Operations for NF4 =================== */
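/* A VecType register holds `ratio` packed NF4 words; the loops below process `vec_len` full registers (`num_len` words) and delegate the remaining `rem_len` words to the *_rem helpers above. */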

/** Add buffer `y` to both halves of `x`. `x` has length `n`, `y` has length `n / 2`. */
inline void add_buf_to_two_bufs(unsigned n, NF4Type* _x, NF4Type* _y)
{
unsigned i;
VecType* x = reinterpret_cast<VecType*>(_x);
VecType* y = reinterpret_cast<VecType*>(_y);

const unsigned ratio = sizeof(*x) / sizeof(*_x);
const unsigned half_len = n / 2;
const unsigned vec_len = half_len / ratio;
const unsigned num_len = vec_len * ratio;
const unsigned rem_len = half_len - num_len;

NF4Type* x_half = _x + half_len;
VecType* x_next = reinterpret_cast<VecType*>(x_half);

// add y to the first half of `x`
for (i = 0; i < vec_len; ++i) {
x[i] = ADD_MOD(x[i], y[i], F4);
}

// add y to the second half of `x`
for (i = 0; i < vec_len; ++i) {
x_next[i] = ADD_MOD(x_next[i], y[i], F4);
}

if (rem_len > 0) {
add_buf_to_two_bufs_rem(
rem_len, _x + num_len, x_half + num_len, _y + num_len);
}
}

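/** Coordinate-wise product over F4: x[i] = x[i] * y[i] for i in [0, n). */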
inline void hadamard_mul(unsigned n, NF4Type* _x, NF4Type* _y)
{
unsigned i;
VecType* x = reinterpret_cast<VecType*>(_x);
VecType* y = reinterpret_cast<VecType*>(_y);

const unsigned ratio = sizeof(*x) / sizeof(*_x);
const unsigned vec_len = n / ratio;
const unsigned num_len = vec_len * ratio;
const unsigned rem_len = n - num_len;

// multiply the vectorized part of `x` by `y`
for (i = 0; i < vec_len; ++i) {
x[i] = MULFULL_MOD(x[i], y[i], F4);
}

if (rem_len > 0) {
// multiply the remaining elements of `x` by `y`
hadamard_mul_rem(rem_len, _x + num_len, _y + num_len);
}
}

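/** Multiply both halves of `x` (length `n`) coordinate-wise by `y` (length `n / 2`). */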
inline void hadamard_mul_doubled(unsigned n, NF4Type* _x, NF4Type* _y)
{
unsigned i;
VecType* x = reinterpret_cast<VecType*>(_x);
VecType* y = reinterpret_cast<VecType*>(_y);

const unsigned ratio = sizeof(*x) / sizeof(*_x);
const unsigned half_len = n / 2;
const unsigned vec_len = half_len / ratio;
const unsigned num_len = vec_len * ratio;
const unsigned rem_len = half_len - num_len;

NF4Type* x_half = _x + half_len;
VecType* x_next = reinterpret_cast<VecType*>(x_half);

// multiply the first half of `x` by `y`
for (i = 0; i < vec_len; ++i) {
x[i] = MULFULL_MOD(x[i], y[i], F4);
}

// multiply the second half of `x` by `y`
for (i = 0; i < vec_len; ++i) {
x_next[i] = MULFULL_MOD(x_next[i], y[i], F4);
}

if (rem_len > 0) {
// multiply the remaining elements of both halves by `y`
hadamard_mul_doubled_rem(
rem_len, _x + num_len, x_half + num_len, _y + num_len);
}
}

} // namespace simd
} // namespace quadiron
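
For orientation, the following is a minimal scalar reference sketch of the arithmetic the vectorized add()/mul() above perform, assuming ADD_MOD and MULFULL_MOD implement lane-wise addition and multiplication modulo F4 = 65537 over four 32-bit lanes of a 128-bit word. The names F4_PRIME, lane, set_lane, ref_add and ref_mul are local to the sketch and not part of quadiron, and the 65536 flag handling of pack()/unpack() is deliberately ignored.

#include <cstdint>
#include <cstdio>

using u128 = unsigned __int128; // same width as the NF4Type above

static const uint64_t F4_PRIME = 65537; // illustrative constant, not quadiron's F4 symbol

// Read lane i (0..3) of a packed word.
static uint32_t lane(u128 v, int i)
{
    return static_cast<uint32_t>(v >> (32 * i));
}

// Write lane i (0..3) of a packed word.
static u128 set_lane(u128 v, int i, uint64_t x)
{
    const u128 mask = static_cast<u128>(0xFFFFFFFFu) << (32 * i);
    return (v & ~mask) | (static_cast<u128>(x) << (32 * i));
}

// Scalar counterpart of simd::add(): lane-wise addition mod F4.
static u128 ref_add(u128 a, u128 b)
{
    u128 r = 0;
    for (int i = 0; i < 4; ++i)
        r = set_lane(r, i, (static_cast<uint64_t>(lane(a, i)) + lane(b, i)) % F4_PRIME);
    return r;
}

// Scalar counterpart of simd::mul(): lane-wise multiplication mod F4.
static u128 ref_mul(u128 a, u128 b)
{
    u128 r = 0;
    for (int i = 0; i < 4; ++i)
        r = set_lane(r, i, (static_cast<uint64_t>(lane(a, i)) * lane(b, i)) % F4_PRIME);
    return r;
}

int main()
{
    u128 a = 0, b = 0;
    for (int i = 0; i < 4; ++i) {
        a = set_lane(a, i, 65536); // q - 1 in every lane
        b = set_lane(b, i, i + 1);
    }
    const u128 s = ref_add(a, b);
    const u128 p = ref_mul(a, b);
    for (int i = 0; i < 4; ++i)
        std::printf("lane %d: add = %u, mul = %u\n", i, lane(s, i), lane(p, i));
    return 0;
}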
