scality · lamphamsy · Aug 24, 2018 · Jun 13, 2018 · Jun 13, 2018 · Jun 13, 2018
diff --git a/benchmark/benchmark.cpp b/benchmark/benchmark.cpp
@@ -171,7 +171,7 @@ int Benchmark<T>::init()
         fec = new quadiron::fec::RsGfpFft<T>(word_size, k, m);
         break;
     case EC_TYPE_RS_NF4:
-        fec = new quadiron::fec::RsNf4<T>(word_size, k, m);
+        fec = new quadiron::fec::RsNf4<T>(word_size, k, m, pkt_size);
         break;
     case EC_TYPE_RS_FNT:
         fec = new quadiron::fec::RsFnt<T>(word_size, k, m, pkt_size);
@@ -787,7 +787,8 @@ int main(int argc, char** argv)
     }
 
     // Currently support operating on packet:RS_FNT
-    if (params->fec_type != EC_TYPE_RS_FNT) {
+    if (params->fec_type != EC_TYPE_RS_FNT
+        && params->fec_type != EC_TYPE_RS_NF4) {
         params->operation_on_packet = false;
     }
 

diff --git a/src/fec_rs_nf4.h b/src/fec_rs_nf4.h
@@ -47,8 +47,17 @@ namespace fec {
 template <typename T>
 class RsNf4 : public FecCode<T> {
   public:
-    RsNf4(unsigned word_size, unsigned n_data, unsigned n_parities)
-        : FecCode<T>(FecType::NON_SYSTEMATIC, word_size, n_data, n_parities)
+    RsNf4( // forwards every argument to FecCode<T>, then runs fec_init()
+        unsigned word_size,
+        unsigned n_data,
+        unsigned n_parities,
+        size_t pkt_size = 8) // default keeps 3-argument callers valid
+        : FecCode<T>(
+              FecType::NON_SYSTEMATIC,
+              word_size,
+              n_data,
+              n_parities,
+              pkt_size) // packet size is handed to the base class
     {
         this->fec_init();
     }
@@ -80,14 +89,14 @@ class RsNf4 : public FecCode<T> {
 
         int m = arith::get_smallest_power_of_2<int>(this->n_data);
         this->fft = std::unique_ptr<fft::Radix2<T>>(
-            new fft::Radix2<T>(*ngff4, this->n, m));
+            new fft::Radix2<T>(*ngff4, this->n, m, this->pkt_size));
 
         this->fft_full = std::unique_ptr<fft::Radix2<T>>(
-            new fft::Radix2<T>(*ngff4, this->n));
+            new fft::Radix2<T>(*ngff4, this->n, this->n, this->pkt_size));
 
         unsigned len_2k = this->gf->get_code_len_high_compo(2 * this->n_data);
         this->fft_2k = std::unique_ptr<fft::Radix2<T>>(
-            new fft::Radix2<T>(*ngff4, len_2k, len_2k));
+            new fft::Radix2<T>(*ngff4, len_2k, len_2k, this->pkt_size));
     }
 
     inline void init_others() override
@@ -134,9 +143,10 @@ class RsNf4 : public FecCode<T> {
         vec::ZeroExtended<T> vwords(words, this->n);
         this->fft->fft(output, &vwords);
         // std::cout << "encoded:"; output->dump();
+        GroupedValues<T> true_val;
         for (unsigned i = 0; i < this->code_len; i++) {
             T val = output->get(i);
-            GroupedValues<T> true_val = ngff4->unpack(val);
+            ngff4->unpack(val, true_val);
             if (true_val.flag > 0) {
                 props[i].add(
                     ValueLocation(offset, i), std::to_string(true_val.flag));
@@ -247,6 +257,102 @@ class RsNf4 : public FecCode<T> {
             output->set(i, ngff4->unpack(output->get(i)).values);
         }
     }
+
+    /********** Encoding & Decoding using Buffers **********/
+
+    void encode( // packet encode: pack inputs, FFT, unpack outputs + flags
+        vec::Buffers<T>* output, // receives the code_len unpacked fragments
+        std::vector<Properties>& props, // flag records are appended here
+        off_t offset, // location assigned to symbol 0 of this call
+        vec::Buffers<T>* words) override // input data, packed in place
+    {
+        for (unsigned i = 0; i < this->n_data; ++i) {
+            T* chunk = words->get(i);
+            for (size_t j = 0; j < this->pkt_size; ++j) {
+                chunk[j] = ngff4->pack(chunk[j]); // overwrites caller's data
+            }
+        }
+        vec::BuffersZeroExtended<T> vwords(words, this->n);
+        this->fft->fft(output, &vwords);
+        size_t size = output->get_size(); // bound of the per-chunk loop below
+        GroupedValues<T> true_val; // reused for every symbol
+        for (unsigned frag_id = 0; frag_id < this->code_len; ++frag_id) {
+            T* chunk = output->get(frag_id);
+            for (size_t symb_id = 0; symb_id < size; symb_id++) {
+                ngff4->unpack(chunk[symb_id], true_val);
+                if (true_val.flag > 0) { // non-zero flags are recorded
+                    const ValueLocation loc(
+                        offset + symb_id * this->word_size, frag_id);
+                    props[frag_id].add(loc, std::to_string(true_val.flag));
+                }
+                chunk[symb_id] = true_val.values; // keep only the values
+            }
+        }
+    }
+
+    void decode_prepare( // re-packs received fragments using stored flags
+        const DecodeContext<T>& context, // source of received fragment ids
+        const std::vector<Properties>& props, // flags recorded at encode time
+        off_t offset, // location of symbol 0 of this call
+        vec::Buffers<T>* words) override // packed in place
+    {
+        const vec::Vector<T>& fragments_ids = context.get_fragments_id();
+        off_t offset_max = offset + this->buf_size; // exclusive upper bound
+        for (unsigned i = 0; i < this->n_data; ++i) {
+            const int frag_id = fragments_ids.get(i);
+            T* chunk = words->get(i);
+
+            // Indices of the symbols that get packed with a flag in the first
+            // pass. Since locations are stored in an unordered map, the
+            // vector is sorted afterwards so the second pass can skip them.
+            std::vector<size_t> packed_symbs;
+            // first pass: pack the flagged symbols located in this range
+            for (auto const& data : props[frag_id].get_map()) {
+                const off_t loc_offset = data.first.get_offset();
+                if (loc_offset >= offset && loc_offset < offset_max) {
+                    // encode() stored loc_offset = offset + j * word_size
+                    const size_t j = (loc_offset - offset) / this->word_size;
+                    packed_symbs.push_back(j);
+                    // pack symbol `j` with its recorded flag
+                    // (NOTE(review): `j` is not checked against pkt_size)
+                    uint32_t flag = std::stoul(data.second);
+                    chunk[j] = ngff4->pack(chunk[j], flag);
+                }
+            }
+            // sort the indices of the already-packed symbols (ascending)
+            std::sort(packed_symbs.begin(), packed_symbs.end());
+
+            // second pass: pack every symbol that was not packed above
+            size_t curr_frag_index = 0; // next symbol index to examine
+            for (auto const& done_id : packed_symbs) {
+                // pack symbols from `curr_frag_index` to `done_id - 1`
+                for (; curr_frag_index < done_id; ++curr_frag_index) {
+                    chunk[curr_frag_index] =
+                        ngff4->pack(chunk[curr_frag_index]);
+                }
+                curr_frag_index++; // skip `done_id`, it is already packed
+            }
+            // pack the tail from `curr_frag_index` to `this->pkt_size - 1`
+            for (; curr_frag_index < this->pkt_size; ++curr_frag_index) {
+                chunk[curr_frag_index] = ngff4->pack(chunk[curr_frag_index]);
+            }
+        }
+    }
+
+    void decode_apply( // base-class decode followed by an in-place unpack
+        const DecodeContext<T>& context,
+        vec::Buffers<T>* output, // decoded chunks, unpacked before returning
+        vec::Buffers<T>* words) override
+    {
+        // delegate the decoding itself to the FecCode<T> implementation
+        FecCode<T>::decode_apply(context, output, words);
+        // unpack the first pkt_size symbols of each of the n_data chunks
+        for (unsigned i = 0; i < this->n_data; ++i) {
+            T* chunk = output->get(i);
+            for (unsigned j = 0; j < this->pkt_size; ++j) {
+                chunk[j] = ngff4->unpack(chunk[j]).values; // flag is dropped
+            }
+        }
+    }
 };
 
 } // namespace fec

diff --git a/src/gf_nf4.cpp b/src/gf_nf4.cpp
@@ -94,6 +94,13 @@ GroupedValues<__uint128_t> NF4<__uint128_t>::unpack(__uint128_t a) const
     return simd::unpack(a, this->n);
 }
 
+template <>
+void NF4<__uint128_t>::unpack(__uint128_t a, GroupedValues<__uint128_t>& b)
+    const
+{
+    simd::unpack(a, b, this->n); // delegates to simd::unpack, which fills b
+}
+
 template <>
 __uint128_t NF4<__uint128_t>::pack(__uint128_t a) const
 {

diff --git a/src/gf_nf4.h b/src/gf_nf4.h
@@ -77,6 +77,7 @@ class NF4 : public gf::Field<T> {
     T pack(T a) const;
     T pack(T a, uint32_t flag) const;
     GroupedValues<T> unpack(T a) const;
+    void unpack(T a, GroupedValues<T>& b) const;
     T get_nth_root(T n) const override;
     void compute_omegas(vec::Vector<T>* W, int n, T w) const override;
     const gf::Field<uint32_t>& get_sub_field() const;
@@ -430,6 +431,33 @@ GroupedValues<T> NF4<T>::unpack(T a) const
     return b;
 }
 
+template <typename T>
+void NF4<T>::unpack(T a, GroupedValues<T>& b) const
+{
+    uint32_t flag = 0; // bit i is set when element i equals 65536
+    uint32_t ae; // element currently being examined
+    uint16_t arr[this->n]; // NOTE(review): a VLA if `n` is a runtime value
+    // Split `a` into n masked elements; results are written into `b`.
+    ae = (uint32_t)(a & MASK32);
+    if (ae == 65536) {
+        flag |= 1;
+        arr[0] = 0; // a flagged element is stored as 0
+    } else
+        arr[0] = (uint16_t)ae;
+    for (int i = 1; i < this->n; i++) {
+        a = (a >> 16) >> 16; // drop 32 bits to reach the next element
+        ae = (uint32_t)(a & MASK32);
+        if (ae == 65536) {
+            flag |= (1 << i);
+            arr[i] = 0;
+        } else
+            arr[i] = ae; // implicit narrowing to uint16_t
+    }
+    // expand16 is defined elsewhere; presumably it rebuilds a T from `arr`.
+    b.flag = flag;
+    b.values = expand16(arr);
+}
+
 // Use for fft
 template <typename T>
 T NF4<T>::get_nth_root(T n) const
@@ -535,6 +563,10 @@ __uint128_t NF4<__uint128_t>::pack(__uint128_t a, uint32_t flag) const;
 template <>
 GroupedValues<__uint128_t> NF4<__uint128_t>::unpack(__uint128_t a) const;
 
+template <>
+void NF4<__uint128_t>::unpack(__uint128_t a, GroupedValues<__uint128_t>& b)
+    const; // explicit specialization; defined in gf_nf4.cpp
+
 template <>
 void NF4<__uint128_t>::hadamard_mul(int n, __uint128_t* x, __uint128_t* y)
     const;

diff --git a/src/gf_ring.h b/src/gf_ring.h
@@ -380,7 +380,7 @@ void RingModN<T>::mul_coef_to_buf(T a, T* src, T* dest, size_t len) const
     DoubleSizeVal<T> coef = DoubleSizeVal<T>(a);
     for (i = 0; i < len; i++) {
         // perform multiplication
-        dest[i] = T((coef * src[i]) % this->_card);
+        dest[i] = mul(coef, src[i]);
     }
 }
 
@@ -405,7 +405,7 @@ void RingModN<T>::add_two_bufs(T* src, T* dest, size_t len) const
     size_t i;
     for (i = 0; i < len; i++) {
         // perform addition
-        dest[i] = (src[i] + dest[i]) % this->_card;
+        dest[i] = add(src[i], dest[i]);
     }
 }
 
@@ -427,14 +427,8 @@ template <typename T>
 void RingModN<T>::sub_two_bufs(T* bufa, T* bufb, T* res, size_t len) const
 {
     size_t i;
-    T result;
     for (i = 0; i < len; i++) {
-        if (bufa[i] >= bufb[i]) {
-            result = bufa[i] - bufb[i];
-        } else {
-            result = this->_card - (bufb[i] - bufa[i]);
-        }
-        res[i] = result;
+        res[i] = sub(bufa[i], bufb[i]);
     }
 }
 

diff --git a/src/simd_nf4.h b/src/simd_nf4.h
@@ -49,7 +49,7 @@ static inline aint128 m128i_to_uint128(m128i v)
 }
 #endif // #ifdef QUADIRON_USE_AVX2
 
-inline aint128 expand16(aint16* arr, int n)
+inline aint128 expand16(uint16_t* arr, int n)
 {
     // since n <= 4
     uint16_t _arr[4] __attribute__((aligned(ALIGN_SIZE))) = {0, 0, 0, 0};
@@ -61,7 +61,7 @@ inline aint128 expand16(aint16* arr, int n)
     return m128i_to_uint128(b);
 }
 
-inline aint128 expand32(aint32* arr, int n)
+inline aint128 expand32(uint32_t* arr, int n)
 {
     // since n <= 4
     uint32_t _arr[4] __attribute__((aligned(ALIGN_SIZE))) = {0, 0, 0, 0};
@@ -72,35 +72,60 @@ inline aint128 expand32(aint32* arr, int n)
     return m128i_to_uint128(b);
 }
 
-inline GroupedValues<__uint128_t> unpack(aint128 a, int n)
+inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n) // n unused
 {
-    aint32 flag = 0;
-    uint32_t ai[4] __attribute__((aligned(ALIGN_SIZE)));
-    uint32_t bi[4] __attribute__((aligned(ALIGN_SIZE))) = {0, 0, 0, 0};
+    uint16_t ai[8]; // 16-bit halves of the 4 lanes: even = low, odd = high
     aint128 values;
-    int i;
 
     m128i _a = _mm_loadu_si128((m128i*)&a);
-    ai[0] = _mm_extract_epi32(_a, 0);
-    ai[1] = _mm_extract_epi32(_a, 1);
-    ai[2] = _mm_extract_epi32(_a, 2);
-    ai[3] = _mm_extract_epi32(_a, 3);
-    for (i = 0; i < n; i++) {
-        if (ai[i] == 65536)
-            flag |= (1 << i);
-        else
-            bi[i] = (aint16)ai[i];
-    }
+    ai[0] = _mm_extract_epi16(_a, 0);
+    ai[1] = _mm_extract_epi16(_a, 1);
+    ai[2] = _mm_extract_epi16(_a, 2);
+    ai[3] = _mm_extract_epi16(_a, 3);
+    ai[4] = _mm_extract_epi16(_a, 4);
+    ai[5] = _mm_extract_epi16(_a, 5);
+    ai[6] = _mm_extract_epi16(_a, 6);
+    ai[7] = _mm_extract_epi16(_a, 7);
+    // Flag bit i is set when the high half of 32-bit lane i is non-zero.
+    const uint32_t flag =
+        !!ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u);
+    // Keep only the low halves: they are the 16-bit values of the lanes.
     m128i val = _mm_set_epi64(
-        _mm_setzero_si64(), _mm_set_pi16(bi[3], bi[2], bi[1], bi[0]));
+        _mm_setzero_si64(), _mm_set_pi16(ai[6], ai[4], ai[2], ai[0]));
     _mm_store_si128((m128i*)&values, val);
 
     GroupedValues<__uint128_t> b = {values, flag};
 
     return b;
 }
 
-inline aint128 pack(aint128 a)
+// Same as the value-returning unpack(), but writes into `b`; `n` is unused.
+inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n)
+{
+    uint16_t ai[8]; // 16-bit halves of the 4 lanes: even = low, odd = high
+    aint128 values;
+    m128i _a = _mm_loadu_si128((m128i*)&a);
+    ai[0] = _mm_extract_epi16(_a, 0);
+    ai[1] = _mm_extract_epi16(_a, 1);
+    ai[2] = _mm_extract_epi16(_a, 2);
+    ai[3] = _mm_extract_epi16(_a, 3);
+    ai[4] = _mm_extract_epi16(_a, 4);
+    ai[5] = _mm_extract_epi16(_a, 5);
+    ai[6] = _mm_extract_epi16(_a, 6);
+    ai[7] = _mm_extract_epi16(_a, 7);
+    // Flag bit i is set when the high half of 32-bit lane i is non-zero.
+    const uint32_t flag =
+        !!ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u);
+    // Keep only the low halves: they are the 16-bit values of the lanes.
+    m128i val = _mm_set_epi64(
+        _mm_setzero_si64(), _mm_set_pi16(ai[6], ai[4], ai[2], ai[0]));
+    _mm_store_si128((m128i*)&values, val);
+
+    b.flag = flag;
+    b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign)
+}
+
+inline aint128 pack(__uint128_t a)
 {
     m128i _a = _mm_loadu_si128((m128i*)&a);
     m128i b = _mm_set_epi32(
@@ -112,7 +137,7 @@ inline aint128 pack(aint128 a)
     return m128i_to_uint128(b);
 }
 
-inline aint128 pack(aint128 a, aint32 flag)
+inline aint128 pack(__uint128_t a, uint32_t flag)
 {
     aint32 b0, b1, b2, b3;
     m128i _a = _mm_loadu_si128((m128i*)&a);

diff --git a/src/vec_buffers.h b/src/vec_buffers.h
@@ -408,10 +408,10 @@ void Buffers<T>::dump(void)
     for (int i = 0; i < n; i++) {
         std::cout << "\n\t" << i << ": ";
         for (size_t j = 0; j < size - 1; j++) {
-            std::cout << unsigned((get(i))[j]) << "-";
+            std::cout << (get(i))[j] << "-";
         }
         if (size > 0) {
-            std::cout << unsigned((get(i))[size - 1]);
+            std::cout << (get(i))[size - 1];
         }
     }
     std::cout << "\n)\n";