Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RS NF4: support encoding/decoding using packets #215

Merged
merged 12 commits into from
Aug 24, 2018
5 changes: 3 additions & 2 deletions benchmark/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ int Benchmark<T>::init()
fec = new quadiron::fec::RsGfpFft<T>(word_size, k, m);
break;
case EC_TYPE_RS_NF4:
fec = new quadiron::fec::RsNf4<T>(word_size, k, m);
fec = new quadiron::fec::RsNf4<T>(word_size, k, m, pkt_size);
break;
case EC_TYPE_RS_FNT:
fec = new quadiron::fec::RsFnt<T>(word_size, k, m, pkt_size);
Expand Down Expand Up @@ -787,7 +787,8 @@ int main(int argc, char** argv)
}

// Operating on packets is currently supported only by RS_FNT and RS_NF4
if (params->fec_type != EC_TYPE_RS_FNT) {
if (params->fec_type != EC_TYPE_RS_FNT
&& params->fec_type != EC_TYPE_RS_NF4) {
params->operation_on_packet = false;
}

Expand Down
118 changes: 112 additions & 6 deletions src/fec_rs_nf4.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,17 @@ namespace fec {
template <typename T>
class RsNf4 : public FecCode<T> {
public:
RsNf4(unsigned word_size, unsigned n_data, unsigned n_parities)
: FecCode<T>(FecType::NON_SYSTEMATIC, word_size, n_data, n_parities)
RsNf4(
unsigned word_size,
unsigned n_data,
unsigned n_parities,
size_t pkt_size = 8)
: FecCode<T>(
FecType::NON_SYSTEMATIC,
word_size,
n_data,
n_parities,
pkt_size)
{
this->fec_init();
}
Expand Down Expand Up @@ -80,14 +89,14 @@ class RsNf4 : public FecCode<T> {

int m = arith::get_smallest_power_of_2<int>(this->n_data);
this->fft = std::unique_ptr<fft::Radix2<T>>(
new fft::Radix2<T>(*ngff4, this->n, m));
new fft::Radix2<T>(*ngff4, this->n, m, this->pkt_size));

this->fft_full = std::unique_ptr<fft::Radix2<T>>(
new fft::Radix2<T>(*ngff4, this->n));
new fft::Radix2<T>(*ngff4, this->n, this->n, this->pkt_size));

unsigned len_2k = this->gf->get_code_len_high_compo(2 * this->n_data);
this->fft_2k = std::unique_ptr<fft::Radix2<T>>(
new fft::Radix2<T>(*ngff4, len_2k, len_2k));
new fft::Radix2<T>(*ngff4, len_2k, len_2k, this->pkt_size));
}

inline void init_others() override
Expand Down Expand Up @@ -134,9 +143,10 @@ class RsNf4 : public FecCode<T> {
vec::ZeroExtended<T> vwords(words, this->n);
this->fft->fft(output, &vwords);
// std::cout << "encoded:"; output->dump();
GroupedValues<T> true_val;
for (unsigned i = 0; i < this->code_len; i++) {
T val = output->get(i);
GroupedValues<T> true_val = ngff4->unpack(val);
ngff4->unpack(val, true_val);
if (true_val.flag > 0) {
props[i].add(
ValueLocation(offset, i), std::to_string(true_val.flag));
Expand Down Expand Up @@ -247,6 +257,102 @@ class RsNf4 : public FecCode<T> {
output->set(i, ngff4->unpack(output->get(i)).values);
}
}

/********** Encoding & Decoding using Buffers **********/

/**
 * Encode a group of data buffers into `output` (buffers variant).
 *
 * Each symbol of the `n_data` input buffers is packed in place, the
 * zero-extended input is transformed by `fft`, and every produced symbol is
 * then unpacked back in place. A symbol whose unpacked flag is non-zero is
 * recorded in the property list of its fragment.
 *
 * @param output buffers receiving the `code_len` encoded fragments
 * @param props per-fragment properties; receives one entry per flagged symbol,
 *  located at `offset + symbol_index * word_size`
 * @param offset offset used as the base of the recorded locations
 * @param words input buffers; their first `pkt_size` symbols are overwritten
 *  with packed values
 */
void encode(
    vec::Buffers<T>* output,
    std::vector<Properties>& props,
    off_t offset,
    vec::Buffers<T>* words) override
{
    // Pack the input symbols in place before transforming them.
    for (unsigned data_id = 0; data_id < this->n_data; ++data_id) {
        T* const symbols = words->get(data_id);
        for (size_t pos = 0; pos < this->pkt_size; ++pos) {
            symbols[pos] = ngff4->pack(symbols[pos]);
        }
    }

    vec::BuffersZeroExtended<T> padded_words(words, this->n);
    this->fft->fft(output, &padded_words);

    const size_t n_symbols = output->get_size();
    // Reused across all symbols; filled by the in-place `unpack` overload.
    GroupedValues<T> unpacked;
    for (unsigned frag_id = 0; frag_id < this->code_len; ++frag_id) {
        T* const symbols = output->get(frag_id);
        for (size_t pos = 0; pos < n_symbols; ++pos) {
            ngff4->unpack(symbols[pos], unpacked);
            if (unpacked.flag > 0) {
                // Remember the flag so that decoding can re-pack this symbol.
                props[frag_id].add(
                    ValueLocation(offset + pos * this->word_size, frag_id),
                    std::to_string(unpacked.flag));
            }
            symbols[pos] = unpacked.values;
        }
    }
}

void decode_prepare(
const DecodeContext<T>& context,
const std::vector<Properties>& props,
off_t offset,
vec::Buffers<T>* words) override
{
const vec::Vector<T>& fragments_ids = context.get_fragments_id();
off_t offset_max = offset + this->buf_size;
for (unsigned i = 0; i < this->n_data; ++i) {
const int frag_id = fragments_ids.get(i);
T* chunk = words->get(i);

// the vector will contain marked symbols that will be packed
// firstly. Since locations are stored in unordered map, the vector
// will be sorted later to facilitate packing un-marked symbols
std::vector<size_t> packed_symbs;
// pack marked symbols
for (auto const& data : props[frag_id].get_map()) {
const off_t loc_offset = data.first.get_offset();
if (loc_offset >= offset && loc_offset < offset_max) {
// As loc.offset := offset + j * this->word_size
const size_t j = (loc_offset - offset) / this->word_size;
packed_symbs.push_back(j);
// pack symbol at index `j`
uint32_t flag = std::stoul(data.second);
chunk[j] = ngff4->pack(chunk[j], flag);
}
}
// sort the list of packed symbols
std::sort(packed_symbs.begin(), packed_symbs.end());

// pack un-marked symbols
size_t curr_frag_index = 0;
for (auto const& done_id : packed_symbs) {
// pack symbols from `curr_frag_index` to `j-1`
for (; curr_frag_index < done_id; ++curr_frag_index) {
chunk[curr_frag_index] =
ngff4->pack(chunk[curr_frag_index]);
}
curr_frag_index++;
}
// pack last symbols from `curr_frag_index` to `this->pkt_size-1`
for (; curr_frag_index < this->pkt_size; ++curr_frag_index) {
chunk[curr_frag_index] = ngff4->pack(chunk[curr_frag_index]);
}
}
}

void decode_apply(
const DecodeContext<T>& context,
vec::Buffers<T>* output,
vec::Buffers<T>* words) override
{
// decode_apply: do the same thing as in fec_base
FecCode<T>::decode_apply(context, output, words);
// unpack decoded symbols
for (unsigned i = 0; i < this->n_data; ++i) {
T* chunk = output->get(i);
for (unsigned j = 0; j < this->pkt_size; ++j) {
chunk[j] = ngff4->unpack(chunk[j]).values;
}
}
}
};

} // namespace fec
Expand Down
7 changes: 7 additions & 0 deletions src/gf_nf4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,13 @@ GroupedValues<__uint128_t> NF4<__uint128_t>::unpack(__uint128_t a) const
return simd::unpack(a, this->n);
}

// Specialization of the in-place unpack for 128-bit words: forwards to
// simd::unpack, passing the word `a`, the destination `b` and `this->n`.
template <>
void NF4<__uint128_t>::unpack(__uint128_t a, GroupedValues<__uint128_t>& b)
const
{
simd::unpack(a, b, this->n);
}

template <>
__uint128_t NF4<__uint128_t>::pack(__uint128_t a) const
{
Expand Down
32 changes: 32 additions & 0 deletions src/gf_nf4.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ class NF4 : public gf::Field<T> {
T pack(T a) const;
T pack(T a, uint32_t flag) const;
GroupedValues<T> unpack(T a) const;
void unpack(T a, GroupedValues<T>& b) const;
T get_nth_root(T n) const override;
void compute_omegas(vec::Vector<T>* W, int n, T w) const override;
const gf::Field<uint32_t>& get_sub_field() const;
Expand Down Expand Up @@ -430,6 +431,33 @@ GroupedValues<T> NF4<T>::unpack(T a) const
return b;
}

/**
 * Unpack a word into `b` without returning a temporary.
 *
 * The word is read as `n` consecutive 32-bit lanes, lowest lane first. A lane
 * equal to 65536 (which does not fit in 16 bits) is stored as 0 and its bit is
 * set in the flag; any other lane is stored truncated to 16 bits.
 *
 * @param a the packed word
 * @param b receives the flag bits and the values built by `expand16`
 */
template <typename T>
void NF4<T>::unpack(T a, GroupedValues<T>& b) const
{
    uint16_t arr[this->n];
    uint32_t flag = 0;

    for (int i = 0; i < this->n; i++) {
        if (i > 0) {
            // Move to the next 32-bit lane; done as two 16-bit shifts.
            a = (a >> 16) >> 16;
        }
        const uint32_t lane = (uint32_t)(a & MASK32);
        const bool is_marked = (lane == 65536);
        if (is_marked) {
            flag |= (1 << i);
        }
        arr[i] = is_marked ? 0 : (uint16_t)lane;
    }

    b.flag = flag;
    b.values = expand16(arr);
}

// Use for fft
template <typename T>
T NF4<T>::get_nth_root(T n) const
Expand Down Expand Up @@ -535,6 +563,10 @@ __uint128_t NF4<__uint128_t>::pack(__uint128_t a, uint32_t flag) const;
template <>
GroupedValues<__uint128_t> NF4<__uint128_t>::unpack(__uint128_t a) const;

template <>
void NF4<__uint128_t>::unpack(__uint128_t a, GroupedValues<__uint128_t>& b)
const;

template <>
void NF4<__uint128_t>::hadamard_mul(int n, __uint128_t* x, __uint128_t* y)
const;
Expand Down
12 changes: 3 additions & 9 deletions src/gf_ring.h
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@ void RingModN<T>::mul_coef_to_buf(T a, T* src, T* dest, size_t len) const
DoubleSizeVal<T> coef = DoubleSizeVal<T>(a);
for (i = 0; i < len; i++) {
// perform multiplication
dest[i] = T((coef * src[i]) % this->_card);
dest[i] = mul(coef, src[i]);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of having a method mul, could we overload the * operator? That would make the code more readable.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This kind of change should go in a separate PR.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, you don't need to handle it here, I'll do it in #208

}
}

Expand All @@ -405,7 +405,7 @@ void RingModN<T>::add_two_bufs(T* src, T* dest, size_t len) const
size_t i;
for (i = 0; i < len; i++) {
// perform addition
dest[i] = (src[i] + dest[i]) % this->_card;
dest[i] = add(src[i], dest[i]);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of having a method add, could we overload the + operator? That would make the code more readable.

}
}

Expand All @@ -427,14 +427,8 @@ template <typename T>
void RingModN<T>::sub_two_bufs(T* bufa, T* bufb, T* res, size_t len) const
{
size_t i;
T result;
for (i = 0; i < len; i++) {
if (bufa[i] >= bufb[i]) {
result = bufa[i] - bufb[i];
} else {
result = this->_card - (bufb[i] - bufa[i]);
}
res[i] = result;
res[i] = sub(bufa[i], bufb[i]);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of having a method sub, could we overload the - operator? That would make the code more readable.

If these operations are the ones from RingModN, you could replace:

virtual T neg(T a) const;
virtual T add(T a, T b) const;
virtual T sub(T a, T b) const;
virtual T mul(T a, T b) const;
virtual T div(T a, T b) const;

by

virtual T operator-() const;
virtual T operator+(T lhs, T rhs) const;
virtual T operator-(T lhs, T rhs) const;
virtual T operator*(T lhs, T rhs) const;
virtual T operator/(T lhs, T rhs) const;

(replace T with const T& if the types are not limited to built-in integers, to avoid useless copies)

}
}

Expand Down
65 changes: 45 additions & 20 deletions src/simd_nf4.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ static inline aint128 m128i_to_uint128(m128i v)
}
#endif // #ifdef QUADIRON_USE_AVX2

inline aint128 expand16(aint16* arr, int n)
inline aint128 expand16(uint16_t* arr, int n)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why this change?

It should be explained in the commit message.

{
// since n <= 4
uint16_t _arr[4] __attribute__((aligned(ALIGN_SIZE))) = {0, 0, 0, 0};
Expand All @@ -61,7 +61,7 @@ inline aint128 expand16(aint16* arr, int n)
return m128i_to_uint128(b);
}

inline aint128 expand32(aint32* arr, int n)
inline aint128 expand32(uint32_t* arr, int n)
{
// since n <= 4
uint32_t _arr[4] __attribute__((aligned(ALIGN_SIZE))) = {0, 0, 0, 0};
Expand All @@ -72,35 +72,60 @@ inline aint128 expand32(aint32* arr, int n)
return m128i_to_uint128(b);
}

inline GroupedValues<__uint128_t> unpack(aint128 a, int n)
inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
{
aint32 flag = 0;
uint32_t ai[4] __attribute__((aligned(ALIGN_SIZE)));
uint32_t bi[4] __attribute__((aligned(ALIGN_SIZE))) = {0, 0, 0, 0};
uint16_t ai[8];
aint128 values;
int i;

m128i _a = _mm_loadu_si128((m128i*)&a);
ai[0] = _mm_extract_epi32(_a, 0);
ai[1] = _mm_extract_epi32(_a, 1);
ai[2] = _mm_extract_epi32(_a, 2);
ai[3] = _mm_extract_epi32(_a, 3);
for (i = 0; i < n; i++) {
if (ai[i] == 65536)
flag |= (1 << i);
else
bi[i] = (aint16)ai[i];
}
ai[0] = _mm_extract_epi16(_a, 0);
ai[1] = _mm_extract_epi16(_a, 1);
ai[2] = _mm_extract_epi16(_a, 2);
ai[3] = _mm_extract_epi16(_a, 3);
ai[4] = _mm_extract_epi16(_a, 4);
ai[5] = _mm_extract_epi16(_a, 5);
ai[6] = _mm_extract_epi16(_a, 6);
ai[7] = _mm_extract_epi16(_a, 7);

const uint32_t flag =
ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u);

m128i val = _mm_set_epi64(
_mm_setzero_si64(), _mm_set_pi16(bi[3], bi[2], bi[1], bi[0]));
_mm_setzero_si64(), _mm_set_pi16(ai[6], ai[4], ai[2], ai[0]));
_mm_store_si128((m128i*)&values, val);

GroupedValues<__uint128_t> b = {values, flag};

return b;
}

inline aint128 pack(aint128 a)
inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n)
{
uint16_t ai[8];
aint128 values;

m128i _a = _mm_loadu_si128((m128i*)&a);
ai[0] = _mm_extract_epi16(_a, 0);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for (int i = 0; i < sizeof(ai)/sizeof(ai[0]); ++i) {
    ai[i] = _mm_extract_epi16(_a, i);
}

The loop count being constant, the compiler will unroll the loop anyway, but the code will be more readable.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_mm_extract_epi16 does not accept a variable as its second argument (the lane index must be a compile-time constant) :(

ai[1] = _mm_extract_epi16(_a, 1);
ai[2] = _mm_extract_epi16(_a, 2);
ai[3] = _mm_extract_epi16(_a, 3);
ai[4] = _mm_extract_epi16(_a, 4);
ai[5] = _mm_extract_epi16(_a, 5);
ai[6] = _mm_extract_epi16(_a, 6);
ai[7] = _mm_extract_epi16(_a, 7);

const uint32_t flag =
ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u);

m128i val = _mm_set_epi64(
_mm_setzero_si64(), _mm_set_pi16(ai[6], ai[4], ai[2], ai[0]));
_mm_store_si128((m128i*)&values, val);

b.flag = flag;
b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign)
}

inline aint128 pack(__uint128_t a)
{
m128i _a = _mm_loadu_si128((m128i*)&a);
m128i b = _mm_set_epi32(
Expand All @@ -112,7 +137,7 @@ inline aint128 pack(aint128 a)
return m128i_to_uint128(b);
}

inline aint128 pack(aint128 a, aint32 flag)
inline aint128 pack(__uint128_t a, uint32_t flag)
{
aint32 b0, b1, b2, b3;
m128i _a = _mm_loadu_si128((m128i*)&a);
Expand Down
4 changes: 2 additions & 2 deletions src/vec_buffers.h
Original file line number Diff line number Diff line change
Expand Up @@ -408,10 +408,10 @@ void Buffers<T>::dump(void)
for (int i = 0; i < n; i++) {
std::cout << "\n\t" << i << ": ";
for (size_t j = 0; j < size - 1; j++) {
std::cout << unsigned((get(i))[j]) << "-";
std::cout << (get(i))[j] << "-";
}
if (size > 0) {
std::cout << unsigned((get(i))[size - 1]);
std::cout << (get(i))[size - 1];
}
}
std::cout << "\n)\n";
Expand Down
Loading