Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Part2: Vectorize operations for Radix-2 FFT & re-work vectorization part #254

Merged
merged 13 commits into from
Dec 20, 2018
Merged
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ include(GNUInstallDirs)
set(LIB_SRC
${SOURCE_DIR}/core.cpp
${SOURCE_DIR}/fec_vectorisation.cpp
${SOURCE_DIR}/fft_2n.cpp
${SOURCE_DIR}/misc.cpp
${SOURCE_DIR}/gf_nf4.cpp
${SOURCE_DIR}/gf_ring.cpp
Expand Down
2 changes: 0 additions & 2 deletions src/fec_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,6 @@ static inline uint64_t hrtime_usec(timeval begin)
return 1000000 * (tv.tv_sec - begin.tv_sec) + tv.tv_usec - begin.tv_usec;
}

#define OOR_MARK 1

enum class FecType {
/** Systematic code
*
Expand Down
11 changes: 11 additions & 0 deletions src/fec_rs_fnt.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@ class RsFnt : public FecCode<T> {
// decoding context used in encoding of systematic FNT
std::unique_ptr<DecodeContext<T>> enc_context;

// Indices used for accelerated functions
size_t simd_vec_len;
size_t simd_trailing_len;
size_t simd_offset;

public:
RsFnt(
FecType type,
Expand All @@ -70,6 +75,12 @@ class RsFnt : public FecCode<T> {
: FecCode<T>(type, word_size, n_data, n_parities, pkt_size)
{
this->fec_init();

// Indices used for accelerated functions
const unsigned ratio = simd::countof<T>();
simd_vec_len = this->pkt_size / ratio;
simd_trailing_len = this->pkt_size - simd_vec_len * ratio;
simd_offset = simd_vec_len * ratio;
}

inline void check_params() override
Expand Down
28 changes: 7 additions & 21 deletions src/fec_vectorisation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
#include "fec_rs_fnt.h"

/*
* The file includes vectorized operations used by FEC classes
* The file includes specialized operations used by FEC classes
*/

#ifdef QUADIRON_USE_SIMD
Expand All @@ -53,20 +53,13 @@ void RsFnt<uint16_t>::encode_post_process(
uint16_t threshold = this->gf->card_minus_one();
unsigned code_len = this->n_outputs;

// number of elements per vector register
unsigned vec_size = simd::countof<uint16_t>();
// number of vector registers per fragment packet
size_t vecs_nb = size / vec_size;
// odd number of elements not vectorized
size_t last_len = size - vecs_nb * vec_size;

simd::encode_post_process(
output, props, offset, code_len, threshold, vecs_nb);
output, props, offset, code_len, threshold, simd_vec_len);

if (last_len > 0) {
if (simd_trailing_len > 0) {
for (unsigned i = 0; i < code_len; ++i) {
uint16_t* chunk = output.get(i);
for (size_t j = vecs_nb * vec_size; j < size; ++j) {
for (size_t j = simd_offset; j < size; ++j) {
if (chunk[j] == threshold) {
props[i].add(offset + j, OOR_MARK);
}
Expand All @@ -85,20 +78,13 @@ void RsFnt<uint32_t>::encode_post_process(
const uint32_t threshold = this->gf->card_minus_one();
const unsigned code_len = this->n_outputs;

// number of elements per vector register
const unsigned vec_size = simd::countof<uint32_t>();
// number of vector registers per fragment packet
const size_t vecs_nb = size / vec_size;
// odd number of elements not vectorized
const size_t last_len = size - vecs_nb * vec_size;

simd::encode_post_process(
output, props, offset, code_len, threshold, vecs_nb);
output, props, offset, code_len, threshold, simd_vec_len);

if (last_len > 0) {
if (simd_trailing_len > 0) {
for (unsigned i = 0; i < code_len; ++i) {
uint32_t* chunk = output.get(i);
for (size_t j = vecs_nb * vec_size; j < size; ++j) {
for (size_t j = simd_offset; j < size; ++j) {
if (chunk[j] == threshold) {
props[i].add(offset + j, OOR_MARK);
}
Expand Down
192 changes: 192 additions & 0 deletions src/fft_2n.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
/* -*- mode: c++ -*- */
/*
* Copyright 2017-2018 Scality
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/

#include "fft_2n.h"

/*
* This file contains the SIMD-accelerated specializations of the Radix2
* butterfly steps for uint16_t and uint32_t words.
*/

#ifdef QUADIRON_USE_SIMD

#include "simd.h"

namespace quadiron {
namespace fft {

/**
 * SIMD specialization (16-bit words) of the two-layer CT butterfly step.
 *
 * Three coefficients are read from `vec_W` at indices derived from `start`,
 * `m` and `this->n`, then handed to the vectorized kernel. Whatever the
 * vectorized kernel does not cover (signalled by a non-zero
 * `simd_trailing_len`) is processed by the scalar `_slow` variant, which is
 * given `simd_offset` as its starting position.
 *
 * @param buf buffers transformed in place
 * @param start index of the first butterfly handled by this step
 * @param m butterfly group size for this step (presumably the half-width of
 *          the current layer -- TODO confirm against Radix2)
 */
template <>
void Radix2<uint16_t>::butterfly_ct_two_layers_step(
    vec::Buffers<uint16_t>& buf,
    unsigned start,
    unsigned m)
{
    // Index computations are kept as a single left-to-right expression so
    // that integer truncation happens exactly where it always did.
    const unsigned w_idx = start * this->n / m / 2;
    const unsigned w_half_idx = w_idx / 2;

    const uint16_t r1 = vec_W[w_idx];
    const uint16_t r2 = vec_W[w_half_idx];
    const uint16_t r3 = vec_W[w_half_idx + this->n / 4];

    // Vectorized part.
    simd::butterfly_ct_two_layers_step(
        buf, r1, r2, r3, start, m, simd_vec_len, card);

    // Scalar part for the elements left over after the vectorized pass.
    const bool has_tail = simd_trailing_len > 0;
    if (has_tail) {
        butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset);
    }
}

/**
 * SIMD specialization (16-bit words) of a single CT butterfly step.
 *
 * The vectorized kernel is run first; if `simd_trailing_len` is non-zero the
 * scalar `_slow` variant finishes the job starting at `simd_offset`.
 *
 * @param buf buffers transformed in place
 * @param r coefficient applied by the butterfly
 * @param start index of the first butterfly handled by this step
 * @param m butterfly group size for this step
 * @param step stride forwarded to both the SIMD and the scalar kernels
 */
template <>
void Radix2<uint16_t>::butterfly_ct_step(
    vec::Buffers<uint16_t>& buf,
    uint16_t r,
    unsigned start,
    unsigned m,
    unsigned step)
{
    // Vectorized part.
    simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card);

    // Scalar part for the elements left over after the vectorized pass.
    const bool has_tail = simd_trailing_len > 0;
    if (has_tail) {
        butterfly_ct_step_slow(buf, r, start, m, step, simd_offset);
    }
}

/**
 * SIMD specialization (16-bit words) of a single GS butterfly step.
 *
 * The vectorized kernel is run first; if `simd_trailing_len` is non-zero the
 * scalar `_slow` variant finishes the job starting at `simd_offset`.
 *
 * @param buf buffers transformed in place
 * @param coef coefficient applied by the butterfly
 * @param start index of the first butterfly handled by this step
 * @param m butterfly group size for this step
 * @param step stride; only the scalar fallback receives it, the SIMD kernel
 *             is called without it
 */
template <>
void Radix2<uint16_t>::butterfly_gs_step(
    vec::Buffers<uint16_t>& buf,
    uint16_t coef,
    unsigned start,
    unsigned m,
    unsigned step)
{
    // Vectorized part (note: `step` is not an argument of this kernel).
    simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card);

    // Scalar part for the elements left over after the vectorized pass.
    const bool has_tail = simd_trailing_len > 0;
    if (has_tail) {
        butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset);
    }
}

/**
 * SIMD specialization (16-bit words) of the "simple" GS butterfly step.
 *
 * The vectorized kernel is run first; if `simd_trailing_len` is non-zero the
 * scalar `_slow` variant finishes the job starting at `simd_offset`.
 *
 * @param buf buffers transformed in place
 * @param coef coefficient applied by the butterfly
 * @param start index of the first butterfly handled by this step
 * @param m butterfly group size for this step
 * @param step stride; only the scalar fallback receives it, the SIMD kernel
 *             is called without it
 */
template <>
void Radix2<uint16_t>::butterfly_gs_step_simple(
    vec::Buffers<uint16_t>& buf,
    uint16_t coef,
    unsigned start,
    unsigned m,
    unsigned step)
{
    // perform vector operations
    simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card);

    // for last elements, perform as non-SIMD method
    if (simd_trailing_len > 0) {
        butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset);
    }
}

/**
 * SIMD specialization (32-bit words) of the two-layer CT butterfly step.
 *
 * Three coefficients are read from `vec_W` at indices derived from `start`,
 * `m` and `this->n`, then handed to the vectorized kernel. Whatever the
 * vectorized kernel does not cover (signalled by a non-zero
 * `simd_trailing_len`) is processed by the scalar `_slow` variant, which is
 * given `simd_offset` as its starting position.
 *
 * @param buf buffers transformed in place
 * @param start index of the first butterfly handled by this step
 * @param m butterfly group size for this step (presumably the half-width of
 *          the current layer -- TODO confirm against Radix2)
 */
template <>
void Radix2<uint32_t>::butterfly_ct_two_layers_step(
    vec::Buffers<uint32_t>& buf,
    unsigned start,
    unsigned m)
{
    // Index computations are kept as a single left-to-right expression so
    // that integer truncation happens exactly where it always did.
    const unsigned w_idx = start * this->n / m / 2;
    const unsigned w_half_idx = w_idx / 2;

    const uint32_t r1 = vec_W[w_idx];
    const uint32_t r2 = vec_W[w_half_idx];
    const uint32_t r3 = vec_W[w_half_idx + this->n / 4];

    // Vectorized part.
    simd::butterfly_ct_two_layers_step(
        buf, r1, r2, r3, start, m, simd_vec_len, card);

    // Scalar part for the elements left over after the vectorized pass.
    const bool has_tail = simd_trailing_len > 0;
    if (has_tail) {
        butterfly_ct_two_layers_step_slow(buf, start, m, simd_offset);
    }
}

/**
 * SIMD specialization (32-bit words) of a single CT butterfly step.
 *
 * The vectorized kernel is run first; if `simd_trailing_len` is non-zero the
 * scalar `_slow` variant finishes the job starting at `simd_offset`.
 *
 * @param buf buffers transformed in place
 * @param r coefficient applied by the butterfly
 * @param start index of the first butterfly handled by this step
 * @param m butterfly group size for this step
 * @param step stride forwarded to both the SIMD and the scalar kernels
 */
template <>
void Radix2<uint32_t>::butterfly_ct_step(
    vec::Buffers<uint32_t>& buf,
    uint32_t r,
    unsigned start,
    unsigned m,
    unsigned step)
{
    // Vectorized part.
    simd::butterfly_ct_step(buf, r, start, m, step, simd_vec_len, card);

    // Scalar part for the elements left over after the vectorized pass.
    const bool has_tail = simd_trailing_len > 0;
    if (has_tail) {
        butterfly_ct_step_slow(buf, r, start, m, step, simd_offset);
    }
}

/**
 * SIMD specialization (32-bit words) of a single GS butterfly step.
 *
 * The vectorized kernel is run first; if `simd_trailing_len` is non-zero the
 * scalar `_slow` variant finishes the job starting at `simd_offset`.
 *
 * @param buf buffers transformed in place
 * @param coef coefficient applied by the butterfly
 * @param start index of the first butterfly handled by this step
 * @param m butterfly group size for this step
 * @param step stride; only the scalar fallback receives it, the SIMD kernel
 *             is called without it
 */
template <>
void Radix2<uint32_t>::butterfly_gs_step(
    vec::Buffers<uint32_t>& buf,
    uint32_t coef,
    unsigned start,
    unsigned m,
    unsigned step)
{
    // Vectorized part (note: `step` is not an argument of this kernel).
    simd::butterfly_gs_step(buf, coef, start, m, simd_vec_len, card);

    // Scalar part for the elements left over after the vectorized pass.
    const bool has_tail = simd_trailing_len > 0;
    if (has_tail) {
        butterfly_gs_step_slow(buf, coef, start, m, step, simd_offset);
    }
}

/**
 * SIMD specialization (32-bit words) of the "simple" GS butterfly step.
 *
 * The vectorized kernel is run first; if `simd_trailing_len` is non-zero the
 * scalar `_slow` variant finishes the job starting at `simd_offset`.
 *
 * @param buf buffers transformed in place
 * @param coef coefficient applied by the butterfly
 * @param start index of the first butterfly handled by this step
 * @param m butterfly group size for this step
 * @param step stride; only the scalar fallback receives it, the SIMD kernel
 *             is called without it
 */
template <>
void Radix2<uint32_t>::butterfly_gs_step_simple(
    vec::Buffers<uint32_t>& buf,
    uint32_t coef,
    unsigned start,
    unsigned m,
    unsigned step)
{
    // Vectorized part (note: `step` is not an argument of this kernel).
    simd::butterfly_gs_step_simple(buf, coef, start, m, simd_vec_len, card);

    // Scalar part for the elements left over after the vectorized pass.
    const bool has_tail = simd_trailing_len > 0;
    if (has_tail) {
        butterfly_gs_step_simple_slow(buf, coef, start, m, step, simd_offset);
    }
}

} // namespace fft
} // namespace quadiron

#endif // #ifdef QUADIRON_USE_SIMD
Loading