SIMD NF4 contains vectorized operations for NF4
lamphamsy committed Oct 15, 2018
1 parent 1bed85d commit a4c9b86
265 changes: 240 additions & 25 deletions src/simd_nf4.h

@@ -36,48 +36,47 @@
namespace quadiron {
namespace simd {

typedef uint32_t aint32 __attribute__((aligned(ALIGN_SIZE)));
typedef __uint128_t NF4Type;

/** Return an NF4Type integer from a __m128i register. */
static inline NF4Type m128i_to_uint128(__m128i v)
{
NF4Type i;
_mm_store_si128((__m128i*)&i, v);

return i; // NOLINT(clang-analyzer-core.uninitialized.UndefReturn)
}

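/** Spread up to `n` (n <= 4) 16-bit words into the four low 16-bit lanes of a 128-bit value; missing entries are zero. */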
inline NF4Type expand16(uint16_t* arr, int n)
{
// since n <= 4
uint16_t _arr[4] __attribute__((aligned(ALIGN_SIZE))) = {0, 0, 0, 0};
std::copy_n(arr, n, _arr);

__m128i b = _mm_set_epi64(
_mm_setzero_si64(), _mm_set_pi16(_arr[3], _arr[2], _arr[1], _arr[0]));

return m128i_to_uint128(b);
}

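/** Spread up to `n` (n <= 4) 32-bit words into the four 32-bit lanes of a 128-bit value; missing entries are zero. */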
inline NF4Type expand32(uint32_t* arr, int n)
{
// since n <= 4
uint32_t _arr[4] __attribute__((aligned(ALIGN_SIZE))) = {0, 0, 0, 0};
std::copy_n(arr, n, _arr);

__m128i b = _mm_set_epi32(_arr[3], _arr[2], _arr[1], _arr[0]);

return m128i_to_uint128(b);
}

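/** Split a packed word: `values` gathers the low 16 bits of each 32-bit lane into the low 64 bits, and `flag` gets one bit per lane whose upper 16 bits are non-zero (i.e. the lane held 65536). */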
inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
{
uint16_t ai[8];
NF4Type values;

__m128i _a = _mm_loadu_si128((__m128i*)&a);
ai[0] = _mm_extract_epi16(_a, 0);
ai[1] = _mm_extract_epi16(_a, 1);
ai[2] = _mm_extract_epi16(_a, 2);
@@ -90,9 +89,9 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
const uint32_t flag =
ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u);

__m128i val = _mm_set_epi64(
_mm_setzero_si64(), _mm_set_pi16(ai[6], ai[4], ai[2], ai[0]));
_mm_store_si128((__m128i*)&values, val);

GroupedValues<__uint128_t> b = {values, flag};

@@ -102,9 +101,9 @@ inline GroupedValues<__uint128_t> unpack(__uint128_t a, int n)
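/** Same as unpack() above, but writes the result into `b` instead of returning it. */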
inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n)
{
uint16_t ai[8];
NF4Type values;

__m128i _a = _mm_loadu_si128((__m128i*)&a);
ai[0] = _mm_extract_epi16(_a, 0);
ai[1] = _mm_extract_epi16(_a, 1);
ai[2] = _mm_extract_epi16(_a, 2);
@@ -117,18 +116,18 @@ inline void unpack(__uint128_t a, GroupedValues<__uint128_t>& b, int n)
const uint32_t flag =
ai[1] | (!!ai[3] << 1u) | (!!ai[5] << 2u) | (!!ai[7] << 3u);

__m128i val = _mm_set_epi64(
_mm_setzero_si64(), _mm_set_pi16(ai[6], ai[4], ai[2], ai[0]));
_mm_store_si128((__m128i*)&values, val);

b.flag = flag;
b.values = values; // NOLINT(clang-analyzer-core.uninitialized.Assign)
}

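/** Widen the four 16-bit values held in the low 64 bits of `a` back into four 32-bit lanes. */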
inline NF4Type pack(__uint128_t a)
{
__m128i _a = _mm_loadu_si128((__m128i*)&a);
__m128i b = _mm_set_epi32(
_mm_extract_epi16(_a, 3),
_mm_extract_epi16(_a, 2),
_mm_extract_epi16(_a, 1),
@@ -137,10 +136,10 @@ inline aint128 pack(__uint128_t a)
return m128i_to_uint128(b);
}

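/** As pack() above, but every lane whose bit is set in `flag` is restored to 65536 instead of being taken from `a`. */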
inline NF4Type pack(__uint128_t a, uint32_t flag)
{
aint32 b0, b1, b2, b3;
__m128i _a = _mm_loadu_si128((__m128i*)&a);

if (flag & 1)
b0 = 65536;
@@ -162,11 +161,227 @@ inline aint128 pack(__uint128_t a, uint32_t flag)
else
b3 = _mm_extract_epi16(_a, 3);

__m128i b = _mm_set_epi32(b3, b2, b1, b0);

return m128i_to_uint128(b);
}

/* ================= Basic operations for NF4 ================= */

#if defined(__AVX2__)
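/* Single-element NF4 arithmetic reuses the 256-bit ADD_MOD/SUB_MOD/MULFULL_MOD helpers: the 128-bit operand is viewed as the low 128 bits of a 256-bit register (CAST_TO_DOUBLE) and only the low 128 bits of the result are written back (STORE_LOW). */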

inline VecType CAST_TO_DOUBLE(HalfVecType x)
{
return _mm256_castsi128_si256(x);
}

inline void STORE_LOW(HalfVecType* address, VecType reg)
{
_mm_store_si128(address, _mm256_castsi256_si128(reg));
}

inline NF4Type add(NF4Type a, NF4Type b)
{
HalfVecType res;
VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
STORE_LOW(&res, ADD_MOD(_a, _b, F4));
return (NF4Type)res;
}

inline NF4Type sub(NF4Type a, NF4Type b)
{
HalfVecType res;
VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
STORE_LOW(&res, SUB_MOD(_a, _b, F4));
return (NF4Type)res;
}

inline NF4Type mul(NF4Type a, NF4Type b)
{
HalfVecType res;
VecType _a = CAST_TO_DOUBLE((HalfVecType)a);
VecType _b = CAST_TO_DOUBLE((HalfVecType)b);
STORE_LOW(&res, MULFULL_MOD(_a, _b, F4));
return (NF4Type)res;
}

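/* The *_rem helpers below handle the tail of a buffer one 128-bit NF4 word at a time, for the elements left over after the full-vector loops further down. */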
inline void
add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
{
// add the remaining `y` elements to both `x` and `x_half`
HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
for (unsigned i = 0; i < n; ++i) {
VecType _x_p = CAST_TO_DOUBLE(_x[i]);
VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]);
VecType _y_p = CAST_TO_DOUBLE(_y[i]);

STORE_LOW(_x + i, ADD_MOD(_x_p, _y_p, F4));
STORE_LOW(_x_half + i, ADD_MOD(_x_next_p, _y_p, F4));
}
}

inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y)
{
HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
for (unsigned i = 0; i < n; ++i) {
VecType _x_p = CAST_TO_DOUBLE(_x[i]);
VecType _y_p = CAST_TO_DOUBLE(_y[i]);

STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
}
}

inline void
hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
{
HalfVecType* _x = reinterpret_cast<HalfVecType*>(x);
HalfVecType* _x_half = reinterpret_cast<HalfVecType*>(x_half);
HalfVecType* _y = reinterpret_cast<HalfVecType*>(y);
for (unsigned i = 0; i < n; ++i) {
VecType _x_p = CAST_TO_DOUBLE(_x[i]);
VecType _x_next_p = CAST_TO_DOUBLE(_x_half[i]);
VecType _y_p = CAST_TO_DOUBLE(_y[i]);

STORE_LOW(_x + i, MULFULL_MOD(_x_p, _y_p, F4));
STORE_LOW(_x_half + i, MULFULL_MOD(_x_next_p, _y_p, F4));
}
}

#elif defined(__SSE4_1__)
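/* With 128-bit vectors a VecType holds exactly one NF4 word, so the full-vector loops below cover every element and these tail helpers have nothing left to do. */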

inline NF4Type add(NF4Type a, NF4Type b)
{
VecType res;
STORE(&res, ADD_MOD((VecType)a, (VecType)b, F4));
return (NF4Type)res;
}

inline NF4Type sub(NF4Type a, NF4Type b)
{
VecType res;
STORE(&res, SUB_MOD((VecType)a, (VecType)b, F4));
return (NF4Type)res;
}

inline NF4Type mul(NF4Type a, NF4Type b)
{
VecType res;
STORE(&res, MULFULL_MOD((VecType)a, (VecType)b, F4));
return (NF4Type)res;
}

inline void
add_buf_to_two_bufs_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
{
// do nothing
}

inline void hadamard_mul_rem(unsigned n, NF4Type* x, NF4Type* y)
{
// do nothing
}

inline void
hadamard_mul_doubled_rem(unsigned n, NF4Type* x, NF4Type* x_half, NF4Type* y)
{
// do nothing
}

#endif

/* ==================== Operations for NF4 =================== */
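/* A VecType register holds `ratio` packed NF4 words; the loops below process `vec_len` full registers (`num_len` words) and delegate the remaining `rem_len` words to the *_rem helpers above. */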

/** Add buffer `y` to both halves of `x`. `x` has length `n`, `y` has length `n / 2`. */
inline void add_buf_to_two_bufs(unsigned n, NF4Type* _x, NF4Type* _y)
{
unsigned i;
VecType* x = reinterpret_cast<VecType*>(_x);
VecType* y = reinterpret_cast<VecType*>(_y);

const unsigned ratio = sizeof(*x) / sizeof(*_x);
const unsigned half_len = n / 2;
const unsigned vec_len = half_len / ratio;
const unsigned num_len = vec_len * ratio;
const unsigned rem_len = half_len - num_len;

NF4Type* x_half = _x + half_len;
VecType* x_next = reinterpret_cast<VecType*>(x_half);

// add y to the first half of `x`
for (i = 0; i < vec_len; ++i) {
x[i] = ADD_MOD(x[i], y[i], F4);
}

// add y to the second half of `x`
for (i = 0; i < vec_len; ++i) {
x_next[i] = ADD_MOD(x_next[i], y[i], F4);
}

if (rem_len > 0) {
add_buf_to_two_bufs_rem(
rem_len, _x + num_len, x_half + num_len, _y + num_len);
}
}

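/** Coordinate-wise product over F4: x[i] = x[i] * y[i] for i in [0, n). */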
inline void hadamard_mul(unsigned n, NF4Type* _x, NF4Type* _y)
{
unsigned i;
VecType* x = reinterpret_cast<VecType*>(_x);
VecType* y = reinterpret_cast<VecType*>(_y);

const unsigned ratio = sizeof(*x) / sizeof(*_x);
const unsigned vec_len = n / ratio;
const unsigned num_len = vec_len * ratio;
const unsigned rem_len = n - num_len;

// multiply the vectorized part of `x` by `y`
for (i = 0; i < vec_len; ++i) {
x[i] = MULFULL_MOD(x[i], y[i], F4);
}

if (rem_len > 0) {
// multiply the remaining elements of `x` by `y`
hadamard_mul_rem(rem_len, _x + num_len, _y + num_len);
}
}

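/** Multiply both halves of `x` (length `n`) coordinate-wise by `y` (length `n / 2`). */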
inline void hadamard_mul_doubled(unsigned n, NF4Type* _x, NF4Type* _y)
{
unsigned i;
VecType* x = reinterpret_cast<VecType*>(_x);
VecType* y = reinterpret_cast<VecType*>(_y);

const unsigned ratio = sizeof(*x) / sizeof(*_x);
const unsigned half_len = n / 2;
const unsigned vec_len = half_len / ratio;
const unsigned num_len = vec_len * ratio;
const unsigned rem_len = half_len - num_len;

NF4Type* x_half = _x + half_len;
VecType* x_next = reinterpret_cast<VecType*>(x_half);

// multiply the first half of `x` by `y`
for (i = 0; i < vec_len; ++i) {
x[i] = MULFULL_MOD(x[i], y[i], F4);
}

// multiply the second half of `x` by `y`
for (i = 0; i < vec_len; ++i) {
x_next[i] = MULFULL_MOD(x_next[i], y[i], F4);
}

if (rem_len > 0) {
// multiply the remaining elements of both halves by `y`
hadamard_mul_doubled_rem(
rem_len, _x + num_len, x_half + num_len, _y + num_len);
}
}

} // namespace simd
} // namespace quadiron
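
For orientation, the following is a minimal scalar reference sketch of the arithmetic the vectorized add()/mul() above perform, assuming ADD_MOD and MULFULL_MOD implement lane-wise addition and multiplication modulo F4 = 65537 over four 32-bit lanes of a 128-bit word. The names F4_PRIME, lane, set_lane, ref_add and ref_mul are local to the sketch and not part of quadiron, and the 65536 flag handling of pack()/unpack() is deliberately ignored.

#include <cstdint>
#include <cstdio>

using u128 = unsigned __int128; // same width as the NF4Type above

static const uint64_t F4_PRIME = 65537; // illustrative constant, not quadiron's F4 symbol

// Read lane i (0..3) of a packed word.
static uint32_t lane(u128 v, int i)
{
    return static_cast<uint32_t>(v >> (32 * i));
}

// Write lane i (0..3) of a packed word.
static u128 set_lane(u128 v, int i, uint64_t x)
{
    const u128 mask = static_cast<u128>(0xFFFFFFFFu) << (32 * i);
    return (v & ~mask) | (static_cast<u128>(x) << (32 * i));
}

// Scalar counterpart of simd::add(): lane-wise addition mod F4.
static u128 ref_add(u128 a, u128 b)
{
    u128 r = 0;
    for (int i = 0; i < 4; ++i)
        r = set_lane(r, i, (static_cast<uint64_t>(lane(a, i)) + lane(b, i)) % F4_PRIME);
    return r;
}

// Scalar counterpart of simd::mul(): lane-wise multiplication mod F4.
static u128 ref_mul(u128 a, u128 b)
{
    u128 r = 0;
    for (int i = 0; i < 4; ++i)
        r = set_lane(r, i, (static_cast<uint64_t>(lane(a, i)) * lane(b, i)) % F4_PRIME);
    return r;
}

int main()
{
    u128 a = 0, b = 0;
    for (int i = 0; i < 4; ++i) {
        a = set_lane(a, i, 65536); // q - 1 in every lane
        b = set_lane(b, i, i + 1);
    }
    const u128 s = ref_add(a, b);
    const u128 p = ref_mul(a, b);
    for (int i = 0; i < 4; ++i)
        std::printf("lane %d: add = %u, mul = %u\n", i, lane(s, i), lane(p, i));
    return 0;
}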
