diff --git a/src/avx512-64bit-argsort.hpp b/src/avx512-64bit-argsort.hpp
index c6499a6c..000f822c 100644
--- a/src/avx512-64bit-argsort.hpp
+++ b/src/avx512-64bit-argsort.hpp
@@ -108,7 +108,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
     zmm_t arrzmm[4];
     argzmm_t argzmm[4];
 
-#pragma GCC unroll 2
+#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
     for (int ii = 0; ii < 2; ++ii) {
         argzmm[ii] = argtype::loadu(arg + 8 * ii);
         arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -117,7 +117,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
 
     uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull;
     opmask_t load_mask[2] = {0xFF, 0xFF};
-#pragma GCC unroll 2
+#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
     for (int ii = 0; ii < 2; ++ii) {
         load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
         argzmm[ii + 2] = argtype::maskz_loadu(load_mask[ii], arg + 16 + 8 * ii);
@@ -151,7 +151,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
     zmm_t arrzmm[8];
     argzmm_t argzmm[8];
 
-#pragma GCC unroll 4
+#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
     for (int ii = 0; ii < 4; ++ii) {
         argzmm[ii] = argtype::loadu(arg + 8 * ii);
         arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -160,7 +160,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
 
     opmask_t load_mask[4] = {0xFF, 0xFF, 0xFF, 0xFF};
     uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull;
-#pragma GCC unroll 4
+#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
     for (int ii = 0; ii < 4; ++ii) {
         load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
         argzmm[ii + 4] = argtype::maskz_loadu(load_mask[ii], arg + 32 + 8 * ii);
@@ -170,7 +170,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
                                                         argzmm[ii + 4]);
     }
 
-#pragma GCC unroll 4
+#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
     for (int ii = 0; ii < 8; ii = ii + 2) {
         bitonic_merge_two_zmm_64bit<vtype, argtype>(
                 arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
@@ -179,11 +179,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
     bitonic_merge_four_zmm_64bit<vtype, argtype>(arrzmm + 4, argzmm + 4);
     bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm, argzmm);
 
-#pragma GCC unroll 4
+#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
     for (int ii = 0; ii < 4; ++ii) {
         argtype::storeu(arg + 8 * ii, argzmm[ii]);
     }
-#pragma GCC unroll 4
+#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
     for (int ii = 0; ii < 4; ++ii) {
         argtype::mask_storeu(arg + 32 + 8 * ii, load_mask[ii], argzmm[ii + 4]);
     }
@@ -203,7 +203,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
 //    zmm_t arrzmm[16];
 //    argzmm_t argzmm[16];
 //
-//#pragma GCC unroll 8
+//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
 //    for (int ii = 0; ii < 8; ++ii) {
 //        argzmm[ii] = argtype::loadu(arg + 8*ii);
 //        arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
@@ -213,19 +213,19 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
 //    opmask_t load_mask[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
 //    if (N != 128) {
 //        uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull;
-//#pragma GCC unroll 8
+//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
 //        for (int ii = 0; ii < 8; ++ii) {
 //            load_mask[ii] = (combined_mask >> (ii*8)) & 0xFF;
 //        }
 //    }
-//#pragma GCC unroll 8
+//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
 //    for (int ii = 0; ii < 8; ++ii) {
 //        argzmm[ii+8] = argtype::maskz_loadu(load_mask[ii], arg + 64 + 8*ii);
 //        arrzmm[ii+8] = vtype::template mask_i64gather<sizeof(type_t)>(vtype::zmm_max(), load_mask[ii], argzmm[ii+8], arr);
 //        arrzmm[ii+8] = sort_zmm_64bit<vtype, argtype>(arrzmm[ii+8], argzmm[ii+8]);
 //    }
 //
-//#pragma GCC unroll 8
+//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
 //    for (int ii = 0; ii < 16; ii = ii + 2) {
 //        bitonic_merge_two_zmm_64bit<vtype, argtype>(arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
 //    }
@@ -237,11 +237,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
 //    bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm+8, argzmm+8);
 //    bitonic_merge_sixteen_zmm_64bit<vtype, argtype>(arrzmm, argzmm);
 //
-//#pragma GCC unroll 8
+//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
 //    for (int ii = 0; ii < 8; ++ii) {
 //        argtype::storeu(arg + 8*ii, argzmm[ii]);
 //    }
-//#pragma GCC unroll 8
+//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
 //    for (int ii = 0; ii < 8; ++ii) {
 //        argtype::mask_storeu(arg + 64 + 8*ii, load_mask[ii], argzmm[ii + 8]);
 //    }
diff --git a/src/avx512-common-argsort.h b/src/avx512-common-argsort.h
index e829ab62..7a36c0ef 100644
--- a/src/avx512-common-argsort.h
+++ b/src/avx512-common-argsort.h
@@ -198,7 +198,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
     // first and last vtype::numlanes values are partitioned at the end
     zmm_t vec_left[num_unroll], vec_right[num_unroll];
     argzmm_t argvec_left[num_unroll], argvec_right[num_unroll];
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         argvec_left[ii] = argtype::loadu(arg + left + vtype::numlanes * ii);
         vec_left[ii] = vtype::template i64gather<sizeof(type_t)>(
@@ -224,7 +224,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
          */
         if ((r_store + vtype::numlanes) - right < left - l_store) {
             right -= num_unroll * vtype::numlanes;
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
             for (int ii = 0; ii < num_unroll; ++ii) {
                 arg_vec[ii]
                         = argtype::loadu(arg + right + ii * vtype::numlanes);
@@ -233,7 +233,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
             }
         }
         else {
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
             for (int ii = 0; ii < num_unroll; ++ii) {
                 arg_vec[ii] = argtype::loadu(arg + left + ii * vtype::numlanes);
                 curr_vec[ii] = vtype::template i64gather<sizeof(type_t)>(
@@ -242,7 +242,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
             left += num_unroll * vtype::numlanes;
         }
         // partition the current vector and save it on both sides of the array
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
         for (int ii = 0; ii < num_unroll; ++ii) {
             int32_t amount_gt_pivot
                     = partition_vec<vtype>(arg,
@@ -259,7 +259,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
     }
 
     /* partition and save vec_left and vec_right */
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         int32_t amount_gt_pivot
                 = partition_vec<vtype>(arg,
@@ -273,7 +273,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
         l_store += (vtype::numlanes - amount_gt_pivot);
         r_store -= amount_gt_pivot;
     }
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         int32_t amount_gt_pivot
                 = partition_vec<vtype>(arg,
diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h
index 6e5cd15e..8cd75d70 100644
--- a/src/avx512-common-qsort.h
+++ b/src/avx512-common-qsort.h
@@ -67,9 +67,12 @@
 #define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
 #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
 
+/* Compiler specific macros */
 #ifdef _MSC_VER
 #define X86_SIMD_SORT_INLINE static inline
 #define X86_SIMD_SORT_FINLINE static __forceinline
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
 #elif defined(__CYGWIN__)
 /*
  * Force inline in cygwin to work around a compiler bug. See
@@ -80,13 +83,21 @@
 #elif defined(__GNUC__)
 #define X86_SIMD_SORT_INLINE static inline
 #define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
+#define LIKELY(x) __builtin_expect((x), 1)
+#define UNLIKELY(x) __builtin_expect((x), 0)
 #else
 #define X86_SIMD_SORT_INLINE static
 #define X86_SIMD_SORT_FINLINE static
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
 #endif
 
-#define LIKELY(x) __builtin_expect((x), 1)
-#define UNLIKELY(x) __builtin_expect((x), 0)
+#if __GNUC__ >= 8
+#define X86_SIMD_SORT_UNROLL_LOOP(num)\
+GCC unroll num
+#else
+#define X86_SIMD_SORT_UNROLL_LOOP(num)
+#endif
 
 template <typename type>
 struct zmm_vector;
@@ -382,7 +393,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
     // We will now have atleast 16 registers worth of data to process:
     // left and right vtype::numlanes values are partitioned at the end
     zmm_t vec_left[num_unroll], vec_right[num_unroll];
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
         vec_right[ii] = vtype::loadu(
@@ -403,20 +414,20 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
          */
         if ((r_store + vtype::numlanes) - right < left - l_store) {
             right -= num_unroll * vtype::numlanes;
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
             for (int ii = 0; ii < num_unroll; ++ii) {
                 curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
             }
         }
         else {
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
             for (int ii = 0; ii < num_unroll; ++ii) {
                 curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
             }
             left += num_unroll * vtype::numlanes;
         }
         // partition the current vector and save it on both sides of the array
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
         for (int ii = 0; ii < num_unroll; ++ii) {
             int32_t amount_ge_pivot
                     = partition_vec<vtype>(arr,
@@ -432,7 +443,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
     }
 
     /* partition and save vec_left[8] and vec_right[8] */
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         int32_t amount_ge_pivot
                 = partition_vec<vtype>(arr,
@@ -445,7 +456,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
         l_store += (vtype::numlanes - amount_ge_pivot);
         r_store -= amount_ge_pivot;
     }
-#pragma GCC unroll 8
+#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
     for (int ii = 0; ii < num_unroll; ++ii) {
         int32_t amount_ge_pivot
                 = partition_vec<vtype>(arr,
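
Note on the unroll macro (not part of the patch itself): preprocessors generally do not macro-expand the operand of a #pragma directive they do not recognize, so a line of the form "#pragma X86_SIMD_SORT_UNROLL_LOOP(8)" is liable to be treated as an unknown pragma and ignored rather than rewritten into "#pragma GCC unroll 8". The portable way to generate a pragma from a macro is the C99/C++11 _Pragma operator, with the macro invoked on its own, without a leading #pragma at the call site. The snippet below is a minimal sketch of that pattern, assuming GCC 8 or newer for the "GCC unroll" pragma; the X86_SIMD_SORT_PRAGMA helper is a name made up for this illustration and is not taken from the patch.

#include <cstdio>

// Sketch: emit "#pragma GCC unroll <num>" from a macro via _Pragma.
// X86_SIMD_SORT_PRAGMA is a hypothetical helper used only for this example.
#if defined(__GNUC__) && __GNUC__ >= 8
#define X86_SIMD_SORT_PRAGMA(x) _Pragma(#x)
#define X86_SIMD_SORT_UNROLL_LOOP(num) X86_SIMD_SORT_PRAGMA(GCC unroll num)
#else
#define X86_SIMD_SORT_UNROLL_LOOP(num) /* no unroll hint on this compiler */
#endif

int main()
{
    int sum = 0;
    // Invoked without a leading #pragma; expands to _Pragma("GCC unroll 8")
    // on GCC >= 8 and to nothing elsewhere.
    X86_SIMD_SORT_UNROLL_LOOP(8)
    for (int ii = 0; ii < 64; ++ii) {
        sum += ii;
    }
    std::printf("sum = %d\n", sum);
    return 0;
}

With this shape the call sites read X86_SIMD_SORT_UNROLL_LOOP(8) directly in front of the loop, and the macro collapses to nothing on MSVC and on GCC releases older than 8, which keeps the hint purely advisory.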