Skip to content

Commit

Permalink
Switch loop unrolling to the new X86_SIMD_SORT_UNROLL_LOOP macro
Browse files Browse the repository at this point in the history
  • Loading branch information
sterrettm2 committed Sep 7, 2023
1 parent 09fce7a commit 70733ff
Showing 1 changed file with 14 additions and 12 deletions.
26 changes: 14 additions & 12 deletions src/xss-network-qsort.hpp
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
#ifndef XSS_NETWORK_QSORT
#define XSS_NETWORK_QSORT

#include "avx512-common-qsort.h"

template <typename vtype,
int64_t numVecs,
typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_INLINE void bitonic_clean_n_vec(reg_t *regs)
{
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int num = numVecs / 2; num >= 2; num /= 2) {
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int j = 0; j < numVecs; j += num) {
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < num / 2; i++) {
COEX<vtype>(regs[i + j], regs[i + j + num / 2]);
}
Expand All @@ -30,7 +32,7 @@ X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs)
}
else if constexpr (numVecs > 2) {
// Reverse upper half
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < numVecs / 2; i++) {
reg_t rev = vtype::reverse(regs[numVecs - i - 1]);
reg_t maxV = vtype::max(regs[i], rev);
Expand All @@ -44,7 +46,7 @@ X86_SIMD_SORT_INLINE void bitonic_merge_n_vec(reg_t *regs)
bitonic_clean_n_vec<vtype, numVecs>(regs);

// Now do bitonic_merge
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < numVecs; i++) {
regs[i] = vtype::bitonic_merge(regs[i]);
}
Expand All @@ -59,7 +61,7 @@ X86_SIMD_SORT_INLINE void bitonic_fullmerge_n_vec(reg_t *regs)
if constexpr (numPer > numVecs)
return;
else {
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < numVecs / numPer; i++) {
bitonic_merge_n_vec<vtype, numPer>(regs + i * numPer);
}
Expand All @@ -79,7 +81,7 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)

// Generate masks for loading and storing
typename vtype::opmask_t ioMasks[numVecs - numVecs / 2];
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
int64_t num_to_read
= std::min((int64_t)std::max(0, N - i * vtype::numlanes),
Expand All @@ -88,19 +90,19 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)
}

// Unmasked part of the load
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < numVecs / 2; i++) {
vecs[i] = vtype::loadu(arr + i * vtype::numlanes);
}
// Masked part of the load
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
vecs[i] = vtype::mask_loadu(
vtype::zmm_max(), ioMasks[j], arr + i * vtype::numlanes);
}

// Sort each loaded vector
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < numVecs; i++) {
vecs[i] = vtype::sort_vec(vecs[i]);
}
Expand All @@ -109,12 +111,12 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int32_t N)
bitonic_fullmerge_n_vec<vtype, numVecs>(&vecs[0]);

// Unmasked part of the store
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = 0; i < numVecs / 2; i++) {
vtype::storeu(arr + i * vtype::numlanes, vecs[i]);
}
// Masked part of the store
#pragma GCC unroll 64
X86_SIMD_SORT_UNROLL_LOOP(64)
for (int i = numVecs / 2, j = 0; i < numVecs; i++, j++) {
vtype::mask_storeu(arr + i * vtype::numlanes, ioMasks[j], vecs[i]);
}
Expand Down

0 comments on commit 70733ff

Please sign in to comment.