Skip to content

Commit

Permalink
Merge pull request #153 from sterrettm2/kvsort-openmp
Browse files Browse the repository at this point in the history
Adds OpenMP based parallelization to key-value sorting
  • Loading branch information
r-devulap authored May 28, 2024
2 parents 2315766 + 5224f60 commit d3acd51
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 10 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,38 @@ jobs:
- name: Run test suite on SPR
run: sde -spr -- ./builddir/testexe

SKX-SKL-openmp:

runs-on: intel-ubuntu-latest

steps:
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

- name: Install dependencies
run: |
sudo apt update
sudo apt -y install g++-10 libgtest-dev meson curl git
- name: Install Intel SDE
run: |
curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz
mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
- name: Build
env:
CXX: g++-10
run: |
make clean
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
cd builddir
ninja
- name: Run test suite on SKX and SKL
run: |
sde -skx -- ./builddir/testexe
sde -skl -- ./builddir/testexe
SPR-gcc13-special-cases:

runs-on: intel-ubuntu-latest
Expand Down
6 changes: 5 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
test:
meson setup -Dbuild_tests=true --warnlevel 2 --werror --buildtype release builddir
meson setup -Dbuild_tests=true -Duse_openmp=false --warnlevel 2 --werror --buildtype release builddir
cd builddir && ninja

test_openmp:
meson setup -Dbuild_tests=true -Duse_openmp=true --warnlevel 2 --werror --buildtype release builddir
cd builddir && ninja

bench:
Expand Down
10 changes: 8 additions & 2 deletions lib/meson.build
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
libtargets = []

# Add compile flags for OpenMP if enabled
openmpflags = []
if get_option('use_openmp')
openmpflags = ['-DXSS_USE_OPENMP=true', '-fopenmp']
endif

if cpp.has_argument('-march=haswell')
libtargets += static_library('libavx',
files(
'x86simdsort-avx2.cpp',
),
include_directories : [src],
cpp_args : ['-march=haswell'],
cpp_args : ['-march=haswell', openmpflags],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand All @@ -17,7 +23,7 @@ if cpp.has_argument('-march=skylake-avx512')
'x86simdsort-skx.cpp',
),
include_directories : [src],
cpp_args : ['-march=skylake-avx512'],
cpp_args : ['-march=skylake-avx512', openmpflags],
gnu_symbol_visibility : 'inlineshidden',
)
endif
Expand Down
1 change: 1 addition & 0 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ subdir('lib')
libsimdsort = shared_library('x86simdsortcpp',
'lib/x86simdsort.cpp',
include_directories : [src, utils, lib],
link_args : [openmpflags],
link_with : [libtargets],
gnu_symbol_visibility : 'inlineshidden',
install : true,
Expand Down
6 changes: 4 additions & 2 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,7 @@ option('build_benchmarks', type : 'boolean', value : false,
description : 'Build benchmarking suite (default: "false").')
option('build_ippbench', type : 'boolean', value : false,
description : 'Add IPP sort to benchmarks (default: "false").')
option('build_vqsortbench', type : 'boolean', value : false,
description : 'Add google vqsort to benchmarks (default: "false").')
option('build_vqsortbench', type : 'boolean', value : true,
description : 'Add google vqsort to benchmarks (default: "true").')
option('use_openmp', type : 'boolean', value : false,
description : 'Use OpenMP to accelerate key-value sort (default: "false").')
2 changes: 1 addition & 1 deletion scripts/branch-compare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ build_branch() {
fi
fi
cd $dir_name
meson setup -Dbuild_benchmarks=true --warnlevel 0 --buildtype release builddir
meson setup -Dbuild_benchmarks=true -Duse_openmp=true --warnlevel 0 --buildtype release builddir
cd builddir
ninja
cd ../../
Expand Down
87 changes: 83 additions & 4 deletions src/xss-common-keyvaluesort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@
#include "xss-common-qsort.h"
#include "xss-network-keyvaluesort.hpp"

/*
 * Enable the OpenMP code paths only when both conditions hold:
 *   - XSS_USE_OPENMP: the build system requested OpenMP support, and
 *   - _OPENMP: the compiler is actually compiling with OpenMP enabled.
 * XSS_COMPILE_OPENMP is the single macro the rest of this header tests to
 * decide whether to emit the task-parallel key-value sort.
 */
#if defined(XSS_USE_OPENMP) && defined(_OPENMP)
#define XSS_COMPILE_OPENMP
#include <omp.h>
#endif

/*
* Partition one ZMM register based on the pivot and return the index of the
* last element that is less than or equal to the pivot.
Expand Down Expand Up @@ -366,7 +371,8 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
type2_t *indexes,
arrsize_t left,
arrsize_t right,
int max_iters)
int max_iters,
arrsize_t task_threshold)
{
/*
* Resort to std::sort if quicksort isn't making any progress
Expand All @@ -391,14 +397,61 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
type1_t biggest = vtype1::type_min();
arrsize_t pivot_index = kvpartition_unrolled<vtype1, vtype2, 4>(
keys, indexes, left, right + 1, pivot, &smallest, &biggest);

#ifdef XSS_COMPILE_OPENMP
if (pivot != smallest) {
bool parallel_left = (pivot_index - left) > task_threshold;
if (parallel_left) {
#pragma omp task
kvsort_<vtype1, vtype2>(keys,
indexes,
left,
pivot_index - 1,
max_iters - 1,
task_threshold);
}
else {
kvsort_<vtype1, vtype2>(keys,
indexes,
left,
pivot_index - 1,
max_iters - 1,
task_threshold);
}
}
if (pivot != biggest) {
bool parallel_right = (right - pivot_index) > task_threshold;

if (parallel_right) {
#pragma omp task
kvsort_<vtype1, vtype2>(keys,
indexes,
pivot_index,
right,
max_iters - 1,
task_threshold);
}
else {
kvsort_<vtype1, vtype2>(keys,
indexes,
pivot_index,
right,
max_iters - 1,
task_threshold);
}
}
#else
UNUSED(task_threshold);

if (pivot != smallest) {
kvsort_<vtype1, vtype2>(
keys, indexes, left, pivot_index - 1, max_iters - 1);
keys, indexes, left, pivot_index - 1, max_iters - 1, 0);
}
if (pivot != biggest) {
kvsort_<vtype1, vtype2>(
keys, indexes, pivot_index, right, max_iters - 1);
keys, indexes, pivot_index, right, max_iters - 1, 0);
}
#endif
}

template <typename vtype1,
Expand Down Expand Up @@ -486,7 +539,33 @@ X86_SIMD_SORT_INLINE void xss_qsort_kv(
UNUSED(hasnan);
}

kvsort_<keytype, valtype>(keys, indexes, 0, arrsize - 1, maxiters);
#ifdef XSS_COMPILE_OPENMP

bool use_parallel = arrsize > 10000;

if (use_parallel) {
// This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
constexpr int thread_limit = 8;
int thread_count = std::min(thread_limit, omp_get_max_threads());
arrsize_t task_threshold
= std::max((arrsize_t)10000, arrsize / 100);

// We use omp parallel and then omp single to setup the threads that will run the omp task calls in kvsort_
// The omp single prevents multiple threads from running the initial kvsort_ simultaneously and causing problems
// Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
#pragma omp parallel num_threads(thread_count)
#pragma omp single
kvsort_<keytype, valtype>(
keys, indexes, 0, arrsize - 1, maxiters, task_threshold);
}
else {
kvsort_<keytype, valtype>(
keys, indexes, 0, arrsize - 1, maxiters, 0);
}
#else
kvsort_<keytype, valtype>(keys, indexes, 0, arrsize - 1, maxiters, 0);
#endif

replace_inf_with_nan(keys, arrsize, nan_count);

if (descending) {
Expand Down

0 comments on commit d3acd51

Please sign in to comment.