From 1f218def3985869b081803b4f9a09a5cf29b404b Mon Sep 17 00:00:00 2001 From: Liam Keegan Date: Fri, 10 Nov 2023 10:28:21 +0100 Subject: [PATCH] Add NEON simd implementation for arm64 cpus (#114) - resolves #110 --- .github/workflows/ci.yml | 6 ++- CMakeLists.txt | 10 +++-- include/hamming/distance_neon.hh | 13 +++++++ include/hamming/hamming_impl.hh | 14 ++++--- pyproject.toml | 2 +- src/CMakeLists.txt | 15 ++++++++ src/distance_neon.cc | 65 ++++++++++++++++++++++++++++++++ src/distance_neon_bench.cc | 26 +++++++++++++ src/distance_neon_t.cc | 62 ++++++++++++++++++++++++++++++ 9 files changed, 202 insertions(+), 11 deletions(-) create mode 100644 include/hamming/distance_neon.hh create mode 100644 src/distance_neon.cc create mode 100644 src/distance_neon_bench.cc create mode 100644 src/distance_neon_t.cc diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 260ea75..478281a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -28,12 +28,16 @@ jobs: include: - os: ubuntu-latest open-mp: "ON" + neon: "OFF" - os: macos-latest open-mp: "OFF" + neon: "OFF" - os: macos-arm64-ssc open-mp: "OFF" + neon: "ON" - os: windows-latest open-mp: "OFF" + neon: "OFF" steps: - uses: actions/checkout@v4 @@ -51,7 +55,7 @@ jobs: - name: configure cmake shell: bash working-directory: ${{runner.workspace}}/build - run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DHAMMING_BUILD_BENCHMARKS=ON -DHAMMING_WITH_SSE2=$HAMMING_WITH_SSE2 -DHAMMING_WITH_AVX2=$HAMMING_WITH_AVX2 -DHAMMING_WITH_AVX512=$HAMMING_WITH_AVX512 -DHAMMING_BUILD_PYTHON=ON -DHAMMING_WITH_OPENMP=${{ matrix.open-mp }} + run: cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DBUILD_TESTING=ON -DHAMMING_BUILD_BENCHMARKS=ON -DHAMMING_WITH_SSE2=$HAMMING_WITH_SSE2 -DHAMMING_WITH_AVX2=$HAMMING_WITH_AVX2 -DHAMMING_WITH_AVX512=$HAMMING_WITH_AVX512 -DHAMMING_WITH_NEON=${{ matrix.neon }} -DHAMMING_BUILD_PYTHON=ON -DHAMMING_WITH_OPENMP=${{ matrix.open-mp }} - name: build shell: bash diff --git a/CMakeLists.txt b/CMakeLists.txt index ca282d9..ec5a5c5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,15 +25,19 @@ set(HAMMING_BUILD_PYTHON set(HAMMING_WITH_SSE2 yes - CACHE BOOL "Enable SSE2 optimized code") + CACHE BOOL "Enable SSE2 optimized code on x86_64 CPUs") set(HAMMING_WITH_AVX2 yes - CACHE BOOL "Enable AVX2 optimized code") + CACHE BOOL "Enable AVX2 optimized code on x86_64 CPUs") set(HAMMING_WITH_AVX512 yes - CACHE BOOL "Enable AVX512 optimized code") + CACHE BOOL "Enable AVX512 optimized code on x86_64 CPUs") + +set(HAMMING_WITH_NEON + no + CACHE BOOL "Enable NEON optimized code on Arm64 CPUs") # Add git submodules add_subdirectory(ext) diff --git a/include/hamming/distance_neon.hh b/include/hamming/distance_neon.hh new file mode 100644 index 0000000..1cdebd6 --- /dev/null +++ b/include/hamming/distance_neon.hh @@ -0,0 +1,13 @@ +#pragma once + +#include +#include + +#include "hamming/hamming_impl_types.hh" + +namespace hamming { + +int distance_neon(const std::vector &a, + const std::vector &b); + +} diff --git a/include/hamming/hamming_impl.hh b/include/hamming/hamming_impl.hh index d7798e7..4e0a97e 100644 --- a/include/hamming/hamming_impl.hh +++ b/include/hamming/hamming_impl.hh @@ -2,9 +2,7 @@ #define _HAMMING_IMPL_HH #include -#if defined(__aarch64__) || defined(_M_ARM64) -#include -#else +#if !(defined(__aarch64__) || defined(_M_ARM64)) #include #endif #include @@ -14,6 +12,9 @@ #ifdef HAMMING_WITH_OPENMP #include #endif +#ifdef HAMMING_WITH_NEON +#include "hamming/distance_neon.hh" +#endif #ifdef HAMMING_WITH_SSE2 #include "hamming/distance_sse2.hh" #endif @@ -101,11 +102,11 @@ std::vector distances(std::vector &data, const std::vector &b) = distance_cpp; #if defined(__aarch64__) || defined(_M_ARM64) - const auto features = cpu_features::GetAarch64Info().features; +#ifdef HAMMING_WITH_NEON + distance_func = distance_neon; +#endif #else const auto features = cpu_features::GetX86Info().features; -#endif - #ifdef HAMMING_WITH_SSE2 if (features.sse2) { distance_func = distance_sse2; @@ -121,6 +122,7 @@ std::vector distances(std::vector &data, distance_func = distance_avx512; } #endif +#endif #ifdef HAMMING_WITH_OPENMP #pragma omp parallel for schedule(static, 1) diff --git a/pyproject.toml b/pyproject.toml index 2827a42..74d77f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,4 +55,4 @@ environment = { CMAKE_ARGS="-DHAMMING_WITH_OPENMP=ON" } [[tool.cibuildwheel.overrides]] select = "*-macosx_arm64*" -environment = { CMAKE_ARGS="-DHAMMING_WITH_SSE2=OFF -DHAMMING_WITH_AVX2=OFF -DHAMMING_WITH_AVX512=OFF" } +environment = { CMAKE_ARGS="-DHAMMING_WITH_SSE2=OFF -DHAMMING_WITH_AVX2=OFF -DHAMMING_WITH_AVX512=OFF -DHAMMING_WITH_NEON=ON" } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c35cf0f..6f2d66c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -34,6 +34,13 @@ if(HAMMING_WITH_AVX512) target_link_libraries(hamming PRIVATE distance_avx512) endif() +if(HAMMING_WITH_NEON) + target_compile_definitions(hamming PUBLIC HAMMING_WITH_NEON) + add_library(distance_neon STATIC distance_neon.cc) + target_include_directories(distance_neon PUBLIC ../include) + target_link_libraries(hamming PRIVATE distance_neon) +endif() + # Build library benchmarks if(HAMMING_BUILD_BENCHMARKS) add_executable(bench bench.cc hamming_bench.cc hamming_impl_bench.cc) @@ -49,6 +56,10 @@ if(HAMMING_BUILD_BENCHMARKS) target_sources(bench PRIVATE distance_avx512_bench.cc) target_link_libraries(bench PRIVATE distance_avx512) endif() + if(HAMMING_WITH_NEON) + target_sources(bench PRIVATE distance_neon_bench.cc) + target_link_libraries(bench PRIVATE distance_neon) + endif() target_link_libraries(bench PRIVATE hamming benchmark::benchmark CpuFeatures::cpu_features) endif() @@ -69,6 +80,10 @@ if(BUILD_TESTING) target_sources(tests PRIVATE distance_avx512_t.cc) target_link_libraries(tests PRIVATE distance_avx512) endif() + if(HAMMING_WITH_NEON) + target_sources(tests PRIVATE distance_neon_t.cc) + target_link_libraries(tests PRIVATE distance_neon) + endif() target_link_libraries(tests PRIVATE hamming Catch2::Catch2 CpuFeatures::cpu_features) catch_discover_tests(tests) diff --git a/src/distance_neon.cc b/src/distance_neon.cc new file mode 100644 index 0000000..8e8783e --- /dev/null +++ b/src/distance_neon.cc @@ -0,0 +1,65 @@ +#include "hamming/distance_neon.hh" +#include + +namespace hamming { + +int distance_neon(const std::vector &a, + const std::vector &b) { + // distance implementation using NEON simd intrinsics + // a 128-bit register holds 16 GeneBlocks, i.e. 32 genes + constexpr std::size_t n_geneblocks{16}; + int r{0}; + // mask to select LSB of each gene + const uint8x16_t lsb = vdupq_n_u8(1); + // mask to select lower gene from each GeneBlock + const uint8x16_t mask0 = vdupq_n_u8(mask_gene0); + // mask to select upper gene from each GeneBlock + const uint8x16_t mask1 = vdupq_n_u8(mask_gene1); + // vector of partial distance counts + uint8x16_t r_s; + // work registers + uint8x16_t r_a; + uint8x16_t r_b; + // each iteration processes 16 GeneBlocks + std::size_t n_iter{a.size() / n_geneblocks}; + // each partial distance count is stored in a uint8, so max value = 255, + // and the value can be increased by at most 2 with each iteration, + // so we do 127 inner iterations for a max value of 254 to avoid overflow + std::size_t n_inner{127}; + std::size_t n_outer{1 + n_iter / n_inner}; + for (std::size_t j = 0; j < n_outer; ++j) { + std::size_t n{std::min((j + 1) * n_inner, n_iter)}; + r_s = vdupq_n_u8(0); + for (std::size_t i = j * n_inner; i < n; ++i) { + // load a[i], b[i] into registers + r_a = vld1q_u8(a.data() + n_geneblocks * i); + r_b = vld1q_u8(b.data() + n_geneblocks * i); + // a[i] & b[i] + r_a = vandq_u8(r_a, r_b); + // mask lower genes + r_b = vandq_u8(r_a, mask0); + // mask upper genes + r_a = vandq_u8(r_a, mask1); + // compare genes with zero to get either 00000000 or 11111111 + r_a = vceqzq_u8(r_a); + r_b = vceqzq_u8(r_b); + // only keep LSB for each uint8 to get either 0 or 1 + r_a = vandq_u8(r_a, lsb); + r_b = vandq_u8(r_b, lsb); + // add these values to distance counts + r_s = vaddq_u8(r_s, r_a); + r_s = vaddq_u8(r_s, r_b); + } + // sum the 16 distances in r_s & add to r + r += vaddlvq_u8(r_s); + } + // do last partial block without simd intrinsics + for (std::size_t i = n_geneblocks * n_iter; i < a.size(); ++i) { + auto c{static_cast(a[i] & b[i])}; + r += static_cast((c & mask_gene0) == 0); + r += static_cast((c & mask_gene1) == 0); + } + return r; +} + +} // namespace hamming diff --git a/src/distance_neon_bench.cc b/src/distance_neon_bench.cc new file mode 100644 index 0000000..8bacd0c --- /dev/null +++ b/src/distance_neon_bench.cc @@ -0,0 +1,26 @@ +#include "bench.hh" +#include "hamming/distance_neon.hh" +#include "hamming/hamming.hh" +#include "hamming/hamming_impl.hh" +#ifdef HAMMING_WITH_OPENMP +#include +#endif + +using namespace hamming; + +static void bench_distance_neon(benchmark::State &state) { +#ifdef HAMMING_WITH_OPENMP + omp_set_num_threads(1); +#endif + std::mt19937 gen(12345); + int64_t n{state.range(0)}; + auto s1{from_string(make_string(n, gen))}; + auto s2{from_string(make_string(n, gen))}; + int d{0}; + for (auto _ : state) { + d += distance_neon(s1, s2); + } + state.SetComplexityN(n); +} + +BENCHMARK(bench_distance_neon)->Range(4096, 4194304)->Complexity(); diff --git a/src/distance_neon_t.cc b/src/distance_neon_t.cc new file mode 100644 index 0000000..0f3a7a8 --- /dev/null +++ b/src/distance_neon_t.cc @@ -0,0 +1,62 @@ +#include "hamming/distance_neon.hh" +#include "tests.hh" + +using namespace hamming; + +TEST_CASE("distance_neon() returns all return zero for identical vectors", + "[impl][distance][neon]") { + std::mt19937 gen(12345); + for (int n : + {1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 31, 32, 33, 63, + 64, 65, 127, 128, 129, 254, 255, 256, + 256, 511, 512, 513, 1023, 1024, 1025, 2047, + 2048, 2049, 4095, 4096, 4097, 8191, 8192, 8193, + 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, + 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, + 1048576, 1048577}) { + CAPTURE(n); + auto g1{make_gene_vector(n, gen)}; + REQUIRE(distance_neon(g1, g1) == 0); + } +} + +TEST_CASE("distance_neon() all return n for n A's and n G's", + "[impl][distance][neon]") { + for (int n : + {1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 31, 32, 33, 63, + 64, 65, 127, 128, 129, 254, 255, 256, + 256, 511, 512, 513, 1023, 1024, 1025, 2047, + 2048, 2049, 4095, 4096, 4097, 8191, 8192, 8193, + 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, + 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, + 1048576, 1048577}) { + CAPTURE(n); + auto g1 = from_string(std::string(n, 'A')); + auto g2 = from_string(std::string(n, 'G')); + REQUIRE(distance_neon(g1, g2) == n); + } +} + +TEST_CASE("distance_neon() returns same as distance_cpp() for random vectors", + "[impl][distance][neon]") { + std::mt19937 gen(12345); + for (int n : + {1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 31, 32, 33, 63, + 64, 65, 127, 128, 129, 254, 255, 256, + 256, 511, 512, 513, 1023, 1024, 1025, 2047, + 2048, 2049, 4095, 4096, 4097, 8191, 8192, 8193, + 32767, 32768, 32769, 65535, 65536, 65537, 131071, 131072, + 131073, 262143, 262144, 262145, 524287, 524288, 524289, 1048575, + 1048576, 1048577}) { + CAPTURE(n); + auto g1{make_gene_vector(n, gen)}; + auto g2{make_gene_vector(n, gen)}; + REQUIRE(distance_neon(g1, g2) == distance_cpp(g1, g2)); + } +}