Skip to content

Commit

Permalink
Update to latest libprimesieve
Browse files Browse the repository at this point in the history
  • Loading branch information
kimwalisch committed Nov 11, 2024
1 parent 23be137 commit 1972182
Show file tree
Hide file tree
Showing 10 changed files with 555 additions and 99 deletions.
7 changes: 6 additions & 1 deletion lib/primesieve/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,14 @@ set(LIB_SRC src/api-c.cpp

# Optional multiarch support. Each included module runs a compile probe;
# the conditions below then test a result variable named after the module.
# NOTE(review): multiarch_x86_popcnt.cmake and multiarch_avx512_vbmi2.cmake
# are not visible here; they are assumed to set multiarch_x86_popcnt and
# multiarch_avx512_vbmi2 respectively -- confirm.
if(WITH_MULTIARCH)
include("${PROJECT_SOURCE_DIR}/cmake/multiarch_x86_popcnt.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_bw.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/multiarch_avx512_vbmi2.cmake")

# NOTE(review): the next line appears to be the pre-change form of the
# cpuid.cpp condition (it lacks multiarch_avx512_bw and has no matching
# endif() of its own in this view) -- confirm against the actual file.
if(multiarch_x86_popcnt OR multiarch_avx512_vbmi2)
# The ARM SVE probe is only run when the AVX512 BW probe did not succeed.
if(NOT multiarch_avx512_bw)
include("${PROJECT_SOURCE_DIR}/cmake/multiarch_sve_arm.cmake")
endif()

# Add src/x86/cpuid.cpp to the library sources as soon as any of the
# three x86 probes succeeded.
if(multiarch_x86_popcnt OR multiarch_avx512_bw OR multiarch_avx512_vbmi2)
set(LIB_SRC ${LIB_SRC} src/x86/cpuid.cpp)
endif()
endif()
Expand Down
3 changes: 2 additions & 1 deletion lib/primesieve/ChangeLog
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
Changes in version 12.6, 10/11/2024
Changes in version 12.6, 11/11/2024
===================================

* CpuInfo.cpp: Correctly detect Intel Arrow Lake CPU cache
topology on Windows and Linux.
* PreSieve.cpp: Increased pre-sieving from primes <= 100 to
primes <= 163. Memory usage of pre-sieve lookup tables has been
reduced from 210 kilobytes to 123 kilobytes.
* PreSieve.cpp: Added AVX512 and ARM SVE multiarch support.

Changes in version 12.5, 22/10/2024
===================================
Expand Down
58 changes: 29 additions & 29 deletions lib/primesieve/cmake/auto_vectorization.cmake
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
# The AND_PreSieveTables() function in PreSieve.cpp is important for
# performance and therefore it is important that this function is
# auto-vectorized by the compiler. For GCC & Clang we can enable
# auto vectorization using -ftree-vectorize.

# GCC/Clang enable auto-vectorization with -O2 and -O3, but for -O2
# GCC uses the "very-cheap" cost model which prevents our AND_PreSieveTables()
# function from getting auto vectorized. But compiling with e.g.
# "-O2 -ftree-vectorize -fvect-cost-model=dynamic" fixes this issue.

# Result: every flag accepted by the compiler is appended to
# PRIMESIEVE_COMPILE_OPTIONS.

include(CheckCXXCompilerFlag)
# cmake_push_check_state() and cmake_pop_check_state() are defined by this
# module. Include it here so this file works on its own instead of relying
# on whoever includes it having loaded the module already.
include(CMakePushCheckState)

# Probe -ftree-vectorize. The check state is pushed/popped so that the
# temporary CMAKE_REQUIRED_FLAGS value does not leak into later checks.
# NOTE(review): -Werror is presumably used so that compilers which only
# warn about an unknown option fail the probe -- confirm.
cmake_push_check_state()
set(CMAKE_REQUIRED_FLAGS -Werror)
check_cxx_compiler_flag(-ftree-vectorize ftree_vectorize)
cmake_pop_check_state()

if(ftree_vectorize)
  list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-ftree-vectorize")

  # The cost model flag is only probed when -ftree-vectorize is accepted.
  cmake_push_check_state()
  set(CMAKE_REQUIRED_FLAGS -Werror)
  check_cxx_compiler_flag(-fvect-cost-model=dynamic fvect_cost_model)
  cmake_pop_check_state()

  if(fvect_cost_model)
    list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-fvect-cost-model=dynamic")
  endif()
endif()
# The AND_PreSieveTables() function in PreSieve.cpp is important for
# performance and therefore it is important that this function is
# auto-vectorized by the compiler. For GCC & Clang we can enable
# auto vectorization using -ftree-vectorize.

# GCC/Clang enable auto-vectorization with -O2 and -O3, but for -O2
# GCC uses the "very-cheap" cost model which prevents our AND_PreSieveTables()
# function from getting auto vectorized. But compiling with e.g.
# "-O2 -ftree-vectorize -fvect-cost-model=dynamic" fixes this issue.

# Result: every flag accepted by the compiler is appended to
# PRIMESIEVE_COMPILE_OPTIONS.

include(CheckCXXCompilerFlag)
# cmake_push_check_state() and cmake_pop_check_state() are defined by this
# module. Include it here so this file works on its own instead of relying
# on whoever includes it having loaded the module already.
include(CMakePushCheckState)

# Probe -ftree-vectorize. The check state is pushed/popped so that the
# temporary CMAKE_REQUIRED_FLAGS value does not leak into later checks.
# NOTE(review): -Werror is presumably used so that compilers which only
# warn about an unknown option fail the probe -- confirm.
cmake_push_check_state()
set(CMAKE_REQUIRED_FLAGS -Werror)
check_cxx_compiler_flag(-ftree-vectorize ftree_vectorize)
cmake_pop_check_state()

if(ftree_vectorize)
  list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-ftree-vectorize")

  # The cost model flag is only probed when -ftree-vectorize is accepted.
  cmake_push_check_state()
  set(CMAKE_REQUIRED_FLAGS -Werror)
  check_cxx_compiler_flag(-fvect-cost-model=dynamic fvect_cost_model)
  cmake_pop_check_state()

  if(fvect_cost_model)
    list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-fvect-cost-model=dynamic")
  endif()
endif()
80 changes: 80 additions & 0 deletions lib/primesieve/cmake/multiarch_avx512_bw.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# We use GCC/Clang's function multi-versioning for AVX512
# support. This code will automatically dispatch to the
# AVX512 BW algorithm if the CPU supports it and use the
# default (portable) algorithm otherwise.
#
# This module compiles the test program below. The outcome is stored
# in multiarch_avx512_bw and, on success, ENABLE_MULTIARCH_AVX512_BW
# is appended to PRIMESIEVE_COMPILE_DEFINITIONS.

include(CheckCXXSourceCompiles)
include(CMakePushCheckState)

cmake_push_check_state()
# Allows the test program to #include <src/x86/cpuid.cpp> relative to
# the project root. Restored by cmake_pop_check_state() at the end.
set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}")

check_cxx_source_compiles("
    // GCC/Clang function multiversioning for AVX512 is not needed if
    // the user compiles with -mavx512f -mavx512bw.
    // GCC/Clang function multiversioning generally causes a minor
    // overhead, hence we disable it if it is not needed.
    #if defined(__AVX512F__) && \
        defined(__AVX512BW__)
      Error: AVX512BW multiarch not needed!
    #endif

    #include <src/x86/cpuid.cpp>
    #include <immintrin.h>
    #include <stdint.h>
    #include <cstddef>

    __attribute__ ((target (\"avx512f,avx512bw\")))
    void AND_PreSieveTables_avx512(const uint8_t* __restrict preSieve0,
                                   const uint8_t* __restrict preSieve1,
                                   uint8_t* __restrict sieve,
                                   std::size_t bytes)
    {
        std::size_t i = 0;

        // Full 64-byte vectors
        for (; i + 64 <= bytes; i += sizeof(__m512i))
        {
            _mm512_storeu_epi8((__m512i*) &sieve[i],
                _mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieve0[i]),
                                 _mm512_loadu_epi8((const __m512i*) &preSieve1[i])));
        }

        // Remaining (bytes - i) < 64 bytes are processed using a mask
        if (i < bytes)
        {
            __mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes);

            _mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask,
                _mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve0[i]),
                                 _mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve1[i])));
        }
    }

    void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0,
                                    const uint8_t* __restrict preSieved1,
                                    uint8_t* __restrict sieve,
                                    std::size_t bytes)
    {
        for (std::size_t i = 0; i < bytes; i++)
            sieve[i] = preSieved0[i] & preSieved1[i];
    }

    int main()
    {
        uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };

        // PreSieveTable2 is passed at offset 1, hence only 9 of its
        // 10 bytes may be read without running past the array end.
        if (primesieve::has_cpuid_avx512_bw())
            AND_PreSieveTables_avx512(&PreSieveTable1[0], &PreSieveTable2[1], &sieve[0], 9);
        else
            AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable2[1], &sieve[0], 9);

        return (sieve[0] == 0) ? 0 : 1;
    }
" multiarch_avx512_bw)

if(multiarch_avx512_bw)
  list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_BW")
endif()

cmake_pop_check_state()
78 changes: 78 additions & 0 deletions lib/primesieve/cmake/multiarch_sve_arm.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# We use GCC/Clang's function multi-versioning for ARM SVE
# support. This code will automatically dispatch to the
# ARM SVE algorithm if the CPU supports it and use the default
# (portable) algorithm otherwise.
#
# This module compiles the test program below. The outcome is stored
# in multiarch_sve_arm and, on success, ENABLE_MULTIARCH_ARM_SVE is
# appended to PRIMESIEVE_COMPILE_DEFINITIONS.

include(CheckCXXSourceCompiles)
include(CMakePushCheckState)

cmake_push_check_state()
# Allows the test program to #include <primesieve/cpu_supports_arm_sve.hpp>.
# Restored by cmake_pop_check_state() at the end.
set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include")

check_cxx_source_compiles("
    // GCC/Clang function multiversioning for ARM SVE is not needed
    // if the user compiles with -march=armv8-a+sve. GCC/Clang
    // function multiversioning generally causes a minor overhead,
    // hence we disable it if it is not needed.
    #if defined(__ARM_FEATURE_SVE) && \
        __has_include(<arm_sve.h>)
      Error: ARM SVE multiarch not needed!
    #endif

    #include <primesieve/cpu_supports_arm_sve.hpp>
    #include <arm_sve.h>
    #include <stdint.h>
    #include <cstddef>

    __attribute__ ((target (\"arch=armv8-a+sve\")))
    void AND_PreSieveTables_arm_sve(const uint8_t* __restrict preSieved0,
                                    const uint8_t* __restrict preSieved1,
                                    const uint8_t* __restrict preSieved2,
                                    const uint8_t* __restrict preSieved3,
                                    uint8_t* __restrict sieve,
                                    std::size_t bytes)
    {
        for (std::size_t i = 0; i < bytes; i += svcntb())
        {
            // Byte predicate covering the lanes in [i, bytes)
            svbool_t pg = svwhilelt_b8(i, bytes);

            // The outer AND uses the byte predicate pg as well. A 64-bit
            // element predicate would leave 7 out of 8 byte lanes inactive
            // and the _x form gives inactive lanes unspecified values.
            svst1_u8(pg, &sieve[i],
                svand_u8_x(pg,
                    svand_u8_z(pg, svld1_u8(pg, &preSieved0[i]), svld1_u8(pg, &preSieved1[i])),
                    svand_u8_z(pg, svld1_u8(pg, &preSieved2[i]), svld1_u8(pg, &preSieved3[i]))));
        }
    }

    void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0,
                                    const uint8_t* __restrict preSieved1,
                                    const uint8_t* __restrict preSieved2,
                                    const uint8_t* __restrict preSieved3,
                                    uint8_t* __restrict sieve,
                                    std::size_t bytes)
    {
        for (std::size_t i = 0; i < bytes; i++)
            sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i];
    }

    int main()
    {
        uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        uint8_t PreSieveTable3[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        uint8_t PreSieveTable4[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };

        // Tables 2-4 are passed at offset 1, hence only 9 of their
        // 10 bytes may be read without running past the array end.
        if (cpu_supports_sve)
            AND_PreSieveTables_arm_sve(&PreSieveTable1[0], &PreSieveTable2[1], &PreSieveTable3[1], &PreSieveTable4[1], &sieve[0], 9);
        else
            AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable2[1], &PreSieveTable3[1], &PreSieveTable4[1], &sieve[0], 9);

        return (sieve[0] == 0) ? 0 : 1;
    }
" multiarch_sve_arm)

if(multiarch_sve_arm)
  list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_ARM_SVE")
endif()

cmake_pop_check_state()
27 changes: 27 additions & 0 deletions lib/primesieve/include/primesieve/cpu_supports_arm_sve.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
///
/// @file cpu_supports_arm_sve.hpp
/// @brief Check if the CPU supports the ARM SVE instruction set.
///
/// Copyright (C) 2024 Kim Walisch, <[email protected]>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#ifndef CPU_SUPPORTS_ARM_SVE_HPP
#define CPU_SUPPORTS_ARM_SVE_HPP

// NOTE(review): macros.hpp presumably provides a fallback definition of
// __has_builtin for compilers that lack it -- confirm.
#include "macros.hpp"

// cpu_supports_sve is only declared if the compiler offers
// __builtin_cpu_supports(); otherwise this header declares nothing.
#if __has_builtin(__builtin_cpu_supports)

namespace {

/// Initialized at startup.
/// Holds the result of __builtin_cpu_supports("sve"). Since it lives
/// in an unnamed namespace, each translation unit that includes this
/// header gets its own copy.
const bool cpu_supports_sve = __builtin_cpu_supports("sve");

} // namespace

#endif // __builtin_cpu_supports

#endif // CPU_SUPPORTS_ARM_SVE_HPP
27 changes: 27 additions & 0 deletions lib/primesieve/include/primesieve/cpu_supports_avx512_bw.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
///
/// @file cpu_supports_avx512_bw.hpp
/// @brief Detect if the x86 CPU supports AVX512 BW.
///
/// Copyright (C) 2024 Kim Walisch, <[email protected]>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#ifndef CPU_SUPPORTS_AVX512_BW_HPP
#define CPU_SUPPORTS_AVX512_BW_HPP

namespace primesieve {

/// Only declared here, the definition is not part of this header.
/// NOTE(review): presumably defined in src/x86/cpuid.cpp -- confirm.
bool has_cpuid_avx512_bw();

} // namespace primesieve

namespace {

/// Initialized at startup.
/// Holds the result of primesieve::has_cpuid_avx512_bw(). Since it
/// lives in an unnamed namespace, each translation unit that includes
/// this header gets its own copy and calls the function once.
const bool cpu_supports_avx512_bw = primesieve::has_cpuid_avx512_bw();

} // namespace

#endif // CPU_SUPPORTS_AVX512_BW_HPP
Loading

0 comments on commit 1972182

Please sign in to comment.