-
-
Notifications
You must be signed in to change notification settings - Fork 41
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
23be137
commit 1972182
Showing
10 changed files
with
555 additions
and
99 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,29 @@ | ||
# The AND_PreSieveTables() function in PreSieve.cpp is important for | ||
# performance and therefore it is important that this function is | ||
# auto-vectorized by the compiler. For GCC & Clang we can enable | ||
# auto vectorization using -ftree-vectorize. | ||
|
||
# GCC/Clang enable auto-vectorization with -O2 and -O3, but for -O2 | ||
# GCC uses the "very-cheap" cost model which prevents our AND_PreSieveTables() | ||
# function from getting auto vectorized. But compiling with e.g. | ||
# "-O2 -ftree-vectorize -fvect-cost-model=dynamic" fixes this issue. | ||
|
||
include(CheckCXXCompilerFlag) | ||
|
||
cmake_push_check_state() | ||
set(CMAKE_REQUIRED_FLAGS -Werror) | ||
check_cxx_compiler_flag(-ftree-vectorize ftree_vectorize) | ||
cmake_pop_check_state() | ||
|
||
if(ftree_vectorize) | ||
list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-ftree-vectorize") | ||
|
||
cmake_push_check_state() | ||
set(CMAKE_REQUIRED_FLAGS -Werror) | ||
check_cxx_compiler_flag(-fvect-cost-model=dynamic fvect_cost_model) | ||
cmake_pop_check_state() | ||
|
||
if(fvect_cost_model) | ||
list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-fvect-cost-model=dynamic") | ||
endif() | ||
endif() | ||
# The AND_PreSieveTables() function in PreSieve.cpp is important for | ||
# performance and therefore it is important that this function is | ||
# auto-vectorized by the compiler. For GCC & Clang we can enable | ||
# auto vectorization using -ftree-vectorize. | ||
|
||
# GCC/Clang enable auto-vectorization with -O2 and -O3, but for -O2 | ||
# GCC uses the "very-cheap" cost model which prevents our AND_PreSieveTables() | ||
# function from getting auto vectorized. But compiling with e.g. | ||
# "-O2 -ftree-vectorize -fvect-cost-model=dynamic" fixes this issue. | ||
|
||
include(CheckCXXCompilerFlag) | ||
|
||
cmake_push_check_state() | ||
set(CMAKE_REQUIRED_FLAGS -Werror) | ||
check_cxx_compiler_flag(-ftree-vectorize ftree_vectorize) | ||
cmake_pop_check_state() | ||
|
||
if(ftree_vectorize) | ||
list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-ftree-vectorize") | ||
|
||
cmake_push_check_state() | ||
set(CMAKE_REQUIRED_FLAGS -Werror) | ||
check_cxx_compiler_flag(-fvect-cost-model=dynamic fvect_cost_model) | ||
cmake_pop_check_state() | ||
|
||
if(fvect_cost_model) | ||
list(APPEND PRIMESIEVE_COMPILE_OPTIONS "-fvect-cost-model=dynamic") | ||
endif() | ||
endif() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# We use GCC/Clang's function multi-versioning for AVX512 | ||
# support. This code will automatically dispatch to the | ||
# AVX512 BW algorithm if the CPU supports it and use the | ||
# default (portable) algorithm otherwise. | ||
|
||
include(CheckCXXSourceCompiles) | ||
include(CMakePushCheckState) | ||
|
||
cmake_push_check_state() | ||
set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}") | ||
|
||
check_cxx_source_compiles(" | ||
// GCC/Clang function multiversioning for AVX512 is not needed if | ||
// the user compiles with -mavx512f -mavx512bw. | ||
// GCC/Clang function multiversioning generally causes a minor | ||
// overhead, hence we disable it if it is not needed. | ||
#if defined(__AVX512F__) && \ | ||
defined(__AVX512BW__) | ||
Error: AVX512BW multiarch not needed! | ||
#endif | ||
#include <src/x86/cpuid.cpp> | ||
#include <immintrin.h> | ||
#include <stdint.h> | ||
#include <cstddef> | ||
__attribute__ ((target (\"avx512f,avx512bw\"))) | ||
void AND_PreSieveTables_avx512(const uint8_t* __restrict preSieve0, | ||
const uint8_t* __restrict preSieve1, | ||
uint8_t* __restrict sieve, | ||
std::size_t bytes) | ||
{ | ||
std::size_t i = 0; | ||
for (; i + 64 <= bytes; i += sizeof(__m512i)) | ||
{ | ||
_mm512_storeu_epi8((__m512i*) &sieve[i], | ||
_mm512_and_si512(_mm512_loadu_epi8((const __m512i*) &preSieve0[i]), | ||
_mm512_loadu_epi8((const __m512i*) &preSieve1[i]))); | ||
} | ||
if (i < bytes) | ||
{ | ||
__mmask64 mask = 0xffffffffffffffffull >> (i + 64 - bytes); | ||
_mm512_mask_storeu_epi8((__m512i*) &sieve[i], mask, | ||
_mm512_and_si512(_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve0[i]), | ||
_mm512_maskz_loadu_epi8(mask, (const __m512i*) &preSieve1[i]))); | ||
} | ||
} | ||
void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, | ||
const uint8_t* __restrict preSieved1, | ||
uint8_t* __restrict sieve, | ||
std::size_t bytes) | ||
{ | ||
for (std::size_t i = 0; i < bytes; i++) | ||
sieve[i] = preSieved0[i] & preSieved1[i]; | ||
} | ||
int main() | ||
{ | ||
uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
if (primesieve::has_cpuid_avx512_bw()) | ||
AND_PreSieveTables_avx512(&PreSieveTable1[0], &PreSieveTable2[1], &sieve[0], 10); | ||
else | ||
AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable2[1], &sieve[0], 10); | ||
return (sieve[0] == 0) ? 0 : 1; | ||
} | ||
" multiarch_avx512_bw) | ||
|
||
if(multiarch_avx512_bw) | ||
list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_AVX512_BW") | ||
endif() | ||
|
||
cmake_pop_check_state() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# We use GCC/Clang's function multi-versioning for ARM SVE | ||
# support. This code will automatically dispatch to the | ||
# ARM SVE algorithm if the CPU supports it and use the default | ||
# (portable) algorithm otherwise. | ||
|
||
include(CheckCXXSourceCompiles) | ||
include(CMakePushCheckState) | ||
|
||
cmake_push_check_state() | ||
set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include") | ||
|
||
check_cxx_source_compiles(" | ||
// GCC/Clang function multiversioning for ARM SVE is not needed | ||
// if the user compiles with -march=armv8-a+sve. GCC/Clang | ||
// function multiversioning generally causes a minor overhead, | ||
// hence we disable it if it is not needed. | ||
#if defined(__ARM_FEATURE_SVE) && \ | ||
__has_include(<arm_sve.h>) | ||
Error: ARM SVE multiarch not needed! | ||
#endif | ||
#include <primesieve/cpu_supports_arm_sve.hpp> | ||
#include <arm_sve.h> | ||
#include <stdint.h> | ||
#include <cstddef> | ||
__attribute__ ((target (\"arch=armv8-a+sve\"))) | ||
void AND_PreSieveTables_arm_sve(const uint8_t* __restrict preSieved0, | ||
const uint8_t* __restrict preSieved1, | ||
const uint8_t* __restrict preSieved2, | ||
const uint8_t* __restrict preSieved3, | ||
uint8_t* __restrict sieve, | ||
std::size_t bytes) | ||
{ | ||
for (std::size_t i = 0; i < bytes; i += svcntb()) | ||
{ | ||
svbool_t pg = svwhilelt_b8(i, bytes); | ||
svst1_u8(pg, &sieve[i], | ||
svand_u8_x(svptrue_b64(), | ||
svand_u8_z(pg, svld1_u8(pg, &preSieved0[i]), svld1_u8(pg, &preSieved1[i])), | ||
svand_u8_z(pg, svld1_u8(pg, &preSieved2[i]), svld1_u8(pg, &preSieved3[i])))); | ||
} | ||
} | ||
void AND_PreSieveTables_default(const uint8_t* __restrict preSieved0, | ||
const uint8_t* __restrict preSieved1, | ||
const uint8_t* __restrict preSieved2, | ||
const uint8_t* __restrict preSieved3, | ||
uint8_t* __restrict sieve, | ||
std::size_t bytes) | ||
{ | ||
for (std::size_t i = 0; i < bytes; i++) | ||
sieve[i] = preSieved0[i] & preSieved1[i] & preSieved2[i] & preSieved3[i]; | ||
} | ||
int main() | ||
{ | ||
uint8_t sieve[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable1[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable2[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable3[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
uint8_t PreSieveTable4[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }; | ||
if (cpu_supports_sve) | ||
AND_PreSieveTables_arm_sve(&PreSieveTable1[0], &PreSieveTable2[1], &PreSieveTable3[1], &PreSieveTable4[1], &sieve[0], 10); | ||
else | ||
AND_PreSieveTables_default(&PreSieveTable1[0], &PreSieveTable2[1], &PreSieveTable3[1], &PreSieveTable4[1], &sieve[0], 10); | ||
return (sieve[0] == 0) ? 0 : 1; | ||
} | ||
" multiarch_sve_arm) | ||
|
||
if(multiarch_sve_arm) | ||
list(APPEND PRIMESIEVE_COMPILE_DEFINITIONS "ENABLE_MULTIARCH_ARM_SVE") | ||
endif() | ||
|
||
cmake_pop_check_state() |
27 changes: 27 additions & 0 deletions
27
lib/primesieve/include/primesieve/cpu_supports_arm_sve.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/// | ||
/// @file cpu_supports_arm_sve.hpp | ||
/// Check if the CPU supports the ARM SVE instruction set. | ||
/// | ||
/// Copyright (C) 2024 Kim Walisch, <[email protected]> | ||
/// | ||
/// This file is distributed under the BSD License. See the COPYING | ||
/// file in the top level directory. | ||
/// | ||
|
||
#ifndef CPU_SUPPORTS_ARM_SVE_HPP | ||
#define CPU_SUPPORTS_ARM_SVE_HPP | ||
|
||
#include "macros.hpp" | ||
|
||
#if __has_builtin(__builtin_cpu_supports) | ||
|
||
namespace { | ||
|
||
/// Initialized at startup | ||
const bool cpu_supports_sve = __builtin_cpu_supports("sve"); | ||
|
||
} // namespace | ||
|
||
#endif // __builtin_cpu_supports | ||
|
||
#endif |
27 changes: 27 additions & 0 deletions
27
lib/primesieve/include/primesieve/cpu_supports_avx512_bw.hpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/// | ||
/// @file cpu_supports_avx512_bw.hpp | ||
/// @brief Detect if the x86 CPU supports AVX512 BW. | ||
/// | ||
/// Copyright (C) 2024 Kim Walisch, <[email protected]> | ||
/// | ||
/// This file is distributed under the BSD License. See the COPYING | ||
/// file in the top level directory. | ||
/// | ||
|
||
#ifndef CPU_SUPPORTS_AVX512_BW_HPP | ||
#define CPU_SUPPORTS_AVX512_BW_HPP | ||
|
||
namespace primesieve { | ||
|
||
bool has_cpuid_avx512_bw(); | ||
|
||
} // namespace | ||
|
||
namespace { | ||
|
||
/// Initialized at startup | ||
const bool cpu_supports_avx512_bw = primesieve::has_cpuid_avx512_bw(); | ||
|
||
} // namespace | ||
|
||
#endif |
Oops, something went wrong.