Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Arctan avx512 #759

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Added AVX512 kernels to arctan and atan2
Signed-off-by: Magnus Lundmark <magnuslundmark@gmail.com>
Ka-zam committed Feb 25, 2024
commit 2ce41403de53f7966e2b602e622ccd25947b1c42
90 changes: 46 additions & 44 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
#
# Copyright 2011-2020 Free Software Foundation, Inc.
# Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
#Copyright 2011 - 2020 Free Software Foundation, Inc.
#Copyright 2023 Magnus Lundmark < magnuslundmark @gmail.com>
#
# This file is part of VOLK
#This file is part of VOLK
#
# SPDX-License-Identifier: LGPL-3.0-or-later
#SPDX - License - Identifier : LGPL - 3.0 - or -later
#

########################################################################
# Project setup
#Project setup
########################################################################
cmake_minimum_required(VERSION 3.8)
set(CMAKE_BUILD_TYPE
@@ -25,10 +25,10 @@ set(CMAKE_CXX_STANDARD 17)
enable_testing()

########################################################################
# Common compile flags
#Common compile flags
########################################################################

# Disable complex math NaN/INFO range checking for performance
#Disable complex math NaN / INFO range checking for performance
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag(-fcx-limited-range HAVE_CX_LIMITED_RANGE)
if(HAVE_CX_LIMITED_RANGE)
@@ -46,15 +46,15 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall")
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)

if(CMAKE_C_COMPILER_ID MATCHES "Clang|GNU")
# Abort compilation if kernel implementations have inconsistent function
# prototypes, i.e. if
#
# kernel_foo_sse(uint32_t *dst, lv32fc_t *src)
# kernel_foo_avx(uint16_t *dst, lv32fc_t *src)
#
# are defined. Note the different data type of the first argument). By
# default 'incompatible-pointer-types' is a warning only and 'pointer-sign'
# is a warning enabled by '-Wall'. These warnings are only applicable to C.
#Abort compilation if kernel implementations have inconsistent function
#prototypes, i.e.if
#
#kernel_foo_sse(uint32_t* dst, lv32fc_t* src)
#kernel_foo_avx(uint16_t* dst, lv32fc_t* src)
#
#are defined.Note the different data type of the first argument).By
#default 'incompatible-pointer-types' is a warning only and 'pointer-sign'
#is a warning enabled by '-Wall'.These warnings are only applicable to C.
set(CMAKE_C_FLAGS
"${CMAKE_C_FLAGS} -Werror=incompatible-pointer-types -Werror=pointer-sign")
endif()
@@ -77,7 +77,7 @@ set(CMAKE_BUILD_TYPE
message(STATUS "Build type set to ${CMAKE_BUILD_TYPE}.")

########################################################################
# Version setup
#Version setup
########################################################################

set(VERSION_INFO_MAJOR_VERSION 3)
@@ -87,13 +87,14 @@ include(VolkVersion) #setup version info

math(EXPR VOLK_VERSION_DECIMAL "${VERSION_INFO_MAJOR_VERSION} * 10000
+ ${VERSION_INFO_MINOR_VERSION} * 100
+ ${VERSION_INFO_MAINT_VERSION}")
+ ${
VERSION_INFO_MAINT_VERSION}")

configure_file(${CMAKE_SOURCE_DIR}/include/volk/volk_version.h.in
${CMAKE_BINARY_DIR}/include/volk/volk_version.h @ONLY)

########################################################################
# Environment setup
#Environment setup
########################################################################
if(NOT DEFINED CROSSCOMPILE_MULTILIB)
set(CROSSCOMPILE_MULTILIB "")
@@ -116,10 +117,10 @@ if(MSVC)
endif(MSVC)

########################################################################
# Dependencies setup
#Dependencies setup
########################################################################

# cpu_features - sensible defaults, user settable option
#cpu_features - sensible defaults, user settable option
if(CMAKE_SYSTEM_PROCESSOR MATCHES
"(^mips)|(^arm)|(^aarch64)|(x86_64)|(AMD64|amd64)|(^i.86$)|(^powerpc)|(^ppc)|(^riscv)")
option(VOLK_CPU_FEATURES "Volk uses cpu_features" ON)
@@ -158,7 +159,7 @@ else()
message(STATUS "Building Volk without cpu_features")
endif()

# Python
#Python
include(VolkPython) #sets PYTHON_EXECUTABLE and PYTHON_DASH_B
volk_python_check_module("python >= 3.4" sys "sys.version_info >= (3, 4)"
PYTHON_MIN_VER_FOUND)
@@ -168,12 +169,12 @@ if(NOT PYTHON_MIN_VER_FOUND)
message(FATAL_ERROR "Python 3.4 or greater required to build VOLK")
endif()

# Mako
#Mako
if(NOT MAKO_FOUND)
message(FATAL_ERROR "Mako templates required to build VOLK")
endif()

# Check if we have std::filesystem
#Check if we have std::filesystem
find_package(
FILESYSTEM
COMPONENTS Final Experimental
@@ -183,9 +184,9 @@ set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

########################################################################
# check for aligned_alloc, since some compilers lack this C11 feature.
# For Apple-clang use `posix_memalign`
# For MSVC use `_aligned_malloc`.
#check for aligned_alloc, since some compilers lack this C11 feature.
#For Apple - clang use `posix_memalign`
#For MSVC use `_aligned_malloc`.
########################################################################
include(CheckSymbolExists)
if(NOT (${CMAKE_SYSTEM_NAME} MATCHES "Darwin"))
@@ -196,7 +197,7 @@ if(NOT USE_ALIGNED_ALLOC)
endif()

########################################################################
# Check if Orc is available
#Check if Orc is available
########################################################################
option(ENABLE_ORC "Enable Orc" True)
if(ENABLE_ORC)
@@ -206,17 +207,17 @@ else(ENABLE_ORC)
endif(ENABLE_ORC)

########################################################################
# Setup doxygen
#Setup doxygen
########################################################################
add_subdirectory(docs)

########################################################################
# Detect /lib versus /lib64
#Detect / lib versus / lib64
########################################################################
include(GNUInstallDirs)

########################################################################
# Setup the package config file
#Setup the package config file
########################################################################
#set variables found in the pc.in file
set(prefix ${CMAKE_INSTALL_PREFIX})
@@ -233,7 +234,7 @@ install(
COMPONENT "volk_devel")

########################################################################
# Install all headers in the include directories
#Install all headers in the include directories
########################################################################
set(VOLK_RUNTIME_DIR bin)
set(VOLK_LIBRARY_DIR ${CMAKE_INSTALL_LIBDIR})
@@ -255,6 +256,7 @@ install(
${CMAKE_SOURCE_DIR}/include/volk/volk_avx_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx2_fma_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_avx512_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_sse3_intrinsics.h
${CMAKE_SOURCE_DIR}/include/volk/volk_neon_intrinsics.h
@@ -269,7 +271,7 @@ install(
COMPONENT "volk_devel")

########################################################################
# On Apple only, set install name and use rpath correctly, if not already set
#On Apple only, set install name and use rpath correctly, if not already set
########################################################################
if(APPLE)
if(NOT CMAKE_INSTALL_NAME_DIR)
@@ -290,21 +292,21 @@ if(APPLE)
endif(APPLE)

########################################################################
# Create uninstall target
#Create uninstall target
########################################################################
configure_file(${CMAKE_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake @ONLY)

# Only add the target if there isn't one defined already
#Only add the target if there isn't one defined already
if(NOT TARGET uninstall)
add_custom_target(uninstall ${CMAKE_COMMAND} -P
${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake)
endif()

########################################################################
# Install our Cmake modules into $prefix/lib/cmake/volk
# See "Package Configuration Files" on page:
# http://www.cmake.org/Wiki/CMake/Tutorials/Packaging
#Install our Cmake modules into $prefix / lib / cmake / volk
#See "Package Configuration Files" on page:
#http: // www.cmake.org/Wiki/CMake/Tutorials/Packaging
########################################################################

configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in
@@ -314,7 +316,7 @@ configure_file(${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in
${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake @ONLY)

########################################################################
# Install cmake search routine for external use
#Install cmake search routine for external use
########################################################################

if(NOT CMAKE_MODULES_DIR)
@@ -334,7 +336,7 @@ install(
DESTINATION ${CMAKE_MODULES_DIR}/volk)

########################################################################
# Option to enable QA testing, on by default
#Option to enable QA testing, on by default
########################################################################
option(ENABLE_TESTING "Enable QA testing" ON)
if(ENABLE_TESTING)
@@ -345,7 +347,7 @@ endif()
message(STATUS " Modify using: -DENABLE_TESTING=ON/OFF")

########################################################################
# Option to enable post-build profiling using volk_profile, off by default
#Option to enable post - build profiling using volk_profile, off by default
########################################################################
option(ENABLE_PROFILING "Launch system profiler after build" OFF)
if(ENABLE_PROFILING)
@@ -371,12 +373,12 @@ endif()
message(STATUS " Modify using: -DENABLE_PROFILING=ON/OFF")

########################################################################
# Setup the library
#Setup the library
########################################################################
add_subdirectory(lib)

########################################################################
# And the utility apps
#And the utility apps
########################################################################
add_subdirectory(apps)
option(ENABLE_MODTOOL "Enable volk_modtool python utility" True)
@@ -385,6 +387,6 @@ if(ENABLE_MODTOOL)
endif()

########################################################################
# Print summary
#Print summary
########################################################################
message(STATUS "Using install prefix: ${CMAKE_INSTALL_PREFIX}")
8 changes: 8 additions & 0 deletions gen/archs.xml
Original file line number Diff line number Diff line change
@@ -178,6 +178,14 @@ at the top, as a last resort.
<alignment>64</alignment>
</arch>

<!-- Architecture entry for the "avx512dq" CPU feature: names the feature check,
     the per-compiler flag that enables it, and a 64-byte alignment.
     NOTE(review): Microsoft's documentation for /arch lists AVX512 as the
     AVX-512 value; /arch:AVX512DQ does not appear there. Confirm how the build
     treats this flag when compiling with MSVC. -->
<arch name="avx512dq">
<check name="avx512dq"></check>
<flag compiler="gnu">-mavx512dq</flag>
<flag compiler="clang">-mavx512dq</flag>
<flag compiler="msvc">/arch:AVX512DQ</flag>
<alignment>64</alignment>
</arch>

<arch name="riscv64">
</arch>

5 changes: 5 additions & 0 deletions gen/machines.xml
Original file line number Diff line number Diff line change
@@ -65,4 +65,9 @@
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512cd orc|</archs>
</machine>

<!-- Machine "avx512dq": the arch list below includes avx512f and avx512dq
     (avx512cd is not part of this machine).
     trailing | bar means generate without either for MSVC -->
<machine name="avx512dq">
<archs>generic 32|64| mmx| sse sse2 sse3 ssse3 sse4_1 sse4_2 popcount avx fma avx2 avx512f avx512dq orc|</archs>
</machine>

</grammar>
4 changes: 2 additions & 2 deletions include/volk/volk_avx2_fma_intrinsics.h
Original file line number Diff line number Diff line change
@@ -8,7 +8,7 @@
*/

/*
* This file is intended to hold AVX2 FMA intrinsics of intrinsics.
* This file is intended to hold AVX2 FMA intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

@@ -23,7 +23,7 @@
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m256 _m256_arctan_poly_avx2_fma(const __m256 x)
static inline __m256 _mm256_arctan_poly_avx2_fma(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
67 changes: 67 additions & 0 deletions include/volk/volk_avx512_intrinsics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/* -*- c++ -*- */
/*
* Copyright 2024 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*
* This file is intended to hold AVX512 intrinsics.
* They should be used in VOLK kernels to avoid copy-paste.
*/

#ifndef INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_
#include <immintrin.h>

/*
 * Collect the even-indexed floats of two vectors into one.
 *
 * Result lanes 0..7  = z1[0], z1[2], ..., z1[14]
 * Result lanes 8..15 = z2[0], z2[2], ..., z2[14]
 *
 * When z1 and z2 hold interleaved (re, im) pairs, this yields the 16 real
 * parts in their original order.
 */
static inline __m512 _mm512_real(const __m512 z1, const __m512 z2)
{
    /* Selector values 0..15 pick from z1, 16..31 pick from z2;
     * listed here from lane 0 upwards. */
    const __m512i even_lanes =
        _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
    return _mm512_permutex2var_ps(z1, even_lanes, z2);
}

/*
 * Collect the odd-indexed floats of two vectors into one.
 *
 * Result lanes 0..7  = z1[1], z1[3], ..., z1[15]
 * Result lanes 8..15 = z2[1], z2[3], ..., z2[15]
 *
 * When z1 and z2 hold interleaved (re, im) pairs, this yields the 16
 * imaginary parts in their original order.
 */
static inline __m512 _mm512_imag(const __m512 z1, const __m512 z2)
{
    /* Selector values 0..15 pick from z1, 16..31 pick from z2;
     * listed here from lane 0 upwards. */
    const __m512i odd_lanes =
        _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
    return _mm512_permutex2var_ps(z1, odd_lanes, z2);
}

/*
 * Polynomial approximation of arctan(x) for x in [-1, 1],
 * applied to all 16 float lanes at once.
 *
 * The approximation is the odd polynomial
 *   x * (a1 + a3*x^2 + a5*x^4 + ... + a13*x^12)
 * evaluated in x^2 with Horner's method using fused multiply-adds.
 *
 * Maximum relative error ~6.5e-7
 */
static inline __m512 _mm512_arctan_poly_avx512(const __m512 x)
{
    /* Coefficients of the odd powers, highest order first. */
    const __m512 c13 = _mm512_set1_ps(+0x1.01a37cp-7f);
    const __m512 c11 = _mm512_set1_ps(-0x1.2f3004p-5f);
    const __m512 c9 = _mm512_set1_ps(+0x1.5785aap-4f);
    const __m512 c7 = _mm512_set1_ps(-0x1.1436ap-3f);
    const __m512 c5 = _mm512_set1_ps(+0x1.972be6p-3f);
    const __m512 c3 = _mm512_set1_ps(-0x1.55437p-2f);
    const __m512 c1 = _mm512_set1_ps(+0x1.ffffeap-1f);

    const __m512 x_sq = _mm512_mul_ps(x, x);

    /* Horner steps: acc = acc * x^2 + next coefficient. */
    __m512 acc = _mm512_fmadd_ps(x_sq, c13, c11);
    acc = _mm512_fmadd_ps(x_sq, acc, c9);
    acc = _mm512_fmadd_ps(x_sq, acc, c7);
    acc = _mm512_fmadd_ps(x_sq, acc, c5);
    acc = _mm512_fmadd_ps(x_sq, acc, c3);
    acc = _mm512_fmadd_ps(x_sq, acc, c1);

    /* The final multiply by x restores the odd symmetry of arctan. */
    return _mm512_mul_ps(x, acc);
}

#endif /* INCLUDE_VOLK_VOLK_AVX512_INTRINSICS_H_ */
4 changes: 2 additions & 2 deletions include/volk/volk_avx_intrinsics.h
Original file line number Diff line number Diff line change
@@ -9,7 +9,7 @@
*/

/*
* This file is intended to hold AVX intrinsics of intrinsics.
* This file is intended to hold AVX intrinsics.
* They should be used in VOLK kernels to avoid copy-pasta.
*/

@@ -24,7 +24,7 @@
* Maximum relative error ~6.5e-7
* Polynomial evaluated via Horner's method
*/
static inline __m256 _m256_arctan_poly_avx(const __m256 x)
static inline __m256 _mm256_arctan_poly_avx(const __m256 x)
{
const __m256 a1 = _mm256_set1_ps(+0x1.ffffeap-1f);
const __m256 a3 = _mm256_set1_ps(-0x1.55437p-2f);
Loading