diff --git a/src/Magnum/Math/CMakeLists.txt b/src/Magnum/Math/CMakeLists.txt index fa77b2b508..2f5cec82ec 100644 --- a/src/Magnum/Math/CMakeLists.txt +++ b/src/Magnum/Math/CMakeLists.txt @@ -48,6 +48,7 @@ set(MagnumMath_HEADERS Packing.h Range.h RectangularMatrix.h + Simd.h StrictWeakOrdering.h Swizzle.h Tags.h diff --git a/src/Magnum/Math/Packing.cpp b/src/Magnum/Math/Packing.cpp index 4d61646d17..a4e54840e1 100644 --- a/src/Magnum/Math/Packing.cpp +++ b/src/Magnum/Math/Packing.cpp @@ -25,6 +25,10 @@ #include "Packing.h" +#include +#include +#include + namespace Magnum { namespace Math { namespace { @@ -101,4 +105,71 @@ UnsignedShort packHalf(const Float value) { return h; } +namespace Implementation { + +void unpackUnsignedByteToShort(Simd::NoneT, const Corrade::Containers::ArrayView in, const Corrade::Containers::ArrayView out) { + for(std::size_t i = 0; i < in.size(); ++i) out[i] = in[i]; +} + +void unpackUnsignedByteToShort(Simd::Sse2T, const Corrade::Containers::ArrayView in, const Corrade::Containers::ArrayView out) { + const __m128i* in128 = reinterpret_cast(in.data()); + __m128i* out128 = reinterpret_cast<__m128i*>(out.data()); + for(std::size_t i = 0; i < in.size()/16; ++i) { + __m128i a = _mm_loadu_si128(in128 + i); + _mm_storeu_si128(out128 + i*2 + 0, _mm_unpacklo_epi8(a, _mm_setzero_si128())); + _mm_storeu_si128(out128 + i*2 + 1, _mm_unpacklo_epi8(a, _mm_setzero_si128())); + } +} + +void unpackUnsignedByteToShort(Simd::Sse41T, const Corrade::Containers::ArrayView in, const Corrade::Containers::ArrayView out) { + const __m128i* in128 = reinterpret_cast(in.data()); + __m128i* out128 = reinterpret_cast<__m128i*>(out.data()); + for(std::size_t i = 0; i < in.size()/16; ++i) { + __m128i a = _mm_loadu_si128(in128 + i); + _mm_storeu_si128(out128 + i*2 + 0, _mm_cvtepu8_epi16(a)); + _mm_storeu_si128(out128 + i*2 + 1, _mm_cvtepu8_epi16(_mm_srli_si128(a, 8))); + } +} + +void unpackUnsignedByteToShort(Simd::Avx2T, const Corrade::Containers::ArrayView in, const Corrade::Containers::ArrayView out) { + const __m128i* in128 = reinterpret_cast(in.data()); + __m256i* out256 = reinterpret_cast<__m256i*>(out.data()); + for(std::size_t i = 0; i < in.size()/16; ++i) { + __m128i a = _mm_load_si128(in128 + i); + _mm256_store_si256(out256 + i, _mm256_cvtepu8_epi16(a)); + } +} + +} + +namespace { + +__attribute__ ((target ("default"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView in, const Corrade::Containers::ArrayView out) { + Implementation::unpackUnsignedByteToShort(Simd::Sse2, in, out); +} + +// TODO: why gcc complains about unused functions here?! +__attribute__ ((target ("sse2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView in, const Corrade::Containers::ArrayView out) { + Implementation::unpackUnsignedByteToShort(Simd::Sse2, in, out); +} + +__attribute__ ((target ("sse4.1"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView in, const Corrade::Containers::ArrayView out) { + Implementation::unpackUnsignedByteToShort(Simd::Sse41, in, out); +} + +__attribute__ ((target ("avx2"))) void unpackUnsignedByteToShortDispatch(const Corrade::Containers::ArrayView in, const Corrade::Containers::ArrayView out) { + Implementation::unpackUnsignedByteToShort(Simd::Sse41, in, out); +} + +} + +void unpackUnsignedByteToShort(const Corrade::Containers::ArrayView in, const Corrade::Containers::ArrayView out) { + CORRADE_ASSERT(in.size() == out.size(), "Math::unpackUnsignedByteToShort(): input has" << in.size() << "elements while output has" << out.size(), ); + CORRADE_ASSERT(!(reinterpret_cast(in.data())%16) && !(reinterpret_cast(in.data())%16), "Math::unpackUnsignedByteToShort(): the data are not 16-byte aligned", ); + + /** @todo run only for a multiple of 16, do the rest scalar */ + CORRADE_INTERNAL_ASSERT(!(in.size()%16)); + unpackUnsignedByteToShortDispatch(in, out); +} + }} diff --git a/src/Magnum/Math/Packing.h b/src/Magnum/Math/Packing.h index c12f77f700..12c6edf9d4 100644 --- a/src/Magnum/Math/Packing.h +++ b/src/Magnum/Math/Packing.h @@ -30,6 +30,7 @@ */ #include "Magnum/Math/Functions.h" +#include "Magnum/Math/Simd.h" namespace Magnum { namespace Math { @@ -209,6 +210,39 @@ template Vector unpackHalf(const Vector in, Corrade::Containers::ArrayView out); + MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse2T, Corrade::Containers::ArrayView in, Corrade::Containers::ArrayView out); + MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Sse41T, Corrade::Containers::ArrayView in, Corrade::Containers::ArrayView out); + MAGNUM_EXPORT void unpackUnsignedByteToShort(Simd::Avx2T, Corrade::Containers::ArrayView in, Corrade::Containers::ArrayView out); +} + +/** +@brief Unpack an array of 8-bit unsigned integers to 16-bit + +The @p in and @p out are expected to have the same size and be aligned to 16 +bytes. +*/ +// TODO: mention SIMD? +MAGNUM_EXPORT void unpackUnsignedByteToShort(Corrade::Containers::ArrayView in, Corrade::Containers::ArrayView out); + +/** +@brief Unpack an array of 8-bit unsigned integers to 32-bit + +The @p in and @p out are expected to have the same size and be aligned to 16 +bytes. +*/ +MAGNUM_EXPORT void unpackUnsignedByteToInt(Corrade::Containers::ArrayView in, Corrade::Containers::ArrayView out); + +/** +@brief Unpack an array of 16-bit unsigned integers to 32-bit + +The @p in and @p out are expected to have the same size and be aligned to 16 +bytes. +*/ +MAGNUM_EXPORT void unpackUnsignedShortToInt(Corrade::Containers::ArrayView in, Corrade::Containers::ArrayView out); + }} #endif diff --git a/src/Magnum/Math/Simd.h b/src/Magnum/Math/Simd.h new file mode 100644 index 0000000000..9e28a94c65 --- /dev/null +++ b/src/Magnum/Math/Simd.h @@ -0,0 +1,148 @@ +#ifndef Magnum_Math_Simd_h +#define Magnum_Math_Simd_h +/* + This file is part of Magnum. + + Copyright © 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019 + Vladimír Vondruš + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. +*/ + +/** @file + * @brief Namespace @ref Magnum::Math::Simd + */ + +/** @namespace Magnum::Math::Simd +@brief SIMD dispatch tags + +Tags for dispatching to particular SIMD-optimized versions of batch math +algorithms. + +This library is built as part of Magnum by default. To use this library with +CMake, you need to find the `Magnum` package and link to the `Magnum::Magnum` +target: + +@code{.cmake} +find_package(Magnum REQUIRED) + +# ... +target_link_libraries(your-app Magnum::Magnum) +@endcode + +See @ref building and @ref cmake for more information. +*/ +namespace Magnum { namespace Math { namespace Simd { + +/** +@brief No SIMD acceleration tag type + +Used to distinguish algorithms that have no explicit SIMD optimizations, apart +from compiler magic. +@see @ref None +*/ +/* Explicit constructor to avoid ambiguous calls when using {} */ +struct NoneT { + #ifndef DOXYGEN_GENERATING_OUTPUT + struct Init{}; + constexpr explicit NoneT(Init) {} + #endif +}; + +/** +@brief SSE2 SIMD acceleration tag type + +Used to distinguish algorithms that use at most the +[SSE2](https://en.wikipedia.org/wiki/SSE2) instruction set. +@see @ref Sse2 +*/ +/* Explicit constructor to avoid ambiguous calls when using {} */ +struct Sse2T { + #ifndef DOXYGEN_GENERATING_OUTPUT + struct Init{}; + constexpr explicit Sse2T(Init) {} + #endif +}; + +/** +@brief SSE4.1 SIMD acceleration tag type + +Used to distinguish algorithms that use at most the +[SSE4.1](https://en.wikipedia.org/wiki/SSE4#SSE4.1) instruction set. +@see @ref Sse41 +*/ +/* Explicit constructor to avoid ambiguous calls when using {} */ +struct Sse41T { + #ifndef DOXYGEN_GENERATING_OUTPUT + struct Init{}; + constexpr explicit Sse41T(Init) {} + #endif +}; + +/** +@brief AVX2 SIMD acceleration tag type + +Used to distinguish algorithms that use at most the +[AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#AVX2) +instruction set. +@see @ref Avx2 +*/ +/* Explicit constructor to avoid ambiguous calls when using {} */ +struct Avx2T { + #ifndef DOXYGEN_GENERATING_OUTPUT + struct Init{}; + constexpr explicit Avx2T(Init) {} + #endif +}; + +/** +@brief No SIMD acceleration tag + +Use for selecting algorithms with no explicit SIMD optimizations. +*/ +constexpr NoneT None{NoneT::Init{}}; + +/** +@brief SSE2 SIMD acceleration tag + +Use for selecting algorithms that use at most the +[SSE2](https://en.wikipedia.org/wiki/SSE2) instruction set. +*/ +constexpr Sse2T Sse2{Sse2T::Init{}}; + +/** +@brief SSE4.1 SIMD acceleration tag + +Use for selecting algorithms that use at most the +[SSE4.1](https://en.wikipedia.org/wiki/SSE4#SSE4.1) instruction set. +*/ +constexpr Sse41T Sse41{Sse41T::Init{}}; + +/** +@brief AVX2 SIMD acceleration tag type + +Use for selecting algorithms that use at most the +[AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#AVX2) +instruction set. +*/ +constexpr Avx2T Avx2{Avx2T::Init{}}; + +}}} + +#endif diff --git a/src/Magnum/Math/Test/PackingTest.cpp b/src/Magnum/Math/Test/PackingTest.cpp index 759e7328e1..b335efdfb0 100644 --- a/src/Magnum/Math/Test/PackingTest.cpp +++ b/src/Magnum/Math/Test/PackingTest.cpp @@ -24,6 +24,7 @@ */ #include +#include #include #include "Magnum/Math/Packing.h" @@ -46,6 +47,9 @@ struct PackingTest: Corrade::TestSuite::Tester { /* Half (un)pack functions are tested and benchmarked in HalfTest.cpp, because there's involved comparison and benchmarks to ground truth */ + + void unpackUnsignedByteToShortBenchmark(); + template void unpackUnsignedByteToShortBenchmark(); }; typedef Math::Vector3 Vector3; @@ -62,6 +66,14 @@ PackingTest::PackingTest() { &PackingTest::reunpackUnsinged, &PackingTest::reunpackSinged, &PackingTest::unpackTypeDeduction}); + + addBenchmarks({ + &PackingTest::unpackUnsignedByteToShortBenchmark, + &PackingTest::unpackUnsignedByteToShortBenchmark, + &PackingTest::unpackUnsignedByteToShortBenchmark, + &PackingTest::unpackUnsignedByteToShortBenchmark, + &PackingTest::unpackUnsignedByteToShortBenchmark + }, 1000); } void PackingTest::bitMax() { @@ -279,6 +291,44 @@ void PackingTest::unpackTypeDeduction() { CORRADE_COMPARE((Math::unpack('\x7F')), 1.0f); } +void PackingTest::unpackUnsignedByteToShortBenchmark() { + Corrade::Containers::Array in{20000}; + Corrade::Containers::Array out{20000}; + UnsignedByte a = 0; + for(auto& i: in) i = a++; + + CORRADE_BENCHMARK(100) + unpackUnsignedByteToShort(in, out); +} + +// TODO: uh provide this elsewhere +template struct SimdTraits; +template<> struct SimdTraits { + static const char* name() { return "unpackUnsignedByteToShortBenchmark"; } +}; +template<> struct SimdTraits { + static const char* name() { return "unpackUnsignedByteToShortBenchmark"; } +}; +template<> struct SimdTraits { + static const char* name() { return "unpackUnsignedByteToShortBenchmark"; } +}; +template<> struct SimdTraits { + static const char* name() { return "unpackUnsignedByteToShortBenchmark"; } +}; + +template void PackingTest::unpackUnsignedByteToShortBenchmark() { + setTestCaseName(SimdTraits::name()); + + Corrade::Containers::Array in{20000}; + Corrade::Containers::Array out{20000}; + UnsignedByte a = 0; + for(auto& i: in) i = a++; + + CORRADE_BENCHMARK(100) + // TODO: uh the typename wat + Implementation::unpackUnsignedByteToShort(T{typename T::Init{}}, in, out); +} + }}}} CORRADE_TEST_MAIN(Magnum::Math::Test::PackingTest)