From 5ff843bc39ae9acfee55ae4221e03f60ac396802 Mon Sep 17 00:00:00 2001 From: David Piuva Date: Sun, 9 Feb 2025 22:03:45 +0100 Subject: [PATCH] Debugging... --- Source/DFPSR/base/simd.h | 104 +++++++++++++++++------ Source/test/tests/DataLoopTest.cpp | 6 +- Source/test/tests/SimdTest.cpp | 10 +-- Source/test/tests/TextureTest.cpp | 11 +-- 4 files changed, 77 insertions(+), 54 deletions(-) diff --git a/Source/DFPSR/base/simd.h b/Source/DFPSR/base/simd.h index 85b7586..1d7fd1f 100644 --- a/Source/DFPSR/base/simd.h +++ b/Source/DFPSR/base/simd.h @@ -110,6 +110,19 @@ #include "../base/noSimd.h" #include "../api/stringAPI.h" + #ifdef USE_SSE2 + #include <emmintrin.h> // SSE2 + #ifdef USE_SSSE3 + #include <tmmintrin.h> // SSSE3 + #endif + #ifdef USE_AVX + #include <immintrin.h> // AVX / AVX2 + #endif + #endif + #ifdef USE_NEON + #include <arm_neon.h> // NEON + #endif + namespace dsr { // Alignment in bytes @@ -123,14 +136,6 @@ // Everything declared in here handles things specific for SSE. // Direct use of the macros will not provide portability to all hardware. #ifdef USE_SSE2 - #include <emmintrin.h> // SSE2 - #ifdef USE_SSSE3 - #include <tmmintrin.h> // SSSE3 - #endif - #ifdef USE_AVX - #include <immintrin.h> // AVX / AVX2 - #endif - // Vector types #define SIMD_F32x4 __m128 #define SIMD_U8x16 __m128i @@ -296,8 +301,6 @@ // Everything declared in here handles things specific for NEON. // Direct use of the macros will not provide portability to all hardware. 
#ifdef USE_NEON - #include <arm_neon.h> // NEON - // Vector types #define SIMD_F32x4 float32x4_t #define SIMD_U8x16 uint8x16_t @@ -307,7 +310,7 @@ // Vector uploads in address order inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) { - float data[4] ALIGN16 = {a, b, c, d}; + ALIGN16 float data[4] = {a, b, c, d}; #ifdef SAFE_POINTER_CHECKS if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_F32_SIMD for NEON!\n"); } #endif @@ -318,7 +321,7 @@ } inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h, uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) { - uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p}; + ALIGN16 uint8_t data[16] = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p}; #ifdef SAFE_POINTER_CHECKS if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U8_SIMD for NEON!\n"); } #endif @@ -328,7 +331,7 @@ return vdupq_n_u8(a); } inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) { - uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h}; + ALIGN16 uint16_t data[8] = {a, b, c, d, e, f, g, h}; #ifdef SAFE_POINTER_CHECKS if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U16_SIMD for NEON!\n"); } #endif @@ -338,7 +341,7 @@ return vdupq_n_u16(a); } inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - uint32_t data[4] ALIGN16 = {a, b, c, d}; + ALIGN16 uint32_t data[4] = {a, b, c, d}; #ifdef SAFE_POINTER_CHECKS if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U32_SIMD for NEON!\n"); } #endif @@ -348,7 +351,7 @@ return vdupq_n_u32(a); } inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) { - int32_t 
data[4] ALIGN16 = {a, b, c, d}; + ALIGN16 int32_t data[4] = {a, b, c, d}; #ifdef SAFE_POINTER_CHECKS if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_I32_SIMD for NEON!\n"); } #endif @@ -480,9 +483,11 @@ #endif #ifdef USE_BASIC_SIMD #if defined(USE_SSE2) - return F32x4(_mm_load_ps(data)); + ALIGN16 SIMD_F32x4 result = _mm_load_ps(data); + return F32x4(result); #elif defined(USE_NEON) - return F32x4(vld1q_f32(data)); + ALIGN16 SIMD_F32x4 result = vld1q_f32(data); + return F32x4(result); #endif #else return F32x4(data[0], data[1], data[2], data[3]); #endif @@ -509,7 +514,7 @@ } #if defined(DFPSR_GEOMETRY_FVECTOR) dsr::FVector4D get() const { - float data[4] ALIGN16; + ALIGN16 float data[4]; #ifdef SAFE_POINTER_CHECKS if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in FVector4D F32x4::get!\n"); } #endif @@ -585,9 +590,11 @@ #endif #if defined(USE_BASIC_SIMD) #if defined(USE_SSE2) - return I32x4(_mm_load_si128((const __m128i*)data)); + ALIGN16 SIMD_I32x4 result = _mm_load_si128((const __m128i*)data); + return I32x4(result); #elif defined(USE_NEON) - return I32x4(vld1q_s32(data)); + ALIGN16 SIMD_I32x4 result = vld1q_s32(data); + return I32x4(result); #endif #else return I32x4(data[0], data[1], data[2], data[3]); #endif @@ -614,7 +621,7 @@ } #if defined(DFPSR_GEOMETRY_IVECTOR) dsr::IVector4D get() const { - int32_t data[4] ALIGN16; + ALIGN16 int32_t data[4]; #ifdef SAFE_POINTER_CHECKS if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in IVector4D I32x4::get!\n"); } #endif @@ -690,9 +697,11 @@ #endif #if defined(USE_BASIC_SIMD) #if defined(USE_SSE2) - return U32x4(_mm_load_si128((const __m128i*)data)); + ALIGN16 SIMD_U32x4 result = _mm_load_si128((const __m128i*)data); + return U32x4(result); #elif defined(USE_NEON) - return U32x4(vld1q_u32(data)); + ALIGN16 SIMD_U32x4 result = vld1q_u32(data); + return U32x4(result); #endif #else return U32x4(data[0], data[1], data[2], data[3]); #endif @@ -719,7 
+728,7 @@ } #if defined(DFPSR_GEOMETRY_UVECTOR) dsr::UVector4D get() const { - uint32_t data[4] ALIGN16; + ALIGN16 uint32_t data[4]; #ifdef SAFE_POINTER_CHECKS if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in UVector4D U32x4::get!\n"); } #endif @@ -847,9 +856,11 @@ #endif #if defined(USE_BASIC_SIMD) #if defined(USE_SSE2) - return U16x8(_mm_load_si128((const __m128i*)data)); + ALIGN16 SIMD_U16x8 result = _mm_load_si128((const __m128i*)data); + return U16x8(result); #elif defined(USE_NEON) - return U16x8(vld1q_u16(data)); + ALIGN16 SIMD_U16x8 result = vld1q_u16(data); + return U16x8(result); #endif #else return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]); #endif @@ -987,9 +998,11 @@ #endif #if defined(USE_BASIC_SIMD) #if defined(USE_SSE2) - return U8x16(_mm_load_si128((const __m128i*)data)); + ALIGN16 SIMD_U8x16 result = _mm_load_si128((const __m128i*)data); + return U8x16(result); #elif defined(USE_NEON) - return U8x16(vld1q_u8(data)); + ALIGN16 SIMD_U8x16 result = vld1q_u8(data); + return U8x16(result); #endif #else return U8x16( @@ -1112,7 +1125,8 @@ if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in F32x8::readAlignedUnsafe!\n"); } #endif #if defined(USE_AVX2) - return F32x8(_mm256_load_ps(data)); + ALIGN32 __m256 result = _mm256_load_ps(data); + return F32x8(result); #else return F32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]); #endif @@ -1219,7 +1233,8 @@ if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in I32x8::readAlignedUnsafe!\n"); } #endif #if defined(USE_AVX2) - return I32x8(_mm256_load_si256((const __m256i*)data)); + ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data); + return I32x8(result); #else return I32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]); #endif @@ -1327,7 +1342,8 @@ if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U32x8::readAlignedUnsafe!\n"); } 
#endif #if defined(USE_AVX2) - return U32x8(_mm256_load_si256((const __m256i*)data)); + ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data); + return U32x8(result); #else return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]); #endif @@ -1506,7 +1522,8 @@ if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U16x16::readAlignedUnsafe!\n"); } #endif #if defined(USE_AVX2) - return U16x16(_mm256_load_si256((const __m256i*)data)); + ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data); + return U16x16(result); #else return U16x16( data[0], @@ -1690,7 +1707,8 @@ if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U8x32::readAlignedUnsafe!\n"); } #endif #if defined(USE_AVX2) - return U8x32(_mm256_load_si256((const __m256i*)data)); + ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data); + return U8x32(result); #else U8x32 result; for (int i = 0; i < 32; i++) { @@ -1731,9 +1749,9 @@ }; #define IMPL_SCALAR_FALLBACK_START(A, B, VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \ - ALIGN_BYTES(alignof(VECTOR_TYPE)) ELEMENT_TYPE lanesA[LANE_COUNT]; \ - ALIGN_BYTES(alignof(VECTOR_TYPE)) ELEMENT_TYPE lanesB[LANE_COUNT]; \ - ALIGN_BYTES(alignof(VECTOR_TYPE)) ELEMENT_TYPE lanesR[LANE_COUNT]; \ + ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE lanesA[LANE_COUNT]; \ + ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE lanesB[LANE_COUNT]; \ + ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE lanesR[LANE_COUNT]; \ A.writeAlignedUnsafe(&(lanesA[0])); \ B.writeAlignedUnsafe(&(lanesB[0])); @@ -3340,15 +3358,19 @@ // To avoid crashing with non-immediate input when optimization is turned off, the half indices must be given as template arguments to remain known in compile time. 
template <int INNER_OFFSET, int EDGE_HALF_INDEX, int MIDDLE_HALF_INDEX> inline __m256i impl_extractBytes_AVX2(const __m256i &leftInput, const __m256i &middleInput, const __m256i &rightInput) { + static_assert(0 <= INNER_OFFSET && INNER_OFFSET < 16, "impl_extractBytes_AVX2: INNER_OFFSET is out of bound 0..15!\n"); + static_assert(0 <= EDGE_HALF_INDEX && EDGE_HALF_INDEX < 2, "impl_extractBytes_AVX2: EDGE_HALF_INDEX is out of bound 0..1!\n"); + static_assert(0 <= MIDDLE_HALF_INDEX && MIDDLE_HALF_INDEX < 2, "impl_extractBytes_AVX2: MIDDLE_HALF_INDEX is out of bound 0..1!\n"); // Extract three halves depending on which ones overlap with the offset. ALIGN16 __m128i leftPart = _mm256_extractf128_si256(leftInput , EDGE_HALF_INDEX ); ALIGN16 __m128i middlePart = _mm256_extractf128_si256(middleInput, MIDDLE_HALF_INDEX); ALIGN16 __m128i rightPart = _mm256_extractf128_si256(rightInput , EDGE_HALF_INDEX ); - // Combine two 128-bit extracts into a whole 256-bit extract. - return _mm256_set_m128i( - _mm_alignr_epi8(leftPart, middlePart, INNER_OFFSET), - _mm_alignr_epi8(middlePart, rightPart, INNER_OFFSET) - ); + // Make two 128-bit vector extractions. + ALIGN16 __m128i leftResult = _mm_alignr_epi8(leftPart, middlePart, INNER_OFFSET); + ALIGN16 __m128i rightResult = _mm_alignr_epi8(middlePart, rightPart, INNER_OFFSET); + // Combine the results. 
+ ALIGN32 __m256i result = _mm256_set_m128i(leftResult, rightResult); + return result; } #define VECTOR_EXTRACT_GENERATOR_256(OFFSET, A, B) \ impl_extractBytes_AVX2< \ diff --git a/Source/test/tests/DataLoopTest.cpp b/Source/test/tests/DataLoopTest.cpp index d5ae3c3..865b5ba 100644 --- a/Source/test/tests/DataLoopTest.cpp +++ b/Source/test/tests/DataLoopTest.cpp @@ -23,9 +23,9 @@ START_TEST(DataLoop) // Allocate aligned memory const int elements = 256; - int32_t allocationA[elements] ALIGN16; - int32_t allocationB[elements] ALIGN16; - int32_t allocationC[elements] ALIGN16; + ALIGN16 int32_t allocationA[elements]; + ALIGN16 int32_t allocationB[elements]; + ALIGN16 int32_t allocationC[elements]; // The SafePointer class will emulate the behaviour of a raw data pointer while providing full bound checks in debug mode. SafePointer<int32_t> bufferA("bufferA", allocationA, sizeof(allocationA)); SafePointer<int32_t> bufferB("bufferB", allocationB, sizeof(allocationB)); diff --git a/Source/test/tests/SimdTest.cpp b/Source/test/tests/SimdTest.cpp index 90000a8..b75c144 100755 --- a/Source/test/tests/SimdTest.cpp +++ b/Source/test/tests/SimdTest.cpp @@ -23,11 +23,11 @@ ALIGN16 __m128i middlePart = _mm256_extractf128_si256(middleInput, MIDDLE_HALF_INDEX); stateName = U"impl_extractBytes_AVX2_TEST: _mm256_extractf128_si256 RIGHT 3.\n"; ALIGN16 __m128i rightPart = _mm256_extractf128_si256(rightInput , EDGE_HALF_INDEX ); - stateName = U"impl_extractBytes_AVX2_TEST: _mm256_set_m128i 4.\n"; - return _mm256_set_m128i( - _mm_alignr_epi8(leftPart, middlePart, INNER_OFFSET), - _mm_alignr_epi8(middlePart, rightPart, INNER_OFFSET) - ); + stateName = U"impl_extractBytes_AVX2_TEST: _mm256_set_m128i 4.\n"; // Crashing here on the server. 
+ ALIGN16 __m128i leftResult = _mm_alignr_epi8(leftPart, middlePart, INNER_OFFSET); + ALIGN16 __m128i rightResult = _mm_alignr_epi8(middlePart, rightPart, INNER_OFFSET); + ALIGN32 __m256i result = _mm256_set_m128i(leftResult, rightResult); + return result; } #define VECTOR_EXTRACT_GENERATOR_256_TEST(OFFSET, A, B) \ impl_extractBytes_AVX2_TEST< \ diff --git a/Source/test/tests/TextureTest.cpp b/Source/test/tests/TextureTest.cpp index 7050282..08a2d5d 100644 --- a/Source/test/tests/TextureTest.cpp +++ b/Source/test/tests/TextureTest.cpp @@ -12,8 +12,9 @@ stateName = U"readAlignedUnsafe_U32x8_test A.\n"; if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U32x8::readAlignedUnsafe!\n"); } #endif #if defined(USE_AVX2) -stateName = U"readAlignedUnsafe_U32x8_test B AVX2.\n"; - return U32x8(_mm256_load_si256((const __m256i*)data)); +stateName = string_combine(U"readAlignedUnsafe_U32x8_test B AVX2. source pointer = ", uintptr_t(data), U"\n"); + ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data); + return U32x8(result); #else stateName = U"readAlignedUnsafe_U32x8_test B SCALAR.\n"; return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]); @@ -32,9 +33,9 @@ stateName = U"shiftRight_test C.\n"; #endif #if defined(USE_AVX2) stateName = U"shiftRight_test D AVX2.\n"; - ALIGN_BYTES(alignof(U32x8)) uint32_t lanesA[8]; - ALIGN_BYTES(alignof(U32x8)) uint32_t lanesB[8]; - ALIGN_BYTES(alignof(U32x8)) uint32_t lanesR[8]; + ALIGN_BYTES(sizeof(U32x8)) uint32_t lanesA[8]; + ALIGN_BYTES(sizeof(U32x8)) uint32_t lanesB[8]; + ALIGN_BYTES(sizeof(U32x8)) uint32_t lanesR[8]; stateName = U"shiftRight_test E.\n"; left.writeAlignedUnsafe(&(lanesA[0])); stateName = U"shiftRight_test F.\n";