Skip to content

Commit

Permalink
Debugging...
Browse files Browse the repository at this point in the history
  • Loading branch information
Dawoodoz committed Feb 9, 2025
1 parent a694e17 commit 5ff843b
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 54 deletions.
104 changes: 63 additions & 41 deletions Source/DFPSR/base/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,19 @@
#include "../base/noSimd.h"
#include "../api/stringAPI.h"

#ifdef USE_SSE2
#include <emmintrin.h> // SSE2
#ifdef USE_SSSE3
#include <tmmintrin.h> // SSSE3
#endif
#ifdef USE_AVX
#include <immintrin.h> // AVX / AVX2
#endif
#endif
#ifdef USE_NEON
#include <arm_neon.h> // NEON
#endif

namespace dsr {

// Alignment in bytes
Expand All @@ -123,14 +136,6 @@
// Everything declared in here handles things specific for SSE.
// Direct use of the macros will not provide portability to all hardware.
#ifdef USE_SSE2
#include <emmintrin.h> // SSE2
#ifdef USE_SSSE3
#include <tmmintrin.h> // SSSE3
#endif
#ifdef USE_AVX
#include <immintrin.h> // AVX / AVX2
#endif

// Vector types
#define SIMD_F32x4 __m128
#define SIMD_U8x16 __m128i
Expand Down Expand Up @@ -296,8 +301,6 @@
// Everything declared in here handles things specific for NEON.
// Direct use of the macros will not provide portability to all hardware.
#ifdef USE_NEON
#include <arm_neon.h> // NEON

// Vector types
#define SIMD_F32x4 float32x4_t
#define SIMD_U8x16 uint8x16_t
Expand All @@ -307,7 +310,7 @@

// Vector uploads in address order
inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
float data[4] ALIGN16 = {a, b, c, d};
ALIGN16 float data[4] = {a, b, c, d};
#ifdef SAFE_POINTER_CHECKS
if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_F32_SIMD for NEON!\n"); }
#endif
Expand All @@ -318,7 +321,7 @@
}
inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
ALIGN16 uint8_t data[16] = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
#ifdef SAFE_POINTER_CHECKS
if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U8_SIMD for NEON!\n"); }
#endif
Expand All @@ -328,7 +331,7 @@
return vdupq_n_u8(a);
}
inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
ALIGN16 uint16_t data[8] = {a, b, c, d, e, f, g, h};
#ifdef SAFE_POINTER_CHECKS
if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U16_SIMD for NEON!\n"); }
#endif
Expand All @@ -338,7 +341,7 @@
return vdupq_n_u16(a);
}
inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
uint32_t data[4] ALIGN16 = {a, b, c, d};
ALIGN16 uint32_t data[4] = {a, b, c, d};
#ifdef SAFE_POINTER_CHECKS
if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_U32_SIMD for NEON!\n"); }
#endif
Expand All @@ -348,7 +351,7 @@
return vdupq_n_u32(a);
}
inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
int32_t data[4] ALIGN16 = {a, b, c, d};
ALIGN16 int32_t data[4] = {a, b, c, d};
#ifdef SAFE_POINTER_CHECKS
if (uintptr_t((void*)data) & 15u) { throwError(U"Unaligned stack memory detected in LOAD_VECTOR_I32_SIMD for NEON!\n"); }
#endif
Expand Down Expand Up @@ -480,9 +483,11 @@
#endif
#ifdef USE_BASIC_SIMD
#if defined(USE_SSE2)
return F32x4(_mm_load_ps(data));
ALIGN16 SIMD_F32x4 result = _mm_load_ps(data);
return F32x4(result);
#elif defined(USE_NEON)
return F32x4(vld1q_f32(data));
ALIGN16 SIMD_F32x4 result = vld1q_f32(data);
return F32x4(result);
#endif
#else
return F32x4(data[0], data[1], data[2], data[3]);
Expand All @@ -509,7 +514,7 @@
}
#if defined(DFPSR_GEOMETRY_FVECTOR)
dsr::FVector4D get() const {
float data[4] ALIGN16;
ALIGN16 float data[4];
#ifdef SAFE_POINTER_CHECKS
if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in FVector4D F32x4::get!\n"); }
#endif
Expand Down Expand Up @@ -585,9 +590,11 @@
#endif
#if defined(USE_BASIC_SIMD)
#if defined(USE_SSE2)
return I32x4(_mm_load_si128((const __m128i*)data));
ALIGN16 SIMD_I32x4 result = _mm_load_si128((const __m128i*)data);
return I32x4(result);
#elif defined(USE_NEON)
return I32x4(vld1q_s32(data));
ALIGN16 SIMD_I32x4 result = vld1q_s32(data);
return I32x4(result);
#endif
#else
return I32x4(data[0], data[1], data[2], data[3]);
Expand All @@ -614,7 +621,7 @@
}
#if defined(DFPSR_GEOMETRY_IVECTOR)
dsr::IVector4D get() const {
int32_t data[4] ALIGN16;
ALIGN16 int32_t data[4];
#ifdef SAFE_POINTER_CHECKS
if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in IVector4D I32x4::get!\n"); }
#endif
Expand Down Expand Up @@ -690,9 +697,11 @@
#endif
#if defined(USE_BASIC_SIMD)
#if defined(USE_SSE2)
return U32x4(_mm_load_si128((const __m128i*)data));
ALIGN16 SIMD_I32x4 result = _mm_load_si128((const __m128i*)data);
return U32x4(result);
#elif defined(USE_NEON)
return U32x4(vld1q_u32(data));
ALIGN16 SIMD_I32x4 result = vld1q_u32(data);
return U32x4(result);
#endif
#else
return U32x4(data[0], data[1], data[2], data[3]);
Expand All @@ -719,7 +728,7 @@
}
#if defined(DFPSR_GEOMETRY_UVECTOR)
dsr::UVector4D get() const {
uint32_t data[4] ALIGN16;
ALIGN16 uint32_t data[4];
#ifdef SAFE_POINTER_CHECKS
if (uintptr_t(data) & 15u) { throwError(U"Unaligned stack memory detected in UVector4D U32x4::get!\n"); }
#endif
Expand Down Expand Up @@ -847,9 +856,11 @@
#endif
#if defined(USE_BASIC_SIMD)
#if defined(USE_SSE2)
return U16x8(_mm_load_si128((const __m128i*)data));
ALIGN16 SIMD_I32x4 result = _mm_load_si128((const __m128i*)data);
return U16x8(result);
#elif defined(USE_NEON)
return U16x8(vld1q_u16(data));
ALIGN16 SIMD_I32x4 result = vld1q_u16(data);
return U16x8(result);
#endif
#else
return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
Expand Down Expand Up @@ -987,9 +998,11 @@
#endif
#if defined(USE_BASIC_SIMD)
#if defined(USE_SSE2)
return U8x16(_mm_load_si128((const __m128i*)data));
ALIGN16 SIMD_I32x4 result = _mm_load_si128((const __m128i*)data);
return U8x16(result);
#elif defined(USE_NEON)
return U8x16(vld1q_u8(data));
ALIGN16 SIMD_I32x4 result = vld1q_u8(data);
return U8x16(result);
#endif
#else
return U8x16(
Expand Down Expand Up @@ -1112,7 +1125,8 @@
if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in F32x8::readAlignedUnsafe!\n"); }
#endif
#if defined(USE_AVX2)
return F32x8(_mm256_load_ps(data));
ALIGN32 __m256 result = _mm256_load_ps(data);
return F32x8(result);
#else
return F32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
#endif
Expand Down Expand Up @@ -1219,7 +1233,8 @@
if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in I32x8::readAlignedUnsafe!\n"); }
#endif
#if defined(USE_AVX2)
return I32x8(_mm256_load_si256((const __m256i*)data));
ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data);
return I32x8(result);
#else
return I32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
#endif
Expand Down Expand Up @@ -1327,7 +1342,8 @@
if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U32x8::readAlignedUnsafe!\n"); }
#endif
#if defined(USE_AVX2)
return U32x8(_mm256_load_si256((const __m256i*)data));
ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data);
return U32x8(result);
#else
return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
#endif
Expand Down Expand Up @@ -1506,7 +1522,8 @@
if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U16x16::readAlignedUnsafe!\n"); }
#endif
#if defined(USE_AVX2)
return U16x16(_mm256_load_si256((const __m256i*)data));
ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data);
return U16x16(result);
#else
return U16x16(
data[0],
Expand Down Expand Up @@ -1690,7 +1707,8 @@
if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U8x32::readAlignedUnsafe!\n"); }
#endif
#if defined(USE_AVX2)
return U8x32(_mm256_load_si256((const __m256i*)data));
ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data);
return U8x32(result);
#else
U8x32 result;
for (int i = 0; i < 32; i++) {
Expand Down Expand Up @@ -1731,9 +1749,9 @@
};

#define IMPL_SCALAR_FALLBACK_START(A, B, VECTOR_TYPE, ELEMENT_TYPE, LANE_COUNT) \
ALIGN_BYTES(alignof(VECTOR_TYPE)) ELEMENT_TYPE lanesA[LANE_COUNT]; \
ALIGN_BYTES(alignof(VECTOR_TYPE)) ELEMENT_TYPE lanesB[LANE_COUNT]; \
ALIGN_BYTES(alignof(VECTOR_TYPE)) ELEMENT_TYPE lanesR[LANE_COUNT]; \
ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE lanesA[LANE_COUNT]; \
ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE lanesB[LANE_COUNT]; \
ALIGN_BYTES(sizeof(VECTOR_TYPE)) ELEMENT_TYPE lanesR[LANE_COUNT]; \
A.writeAlignedUnsafe(&(lanesA[0])); \
B.writeAlignedUnsafe(&(lanesB[0]));

Expand Down Expand Up @@ -3340,15 +3358,19 @@
// To avoid crashing with non-immediate input when optimization is turned off, the half indices must be given as template arguments to remain known in compile time.
template <int INNER_OFFSET, int EDGE_HALF_INDEX, int MIDDLE_HALF_INDEX>
inline __m256i impl_extractBytes_AVX2(const __m256i &leftInput, const __m256i &middleInput, const __m256i &rightInput) {
static_assert(0 <= INNER_OFFSET && INNER_OFFSET < 16, "impl_extractBytes_AVX2: INNER_OFFSET is out of bound 0..15!\n");
static_assert(0 <= EDGE_HALF_INDEX && EDGE_HALF_INDEX < 2, "impl_extractBytes_AVX2: INNER_OFFSET is out of bound 0..1!n");
static_assert(0 <= MIDDLE_HALF_INDEX && MIDDLE_HALF_INDEX < 2, "impl_extractBytes_AVX2: INNER_OFFSET is out of bound 0..1!\n");
// Extract three halves depending on which ones overlap with the offset.
ALIGN16 __m128i leftPart = _mm256_extractf128_si256(leftInput , EDGE_HALF_INDEX );
ALIGN16 __m128i middlePart = _mm256_extractf128_si256(middleInput, MIDDLE_HALF_INDEX);
ALIGN16 __m128i rightPart = _mm256_extractf128_si256(rightInput , EDGE_HALF_INDEX );
// Combine two 128-bit extracts into a whole 256-bit extract.
return _mm256_set_m128i(
_mm_alignr_epi8(leftPart, middlePart, INNER_OFFSET),
_mm_alignr_epi8(middlePart, rightPart, INNER_OFFSET)
);
// Make two 128-bit vector extractions.
ALIGN16 __m128i leftResult = _mm_alignr_epi8(leftPart, middlePart, INNER_OFFSET);
ALIGN16 __m128i rightResult = _mm_alignr_epi8(middlePart, rightPart, INNER_OFFSET);
// Combine the results.
ALIGN32 __m256i result = _mm256_set_m128i(leftResult, rightResult);
return result;
}
#define VECTOR_EXTRACT_GENERATOR_256(OFFSET, A, B) \
impl_extractBytes_AVX2< \
Expand Down
6 changes: 3 additions & 3 deletions Source/test/tests/DataLoopTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
START_TEST(DataLoop)
// Allocate aligned memory
const int elements = 256;
int32_t allocationA[elements] ALIGN16;
int32_t allocationB[elements] ALIGN16;
int32_t allocationC[elements] ALIGN16;
ALIGN16 int32_t allocationA[elements];
ALIGN16 int32_t allocationB[elements];
ALIGN16 int32_t allocationC[elements];
// The SafePointer class will emulate the behaviour of a raw data pointer while providing full bound checks in debug mode.
SafePointer<int32_t> bufferA("bufferA", allocationA, sizeof(allocationA));
SafePointer<int32_t> bufferB("bufferB", allocationB, sizeof(allocationB));
Expand Down
10 changes: 5 additions & 5 deletions Source/test/tests/SimdTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@
ALIGN16 __m128i middlePart = _mm256_extractf128_si256(middleInput, MIDDLE_HALF_INDEX);
stateName = U"impl_extractBytes_AVX2_TEST: _mm256_extractf128_si256 RIGHT 3.\n";
ALIGN16 __m128i rightPart = _mm256_extractf128_si256(rightInput , EDGE_HALF_INDEX );
stateName = U"impl_extractBytes_AVX2_TEST: _mm256_set_m128i 4.\n";
return _mm256_set_m128i(
_mm_alignr_epi8(leftPart, middlePart, INNER_OFFSET),
_mm_alignr_epi8(middlePart, rightPart, INNER_OFFSET)
);
stateName = U"impl_extractBytes_AVX2_TEST: _mm256_set_m128i 4.\n"; // Crashing here on the server.
ALIGN16 __m128i leftResult = _mm_alignr_epi8(leftPart, middlePart, INNER_OFFSET);
ALIGN16 __m128i rightResult = _mm_alignr_epi8(middlePart, rightPart, INNER_OFFSET);
ALIGN32 __m256i result = _mm256_set_m128i(leftResult, rightResult);
return result;
}
#define VECTOR_EXTRACT_GENERATOR_256_TEST(OFFSET, A, B) \
impl_extractBytes_AVX2_TEST< \
Expand Down
11 changes: 6 additions & 5 deletions Source/test/tests/TextureTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ stateName = U"readAlignedUnsafe_U32x8_test A.\n";
if (uintptr_t(data) & 31u) { throwError(U"Unaligned pointer detected in U32x8::readAlignedUnsafe!\n"); }
#endif
#if defined(USE_AVX2)
stateName = U"readAlignedUnsafe_U32x8_test B AVX2.\n";
return U32x8(_mm256_load_si256((const __m256i*)data));
stateName = string_combine(U"readAlignedUnsafe_U32x8_test B AVX2. source pointer = ", uintptr_t(data), U"\n");
ALIGN32 __m256i result = _mm256_load_si256((const __m256i*)data);
return U32x8(result);
#else
stateName = U"readAlignedUnsafe_U32x8_test B SCALAR.\n";
return U32x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
Expand All @@ -32,9 +33,9 @@ stateName = U"shiftRight_test C.\n";
#endif
#if defined(USE_AVX2)
stateName = U"shiftRight_test D AVX2.\n";
ALIGN_BYTES(alignof(U32x8)) uint32_t lanesA[8];
ALIGN_BYTES(alignof(U32x8)) uint32_t lanesB[8];
ALIGN_BYTES(alignof(U32x8)) uint32_t lanesR[8];
ALIGN_BYTES(sizeof(U32x8)) uint32_t lanesA[8];
ALIGN_BYTES(sizeof(U32x8)) uint32_t lanesB[8];
ALIGN_BYTES(sizeof(U32x8)) uint32_t lanesR[8];
stateName = U"shiftRight_test E.\n";
left.writeAlignedUnsafe(&(lanesA[0]));
stateName = U"shiftRight_test F.\n";
Expand Down

0 comments on commit 5ff843b

Please sign in to comment.