Skip to content

Commit

Permalink
Making sure that vector extract generates immediate indices even when…
Browse files Browse the repository at this point in the history
… optimizations are disabled.
  • Loading branch information
Dawoodoz committed Feb 9, 2025
1 parent c86667c commit 60c4d32
Showing 1 changed file with 42 additions and 22 deletions.
64 changes: 42 additions & 22 deletions Source/DFPSR/base/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -3319,44 +3319,64 @@
// Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
#if defined(USE_AVX2)
// AVX2 does not offer any 256-bit element extraction, only two 128-bit shifts done in parallel, so we might as well use two separate 128-bit extractions.
template <int OFFSET>
__m256i impl_extractBytes_AVX2(const __m256i leftInput, const __m256i rightInput) {
// To avoid crashing with non-immediate input when optimization is turned off, the half indices must be given as template arguments to remain known in compile time.
template <int INNER_OFFSET, int EDGE_HALF_INDEX, int MIDDLE_HALF_INDEX>
__m256i impl_extractBytes_AVX2(const __m256i leftInput, const __m256i middleInput, const __m256i rightInput) {
// Extract three halves depending on which ones overlap with the offset.
__m128i leftPart = _mm256_extractf128_si256(leftInput, (OFFSET < 16) ? 0 : 1);
__m128i middlePart = _mm256_extractf128_si256(OFFSET < 16 ? rightInput : leftInput, (OFFSET < 16) ? 1 : 0);
__m128i rightPart = _mm256_extractf128_si256(rightInput, (OFFSET < 16) ? 0 : 1);
ALIGN16 __m128i leftPart = _mm256_extractf128_si256(leftInput , EDGE_HALF_INDEX );
ALIGN16 __m128i middlePart = _mm256_extractf128_si256(middleInput, MIDDLE_HALF_INDEX);
ALIGN16 __m128i rightPart = _mm256_extractf128_si256(rightInput , EDGE_HALF_INDEX );
// Combine two 128-bit extracts into a whole 256-bit extract.
return _mm256_set_m128i(
_mm_alignr_epi8(leftPart, middlePart, OFFSET - ((OFFSET < 16) ? 0 : 16)),
_mm_alignr_epi8(middlePart, rightPart, OFFSET - ((OFFSET < 16) ? 0 : 16))
_mm_alignr_epi8(leftPart, middlePart, INNER_OFFSET),
_mm_alignr_epi8(middlePart, rightPart, INNER_OFFSET)
);
}
#define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return U8x32(impl_extractBytes_AVX2<OFFSET>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return U16x16(impl_extractBytes_AVX2<OFFSET * 2>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return U32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return I32x8(impl_extractBytes_AVX2<OFFSET * 4>(b.v, a.v));
#define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return F32x8(SIMD_F32x8(impl_extractBytes_AVX2<OFFSET * 4>(SIMD_U32x8(b.v), SIMD_U32x8(a.v))));
#define VECTOR_EXTRACT_GENERATOR_256(OFFSET) \
impl_extractBytes_AVX2< \
(OFFSET) - ((OFFSET) < 16 ? 0 : 16), \
(OFFSET) < 16 ? 0 : 1, \
(OFFSET) < 16 ? 1 : 0 \
> \
( \
__m256i(b.v), \
(OFFSET) < 16 ? a.v : b.v, \
__m256i(a.v) \
)
#define VECTOR_EXTRACT_GENERATOR_256_U8( OFFSET) return U8x32(VECTOR_EXTRACT_GENERATOR_256(OFFSET));
#define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return U16x16(VECTOR_EXTRACT_GENERATOR_256(OFFSET * 2));
#define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return U32x8(VECTOR_EXTRACT_GENERATOR_256(OFFSET * 4));
#define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return I32x8(VECTOR_EXTRACT_GENERATOR_256(OFFSET * 4));
#define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return F32x8(VECTOR_EXTRACT_GENERATOR_256(OFFSET * 4));
#else
template<typename T, int elementCount>
T impl_vectorExtract_emulated(const T &a, const T &b, int offset) {
// For safety reasons, uninitialized default construction of SIMD vectors is only available during full emulation, not partial AVX support.
// TODO: Implement bound checks for scalars in debug mode. A static index can be checked in compile time.
template<typename T, int ELEMENT_COUNT, int OFFSET>
T impl_vectorExtract_emulated(const T &a, const T &b) {
static_assert(0 <= OFFSET && OFFSET <= ELEMENT_COUNT, "Offset is out of bound in impl_vectorExtract_emulated!\n");
static_assert(sizeof(a.scalars) == sizeof(a.scalars[0]) * ELEMENT_COUNT, "A does not match the element count in impl_vectorExtract_emulated!\n");
static_assert(sizeof(b.scalars) == sizeof(b.scalars[0]) * ELEMENT_COUNT, "B does not match the element count in impl_vectorExtract_emulated!\n");
T result = T::create_dangerous_uninitialized();
static_assert(sizeof(result.scalars) == sizeof(result.scalars[0]) * ELEMENT_COUNT, "The result does not match the element count in impl_vectorExtract_emulated!\n");
int t = 0;
for (int s = offset; s < elementCount; s++) {
for (int s = OFFSET; s < ELEMENT_COUNT; s++) {
assert(0 <= s && s < ELEMENT_COUNT);
assert(0 <= t && t < ELEMENT_COUNT);
result.scalars[t] = a.scalars[s];
t++;
}
for (int s = 0; s < offset; s++) {
for (int s = 0; s < OFFSET; s++) {
assert(0 <= s && s < ELEMENT_COUNT);
assert(0 <= t && t < ELEMENT_COUNT);
result.scalars[t] = b.scalars[s];
t++;
}
return result;
}
#define VECTOR_EXTRACT_GENERATOR_256_U8(OFFSET) return impl_vectorExtract_emulated<U8x32, 32>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return impl_vectorExtract_emulated<U16x16, 16>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return impl_vectorExtract_emulated<U32x8, 8>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return impl_vectorExtract_emulated<I32x8, 8>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return impl_vectorExtract_emulated<F32x8, 8>(a, b, OFFSET);
#define VECTOR_EXTRACT_GENERATOR_256_U8( OFFSET) return impl_vectorExtract_emulated< U8x32, 32, OFFSET>(a, b);
#define VECTOR_EXTRACT_GENERATOR_256_U16(OFFSET) return impl_vectorExtract_emulated<U16x16, 16, OFFSET>(a, b);
#define VECTOR_EXTRACT_GENERATOR_256_U32(OFFSET) return impl_vectorExtract_emulated< U32x8, 8, OFFSET>(a, b);
#define VECTOR_EXTRACT_GENERATOR_256_I32(OFFSET) return impl_vectorExtract_emulated< I32x8, 8, OFFSET>(a, b);
#define VECTOR_EXTRACT_GENERATOR_256_F32(OFFSET) return impl_vectorExtract_emulated< F32x8, 8, OFFSET>(a, b);
#endif

// Vector extraction concatunates two input vectors and reads a vector between them using an offset.
Expand Down

0 comments on commit 60c4d32

Please sign in to comment.