Skip to content

Commit

Permalink
[Core] Fix potential crash when calling ArrayMath functions on arrays
Browse files Browse the repository at this point in the history
with unaligned start addresses.

Fixes: #351
  • Loading branch information
lakulish committed May 16, 2024
1 parent d2c8ec6 commit 60a8220
Show file tree
Hide file tree
Showing 3 changed files with 189 additions and 40 deletions.
203 changes: 163 additions & 40 deletions core/src/core/array_math.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,19 @@ void ArrayMath::add(int size,
{
auto simdSize = size & ~3;

for (auto i = 0; i < simdSize; i += 4)
if (float4::isAligned(in1) && float4::isAligned(in2) && float4::isAligned(out))
{
float4::store(&out[i], float4::add(float4::load(&in1[i]), float4::load(&in2[i])));
for (auto i = 0; i < simdSize; i += 4)
{
float4::store(&out[i], float4::add(float4::load(&in1[i]), float4::load(&in2[i])));
}
}
else
{
for (auto i = 0; i < simdSize; i += 4)
{
float4::storeu(&out[i], float4::add(float4::loadu(&in1[i]), float4::loadu(&in2[i])));
}
}

for (auto i = simdSize; i < size; ++i)
Expand All @@ -57,9 +67,19 @@ void ArrayMath::multiply(int size,
{
auto simdSize = size & ~3;

for (auto i = 0; i < simdSize; i += 4)
if (float4::isAligned(in1) && float4::isAligned(in2) && float4::isAligned(out))
{
float4::store(&out[i], float4::mul(float4::load(&in1[i]), float4::load(&in2[i])));
for (auto i = 0; i < simdSize; i += 4)
{
float4::store(&out[i], float4::mul(float4::load(&in1[i]), float4::load(&in2[i])));
}
}
else
{
for (auto i = 0; i < simdSize; i += 4)
{
float4::storeu(&out[i], float4::mul(float4::loadu(&in1[i]), float4::loadu(&in2[i])));
}
}

for (auto i = simdSize; i < size; ++i)
Expand Down Expand Up @@ -88,19 +108,39 @@ void ArrayMath::multiply(int size,

#if (defined(IPL_CPU_X86) || defined(IPL_CPU_X64))

for (auto i = 0U; i < simdArraySizeAsReal; i += 4)
if (float4::isAligned(in1) && float4::isAligned(in2) && float4::isAligned(out))
{
auto x1 = float4::load(&in1Data[i]);
auto x2 = float4::load(&in2Data[i]);
for (auto i = 0U; i < simdArraySizeAsReal; i += 4)
{
auto x1 = float4::load(&in1Data[i]);
auto x2 = float4::load(&in2Data[i]);

auto b0 = float4::set(-1.0f, 1.0f, -1.0f, 1.0f);
auto b1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(2, 2, 0, 0));
auto b3 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(3, 3, 1, 1));
auto b4 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(2, 3, 0, 1));
auto b0 = float4::set(-1.0f, 1.0f, -1.0f, 1.0f);
auto b1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(2, 2, 0, 0));
auto b3 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(3, 3, 1, 1));
auto b4 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(2, 3, 0, 1));

auto y = float4::add(float4::mul(b1, x2), float4::mul(b0, float4::mul(b3, b4)));
auto y = float4::add(float4::mul(b1, x2), float4::mul(b0, float4::mul(b3, b4)));

float4::store(&outData[i], y);
float4::store(&outData[i], y);
}
}
else
{
for (auto i = 0U; i < simdArraySizeAsReal; i += 4)
{
auto x1 = float4::loadu(&in1Data[i]);
auto x2 = float4::loadu(&in2Data[i]);

auto b0 = float4::set(-1.0f, 1.0f, -1.0f, 1.0f);
auto b1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(2, 2, 0, 0));
auto b3 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(3, 3, 1, 1));
auto b4 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(2, 3, 0, 1));

auto y = float4::add(float4::mul(b1, x2), float4::mul(b0, float4::mul(b3, b4)));

float4::storeu(&outData[i], y);
}
}

#elif (defined(IPL_CPU_ARMV7) || defined(IPL_CPU_ARM64))
Expand Down Expand Up @@ -137,15 +177,31 @@ void ArrayMath::multiplyAccumulate(int size,
{
auto simdSize = size & ~3;

for (auto i = 0; i < simdSize; i += 4)
if (float4::isAligned(in1) && float4::isAligned(in2) && float4::isAligned(accum))
{
for (auto i = 0; i < simdSize; i += 4)
{
auto x1 = float4::load(&in1[i]);
auto x2 = float4::load(&in2[i]);
auto y = float4::load(&accum[i]);

y = float4::add(y, float4::mul(x1, x2));

float4::store(&accum[i], y);
}
}
else
{
auto x1 = float4::load(&in1[i]);
auto x2 = float4::load(&in2[i]);
auto y = float4::load(&accum[i]);
for (auto i = 0; i < simdSize; i += 4)
{
auto x1 = float4::loadu(&in1[i]);
auto x2 = float4::loadu(&in2[i]);
auto y = float4::loadu(&accum[i]);

y = float4::add(y, float4::mul(x1, x2));
y = float4::add(y, float4::mul(x1, x2));

float4::store(&accum[i], y);
float4::storeu(&accum[i], y);
}
}

for (auto i = simdSize; i < size; ++i)
Expand All @@ -168,21 +224,43 @@ void ArrayMath::multiplyAccumulate(int size,

#if (defined(IPL_CPU_X86) || defined(IPL_CPU_X64))

for (auto i = 0U; i < simdArraySizeAsReal; i += 4)
if (float4::isAligned(in1) && float4::isAligned(in2) && float4::isAligned(accum))
{
auto x1 = float4::load(&in1Data[i]);
auto x2 = float4::load(&in2Data[i]);
for (auto i = 0U; i < simdArraySizeAsReal; i += 4)
{
auto x1 = float4::load(&in1Data[i]);
auto x2 = float4::load(&in2Data[i]);

auto b0 = float4::set(-1.0f, 1.0f, -1.0f, 1.0f);
auto b1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(2, 2, 0, 0));
auto b3 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(3, 3, 1, 1));
auto b4 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(2, 3, 0, 1));
auto b0 = float4::set(-1.0f, 1.0f, -1.0f, 1.0f);
auto b1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(2, 2, 0, 0));
auto b3 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(3, 3, 1, 1));
auto b4 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(2, 3, 0, 1));

auto y = float4::add(float4::mul(b1, x2), float4::mul(b0, float4::mul(b3, b4)));
auto y = float4::add(float4::mul(b1, x2), float4::mul(b0, float4::mul(b3, b4)));

y = float4::add(y, float4::load(&outData[i]));
y = float4::add(y, float4::load(&outData[i]));

float4::store(&outData[i], y);
float4::store(&outData[i], y);
}
}
else
{
    // Unaligned path: at least one of in1, in2 or accum failed the
    // float4::isAligned check, so every memory access here must use the
    // unaligned loadu/storeu variants -- including the read of the running
    // accumulator from outData, which is the same address storeu writes to.
    for (auto i = 0U; i < simdArraySizeAsReal; i += 4)
    {
        auto x1 = float4::loadu(&in1Data[i]);
        auto x2 = float4::loadu(&in2Data[i]);

        // b1 duplicates the even lanes of x1, b3 duplicates the odd lanes of
        // x1, and b4 swaps adjacent lanes of x2. b0 is an alternating sign
        // mask. NOTE(review): presumably the data is interleaved (re, im)
        // pairs and this forms the complex product x1 * x2 -- confirm
        // against the scalar tail loop.
        auto b0 = float4::set(-1.0f, 1.0f, -1.0f, 1.0f);
        auto b1 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(2, 2, 0, 0));
        auto b3 = _mm_shuffle_ps(x1, x1, _MM_SHUFFLE(3, 3, 1, 1));
        auto b4 = _mm_shuffle_ps(x2, x2, _MM_SHUFFLE(2, 3, 0, 1));

        auto y = float4::add(float4::mul(b1, x2), float4::mul(b0, float4::mul(b3, b4)));

        // Accumulate onto the existing output. This must be loadu: an
        // aligned load here can fault when accum is not 16-byte aligned.
        y = float4::add(y, float4::loadu(&outData[i]));

        float4::storeu(&outData[i], y);
    }
}

#elif (defined(IPL_CPU_ARMV7) || defined(IPL_CPU_ARM64))
Expand Down Expand Up @@ -217,9 +295,19 @@ void ArrayMath::scale(int size,
auto simdSize = size & ~3;
auto simdScalar = float4::set1(scalar);

for (auto i = 0; i < simdSize; i += 4)
if (float4::isAligned(in) && float4::isAligned(out))
{
for (auto i = 0; i < simdSize; i += 4)
{
float4::store(&out[i], float4::mul(float4::load(&in[i]), simdScalar));
}
}
else
{
float4::store(&out[i], float4::mul(float4::load(&in[i]), simdScalar));
for (auto i = 0; i < simdSize; i += 4)
{
float4::storeu(&out[i], float4::mul(float4::loadu(&in[i]), simdScalar));
}
}

for (auto i = simdSize; i < size; ++i)
Expand All @@ -244,14 +332,29 @@ void ArrayMath::scaleAccumulate(int size,
auto simdSize = size & ~3;
auto simdScalar = float4::set1(scalar);

for (auto i = 0; i < simdSize; i += 4)
if (float4::isAligned(in) && float4::isAligned(out))
{
for (auto i = 0; i < simdSize; i += 4)
{
auto x = float4::load(&in[i]);
auto y = float4::load(&out[i]);

y = float4::add(y, float4::mul(x, simdScalar));

float4::store(&out[i], y);
}
}
else
{
auto x = float4::load(&in[i]);
auto y = float4::load(&out[i]);
for (auto i = 0; i < simdSize; i += 4)
{
auto x = float4::loadu(&in[i]);
auto y = float4::loadu(&out[i]);

y = float4::add(y, float4::mul(x, simdScalar));
y = float4::add(y, float4::mul(x, simdScalar));

float4::store(&out[i], y);
float4::storeu(&out[i], y);
}
}

for (auto i = simdSize; i < size; ++i)
Expand All @@ -268,9 +371,19 @@ void ArrayMath::addConstant(int size,
auto simdSize = size & ~3;
auto simdConstant = float4::set1(constant);

for (auto i = 0; i < simdSize; i += 4)
if (float4::isAligned(in) && float4::isAligned(out))
{
for (auto i = 0; i < simdSize; i += 4)
{
float4::store(&out[i], float4::add(float4::load(&in[i]), simdConstant));
}
}
else
{
float4::store(&out[i], float4::add(float4::load(&in[i]), simdConstant));
for (auto i = 0; i < simdSize; i += 4)
{
float4::storeu(&out[i], float4::add(float4::loadu(&in[i]), simdConstant));
}
}

for (auto i = simdSize; i < size; ++i)
Expand Down Expand Up @@ -315,9 +428,19 @@ void ArrayMath::threshold(int size,
auto simdSize = size & ~3;
auto simdMinValue = float4::set1(minValue);

for (auto i = 0; i < simdSize; i += 4)
if (float4::isAligned(in) && float4::isAligned(out))
{
for (auto i = 0; i < simdSize; i += 4)
{
float4::store(&out[i], float4::max(float4::load(&in[i]), simdMinValue));
}
}
else
{
float4::store(&out[i], float4::max(float4::load(&in[i]), simdMinValue));
for (auto i = 0; i < simdSize; i += 4)
{
float4::storeu(&out[i], float4::max(float4::loadu(&in[i]), simdMinValue));
}
}

for (auto i = simdSize; i < size; ++i)
Expand Down
13 changes: 13 additions & 0 deletions core/src/core/float4.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,19 @@
#include "neon_float4.h"
#endif

namespace ipl {

namespace float4
{
    // Returns true when the address held by p is a multiple of 16 bytes,
    // i.e. its low four bits are all zero. The pointer is never dereferenced.
    template <typename T>
    inline bool isAligned(const T* p)
    {
        const auto address = reinterpret_cast<size_t>(p);
        const auto misalignment = address & 0xf;
        return misalignment == 0;
    }
}

}

#if defined(IPL_OS_WINDOWS)

namespace ipl {
Expand Down
13 changes: 13 additions & 0 deletions core/src/core/float8.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,16 @@
#endif

#endif

namespace ipl {

namespace float8
{
    // Returns true when the address held by p is a multiple of 32 bytes,
    // i.e. its low five bits are all zero. The pointer is never dereferenced.
    template <typename T>
    inline bool isAligned(const T* p)
    {
        const auto address = reinterpret_cast<size_t>(p);
        const auto misalignment = address & 0x1f;
        return misalignment == 0;
    }
}

}

0 comments on commit 60a8220

Please sign in to comment.