Skip to content

Commit

Permalink
Removed unnecessary min and max calls for rdsum avx512.
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 660299297
  • Loading branch information
Misha Gutman authored and xnnpack-bot committed Aug 7, 2024
1 parent 1cda6cd commit 1ecee10
Show file tree
Hide file tree
Showing 5 changed files with 0 additions and 20 deletions.
4 changes: 0 additions & 4 deletions src/f32-rdsum/avx512.c.in
Original file line number Diff line number Diff line change
Expand Up @@ -126,17 +126,13 @@ void xnn_f32_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__avx512f_c${CHANNELS
}
for (int i = 0; i < channels >> 4; ++i) {
vacc[i] = _mm512_add_ps(vo[i], vacc[i]);
- vacc[i] = _mm512_max_ps(vacc[i], vmin);
- vacc[i] = _mm512_min_ps(vacc[i], vmax);
}
for (int i = 0; i < channels >> 4; ++i) {
_mm512_storeu_ps(output, vacc[i]); output += 16;
}
if (remainder) {
const size_t pos = num_full_chunks;
__m512 vout = vacc[pos];
- vout = _mm512_max_ps(vout, vmin);
- vout = _mm512_min_ps(vout, vmax);
vout = _mm512_maskz_add_ps(vmask, vout, _mm512_maskz_loadu_ps(vmask, output));
_mm512_mask_storeu_ps(output, vmask, vout);
}
Expand Down
4 changes: 0 additions & 4 deletions src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c128.c
Original file line number Diff line number Diff line change
Expand Up @@ -341,17 +341,13 @@ void xnn_f32_rdsum_ukernel_7p7x__avx512f_c128(
}
for (int i = 0; i < channels >> 4; ++i) {
vacc[i] = _mm512_add_ps(vo[i], vacc[i]);
- vacc[i] = _mm512_max_ps(vacc[i], vmin);
- vacc[i] = _mm512_min_ps(vacc[i], vmax);
}
for (int i = 0; i < channels >> 4; ++i) {
_mm512_storeu_ps(output, vacc[i]); output += 16;
}
if (remainder) {
const size_t pos = num_full_chunks;
__m512 vout = vacc[pos];
- vout = _mm512_max_ps(vout, vmin);
- vout = _mm512_min_ps(vout, vmax);
vout = _mm512_maskz_add_ps(vmask, vout, _mm512_maskz_loadu_ps(vmask, output));
_mm512_mask_storeu_ps(output, vmask, vout);
}
Expand Down
4 changes: 0 additions & 4 deletions src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c16.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,17 +180,13 @@ void xnn_f32_rdsum_ukernel_7p7x__avx512f_c16(
}
for (int i = 0; i < channels >> 4; ++i) {
vacc[i] = _mm512_add_ps(vo[i], vacc[i]);
- vacc[i] = _mm512_max_ps(vacc[i], vmin);
- vacc[i] = _mm512_min_ps(vacc[i], vmax);
}
for (int i = 0; i < channels >> 4; ++i) {
_mm512_storeu_ps(output, vacc[i]); output += 16;
}
if (remainder) {
const size_t pos = num_full_chunks;
__m512 vout = vacc[pos];
- vout = _mm512_max_ps(vout, vmin);
- vout = _mm512_min_ps(vout, vmax);
vout = _mm512_maskz_add_ps(vmask, vout, _mm512_maskz_loadu_ps(vmask, output));
_mm512_mask_storeu_ps(output, vmask, vout);
}
Expand Down
4 changes: 0 additions & 4 deletions src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c32.c
Original file line number Diff line number Diff line change
Expand Up @@ -203,17 +203,13 @@ void xnn_f32_rdsum_ukernel_7p7x__avx512f_c32(
}
for (int i = 0; i < channels >> 4; ++i) {
vacc[i] = _mm512_add_ps(vo[i], vacc[i]);
- vacc[i] = _mm512_max_ps(vacc[i], vmin);
- vacc[i] = _mm512_min_ps(vacc[i], vmax);
}
for (int i = 0; i < channels >> 4; ++i) {
_mm512_storeu_ps(output, vacc[i]); output += 16;
}
if (remainder) {
const size_t pos = num_full_chunks;
__m512 vout = vacc[pos];
- vout = _mm512_max_ps(vout, vmin);
- vout = _mm512_min_ps(vout, vmax);
vout = _mm512_maskz_add_ps(vmask, vout, _mm512_maskz_loadu_ps(vmask, output));
_mm512_mask_storeu_ps(output, vmask, vout);
}
Expand Down
4 changes: 0 additions & 4 deletions src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx512f-c64.c
Original file line number Diff line number Diff line change
Expand Up @@ -249,17 +249,13 @@ void xnn_f32_rdsum_ukernel_7p7x__avx512f_c64(
}
for (int i = 0; i < channels >> 4; ++i) {
vacc[i] = _mm512_add_ps(vo[i], vacc[i]);
- vacc[i] = _mm512_max_ps(vacc[i], vmin);
- vacc[i] = _mm512_min_ps(vacc[i], vmax);
}
for (int i = 0; i < channels >> 4; ++i) {
_mm512_storeu_ps(output, vacc[i]); output += 16;
}
if (remainder) {
const size_t pos = num_full_chunks;
__m512 vout = vacc[pos];
- vout = _mm512_max_ps(vout, vmin);
- vout = _mm512_min_ps(vout, vmax);
vout = _mm512_maskz_add_ps(vmask, vout, _mm512_maskz_loadu_ps(vmask, output));
_mm512_mask_storeu_ps(output, vmask, vout);
}
Expand Down

0 comments on commit 1ecee10

Please sign in to comment.