Skip to content

Commit

Permalink
Remove f16_f32_vcvt_params struct
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 658171197
  • Loading branch information
dsharletg authored and xnnpack-bot committed Aug 1, 2024
1 parent c4a28da commit c97e447
Show file tree
Hide file tree
Showing 122 changed files with 783 additions and 1,168 deletions.
132 changes: 52 additions & 80 deletions bench/f16-f32-vcvt.cc

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions bench/f16-vabs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
#include "xnnpack/vunary.h"

void f16_vabs(benchmark::State& state, xnn_f16_vabs_ukernel_fn ukernel,
xnn_init_f16_abs_params_fn init_params = nullptr,
xnn_init_f16_default_params_fn init_params = nullptr,
benchmark::utils::IsaCheckFunction isa_check = nullptr) {
f16_vunary_benchmark<xnn_f16_abs_params>(
f16_vunary_benchmark<xnn_f16_default_params>(
state, ukernel,
init_params,
isa_check,
Expand All @@ -48,12 +48,12 @@ void f16_vabs(benchmark::State& state, xnn_f16_vabs_ukernel_fn ukernel,
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_vabs, sse2_u8,
xnn_f16_vabs_ukernel__sse2_u8,
xnn_init_f16_abs_sse_params)
/*init_params=*/nullptr)
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
->UseRealTime();
BENCHMARK_CAPTURE(f16_vabs, sse2_u16,
xnn_f16_vabs_ukernel__sse2_u16,
xnn_init_f16_abs_sse_params)
/*init_params=*/nullptr)
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
Expand Down
8 changes: 4 additions & 4 deletions bench/f16-vneg.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
#include "xnnpack/vunary.h"

void f16_vneg(benchmark::State& state, xnn_f16_vneg_ukernel_fn ukernel,
xnn_init_f16_neg_params_fn init_params = nullptr,
xnn_init_f16_default_params_fn init_params = nullptr,
benchmark::utils::IsaCheckFunction isa_check = nullptr) {
f16_vunary_benchmark<xnn_f16_neg_params>(
f16_vunary_benchmark<xnn_f16_default_params>(
state, ukernel,
init_params,
isa_check,
Expand All @@ -48,12 +48,12 @@ void f16_vneg(benchmark::State& state, xnn_f16_vneg_ukernel_fn ukernel,
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_vneg, sse2_u8,
xnn_f16_vneg_ukernel__sse2_u8,
xnn_init_f16_neg_sse_params)
/*init_params=*/nullptr)
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
->UseRealTime();
BENCHMARK_CAPTURE(f16_vneg, sse2_u16,
xnn_f16_vneg_ukernel__sse2_u16,
xnn_init_f16_neg_sse_params)
/*init_params=*/nullptr)
->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, uint16_t>)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
Expand Down
8 changes: 2 additions & 6 deletions bench/vcvt-benchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace {
static void f16_f32_vcvt(
benchmark::State& state,
xnn_f16_f32_vcvt_ukernel_fn cvt,
xnn_init_f16_f32_cvt_params_fn init_params = nullptr,
void* /*init_params*/ = nullptr,
benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
if (isa_check && !isa_check(state)) {
Expand All @@ -45,12 +45,8 @@ static void f16_f32_vcvt(
std::generate(x.begin(), x.end(), std::ref(f16rng));
std::fill(y.begin(), y.end(), std::nanf(""));

xnn_f16_f32_cvt_params params;
if (init_params != nullptr) {
init_params(&params);
}
for (auto _ : state) {
cvt(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
cvt(num_elements * sizeof(uint16_t), x.data(), y.data(), nullptr);
}

const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
Expand Down
14 changes: 7 additions & 7 deletions src/amalgam/gen/avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,19 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
const __m128i vsign_mask = _mm_set1_epi16(0x8000);
const __m128i vexp_offset = _mm_set1_epi16(0x7000);
const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f);
const __m128i vmagic_mask = _mm_set1_epi16(0x3F00);
const __m128 vmagic_bias = _mm_set1_ps(0.5f);
const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
Expand Down
2 changes: 1 addition & 1 deletion src/amalgam/gen/avx512skx.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__avx512skx_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
const void* params)
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand Down
2 changes: 1 addition & 1 deletion src/amalgam/gen/f16c.c
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ void xnn_f16_f32_vcvt_ukernel__f16c_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand Down
4 changes: 2 additions & 2 deletions src/amalgam/gen/neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand All @@ -64,7 +64,7 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u16(

const uint16x8_t vsign_mask = vmovq_n_u16(0x8000);
const uint16x8_t vexp_offset = vmovq_n_u16(0x7000);
const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale);
const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f);
const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000);
const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400);

Expand Down
2 changes: 1 addition & 1 deletion src/amalgam/gen/neonfp16.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ void xnn_f16_f32_vcvt_ukernel__neonfp16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand Down
4 changes: 2 additions & 2 deletions src/amalgam/gen/neonfp16arith.c
Original file line number Diff line number Diff line change
Expand Up @@ -10835,7 +10835,7 @@ void xnn_f16_vabs_ukernel__neonfp16arith_u16(
size_t batch,
const void* input,
void* output,
const union xnn_f16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand Down Expand Up @@ -10881,7 +10881,7 @@ void xnn_f16_vneg_ukernel__neonfp16arith_u16(
size_t batch,
const void* input,
void* output,
const union xnn_f16_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand Down
28 changes: 14 additions & 14 deletions src/amalgam/gen/scalar.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u1(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
const void* params)
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const uint32_t vsign_mask = params->scalar.sign_mask;
const uint32_t vexp_offset = params->scalar.exp_offset;
const float vexp_scale = params->scalar.exp_scale;
const uint32_t vmagic_mask = params->scalar.magic_mask;
const float vmagic_bias = params->scalar.magic_bias;
const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff;
const uint32_t vsign_mask = 0x80000000;
const uint32_t vexp_offset = 0x70000000;
const float vexp_scale = 0x1.0p-112f;
const uint32_t vmagic_mask = 0x3F000000;
const float vmagic_bias = 0.5f;
const uint32_t vdenorm_cutoff = 0x08000000;

const uint16_t* i = (const uint16_t*) input;
uint32_t* o = (uint32_t*) output;
Expand All @@ -93,19 +93,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u4(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
const void* params)
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const uint32_t vsign_mask = params->scalar.sign_mask;
const uint32_t vexp_offset = params->scalar.exp_offset;
const float vexp_scale = params->scalar.exp_scale;
const uint32_t vmagic_mask = params->scalar.magic_mask;
const float vmagic_bias = params->scalar.magic_bias;
const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff;
const uint32_t vsign_mask = 0x80000000;
const uint32_t vexp_offset = 0x70000000;
const float vexp_scale = 0x1.0p-112f;
const uint32_t vmagic_mask = 0x3F000000;
const float vmagic_bias = 0.5f;
const uint32_t vdenorm_cutoff = 0x08000000;

const uint16_t* i = (const uint16_t*) input;
uint32_t* o = (uint32_t*) output;
Expand Down
22 changes: 11 additions & 11 deletions src/amalgam/gen/sse2.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,19 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u32(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
const __m128i vsign_mask = _mm_set1_epi16(0x8000);
const __m128i vexp_offset = _mm_set1_epi16(0x7000);
const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f);
const __m128i vmagic_mask = _mm_set1_epi16(0x3F00);
const __m128 vmagic_bias = _mm_set1_ps(0.5f);
const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) {
Expand Down Expand Up @@ -224,7 +224,7 @@ void xnn_f16_vabs_ukernel__sse2_u16(
size_t batch,
const void* input,
void* output,
const union xnn_f16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand All @@ -233,7 +233,7 @@ void xnn_f16_vabs_ukernel__sse2_u16(

const uint16_t* i = (const uint16_t*) input;
uint16_t* o = (uint16_t*) output;
const __m128i vnonsign_mask = _mm_load_si128((const __m128i*) params->sse.nonsign_mask);
const __m128i vnonsign_mask = _mm_set1_epi16(0x7FFF);
for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
__m128i vacc0 = _mm_loadu_si128((const __m128i*) i);
__m128i vacc1 = _mm_loadu_si128((const __m128i*) (i + 8));
Expand Down Expand Up @@ -276,7 +276,7 @@ void xnn_f16_vneg_ukernel__sse2_u16(
size_t batch,
const void* input,
void* output,
const union xnn_f16_neg_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand All @@ -285,7 +285,7 @@ void xnn_f16_vneg_ukernel__sse2_u16(

const uint16_t* i = (const uint16_t*) input;
uint16_t* o = (uint16_t*) output;
const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse.sign_mask);
const __m128i vsign_mask = _mm_set1_epi16(0x8000);
for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
__m128i vacc0 = _mm_loadu_si128((const __m128i*) i);
__m128i vacc1 = _mm_loadu_si128((const __m128i*) (i + 8));
Expand Down
14 changes: 7 additions & 7 deletions src/amalgam/gen/sse41.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,19 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
const __m128i vsign_mask = _mm_set1_epi16(0x8000);
const __m128i vexp_offset = _mm_set1_epi16(0x7000);
const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f);
const __m128i vmagic_mask = _mm_set1_epi16(0x3F00);
const __m128 vmagic_bias = _mm_set1_ps(0.5f);
const __m128i vdenorm_cutoff = _mm_set1_epi16(0x0400);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
Expand Down
14 changes: 7 additions & 7 deletions src/amalgam/gen/wasmrelaxedsimd.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask);
const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset);
const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale);
const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask);
const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias);
const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff);
const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000);
const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000);
const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f);
const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00);
const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f);
const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
Expand Down
14 changes: 7 additions & 7 deletions src/amalgam/gen/wasmsimd.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,19 +55,19 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask);
const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset);
const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale);
const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask);
const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias);
const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff);
const v128_t vsign_mask = wasm_u16x8_const_splat(0x8000);
const v128_t vexp_offset = wasm_u16x8_const_splat(0x7000);
const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f);
const v128_t vmagic_mask = wasm_u16x8_const_splat(0x3F00);
const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f);
const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(0x0400);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
Expand Down
4 changes: 2 additions & 2 deletions src/bf16-vunary/gen/bf16-vabs-neonbf16-u16.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ void xnn_bf16_vabs_ukernel__neonbf16_u16(
size_t batch,
const void* input,
void* output,
const union xnn_bf16_abs_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const union xnn_bf16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(bfloat16_t) == 0);
Expand All @@ -28,7 +28,7 @@ void xnn_bf16_vabs_ukernel__neonbf16_u16(

const bfloat16_t* i = (const bfloat16_t*) input;
bfloat16_t* o = (bfloat16_t*) output;
uint16x8_t vmask = vld1q_u16(params->neon.nonsign_mask);
uint16x8_t vmask = vdupq_n_u16(0x7FFF);
for (; batch >= 16 * sizeof(bfloat16_t); batch -= 16 * sizeof(bfloat16_t)) {
const bfloat16x8_t vx01234567 = vld1q_bf16(i); i+= 8;
const bfloat16x8_t vx89ABCDEF = vld1q_bf16(i); i+= 8;
Expand Down
Loading

0 comments on commit c97e447

Please sign in to comment.