Skip to content

Commit

Permalink
Remove f16_f32_vcvt_params struct
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 658171197
  • Loading branch information
dsharletg authored and xnnpack-bot committed Aug 13, 2024
1 parent 9ddeb74 commit 1a1874b
Show file tree
Hide file tree
Showing 96 changed files with 1,123 additions and 998 deletions.
132 changes: 52 additions & 80 deletions bench/f16-f32-vcvt.cc

Large diffs are not rendered by default.

8 changes: 2 additions & 6 deletions bench/vcvt-benchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ namespace {
static void f16_f32_vcvt(
benchmark::State& state,
xnn_f16_f32_vcvt_ukernel_fn cvt,
xnn_init_f16_f32_cvt_params_fn init_params = nullptr,
void* /*init_params*/ = nullptr,
benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
if (isa_check && !isa_check(state)) {
Expand All @@ -45,12 +45,8 @@ static void f16_f32_vcvt(
std::generate(x.begin(), x.end(), std::ref(f16rng));
std::fill(y.begin(), y.end(), std::nanf(""));

xnn_f16_f32_cvt_params params;
if (init_params != nullptr) {
init_params(&params);
}
for (auto _ : state) {
cvt(num_elements * sizeof(uint16_t), x.data(), y.data(), &params);
cvt(num_elements * sizeof(uint16_t), x.data(), y.data(), nullptr);
}

const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
Expand Down
21 changes: 14 additions & 7 deletions src/amalgam/gen/avx.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,26 @@ void xnn_f16_f32_vcvt_ukernel__avx_int16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000));
const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000));
const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f);
const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00));
const __m128 vmagic_bias = _mm_set1_ps(0.5f);
const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400));

XNN_FORCE_REALIZATION(vsign_mask);
XNN_FORCE_REALIZATION(vexp_offset);
XNN_FORCE_REALIZATION(vexp_scale);
XNN_FORCE_REALIZATION(vmagic_mask);
XNN_FORCE_REALIZATION(vmagic_bias);
XNN_FORCE_REALIZATION(vdenorm_cutoff);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
Expand Down
2 changes: 1 addition & 1 deletion src/amalgam/gen/avx512skx.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ void xnn_f16_f32_vcvt_ukernel__avx512skx_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
const void* params)
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand Down
2 changes: 1 addition & 1 deletion src/amalgam/gen/f16c.c
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ void xnn_f16_f32_vcvt_ukernel__f16c_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand Down
18 changes: 12 additions & 6 deletions src/amalgam/gen/neon.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,18 +55,24 @@ void xnn_f16_f32_vcvt_ukernel__neon_int16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const uint16x8_t vsign_mask = vmovq_n_u16(0x8000);
const uint16x8_t vexp_offset = vmovq_n_u16(0x7000);
const float32x4_t vexp_scale = vld1q_dup_f32(&params->neon.exp_scale);
const uint32x4_t vmagic_bias = vmovq_n_u32(0x3F000000);
const uint16x8_t vdenorm_cutoff = vmovq_n_u16(0x0400);
const uint16x8_t vsign_mask = vmovq_n_u16(UINT16_C(0x8000));
const uint16x8_t vexp_offset = vmovq_n_u16(UINT16_C(0x7000));
const float32x4_t vexp_scale = vmovq_n_f32(0x1.0p-112f);
const uint32x4_t vmagic_bias = vmovq_n_u32(UINT32_C(0x3F000000));
const uint16x8_t vdenorm_cutoff = vmovq_n_u16(UINT16_C(0x0400));

XNN_FORCE_REALIZATION(vsign_mask);
XNN_FORCE_REALIZATION(vexp_offset);
XNN_FORCE_REALIZATION(vexp_scale);
XNN_FORCE_REALIZATION(vmagic_bias);
XNN_FORCE_REALIZATION(vdenorm_cutoff);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
Expand Down
2 changes: 1 addition & 1 deletion src/amalgam/gen/neonfp16.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ void xnn_f16_f32_vcvt_ukernel__neonfp16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand Down
28 changes: 14 additions & 14 deletions src/amalgam/gen/scalar.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,19 +57,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u1(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
const void* params)
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const uint32_t vsign_mask = params->scalar.sign_mask;
const uint32_t vexp_offset = params->scalar.exp_offset;
const float vexp_scale = params->scalar.exp_scale;
const uint32_t vmagic_mask = params->scalar.magic_mask;
const float vmagic_bias = params->scalar.magic_bias;
const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff;
const uint32_t vsign_mask = 0x80000000;
const uint32_t vexp_offset = 0x70000000;
const float vexp_scale = 0x1.0p-112f;
const uint32_t vmagic_mask = 0x3F000000;
const float vmagic_bias = 0.5f;
const uint32_t vdenorm_cutoff = 0x08000000;

const uint16_t* i = (const uint16_t*) input;
uint32_t* o = (uint32_t*) output;
Expand All @@ -93,19 +93,19 @@ void xnn_f16_f32_vcvt_ukernel__scalar_u4(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
const void* params)
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const uint32_t vsign_mask = params->scalar.sign_mask;
const uint32_t vexp_offset = params->scalar.exp_offset;
const float vexp_scale = params->scalar.exp_scale;
const uint32_t vmagic_mask = params->scalar.magic_mask;
const float vmagic_bias = params->scalar.magic_bias;
const uint32_t vdenorm_cutoff = params->scalar.denorm_cutoff;
const uint32_t vsign_mask = 0x80000000;
const uint32_t vexp_offset = 0x70000000;
const float vexp_scale = 0x1.0p-112f;
const uint32_t vmagic_mask = 0x3F000000;
const float vmagic_bias = 0.5f;
const uint32_t vdenorm_cutoff = 0x08000000;

const uint16_t* i = (const uint16_t*) input;
uint32_t* o = (uint32_t*) output;
Expand Down
21 changes: 14 additions & 7 deletions src/amalgam/gen/sse2.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,19 +48,26 @@ void xnn_f16_f32_vcvt_ukernel__sse2_int16_u32(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000));
const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000));
const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f);
const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00));
const __m128 vmagic_bias = _mm_set1_ps(0.5f);
const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400));

XNN_FORCE_REALIZATION(vsign_mask);
XNN_FORCE_REALIZATION(vexp_offset);
XNN_FORCE_REALIZATION(vexp_scale);
XNN_FORCE_REALIZATION(vmagic_mask);
XNN_FORCE_REALIZATION(vmagic_bias);
XNN_FORCE_REALIZATION(vdenorm_cutoff);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) {
Expand Down
21 changes: 14 additions & 7 deletions src/amalgam/gen/sse41.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,26 @@ void xnn_f16_f32_vcvt_ukernel__sse41_int16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
const __m128i vsign_mask = _mm_set1_epi16(UINT16_C(0x8000));
const __m128i vexp_offset = _mm_set1_epi16(UINT16_C(0x7000));
const __m128 vexp_scale = _mm_set1_ps(0x1.0p-112f);
const __m128i vmagic_mask = _mm_set1_epi16(UINT16_C(0x3F00));
const __m128 vmagic_bias = _mm_set1_ps(0.5f);
const __m128i vdenorm_cutoff = _mm_set1_epi16(UINT16_C(0x0400));

XNN_FORCE_REALIZATION(vsign_mask);
XNN_FORCE_REALIZATION(vexp_offset);
XNN_FORCE_REALIZATION(vexp_scale);
XNN_FORCE_REALIZATION(vmagic_mask);
XNN_FORCE_REALIZATION(vmagic_bias);
XNN_FORCE_REALIZATION(vdenorm_cutoff);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
Expand Down
21 changes: 14 additions & 7 deletions src/amalgam/gen/wasmrelaxedsimd.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,19 +31,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask);
const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset);
const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale);
const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask);
const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias);
const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff);
const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000));
const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000));
const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f);
const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00));
const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f);
const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400));

XNN_FORCE_REALIZATION(vsign_mask);
XNN_FORCE_REALIZATION(vexp_offset);
XNN_FORCE_REALIZATION(vexp_scale);
XNN_FORCE_REALIZATION(vmagic_mask);
XNN_FORCE_REALIZATION(vmagic_bias);
XNN_FORCE_REALIZATION(vdenorm_cutoff);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
Expand Down
21 changes: 14 additions & 7 deletions src/amalgam/gen/wasmsimd.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,19 +55,26 @@ void xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
assert(input != NULL);
assert(output != NULL);

const v128_t vsign_mask = wasm_v128_load64_splat(params->wasmsimd_int16.sign_mask);
const v128_t vexp_offset = wasm_v128_load64_splat(params->wasmsimd_int16.exp_offset);
const v128_t vexp_scale = wasm_v128_load64_splat(params->wasmsimd_int16.exp_scale);
const v128_t vmagic_mask = wasm_v128_load64_splat(params->wasmsimd_int16.magic_mask);
const v128_t vmagic_bias = wasm_v128_load64_splat(params->wasmsimd_int16.magic_bias);
const v128_t vdenorm_cutoff = wasm_v128_load64_splat(params->wasmsimd_int16.denorm_cutoff);
const v128_t vsign_mask = wasm_u16x8_const_splat(UINT16_C(0x8000));
const v128_t vexp_offset = wasm_u16x8_const_splat(UINT16_C(0x7000));
const v128_t vexp_scale = wasm_f32x4_const_splat(0x1.0p-112f);
const v128_t vmagic_mask = wasm_u16x8_const_splat(UINT16_C(0x3F00));
const v128_t vmagic_bias = wasm_f32x4_const_splat(0.5f);
const v128_t vdenorm_cutoff = wasm_u16x8_const_splat(UINT16_C(0x0400));

XNN_FORCE_REALIZATION(vsign_mask);
XNN_FORCE_REALIZATION(vexp_offset);
XNN_FORCE_REALIZATION(vexp_scale);
XNN_FORCE_REALIZATION(vmagic_mask);
XNN_FORCE_REALIZATION(vmagic_bias);
XNN_FORCE_REALIZATION(vdenorm_cutoff);

const uint16_t* i = (const uint16_t*) input;
for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
Expand Down
10 changes: 0 additions & 10 deletions src/configs/unary-elementwise-config.c
Original file line number Diff line number Diff line change
Expand Up @@ -520,12 +520,10 @@ static void init_f16_to_f32_cvt_config(void) {
f16_to_f32_cvt_config.element_tile = 16;
} else {
f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__neon_int16_u16;
f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_neon_params;
f16_to_f32_cvt_config.element_tile = 16;
}
} else if (!XNN_PLATFORM_MOBILE) {
f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u4;
f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params;
f16_to_f32_cvt_config.element_tile = 4;
}
#elif XNN_ARCH_ARM64
Expand All @@ -542,38 +540,30 @@ static void init_f16_to_f32_cvt_config(void) {
f16_to_f32_cvt_config.element_tile = 16;
} else if (hardware_config->use_x86_avx) {
f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__avx_int16_u16;
f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params;
f16_to_f32_cvt_config.element_tile = 16;
} else if (hardware_config->use_x86_sse4_1) {
f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__sse41_int16_u16;
f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params;
f16_to_f32_cvt_config.element_tile = 16;
} else {
f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__sse2_int16_u32;
f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_sse_int16_params;
f16_to_f32_cvt_config.element_tile = 32;
}
#elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
#if XNN_ARCH_WASMRELAXEDSIMD
f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__wasmrelaxedsimd_int16_u16;
f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params;
f16_to_f32_cvt_config.element_tile = 16;
#else
f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_u16;
f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_wasmsimd_int16_params;
f16_to_f32_cvt_config.element_tile = 16;
#endif
#elif XNN_ARCH_WASM
f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u1;
f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params;
f16_to_f32_cvt_config.element_tile = 1;
#elif XNN_ARCH_RISCV
f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u4;
f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params;
f16_to_f32_cvt_config.element_tile = 4;
#else
f16_to_f32_cvt_config.ukernel = (xnn_vunary_ukernel_fn) xnn_f16_f32_vcvt_ukernel__scalar_u4;
f16_to_f32_cvt_config.init.f16_f32_cvt = xnn_init_f16_f32_cvt_scalar_params;
f16_to_f32_cvt_config.element_tile = 4;
#endif
}
Expand Down
2 changes: 1 addition & 1 deletion src/f16-f32-vcvt/avx512skx.c.in
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ void xnn_f16_f32_vcvt_ukernel__avx512skx_u${BATCH_TILE}(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
const void* params)
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand Down
2 changes: 1 addition & 1 deletion src/f16-f32-vcvt/f16c.c.in
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ void xnn_f16_f32_vcvt_ukernel__f16c_u${BATCH_TILE}(
size_t batch,
const void* input,
float* output,
const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
const void* params) XNN_OOB_READS
{
assert(batch != 0);
assert(batch % sizeof(uint16_t) == 0);
Expand Down
Loading

0 comments on commit 1a1874b

Please sign in to comment.