From 71487a6d7ae7658598e688e59c5a72a239460ef0 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Sat, 16 Nov 2024 16:02:23 +0100 Subject: [PATCH 01/11] implemented argmax kernel --- src/ggml-metal/ggml-metal.m | 18 ++++++++++++++++++ src/ggml-metal/ggml-metal.metal | 17 +++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/src/ggml-metal/ggml-metal.m b/src/ggml-metal/ggml-metal.m index 95b21fbf9..bb3a2b5a2 100644 --- a/src/ggml-metal/ggml-metal.m +++ b/src/ggml-metal/ggml-metal.m @@ -348,6 +348,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_SUM_ROWS, GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, + GGML_METAL_KERNEL_TYPE_ARGMAX, GGML_METAL_KERNEL_TYPE_COUNT }; @@ -869,6 +870,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true); } @@ -996,6 +998,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_OP_RMS_NORM: case GGML_OP_GROUP_NORM: return has_simdgroup_reduction; + case GGML_OP_ARGMAX: case GGML_OP_NORM: case GGML_OP_ROPE: return true; @@ -3469,6 +3472,21 @@ static void ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(n_tg, 1, 1) threadsPerThreadgroup:MTLSizeMake(n_threads, 1, 1)]; } break; + case GGML_OP_ARGMAX: + { + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + const int64_t nrows = ggml_nrows(src0); + + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGMAX].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + + [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nrows, 1, 1)]; + } break; default: { GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op)); diff --git a/src/ggml-metal/ggml-metal.metal b/src/ggml-metal/ggml-metal.metal index 8c7fcb113..a576ecdab 100644 --- a/src/ggml-metal/ggml-metal.metal +++ b/src/ggml-metal/ggml-metal.metal @@ -1344,6 +1344,23 @@ kernel void kernel_ssm_scan_f32( } } +kernel void kernel_argmax( + device const void * x, + device int32_t * dst, + constant int64_t & ncols, + uint tpitg[[thread_position_in_threadgroup]]) { + device const float * x_row = (device const float *) ((device const char *) x + tpitg * ncols * sizeof(float)); + + // initialize + dst[tpitg] = 0; + + for (int i = 0; i < ncols; i++) { + if (x_row[i] > x_row[dst[tpitg]]) { + dst[tpitg] = i; + } + } +} + kernel void kernel_norm( device const void * src0, device float * dst, From 630d65d435e656a2b02845ca671d1f9c032407f0 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Tue, 19 Nov 2024 22:09:06 +0100 Subject: [PATCH 02/11] tpig -> tgpig --- src/ggml-metal/ggml-metal.m | 2 +- src/ggml-metal/ggml-metal.metal | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ggml-metal/ggml-metal.m b/src/ggml-metal/ggml-metal.m index bb3a2b5a2..426f28387 100644 --- a/src/ggml-metal/ggml-metal.m +++ b/src/ggml-metal/ggml-metal.m @@ -3485,7 +3485,7 @@ static void ggml_metal_encode_node( [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; - [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(nrows, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; default: { diff --git a/src/ggml-metal/ggml-metal.metal b/src/ggml-metal/ggml-metal.metal index a576ecdab..a9c265b92 100644 --- a/src/ggml-metal/ggml-metal.metal +++ b/src/ggml-metal/ggml-metal.metal @@ -1348,10 +1348,9 @@ kernel void kernel_argmax( device const void * x, device int32_t * dst, constant int64_t & ncols, - uint tpitg[[thread_position_in_threadgroup]]) { - device const float * x_row = (device const float *) ((device const char *) x + tpitg * ncols * sizeof(float)); + uint tgpig[[threadgroup_position_in_grid]]) { + device const float * x_row = (device const float *) ((device const char *) x + tgpig * ncols * sizeof(float)); - // initialize dst[tpitg] = 0; for (int i = 0; i < ncols; i++) { From 093f08755b7da794aef8647119f2d449d0373ea2 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Tue, 19 Nov 2024 22:13:17 +0100 Subject: [PATCH 03/11] change to strides --- src/ggml-metal/ggml-metal.m | 2 +- src/ggml-metal/ggml-metal.metal | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ggml-metal/ggml-metal.m b/src/ggml-metal/ggml-metal.m index 426f28387..b4e91f0eb 100644 --- a/src/ggml-metal/ggml-metal.m +++ b/src/ggml-metal/ggml-metal.m @@ -3483,7 +3483,7 @@ static void ggml_metal_encode_node( [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof( int64_t) atIndex:2]; [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/src/ggml-metal/ggml-metal.metal b/src/ggml-metal/ggml-metal.metal index a9c265b92..f430ba5d8 100644 --- a/src/ggml-metal/ggml-metal.metal +++ b/src/ggml-metal/ggml-metal.metal @@ -1347,9 +1347,9 @@ kernel void kernel_ssm_scan_f32( kernel void kernel_argmax( device const void * x, device int32_t * dst, - constant int64_t & ncols, + constant int64_t & nb01, uint tgpig[[threadgroup_position_in_grid]]) { - device const float * x_row = (device const float *) ((device const char *) x + tgpig * ncols * sizeof(float)); + device const float * x_row = (device const float *) ((device const char *) x + tgpig * nb01); dst[tpitg] = 0; From 9cf977dbf6ecdbca1e2cee7916eb81fc76f1d072 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Wed, 20 Nov 2024 18:49:23 +0100 Subject: [PATCH 04/11] contiguous assertions --- src/ggml-metal/ggml-metal.m | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ggml-metal/ggml-metal.m b/src/ggml-metal/ggml-metal.m index b4e91f0eb..7fd3a8c5d 100644 --- a/src/ggml-metal/ggml-metal.m +++ b/src/ggml-metal/ggml-metal.m @@ -3475,6 +3475,8 @@ static void ggml_metal_encode_node( case GGML_OP_ARGMAX: { GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous_1(src0)); + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); const int64_t nrows = ggml_nrows(src0); From d782d29abafc2c36a19dbb636c129b1637444310 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Wed, 20 Nov 2024 19:00:46 +0100 Subject: [PATCH 05/11] kernel working and tested --- src/ggml-metal/ggml-metal.m | 3 ++- src/ggml-metal/ggml-metal.metal | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/ggml-metal/ggml-metal.m b/src/ggml-metal/ggml-metal.m index 7fd3a8c5d..36ae974cc 100644 --- a/src/ggml-metal/ggml-metal.m +++ b/src/ggml-metal/ggml-metal.m @@ -3485,7 +3485,8 @@ static void ggml_metal_encode_node( [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&nb01 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; diff --git a/src/ggml-metal/ggml-metal.metal b/src/ggml-metal/ggml-metal.metal index f430ba5d8..b0dc0fd57 100644 --- a/src/ggml-metal/ggml-metal.metal +++ b/src/ggml-metal/ggml-metal.metal @@ -1347,15 +1347,16 @@ kernel void kernel_ssm_scan_f32( kernel void kernel_argmax( device const void * x, device int32_t * dst, - constant int64_t & nb01, + constant int64_t & ncols, + constant uint64_t & nb01, uint tgpig[[threadgroup_position_in_grid]]) { device const float * x_row = (device const float *) ((device const char *) x + tgpig * nb01); - dst[tpitg] = 0; + dst[tgpig] = 0; for (int i = 0; i < ncols; i++) { - if (x_row[i] > x_row[dst[tpitg]]) { - dst[tpitg] = i; + if (x_row[i] > x_row[dst[tgpig]]) { + dst[tgpig] = i; } } } From 7e3080220b005c250dd80b4185d62b8bab6a0866 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 29 Nov 2024 17:46:35 +0100 Subject: [PATCH 06/11] argmax simd parallel implementation --- src/ggml-metal/ggml-metal.m | 9 +++++- src/ggml-metal/ggml-metal.metal | 50 +++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/src/ggml-metal/ggml-metal.m b/src/ggml-metal/ggml-metal.m index 36ae974cc..d0e9e4b79 100644 --- a/src/ggml-metal/ggml-metal.m +++ b/src/ggml-metal/ggml-metal.m @@ -3480,6 +3480,11 @@ static void ggml_metal_encode_node( const int64_t nrows = ggml_nrows(src0); + int nth = 32; // SIMD width + while (nth < ne00 && nth*ne01*ne02*ne03 < 256) { + nth *= 2; + } + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ARGMAX].pipeline; [encoder setComputePipelineState:pipeline]; @@ -3487,8 +3492,10 @@ static void ggml_metal_encode_node( [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3]; + [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; + [encoder setThreadgroupMemoryLength:32*sizeof(int32_t) atIndex:1]; - [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; default: { diff --git a/src/ggml-metal/ggml-metal.metal b/src/ggml-metal/ggml-metal.metal index b0dc0fd57..54ecc81df 100644 --- a/src/ggml-metal/ggml-metal.metal +++ b/src/ggml-metal/ggml-metal.metal @@ -1349,16 +1349,56 @@ kernel void kernel_argmax( device int32_t * dst, constant int64_t & ncols, constant uint64_t & nb01, - uint tgpig[[threadgroup_position_in_grid]]) { + threadgroup float * shared_maxval [[threadgroup(0)]], + threadgroup int32_t * shared_argmax [[threadgroup(1)]], + uint tgpig[[threadgroup_position_in_grid]], + uint tpitg[[thread_position_in_threadgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint ntg[[threads_per_threadgroup]]) { device const float * x_row = (device const float *) ((device const char *) x + tgpig * nb01); - dst[tgpig] = 0; + float lmax = -INFINITY; + int32_t larg = -1; - for (int i = 0; i < ncols; i++) { - if (x_row[i] > x_row[dst[tgpig]]) { - dst[tgpig] = i; + for (int i00 = tpitg; i00 < ncols; i00 += ntg) { + if (x_row[i00] > lmax) { + lmax = x_row[i00]; + larg = i00; } } + + // find the argmax value in the block + float max_val = simd_max(lmax); + int32_t arg_val = simd_max(select(-1, larg, lmax == max_val)); + + if (ntg > N_SIMDWIDTH) { + if (sgitg == 0) { + shared_maxval[tiisg] = -INFINITY; + shared_argmax[tiisg] = -1; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (tiisg == 0) { + shared_maxval[sgitg] = max_val; + shared_argmax[sgitg] = arg_val; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + max_val = shared_maxval[tiisg]; + arg_val = shared_argmax[tiisg]; + + float max_val_reduced = simd_max(max_val); + int32_t arg_val_reduced = simd_max(select(-1, arg_val, max_val == max_val_reduced)); + + dst[tgpig] = arg_val_reduced; + + return; + } + + dst[tgpig] = arg_val; } kernel void kernel_norm( From 09bec3ef67603bfc428384cbf287442b285b4f79 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 29 Nov 2024 17:46:57 +0100 Subject: [PATCH 07/11] added 2 new tests for argmax in test-backend-ops --- tests/test-backend-ops.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f8a59b6df..f0f57100d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3440,7 +3440,10 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1)); test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); - test_cases.emplace_back(new test_argmax()); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {10, 100 , 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {12, 1024, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {3 , 5438, 1, 1})); + test_cases.emplace_back(new test_count_equal()); for (int ne3 : {1, 3}) { // CUDA backward pass only supports ne3 == 1 From ff2faa3523a12bd70b0a34b06b2aff9e51b6c19c Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 29 Nov 2024 17:48:23 +0100 Subject: [PATCH 08/11] cosmit --- src/ggml-metal/ggml-metal.metal | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ggml-metal/ggml-metal.metal b/src/ggml-metal/ggml-metal.metal index 54ecc81df..9fc95acd1 100644 --- a/src/ggml-metal/ggml-metal.metal +++ b/src/ggml-metal/ggml-metal.metal @@ -1353,8 +1353,8 @@ kernel void kernel_argmax( threadgroup int32_t * shared_argmax [[threadgroup(1)]], uint tgpig[[threadgroup_position_in_grid]], uint tpitg[[thread_position_in_threadgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], uint ntg[[threads_per_threadgroup]]) { device const float * x_row = (device const float *) ((device const char *) x + tgpig * nb01); From 66460e0e33b032718c058d5e8d42aae4fb33b8e5 Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 29 Nov 2024 20:14:23 +0100 Subject: [PATCH 09/11] added 3 tests cases for perf eval --- tests/test-backend-ops.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index f0f57100d..394a16f7a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3824,6 +3824,10 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1})); test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {10, 100 , 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {12, 1024, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {3 , 5438, 1, 1})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); for (int bs : {1, 512}) { From 98d7a51feb9743d9d7763969de23588b087890ee Mon Sep 17 00:00:00 2001 From: Pierre-Antoine Bannier Date: Fri, 29 Nov 2024 20:22:49 +0100 Subject: [PATCH 10/11] add test_argmax in make_test_cases_perf --- tests/test-backend-ops.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 394a16f7a..15b5744c6 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3440,9 +3440,9 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1)); test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1)); - test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {10, 100 , 1, 1})); - test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {12, 1024, 1, 1})); - test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {3 , 5438, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 100, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 12, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {5438, 3, 1, 1})); test_cases.emplace_back(new test_count_equal()); @@ -3824,9 +3824,9 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1})); test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); - test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {10, 100 , 1, 1})); - test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {12, 1024, 1, 1})); - test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {3 , 5438, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 100, 10, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 12, 1, 1})); + test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {5438, 3, 1, 1})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); From 6f3cf624cc2cc7a43fe55ebd27a4b4b3f206f9c4 Mon Sep 17 00:00:00 2001 From: PAB Date: Fri, 29 Nov 2024 21:04:29 +0100 Subject: [PATCH 11/11] Update test-backend-ops.cpp Co-authored-by: Diego Devesa --- tests/test-backend-ops.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 15b5744c6..89bd98f6e 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3824,10 +3824,6 @@ static std::vector> make_test_cases_perf() { test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1})); test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); - test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, { 100, 10, 1, 1})); - test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 12, 1, 1})); - test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {5438, 3, 1, 1})); - test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F16, {512, 3072, 1, 1})); for (int bs : {1, 512}) {