diff --git a/CMakeLists.txt b/CMakeLists.txt index 49ba45356a78d..9cfe08d7b7d59 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,6 +144,9 @@ option(LLAMA_BUILD_SERVER "llama: build server example" option(LLAMA_LASX "llama: enable lasx" ON) option(LLAMA_LSX "llama: enable lsx" ON) +# add perf arguments +option(LLAMA_PERF "llama: enable perf" OFF) + # Required for relocatable CMake package include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake) @@ -867,6 +870,10 @@ if (LLAMA_CPU_HBM) target_link_libraries(ggml PUBLIC memkind) endif() +if (LLAMA_PERF) + add_compile_definitions(GGML_PERF) +endif() + function(get_flags CCID CCVER) set(C_FLAGS "") set(CXX_FLAGS "") diff --git a/Makefile b/Makefile index 3aad77394c5ac..4ea59c0b4ef29 100644 --- a/Makefile +++ b/Makefile @@ -344,6 +344,9 @@ ifdef LLAMA_GPROF MK_CFLAGS += -pg MK_CXXFLAGS += -pg endif +ifdef LLAMA_PERF + MK_CPPFLAGS += -DGGML_PERF +endif # Architecture specific # TODO: probably these flags need to be tweaked on some architectures diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 101781ede4b4f..c31877403b0d5 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -513,8 +513,8 @@ static size_t vk_skip_checks; static size_t vk_output_tensor; static void ggml_vk_print_tensor(ggml_backend * ctx, const ggml_tensor * tensor, const char * name); -static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor); -static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor); +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor); +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor); #endif typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); @@ -5644,7 +5644,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod } } -static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * tensor){ +static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){ ggml_tensor_extra_gpu * extra = nullptr; switch (tensor->op) { @@ -5697,10 +5697,17 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * return false; } + if (params->ith != 0) { + return true; + } + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return true; + } + VK_LOG_DEBUG("ggml_vk_compute_forward(" << tensor << ", name=" << tensor->name << ", op=" << ggml_op_name(tensor->op) << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << ", view_src=" << tensor->view_src << ", view_offs=" << tensor->view_offs << ")"); #ifdef GGML_VULKAN_CHECK_RESULTS - ggml_vk_check_results_0(ctx, tensor); + ggml_vk_check_results_0(ctx, params, tensor); #endif vk_context& subctx = ctx->gc.contexts[extra->ctx_idx]; @@ -6207,6 +6214,9 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen ggml_vk_build_graph(ctx,cgraph->nodes[i], i == last_node); } + ggml_compute_params params = {}; + params.type = GGML_TASK_TYPE_COMPUTE; + params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = 
cgraph->nodes[i]; @@ -6214,13 +6224,13 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen continue; } - bool ok = ggml_vk_compute_forward(ctx, node); + bool ok = ggml_vk_compute_forward(ctx, ¶ms, node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } #ifdef GGML_VULKAN_CHECK_RESULTS else { - ggml_vk_check_results_1(ctx, node); + ggml_vk_check_results_1(ctx, ¶ms, node); } #endif GGML_ASSERT(ok); @@ -6590,8 +6600,11 @@ void * comp_result; size_t comp_size; size_t comp_nb[GGML_MAX_DIMS]; size_t check_counter = 0; -static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * tensor) { - if (tensor->op == GGML_OP_TRANSPOSE) { +static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) { + if (params->ith != 0) { + return; + } + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { return; } @@ -6895,8 +6908,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_tensor * ggml_free(ggml_ctx); } -static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_tensor * tensor) { - if (tensor->op == GGML_OP_TRANSPOSE) { +static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor) { + if (params->ith != 0) { + return; + } + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE || tensor->op == GGML_OP_TRANSPOSE) { return; } if (!(vk_output_tensor > 0 && vk_output_tensor == check_counter) && check_counter <= vk_skip_checks) { diff --git a/ggml.c b/ggml.c index f5502afbe98b3..778ca3fdf1f8f 100644 --- a/ggml.c +++ b/ggml.c @@ -175,6 +175,7 @@ void ggml_print_backtrace(void) { } #endif +/*#define GGML_PERF*/ #define GGML_DEBUG 0 #define GGML_GELU_FP16 #define GGML_GELU_QUICK_FP16 @@ -292,7 +293,7 @@ inline static void * ggml_calloc(size_t num, size_t size) { #define GGML_FREE(ptr) free(ptr) #define UNUSED GGML_UNUSED -#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0) +#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0) #if defined(GGML_USE_ACCELERATE) #include @@ -473,6 +474,18 @@ int64_t ggml_cycles_per_ms(void) { return CLOCKS_PER_SEC/1000; } +#ifdef GGML_PERF +#define ggml_perf_time_ms() ggml_time_ms() +#define ggml_perf_time_us() ggml_time_us() +#define ggml_perf_cycles() ggml_cycles() +#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms() +#else +#define ggml_perf_time_ms() 0 +#define ggml_perf_time_us() 0 +#define ggml_perf_cycles() 0 +#define ggml_perf_cycles_per_ms() 0 +#endif + // // cross-platform UTF-8 file paths // @@ -1717,8 +1730,8 @@ struct ggml_context { int n_objects; - struct ggml_object * objects_begin; - struct ggml_object * objects_end; + struct ggml_object* objects_begin; + struct ggml_object* objects_end; struct ggml_scratch scratch; struct ggml_scratch scratch_save; @@ -1731,8 +1744,11 @@ struct ggml_context_container { }; struct ggml_compute_state_shared { - const struct ggml_cgraph * cgraph; - const struct ggml_cplan * cplan; + const struct ggml_cgraph* cgraph; + const struct ggml_cplan* cplan; + + int64_t perf_node_start_cycles; + int64_t perf_node_start_time_us; int n_threads; @@ -1741,28 +1757,16 @@ struct ggml_compute_state_shared { atomic_int n_barrier_passed; ggml_abort_callback abort_callback; // abort ggml_graph_compute when true - void * abort_callback_data; + void* 
abort_callback_data; - atomic_int current_chunk; // currently processing chunk during mul_mat, shared between all the threads - - enum ggml_status ec; + atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads. }; struct ggml_compute_state { ggml_thread_t thrd; int ith; - struct ggml_compute_state_shared * shared; -}; - -struct ggml_compute_params { - // ith = thread index, nth = number of threads - int ith, nth; - - // work buffer for all threads - size_t wsize; - void * wdata; - - struct ggml_compute_state_shared * shared; + struct ggml_compute_state_shared* shared; + enum ggml_status ec; }; // @@ -2810,6 +2814,42 @@ static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13"); static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN"); static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN"); +// WARN: +// Mis-configuration can lead to problem that's hard to reason about: +// * At best it crash or talks nosense. +// * At worst it talks slightly difference but hard to perceive. +// +// An op has to enable INIT or FINALIZE when any of it's branch needs that pass. +// Take care about compile options (e.g., GGML_USE_xxx). +static bool GGML_OP_HAS_INIT [GGML_OP_COUNT] = { 0 }; +static bool GGML_OP_HAS_FINALIZE[GGML_OP_COUNT] = { 0 }; + +static void ggml_setup_op_has_task_pass(void) { + { // INIT + bool * p = GGML_OP_HAS_INIT; + + p[GGML_OP_ACC ] = true; + p[GGML_OP_MUL_MAT ] = true; + p[GGML_OP_MUL_MAT_ID ] = true; + p[GGML_OP_OUT_PROD ] = true; + p[GGML_OP_SET ] = true; + p[GGML_OP_GET_ROWS_BACK ] = true; + p[GGML_OP_DIAG_MASK_INF ] = true; + p[GGML_OP_DIAG_MASK_ZERO ] = true; + p[GGML_OP_CONV_TRANSPOSE_1D ] = true; + p[GGML_OP_CONV_TRANSPOSE_2D ] = true; + p[GGML_OP_FLASH_ATTN_BACK ] = true; + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + p[GGML_OP_ADD_REL_POS ] = true; + } + + { // FINALIZE + bool * p = GGML_OP_HAS_FINALIZE; + + p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; + } +} + // // NUMA support // @@ -2848,7 +2888,7 @@ struct ggml_state { static struct ggml_state g_state; static atomic_flag g_state_critical = ATOMIC_FLAG_INIT; -// critical section via spin lock +// barrier via spin lock inline static void ggml_critical_section_start(void) { while (atomic_flag_test_and_set(&g_state_critical)) { // spin @@ -2856,48 +2896,6 @@ inline static void ggml_critical_section_start(void) { } } -#ifdef GGML_USE_OPENMP -static void ggml_barrier(struct ggml_compute_state_shared * shared) { - if (shared->n_threads == 1) { - return; - } - - #pragma omp barrier -} -#else -static void ggml_barrier(struct ggml_compute_state_shared * shared) { - if (shared->n_threads == 1) { - return; - } - - atomic_int * n_barrier = &shared->n_barrier; - atomic_int * n_barrier_passed = &shared->n_barrier_passed; - - int n_threads = shared->n_threads; - int passed_old = atomic_load(n_barrier_passed); - - if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) { - // last thread - atomic_store(n_barrier, 0); - atomic_fetch_add(n_barrier_passed, 1); - } else { - // wait for other threads - const int n_spin_before_sleep = 100000; - while (true) { - for (int i = 0; i < n_spin_before_sleep; i++) { - if (atomic_load(n_barrier_passed) != passed_old) { - return; - } - #if defined(__SSE3__) - _mm_pause(); - #endif - } - sched_yield(); - } - } -} -#endif - // TODO: make this somehow automatically executed // some sort of "sentry" mechanism inline static void 
ggml_critical_section_end(void) { @@ -3002,7 +3000,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { } } #else - UNUSED(numa_flag); + GGML_UNUSED(numa_flag); // TODO #endif } @@ -3108,7 +3106,9 @@ GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t) { enum ggml_unary_op uop = ggml_get_unary_op(t); return ggml_unary_op_name(uop); } - return ggml_op_name(t->op); + else { + return ggml_op_name(t->op); + } } GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor) { @@ -3375,6 +3375,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } + ggml_setup_op_has_task_pass(); + is_first_call = false; } @@ -3641,12 +3643,15 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.flags =*/ 0, /*.grad =*/ NULL, /*.src =*/ { NULL }, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, /*.view_src =*/ view_src, /*.view_offs =*/ view_offs, /*.data =*/ obj_alloc_size > 0 ? (void *)(result + 1) : data, /*.name =*/ { 0 }, /*.extra =*/ NULL, - ///*.padding =*/ { 0 }, + /*.padding =*/ { 0 }, }; #ifdef __clang__ @@ -7824,6 +7829,10 @@ static void ggml_compute_forward_dup_same_cont( GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); GGML_ASSERT(src0->type == dst->type); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const size_t nb00 = src0->nb[0]; const size_t nb0 = dst->nb[0]; @@ -7852,6 +7861,10 @@ static void ggml_compute_forward_dup_f16( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_TENSOR_UNARY_OP_LOCALS const int ith = params->ith; // thread index @@ -8121,6 +8134,10 @@ static void ggml_compute_forward_dup_bf16( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_TENSOR_UNARY_OP_LOCALS const int ith = params->ith; // thread index @@ -8477,6 +8494,10 @@ static void ggml_compute_forward_dup_f32( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_TENSOR_UNARY_OP_LOCALS const int ith = params->ith; // thread index @@ -8796,6 +8817,10 @@ static void ggml_compute_forward_dup_bytes( GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); GGML_ASSERT(src0->type == dst->type); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { ggml_compute_forward_dup_same_cont(params, dst); return; @@ -8976,6 +9001,10 @@ static void ggml_compute_forward_add_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int ith = params->ith; const int nth = params->nth; @@ -9051,6 +9080,10 @@ static void ggml_compute_forward_add_f16_f32( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int ith = params->ith; const int nth = params->nth; @@ -9126,6 +9159,10 @@ static void ggml_compute_forward_add_bf16_f32( GGML_ASSERT(ggml_are_same_shape(src0, src1) && 
ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int ith = params->ith; const int nth = params->nth; @@ -9201,6 +9238,10 @@ static void ggml_compute_forward_add_f16_f16( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int ith = params->ith; const int nth = params->nth; @@ -9253,6 +9294,10 @@ static void ggml_compute_forward_add_bf16_bf16( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int ith = params->ith; const int nth = params->nth; @@ -9305,6 +9350,10 @@ static void ggml_compute_forward_add_q_f32( GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int nr = ggml_nrows(src0); GGML_TENSOR_BINARY_OP_LOCALS @@ -9454,6 +9503,10 @@ static void ggml_compute_forward_add1_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int ith = params->ith; const int nth = params->nth; @@ -9504,6 +9557,10 @@ static void ggml_compute_forward_add1_f16_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + // scalar to add const float v = *(float *) src1->data; @@ -9552,6 +9609,10 @@ static void ggml_compute_forward_add1_f16_f16( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + // scalar to add const float v = GGML_FP16_TO_FP32(*(ggml_fp16_t *) src1->data); @@ -9600,6 +9661,10 @@ static void ggml_compute_forward_add1_q_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + // scalar to add const float v = *(float *) src1->data; @@ -9665,6 +9730,10 @@ static void ggml_compute_forward_add1_bf16_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + // scalar to add const float v = *(float *) src1->data; @@ -9713,6 +9782,10 @@ static void ggml_compute_forward_add1_bf16_bf16( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_scalar(src1)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + // scalar to add const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data); @@ -9837,16 +9910,20 @@ static void ggml_compute_forward_acc_f32( size_t offset = ((int32_t *) dst->op_params)[3]; bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - if (!inplace) { - if (params->ith == 0) { - // memcpy needs to be synchronized across threads to avoid race conditions. 
- // => do it in INIT phase - memcpy( - ((char *) dst->data), - ((char *) src0->data), - ggml_nbytes(dst)); + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { + if (params->ith != 0) { + return; } - ggml_barrier(params->shared); + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; } const int ith = params->ith; @@ -9948,12 +10025,13 @@ static void ggml_compute_forward_sub_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->ith != 0) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } - assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); - const int nr = ggml_nrows(src0); GGML_TENSOR_BINARY_OP_LOCALS @@ -10031,6 +10109,9 @@ static void ggml_compute_forward_mul_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } const int ith = params->ith; const int nth = params->nth; @@ -10125,6 +10206,10 @@ static void ggml_compute_forward_div_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int ith = params->ith; const int nth = params->nth; @@ -10213,12 +10298,13 @@ static void ggml_compute_forward_sqr_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } - assert(ggml_are_same_shape(src0, dst)); - const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -10258,12 +10344,13 @@ static void ggml_compute_forward_sqrt_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + assert(params->ith == 0); + assert(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } - assert(ggml_are_same_shape(src0, dst)); - const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -10303,12 +10390,13 @@ static void ggml_compute_forward_log_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -10348,12 +10436,12 @@ static void ggml_compute_forward_sum_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_scalar(dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } assert(ggml_is_scalar(dst)); assert(src0->nb[0] == sizeof(float)); @@ -10383,12 +10471,13 @@ static void ggml_compute_forward_sum_f16( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + assert(params->ith == 0); + assert(ggml_is_scalar(dst)); + + if 
(params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } - assert(ggml_is_scalar(dst)); - assert(src0->nb[0] == sizeof(ggml_fp16_t)); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) @@ -10416,12 +10505,13 @@ static void ggml_compute_forward_sum_bf16( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + assert(params->ith == 0); + assert(ggml_is_scalar(dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } - assert(ggml_is_scalar(dst)); - assert(src0->nb[0] == sizeof(ggml_bf16_t)); GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) @@ -10477,7 +10567,9 @@ static void ggml_compute_forward_sum_rows_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + GGML_ASSERT(params->ith == 0); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10530,7 +10622,9 @@ static void ggml_compute_forward_mean_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10587,7 +10681,9 @@ static void ggml_compute_forward_argmax_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -10635,12 +10731,13 @@ static void ggml_compute_forward_repeat_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(src0, dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } - GGML_ASSERT(ggml_can_repeat(src0, dst)); - GGML_TENSOR_UNARY_OP_LOCALS // guaranteed to be an integer due to the check in ggml_can_repeat @@ -10679,12 +10776,13 @@ static void ggml_compute_forward_repeat_f16( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(src0, dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } - GGML_ASSERT(ggml_can_repeat(src0, dst)); - GGML_TENSOR_UNARY_OP_LOCALS // guaranteed to be an integer due to the check in ggml_can_repeat @@ -10753,12 +10851,13 @@ static void ggml_compute_forward_repeat_back_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_can_repeat(dst, src0)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } - GGML_ASSERT(ggml_can_repeat(dst, src0)); - GGML_TENSOR_UNARY_OP_LOCALS // guaranteed to be an integer due to the check in ggml_can_repeat @@ -10832,6 +10931,10 @@ static void ggml_compute_forward_concat_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; @@ -10900,14 +11003,15 @@ static void ggml_compute_forward_abs_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == 
GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -10944,14 +11048,15 @@ static void ggml_compute_forward_sgn_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -10988,14 +11093,15 @@ static void ggml_compute_forward_neg_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -11032,14 +11138,15 @@ static void ggml_compute_forward_step_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -11076,14 +11183,15 @@ static void ggml_compute_forward_tanh_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -11120,14 +11228,15 @@ static void ggml_compute_forward_elu_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -11164,14 +11273,15 @@ static void ggml_compute_forward_relu_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -11208,14 +11318,15 @@ static void ggml_compute_forward_sigmoid_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -11256,6 +11367,10 @@ static void ggml_compute_forward_gelu_f32( assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == 
GGML_TASK_TYPE_FINALIZE) { + return; + } + const int ith = params->ith; const int nth = params->nth; @@ -11315,6 +11430,10 @@ static void ggml_compute_forward_gelu_quick_f32( assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int ith = params->ith; const int nth = params->nth; @@ -11374,6 +11493,10 @@ static void ggml_compute_forward_silu_f32( assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int ith = params->ith; const int nth = params->nth; @@ -11428,14 +11551,15 @@ static void ggml_compute_forward_leaky_relu_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -11485,6 +11609,10 @@ static void ggml_compute_forward_silu_back_f32( assert(ggml_are_same_shape(src0, dst)); assert(ggml_are_same_shape(src0, grad)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int ith = params->ith; const int nth = params->nth; @@ -11540,14 +11668,15 @@ static void ggml_compute_forward_hardswish_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -11581,14 +11710,15 @@ static void ggml_compute_forward_hardsigmoid_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -11628,6 +11758,10 @@ static void ggml_compute_forward_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; @@ -11699,6 +11833,10 @@ static void ggml_compute_forward_rms_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; @@ -11766,6 +11904,10 @@ static void ggml_compute_forward_rms_norm_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst) && ggml_are_same_shape(src0, src1)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; @@ -11940,6 +12082,10 @@ static void ggml_compute_forward_group_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) 
{ + return; + } + GGML_ASSERT(src0->nb[0] == sizeof(float)); const int ith = params->ith; @@ -12044,8 +12190,8 @@ static void ggml_compute_forward_mul_mat_one_chunk( const bool src1_cont = ggml_is_contiguous(src1); - ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; + ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; // broadcast factors const int64_t r2 = ne12 / ne02; @@ -12119,11 +12265,15 @@ static void ggml_compute_forward_mul_mat_one_chunk( static void ggml_compute_forward_mul_mat( const struct ggml_compute_params * params, - struct ggml_tensor * dst) { + struct ggml_tensor * dst, + struct ggml_compute_state * state) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; @@ -12150,14 +12300,16 @@ static void ggml_compute_forward_mul_mat( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - // nb01 >= nb00 - src0 is not transposed - // compute by src0 rows - -#if GGML_USE_LLAMAFILE // broadcast factors const int64_t r2 = ne12 / ne02; const int64_t r3 = ne13 / ne03; + UNUSED(r2); + UNUSED(r3); + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + +#if GGML_USE_LLAMAFILE const bool src1_cont = ggml_is_contiguous(src1); if (src1_cont) { @@ -12171,6 +12323,7 @@ static void ggml_compute_forward_mul_mat( (char *)dst->data + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), ith, nth, + params->type, src0->type, src1->type, dst->type)) @@ -12180,33 +12333,35 @@ static void ggml_compute_forward_mul_mat( UseGgmlGemm1:; #endif - if (src1->type != vec_dot_type) { - char * wdata = params->wdata; - - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); - const size_t nbw2 = nbw1*ne11; - const size_t nbw3 = nbw2*ne12; - - assert(params->wsize >= ne13*nbw3); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = ith; i11 < ne11; i11 += nth) { - from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), - ne10); + if (params->type == GGML_TASK_TYPE_INIT) { + if (ith != 0) { + return; + } + // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. + atomic_store(&state->shared->current_chunk, nth); + if (src1->type != vec_dot_type) { + char * wdata = params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + assert(params->wsize >= ne11*ne12*ne13*row_size); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } } } } - } - if (ith == 0) { - // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. 
- atomic_store(¶ms->shared->current_chunk, nth); + return; } - ggml_barrier(params->shared); + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } #if GGML_USE_LLAMAFILE if (src1->type != vec_dot_type) { @@ -12223,6 +12378,7 @@ UseGgmlGemm1:; (char *)dst->data + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), ith, nth, + params->type, src0->type, vec_dot_type, dst->type)) @@ -12232,6 +12388,11 @@ UseGgmlGemm1:; UseGgmlGemm2:; #endif +#ifdef GGML_PERF + int chunks_executed = 0; + UNUSED(chunks_executed); +#endif + // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) const int64_t nr0 = ne0; @@ -12273,6 +12434,9 @@ UseGgmlGemm2:; const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; + //if (ith == 0) + // printf("MUL_MAT = [%d, %d, %d, %d] x [%d, %d, %d, %d] = %d x %d = %d. Fp Ops/Ch %d\n", ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nchunk0, nchunk1, nchunk0 * nchunk1, ne00 * nr0 * nr1 / nchunk0 / nchunk1); + // The first chunk comes from our thread_id, the rest will get auto-assigned. int current_chunk = ith; @@ -12288,12 +12452,23 @@ UseGgmlGemm2:; ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); +#ifdef GGML_PERF + chunks_executed++; +#endif + if (nth >= nchunk0 * nchunk1) { break; } - current_chunk = atomic_fetch_add(¶ms->shared->current_chunk, 1); + current_chunk = atomic_fetch_add(&state->shared->current_chunk, 1); } + +#ifdef GGML_PERF + // These numbers are useful when trying to measure how well the threading scheduling works. + //int64_t workSize = (ne01 * ne11 * ne12 * ne13 * ne00) / nchunk0 / nchunk1; + //float time = (ggml_perf_time_us() - t0); + //printf("MUL_MAT = %f ms, [%d, %d, %d, %d] x [%d, %d, %d, %d] = %I64u, %f ops/usec in %d chunks.\n", time / 1000.0, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, workSize, (float)workSize/time, chunks_executed); +#endif } // ggml_compute_forward_mul_mat_id @@ -12345,33 +12520,32 @@ static void ggml_compute_forward_mul_mat_id( int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11] - if (src1->type != vec_dot_type) { + if (params->type == GGML_TASK_TYPE_INIT) { + if (ith != 0) { + return; + } char * wdata = params->wdata; + if (src1->type != vec_dot_type) { + const size_t row_size = ggml_row_size(vec_dot_type, ne10); - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); - const size_t nbw2 = nbw1*ne11; - const size_t nbw3 = nbw2*ne12; + assert(params->wsize >= ne11*ne12*ne13*row_size); + assert(src1->type == GGML_TYPE_F32); - assert(params->wsize >= ne13*nbw3); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = ith; i11 < ne11; i11 += nth) { - from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1), - ne10); + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10); + wdata += row_size; + } } } } - } -#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)] - - if (ith == 0) { // initialize matrix_row_counts 
memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); +#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)] + // group rows by src0 matrix for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { for (int id = 0; id < n_ids; ++id) { @@ -12383,9 +12557,13 @@ static void ggml_compute_forward_mul_mat_id( matrix_row_counts[i02] += 1; } } + + return; } - ggml_barrier(params->shared); + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } // compute each matrix multiplication in sequence for (int cur_a = 0; cur_a < n_as; ++cur_a) { @@ -12484,6 +12662,9 @@ static void ggml_compute_forward_out_prod_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; + // int64_t t0 = ggml_perf_time_us(); + // UNUSED(t0); + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; @@ -12508,10 +12689,17 @@ static void ggml_compute_forward_out_prod_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - if (ith == 0) { + if (params->type == GGML_TASK_TYPE_INIT) { + if (ith != 0) { + return; + } ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; } - ggml_barrier(params->shared); // dst[:,:,:,:] = 0 // for i2,i3: @@ -12587,6 +12775,19 @@ static void ggml_compute_forward_out_prod_f32( } } } + + //int64_t t1 = ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} } static void ggml_compute_forward_out_prod_q_f32( @@ -12596,6 +12797,9 @@ static void ggml_compute_forward_out_prod_q_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; + // int64_t t0 = ggml_perf_time_us(); + // UNUSED(t0); + GGML_TENSOR_BINARY_OP_LOCALS; const int ith = params->ith; @@ -12626,10 +12830,17 @@ static void ggml_compute_forward_out_prod_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - if (ith == 0) { + if (params->type == GGML_TASK_TYPE_INIT) { + if (ith != 0) { + return; + } ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0); + return; + } + + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; } - ggml_barrier(params->shared); // parallelize by last three dimensions @@ -12676,6 +12887,19 @@ static void ggml_compute_forward_out_prod_q_f32( ggml_vec_mad_f32(ne0, d, wdata, *s1); } } + + //int64_t t1 = ggml_perf_time_us(); + //static int64_t acc = 0; + //acc += t1 - t0; + //if (t1 - t0 > 10) { + // printf("\n"); + // printf("ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\n", ne00, ne01, ne02, ne03); + // printf("nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\n", nb00, nb01, nb02, nb03); + // printf("ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\n", ne10, ne11, ne12, ne13); + // printf("nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\n", nb10, nb11, nb12, nb13); + + // printf("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\n", ith, nth, (int) (t1 - t0), (int) acc); + //} } static void ggml_compute_forward_out_prod( @@ -12735,6 +12959,10 @@ 
static void ggml_compute_forward_scale_f32( GGML_ASSERT(ggml_is_contiguous(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + // scale factor float v; memcpy(&v, dst->op_params, sizeof(float)); @@ -12803,16 +13031,20 @@ static void ggml_compute_forward_set_f32( size_t offset = ((int32_t *) dst->op_params)[3]; bool inplace = (bool) ((int32_t *) dst->op_params)[4]; - if (!inplace) { - if (params->ith == 0) { - // memcpy needs to be synchronized across threads to avoid race conditions. - // => do it in INIT phase - memcpy( - ((char *) dst->data), - ((char *) src0->data), - ggml_nbytes(dst)); + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { + if (params->ith != 0) { + return; } - ggml_barrier(params->shared); + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; } const int ith = params->ith; @@ -12961,6 +13193,10 @@ static void ggml_compute_forward_get_rows_q( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_TENSOR_BINARY_OP_LOCALS const int64_t nc = ne00; @@ -13005,6 +13241,10 @@ static void ggml_compute_forward_get_rows_f16( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_TENSOR_BINARY_OP_LOCALS const int64_t nc = ne00; @@ -13046,6 +13286,10 @@ static void ggml_compute_forward_get_rows_bf16( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_TENSOR_BINARY_OP_LOCALS const int64_t nc = ne00; @@ -13087,6 +13331,10 @@ static void ggml_compute_forward_get_rows_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_TENSOR_BINARY_OP_LOCALS const int64_t nc = ne00; @@ -13198,15 +13446,21 @@ static void ggml_compute_forward_get_rows_back_f32_f16( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->ith != 0) { - return; - } - + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_is_contiguous(dst)); // ggml_compute_forward_dup_same_cont(params, opt0, dst); - memset(dst->data, 0, ggml_nbytes(dst)); + if (params->type == GGML_TASK_TYPE_INIT) { + if (params->ith != 0) { + return; + } + memset(dst->data, 0, ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } const int nc = src0->ne[0]; const int nr = ggml_nelements(src1); @@ -13231,15 +13485,21 @@ static void ggml_compute_forward_get_rows_back_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->ith != 0) { - return; - } - + GGML_ASSERT(params->ith == 0); GGML_ASSERT(ggml_is_contiguous(dst)); // ggml_compute_forward_dup_same_cont(params, opt0, dst); - memset(dst->data, 0, ggml_nbytes(dst)); + if (params->type == 
GGML_TASK_TYPE_INIT) { + if (params->ith != 0) { + return; + } + memset(dst->data, 0, ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } const int nc = src0->ne[0]; const int nr = ggml_nelements(src1); @@ -13305,7 +13565,9 @@ static void ggml_compute_forward_diag_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + GGML_ASSERT(params->ith == 0); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13374,18 +13636,22 @@ static void ggml_compute_forward_diag_mask_f32( GGML_ASSERT(n_past >= 0); - if (!inplace) { - if (ith == 0) { - // memcpy needs to be synchronized across threads to avoid race conditions. - // => do it in INIT phase - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - memcpy( - ((char *) dst->data), - ((char *) src0->data), - ggml_nbytes(dst)); + if (!inplace && (params->type == GGML_TASK_TYPE_INIT)) { + if (ith != 0) { + return; } - ggml_barrier(params->shared); + // memcpy needs to be synchronized across threads to avoid race conditions. + // => do it in INIT phase + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); + memcpy( + ((char *) dst->data), + ((char *) src0->data), + ggml_nbytes(dst)); + } + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; } // TODO: handle transposed/permuted matrices @@ -13457,6 +13723,10 @@ static void ggml_compute_forward_soft_max_f32( assert(ggml_is_contiguous(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + float scale = 1.0f; float max_bias = 0.0f; @@ -13578,6 +13848,10 @@ static void ggml_compute_forward_soft_max_back_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_are_same_shape(src1, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + // TODO: handle transposed/permuted matrices const int ith = params->ith; @@ -13666,7 +13940,9 @@ static void ggml_compute_forward_clamp_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -13813,6 +14089,10 @@ static void ggml_compute_forward_rope_f32( const struct ggml_tensor * src1 = dst->src[1]; const struct ggml_tensor * src2 = dst->src[2]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; //const int n_past = ((int32_t *) dst->op_params)[0]; @@ -13939,6 +14219,10 @@ static void ggml_compute_forward_rope_f16( const struct ggml_tensor * src1 = dst->src[1]; const struct ggml_tensor * src2 = dst->src[2]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; //const int n_past = ((int32_t *) dst->op_params)[0]; @@ -14113,6 +14397,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + GGML_TENSOR_BINARY_OP_LOCALS const int ith = 
params->ith; @@ -14123,7 +14410,10 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (ith == 0) { + if (params->type == GGML_TASK_TYPE_INIT) { + if (ith != 0) { + return; + } memset(params->wdata, 0, params->wsize); // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) @@ -14156,8 +14446,13 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( // need to zero dst since we are accumulating into it memset(dst->data, 0, ggml_nbytes(dst)); + + return; + } + + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; } - ggml_barrier(params->shared); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; @@ -14201,6 +14496,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; @@ -14211,7 +14509,10 @@ static void ggml_compute_forward_conv_transpose_1d_f32( GGML_ASSERT(nb00 == sizeof(float)); GGML_ASSERT(nb10 == sizeof(float)); - if (ith == 0) { + if (params->type == GGML_TASK_TYPE_INIT) { + if (ith != 0) { + return; + } memset(params->wdata, 0, params->wsize); // prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout) @@ -14244,8 +14545,13 @@ static void ggml_compute_forward_conv_transpose_1d_f32( // need to zero dst since we are accumulating into it memset(dst->data, 0, ggml_nbytes(dst)); + + return; + } + + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; } - ggml_barrier(params->shared); const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; @@ -14314,6 +14620,9 @@ static void ggml_compute_forward_im2col_f32( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + GGML_TENSOR_BINARY_OP_LOCALS; const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; @@ -14344,6 +14653,14 @@ static void ggml_compute_forward_im2col_f32( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); + if (params->type == GGML_TASK_TYPE_INIT) { + return; + } + + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] { float * const wdata = (float *) dst->data; @@ -14391,6 +14708,9 @@ static void ggml_compute_forward_im2col_f16( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F16); + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + GGML_TENSOR_BINARY_OP_LOCALS; const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; @@ -14421,6 +14741,14 @@ static void ggml_compute_forward_im2col_f16( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); + if (params->type == GGML_TASK_TYPE_INIT) { + return; + } + + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] { ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; @@ -14486,6 +14814,9 @@ static void ggml_compute_forward_conv_transpose_2d( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + GGML_TENSOR_BINARY_OP_LOCALS const int ith = params->ith; @@ -14496,7 +14827,10 @@ static void ggml_compute_forward_conv_transpose_2d( GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); - if (ith == 0) { + if (params->type == GGML_TASK_TYPE_INIT) { + if (ith != 
0) { + return; + } memset(params->wdata, 0, params->wsize); // permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout) @@ -14531,8 +14865,13 @@ static void ggml_compute_forward_conv_transpose_2d( } memset(dst->data, 0, ggml_nbytes(dst)); + + return; + } + + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; } - ggml_barrier(params->shared); const int32_t stride = ggml_get_op_params_i32(dst, 0); @@ -14580,8 +14919,9 @@ static void ggml_compute_forward_pool_1d_sk_p0( const struct ggml_tensor * src = dst->src[0]; assert(src->type == GGML_TYPE_F32); + assert(params->ith == 0); - if (params->ith != 0) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14648,8 +14988,9 @@ static void ggml_compute_forward_pool_2d( const struct ggml_tensor * src = dst->src[0]; GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(params->ith == 0); - if (params->ith != 0) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -14722,6 +15063,10 @@ static void ggml_compute_forward_upscale_f32( const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_ASSERT(src0->type == GGML_TYPE_F32); const int ith = params->ith; @@ -14782,6 +15127,10 @@ static void ggml_compute_forward_pad_f32( const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT( dst->nb[0] == sizeof(float)); @@ -14838,6 +15187,10 @@ static void ggml_compute_forward_arange_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_ASSERT(dst->nb[0] == sizeof(float)); const int ith = params->ith; @@ -14876,6 +15229,10 @@ static void ggml_compute_forward_timestep_embedding_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const struct ggml_tensor * src0 = dst->src[0]; GGML_ASSERT(src0->nb[0] == sizeof(float)); @@ -14931,6 +15288,10 @@ static void ggml_compute_forward_argsort_f32( const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_TENSOR_UNARY_OP_LOCALS GGML_ASSERT(nb0 == sizeof(float)); @@ -14991,6 +15352,8 @@ static void ggml_compute_forward_flash_attn_ext_f16( const struct ggml_tensor * v, const struct ggml_tensor * mask, struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); GGML_TENSOR_LOCALS(int64_t, neq, q, ne) GGML_TENSOR_LOCALS(size_t, nbq, q, nb) @@ -15035,6 +15398,14 @@ static void ggml_compute_forward_flash_attn_ext_f16( const int64_t rv2 = neq2/nev2; const int64_t rv3 = neq3/nev3; + if (params->type == GGML_TASK_TYPE_INIT) { + return; + } + + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + // parallelize by q rows using ggml_vec_dot_f32 // total rows in q @@ -15217,6 +15588,9 @@ static void ggml_compute_forward_flash_attn_back_f32( const struct ggml_tensor * v = dst->src[2]; const struct ggml_tensor * d = dst->src[3]; + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) GGML_TENSOR_LOCALS(size_t, nbq, q, nb) GGML_TENSOR_LOCALS(int64_t, nek, k, 
ne) @@ -15263,10 +15637,16 @@ static void ggml_compute_forward_flash_attn_back_f32( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - if (ith == 0) { - memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); + if (params->type == GGML_TASK_TYPE_INIT) { + if (ith == 0) { + memset(dst->data, 0, nb0*ne0*ne1*ne2*ne3); + } + return; + } + + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; } - ggml_barrier(params->shared); const int64_t elem_q = ggml_nelements(q); const int64_t elem_k = ggml_nelements(k); @@ -15546,6 +15926,10 @@ static void ggml_compute_forward_flash_attn_back( static void ggml_compute_forward_ssm_conv_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const struct ggml_tensor * src0 = dst->src[0]; // conv_state const struct ggml_tensor * src1 = dst->src[1]; // x const struct ggml_tensor * src2 = dst->src[2]; // conv1d.weight @@ -15668,6 +16052,10 @@ static void ggml_compute_forward_ssm_conv( static void ggml_compute_forward_ssm_scan_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const struct ggml_tensor * src0 = dst->src[0]; // s const struct ggml_tensor * src1 = dst->src[1]; // x const struct ggml_tensor * src2 = dst->src[2]; // dt @@ -15789,10 +16177,13 @@ static void ggml_compute_forward_ssm_scan( static void ggml_compute_forward_win_part_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - UNUSED(params); const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) @@ -15852,10 +16243,13 @@ static void ggml_compute_forward_win_part( static void ggml_compute_forward_win_unpart_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - UNUSED(params); const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) @@ -15981,10 +16375,13 @@ static void ggml_compute_forward_unary( static void ggml_compute_forward_get_rel_pos_f16( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - UNUSED(params); const struct ggml_tensor * src0 = dst->src[0]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L292-L322 GGML_TENSOR_UNARY_OP_LOCALS @@ -16034,12 +16431,20 @@ static void ggml_compute_forward_add_rel_pos_f32( const struct ggml_tensor * src2 = dst->src[2]; const bool inplace = (bool) ((int32_t *) dst->op_params)[0]; - if (!inplace) { - if (params->ith == 0) { - memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); + if (!inplace && params->type == GGML_TASK_TYPE_INIT) { + if (params->ith != 0) { + return; } - ggml_barrier(params->shared); + memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst)); + return; + } + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; } + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + // ref: 
https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L357-L359 float * src1_data = (float *) src1->data; @@ -16113,14 +16518,15 @@ static void ggml_compute_forward_map_unary_f32( const struct ggml_tensor * src0 = dst->src[0]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -16160,15 +16566,16 @@ static void ggml_compute_forward_map_binary_f32( const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - if (params->ith != 0) { - return; - } - + assert(params->ith == 0); assert(ggml_is_contiguous_1(src0)); assert(ggml_is_contiguous_1(src1)); assert(ggml_is_contiguous_1(dst)); assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const int n = ggml_nrows(src0); const int nc = src0->ne[0]; @@ -16208,7 +16615,9 @@ static void ggml_compute_forward_map_custom1_f32( const struct ggml_tensor * a = dst->src[0]; - if (params->ith != 0) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -16225,7 +16634,9 @@ static void ggml_compute_forward_map_custom2_f32( const struct ggml_tensor * a = dst->src[0]; const struct ggml_tensor * b = dst->src[1]; - if (params->ith != 0) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -16243,7 +16654,9 @@ static void ggml_compute_forward_map_custom3_f32( const struct ggml_tensor * b = dst->src[1]; const struct ggml_tensor * c = dst->src[1]; - if (params->ith != 0) { + assert(params->ith == 0); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } @@ -16258,6 +16671,10 @@ static void ggml_compute_forward_map_custom1( const struct ggml_tensor * a = dst->src[0]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + struct ggml_map_custom1_op_params p; memcpy(&p, dst->op_params, sizeof(p)); @@ -16273,6 +16690,10 @@ static void ggml_compute_forward_map_custom2( const struct ggml_tensor * a = dst->src[0]; const struct ggml_tensor * b = dst->src[1]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + struct ggml_map_custom2_op_params p; memcpy(&p, dst->op_params, sizeof(p)); @@ -16289,6 +16710,10 @@ static void ggml_compute_forward_map_custom3( const struct ggml_tensor * b = dst->src[1]; const struct ggml_tensor * c = dst->src[2]; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + struct ggml_map_custom3_op_params p; memcpy(&p, dst->op_params, sizeof(p)); @@ -16320,10 +16745,21 @@ static void ggml_compute_forward_cross_entropy_loss_f32( GGML_ASSERT(params->wsize >= sizeof(float) * (nth + nth * nc)); - if (ith == 0) { - memset(sums, 0, sizeof(float) * (nth + nth * nc)); + if (params->type == GGML_TASK_TYPE_INIT) { + if (ith == 0) { + memset(sums, 0, sizeof(float) * (nth + nth * nc)); + } + return; + } + + if (params->type == GGML_TASK_TYPE_FINALIZE) { + if (ith == 0) { + float * dp = (float *) dst->data; + 
ggml_vec_sum_f32(nth, dp, sums); + dp[0] *= -1.0f / (float) nr; + } + return; } - ggml_barrier(params->shared); const double eps = 1e-9; @@ -16371,13 +16807,7 @@ static void ggml_compute_forward_cross_entropy_loss_f32( } #endif } - ggml_barrier(params->shared); - if (ith == 0) { - float * dp = (float *) dst->data; - ggml_vec_sum_f32(nth, dp, sums); - dp[0] *= -1.0f / (float) nr; - } } static void ggml_compute_forward_cross_entropy_loss( @@ -16417,6 +16847,10 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( const int64_t ith = params->ith; const int64_t nth = params->nth; + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + const double eps = 1e-9; // TODO: handle transposed/permuted matrices @@ -16487,7 +16921,7 @@ static void ggml_compute_forward_cross_entropy_loss_back( ///////////////////////////////// -static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor, struct ggml_compute_state * state) { GGML_ASSERT(params); if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) { @@ -16585,7 +17019,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_MUL_MAT: { - ggml_compute_forward_mul_mat(params, tensor); + ggml_compute_forward_mul_mat(params, tensor, state); } break; case GGML_OP_MUL_MAT_ID: { @@ -18063,6 +18497,9 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz /*.leafs =*/ leafs_ptr, /*.hash_table =*/ { hash_size, hash_keys_ptr }, /*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, }; return cgraph; @@ -18082,6 +18519,9 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1) /*.leafs =*/ NULL, /*.hash_table =*/ { 0, NULL }, /*.order =*/ cgraph0->order, + /*.perf_runs =*/ 0, + /*.perf_cycles =*/ 0, + /*.perf_time_us =*/ 0, }; return cgraph; @@ -18275,7 +18715,16 @@ static void set_numa_thread_affinity(int thread_n) { UNUSED(thread_n); } static void clear_numa_thread_affinity(void) {} #endif -static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { +static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const struct ggml_compute_state_shared * st) { + int64_t cycles_cur = ggml_perf_cycles() - st->perf_node_start_cycles; + int64_t time_us_cur = ggml_perf_time_us() - st->perf_node_start_time_us; + + node->perf_runs++; + node->perf_cycles += cycles_cur; + node->perf_time_us += time_us_cur; +} + +static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) { int n_tasks = 0; if (ggml_is_empty(node)) { @@ -18318,8 +18767,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_RELU: case GGML_UNARY_OP_SIGMOID: - case GGML_UNARY_OP_HARDSWISH: - case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads + case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads { n_tasks = 1; } break; @@ -18342,18 +18791,33 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_RMS_NORM_BACK: case GGML_OP_GROUP_NORM: case GGML_OP_CONCAT: + { + n_tasks = n_threads; + } break; case GGML_OP_MUL_MAT: + { + n_tasks = n_threads; + + // TODO: use different scheduling for different matrix sizes + //const int nr0 = 
ggml_nrows(node->src[0]); + //const int nr1 = ggml_nrows(node->src[1]); + + //n_tasks = MIN(n_threads, MAX(1, nr0/128)); + //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks); + } break; case GGML_OP_MUL_MAT_ID: + { + n_tasks = n_threads; + } break; case GGML_OP_OUT_PROD: { n_tasks = n_threads; } break; case GGML_OP_GET_ROWS: { - // FIXME: get_rows can use additional threads, but the cost of launching additional threads - // decreases performance with GPU offloading - //n_tasks = n_threads; - n_tasks = 1; + // FIXME: the cost of launching additional threads decreases performance with GPU offloading + //n_tasks = MIN(n_threads, ggml_nelements(node->src[1])); + n_tasks = MIN(n_cur_threads, ggml_nelements(node->src[1])); } break; case GGML_OP_SCALE: case GGML_OP_SET: @@ -18383,8 +18847,14 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = MIN(n_threads, ggml_nrows(node->src[0])); } break; - case GGML_OP_IM2COL: case GGML_OP_CONV_TRANSPOSE_1D: + { + n_tasks = n_threads; + } break; + case GGML_OP_IM2COL: + { + n_tasks = n_threads; + } break; case GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; @@ -18395,12 +18865,33 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { n_tasks = 1; } break; case GGML_OP_UPSCALE: + { + n_tasks = n_threads; + } break; case GGML_OP_PAD: + { + n_tasks = n_threads; + } break; case GGML_OP_ARANGE: + { + n_tasks = n_threads; + } break; case GGML_OP_TIMESTEP_EMBEDDING: + { + n_tasks = n_threads; + } break; case GGML_OP_ARGSORT: + { + n_tasks = n_threads; + } break; case GGML_OP_FLASH_ATTN_EXT: + { + n_tasks = n_threads; + } break; case GGML_OP_FLASH_ATTN_BACK: + { + n_tasks = n_threads; + } break; case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: { @@ -18448,6 +18939,9 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } } break; case GGML_OP_CROSS_ENTROPY_LOSS: + { + n_tasks = n_threads; + } break; case GGML_OP_CROSS_ENTROPY_LOSS_BACK: { n_tasks = n_threads; @@ -18477,6 +18971,110 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { return n_tasks; } +#ifdef GGML_USE_OPENMP +static void ggml_barrier(struct ggml_compute_state * state) { + if (state->shared->n_threads == 1) { + return; + } + + #pragma omp barrier +} +#else +static void ggml_barrier(struct ggml_compute_state * state) { + if (state->shared->n_threads == 1) { + return; + } + + atomic_int * n_barrier = &state->shared->n_barrier; + atomic_int * n_barrier_passed = &state->shared->n_barrier_passed; + + int n_threads = state->shared->n_threads; + int passed_old = atomic_load(n_barrier_passed); + + if (atomic_fetch_add(n_barrier, 1) == n_threads - 1) { + // last thread + atomic_store(n_barrier, 0); + atomic_fetch_add(n_barrier_passed, 1); + } else { + // wait for other threads + //while (atomic_load(n_barrier_passed) == passed_old) { + //} + const int n_spin_before_sleep = 100000; + while (true) { + for (int i = 0; i < n_spin_before_sleep; i++) { + if (atomic_load(n_barrier_passed) != passed_old) { + return; + } + #if defined(__SSE3__) + _mm_pause(); + #endif + } + sched_yield(); + } + } +} +#endif + +static thread_ret_t ggml_graph_compute_thread(void * data) { + struct ggml_compute_state * state = (struct ggml_compute_state *) data; + + const struct ggml_cgraph * cgraph = state->shared->cgraph; + const struct ggml_cplan * cplan = state->shared->cplan; + + const int ith = state->ith; + const int n_threads = state->shared->n_threads; + + set_numa_thread_affinity(ith); + + 
struct ggml_compute_params params = { + /*.type =*/ GGML_TASK_TYPE_INIT, + /*.ith =*/ ith, + /*.nth =*/ state->shared->n_threads, + /*.wsize =*/ cplan->work_size, + /*.wdata =*/ cplan->work_data, + }; + + for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { + if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { + state->ec = GGML_STATUS_ABORTED; + return 0; + } + + struct ggml_tensor * node = cgraph->nodes[node_n]; + const int n_tasks = ggml_get_n_tasks(node, n_threads, state->shared->n_threads); + + params.nth = n_tasks; + + /* INIT */ + if (GGML_OP_HAS_INIT[node->op]) { + if (ith < n_tasks) { + params.type = GGML_TASK_TYPE_INIT; + ggml_compute_forward(¶ms, node, state); + } + ggml_barrier(state); + } + + /* COMPUTE */ + if (ith < n_tasks) { + params.type = GGML_TASK_TYPE_COMPUTE; + ggml_compute_forward(¶ms, node, state); + } + + ggml_barrier(state); + + /* FINALIZE */ + if (GGML_OP_HAS_FINALIZE[node->op]) { + if (params.ith == 0) { + params.type = GGML_TASK_TYPE_FINALIZE; + ggml_compute_forward(¶ms, node, state); + } + ggml_barrier(state); + } + } + + return 0; +} + struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; @@ -18493,7 +19091,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - const int n_tasks = ggml_get_n_tasks(node, n_threads); + const int n_tasks = ggml_get_n_tasks(node, n_threads, 1); max_tasks = MAX(max_tasks, n_tasks); @@ -18645,121 +19243,119 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa return cplan; } -static thread_ret_t ggml_graph_compute_thread(void * data) { - struct ggml_compute_state * state = (struct ggml_compute_state *) data; - - const struct ggml_cgraph * cgraph = state->shared->cgraph; - const struct ggml_cplan * cplan = state->shared->cplan; - - set_numa_thread_affinity(state->ith); - - struct ggml_compute_params params = { - /*.ith =*/ state->ith, - /*.nth =*/ state->shared->n_threads, - /*.wsize =*/ cplan->work_size, - /*.wdata =*/ cplan->work_data, - /*.shared=*/ state->shared, - }; +static enum ggml_status ggml_graph_compute_parallel(struct ggml_compute_state * workers, int n_threads) { + enum ggml_status compute_status = GGML_STATUS_SUCCESS; - for (int node_n = 0; node_n < cgraph->n_nodes; node_n++) { - struct ggml_tensor * node = cgraph->nodes[node_n]; +#ifdef GGML_USE_OPENMP + if (n_threads > 1) { + #pragma omp parallel num_threads(n_threads) + { + #pragma omp single + { + // update the number of threads from the actual number of threads that we got from OpenMP + n_threads = omp_get_num_threads(); + workers[0].shared->n_threads = n_threads; + } + ggml_graph_compute_thread(&workers[omp_get_thread_num()]); + } + } else { + ggml_graph_compute_thread(&workers[0]); + } +#else + // create thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; ++j) { + const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); + GGML_ASSERT(rc == 0); + UNUSED(rc); + } + } - ggml_compute_forward(¶ms, node); + // this is a work thread too + ggml_graph_compute_thread(&workers[0]); - if (state->ith == 0 && cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) { - state->shared->ec = GGML_STATUS_ABORTED; + // join or kill thread pool + if (n_threads > 1) { + for (int j = 1; j < n_threads; j++) { + const int rc = 
ggml_thread_join(workers[j].thrd, NULL); + GGML_ASSERT(rc == 0); + UNUSED(rc); } + } +#endif + // don't leave affinity set on the main thread + clear_numa_thread_affinity(); - ggml_barrier(state->shared); - - if (state->shared->ec != GGML_STATUS_SUCCESS) { + for (int j = 0; j < n_threads; j++) { + if (workers[j].ec != GGML_STATUS_SUCCESS) { + compute_status = workers[j].ec; break; } } - - return 0; + return compute_status; } enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) { - GGML_ASSERT(cplan); - GGML_ASSERT(cplan->n_threads > 0); - GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); + { + GGML_ASSERT(cplan); + GGML_ASSERT(cplan->n_threads > 0); + + if (cplan->work_size > 0) { + GGML_ASSERT(cplan->work_data); + } + } int n_threads = cplan->n_threads; +#if defined(GGML_USE_OPENMP) + n_threads = MIN(n_threads, omp_get_max_threads()); +#endif + struct ggml_compute_state_shared state_shared = { /*.cgraph =*/ cgraph, /*.cgraph_plan =*/ cplan, + /*.perf_node_start_cycles =*/ 0, + /*.perf_node_start_time_us =*/ 0, /*.n_threads =*/ n_threads, /*.n_barrier =*/ 0, /*.n_barrier_passed =*/ 0, /*.abort_callback =*/ NULL, /*.abort_callback_data =*/ NULL, - /*.current_chunk =*/ 0, - /*.ec =*/ GGML_STATUS_SUCCESS, + /*.current_chunk; =*/ 0, }; - -#ifdef GGML_USE_OPENMP - if (n_threads > 1) { - #pragma omp parallel num_threads(n_threads) - { - #pragma omp single - { - // update the number of threads from the actual number of threads that we got from OpenMP - n_threads = omp_get_num_threads(); - state_shared.n_threads = n_threads; - } - - struct ggml_compute_state worker = { - .thrd = 0, - .ith = omp_get_thread_num(), - .shared = &state_shared, - }; - ggml_graph_compute_thread(&worker); - } - } else { - struct ggml_compute_state worker = { - .thrd = 0, - .ith = 0, - .shared = &state_shared, - }; - ggml_graph_compute_thread(&worker); - } -#else struct ggml_compute_state * workers = alloca(sizeof(struct ggml_compute_state)*n_threads); + const int64_t perf_start_cycles = ggml_perf_cycles(); + const int64_t perf_start_time_us = ggml_perf_time_us(); for (int j = 0; j < n_threads; ++j) { workers[j] = (struct ggml_compute_state) { .thrd = 0, .ith = j, .shared = &state_shared, + .ec = GGML_STATUS_SUCCESS, }; } - // create thread pool - for (int j = 1; j < n_threads; ++j) { - const int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } + enum ggml_status compute_status = ggml_graph_compute_parallel(workers, n_threads); - // this is a work thread too - ggml_graph_compute_thread(&workers[0]); + // performance stats (graph) + { + int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; + int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us; - // join or kill thread pool - if (n_threads > 1) { - for (int j = 1; j < n_threads; j++) { - const int rc = ggml_thread_join(workers[j].thrd, NULL); - GGML_ASSERT(rc == 0); - UNUSED(rc); - } - } -#endif + cgraph->perf_runs++; + cgraph->perf_cycles += perf_cycles_cur; + cgraph->perf_time_us += perf_time_us_cur; - // don't leave affinity set on the main thread - clear_numa_thread_affinity(); + GGML_PRINT_DEBUG("%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\n", + __func__, cgraph->perf_runs, + (double) perf_cycles_cur / (double) ggml_cycles_per_ms(), + (double) cgraph->perf_cycles / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs, + (double) perf_time_us_cur / 1000.0, + (double) cgraph->perf_time_us / 
1000.0 / cgraph->perf_runs); + } - return state_shared.ec; + return compute_status; } enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) { @@ -19258,16 +19854,24 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * } void ggml_graph_print(const struct ggml_cgraph * cgraph) { + int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0}; + GGML_PRINT("=== GRAPH ===\n"); GGML_PRINT("n_nodes = %d\n", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { struct ggml_tensor * node = cgraph->nodes[i]; - GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s\n", + perf_total_per_op_us[node->op] += MAX(1, node->perf_time_us); + + GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], - ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " "); + ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs, + (double) node->perf_cycles / (double) ggml_cycles_per_ms(), + (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, + (double) node->perf_time_us / 1000.0, + (double) node->perf_time_us / 1000.0 / node->perf_runs); } GGML_PRINT("n_leafs = %d\n", cgraph->n_leafs); @@ -19281,6 +19885,14 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { ggml_get_name(node)); } + for (int i = 0; i < GGML_OP_COUNT; i++) { + if (perf_total_per_op_us[i] == 0) { + continue; + } + + GGML_PRINT("perf_total_per_op_us[%16s] = %7.3f ms\n", ggml_op_name(i), (double) perf_total_per_op_us[i] / 1000.0); + } + GGML_PRINT("========================================\n"); } diff --git a/ggml.h b/ggml.h index d895c9acdb596..2e8fd0dbc2e31 100644 --- a/ggml.h +++ b/ggml.h @@ -591,7 +591,11 @@ extern "C" { struct ggml_tensor * grad; struct ggml_tensor * src[GGML_MAX_SRC]; - // source tensor and offset for views + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; + struct ggml_tensor * view_src; size_t view_offs; @@ -601,7 +605,7 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - // char padding[4]; + char padding[8]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -648,6 +652,11 @@ extern "C" { struct ggml_hash_set visited_hash_table; enum ggml_cgraph_eval_order order; + + // performance + int perf_runs; + int64_t perf_cycles; + int64_t perf_time_us; }; // scratch buffer @@ -664,6 +673,28 @@ extern "C" { bool no_alloc; // don't allocate memory for the tensor data }; + + // compute types + + // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled. + // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995. 
+ enum ggml_task_type { + GGML_TASK_TYPE_INIT = 0, + GGML_TASK_TYPE_COMPUTE, + GGML_TASK_TYPE_FINALIZE, + }; + + struct ggml_compute_params { + enum ggml_task_type type; + + // ith = thread index, nth = number of threads + int ith, nth; + + // work buffer for all threads + size_t wsize; + void * wdata; + }; + // numa strategies enum ggml_numa_strategy { GGML_NUMA_STRATEGY_DISABLED = 0, diff --git a/llama.cpp b/llama.cpp index 989c731495dbb..8c812f2d7049b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12941,6 +12941,12 @@ static int llama_decode_internal( } } +#ifdef GGML_PERF + // print timing information per ggml operation (for debugging purposes) + // requires GGML_PERF to be defined + ggml_graph_print(gf); +#endif + // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); diff --git a/sgemm.cpp b/sgemm.cpp index 6626ceb26213f..bbe263ddd2bb4 100644 --- a/sgemm.cpp +++ b/sgemm.cpp @@ -249,8 +249,9 @@ class tinyBLAS { : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { } - void matmul(int64_t m, int64_t n) { - mnpack(0, m, 0, n); + void matmul(int64_t m, int64_t n, int task) { + if (task == GGML_TASK_TYPE_COMPUTE) + mnpack(0, m, 0, n); } private: @@ -457,8 +458,9 @@ class tinyBLAS_Q0_ARM { : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { } - void matmul(int64_t m, int64_t n) { - mnpack(0, m, 0, n); + void matmul(int64_t m, int64_t n, int task) { + if (task == GGML_TASK_TYPE_COMPUTE) + mnpack(0, m, 0, n); } private: @@ -594,8 +596,9 @@ class tinyBLAS_Q0_AVX { : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { } - void matmul(int64_t m, int64_t n) { - mnpack(0, m, 0, n); + void matmul(int64_t m, int64_t n, int task) { + if (task == GGML_TASK_TYPE_COMPUTE) + mnpack(0, m, 0, n); } private: @@ -826,7 +829,7 @@ class tinyBLAS_Q0_AVX { * For example, for single-threaded single-precision GEMM you can say * * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, - * 0, 1, + * 0, 1, GGML_TASK_TYPE_COMPUTE, * GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32); * * @param m is rows in `A` and `C` @@ -840,13 +843,14 @@ class tinyBLAS_Q0_AVX { * @param ldc is row stride of `C` * @param ith is thread id (must be less than `nth`) * @param nth is number of threads (must be greater than zero) + * @param task is GGML task type * @param Atype is GGML data type of `A` * @param Btype is GGML data type of `B` * @param Ctype is GGML data type of `C` * @return true if this function was able to service the matmul request */ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, - int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) { + int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) { assert(m >= 0); assert(n >= 0); @@ -873,7 +877,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (const float *)B, ldb, (float *)C, ldc, ith, nth}; - tb.matmul(m, n); + tb.matmul(m, n, task); return true; #elif defined(__AVX__) || defined(__AVX2__) if (k % 8) @@ -883,7 +887,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (const float *)B, ldb, (float *)C, ldc, ith, nth}; - tb.matmul(m, n); + tb.matmul(m, n, task); return true; #elif defined(__ARM_NEON) if (n < 4) @@ -895,7 +899,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (const float *)B, ldb, (float *)C, ldc, ith, nth}; - 
tb.matmul(m, n); + tb.matmul(m, n, task); return true; #else return false; @@ -913,7 +917,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (const float *)B, ldb, (float *)C, ldc, ith, nth}; - tb.matmul(m, n); + tb.matmul(m, n, task); return true; #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) if (k % 8) @@ -925,7 +929,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (const float *)B, ldb, (float *)C, ldc, ith, nth}; - tb.matmul(m, n); + tb.matmul(m, n, task); return true; #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) if (n < 8) @@ -939,7 +943,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (const ggml_fp16_t *)B, ldb, (float *)C, ldc, ith, nth}; - tb.matmul(m, n); + tb.matmul(m, n, task); return true; #elif defined(__ARM_NEON) && !defined(_MSC_VER) if (k % 4) @@ -951,7 +955,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (const float *)B, ldb, (float *)C, ldc, ith, nth}; - tb.matmul(m, n); + tb.matmul(m, n, task); return true; #else return false; @@ -967,7 +971,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (const block_q8_0 *)B, ldb, (float *)C, ldc, ith, nth}; - tb.matmul(m, n); + tb.matmul(m, n, task); return true; #elif defined(__ARM_FEATURE_DOTPROD) tinyBLAS_Q0_ARM tb{ @@ -975,7 +979,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (const block_q8_0 *)B, ldb, (float *)C, ldc, ith, nth}; - tb.matmul(m, n); + tb.matmul(m, n, task); return true; #else return false; @@ -991,7 +995,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (const block_q8_0 *)B, ldb, (float *)C, ldc, ith, nth}; - tb.matmul(m, n); + tb.matmul(m, n, task); return true; #elif defined(__ARM_FEATURE_DOTPROD) tinyBLAS_Q0_ARM tb{ @@ -999,7 +1003,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (const block_q8_0 *)B, ldb, (float *)C, ldc, ith, nth}; - tb.matmul(m, n); + tb.matmul(m, n, task); return true; #else return false; @@ -1021,6 +1025,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (void)ldc; (void)ith; (void)nth; + (void)task; (void)Atype; (void)Btype; (void)Ctype; diff --git a/sgemm.h b/sgemm.h index caf6dd5567b3a..f29747d0a477a 100644 --- a/sgemm.h +++ b/sgemm.h @@ -7,7 +7,7 @@ extern "C" { bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, const void *, int64_t, void *, int64_t, int, int, - int, int, int); + int, int, int, int); #ifdef __cplusplus }
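
Reviewer notes (not part of the patch; nothing below should be applied).

First, a minimal sketch, in C against the ggml internals touched above, of the INIT/COMPUTE/FINALIZE pattern this patch restores in the forward ops: INIT is entered by every scheduled thread and most ops let only thread 0 do the one-time setup, FINALIZE is dispatched to thread 0 only by ggml_graph_compute_thread, and COMPUTE splits rows across params->nth threads. The op name and the per-row body are hypothetical; the ggml_compute_params fields are the ones added to ggml.h in this patch.

static void ggml_compute_forward_my_op_f32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];

    if (params->type == GGML_TASK_TYPE_INIT) {
        // every thread with ith < n_tasks enters INIT; only thread 0 does the
        // one-time setup (same pattern as flash_attn_back / cross_entropy_loss above)
        if (params->ith == 0) {
            memset(dst->data, 0, ggml_nbytes(dst));
        }
        return;
    }

    if (params->type == GGML_TASK_TYPE_FINALIZE) {
        // FINALIZE is dispatched to thread 0 only; most ops have nothing to do here
        return;
    }

    // COMPUTE: split the rows of src0 across the n_tasks threads chosen by ggml_get_n_tasks()
    const int ith = params->ith;
    const int nth = params->nth;

    const int64_t nr = ggml_nrows(src0);
    for (int64_t ir = ith; ir < nr; ir += nth) {
        // ... hypothetical per-row work on src0 / dst ...
    }
}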
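
Second, a sketch of reading the per-node counters the patch adds back to ggml_tensor (perf_runs, perf_cycles, perf_time_us). As the llama.cpp hunk above notes, this only produces useful output when GGML_PERF is defined. The helper name is hypothetical; the fields and ggml_op_name/ggml_cycles_per_ms calls are the ones used by ggml_graph_print above.

#include <stdio.h>
#include "ggml.h"

static void dump_node_perf(const struct ggml_cgraph * gf) {
    for (int i = 0; i < gf->n_nodes; i++) {
        const struct ggml_tensor * node = gf->nodes[i];
        if (node->perf_runs == 0) {
            continue; // skip nodes that were never timed
        }
        printf("%-16s runs=%3d  avg cpu=%7.3f ms  avg wall=%7.3f ms\n",
               ggml_op_name(node->op),
               node->perf_runs,
               (double) node->perf_cycles  / (double) ggml_cycles_per_ms() / node->perf_runs,
               (double) node->perf_time_us / 1000.0 / node->perf_runs);
    }
}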
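
Finally, a sketch of calling the extended llamafile_sgemm() directly with the new task argument (the signature change in sgemm.h above). The sizes and leading dimensions are illustrative placeholders chosen so the usual stride and k-divisibility checks of the SIMD kernels hold; since only GGML_TASK_TYPE_COMPUTE does any work now, that is what a caller outside ggml's scheduler would pass.

#include "ggml.h"
#include "sgemm.h"

static void example_sgemm_call(void) {
    enum { M = 4, N = 4, K = 16 };
    static float A[M * K];  // lda = K (row stride of A)
    static float B[N * K];  // ldb = K (row stride of B)
    static float C[M * N];  // ldc = M (row stride of C)

    const bool ok = llamafile_sgemm(M, N, K,
                                    A, K,
                                    B, K,
                                    C, M,
                                    /*ith=*/0, /*nth=*/1,
                                    GGML_TASK_TYPE_COMPUTE,
                                    GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32);
    (void) ok; // false simply means no SIMD path serviced this shape
}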