diff --git a/ggml/src/ggml-backend.c b/ggml/src/ggml-backend.c
index 13c71c310c446..dbbaa3941febe 100644
--- a/ggml/src/ggml-backend.c
+++ b/ggml/src/ggml-backend.c
@@ -394,7 +394,7 @@ void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event)
 
 // backend registry
 
-#define GGML_REG_MAX_BACKENDS 16
+#define GGML_REG_MAX_BACKENDS 64
 
 struct ggml_backend_reg {
     char name[128];
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp
index 9c419ba896706..5a890237f2453 100644
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -3768,37 +3768,13 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
         stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))));
     SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
 
-    const ggml_tensor_extra_gpu *src0_extra =
-        (const ggml_tensor_extra_gpu *)src0->extra;
-    const ggml_tensor_extra_gpu *src1_extra =
-        (const ggml_tensor_extra_gpu *)src1->extra;
-    const ggml_tensor_extra_gpu *dst_extra =
-        (const ggml_tensor_extra_gpu *)dst->extra;
-
-    ggml_tensor_extra_gpu src0_row_extra;
-    ggml_tensor_extra_gpu src1_row_extra;
-    ggml_tensor_extra_gpu dst_row_extra;
-
     ggml_tensor src0_row = *src0;
     ggml_tensor src1_row = *src1;
     ggml_tensor dst_row = *dst;
 
-    src1_row.backend = GGML_BACKEND_TYPE_GPU;
-    dst_row.backend = GGML_BACKEND_TYPE_GPU;
-
-    src0_row.extra = &src0_row_extra;
-    src1_row.extra = &src1_row_extra;
-    dst_row.extra = &dst_row_extra;
-
-    char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU
-                              ? (char *)src0->data
-                              : (char *)src0_extra->data_device[ctx.device];
-    char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU
-                              ? (char *)src1->data
-                              : (char *)src1_extra->data_device[ctx.device];
-    char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU
-                             ? (char *)dst->data
-                             : (char *)dst_extra->data_device[ctx.device];
+    char *src0_original = (char *)src0->data;
+    char *src1_original = (char *)src1->data;
+    char *dst_original = (char *)dst->data;
 
     src0_row.ne[2] = 1;
     src0_row.ne[3] = 1;
@@ -3827,12 +3803,9 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
                 const int64_t i1 = id;
                 const int64_t i2 = i12;
 
-                src0_row_extra.data_device[ctx.device] =
-                    src0_original + i02*nb02;
-                src1_row_extra.data_device[ctx.device] =
-                    src1_original + + i11*nb11 + i12*nb12;
-                dst_row_extra.data_device[ctx.device] =
-                    dst_original + i1*nb1 + i2*nb2;
+                src0_row.data = src0_original + i02*nb02;
+                src1_row.data = src1_original + + i11*nb11 + i12*nb12;
+                dst_row.data = dst_original + i1*nb1 + i2*nb2;
 
                 ggml_sycl_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
             }
@@ -3841,8 +3814,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
         ggml_sycl_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
         ggml_sycl_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
 
-        src1_row_extra.data_device[ctx.device] = src1_contiguous.get();
-        dst_row_extra.data_device[ctx.device] = dst_contiguous.get();
+        src1_row.data = src1_contiguous.get();
+        dst_row.data = dst_contiguous.get();
 
         for (int64_t i02 = 0; i02 < n_as; i02++) {
             int64_t num_src1_rows = 0;
@@ -3898,7 +3871,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, const ggml_ten
                 });
             }
 
-            src0_row_extra.data_device[ctx.device] = src0_original + i02*nb02;
+            src0_row.data = src0_original + i02*nb02;
 
             GGML_ASSERT(nb11 == sizeof(float)*ne10);
             GGML_ASSERT(nb1 == sizeof(float)*ne0);
@@ -5221,6 +5194,10 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
                         return false;
                     }
                 }
+                ggml_type src0_type = op->src[0]->type;
+                if (src0_type == GGML_TYPE_BF16) {
+                    return false;
+                }
                 return true;
             } break;
         case GGML_OP_GET_ROWS:
diff --git a/src/llama.cpp b/src/llama.cpp
index ed77ed918e554..f91ac777969d9 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5883,13 +5883,6 @@ static bool llm_load_tensors(
 
     auto & hparams = model.hparams;
 
-#ifdef GGML_USE_SYCL
-    // disable MoE with SYCL until mul_mat_id is updated
-    if (hparams.n_expert > 0) {
-        n_gpu_layers = 0;
-    }
-#endif
-
     model.split_mode   = split_mode;
     model.main_gpu     = main_gpu;
    model.n_gpu_layers = n_gpu_layers;