From 4d0383321184aadf91968d9e3c6a45286ed2473b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Yusuf=20Sar=C4=B1g=C3=B6z?= Date: Sat, 7 Oct 2023 22:14:10 +0300 Subject: [PATCH 01/26] gguf.py : fix CI for publishing GGUF package (#3532) * Fix CI for publishing GGUF package * Bump version * fix * bump version * bump version * bump version --- .github/workflows/gguf-publish.yml | 3 ++- gguf-py/README.md | 1 - gguf-py/pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml index e61bfc6c32b60..57db175122c03 100644 --- a/.github/workflows/gguf-publish.yml +++ b/.github/workflows/gguf-publish.yml @@ -36,8 +36,9 @@ jobs: poetry install - name: Build package - run: poetry build + run: cd gguf-py && poetry build - name: Publish package uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.PYPI_API_TOKEN }} + packages-dir: gguf-py/dist diff --git a/gguf-py/README.md b/gguf-py/README.md index ffe25c495ab1e..a28d8c57adc7d 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -69,4 +69,3 @@ python -m twine upload dist/* ## TODO - [ ] Add tests - [ ] Include conversion scripts as command line entry points in this package. -- Add CI workflow for releasing the package. diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 400607ce1dc30..07a7ab4dd11fc 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.4.0" +version = "0.4.4" description = "Write ML models in GGUF for GGML" authors = ["GGML "] packages = [ From a16e89cec83b4bd5f6af8f1ce1400f94c12356f9 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Sat, 7 Oct 2023 15:31:41 -0600 Subject: [PATCH 02/26] Fix trying to strip newline from empty prompt and cfg prompt file content (#3534) --- common/common.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 60b00b5fbb8f1..0f55c33a713a7 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -170,7 +170,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { // store the external file name in params params.prompt_file = argv[i]; std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.prompt)); - if (params.prompt.back() == '\n') { + if (!params.prompt.empty() && params.prompt.back() == '\n') { params.prompt.pop_back(); } } else if (arg == "-n" || arg == "--n-predict") { @@ -295,7 +295,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.cfg_negative_prompt)); - if (params.cfg_negative_prompt.back() == '\n') { + if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') { params.cfg_negative_prompt.pop_back(); } } else if (arg == "--cfg-scale") { From 63d3b06a4318329f92b078e8aa0be7ab6e9f871f Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Sat, 7 Oct 2023 23:22:17 -0600 Subject: [PATCH 03/26] llama : fix missing break in Persimmon arch case statements (#3535) --- llama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama.cpp b/llama.cpp index d10656bb801db..a4312ab72d613 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2047,7 +2047,7 @@ static void llm_load_hparams( case 36: model.type = e_model::MODEL_8B; break; default: model.type = e_model::MODEL_UNKNOWN; } - } + } break; case LLM_ARCH_REFACT: { GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); @@ -4926,7 +4926,7 @@ static struct ggml_cgraph * llama_build_graph( case LLM_ARCH_PERSIMMON: { result = llm_build_persimmon(lctx, batch); - } + } break; case LLM_ARCH_REFACT: { result = llm_build_refact(lctx, batch); From b0ec5218c3d24755786b80ecce9cf4ffc07583f8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 8 Oct 2023 10:01:53 +0300 Subject: [PATCH 04/26] metal : support MTLGPUFamily < Apple7, formatting, style (#3524) * metal : improve decoding speed for batches of 2-16 * metal : rename kernels mul_mat_ to mul_mv_ * metal : indentations * minor * metal : print more GPU info + disable mul_mm for MTLGPUFamiliy < Apple7 --- ggml-metal.m | 202 +++++++++++++++++++++++++++++------------------ ggml-metal.metal | 92 ++++++++++++--------- 2 files changed, 176 insertions(+), 118 deletions(-) diff --git a/ggml-metal.m b/ggml-metal.m index f8fa05dd9f4a7..57c238ddaaf67 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -81,18 +81,18 @@ GGML_METAL_DECL_KERNEL(get_rows_q6_K); GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(norm); - GGML_METAL_DECL_KERNEL(mul_mat_f32_f32); - GGML_METAL_DECL_KERNEL(mul_mat_f16_f32); - GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_1row); - GGML_METAL_DECL_KERNEL(mul_mat_f16_f32_l4); - GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q8_0_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32); - GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_f32_f32); + GGML_METAL_DECL_KERNEL(mul_mv_f16_f32); + GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_1row); + GGML_METAL_DECL_KERNEL(mul_mv_f16_f32_l4); + GGML_METAL_DECL_KERNEL(mul_mv_q4_0_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q8_0_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q2_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q3_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32); + GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_f32_f32); GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); @@ -262,28 +262,30 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ GGML_METAL_ADD_KERNEL(get_rows_q6_K); GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(norm); - GGML_METAL_ADD_KERNEL(mul_mat_f32_f32); - GGML_METAL_ADD_KERNEL(mul_mat_f16_f32); - GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_1row); - GGML_METAL_ADD_KERNEL(mul_mat_f16_f32_l4); - GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q8_0_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32); - GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); - GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); - GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_f32_f32); + GGML_METAL_ADD_KERNEL(mul_mv_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_1row); + GGML_METAL_ADD_KERNEL(mul_mv_f16_f32_l4); + GGML_METAL_ADD_KERNEL(mul_mv_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q8_0_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32); + if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { + GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); + GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); + } GGML_METAL_ADD_KERNEL(rope_f32); GGML_METAL_ADD_KERNEL(rope_f16); GGML_METAL_ADD_KERNEL(alibi_f32); @@ -296,8 +298,22 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ #undef GGML_METAL_ADD_KERNEL } - GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); #if TARGET_OS_OSX + // print MTL GPU family: + GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); + GGML_METAL_LOG_INFO("%s: GPU arch: %s\n", __func__, [[ctx->device architecture].name UTF8String]); + + // determine max supported GPU family + // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf + // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf + for (int i = MTLGPUFamilyApple9 + 10; i >= MTLGPUFamilyApple1; --i) { + if ([ctx->device supportsFamily:i]) { + GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i); + break; + } + } + + GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); if (ctx->device.maxTransferRate != 0) { GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); @@ -339,28 +355,30 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(get_rows_q6_K); GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(norm); - GGML_METAL_DEL_KERNEL(mul_mat_f32_f32); - GGML_METAL_DEL_KERNEL(mul_mat_f16_f32); - GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_1row); - GGML_METAL_DEL_KERNEL(mul_mat_f16_f32_l4); - GGML_METAL_DEL_KERNEL(mul_mat_q4_0_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q4_1_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q8_0_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q2_K_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q3_K_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q4_K_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q5_K_f32); - GGML_METAL_DEL_KERNEL(mul_mat_q6_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); - GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); - GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_f32_f32); + GGML_METAL_DEL_KERNEL(mul_mv_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_1row); + GGML_METAL_DEL_KERNEL(mul_mv_f16_f32_l4); + GGML_METAL_DEL_KERNEL(mul_mv_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32); + if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { + GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); + GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q8_0_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_1_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q2_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q3_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); + GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); + } GGML_METAL_DEL_KERNEL(rope_f32); GGML_METAL_DEL_KERNEL(rope_f16); GGML_METAL_DEL_KERNEL(alibi_f32); @@ -986,21 +1004,46 @@ void ggml_metal_graph_compute( } break; case GGML_OP_MUL_MAT: { - // TODO: needs to be updated after PR: https://github.com/ggerganov/ggml/pull/224 - GGML_ASSERT(ne00 == ne10); - // GGML_ASSERT(ne02 == ne12); // Should be checked on individual data types until broadcast is implemented everywhere - uint gqa = ne12/ne02; GGML_ASSERT(ne03 == ne13); + const uint gqa = ne12/ne02; + + // find the break-even point where the matrix-matrix kernel becomes more efficient compared + // to the matrix-vector kernel + int ne11_mm_min = 1; + +#if 0 + // the numbers below are measured on M2 Ultra for 7B and 13B models + // these numbers do not translate to other devices or model sizes + // TODO: need to find a better approach + if ([ctx->device.name isEqualToString:@"Apple M2 Ultra"]) { + switch (src0t) { + case GGML_TYPE_F16: ne11_mm_min = 2; break; + case GGML_TYPE_Q8_0: ne11_mm_min = 7; break; + case GGML_TYPE_Q2_K: ne11_mm_min = 15; break; + case GGML_TYPE_Q3_K: ne11_mm_min = 7; break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: ne11_mm_min = 15; break; + case GGML_TYPE_Q4_K: ne11_mm_min = 11; break; + case GGML_TYPE_Q5_0: // not tested yet + case GGML_TYPE_Q5_1: ne11_mm_min = 13; break; // not tested yet + case GGML_TYPE_Q5_K: ne11_mm_min = 7; break; + case GGML_TYPE_Q6_K: ne11_mm_min = 7; break; + default: ne11_mm_min = 1; break; + } + } +#endif + // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel - if (!ggml_is_transposed(src0) && + if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && + !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1t == GGML_TYPE_F32 && - [ctx->device supportsFamily:MTLGPUFamilyApple7] && - ne00%32 == 0 && - ne11 > 2) { + ne00 % 32 == 0 && + ne11 > ne11_mm_min) { + //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); switch (src0->type) { case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f32_f32]; break; case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_f16_f32]; break; @@ -1029,17 +1072,18 @@ void ggml_metal_graph_compute( [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:12]; [encoder setBytes:&gqa length:sizeof(gqa) atIndex:13]; [encoder setThreadgroupMemoryLength:8192 atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake( (ne11+31)/32, (ne01+63) / 64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake( (ne11 + 31)/32, (ne01 + 63)/64, ne12) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { int nth0 = 32; int nth1 = 1; int nrows = 1; + //printf("vector: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); // use custom matrix x vector kernel switch (src0t) { case GGML_TYPE_F32: { - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f32_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f32_f32]; nrows = 4; } break; case GGML_TYPE_F16: @@ -1047,12 +1091,12 @@ void ggml_metal_graph_compute( nth0 = 32; nth1 = 1; if (ne11 * ne12 < 4) { - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_1row]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_1row]; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32_l4]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32_l4]; nrows = ne11; } else { - [encoder setComputePipelineState:ctx->pipeline_mul_mat_f16_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_f16_f32]; nrows = 4; } } break; @@ -1063,7 +1107,7 @@ void ggml_metal_graph_compute( nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_0_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_0_f32]; } break; case GGML_TYPE_Q4_1: { @@ -1072,7 +1116,7 @@ void ggml_metal_graph_compute( nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_1_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_1_f32]; } break; case GGML_TYPE_Q8_0: { @@ -1081,7 +1125,7 @@ void ggml_metal_graph_compute( nth0 = 8; nth1 = 8; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q8_0_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q8_0_f32]; } break; case GGML_TYPE_Q2_K: { @@ -1090,7 +1134,7 @@ void ggml_metal_graph_compute( nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q2_K_f32]; } break; case GGML_TYPE_Q3_K: { @@ -1099,7 +1143,7 @@ void ggml_metal_graph_compute( nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q3_K_f32]; } break; case GGML_TYPE_Q4_K: { @@ -1108,7 +1152,7 @@ void ggml_metal_graph_compute( nth0 = 4; //1; nth1 = 8; //32; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q4_K_f32]; } break; case GGML_TYPE_Q5_K: { @@ -1117,7 +1161,7 @@ void ggml_metal_graph_compute( nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q5_K_f32]; } break; case GGML_TYPE_Q6_K: { @@ -1126,7 +1170,7 @@ void ggml_metal_graph_compute( nth0 = 2; nth1 = 32; - [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32]; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32]; } break; default: { @@ -1155,7 +1199,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&gqa length:sizeof(gqa) atIndex:17]; if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q8_0 || - src0t == GGML_TYPE_Q2_K) {// || src0t == GGML_TYPE_Q4_K) { + src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q4_K) { diff --git a/ggml-metal.metal b/ggml-metal.metal index 9bd94e82bfe1e..b6288db28660d 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -13,8 +13,8 @@ typedef struct { #define QK4_1 32 typedef struct { - half d; // delta - half m; // min + half d; // delta + half m; // min uint8_t qs[QK4_1 / 2]; // nibbles / quants } block_q4_1; @@ -423,8 +423,8 @@ inline float block_q_n_dot_y(device const block_q4_1 * qb_curr, float sumy, thre } // putting them in the kernel cause a significant performance penalty -#define N_DST 4 // each SIMD group works on 4 rows -#define N_SIMDGROUP 2 // number of SIMD groups in a thread group +#define N_DST 4 // each SIMD group works on 4 rows +#define N_SIMDGROUP 2 // number of SIMD groups in a thread group #define N_SIMDWIDTH 32 // assuming SIMD group size is 32 //Note: This is a template, but strictly speaking it only applies to // quantizations where the block size is 32. It also does not @@ -435,18 +435,23 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device int64_t ne00, int64_t ne01, int64_t ne02, int64_t ne10, int64_t ne12, int64_t ne0, int64_t ne1, uint gqa, uint3 tgpig, uint tiisg, uint sgitg) { const int nb = ne00/QK4_0; + const int r0 = tgpig.x; const int r1 = tgpig.y; const int im = tgpig.z; + const int first_row = (r0 * nsg + sgitg) * nr; + const uint offset0 = first_row * nb + im/gqa*(nb*ne0); + device const block_q_type * x = (device const block_q_type *) src0 + offset0; device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; - float yl[16]; // src1 vector cache - float sumf[nr]={0.f}; - const int ix = tiisg/2; - const int il = 8*(tiisg%2); + float yl[16]; // src1 vector cache + float sumf[nr] = {0.f}; + + const int ix = (tiisg/2); + const int il = (tiisg%2)*8; device const float * yb = y + ix * QK4_0 + il; @@ -457,6 +462,7 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device sumy += yb[i] + yb[i+1]; yl[i+0] = yb[i+ 0]; yl[i+1] = yb[i+ 1]/256.f; + sumy += yb[i+16] + yb[i+17]; yl[i+8] = yb[i+16]/16.f; yl[i+9] = yb[i+17]/4096.f; @@ -472,12 +478,12 @@ void mul_vec_q_n_f32(device const void * src0, device const float * src1, device for (int row = 0; row < nr; ++row) { const float tot = simd_sum(sumf[row]); if (tiisg == 0 && first_row + row < ne01) { - dst[r1*ne0 + im*ne0*ne1 + first_row + row] = tot; + dst[im*ne0*ne1 + r1*ne0 + first_row + row] = tot; } } } -kernel void kernel_mul_mat_q4_0_f32( +kernel void kernel_mul_mv_q4_0_f32( device const void * src0, device const float * src1, device float * dst, @@ -490,12 +496,12 @@ kernel void kernel_mul_mat_q4_0_f32( constant int64_t & ne1[[buffer(16)]], constant uint & gqa[[buffer(17)]], uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { mul_vec_q_n_f32(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,gqa,tgpig,tiisg,sgitg); } -kernel void kernel_mul_mat_q4_1_f32( +kernel void kernel_mul_mv_q4_1_f32( device const void * src0, device const float * src1, device float * dst, @@ -515,7 +521,7 @@ kernel void kernel_mul_mat_q4_1_f32( #define NB_Q8_0 8 -kernel void kernel_mul_mat_q8_0_f32( +kernel void kernel_mul_mv_q8_0_f32( device const void * src0, device const float * src1, device float * dst, @@ -579,7 +585,7 @@ kernel void kernel_mul_mat_q8_0_f32( #define N_F32_F32 4 -kernel void kernel_mul_mat_f32_f32( +kernel void kernel_mul_mv_f32_f32( device const char * src0, device const char * src1, device float * dst, @@ -650,7 +656,7 @@ kernel void kernel_mul_mat_f32_f32( } } -kernel void kernel_mul_mat_f16_f32_1row( +kernel void kernel_mul_mv_f16_f32_1row( device const char * src0, device const char * src1, device float * dst, @@ -669,7 +675,7 @@ kernel void kernel_mul_mat_f16_f32_1row( constant int64_t & ne0, constant int64_t & ne1, uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]]) { + uint tiisg[[thread_index_in_simdgroup]]) { const int64_t r0 = tgpig.x; const int64_t r1 = tgpig.y; @@ -704,7 +710,7 @@ kernel void kernel_mul_mat_f16_f32_1row( #define N_F16_F32 4 -kernel void kernel_mul_mat_f16_f32( +kernel void kernel_mul_mv_f16_f32( device const char * src0, device const char * src1, device float * dst, @@ -776,7 +782,7 @@ kernel void kernel_mul_mat_f16_f32( } // Assumes row size (ne00) is a multiple of 4 -kernel void kernel_mul_mat_f16_f32_l4( +kernel void kernel_mul_mv_f16_f32_l4( device const char * src0, device const char * src1, device float * dst, @@ -1253,7 +1259,7 @@ static inline uchar4 get_scale_min_k4(int j, device const uint8_t * q) { //====================================== dot products ========================= -kernel void kernel_mul_mat_q2_K_f32( +kernel void kernel_mul_mv_q2_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1397,7 +1403,7 @@ kernel void kernel_mul_mat_q2_K_f32( } #if QK_K == 256 -kernel void kernel_mul_mat_q3_K_f32( +kernel void kernel_mul_mv_q3_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1549,7 +1555,7 @@ kernel void kernel_mul_mat_q3_K_f32( } } #else -kernel void kernel_mul_mat_q3_K_f32( +kernel void kernel_mul_mv_q3_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1620,7 +1626,7 @@ kernel void kernel_mul_mat_q3_K_f32( #endif #if QK_K == 256 -kernel void kernel_mul_mat_q4_K_f32( +kernel void kernel_mul_mv_q4_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1726,7 +1732,7 @@ kernel void kernel_mul_mat_q4_K_f32( } } #else -kernel void kernel_mul_mat_q4_K_f32( +kernel void kernel_mul_mv_q4_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1815,7 +1821,7 @@ kernel void kernel_mul_mat_q4_K_f32( } #endif -kernel void kernel_mul_mat_q5_K_f32( +kernel void kernel_mul_mv_q5_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -1988,7 +1994,7 @@ kernel void kernel_mul_mat_q5_K_f32( } -kernel void kernel_mul_mat_q6_K_f32( +kernel void kernel_mul_mv_q6_K_f32( device const void * src0, device const float * src1, device float * dst, @@ -2326,7 +2332,7 @@ kernel void kernel_get_rows( } #define BLOCK_SIZE_M 64 // 8 simdgroup matrices from matrix A -#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix A +#define BLOCK_SIZE_N 32 // 4 simdgroup matrices from matrix B #define BLOCK_SIZE_K 32 #define THREAD_MAT_M 4 // each thread take 4 simdgroup matrices from matrix A #define THREAD_MAT_N 2 // each thread take 2 simdgroup matrices from matrix B @@ -2363,9 +2369,11 @@ kernel void kernel_mul_mm(device const uchar * src0, const uint r0 = tgpig.y; const uint r1 = tgpig.x; const uint im = tgpig.z; + // if this block is of 64x32 shape or smaller short n_rows = (ne0 - r0 * BLOCK_SIZE_M < BLOCK_SIZE_M) ? (ne0 - r0 * BLOCK_SIZE_M) : BLOCK_SIZE_M; short n_cols = (ne1 - r1 * BLOCK_SIZE_N < BLOCK_SIZE_N) ? (ne1 - r1 * BLOCK_SIZE_N) : BLOCK_SIZE_N; + // a thread shouldn't load data outside of the matrix short thread_row = ((short)tiitg/THREAD_PER_ROW) < n_rows ? ((short)tiitg/THREAD_PER_ROW) : n_rows - 1; short thread_col = ((short)tiitg/THREAD_PER_COL) < n_cols ? ((short)tiitg/THREAD_PER_COL) : n_cols - 1; @@ -2389,26 +2397,30 @@ kernel void kernel_mul_mm(device const uchar * src0, + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { - //load data and store to threadgroup memory + // load data and store to threadgroup memory half4x4 temp_a; dequantize_func(x, il, temp_a); threadgroup_barrier(mem_flags::mem_threadgroup); + #pragma unroll(16) for (int i = 0; i < 16; i++) { *(sa + SG_MAT_SIZE * ((tiitg / THREAD_PER_ROW / 8) \ - + 16 * (tiitg % THREAD_PER_ROW) + 8 * (i / 8)) \ - + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; + + (tiitg % THREAD_PER_ROW) * 16 + (i / 8) * 8) \ + + (tiitg / THREAD_PER_ROW) % 8 + (i & 7) * 8) = temp_a[i/4][i%4]; } - *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) \ - = *((device float2x4 *)y); + + *(threadgroup float2x4 *)(sb + (tiitg % THREAD_PER_COL) * 8 * 32 + 8 * (tiitg / THREAD_PER_COL)) = *((device float2x4 *)y); + il = (il + 2 < nl) ? il + 2 : il % 2; x = (il < 2) ? x + (2+nl-1)/nl : x; y += BLOCK_SIZE_K; threadgroup_barrier(mem_flags::mem_threadgroup); - //load matrices from threadgroup memory and conduct outer products + + // load matrices from threadgroup memory and conduct outer products threadgroup half * lsma = (sa + THREAD_MAT_M * SG_MAT_SIZE * (sgitg % 2)); threadgroup float * lsmb = (sb + THREAD_MAT_N * SG_MAT_SIZE * (sgitg / 2)); + #pragma unroll(4) for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { #pragma unroll(4) @@ -2423,6 +2435,7 @@ kernel void kernel_mul_mm(device const uchar * src0, lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; lsmb += BLOCK_SIZE_N / SG_MAT_ROW * SG_MAT_SIZE; + #pragma unroll(8) for (int i = 0; i < 8; i++){ simdgroup_multiply_accumulate(c_res[i], mb[i/4], ma[i%4], c_res[i]); @@ -2431,25 +2444,26 @@ kernel void kernel_mul_mm(device const uchar * src0, } if ((r0 + 1) * BLOCK_SIZE_M <= ne0 && (r1 + 1) * BLOCK_SIZE_N <= ne1) { - device float *C = dst + BLOCK_SIZE_M * r0 + 32 * (sgitg&1) \ - + (BLOCK_SIZE_N * r1 + 16 * (sgitg>>1)) * ne0 + im*ne1*ne0; + device float * C = dst + (BLOCK_SIZE_M * r0 + 32 * (sgitg & 1)) \ + + (BLOCK_SIZE_N * r1 + 16 * (sgitg >> 1)) * ne0 + im*ne1*ne0; for (int i = 0; i < 8; i++) { simdgroup_store(c_res[i], C + 8 * (i%4) + 8 * ne0 * (i/4), ne0); } } else { // block is smaller than 64x32, we should avoid writing data outside of the matrix threadgroup_barrier(mem_flags::mem_threadgroup); - threadgroup float *temp_str = ((threadgroup float *)shared_memory) \ + threadgroup float * temp_str = ((threadgroup float *)shared_memory) \ + 32 * (sgitg&1) + (16 * (sgitg>>1)) * BLOCK_SIZE_M; for (int i = 0; i < 8; i++) { simdgroup_store(c_res[i], temp_str + 8 * (i%4) + 8 * BLOCK_SIZE_M * (i/4), BLOCK_SIZE_M); } threadgroup_barrier(mem_flags::mem_threadgroup); - device float *C = dst + BLOCK_SIZE_M * r0 + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; - if (sgitg==0) { + + device float * C = dst + (BLOCK_SIZE_M * r0) + (BLOCK_SIZE_N * r1) * ne0 + im*ne1*ne0; + if (sgitg == 0) { for (int i = 0; i < n_rows; i++) { - for (int j = tiitg; j< n_cols; j += BLOCK_SIZE_N) { + for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { *(C + i + j * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); } } From 7d8b24932fe788a4cda76459a0c5df3e0073cb98 Mon Sep 17 00:00:00 2001 From: Luo Tian Date: Sun, 8 Oct 2023 16:24:01 +0800 Subject: [PATCH 05/26] zig : fix build by introducing train.cpp (#3539) --- build.zig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build.zig b/build.zig index 3a8978bc37fe2..b95491e03f12f 100644 --- a/build.zig +++ b/build.zig @@ -111,12 +111,14 @@ pub fn build(b: *std.build.Builder) !void { const common = make.obj("common", "common/common.cpp"); const console = make.obj("common", "common/console.cpp"); const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); + const train = make.obj("train", "common/train.cpp"); _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser }); _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common }); _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, llama, common, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common, train }); const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); if (server.target.isWindows()) { From 94e502dfb79430870b42b8e8ee132b4aaa93e4a8 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 8 Oct 2023 11:24:50 +0300 Subject: [PATCH 06/26] ci : enable on obj-c changes + fix metal build (#3540) --- .github/workflows/build.yml | 4 ++-- ggml-metal.m | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index c1e36ee28d202..e41be76db0820 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,10 +10,10 @@ on: push: branches: - master - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift'] + paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m'] pull_request: types: [opened, synchronize, reopened] - paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift'] + paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m'] env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} diff --git a/ggml-metal.m b/ggml-metal.m index 57c238ddaaf67..92956ed97e9c9 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -301,12 +301,11 @@ static void ggml_metal_log(enum ggml_log_level level, const char* format, ...){ #if TARGET_OS_OSX // print MTL GPU family: GGML_METAL_LOG_INFO("%s: GPU name: %s\n", __func__, [[ctx->device name] UTF8String]); - GGML_METAL_LOG_INFO("%s: GPU arch: %s\n", __func__, [[ctx->device architecture].name UTF8String]); // determine max supported GPU family // https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf // https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf - for (int i = MTLGPUFamilyApple9 + 10; i >= MTLGPUFamilyApple1; --i) { + for (int i = MTLGPUFamilyApple1 + 20; i >= MTLGPUFamilyApple1; --i) { if ([ctx->device supportsFamily:i]) { GGML_METAL_LOG_INFO("%s: GPU family: MTLGPUFamilyApple%d (%d)\n", __func__, i - MTLGPUFamilyApple1 + 1, i); break; From a1202a31ed8c35705bd09fe91c3e7410c619bd70 Mon Sep 17 00:00:00 2001 From: Johannes Rudolph Date: Sun, 8 Oct 2023 12:21:19 +0200 Subject: [PATCH 07/26] k-quants : fix comments about block sizing (#3499) --- k_quants.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/k_quants.h b/k_quants.h index adc6a391376d4..9de089e7a4719 100644 --- a/k_quants.h +++ b/k_quants.h @@ -29,7 +29,7 @@ // 2-bit quantization // weight is represented as x = a * q + b -// 16 blocks of 16 elemenets each +// 16 blocks of 16 elements each // Effectively 2.5625 bits per weight typedef struct { uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits @@ -41,7 +41,7 @@ static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "w // 3-bit quantization // weight is represented as x = a * q -// 16 blocks of 16 elemenets each +// 16 blocks of 16 elements each // Effectively 3.4375 bits per weight #ifdef GGML_QKK_64 typedef struct { @@ -62,7 +62,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + #endif // 4-bit quantization -// 16 blocks of 32 elements each +// 8 blocks of 32 elements each // weight is represented as x = a * q + b // Effectively 4.5 bits per weight #ifdef GGML_QKK_64 @@ -83,7 +83,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/ #endif // 5-bit quantization -// 16 blocks of 32 elements each +// 8 blocks of 32 elements each // weight is represented as x = a * q + b // Effectively 5.5 bits per weight #ifdef GGML_QKK_64 @@ -107,7 +107,7 @@ static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/ // 6-bit quantization // weight is represented as x = a * q -// 16 blocks of 16 elemenets each +// 16 blocks of 16 elements each // Effectively 6.5625 bits per weight typedef struct { uint8_t ql[QK_K/2]; // quants, lower 4 bits From 9c38d181d40b9d27f8f42152c18e7c70bfffcf37 Mon Sep 17 00:00:00 2001 From: arcrank Date: Sun, 8 Oct 2023 06:52:57 -0400 Subject: [PATCH 08/26] api_like_OAI.py : simplify function (#2796) Simplify function --- examples/server/api_like_OAI.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py index ed19237b0b3e5..1b0bf575706ea 100755 --- a/examples/server/api_like_OAI.py +++ b/examples/server/api_like_OAI.py @@ -23,13 +23,7 @@ args = parser.parse_args() def is_present(json, key): - try: - buf = json[key] - except KeyError: - return False - return True - - + return key in json #convert chat to prompt def convert_chat(messages): From 8e6716a102e390e930594d51302730184dac83cc Mon Sep 17 00:00:00 2001 From: Ryder Wishart Date: Sun, 8 Oct 2023 03:55:58 -0700 Subject: [PATCH 09/26] api_like_OAI.py : compat with Microsoft Guidance (#2746) Check for None in addition to empty string check in all request params Co-authored-by: Georgi Gerganov --- examples/server/api_like_OAI.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py index 1b0bf575706ea..14d2dcf6580e4 100755 --- a/examples/server/api_like_OAI.py +++ b/examples/server/api_like_OAI.py @@ -23,7 +23,13 @@ args = parser.parse_args() def is_present(json, key): - return key in json + try: + buf = json[key] + except KeyError: + return False + if json[key] == None: + return False + return True #convert chat to prompt def convert_chat(messages): From eee42c670e6fa6df9cf17e7ffc319f74cbd81354 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matheus=20C=2E=20Fran=C3=A7a?= Date: Sun, 8 Oct 2023 10:59:20 -0300 Subject: [PATCH 10/26] ci : add Zig CI/CD and fix build (#2996) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * zig CI/CD and fix build Signed-off-by: Matheus Catarino França * fix build_compiler * ci : remove trailing whitespace --------- Signed-off-by: Matheus Catarino França Co-authored-by: Georgi Gerganov --- .github/workflows/zig-build.yml | 25 +++++++++++++++++++++++++ build.zig | 31 ++++++++++++++++++++++++------- 2 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 .github/workflows/zig-build.yml diff --git a/.github/workflows/zig-build.yml b/.github/workflows/zig-build.yml new file mode 100644 index 0000000000000..68a698ab94f55 --- /dev/null +++ b/.github/workflows/zig-build.yml @@ -0,0 +1,25 @@ +name: Zig CI + +on: + pull_request: + push: + branches: + - master + +jobs: + build: + strategy: + fail-fast: false + matrix: + runs-on: [ubuntu-latest, macos-latest, windows-latest] + runs-on: ${{ matrix.runs-on }} + steps: + - uses: actions/checkout@v3 + with: + submodules: recursive + fetch-depth: 0 + - uses: goto-bus-stop/setup-zig@v2 + with: + version: 0.11.0 + - name: Build Summary + run: zig build --summary all -freference-trace diff --git a/build.zig b/build.zig index b95491e03f12f..c86e4c667cf30 100644 --- a/build.zig +++ b/build.zig @@ -36,14 +36,17 @@ const Maker = struct { } fn init(builder: *std.build.Builder) !Maker { - // const commit_hash = @embedFile(".git/refs/heads/master"); const target = builder.standardTargetOptions(.{}); + const zig_version = @import("builtin").zig_version_string; + const commit_hash = try std.ChildProcess.exec( + .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } }, + ); const config_header = builder.addConfigHeader( .{ .style = .blank, .include_path = "build-info.h" }, .{ .BUILD_NUMBER = 0, - .BUILD_COMMIT = "12345", // omit newline - .BUILD_COMPILER = "Zig 0.11.0", + .BUILD_COMMIT = commit_hash.stdout[0 .. commit_hash.stdout.len - 1], // omit newline + .BUILD_COMPILER = builder.fmt("Zig {s}", .{zig_version}), .BUILD_TARGET = try target.allocDescription(builder.allocator), }, ); @@ -67,12 +70,20 @@ const Maker = struct { fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile { const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize }); + if (o.target.getAbi() != .msvc) + o.defineCMacro("_GNU_SOURCE", null); + o.addConfigHeader(m.config_header); if (std.mem.endsWith(u8, src, ".c")) { o.addCSourceFiles(&.{src}, m.cflags.items); o.linkLibC(); } else { o.addCSourceFiles(&.{src}, m.cxxflags.items); - o.linkLibCpp(); + if (o.target.getAbi() == .msvc) { + o.linkLibC(); // need winsdk + crt + } else { + // linkLibCpp already add (libc++ + libunwind + libc) + o.linkLibCpp(); + } } o.addConfigHeader(m.config_header); for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i }); @@ -86,8 +97,14 @@ const Maker = struct { for (deps) |d| e.addObject(d); for (m.objs.items) |o| e.addObject(o); for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i }); - e.linkLibC(); - e.linkLibCpp(); + + // https://github.com/ziglang/zig/issues/15448 + if (e.target.getAbi() == .msvc) { + e.linkLibC(); // need winsdk + crt + } else { + // linkLibCpp already add (libc++ + libunwind + libc) + e.linkLibCpp(); + } e.addConfigHeader(m.config_header); m.builder.installArtifact(e); e.want_lto = m.enable_lto; @@ -109,7 +126,7 @@ pub fn build(b: *std.build.Builder) !void { const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); const llama = make.obj("llama", "llama.cpp"); const common = make.obj("common", "common/common.cpp"); - const console = make.obj("common", "common/console.cpp"); + const console = make.obj("console", "common/console.cpp"); const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); const train = make.obj("train", "common/train.cpp"); From db3abcc114d5d1790ba814aa1a80ac673d4ccc3e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 8 Oct 2023 20:19:14 +0300 Subject: [PATCH 11/26] sync : ggml (ggml-backend) (#3548) * sync : ggml (ggml-backend) ggml-ci * zig : add ggml-backend to the build --- CMakeLists.txt | 2 + Makefile | 7 +- build.zig | 15 +- ggml-alloc.c | 169 +++++--------- ggml-alloc.h | 16 +- ggml-backend.c | 385 +++++++++++++++++++++++++++++++ ggml-backend.h | 143 ++++++++++++ ggml-cuda.cu | 535 ++++++++++++++++++++++++++++++++++++------- ggml-cuda.h | 4 + ggml-metal.h | 19 +- ggml-metal.m | 137 +++++++++++ ggml.c | 37 +-- ggml.h | 16 +- llama.cpp | 44 ++-- scripts/sync-ggml.sh | 24 +- 15 files changed, 1285 insertions(+), 268 deletions(-) create mode 100644 ggml-backend.c create mode 100644 ggml-backend.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c79ec48609b0..9184eda8f2988 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -663,6 +663,8 @@ add_library(ggml OBJECT ggml.h ggml-alloc.c ggml-alloc.h + ggml-backend.c + ggml-backend.h ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} diff --git a/Makefile b/Makefile index b8b0d4b562512..40187c4a25e62 100644 --- a/Makefile +++ b/Makefile @@ -512,9 +512,12 @@ ggml.o: ggml.c ggml.h ggml-cuda.h ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h $(CC) $(CFLAGS) -c $< -o $@ -OBJS += ggml-alloc.o +ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h + $(CC) $(CFLAGS) -c $< -o $@ + +OBJS += ggml-alloc.o ggml-backend.o -llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h +llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ common.o: common/common.cpp common/common.h build-info.h common/log.h diff --git a/build.zig b/build.zig index c86e4c667cf30..fdc5bc084eb46 100644 --- a/build.zig +++ b/build.zig @@ -124,20 +124,21 @@ pub fn build(b: *std.build.Builder) !void { const ggml = make.obj("ggml", "ggml.c"); const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); + const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); const llama = make.obj("llama", "llama.cpp"); const common = make.obj("common", "common/common.cpp"); const console = make.obj("console", "common/console.cpp"); const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); const train = make.obj("train", "common/train.cpp"); - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, llama, common, console, grammar_parser }); - _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, llama, common }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, llama, common }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, llama, common }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, llama, common, train }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, llama, common, train }); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, console, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, llama, common, grammar_parser }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, grammar_parser }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/ggml-alloc.c b/ggml-alloc.c index 805759db74fef..3321f05e2ef94 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -1,4 +1,5 @@ #include "ggml-alloc.h" +#include "ggml-backend.h" #include "ggml.h" #include #include @@ -6,25 +7,6 @@ #include #include -#ifdef __has_include - #if __has_include() - #include - #if defined(_POSIX_MAPPED_FILES) - #include - #include - #endif - #endif -#endif - -#if defined(_WIN32) - #define WIN32_LEAN_AND_MEAN - #ifndef NOMINMAX - #define NOMINMAX - #endif - #include - #include -#endif - #define UNUSED(x) (void)(x) #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -80,8 +62,9 @@ struct free_block { #define MAX_FREE_BLOCKS 256 struct ggml_allocr { + struct ggml_backend_buffer * buffer; + bool buffer_owned; void * data; - size_t size; size_t alignment; int n_free_blocks; struct free_block free_blocks[MAX_FREE_BLOCKS]; @@ -119,16 +102,9 @@ static void remove_allocated_tensor(struct ggml_allocr * alloc, struct ggml_tens } #endif -static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { - return ggml_nbytes(tensor); - - UNUSED(alloc); -} - // check if a tensor is allocated by this buffer static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) { - void * ptr = tensor->data; - return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size; + return tensor->buffer == alloc->buffer; } static bool ggml_is_view(struct ggml_tensor * t) { @@ -136,11 +112,10 @@ static bool ggml_is_view(struct ggml_tensor * t) { } void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { -#ifdef GGML_ALLOCATOR_DEBUG GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated -#endif - size_t size = ggml_allocr_get_alloc_size(alloc, tensor); + + size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); @@ -188,6 +163,8 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) tensor->data = addr; AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data); + tensor->buffer = alloc->buffer; + ggml_backend_buffer_init_tensor(alloc->buffer, tensor); #ifdef GGML_ALLOCATOR_DEBUG add_allocated_tensor(alloc, tensor); @@ -208,19 +185,21 @@ void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) // this is a very naive implementation, but for our case the number of free blocks should be very small static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) { - void * ptr = tensor->data; - if (ggml_allocr_is_own(alloc, tensor) == false) { // the tensor was not allocated in this buffer // this can happen because the graph allocator will try to free weights and other tensors from different buffers // the easiest way to deal with this is just to ignore it + AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer); return; } - size_t size = ggml_allocr_get_alloc_size(alloc, tensor); + void * ptr = tensor->data; + + size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); - AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size); + + ggml_backend_buffer_free_tensor(alloc->buffer, tensor); #ifdef GGML_ALLOCATOR_DEBUG remove_allocated_tensor(alloc, tensor); @@ -285,15 +264,18 @@ void ggml_allocr_reset(struct ggml_allocr * alloc) { alloc->n_free_blocks = 1; size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment); alloc->free_blocks[0].addr = (char *)alloc->data + align_offset; - alloc->free_blocks[0].size = alloc->size - align_offset; + alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; } struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) { - struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); + struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size); + + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr)); *alloc = (struct ggml_allocr){ - /*.data = */ data, - /*.size = */ size, + /*.buffer = */ buffer, + /*.buffer_owned = */ true, + /*.base = */ ggml_backend_buffer_get_base(buffer), /*.alignment = */ alignment, /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, @@ -312,74 +294,26 @@ struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) return alloc; } -// OS specific functions to allocate and free uncommitted virtual memory -static void * alloc_vmem(size_t size) { -#if defined(_WIN32) - return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS); -#elif defined(_POSIX_MAPPED_FILES) - void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0); - if (ptr == MAP_FAILED) { - return NULL; - } - return ptr; -#else - // use a fixed address for other platforms - uintptr_t base_addr = (uintptr_t)-size - 0x100; - return (void *)base_addr; -#endif -} - -static void free_vmem(void * base_addr, size_t size) { -#if defined(_WIN32) - VirtualFree(base_addr, 0, MEM_RELEASE); - UNUSED(size); -#elif defined(_POSIX_MAPPED_FILES) - munmap(base_addr, size); -#else - // nothing to do - UNUSED(base_addr); - UNUSED(size); -#endif -} - -// allocate uncommitted virtual memory to measure the size of the graph -static void alloc_measure_vmem(void ** base_addr, size_t * size) { - // 128GB for 64-bit, 1GB for 32-bit - *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37; - do { - *base_addr = alloc_vmem(*size); - if (*base_addr != NULL) { - AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr); - return; - } - // try again with half the size - *size /= 2; - } while (*size > 0); - - GGML_ASSERT(!"failed to allocate virtual memory for measure buffer"); -} - -static void free_measure_vmem(void * base_addr, size_t size) { - free_vmem(base_addr, size); -} - struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { - struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */); + struct ggml_allocr * alloc = ggml_allocr_new((void *)0x1000, (size_t)-0x1001, alignment); + alloc->measure = true; - void * base_addr; - size_t size; + return alloc; +} - alloc_measure_vmem(&base_addr, &size); +struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) { + struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr)); *alloc = (struct ggml_allocr){ - /*.data = */ base_addr, - /*.size = */ size, - /*.alignment = */ alignment, + /*.buffer = */ buffer, + /*.buffer_owned = */ false, + /*.base = */ ggml_backend_buffer_get_base(buffer), + /*.alignment = */ ggml_backend_buffer_get_alignment(buffer), /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, /*.hash_table = */ {{0}}, /*.max_size = */ 0, - /*.measure = */ true, + /*.measure = */ false, /*.parse_seq = */ {0}, /*.parse_seq_len = */ 0, #ifdef GGML_ALLOCATOR_DEBUG @@ -393,8 +327,8 @@ struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) { } void ggml_allocr_free(struct ggml_allocr * alloc) { - if (alloc->measure) { - free_measure_vmem(alloc->data, alloc->size); + if (alloc->buffer_owned) { + ggml_backend_buffer_free(alloc->buffer); } free(alloc); } @@ -437,7 +371,6 @@ static bool ggml_op_can_inplace(enum ggml_op op) { case GGML_OP_ROPE: case GGML_OP_RMS_NORM: case GGML_OP_SOFT_MAX: - case GGML_OP_CONT: return true; default: @@ -445,12 +378,23 @@ static bool ggml_op_can_inplace(enum ggml_op op) { } } +static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) { + assert(view->view_src != NULL && view->view_src->data != NULL); + view->backend = view->view_src->backend; + view->buffer = view->view_src->buffer; + view->data = (char *)view->view_src->data + view->view_offs; + + // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend + // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras + assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == alloc->buffer->backend); + ggml_backend_buffer_init_tensor(alloc->buffer, view); +} + static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) { struct hash_node * ht = alloc->hash_table; if (node->data == NULL) { if (ggml_is_view(node)) { - assert(node->view_src->data != NULL); - node->data = (char *)node->view_src->data + node->view_offs; + init_view(alloc, node); } else { // see if we can reuse a parent's buffer (inplace) if (ggml_op_can_inplace(node->op)) { @@ -478,13 +422,17 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); - node->data = parent->data; + node->view_src = view_src; + view_src_hn->n_views += 1; + init_view(alloc, node); return; } } else { AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); - node->data = parent->data; + node->view_src = parent; + p_hn->n_views += 1; + init_view(alloc, node); return; } } @@ -495,7 +443,7 @@ static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) } } -static size_t ggml_allocr_alloc_graph_tensors_n( +size_t ggml_allocr_alloc_graph_n( struct ggml_allocr * alloc, struct ggml_cgraph ** graphs, int n_graphs, struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) { @@ -513,6 +461,10 @@ static size_t ggml_allocr_alloc_graph_tensors_n( if (ggml_is_view(node)) { struct ggml_tensor * view_src = node->view_src; hash_get(ht, view_src)->n_views += 1; + if (node->buffer == NULL && node->data != NULL) { + // view of a pre-allocated tensor, didn't call init_view() yet + init_view(alloc, node); + } } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -521,6 +473,9 @@ static size_t ggml_allocr_alloc_graph_tensors_n( break; } hash_get(ht, parent)->n_children += 1; + if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { + init_view(alloc, parent); + } } } } @@ -631,7 +586,7 @@ static size_t ggml_allocr_alloc_graph_tensors_n( } size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) { - return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL); + return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL); } size_t ggml_allocr_max_size(struct ggml_allocr * alloc) { diff --git a/ggml-alloc.h b/ggml-alloc.h index 0c224f174f396..e38758878b91a 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -6,21 +6,27 @@ extern "C" { #endif +struct ggml_backend_buffer; GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment); GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment); +GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer); // tell the allocator to parse nodes following the order described in the list // you should call this if your graph are optimized to execute out-of-order GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n); -GGML_API void ggml_allocr_free(struct ggml_allocr * alloc); -GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc); -GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc); -GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor); +GGML_API void ggml_allocr_free (struct ggml_allocr * alloc); +GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc); +GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc); +GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor); GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph); -GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc); +GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc); +GGML_API size_t ggml_allocr_alloc_graph_n( + struct ggml_allocr * alloc, + struct ggml_cgraph ** graphs, int n_graphs, + struct ggml_tensor *** inputs, struct ggml_tensor *** outputs); #ifdef __cplusplus } diff --git a/ggml-backend.c b/ggml-backend.c new file mode 100644 index 0000000000000..ca8d83dafe47c --- /dev/null +++ b/ggml-backend.c @@ -0,0 +1,385 @@ +#include "ggml-backend.h" +#include "ggml-alloc.h" + +#include +#include +#include +#include +#include + +#define UNUSED GGML_UNUSED + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +// backend buffer + +ggml_backend_buffer_t ggml_backend_buffer_init( + struct ggml_backend * backend, + struct ggml_backend_buffer_i iface, + ggml_backend_buffer_context_t context, + size_t size) { + ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer)); + + GGML_ASSERT(iface.get_base != NULL); + + (*buffer) = (struct ggml_backend_buffer) { + /* .interface = */ iface, + /* .backend = */ backend, + /* .context = */ context, + /* .size = */ size, + }; + + return buffer; +} + +void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { + if (buffer->iface.free_buffer != NULL) { + buffer->iface.free_buffer(buffer); + } + free(buffer); +} + +size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) { + return ggml_backend_get_alignment(buffer->backend); +} + +void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) { + return buffer->iface.get_base(buffer); +} + +size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) { + return buffer->size; +} + +size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->iface.get_alloc_size) { + return buffer->iface.get_alloc_size(buffer, tensor); + } + return ggml_nbytes(tensor); +} + +void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->iface.init_tensor) { + buffer->iface.init_tensor(buffer, tensor); + } +} + +void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { + if (buffer->iface.free_tensor) { + buffer->iface.free_tensor(buffer, tensor); + } +} + +// backend + +ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) { + return tensor->buffer->backend; +} + +const char * ggml_backend_name(ggml_backend_t backend) { + return backend->iface.get_name(backend); +} + +void ggml_backend_free(ggml_backend_t backend) { + backend->iface.free(backend); +} + +ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) { + return backend->iface.alloc_buffer(backend, size); +} + +size_t ggml_backend_get_alignment(ggml_backend_t backend) { + return backend->iface.get_alignment(backend); +} + +void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); +} + +void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); +} + +void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->iface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); + ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor)); +} + +void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_get_backend(tensor)->iface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size); + ggml_get_backend(tensor)->iface.synchronize(ggml_get_backend(tensor)); +} + +void ggml_backend_synchronize(ggml_backend_t backend) { + backend->iface.synchronize(backend); +} + +ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + return backend->iface.graph_plan_create(backend, cgraph); +} + +void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + backend->iface.graph_plan_free(backend, plan); +} + +void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + backend->iface.graph_plan_compute(backend, plan); +} + +void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + backend->iface.graph_compute(backend, cgraph); +} + +bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return backend->iface.supports_op(backend, op); +} + +// backend copy + +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) { + //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]); + //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]); + GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); + + // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src)); + + if (src == dst) { + return; + } + + // TODO: allow backends to support copy to/from same backend + + if (ggml_get_backend(dst)->iface.cpy_tensor_from != NULL) { + ggml_get_backend(dst)->iface.cpy_tensor_from(ggml_get_backend(dst)->context, src, dst); + } else if (ggml_get_backend(src)->iface.cpy_tensor_to != NULL) { + ggml_get_backend(src)->iface.cpy_tensor_to(ggml_get_backend(src)->context, src, dst); + } else { + // shouldn't be hit when copying from/to CPU + #ifndef NDEBUG + fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend)); + #endif + size_t nbytes = ggml_nbytes(src); + void * data = malloc(nbytes); + ggml_backend_tensor_get(src, data, 0, nbytes); + ggml_backend_tensor_set(dst, data, 0, nbytes); + free(data); + } +} + +// backend CPU + +struct ggml_backend_cpu_context { + int n_threads; + void * work_data; + size_t work_size; +}; + +static const char * ggml_backend_cpu_name(ggml_backend_t backend) { + return "CPU"; + + UNUSED(backend); +} + +static void ggml_backend_cpu_free(ggml_backend_t backend) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + free(cpu_ctx->work_data); + free(cpu_ctx); + free(backend); +} + +static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *)buffer->context; +} + +static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); + UNUSED(buffer); +} + +static struct ggml_backend_buffer_i cpu_backend_buffer_i = { + /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, // no initialization required + /* .free_tensor = */ NULL, // no cleanup required +}; + +// for buffers from ptr, free is not called +static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { + /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed + /* .get_base = */ ggml_backend_cpu_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, + /* .free_tensor = */ NULL, +}; + +static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 + +static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) { + size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned + void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? + + return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size); +} + +static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) { + return TENSOR_ALIGNMENT; + UNUSED(backend); +} + +static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(backend); +} + +static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(backend); +} + +static void ggml_backend_cpu_synchronize(ggml_backend_t backend) { + UNUSED(backend); +} + +static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends + ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +struct ggml_backend_plan_cpu { + struct ggml_cplan cplan; + struct ggml_cgraph cgraph; +}; + +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); + + cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + cpu_plan->cgraph = *cgraph; + + if (cpu_plan->cplan.work_size > 0) { + cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + } + + return cpu_plan; +} + +static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; + + free(cpu_plan->cplan.work_data); + free(cpu_plan); + + UNUSED(backend); +} + +static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan; + + ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan); + + UNUSED(backend); +} + +static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; + + struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads); + + if (cpu_ctx->work_size < cplan.work_size) { + // TODO: may be faster to free and use malloc to avoid the copy + cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size); + cpu_ctx->work_size = cplan.work_size; + } + + cplan.work_data = cpu_ctx->work_data; + + ggml_graph_compute(cgraph, &cplan); +} + +static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return true; + UNUSED(backend); + UNUSED(op); +} + +static struct ggml_backend_i cpu_backend_i = { + /* .get_name = */ ggml_backend_cpu_name, + /* .free = */ ggml_backend_cpu_free, + /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_get_alignment, + /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async, + /* .synchronize = */ ggml_backend_cpu_synchronize, + /* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from, + /* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to, + /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, + /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, + /* .graph_compute = */ ggml_backend_cpu_graph_compute, + /* .supports_op = */ ggml_backend_cpu_supports_op, +}; + +ggml_backend_t ggml_backend_cpu_init(void) { + struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + + ctx->n_threads = GGML_DEFAULT_N_THREADS; + ctx->work_data = NULL; + ctx->work_size = 0; + + ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); + + *cpu_backend = (struct ggml_backend) { + /* .interface = */ cpu_backend_i, + /* .context = */ ctx + }; + return cpu_backend; +} + +bool ggml_backend_is_cpu(ggml_backend_t backend) { + return backend->iface.get_name == ggml_backend_cpu_name; +} + +void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { + GGML_ASSERT(ggml_backend_is_cpu(backend_cpu)); + + struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context; + ctx->n_threads = n_threads; +} + +ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) { + return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size); +} diff --git a/ggml-backend.h b/ggml-backend.h new file mode 100644 index 0000000000000..da134b0dbed51 --- /dev/null +++ b/ggml-backend.h @@ -0,0 +1,143 @@ +#pragma once + +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + struct ggml_backend; + struct ggml_backend_buffer; + + // type-erased backend-specific types / wrappers + typedef void * ggml_backend_context_t; + typedef void * ggml_backend_graph_plan_t; + typedef void * ggml_backend_buffer_context_t; + + // avoid accessing internals of these types + typedef struct ggml_backend * ggml_backend_t; + typedef struct ggml_backend_buffer * ggml_backend_buffer_t; + + // + // backend buffer + // + + struct ggml_backend_buffer_i { + void (*free_buffer) (ggml_backend_buffer_t buffer); + void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer + size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback + void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback + void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback + }; + + // TODO: hide behind API + struct ggml_backend_buffer { + struct ggml_backend_buffer_i iface; + + ggml_backend_t backend; + ggml_backend_buffer_context_t context; + + size_t size; + }; + + // backend buffer functions + GGML_API ggml_backend_buffer_t ggml_backend_buffer_init( + struct ggml_backend * backend, + struct ggml_backend_buffer_i iface, + ggml_backend_buffer_context_t context, + size_t size); + + GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_free_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + + // + // backend + // + + struct ggml_backend_i { + const char * (*get_name)(ggml_backend_t backend); + + void (*free)(ggml_backend_t backend); + + // buffer allocation + ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size); + + // get buffer alignment + size_t (*get_alignment)(ggml_backend_t backend); + + // tensor data access + // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize + void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + void (*synchronize) (ggml_backend_t backend); + + // (optional) copy tensor between different backends, allow for single-copy tranfers + void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); + + // compute graph with a plan + ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + + // compute graph without a plan + void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); + + // check if the backend supports an operation + bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + }; + + // TODO: hide behind API + struct ggml_backend { + struct ggml_backend_i iface; + + ggml_backend_context_t context; + }; + + // backend helper functions + GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor); + + GGML_API const char * ggml_backend_name(ggml_backend_t backend); + GGML_API void ggml_backend_free(ggml_backend_t backend); + + GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size); + + GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend); + + GGML_API void ggml_backend_tensor_set_async( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + + GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + + GGML_API void ggml_backend_synchronize(ggml_backend_t backend); + + GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph); + + GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op); + + // tensor copy between different backends + GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); + + // + // CPU backend + // + + GGML_API ggml_backend_t ggml_backend_cpu_init(void); + + GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); + + GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); + + GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size); + +#ifdef __cplusplus +} +#endif diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 989c419cd0ea4..7e92c519741b9 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -62,6 +62,7 @@ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaMemcpyKind hipMemcpyKind #define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags @@ -419,6 +420,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 #define CUDA_QUANTIZE_BLOCK_SIZE 256 #define CUDA_DEQUANTIZE_BLOCK_SIZE 256 +#define CUDA_GET_ROWS_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec #ifndef GGML_CUDA_DMMV_X @@ -1574,6 +1576,34 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest reinterpret_cast(y[ib].ds.y) = sum; } +template +static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) { + const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2; + const int row = blockDim.y*blockIdx.y + threadIdx.y; + + if (col >= ncols) { + return; + } + + const int r = y[row]; + + // copy x[r*ncols + col] to dst[row*ncols + col] + const int xi = r*ncols + col; + const int di = row*ncols + col; + + const int ib = xi/qk; // block index + const int iqs = (xi%qk)/qr; // quant index + const int iybs = di - di%qk; // y block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(x, ib, iqs, v); + + dst[iybs + iqs + 0] = v.x; + dst[iybs + iqs + y_offset] = v.y; +} + template static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { const int i = blockDim.x*blockIdx.x + 2*threadIdx.x; @@ -4555,6 +4585,15 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale dst[i] = scale * x[i]; } + +template +static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); + const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); + const dim3 block_nums(block_num_x, nrows, 1); + k_get_rows<<>>(x, y, dst, ncols); +} + static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) { const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE; add_f32<<>>(x, y, dst, kx, ky); @@ -5703,7 +5742,7 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) { GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); kind = cudaMemcpyDeviceToDevice; - struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; CUDA_CHECK(cudaGetDevice(&id)); src_ptr = (char *) extra->data_device[id]; @@ -5739,6 +5778,107 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( } } +static void ggml_cuda_op_repeat( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) { + // guaranteed to be an integer due to the check in ggml_can_repeat + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const size_t nb0 = dst->nb[0]; + const size_t nb1 = dst->nb[1]; + const size_t nb2 = dst->nb[2]; + const size_t nb3 = dst->nb[3]; + + const size_t nb00 = src0->nb[0]; + const size_t nb01 = src0->nb[1]; + const size_t nb02 = src0->nb[2]; + const size_t nb03 = src0->nb[3]; + + const int nr0 = (int)(ne0/ne00); + const int nr1 = (int)(ne1/ne01); + const int nr2 = (int)(ne2/ne02); + const int nr3 = (int)(ne3/ne03); + + // TODO: support for transposed / permuted tensors + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // TODO: very inefficient, implement in a kernel, or fewer cudaMemcpyAsync calls for contiguous tensors + for (int i3 = 0; i3 < nr3; i3++) { + for (int k3 = 0; k3 < ne03; k3++) { + for (int i2 = 0; i2 < nr2; i2++) { + for (int k2 = 0; k2 < ne02; k2++) { + for (int i1 = 0; i1 < nr1; i1++) { + for (int k1 = 0; k1 < ne01; k1++) { + for (int i0 = 0; i0 < nr0; i0++) { + CUDA_CHECK(cudaMemcpyAsync( + (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0, + (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01, + ne00*nb0, cudaMemcpyDeviceToDevice, stream)); + } + } + } + } + } + } + } + + (void) src1; + (void) src1_d; +} + +static void ggml_cuda_op_get_rows( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) { + + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + const int ncols = src0->ne[0]; + const int nrows = ggml_nelements(src1); + + const int32_t * src1_i32 = (const int32_t *) src1_d; + + switch (src0->type) { + case GGML_TYPE_F16: + get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_F32: + get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q4_0: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q4_1: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q5_0: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q5_1: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + case GGML_TYPE_Q8_0: + get_rows_cuda(src0_d, src1_i32, dst_d, nrows, ncols, stream); + break; + default: + // TODO: k-quants + GGML_ASSERT(false); + break; + } +} + inline void ggml_cuda_op_add( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { @@ -6343,7 +6483,14 @@ inline void ggml_cuda_op_scale( GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - const float scale = ((float *) src1->data)[0]; + float scale; + // HACK: support for ggml backend interface + if (src1->backend == GGML_BACKEND_CPU) { + scale = ((float *) src1->data)[0]; + } else { + // TODO: pass pointer to kernel instead of copying to host + CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost)); + } scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); @@ -6362,9 +6509,9 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT); - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU; @@ -6505,9 +6652,9 @@ static void ggml_cuda_op_mul_mat( const size_t q8_1_ts = sizeof(block_q8_1); const size_t q8_1_bs = QK8_1; - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); @@ -6585,7 +6732,7 @@ static void ggml_cuda_op_mul_mat( if (convert_src1_to_q8_1) { src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); - if (split && src1_on_device && src1_is_contiguous) { + if (src1_on_device && src1_is_contiguous) { quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } @@ -6667,7 +6814,7 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) { + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) { quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } @@ -6758,6 +6905,14 @@ static void ggml_cuda_op_mul_mat( } } +static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat); +} + +static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows); +} + static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); } @@ -6812,13 +6967,13 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); @@ -6843,13 +6998,13 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; void * src0_ddq = src0_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; - struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; const int64_t row_stride_x = nb01 / sizeof(half); @@ -6870,11 +7025,11 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } - if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_p021(src0, src1, dst); } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - }else if (src0->type == GGML_TYPE_F32) { + } else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) { @@ -6935,8 +7090,8 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; @@ -6991,8 +7146,8 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { const size_t nb1 = tensor->nb[1]; - ggml_backend backend = tensor->backend; - struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; + ggml_backend_type backend = tensor->backend; + ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); for (int64_t id = 0; id < g_device_count; ++id) { @@ -7046,7 +7201,6 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); } - CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); extra->data_device[id] = buf; @@ -7085,17 +7239,17 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) { delete extra; } -static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; +static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; static size_t g_temp_tensor_extra_index = 0; -static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { +static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { if (g_temp_tensor_extras == nullptr) { g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; } size_t alloc_index = g_temp_tensor_extra_index; g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES; - struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; + ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; memset(extra, 0, sizeof(*extra)); return extra; @@ -7123,7 +7277,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra return; } - struct ggml_tensor_extra_gpu * extra; + ggml_tensor_extra_gpu * extra; const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || tensor->op == GGML_OP_VIEW || @@ -7132,7 +7286,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra CUDA_CHECK(ggml_cuda_set_device(g_main_device)); if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t offset = 0; if (tensor->op == GGML_OP_VIEW) { @@ -7141,7 +7295,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scra extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src0_ddc + offset; } else if (tensor->op == GGML_OP_CPY) { - struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; void * src1_ddv = src1_extra->data_device[g_main_device]; extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src1_ddv; @@ -7183,13 +7337,13 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); } - struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); + ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || tensor->op == GGML_OP_VIEW; if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t view_offset = 0; if (tensor->op == GGML_OP_VIEW) { @@ -7207,7 +7361,7 @@ void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); GGML_ASSERT(ggml_is_contiguous(tensor)); - struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice)); } @@ -7264,58 +7418,47 @@ void ggml_cuda_free_scratch() { g_scratch_buffer = nullptr; } -bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){ +bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { ggml_cuda_func_t func; const bool any_on_device = tensor->backend == GGML_BACKEND_GPU || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU); + if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) { + return false; + } + switch (tensor->op) { + case GGML_OP_REPEAT: + func = ggml_cuda_repeat; + break; + case GGML_OP_GET_ROWS: + func = ggml_cuda_get_rows; + break; case GGML_OP_DUP: - if (!any_on_device) { - return false; - } func = ggml_cuda_dup; break; case GGML_OP_ADD: - if (!any_on_device) { - return false; - } func = ggml_cuda_add; break; case GGML_OP_MUL: - if (!any_on_device) { - return false; - } func = ggml_cuda_mul; break; case GGML_OP_UNARY: switch (ggml_get_unary_op(tensor)) { case GGML_UNARY_OP_GELU: - if (!any_on_device) { - return false; - } func = ggml_cuda_gelu; break; case GGML_UNARY_OP_SILU: - if (!any_on_device) { - return false; - } func = ggml_cuda_silu; break; default: return false; } break; case GGML_OP_NORM: - if (!any_on_device) { - return false; - } func = ggml_cuda_norm; break; case GGML_OP_RMS_NORM: - if (!any_on_device) { - return false; - } func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: @@ -7325,54 +7468,30 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ func = ggml_cuda_mul_mat; break; case GGML_OP_SCALE: - if (!any_on_device) { - return false; - } func = ggml_cuda_scale; break; case GGML_OP_CPY: - if (!any_on_device) { - return false; - } func = ggml_cuda_cpy; break; case GGML_OP_CONT: - if (!any_on_device) { - return false; - } func = ggml_cuda_dup; break; case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - if (!any_on_device) { - return false; - } func = ggml_cuda_nop; break; case GGML_OP_DIAG_MASK_INF: - if (!any_on_device) { - return false; - } func = ggml_cuda_diag_mask_inf; break; case GGML_OP_SOFT_MAX: - if (!any_on_device) { - return false; - } func = ggml_cuda_soft_max; break; case GGML_OP_ROPE: - if (!any_on_device) { - return false; - } func = ggml_cuda_rope; break; case GGML_OP_ALIBI: - if (!any_on_device) { - return false; - } func = ggml_cuda_alibi; break; default: @@ -7400,3 +7519,263 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); snprintf(description, description_size, "%s", prop.name); } + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +#define UNUSED GGML_UNUSED + +struct ggml_backend_context_cuda { +}; + +static const char * ggml_backend_cuda_name(ggml_backend_t backend) { + return GGML_CUDA_NAME; + + UNUSED(backend); +} + +static void ggml_backend_cuda_free(ggml_backend_t backend) { + ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + delete cuda_ctx; + delete backend; +} + +struct ggml_backend_buffer_context_cuda { + void * device; + + ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; + size_t temp_tensor_extra_index = 0; + + ~ggml_backend_buffer_context_cuda() { + delete[] temp_tensor_extras; + } + + ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + if (temp_tensor_extras == nullptr) { + temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES]; + } + + size_t alloc_index = temp_tensor_extra_index; + temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES; + ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; + memset(extra, 0, sizeof(*extra)); + + return extra; + } +}; + +static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + CUDA_CHECK(cudaFree(ctx->device)); + delete ctx; +} + +static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + return ctx->device; +} + +static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + size_t size = ggml_nbytes_split(tensor, nrows_split); + + int64_t ne0 = tensor->ne[0]; + + if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) + * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); + } + } + + return size; + + UNUSED(buffer); +} + +static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + + if (tensor->view_src != NULL && tensor->view_offs == 0) { + assert(tensor->view_src->buffer->backend == buffer->backend); + tensor->backend = tensor->view_src->backend; + tensor->extra = tensor->view_src->extra; + return; + } + + ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra(); + + extra->data_device[g_main_device] = tensor->data; + + tensor->backend = GGML_BACKEND_GPU; + tensor->extra = extra; + + if (ggml_is_quantized(tensor->type)) { + // initialize padding to 0 to avoid possible NaN values + int64_t row_low = 0; + int64_t row_high = ggml_nrows(tensor); + int64_t nrows_split = row_high - row_low; + + size_t original_size = ggml_nbytes_split(tensor, nrows_split); + size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor); + + if (padded_size > original_size && tensor->view_src == nullptr) { + CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0])); + } + } + + UNUSED(buffer); +} + +static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { + /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, + /* .get_base = */ ggml_backend_cuda_buffer_get_base, + /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size, + /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, + /* .free_tensor = */ NULL, +}; + +static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) { + ggml_cuda_set_device(g_main_device); + + ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda; + CUDA_CHECK(cudaMalloc(&ctx->device, size)); + return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size); +} + +static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) { + return 128; + UNUSED(backend); +} + +static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0])); + + UNUSED(backend); +} + +static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); + + CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); + + UNUSED(backend); +} + +static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { + CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); + + UNUSED(backend); +} + +static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { + GGML_ASSERT(!"not implemented"); + + return nullptr; + + UNUSED(backend); + UNUSED(cgraph); +} + +static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + GGML_ASSERT(!"not implemented"); + + UNUSED(backend); + UNUSED(plan); +} + +static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { + GGML_ASSERT(!"not implemented"); + + UNUSED(backend); + UNUSED(plan); +} + +static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { + ggml_cuda_set_device(g_main_device); + + ggml_compute_params params = {}; + params.type = GGML_TASK_COMPUTE; + params.ith = 0; + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + + assert(node->backend == GGML_BACKEND_GPU); + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] != nullptr) { + assert(node->src[j]->backend == GGML_BACKEND_GPU); + } + } + + bool ok = ggml_cuda_compute_forward(¶ms, node); + if (!ok) { + fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } + GGML_ASSERT(ok); + +#if 0 + if (node->type == GGML_TYPE_F32) { + cudaDeviceSynchronize(); + std::vector tmp(ggml_nelements(node), 0.0f); + cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost); + printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op), + ggml_type_name(node->src[0]->type), + node->src[1] ? ggml_type_name(node->src[1]->type) : "none", + node->src[0]->name, + node->src[1] ? node->src[1]->name : "none"); + double sum = 0.0; + double sq_sum = 0.0; + for (int i = 0; i < ggml_nelements(node); i++) { + printf("%f ", tmp[i]); + sum += tmp[i]; + sq_sum += tmp[i]*tmp[i]; + } + printf("\n"); + printf("sum: %f, ", sum); + printf("sq_sum: %f\n", sq_sum); + } +#endif + } + + UNUSED(backend); +} + +static ggml_backend_i cuda_backend_i = { + /* .get_name = */ ggml_backend_cuda_name, + /* .free = */ ggml_backend_cuda_free, + /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer, + /* .get_alignment = */ ggml_backend_cuda_get_alignment, + /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, + /* .synchronize = */ ggml_backend_cuda_synchronize, + /* .cpy_tensor_from = */ nullptr, + /* .cpy_tensor_to = */ nullptr, + /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, + /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, + /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, + /* .graph_compute = */ ggml_backend_cuda_graph_compute, + /* .supports_op = */ nullptr, +}; + +ggml_backend_t ggml_backend_cuda_init() { + ggml_init_cublas(); // TODO: remove from ggml.c + + ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda; + + ggml_backend_t cuda_backend = new ggml_backend { + /* .interface = */ cuda_backend_i, + /* .context = */ ctx + }; + + return cuda_backend; +} diff --git a/ggml-cuda.h b/ggml-cuda.h index fda704b665623..57adc9cf34bc5 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #ifdef GGML_USE_HIPBLAS #define GGML_CUDA_NAME "ROCm" @@ -42,6 +43,9 @@ GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, s GGML_API int ggml_cuda_get_device_count(void); GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size); +// backend API +GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use + #ifdef __cplusplus } #endif diff --git a/ggml-metal.h b/ggml-metal.h index 790cf0bf7b963..096b844e32c6f 100644 --- a/ggml-metal.h +++ b/ggml-metal.h @@ -20,6 +20,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #include #include @@ -35,10 +36,15 @@ struct ggml_cgraph; extern "C" { #endif -void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); +// +// internal API +// temporary exposed to user-code +// struct ggml_metal_context; +void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data); + // number of command buffers to use struct ggml_metal_context * ggml_metal_init(int n_cb); void ggml_metal_free(struct ggml_metal_context * ctx); @@ -83,6 +89,17 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx); // creates gf->n_threads command buffers in parallel void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf); +// +// backend API +// user-code should use only these functions +// + +GGML_API ggml_backend_t ggml_backend_metal_init(void); + +GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); + +GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb); + #ifdef __cplusplus } #endif diff --git a/ggml-metal.m b/ggml-metal.m index 92956ed97e9c9..29cb3c922daeb 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1456,3 +1456,140 @@ void ggml_metal_graph_compute( } } + +//////////////////////////////////////////////////////////////////////////////// + +// backend interface + +static const char * ggml_backend_metal_name(ggml_backend_t backend) { + return "Metal"; + + UNUSED(backend); +} + +static void ggml_backend_metal_free(ggml_backend_t backend) { + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + ggml_metal_free(ctx); + free(backend); +} + +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { + return (void *)buffer->context; +} + +static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { + free(buffer->context); + UNUSED(buffer); +} + +static struct ggml_backend_buffer_i metal_backend_buffer_i = { + /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, + /* .get_base = */ ggml_backend_metal_buffer_get_base, + /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .init_tensor = */ NULL, // no initialization required + /* .free_tensor = */ NULL, // no cleanup required +}; + +static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) { + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + + void * data = ggml_metal_host_malloc(size); + + // TODO: set proper name of the buffers + ggml_metal_add_buffer(ctx, "backend", data, size, 0); + + return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size); +} + +static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) { + return 32; + UNUSED(backend); +} + +static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy((char *)tensor->data + offset, data, size); + + UNUSED(backend); +} + +static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); + + memcpy(data, (const char *)tensor->data + offset, size); + + UNUSED(backend); +} + +static void ggml_backend_metal_synchronize(ggml_backend_t backend) { + UNUSED(backend); +} + +static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src)); + + UNUSED(backend); +} + +static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context; + + ggml_metal_graph_compute(metal_ctx, cgraph); +} + +static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { + return true; + UNUSED(backend); + UNUSED(op); +} + +static struct ggml_backend_i metal_backend_i = { + /* .get_name = */ ggml_backend_metal_name, + /* .free = */ ggml_backend_metal_free, + /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer, + /* .get_alignment = */ ggml_backend_metal_get_alignment, + /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async, + /* .synchronize = */ ggml_backend_metal_synchronize, + /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from, + /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to, + /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_metal_graph_compute, + /* .supports_op = */ ggml_backend_metal_supports_op, +}; + +ggml_backend_t ggml_backend_metal_init(void) { + struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context)); + + ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS); + + ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); + + *metal_backend = (struct ggml_backend) { + /* .interface = */ metal_backend_i, + /* .context = */ ctx, + }; + + return metal_backend; +} + +bool ggml_backend_is_metal(ggml_backend_t backend) { + return backend->iface.get_name == ggml_backend_metal_name; +} + +void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { + struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context; + + ggml_metal_set_n_cb(ctx, n_cb); +} diff --git a/ggml.c b/ggml.c index 911a63988e027..6d1776ca46741 100644 --- a/ggml.c +++ b/ggml.c @@ -162,40 +162,16 @@ typedef void * thread_ret_t; #define GGML_PRINT(...) printf(__VA_ARGS__) +// +// end of logging block +// + #ifdef GGML_USE_ACCELERATE // uncomment to use vDSP for soft max computation // note: not sure if it is actually faster //#define GGML_SOFT_MAX_ACCELERATE #endif -// -// logging -// - -#if (GGML_DEBUG >= 1) -#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG(...) -#endif - -#if (GGML_DEBUG >= 5) -#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_5(...) -#endif - -#if (GGML_DEBUG >= 10) -#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__) -#else -#define GGML_PRINT_DEBUG_10(...) -#endif - -#define GGML_PRINT(...) printf(__VA_ARGS__) - -// -// end of logging block -// - #if defined(_MSC_VER) || defined(__MINGW32__) #define GGML_ALIGNED_MALLOC(size) _aligned_malloc(size, GGML_MEM_ALIGN) #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr) @@ -4951,6 +4927,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( *result = (struct ggml_tensor) { /*.type =*/ type, /*.backend =*/ GGML_BACKEND_CPU, + /*.buffer =*/ NULL, /*.n_dims =*/ n_dims, /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, @@ -20203,6 +20180,10 @@ static enum ggml_opt_result ggml_opt_lbfgs( ggml_vec_cpy_f32(nx, xp, x); ggml_vec_cpy_f32(nx, gp, g); + // TODO: instead of passing &cancel here, use the return code of the linesearch + // to determine if the optimization should be cancelled + // this is a simple change, but not doing this atm, since I don't have a nice + // way to test and don't want to break something with so many changes lined up ls = linesearch_backtracking(¶ms, nx, x, &fx, g, d, step, xp, f, gb, &cplan, np, ps, &cancel, callback, callback_data); if (cancel) { return GGML_OPT_CANCEL; diff --git a/ggml.h b/ggml.h index a9d4e33d9885c..3eddc44b90fdd 100644 --- a/ggml.h +++ b/ggml.h @@ -326,7 +326,7 @@ extern "C" { GGML_TYPE_COUNT, }; - enum ggml_backend { + enum ggml_backend_type { GGML_BACKEND_CPU = 0, GGML_BACKEND_GPU = 10, GGML_BACKEND_GPU_SPLIT = 20, @@ -479,8 +479,10 @@ extern "C" { // n-dimensional tensor struct ggml_tensor { - enum ggml_type type; - enum ggml_backend backend; + enum ggml_type type; + enum ggml_backend_type backend; + + struct ggml_backend_buffer * buffer; int n_dims; int64_t ne[GGML_MAX_DIMS]; // number of elements @@ -514,7 +516,7 @@ extern "C" { void * extra; // extra things e.g. for ggml-cuda.cu - char padding[4]; + char padding[12]; }; static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor); @@ -1358,7 +1360,7 @@ extern "C" { // alibi position embedding // in-place, returns view(a) - struct ggml_tensor * ggml_alibi( + GGML_API struct ggml_tensor * ggml_alibi( struct ggml_context * ctx, struct ggml_tensor * a, int n_past, @@ -1367,7 +1369,7 @@ extern "C" { // clamp // in-place, returns view(a) - struct ggml_tensor * ggml_clamp( + GGML_API struct ggml_tensor * ggml_clamp( struct ggml_context * ctx, struct ggml_tensor * a, float min, @@ -2102,7 +2104,7 @@ extern "C" { enum ggml_type vec_dot_type; } ggml_type_traits_t; - ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); + GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); #ifdef __cplusplus } diff --git a/llama.cpp b/llama.cpp index a4312ab72d613..77f7fa7c10555 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1730,7 +1730,7 @@ struct llama_model_loader { } } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend backend) { + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) { if (backend != GGML_BACKEND_CPU) { ggml_set_no_alloc(ctx, true); } @@ -1748,7 +1748,7 @@ struct llama_model_loader { return tensor; } - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend backend) { + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend) { struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); if (cur == NULL) { @@ -2299,8 +2299,8 @@ static void llm_load_tensors( // output { - ggml_backend backend_norm; - ggml_backend backend_output; + ggml_backend_type backend_norm; + ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { // norm is not performance relevant on its own but keeping it in VRAM reduces data copying @@ -2335,8 +2335,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT auto & layer = model.layers[i]; @@ -2365,8 +2365,8 @@ static void llm_load_tensors( { model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { - ggml_backend backend_norm; - ggml_backend backend_output; + ggml_backend_type backend_norm; + ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { // norm is not performance relevant on its own but keeping it in VRAM reduces data copying @@ -2401,8 +2401,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT auto & layer = model.layers[i]; @@ -2435,8 +2435,8 @@ static void llm_load_tensors( // output { - ggml_backend backend_norm; - ggml_backend backend_output; + ggml_backend_type backend_norm; + ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { // norm is not performance relevant on its own but keeping it in VRAM reduces data copying @@ -2473,8 +2473,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT auto & layer = model.layers[i]; @@ -2512,8 +2512,8 @@ static void llm_load_tensors( // output { - ggml_backend backend_norm; - ggml_backend backend_output; + ggml_backend_type backend_norm; + ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { // norm is not performance relevant on its own but keeping it in VRAM reduces data copying @@ -2550,8 +2550,8 @@ static void llm_load_tensors( model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT - const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT auto & layer = model.layers[i]; @@ -2589,8 +2589,8 @@ static void llm_load_tensors( model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); { - ggml_backend backend_norm; - ggml_backend backend_output; + ggml_backend_type backend_norm; + ggml_backend_type backend_output; if (n_gpu_layers > int(n_layer)) { // norm is not performance relevant on its own but keeping it in VRAM reduces data copying @@ -2624,8 +2624,8 @@ static void llm_load_tensors( const int i_gpu_start = n_layer - n_gpu_layers; model.layers.resize(n_layer); for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; - const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; auto & layer = model.layers[i]; layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index e44c3bd03fa93..4311268bd2d17 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -1,16 +1,18 @@ #!/bin/bash -cp -rpv ../ggml/src/ggml.c ./ggml.c -cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c -cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h -cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu -cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h -cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp -cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h -cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m -cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal -cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h -cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h +cp -rpv ../ggml/src/ggml.c ./ggml.c +cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c +cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c +cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h +cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu +cp -rpv ../ggml/src/ggml-opencl.h ./ggml-opencl.h +cp -rpv ../ggml/src/ggml-opencl.cpp ./ggml-opencl.cpp +cp -rpv ../ggml/src/ggml-metal.h ./ggml-metal.h +cp -rpv ../ggml/src/ggml-metal.m ./ggml-metal.m +cp -rpv ../ggml/src/ggml-metal.metal ./ggml-metal.metal +cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h +cp -rpv ../ggml/include/ggml/ggml-alloc.h ./ggml-alloc.h +cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp From dcc09d25961c5d0626bc148e558ee841141748f7 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 9 Oct 2023 14:28:27 +0300 Subject: [PATCH 12/26] metal : do not use mul_mm kernels when ne00 < 64 (#3542) --- ggml-metal.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-metal.m b/ggml-metal.m index 29cb3c922daeb..7d67db90f078b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1040,7 +1040,7 @@ void ggml_metal_graph_compute( !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1t == GGML_TYPE_F32 && - ne00 % 32 == 0 && + ne00 % 32 == 0 && ne00 >= 64 && ne11 > ne11_mm_min) { //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); switch (src0->type) { From fcca0a700487999d52a525c96d6661e9f6a8703a Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 9 Oct 2023 14:32:17 +0300 Subject: [PATCH 13/26] refact : fix convert script + zero out KV cache to avoid nans (#3523) * refact : fix convert script + zero out KV cache to avoid nans * ggml : silu(-inf) should never happen * metal : assert various kernel requirements --- convert-refact-hf-to-gguf.py | 71 ++++------------------------------ examples/parallel/parallel.cpp | 2 +- ggml-metal.m | 20 ++++++---- ggml-metal.metal | 18 ++++++--- ggml.c | 27 +++++++------ llama.cpp | 4 ++ 6 files changed, 51 insertions(+), 91 deletions(-) diff --git a/convert-refact-hf-to-gguf.py b/convert-refact-hf-to-gguf.py index e0cd417dbbbc4..bfeabc0825ba7 100755 --- a/convert-refact-hf-to-gguf.py +++ b/convert-refact-hf-to-gguf.py @@ -17,33 +17,6 @@ sys.path.insert(1, str(Path(__file__).parent / "gguf-py" / "gguf")) import gguf - -def bytes_to_unicode(): - # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py - """ - Returns list of utf-8 byte and a corresponding list of unicode strings. - The reversible bpe codes work on unicode strings. - This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. - When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. - This is a significant percentage of your normal, say, 32K bpe vocab. - To avoid that, we want lookup tables between utf-8 bytes and unicode strings. - And avoids mapping to whitespace/control characters the bpe code barfs on. - """ - bs = ( - list(range(ord("!"), ord("~") + 1)) - + list(range(ord("¡"), ord("¬") + 1)) - + list(range(ord("®"), ord("ÿ") + 1)) - ) - cs = bs[:] - n = 0 - for b in range(2**8): - if b not in bs: - bs.append(b) - cs.append(2**8 + n) - n += 1 - return dict(zip(bs, (chr(n) for n in cs))) - - def count_model_parts(dir_model: Path) -> int: num_parts = 0 for filename in os.listdir(dir_model): @@ -153,53 +126,25 @@ def parse_args() -> argparse.Namespace: scores: list[float] = [] toktypes: list[int] = [] -tokenizer_json_file = dir_model / "tokenizer.json" -if not tokenizer_json_file.is_file(): - print(f"Error: Missing {tokenizer_json_file}", file=sys.stderr) - sys.exit(1) - # gpt2 tokenizer gguf_writer.add_tokenizer_model("gpt2") -with open(tokenizer_json_file, "r", encoding="utf-8") as f: - tokenizer_json = json.load(f) - print("gguf: get gpt2 tokenizer vocab") +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + # The number of tokens in tokenizer.json can differ from the expected vocab size. # This causes downstream issues with mismatched tensor sizes when running the inference -vocab_size = ( - hparams["vocab_size"] - if "vocab_size" in hparams - else len(tokenizer_json["model"]["vocab"]) -) - -tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True) +vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) +assert max(tokenizer.vocab.values()) < vocab_size reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} -byte_encoder = bytes_to_unicode() -byte_decoder = {v: k for k, v in byte_encoder.items()} for i in range(vocab_size): - if i in reverse_vocab: - text = reverse_vocab[i] - try: - text = bytearray([byte_decoder[c] for c in reverse_vocab[i]]) - except KeyError: - text = bytearray() - for c in reverse_vocab[i]: - if ord(c) < 256: # single byte character - text.append(byte_decoder[ord(c)]) - else: # multibyte special token character - text.extend(c.encode("utf-8")) - else: - print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.") - pad_token = f"[PAD{i}]".encode("utf8") - text = bytearray(pad_token) - - tokens.append(text) - scores.append(0.0) # dymmy - toktypes.append(gguf.TokenType.NORMAL) # dummy + tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") + scores.append(0.0) # dummy + toktypes.append(gguf.TokenType.NORMAL) gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 721888da7de94..04f1e45b955dd 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -167,7 +167,7 @@ int main(int argc, char ** argv) { // the max batch size is as large as the context to handle cases where we get very long input prompt from multiple // users. regardless of the size, the main loop will chunk the batch into a maximum of params.n_batch tokens at a time - llama_batch batch = llama_batch_init(params.n_ctx, 0); + llama_batch batch = llama_batch_init(n_ctx, 0); int32_t n_total_prompt = 0; int32_t n_total_gen = 0; diff --git a/ggml-metal.m b/ggml-metal.m index 7d67db90f078b..5a23144d0c891 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -779,8 +779,8 @@ void ggml_metal_graph_compute( } break; case GGML_OP_CONCAT: { + const int64_t nb = ne00; - int64_t nb = ne00; [encoder setComputePipelineState:ctx->pipeline_concat]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; @@ -812,6 +812,7 @@ void ggml_metal_graph_compute( [encoder setBytes:&nb length:sizeof(nb) atIndex:27]; const int nth = MIN(1024, ne0); + [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; } break; case GGML_OP_ADD: @@ -909,9 +910,10 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - const int64_t n = ggml_nelements(dst)/4; + const int64_t n = ggml_nelements(dst); + GGML_ASSERT(n % 4 == 0); - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_UNARY: switch (ggml_get_unary_op(gf->nodes[i])) { @@ -921,9 +923,10 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(dst)/4; + const int64_t n = ggml_nelements(dst); + GGML_ASSERT(n % 4 == 0); - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_UNARY_OP_RELU: { @@ -941,9 +944,10 @@ void ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(dst)/4; + const int64_t n = ggml_nelements(dst); + GGML_ASSERT(n % 4 == 0); - [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; default: { @@ -1251,6 +1255,8 @@ void ggml_metal_graph_compute( } break; case GGML_OP_RMS_NORM: { + GGML_ASSERT(ne00 % 4 == 0); + float eps; memcpy(&eps, dst->op_params, sizeof(float)); diff --git a/ggml-metal.metal b/ggml-metal.metal index b6288db28660d..99b9fd7a748ba 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -345,10 +345,11 @@ kernel void kernel_rms_norm( uint sgitg[[simdgroup_index_in_threadgroup]], uint tiisg[[thread_index_in_simdgroup]], uint ntg[[threads_per_threadgroup]]) { - device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01); - device const float * x_scalar = (device const float *) x; - float4 sumf=0; - float all_sum=0; + device const float4 * x = (device const float4 *) ((device const char *) src0 + tgpig*nb01); + device const float * x_scalar = (device const float *) x; + + float4 sumf = 0; + float all_sum = 0; // parallel sum for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { @@ -361,6 +362,7 @@ kernel void kernel_rms_norm( } threadgroup_barrier(mem_flags::mem_threadgroup); + // broadcast, simd group number is ntg / 32 for (uint i = ntg / 32 / 2; i > 0; i /= 2) { if (tpitg < i) { @@ -368,7 +370,9 @@ kernel void kernel_rms_norm( } } if (tpitg == 0) { - for (int i = 4 * (ne00 / 4); i < ne00; i++) {sum[0] += x_scalar[i];} + for (int i = 4 * (ne00 / 4); i < ne00; i++) { + sum[0] += x_scalar[i]; + } sum[0] /= ne00; } @@ -383,7 +387,9 @@ kernel void kernel_rms_norm( y[i00] = x[i00] * scale; } if (tpitg == 0) { - for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) {y_scalar[i00] = x_scalar[i00] * scale;} + for (int i00 = 4 * (ne00 / 4); i00 < ne00; i00++) { + y_scalar[i00] = x_scalar[i00] * scale; + } } } diff --git a/ggml.c b/ggml.c index 6d1776ca46741..5bb1da31ba624 100644 --- a/ggml.c +++ b/ggml.c @@ -11233,7 +11233,7 @@ static void ggml_compute_forward_silu_f32( #ifndef NDEBUG for (int k = 0; k < nc; k++) { - const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k]; + const float x = ((float *) ((char *) dst->data + i1*(dst->nb[1])))[k]; UNUSED(x); assert(!isnan(x)); assert(!isinf(x)); @@ -13066,17 +13066,17 @@ static void ggml_compute_forward_alibi_f32( assert(n_past >= 0); - const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 - const int ne1 = src0->ne[1]; // seq_len_without_past - const int ne2 = src0->ne[2]; // n_head -> this is k - //const int ne3 = src0->ne[3]; // 1 -> bsz + const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 + const int64_t ne1 = src0->ne[1]; // seq_len_without_past + const int64_t ne2 = src0->ne[2]; // n_head -> this is k + //const int64_t ne3 = src0->ne[3]; // 1 -> bsz - const int n = ggml_nrows(src0); - const int ne2_ne3 = n/ne1; // ne2*ne3 + const int64_t n = ggml_nrows(src0); + const int64_t ne2_ne3 = n/ne1; // ne2*ne3 - const int nb0 = src0->nb[0]; - const int nb1 = src0->nb[1]; - const int nb2 = src0->nb[2]; + const size_t nb0 = src0->nb[0]; + const size_t nb1 = src0->nb[1]; + const size_t nb2 = src0->nb[2]; //const int nb3 = src0->nb[3]; GGML_ASSERT(nb0 == sizeof(float)); @@ -13088,9 +13088,9 @@ static void ggml_compute_forward_alibi_f32( const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); - for (int i = 0; i < ne0; i++) { - for (int j = 0; j < ne1; j++) { - for (int k = 0; k < ne2_ne3; k++) { + for (int64_t i = 0; i < ne0; i++) { + for (int64_t j = 0; j < ne1; j++) { + for (int64_t k = 0; k < ne2_ne3; k++) { float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2); float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2); @@ -13105,7 +13105,6 @@ static void ggml_compute_forward_alibi_f32( } pdst[0] = i * m_k + src[0]; - } } } diff --git a/llama.cpp b/llama.cpp index 77f7fa7c10555..24f07daca6181 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1325,7 +1325,11 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); + // TODO: this should be: + // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead()); + // change it and test that it works cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + memset(cache.buf.data, 0, cache.buf.size); struct ggml_init_params params; params.mem_size = cache.buf.size; From 95bd60a0a69f57e9a2ff1269667ea484a1a9bb40 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 9 Oct 2023 14:44:58 +0200 Subject: [PATCH 14/26] ggml-alloc : fix assert in debug builds (#3555) --- ggml-alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-alloc.c b/ggml-alloc.c index 3321f05e2ef94..34eba3f830e84 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -386,7 +386,7 @@ static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) { // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras - assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == alloc->buffer->backend); + assert(ggml_allocr_is_measure(alloc) || !view->buffer || view->buffer->backend == alloc->buffer->backend); ggml_backend_buffer_init_tensor(alloc->buffer, view); } From 11ea5c7d96f2c28e1c99659e08ec0a44574056e2 Mon Sep 17 00:00:00 2001 From: vvhg1 <94630311+vvhg1@users.noreply.github.com> Date: Tue, 10 Oct 2023 09:31:21 +0200 Subject: [PATCH 15/26] infill. : fix tokenization (#3508) * infill tokens correction * serverinfill tokens correction * removing any leading whitespace from infill suffix and removing leeading space token from suffix when params.escape * removing any leading whitespace from infill suffix and removing leeading space token from suffix when params.escape * only rm when params.escape, rm space if possible which is added back or rm added space token * only rm when params.escape, rm space if possible which is added back or rm added space token * Revert "only rm when params.escape, rm space if possible which is added back or rm added space token" This reverts commit 63ba0b621f21077c0e3bc6ba6a327534123cb738. * fix interactive prompt escaping and fix server infill leading space handling * rm unnecessary bool check --- examples/infill/infill.cpp | 37 +++++++++++++++++++++++++++++++++---- examples/server/server.cpp | 15 +++++++++++++-- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 9ec75ce425b2a..d994de5e850c3 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -233,10 +233,22 @@ int main(int argc, char ** argv) { const bool add_bos = llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM; LOG("add_bos: %d\n", add_bos); + bool suff_rm_leading_spc = params.escape; + if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } std::vector embd_inp; - std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos); - std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos); + std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); + const int space_token = 29871; + if (suff_rm_leading_spc && inp_sfx[0] == space_token) { + inp_sfx.erase(inp_sfx.begin()); + } inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); + if (add_bos) { + inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx)); + } inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); embd_inp = inp_pfx; embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); @@ -627,10 +639,27 @@ int main(int argc, char ** argv) { buffer.clear(); // done taking input, reset color console::set_display(console::reset); + + if (params.escape) { + //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here + process_escapes(params.input_prefix); + process_escapes(params.input_suffix); + } + suff_rm_leading_spc = params.escape; + if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } // tokenize new prefix and suffix - std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, add_bos); - std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, add_bos); + std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); + if (suff_rm_leading_spc && inp_sfx[0] == space_token) { + inp_sfx.erase(inp_sfx.begin()); + } inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); + if (add_bos) { + inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx)); + } inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); embd_inp = inp_pfx; embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c53a64867336f..8c5318c650ae8 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -344,9 +344,20 @@ struct llama_server_context void loadInfill() { - auto prefix_tokens = tokenize(params.input_prefix, true); // always add BOS - auto suffix_tokens = tokenize(params.input_suffix, true); // always add BOS + bool suff_rm_leading_spc = true; + if (params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } + + auto prefix_tokens = tokenize(params.input_prefix, false); + auto suffix_tokens = tokenize(params.input_suffix, false); + const int space_token = 29871; + if (suff_rm_leading_spc && suffix_tokens[0] == space_token) { + suffix_tokens.erase(suffix_tokens.begin()); + } prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx)); + prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx)); prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); prefix_tokens.push_back(llama_token_middle(ctx)); From f5f9121de140eff558f13b5c5e78a3a3b6b94377 Mon Sep 17 00:00:00 2001 From: Jan Ploski Date: Tue, 10 Oct 2023 09:50:23 +0200 Subject: [PATCH 16/26] llm : add MPT support (#3417) * CUDA: added support for ggml_clamp (see also: https://github.com/ggerganov/ggml/issues/545) * mpt : added an implementation based (mostly) on falcon integration, modified with deltas from ggml/examples/mpt * mpt : protect against "clip_qkv": null in mpt-7b * mpt : quick fix to avoid "Strange model" warning when quantizing MPT models * mpt : addendum to changeset:84e30e8 - leave parameter clamp_kqv out from metadata rather than use 0.0 to indicate "no clamping" (more compliant with the current GGUF spec?) * mpt : standardized all tensor names to follow GGUF spec * mpt : addendum to changeset:1be89c40 - use "req" parameter of GGUF_GET_KEY macro instead of duplicate code * mpt : fixed comment s/gptneox/mpt/ * mpt : remove tabs, trailing whitespace * mpt : removed ne01 + n_past == ne00 assertion from alibi (cuda/f32) and rope_shift from build_mpt * mpt : updated convert-mpt-hf-to-gguf.py to reflect changes made to convert-gptneox-hf-to-gguf.py in pr:3252 * comment out n_past instead of marking it unused * mpt : removed hardcoded +178 from convert script in favor of utilizing hparams["vocab_size"] * mpt : remove unused tokenizer_json in convert script * ggml : remove obsolete n_past assert in ggml_alibi * llama : print clam_kqv and max_alibi_bias hparams --------- Co-authored-by: Cebtenzzre Co-authored-by: Georgi Gerganov --- convert-mpt-hf-to-gguf.py | 216 +++++++++++++++++++ ggml-cuda.cu | 47 ++++- ggml-metal.m | 2 +- ggml.c | 4 +- llama.cpp | 425 +++++++++++++++++++++++++++++++++++++- 5 files changed, 685 insertions(+), 9 deletions(-) create mode 100755 convert-mpt-hf-to-gguf.py diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py new file mode 100755 index 0000000000000..73a4932f7c831 --- /dev/null +++ b/convert-mpt-hf-to-gguf.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +# HF mpt--> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import struct +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import torch +from transformers import AutoTokenizer # type: ignore[import] + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert an MPT model to a GGML compatible file") + parser.add_argument( + "--vocab-only", action="store_true", + help="extract only the vocab", + ) + parser.add_argument( + "--outfile", type=Path, + help="path to write to; default: based on input", + ) + parser.add_argument( + "model", type=Path, + help="directory containing model file, or model file itself (*.bin)", + ) + parser.add_argument( + "ftype", type=int, choices=[0, 1], default=1, nargs='?', + help="output format - use 0 for float32, 1 for float16", + ) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "MPTForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + + sys.exit() + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH=gguf.MODEL_ARCH.MPT +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["n_layers"] + +gguf_writer.add_name(dir_model.name) +gguf_writer.add_context_length(hparams["max_seq_len"]) +gguf_writer.add_embedding_length(hparams["d_model"]) +gguf_writer.add_block_count(block_count) +gguf_writer.add_feed_forward_length(4 * hparams["d_model"]) +gguf_writer.add_head_count(hparams["n_heads"]) +gguf_writer.add_layer_norm_eps(1e-05) +if hparams["attn_config"]["clip_qkv"] is not None: + gguf_writer.add_clamp_kqv(hparams["attn_config"]["clip_qkv"]) +gguf_writer.add_max_alibi_bias(hparams["attn_config"]["alibi_bias_max"]) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +print("gguf: get gpt2 tokenizer vocab") + +# MPT token embedding tensors have dimension 50432 (hparams["vocab_size"]), but +# there are only 50254 (len(tokenizer.vocab)) tokens in the vocab, presumably to +# accomodate some "reserved" tokens; this is causing problems down the line in +# llama.cpp, so we pad the vocab with dummy tokens: + +vocab_size = hparams["vocab_size"] + +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + +for i in range(vocab_size): + tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") + scores.append(0.0) # dummy + toktypes.append(gguf.TokenType.NORMAL) + +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_scores(scores) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model, load_merges = True) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH,block_count) + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + model_part = torch.load(f"{dir_model}/{part_name}", map_location="cpu") + + for name in model_part.keys(): + data = model_part[name] + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) + if new_name is None: + print("Cannot map tensor '" + name + "'") + continue # for the sake of compatibility with some old published models, don't quit + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + + # note: MPT output is tied to (same as) wte in original model; + # for easier implementation in llama.cpp it's duplicated in GGUF, though :/ + if new_name == "token_embd.weight": + gguf_writer.add_tensor("output.weight", data) + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 7e92c519741b9..654d3632fc179 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -415,6 +415,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define CUDA_SILU_BLOCK_SIZE 256 #define CUDA_CPY_BLOCK_SIZE 32 #define CUDA_SCALE_BLOCK_SIZE 256 +#define CUDA_CLAMP_BLOCK_SIZE 256 #define CUDA_ROPE_BLOCK_SIZE 256 #define CUDA_ALIBI_BLOCK_SIZE 32 #define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 @@ -4585,6 +4586,15 @@ static __global__ void scale_f32(const float * x, float * dst, const float scale dst[i] = scale * x[i]; } +static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); +} template static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) { @@ -5475,6 +5485,11 @@ static void scale_f32_cuda(const float * x, float * dst, const float scale, cons scale_f32<<>>(x, dst, scale, k); } +static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; + clamp_f32<<>>(x, dst, min, max, k); +} + template static void rope_cuda(const T * x, T * dst, const int ncols, const int nrows, const int32_t * pos, const float freq_scale, const int p_delta_rows, const float theta_scale, cudaStream_t stream) { @@ -6419,12 +6434,12 @@ inline void ggml_cuda_op_alibi( const int64_t ne02 = src0->ne[2]; const int64_t nrows = ggml_nrows(src0); - const int n_past = ((int32_t *) dst->op_params)[0]; + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_head = ((int32_t *) dst->op_params)[1]; float max_bias; memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); - GGML_ASSERT(ne01 + n_past == ne00); + //GGML_ASSERT(ne01 + n_past == ne00); GGML_ASSERT(n_head == ne02); const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); @@ -6500,6 +6515,24 @@ inline void ggml_cuda_op_scale( (void) src1_dd; } +inline void ggml_cuda_op_clamp( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const float min = ((float *) dst->op_params)[0]; + const float max = ((float *) dst->op_params)[1]; + + clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream); + CUDA_CHECK(cudaGetLastError()); + + (void) src1; + (void) dst; + (void) src1_dd; +} + static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) { const int64_t nrows0 = ggml_nrows(src0); @@ -7061,6 +7094,10 @@ static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale); } +static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp); +} + static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); @@ -7470,6 +7507,12 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_ case GGML_OP_SCALE: func = ggml_cuda_scale; break; + case GGML_OP_CLAMP: + if (!any_on_device) { + return false; + } + func = ggml_cuda_clamp; + break; case GGML_OP_CPY: func = ggml_cuda_cpy; break; diff --git a/ggml-metal.m b/ggml-metal.m index 5a23144d0c891..87fa172161405 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -1299,7 +1299,7 @@ void ggml_metal_graph_compute( const int nth = MIN(1024, ne00); - const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_head = ((int32_t *) dst->op_params)[1]; float max_bias; memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); diff --git a/ggml.c b/ggml.c index 5bb1da31ba624..1f5598fa6af8f 100644 --- a/ggml.c +++ b/ggml.c @@ -13059,13 +13059,11 @@ static void ggml_compute_forward_alibi_f32( return; } - const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); + //const int n_past = ((int32_t *) dst->op_params)[0]; const int n_head = ((int32_t *) dst->op_params)[1]; float max_bias; memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); - assert(n_past >= 0); - const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1 const int64_t ne1 = src0->ne[1]; // seq_len_without_past const int64_t ne2 = src0->ne[2]; // n_head -> this is k diff --git a/llama.cpp b/llama.cpp index 24f07daca6181..3b63b64010b0f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -424,6 +424,14 @@ static std::map> LLM_TENSOR_NAMES = LLM_ARCH_MPT, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, { @@ -1011,6 +1019,9 @@ struct llama_hparams { float rope_freq_base_train; float rope_freq_scale_train; + float f_clamp_kqv; + float f_max_alibi_bias; + bool operator!=(const llama_hparams & other) const { if (this->vocab_only != other.vocab_only) return true; if (this->n_vocab != other.n_vocab) return true; @@ -2060,6 +2071,20 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_MPT: + { + hparams.f_clamp_kqv = 0.0f; + + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + GGUF_GET_KEY(ctx, hparams.f_clamp_kqv, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV)); + GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS)); + + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 48: model.type = e_model::MODEL_30B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -2204,6 +2229,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: n_gqa = %u\n", __func__, hparams.n_gqa()); LLAMA_LOG_INFO("%s: f_norm_eps = %.1e\n", __func__, hparams.f_norm_eps); LLAMA_LOG_INFO("%s: f_norm_rms_eps = %.1e\n", __func__, hparams.f_norm_rms_eps); + LLAMA_LOG_INFO("%s: f_clamp_kqv = %.1e\n", __func__, hparams.f_clamp_kqv); + LLAMA_LOG_INFO("%s: f_max_alibi_bias = %.1e\n", __func__, hparams.f_max_alibi_bias); LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff); LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train); LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); @@ -2649,6 +2676,73 @@ static void llm_load_tensors( layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); } } break; + case LLM_ARCH_MPT: + { + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, 3*n_embd}, backend_split); + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + + ggml_nbytes(layer.wqkv) + + ggml_nbytes(layer.wo) + + ggml_nbytes(layer.ffn_norm) + + ggml_nbytes(layer.w2) + + ggml_nbytes(layer.w3); + } + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -4505,7 +4599,6 @@ static struct ggml_cgraph * llm_build_starcoder( return gf; } - static struct ggml_cgraph * llm_build_persimmon( llama_context & lctx, const llama_batch & batch) { @@ -4903,6 +4996,326 @@ static struct ggml_cgraph * llm_build_persimmon( return gf; } +static struct ggml_cgraph * llm_build_mpt( + llama_context & lctx, + const llama_batch & batch) { + const auto & model = lctx.model; + const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; // == n_head for MPT, as there's no MQA/GQA + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + const float norm_eps = hparams.f_norm_eps; + const float clamp_kqv = hparams.f_clamp_kqv; + const float max_alibi_bias = hparams.f_max_alibi_bias; + + const int n_gpu_layers = model.n_gpu_layers; + + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; + const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + + //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n", + // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift); + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, + }; + + params.no_alloc = true; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + //int warmup = 0; + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); + //warmup = ((uint32_t*) inp_tokens->data)[0] == 0; + } + + ggml_set_name(inp_tokens, "inp_tokens"); + + inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + + ggml_allocr_alloc(lctx.alloc, inpL); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inpL->data, batch.embd, n_tokens * n_embd * ggml_element_size(inpL)); + } + } + + const int i_gpu_start = n_layer - n_gpu_layers; + (void) i_gpu_start; + + // offload functions set the tensor output backend to GPU + // tensors are GPU-accelerated if any input or the output has been offloaded + offload_func_t offload_func_nr = llama_nop; // nr = non-repeating + offload_func_t offload_func_kq = llama_nop; + offload_func_t offload_func_v = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (n_gpu_layers > n_layer) { + offload_func_nr = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 1) { + offload_func_v = ggml_cuda_assign_buffers_no_alloc; + } + if (n_gpu_layers > n_layer + 2) { + offload_func_kq = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + offload_func_kq(KQ_mask); + ggml_set_name(KQ_mask, "KQ_mask"); + ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * attn_norm; + + offload_func_t offload_func = llama_nop; + +#ifdef GGML_USE_CUBLAS + if (il >= i_gpu_start) { + offload_func = ggml_cuda_assign_buffers_no_alloc; + } +#endif // GGML_USE_CUBLAS + + // self-attention + // TODO: refactor into common function (shared with LLaMA) + { + attn_norm = ggml_norm(ctx0, inpL, norm_eps); + offload_func(attn_norm); + + attn_norm = ggml_mul(ctx0, attn_norm, model.layers[il].attn_norm); + offload_func(attn_norm); + + if (1) { + cur = attn_norm; + } + + // compute QKV + + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + offload_func_kq(cur); + + if (clamp_kqv > 0.0f) { + cur = ggml_clamp(ctx0, cur, -clamp_kqv, clamp_kqv); + offload_func_kq(cur); + } + + const size_t wsize = ggml_type_size(cur->type); + + struct ggml_tensor * Qcur = ggml_view_3d( + ctx0, cur, n_embd_head, n_head, n_tokens, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + 0); + offload_func_kq(Qcur); + + struct ggml_tensor * Kcur = ggml_view_3d( + ctx0, cur, n_embd_head, n_head_kv, n_tokens, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * n_head); + offload_func_kq(Kcur); + + struct ggml_tensor * tmpv = ggml_view_3d( + ctx0, cur, n_embd_head, n_head_kv, n_tokens, + wsize * n_embd_head, + wsize * n_embd_head * (n_head + 2 * n_head_kv), + wsize * n_embd_head * (n_head + n_head_kv)); + offload_func_kq(Kcur); + + ggml_set_name(Qcur, "Qcur"); + ggml_set_name(Kcur, "Kcur"); + + { + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); + offload_func_v(Vcur); + offload_func_v(Vcur->src[0]->src[0]); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + offload_func_kq(k); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + offload_func_v(v); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + offload_func_kq(Q); + ggml_set_name(Q, "Q"); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_kv, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + offload_func_kq(K); + ggml_set_name(K, "K"); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + offload_func_kq(KQ); + ggml_set_name(KQ, "KQ"); + + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); + offload_func_kq(KQ_scaled); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + // TODO: replace with ggml_add() + struct ggml_tensor * KQ_scaled_alibi = + ggml_alibi(ctx0, KQ_scaled, 0, n_head, max_alibi_bias); + offload_func_kq(KQ_scaled_alibi); + ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + + struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); + offload_func_kq(KQ_masked); + ggml_set_name(KQ_masked, "KQ_masked"); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + offload_func_v(KQ_soft_max); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_kv, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + offload_func_v(V); + ggml_set_name(V, "V"); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + offload_func_v(KQV); + ggml_set_name(KQV, "KQV"); + + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + offload_func_v(KQV_merged); + ggml_set_name(KQV_merged, "KQV_merged"); + + cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); + offload_func_v(cur); + ggml_set_name(cur, "KQV_merged_contiguous"); + + cur = ggml_mul_mat(ctx0, model.layers[il].wo, cur); + offload_func(cur); + ggml_set_name(cur, "result_wo"); + } + + // Add the input + cur = ggml_add(ctx0, cur, inpL); + offload_func(cur); + + struct ggml_tensor * attn_out = cur; + + // feed forward + { + // Norm + { + cur = ggml_norm(ctx0, attn_out, norm_eps); + offload_func(cur); + + cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm); + offload_func(cur); + } + + cur = ggml_mul_mat(ctx0, model.layers[il].w3, cur); + offload_func(cur); + + cur = ggml_gelu(ctx0, cur); + offload_func(cur); + cur = ggml_mul_mat(ctx0, model.layers[il].w2, cur); + offload_func(cur); + } + + cur = ggml_add(ctx0, cur, attn_out); + offload_func(cur); + // input for next layer + inpL = cur; + } + + cur = inpL; + + // norm + { + cur = ggml_norm(ctx0, cur, norm_eps); + offload_func_nr(cur); + + cur = ggml_mul(ctx0, cur, model.output_norm); + ggml_set_name(cur, "result_norm"); + } + + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + + ggml_build_forward_expand(gf, cur); + + ggml_free(ctx0); + + return gf; +} + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch) { @@ -4935,6 +5348,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm_build_refact(lctx, batch); } break; + case LLM_ARCH_MPT: + { + result = llm_build_mpt(lctx, batch); + } break; default: GGML_ASSERT(false); } @@ -5065,7 +5482,8 @@ static int llama_decode_internal( const bool full_offload_supported = model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_BAICHUAN || model.arch == LLM_ARCH_FALCON || - model.arch == LLM_ARCH_REFACT; + model.arch == LLM_ARCH_REFACT || + model.arch == LLM_ARCH_MPT; const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3; if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) { n_threads = 1; @@ -7161,7 +7579,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const std::string name = ggml_get_name(meta); // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name.find("attn_v.weight") != std::string::npos) { + if (name.find("attn_v.weight") != std::string::npos || + name.find("attn_qkv.weight") != std::string::npos) { ++n_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { From 0aa6595ae02f97f2e5ffd74bf57a8b21ac83b272 Mon Sep 17 00:00:00 2001 From: Jhen-Jie Hong Date: Tue, 10 Oct 2023 06:31:13 -0500 Subject: [PATCH 17/26] swift : improvements and fixes (#3564) * swift : use macOS 12 as minimum requirement * swift : add missing ggml-backend.c source * swift : add -O3 -DNDEBUG unsafe flags --- Package.swift | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Package.swift b/Package.swift index 1ea414cc149fd..4ab055b19da2e 100644 --- a/Package.swift +++ b/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.3 +// swift-tools-version:5.5 import PackageDescription #if arch(arm) || arch(arm64) let platforms: [SupportedPlatform]? = [ - .macOS(.v11), + .macOS(.v12), .iOS(.v14), .watchOS(.v4), .tvOS(.v14) @@ -41,12 +41,13 @@ let package = Package( "ggml.c", "llama.cpp", "ggml-alloc.c", + "ggml-backend.c", "k_quants.c", ] + additionalSources, resources: resources, publicHeadersPath: "spm-headers", cSettings: [ - .unsafeFlags(["-Wno-shorten-64-to-32"]), + .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), .define("GGML_USE_K_QUANTS"), .define("GGML_USE_ACCELERATE") // NOTE: NEW_LAPACK will required iOS version 16.4+ From 02d2875deff28599c6c2c6e1886fab002ffe43b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xingchen=20Song=28=E5=AE=8B=E6=98=9F=E8=BE=B0=29?= Date: Tue, 10 Oct 2023 22:48:21 +0800 Subject: [PATCH 18/26] llm : add bloom models (#3553) * feat: Support bloom models * fix(bloom): fix model size --------- Co-authored-by: Georgi Gerganov --- convert-bloom-hf-to-gguf.py | 238 ++++++++++++++++++++++ gguf-py/gguf/gguf.py | 112 +++++++---- llama.cpp | 383 ++++++++++++++++++++++++++++++++++-- 3 files changed, 678 insertions(+), 55 deletions(-) create mode 100755 convert-bloom-hf-to-gguf.py diff --git a/convert-bloom-hf-to-gguf.py b/convert-bloom-hf-to-gguf.py new file mode 100755 index 0000000000000..7bfc95ec11dae --- /dev/null +++ b/convert-bloom-hf-to-gguf.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 +# HF bloom --> gguf conversion + +from __future__ import annotations + +import argparse +import json +import os +import re +import struct +import sys +from pathlib import Path +from typing import Any + +import numpy as np +import torch +from transformers import AutoTokenizer # type: ignore[import] + +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + + +def count_model_parts(dir_model: Path) -> int: + num_parts = 0 + for filename in os.listdir(dir_model): + if filename.startswith("pytorch_model-"): + num_parts += 1 + + if num_parts > 0: + print("gguf: found " + str(num_parts) + " model parts") + return num_parts + + +# Supported Models: +# https://huggingface.co/bigscience/bloom-1b7 +# https://huggingface.co/bigscience/bloom-3b +# https://huggingface.co/bigscience/bloom-7b1 +# https://huggingface.co/Langboat/bloom-1b4-zh +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Convert a Bloom model to a GGML compatible file") + parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") + parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") + parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.bin)") + parser.add_argument("ftype", type=int, help="output format - use 0 for float32, 1 for float16", choices=[0, 1], default = 1) + return parser.parse_args() + +args = parse_args() + +dir_model = args.model +ftype = args.ftype +if not dir_model.is_dir(): + print(f'Error: {args.model} is not a directory', file = sys.stderr) + sys.exit(1) + +# possible tensor data types +# ftype == 0 -> float32 +# ftype == 1 -> float16 + +# map from ftype to string +ftype_str = ["f32", "f16"] + +if args.outfile is not None: + fname_out = args.outfile +else: + # output in the same directory as the model by default + fname_out = dir_model / f'ggml-model-{ftype_str[ftype]}.gguf' + +print("gguf: loading model "+dir_model.name) + +with open(dir_model / "config.json", "r", encoding="utf-8") as f: + hparams = json.load(f) + +if hparams["architectures"][0] != "BloomForCausalLM": + print("Model architecture not supported: " + hparams["architectures"][0]) + sys.exit(1) + +# get number of model parts +num_parts = count_model_parts(dir_model) + +ARCH=gguf.MODEL_ARCH.BLOOM +gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH]) + +print("gguf: get model metadata") + +block_count = hparams["n_layer"] + +gguf_writer.add_name("Bloom") +n_embed = hparams.get("hidden_size", hparams.get("n_embed")) +n_head = hparams.get("n_head", hparams.get("num_attention_heads")) +gguf_writer.add_context_length(hparams.get("seq_length", n_embed)) +gguf_writer.add_embedding_length(n_embed) +gguf_writer.add_feed_forward_length(4 * n_embed) +gguf_writer.add_block_count(block_count) +gguf_writer.add_head_count(n_head) +gguf_writer.add_head_count_kv(n_head) +gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"]) +gguf_writer.add_file_type(ftype) + +# TOKENIZATION + +print("gguf: get tokenizer metadata") + +tokens: list[bytearray] = [] +scores: list[float] = [] +toktypes: list[int] = [] + +# gpt2 tokenizer +gguf_writer.add_tokenizer_model("gpt2") + +print("gguf: get gpt2 tokenizer vocab") + +# ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py +tokenizer = AutoTokenizer.from_pretrained(dir_model) + +# The number of tokens in tokenizer.json can differ from the expected vocab size. +# This causes downstream issues with mismatched tensor sizes when running the inference +vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) +assert max(tokenizer.vocab.values()) < vocab_size + +reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()} + +for i in range(vocab_size): + tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]") + scores.append(0.0) # dummy + toktypes.append(gguf.TokenType.NORMAL) + +gguf_writer.add_token_list(tokens) +gguf_writer.add_token_scores(scores) +gguf_writer.add_token_types(toktypes) + +special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) +special_vocab.add_to_gguf(gguf_writer) + +# TENSORS + +tensor_map = gguf.get_tensor_name_map(ARCH, block_count) + +# params for qkv transform +n_head_kv = hparams.get("n_head_kv", n_head) +head_dim = n_embed // n_head + +# tensor info +print("gguf: get tensor metadata") + +if num_parts == 0: + part_names = iter(("pytorch_model.bin",)) +else: + part_names = ( + f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1) + ) + +for part_name in part_names: + if args.vocab_only: + break + print("gguf: loading model part '" + part_name + "'") + model_part = torch.load(dir_model / part_name, map_location="cpu") + + has_lm_head = True + if "lm_head.weight" not in model_part.keys() and "output.weight" not in model_part.keys(): + has_lm_head = False + + for original_name in model_part.keys(): + data = model_part[original_name] + name = re.sub(r'transformer\.', '', original_name) + + old_dtype = data.dtype + + # convert any unsupported data types to float32 + if data.dtype != torch.float16 and data.dtype != torch.float32: + data = data.to(torch.float32) + + data = data.squeeze().numpy() + + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) + data = np.concatenate( + (qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed))), + axis=0 + ) + print("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) + data = np.concatenate( + (qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,))), + axis=0 + ) + print("re-format attention.linear_qkv.bias") + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print("Can not map tensor '" + name + "'") + sys.exit() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 + if ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert any float32 2-dim weight tensors to float16 + if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: + data = data.astype(np.float16) + + print(name, "=>", new_name + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + + gguf_writer.add_tensor(new_name, data) + + if not has_lm_head and name == "word_embeddings.weight": + gguf_writer.add_tensor("output.weight", data) + print(name, "=>", "output.weight" + ", shape = " + str(data.shape) + ", " + str(old_dtype) + " --> " + str(data.dtype)) # noqa + + +print("gguf: write header") +gguf_writer.write_header_to_file() +print("gguf: write metadata") +gguf_writer.write_kv_data_to_file() +if not args.vocab_only: + print("gguf: write tensors") + gguf_writer.write_tensors_to_file() + +gguf_writer.close() + +print(f"gguf: model successfully exported to '{fname_out}'") +print("") diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py index fb677a6ed7283..557ce7ac0173c 100644 --- a/gguf-py/gguf/gguf.py +++ b/gguf-py/gguf/gguf.py @@ -88,29 +88,31 @@ class MODEL_ARCH(IntEnum): PERSIMMON : int = auto() REFACT : int = auto() BERT : int = auto() + BLOOM : int = auto() class MODEL_TENSOR(IntEnum): - TOKEN_EMBD : int = auto() - TOKEN_TYPES : int = auto() - POS_EMBD : int = auto() - OUTPUT : int = auto() - OUTPUT_NORM : int = auto() - ROPE_FREQS : int = auto() - ATTN_Q : int = auto() - ATTN_K : int = auto() - ATTN_V : int = auto() - ATTN_QKV : int = auto() - ATTN_OUT : int = auto() - ATTN_NORM : int = auto() - ATTN_NORM_2 : int = auto() - ATTN_ROT_EMBD: int = auto() - FFN_GATE : int = auto() - FFN_DOWN : int = auto() - FFN_UP : int = auto() - FFN_NORM : int = auto() - ATTN_Q_NORM : int = auto() - ATTN_K_NORM : int = auto() + TOKEN_EMBD : int = auto() + TOKEN_EMBD_NORM : int = auto() + TOKEN_TYPES : int = auto() + POS_EMBD : int = auto() + OUTPUT : int = auto() + OUTPUT_NORM : int = auto() + ROPE_FREQS : int = auto() + ATTN_Q : int = auto() + ATTN_K : int = auto() + ATTN_V : int = auto() + ATTN_QKV : int = auto() + ATTN_OUT : int = auto() + ATTN_NORM : int = auto() + ATTN_NORM_2 : int = auto() + ATTN_ROT_EMBD : int = auto() + FFN_GATE : int = auto() + FFN_DOWN : int = auto() + FFN_UP : int = auto() + FFN_NORM : int = auto() + ATTN_Q_NORM : int = auto() + ATTN_K_NORM : int = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -125,29 +127,31 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.PERSIMMON: "persimmon", MODEL_ARCH.REFACT: "refact", MODEL_ARCH.BERT: "bert", + MODEL_ARCH.BLOOM: "bloom", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.TOKEN_TYPES: "token_types", - MODEL_TENSOR.POS_EMBD: "position_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", - MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", + MODEL_TENSOR.TOKEN_TYPES: "token_types", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", + MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -282,6 +286,18 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.BLOOM: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.GPT2: [ # TODO ], @@ -311,6 +327,7 @@ class TensorNameMap: "gpt_neox.embed_in", # gptneox "transformer.wte", # gpt2 gpt-j mpt refact "transformer.word_embeddings", # falcon + "word_embeddings", # bloom "model.embed_tokens", # llama-hf "tok_embeddings", # llama-pth "embeddings.word_embeddings", # bert @@ -322,6 +339,11 @@ class TensorNameMap: "embeddings.token_type_embeddings", # bert ), + # Normalization of token embeddings + MODEL_TENSOR.TOKEN_EMBD_NORM: ( + "word_embeddings_layernorm", # bloom + ), + # Position embeddings MODEL_TENSOR.POS_EMBD: ( "transformer.wpe", # gpt2 @@ -332,7 +354,7 @@ class TensorNameMap: MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox "lm_head", # gpt2 mpt falcon llama-hf baichuan - "output", # llama-pth + "output", # llama-pth bloom "word_embeddings_for_head", # persimmon ), @@ -344,7 +366,7 @@ class TensorNameMap: "norm", # llama-pth "embeddings.LayerNorm", # bert "transformer.norm_f", # mpt - "ln_f", # refact + "ln_f", # refact bloom "language_model.encoder.final_layernorm", # persimmon ), @@ -361,6 +383,7 @@ class TensorNameMap: "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact "transformer.blocks.{bid}.norm_1", # mpt "transformer.h.{bid}.input_layernorm", # falcon7b + "h.{bid}.input_layernorm", # bloom "transformer.h.{bid}.ln_mlp", # falcon40b "model.layers.{bid}.input_layernorm", # llama-hf "layers.{bid}.attention_norm", # llama-pth @@ -379,6 +402,7 @@ class TensorNameMap: "transformer.h.{bid}.attn.c_attn", # gpt2 "transformer.blocks.{bid}.attn.Wqkv", # mpt "transformer.h.{bid}.self_attention.query_key_value", # falcon + "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon ), @@ -412,6 +436,7 @@ class TensorNameMap: "transformer.h.{bid}.attn.c_proj", # gpt2 refact "transformer.blocks.{bid}.attn.out_proj", # mpt "transformer.h.{bid}.self_attention.dense", # falcon + "h.{bid}.self_attention.dense", # bloom "model.layers.{bid}.self_attn.o_proj", # llama-hf "layers.{bid}.attention.wo", # llama-pth "encoder.layer.{bid}.attention.output.dense", # bert @@ -429,6 +454,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_NORM: ( "gpt_neox.layers.{bid}.post_attention_layernorm", # gptneox "transformer.h.{bid}.ln_2", # gpt2 refact + "h.{bid}.post_attention_layernorm", # bloom "transformer.blocks.{bid}.norm_2", # mpt "model.layers.{bid}.post_attention_layernorm", # llama-hf "layers.{bid}.ffn_norm", # llama-pth @@ -442,6 +468,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.c_fc", # gpt2 "transformer.blocks.{bid}.ffn.up_proj", # mpt "transformer.h.{bid}.mlp.dense_h_to_4h", # falcon + "h.{bid}.mlp.dense_h_to_4h", # bloom "model.layers.{bid}.mlp.up_proj", # llama-hf refact "layers.{bid}.feed_forward.w3", # llama-pth "encoder.layer.{bid}.intermediate.dense", # bert @@ -461,6 +488,7 @@ class TensorNameMap: "transformer.h.{bid}.mlp.c_proj", # gpt2 refact "transformer.blocks.{bid}.ffn.down_proj", # mpt "transformer.h.{bid}.mlp.dense_4h_to_h", # falcon + "h.{bid}.mlp.dense_4h_to_h", # bloom "model.layers.{bid}.mlp.down_proj", # llama-hf "layers.{bid}.feed_forward.w2", # llama-pth "encoder.layer.{bid}.output.dense", # bert diff --git a/llama.cpp b/llama.cpp index 3b63b64010b0f..4653c80232c5c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -188,6 +188,7 @@ enum llm_arch { LLM_ARCH_STARCODER, LLM_ARCH_PERSIMMON, LLM_ARCH_REFACT, + LLM_ARCH_BLOOM, LLM_ARCH_UNKNOWN, }; @@ -201,7 +202,8 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_BAICHUAN, "baichuan" }, { LLM_ARCH_STARCODER, "starcoder" }, { LLM_ARCH_PERSIMMON, "persimmon" }, - { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_BLOOM, "bloom" }, }; enum llm_kv { @@ -304,6 +306,7 @@ struct LLM_KV { enum llm_tensor { LLM_TENSOR_TOKEN_EMBD, + LLM_TENSOR_TOKEN_EMBD_NORM, LLM_TENSOR_POS_EMBD, LLM_TENSOR_OUTPUT, LLM_TENSOR_OUTPUT_NORM, @@ -466,6 +469,21 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_BLOOM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1207,6 +1225,8 @@ struct llama_model { struct ggml_tensor * tok_embeddings; struct ggml_tensor * pos_embeddings; + struct ggml_tensor * tok_norm; + struct ggml_tensor * tok_norm_b; struct ggml_tensor * output_norm; struct ggml_tensor * output_norm_b; @@ -2056,13 +2076,13 @@ static void llm_load_hparams( } } break; case LLM_ARCH_PERSIMMON: - { - GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); - switch (hparams.n_layer) { - case 36: model.type = e_model::MODEL_8B; break; - default: model.type = e_model::MODEL_UNKNOWN; - } - } break; + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + switch (hparams.n_layer) { + case 36: model.type = e_model::MODEL_8B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_REFACT: { GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS)); @@ -2071,6 +2091,19 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_BLOOM: + { + GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS)); + + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; + case 30: + switch (hparams.n_embd) { + case 2560: model.type = e_model::MODEL_3B; break; + case 4096: model.type = e_model::MODEL_7B; break; + } break; + } + } break; case LLM_ARCH_MPT: { hparams.f_clamp_kqv = 0.0f; @@ -2676,6 +2709,88 @@ static void llm_load_tensors( layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); } } break; + case LLM_ARCH_BLOOM: + { + // TODO: CPU-only for now + + model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); + model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); + + // output + { + ggml_backend_type backend_norm; + ggml_backend_type backend_output; + + if (n_gpu_layers > int(n_layer)) { + // norm is not performance relevant on its own but keeping it in VRAM reduces data copying + // on Windows however this is detrimental unless everything is on the GPU +#ifndef _WIN32 + backend_norm = LLAMA_BACKEND_OFFLOAD; +#else + backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; +#endif // _WIN32 + + backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT; + } else { + backend_norm = GGML_BACKEND_CPU; + backend_output = GGML_BACKEND_CPU; + } + + model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); + model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); + model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + + if (backend_norm == GGML_BACKEND_GPU) { + vram_weights += ggml_nbytes(model.output_norm); + vram_weights += ggml_nbytes(model.output_norm_b); + } + if (backend_output == GGML_BACKEND_GPU_SPLIT) { + vram_weights += ggml_nbytes(model.output); + } + } + + const uint32_t n_ff = hparams.n_ff; + + const int i_gpu_start = n_layer - n_gpu_layers; + + model.layers.resize(n_layer); + + for (uint32_t i = 0; i < n_layer; ++i) { + const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT + const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + + layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); + layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend_split); + + layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend_split); + + layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + + layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); + layer.b2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend_split); + + layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.b3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend_split); + + if (backend == GGML_BACKEND_GPU) { + vram_weights += + ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) + + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) + + ggml_nbytes(layer.wo) + ggml_nbytes(layer.bo) + + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_norm_b) + + ggml_nbytes(layer.w3) + ggml_nbytes(layer.b3) + + ggml_nbytes(layer.w2) + ggml_nbytes(layer.b2); + } + } + } break; case LLM_ARCH_MPT: { model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); @@ -4996,6 +5111,248 @@ static struct ggml_cgraph * llm_build_persimmon( return gf; } +static struct ggml_cgraph * llm_build_bloom( + llama_context & lctx, + const llama_batch & batch) { + const auto & model = lctx.model; + const auto & hparams = model.hparams; + const auto & cparams = lctx.cparams; + + const auto & kv_self = lctx.kv_self; + + GGML_ASSERT(!!kv_self.ctx); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_rot); + + const float norm_eps = hparams.f_norm_eps; + + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; + const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; + + auto & buf_compute = lctx.buf_compute; + + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute.size, + /*.mem_buffer =*/ buf_compute.data, + /*.no_alloc =*/ false, + }; + + params.no_alloc = true; + + struct ggml_context * ctx0 = ggml_init(params); + + ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * cur; + struct ggml_tensor * token; + struct ggml_tensor * inpL; + + if (batch.token) { + struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + + ggml_allocr_alloc(lctx.alloc, inp_tokens); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(inp_tokens->data, batch.token, n_tokens*ggml_element_size(inp_tokens)); + } + ggml_set_name(inp_tokens, "inp_tokens"); + + token = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens); + } else { +#ifdef GGML_USE_MPI + GGML_ASSERT(false && "not implemented"); +#endif + + token = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + + ggml_allocr_alloc(lctx.alloc, token); + if (!ggml_allocr_is_measure(lctx.alloc)) { + memcpy(token->data, batch.embd, n_tokens * n_embd * ggml_element_size(token)); + } + } + + // KQ_scale + struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1); + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); + ggml_allocr_alloc(lctx.alloc, KQ_scale); + if (!ggml_allocr_is_measure(lctx.alloc)) { + ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head)); + } + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); + ggml_set_name(KQ_mask, "KQ_mask"); + ggml_allocr_alloc(lctx.alloc, KQ_mask); + if (!ggml_allocr_is_measure(lctx.alloc)) { + float * data = (float *) KQ_mask->data; + memset(data, 0, ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const llama_pos pos = batch.pos[j]; + const llama_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + } + + // norm + { + inpL = ggml_norm(ctx0, token, norm_eps); + inpL = ggml_add(ctx0, ggml_mul(ctx0, inpL, model.tok_norm), model.tok_norm_b); + } + + ggml_set_name(inpL, "inpL"); + + for (int il = 0; il < n_layer; ++il) { + { + // Norm + cur = ggml_norm(ctx0, inpL, norm_eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].attn_norm), model.layers[il].attn_norm_b); + } + + { + // Self Attention + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wqkv, cur), model.layers[il].bqkv); + + struct ggml_tensor * tmpq = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * tmpk = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*n_embd); + struct ggml_tensor * tmpv = ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], sizeof(float)*(n_embd + n_embd_gqa)); + + struct ggml_tensor * Qcur = tmpq; + struct ggml_tensor * Kcur = tmpk; + + // store key and value to memory + { + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_cont(ctx0, tmpv), n_embd_gqa, n_tokens)); + ggml_set_name(Vcur, "Vcur"); + + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head)); + ggml_set_name(k, "k"); + + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa, + ( n_ctx)*ggml_element_size(kv_self.v), + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd_head, n_head, n_tokens)), + 0, 2, 1, 3); + ggml_set_name(Q, "Q"); + + struct ggml_tensor * K = + ggml_view_3d(ctx0, kv_self.k, + n_embd_head, n_kv, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_gqa, + ggml_element_size(kv_self.k)*n_embd_head, + ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + ggml_set_name(K, "K"); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + ggml_set_name(KQ, "KQ"); + + // KQ_scaled = KQ / sqrt(n_embd_head) + // KQ_scaled shape [n_past + n_tokens, n_tokens, n_head, 1] + struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); + ggml_set_name(KQ_scaled, "KQ_scaled"); + + struct ggml_tensor * KQ_scaled_alibi = ggml_alibi(ctx0, KQ_scaled, /*n_past*/ kv_head, n_head, 8); + ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi"); + + // KQ_masked = mask_past(KQ_scaled) + struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled_alibi, KQ_mask); + ggml_set_name(KQ_masked, "KQ_masked"); + + // KQ = soft_max(KQ_masked) + struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked); + ggml_set_name(KQ_soft_max, "KQ_soft_max"); + + // split cached V into n_head heads + struct ggml_tensor * V = + ggml_view_3d(ctx0, kv_self.v, + n_kv, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*n_ctx, + ggml_element_size(kv_self.v)*n_ctx*n_embd_head, + ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + ggml_set_name(V, "V"); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); + ggml_set_name(KQV, "KQV"); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + ggml_set_name(KQV_merged, "KQV_merged"); + + // cur = KQV_merged.contiguous().view(n_embd, n_tokens) + cur = ggml_cont_2d(ctx0, KQV_merged, n_embd, n_tokens); + ggml_set_name(cur, "KQV_merged_contiguous"); + } + + // Projection + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wo, cur), model.layers[il].bo); + + // Add the input + cur = ggml_add(ctx0, cur, inpL); + + struct ggml_tensor * inpFF = cur; + + // FF + { + // Norm + { + cur = ggml_norm(ctx0, inpFF, norm_eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ffn_norm), model.layers[il].ffn_norm_b); + } + + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w3, cur), model.layers[il].b3); + + // GELU activation + cur = ggml_gelu(ctx0, cur); + + // Projection + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].w2, cur), model.layers[il].b2); + } + + inpL = ggml_add(ctx0, cur, inpFF); + } + + // Output Norm + { + cur = ggml_norm(ctx0, inpL, norm_eps); + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.output_norm), model.output_norm_b); + } + ggml_set_name(cur, "result_norm"); + + cur = ggml_mul_mat(ctx0, model.output, cur); + ggml_set_name(cur, "result_output"); + + ggml_build_forward_expand(gf, cur); + + ggml_free(ctx0); + + return gf; +} + static struct ggml_cgraph * llm_build_mpt( llama_context & lctx, const llama_batch & batch) { @@ -5025,9 +5382,6 @@ static struct ggml_cgraph * llm_build_mpt( const int32_t n_kv = ggml_allocr_is_measure(lctx.alloc) ? n_ctx : kv_self.n; const int32_t kv_head = ggml_allocr_is_measure(lctx.alloc) ? n_ctx - n_tokens : kv_self.head; - //printf("kv_head = %d, n_kv = %d, n_tokens = %d, n_ctx = %d, is_measure = %d, has_shift = %d\n", - // kv_head, n_kv, n_tokens, n_ctx, ggml_allocr_is_measure(lctx.alloc), kv_self.has_shift); - auto & buf_compute = lctx.buf_compute; struct ggml_init_params params = { @@ -5348,6 +5702,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm_build_refact(lctx, batch); } break; + case LLM_ARCH_BLOOM: + { + result = llm_build_bloom(lctx, batch); + } break; case LLM_ARCH_MPT: { result = llm_build_mpt(lctx, batch); @@ -7579,8 +7937,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const std::string name = ggml_get_name(meta); // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name.find("attn_v.weight") != std::string::npos || - name.find("attn_qkv.weight") != std::string::npos) { + if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { ++n_attention_wv; } else if (name.find("ffn_down.weight") != std::string::npos) { From c5b49360d0d9e49f32e05a9116e90bd0b39a282d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xingchen=20Song=28=E5=AE=8B=E6=98=9F=E8=BE=B0=29?= Date: Wed, 11 Oct 2023 00:28:50 +0800 Subject: [PATCH 19/26] readme : add bloom (#3570) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0562795620e69..5c2b57cb4b83b 100644 --- a/README.md +++ b/README.md @@ -96,6 +96,7 @@ as the main playground for developing new features for the [ggml](https://github - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) +- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) **Bindings:** From 233fc1c69f6f415f35363e18a755f9610e89161b Mon Sep 17 00:00:00 2001 From: goerch Date: Tue, 10 Oct 2023 18:59:52 +0200 Subject: [PATCH 20/26] Minor improvements in GPT2 tokenizer (#3567) * Fixing minor bugs in bpe_gpt2_preprocess * Don't add bos token in test --- llama.cpp | 9 ++++----- tests/test-tokenizer-0-falcon.cpp | 8 ++++---- tests/test-tokenizer-0-falcon.py | 9 +++++---- tests/test-tokenizer-0-llama.cpp | 4 +--- tests/test-tokenizer-0-llama.py | 7 +++---- 5 files changed, 17 insertions(+), 20 deletions(-) diff --git a/llama.cpp b/llama.cpp index 4653c80232c5c..7ed8722376d2d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -6342,7 +6342,6 @@ struct llm_tokenizer_bpe { for (int i = 0; i < (int)text_utf.size(); i++) { const std::string & utf_char = text_utf[i]; bool split_condition = false; - // const char* text_pos = raw_text_p + utf_char.seq_offset_bytes; int bytes_remain = text_utf.size() - i; // forward backward lookups const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; @@ -6368,9 +6367,9 @@ struct llm_tokenizer_bpe { if (!split_condition && bytes_remain >= 3) { // 're|'ve|'ll if (utf_char == "\'" && ( - (utf_char_next == "r" || utf_char_next_next == "e") || - (utf_char_next == "v" || utf_char_next_next == "e") || - (utf_char_next == "l" || utf_char_next_next == "l")) + (utf_char_next == "r" && utf_char_next_next == "e") || + (utf_char_next == "v" && utf_char_next_next == "e") || + (utf_char_next == "l" && utf_char_next_next == "l")) ) { split_condition = true; } @@ -6421,7 +6420,7 @@ struct llm_tokenizer_bpe { else if (collecting_special && (codepoint_type(utf_char) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char) == CODEPOINT_TYPE_DIGIT || codepoint_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { split_condition = true; } - else if (collecting_whitespace_lookahead && codepoint_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) { + else if (collecting_whitespace_lookahead && (codepoint_type(utf_char_next) == CODEPOINT_TYPE_LETTER || codepoint_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { split_condition = true; } } diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp index 0f3c50bce8ae9..a4e9d2b912728 100644 --- a/tests/test-tokenizer-0-falcon.cpp +++ b/tests/test-tokenizer-0-falcon.cpp @@ -36,6 +36,8 @@ static const std::map> & k_tests() { { " Hello" , { 258, 23090, }, }, { " Hello" , { 466, 23090, }, }, { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, + { "\n =" , { 1212, 40, }, }, + { "' era" , { 18, 4932, }, }, }; return _k_tests; @@ -155,7 +157,7 @@ int main(int argc, char **argv) { fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); - const std::vector res = llama_tokenize(ctx, text, true); + const std::vector res = llama_tokenize(ctx, text, false); fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); @@ -169,10 +171,8 @@ int main(int argc, char **argv) { } for (const auto & tok : res) { - ofs << tok << " "; + ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector{tok}) << "'" << std::endl; } - - ofs << "\n"; } fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py index 9c8c1c7d1d3ca..cf65a3f65d72c 100644 --- a/tests/test-tokenizer-0-falcon.py +++ b/tests/test-tokenizer-0-falcon.py @@ -41,6 +41,8 @@ " Hello", " Hello", " Hello\n Hello", + "\n =", + "' era", ] for text in tests: @@ -69,15 +71,14 @@ if fname_tok: print('tokenizing file: ', fname_tok) fname_out = fname_tok + '.tok' - with open(fname_tok, 'r') as f: + with open(fname_tok, 'r', encoding='utf-8') as f: lines = f.readlines() s = ''.join(lines) res = tokenizer.encode(s) # write to file - with open(fname_out, 'w') as f: + with open(fname_out, 'w', encoding='utf-8') as f: for x in res: - f.write(str(x) + ' ') - f.write('\n') + f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') print('len(res): ', len(res)) print('len(lines): ', len(lines)) print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp index 91c841f7bba8f..39c8d188c9086 100644 --- a/tests/test-tokenizer-0-llama.cpp +++ b/tests/test-tokenizer-0-llama.cpp @@ -174,10 +174,8 @@ int main(int argc, char **argv) { } for (const auto & tok : res) { - ofs << tok << " "; + ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector{tok}) << "'" << std::endl; } - - ofs << "\n"; } fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py index bc164ee296cb1..078f680b165ca 100644 --- a/tests/test-tokenizer-0-llama.py +++ b/tests/test-tokenizer-0-llama.py @@ -81,15 +81,14 @@ if fname_tok: print('tokenizing file: ', fname_tok) fname_out = fname_tok + '.tok' - with open(fname_tok, 'r') as f: + with open(fname_tok, 'r', encoding='utf-8') as f: lines = f.readlines() s = ''.join(lines) res = tokenizer.encode(s, add_bos=True) # write to file - with open(fname_out, 'w') as f: + with open(fname_out, 'w', encoding='utf-8') as f: for x in res: - f.write(str(x) + ' ') - f.write('\n') + f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') print('len(res): ', len(res)) print('len(lines): ', len(lines)) print('results written to: ', fname_out) From 9f6ede19f3cfa50d4a51a5babb056c3f8a450b80 Mon Sep 17 00:00:00 2001 From: Galunid Date: Wed, 11 Oct 2023 01:02:49 +0200 Subject: [PATCH 21/26] Add MPT model to supported models in README.md (#3574) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5c2b57cb4b83b..0f1fd75656926 100644 --- a/README.md +++ b/README.md @@ -97,6 +97,7 @@ as the main playground for developing new features for the [ggml](https://github - [X] [Mistral AI v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) +- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) **Bindings:** From 24ba3d829e31a6eda3fa1723f692608c2fa3adda Mon Sep 17 00:00:00 2001 From: Zane Shannon Date: Wed, 11 Oct 2023 04:14:05 -0700 Subject: [PATCH 22/26] examples : add batched.swift + improve CI for swift (#3562) --- .github/workflows/build.yml | 5 + Makefile | 7 +- examples/batched.swift/.gitignore | 9 + examples/batched.swift/Makefile | 6 + examples/batched.swift/Package.swift | 22 ++ examples/batched.swift/README.md | 4 + examples/batched.swift/Sources/main.swift | 255 ++++++++++++++++++++++ 7 files changed, 307 insertions(+), 1 deletion(-) create mode 100644 examples/batched.swift/.gitignore create mode 100755 examples/batched.swift/Makefile create mode 100644 examples/batched.swift/Package.swift create mode 100644 examples/batched.swift/README.md create mode 100644 examples/batched.swift/Sources/main.swift diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e41be76db0820..5af497a3ce321 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -276,6 +276,11 @@ jobs: run: | xcodebuild -scheme llama -destination "${{ matrix.destination }}" + - name: Build Swift Example + id: make_build_swift_example + run: | + make swift + windows-latest-cmake: runs-on: windows-latest diff --git a/Makefile b/Makefile index 40187c4a25e62..87e7bb604c0c8 100644 --- a/Makefile +++ b/Makefile @@ -617,6 +617,11 @@ metal: examples/metal/metal.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) endif +ifeq ($(UNAME_S),Darwin) +swift: examples/batched.swift + (cd examples/batched.swift; make build) +endif + build-info.h: $(wildcard .git/index) scripts/build-info.sh @sh scripts/build-info.sh $(CC) > $@.tmp @if ! cmp -s $@.tmp $@; then \ @@ -637,7 +642,7 @@ benchmark-matmult: examples/benchmark/benchmark-matmult.cpp build-info.h ggml.o run-benchmark-matmult: benchmark-matmult ./$@ -.PHONY: run-benchmark-matmult +.PHONY: run-benchmark-matmult swift vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) diff --git a/examples/batched.swift/.gitignore b/examples/batched.swift/.gitignore new file mode 100644 index 0000000000000..e1e863bec6d5d --- /dev/null +++ b/examples/batched.swift/.gitignore @@ -0,0 +1,9 @@ +.DS_Store +/.build +/Packages +xcuserdata/ +DerivedData/ +.swiftpm/configuration/registries.json +.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata +.netrc +batched_swift diff --git a/examples/batched.swift/Makefile b/examples/batched.swift/Makefile new file mode 100755 index 0000000000000..2afb24fb85a1a --- /dev/null +++ b/examples/batched.swift/Makefile @@ -0,0 +1,6 @@ +.PHONY: build + +build: + xcodebuild -scheme batched_swift -destination "generic/platform=macOS" -derivedDataPath build + rm -f ./batched_swift + ln -s ./build/Build/Products/Debug/batched_swift ./batched_swift diff --git a/examples/batched.swift/Package.swift b/examples/batched.swift/Package.swift new file mode 100644 index 0000000000000..826491defd863 --- /dev/null +++ b/examples/batched.swift/Package.swift @@ -0,0 +1,22 @@ +// swift-tools-version: 5.5 +// The swift-tools-version declares the minimum version of Swift required to build this package. + +import PackageDescription + +let package = Package( + name: "batched_swift", + platforms: [.macOS(.v12)], + dependencies: [ + .package(name: "llama", path: "../../"), + ], + targets: [ + // Targets are the basic building blocks of a package, defining a module or a test suite. + // Targets can depend on other targets in this package and products from dependencies. + .executableTarget( + name: "batched_swift", + dependencies: ["llama"], + path: "Sources", + linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")] + ), + ] +) diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md new file mode 100644 index 0000000000000..464c9079c4660 --- /dev/null +++ b/examples/batched.swift/README.md @@ -0,0 +1,4 @@ +This is a swift clone of `examples/batched`. + +$ `make` +$ `./swift MODEL_PATH [PROMPT] [PARALLEL]` diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift new file mode 100644 index 0000000000000..938f30512ca6a --- /dev/null +++ b/examples/batched.swift/Sources/main.swift @@ -0,0 +1,255 @@ +import Foundation +import llama + +let arguments = CommandLine.arguments + +// Check that we have at least one argument (the model path) +guard arguments.count > 1 else { + print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]") + exit(1) +} + +let modelPath: String = arguments[1] +let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is" +let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1 + +// total length of the sequences including the prompt +let n_len: Int = 32 + +// init LLM +llama_backend_init(false) +defer { + llama_backend_free() +} + +let model_params = llama_model_default_params() +guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else { + print("Failed to load model") + exit(1) +} + +defer { + llama_free_model(model) +} + +var tokens = tokenize(text: prompt, add_bos: true) + +let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel) + +var context_params = llama_context_default_params() +context_params.seed = 1234 +context_params.n_ctx = n_kv_req +context_params.n_batch = UInt32(max(n_len, n_parallel)) +context_params.n_threads = 8 +context_params.n_threads_batch = 8 + +let context = llama_new_context_with_model(model, context_params) +guard context != nil else { + print("Failed to initialize context") + exit(1) +} + +defer { + llama_free(context) +} + +let n_ctx = llama_n_ctx(context) + +print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n") + +if n_kv_req > n_ctx { + print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req) + exit(1) +} + +var buffer: [CChar] = [] +for id: llama_token in tokens { + print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "") +} + +print("\n") + +var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0) +defer { + llama_batch_free(batch) +} + +// evaluate the initial prompt +batch.n_tokens = Int32(tokens.count) + +for (i, token) in tokens.enumerated() { + batch.token[i] = token + batch.pos[i] = Int32(i) + batch.seq_id[i] = 0 + batch.logits[i] = 0 +} + +// llama_decode will output logits only for the last token of the prompt +batch.logits[Int(batch.n_tokens) - 1] = 1 + +if llama_decode(context, batch) != 0 { + print("llama_decode() failed") + exit(1) +} + +for i in 1 ..< n_parallel { + llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens) +} + +if n_parallel > 1 { + print("generating \(n_parallel) sequences ...\n") +} + +var streams: [String] = .init(repeating: "", count: n_parallel) +var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel) +var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel) + +var n_cur = batch.n_tokens +var n_decode = 0 + +let t_main_start = ggml_time_us() + +while n_cur <= n_len { + // prepare the next batch + batch.n_tokens = 0 + + // sample the next token for each parallel sequence / stream + for i in 0 ..< n_parallel { + if i_batch[i] < 0 { + // the stream has already finished + continue + } + + var n_vocab = llama_n_vocab(model) + var logits = llama_get_logits_ith(context, i_batch[i]) + + var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab)) + + for token_id in 0 ..< n_vocab { + candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0)) + } + + var candidates_p: llama_token_data_array = .init( + data: &candidates, + size: candidates.count, + sorted: false + ) + + let top_k: Int32 = 40 + let top_p: Float = 0.9 + let temp: Float = 0.4 + + llama_sample_top_k(context, &candidates_p, top_k, 1) + llama_sample_top_p(context, &candidates_p, top_p, 1) + llama_sample_temp(context, &candidates_p, temp) + + let new_token_id = llama_sample_token(context, &candidates_p) + + // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); + + // is it an end of stream? -> mark the stream as finished + if new_token_id == llama_token_eos(context) || n_cur == n_len { + i_batch[i] = -1 + // print("") + if n_parallel > 1 { + print("stream \(i) finished at n_cur = \(n_cur)") + } + + continue + } + + let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? "" + + // if there is only one stream, we print immediately to stdout + if n_parallel == 1 { + print(nextStringPiece, terminator: "") + } + streams[i] += nextStringPiece + + // push this new token for next evaluation + batch.token[Int(batch.n_tokens)] = new_token_id + batch.pos[Int(batch.n_tokens)] = n_cur + batch.seq_id[Int(batch.n_tokens)] = Int32(i) + batch.logits[Int(batch.n_tokens)] = 1 + + i_batch[i] = batch.n_tokens + + batch.n_tokens += 1 + + n_decode += 1 + } + + // all streams are finished + if batch.n_tokens == 0 { + break + } + + n_cur += 1 + + // evaluate the current batch with the transformer model + if llama_decode(context, batch) != 0 { + print("llama_decode() failed") + exit(1) + } +} + +if n_parallel > 1 { + print("\n") + for (i, stream) in streams.enumerated() { + print("sequence \(i):\n\n\(prompt)\(stream)\n") + } +} + +let t_main_end = ggml_time_us() + +print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n") + +llama_print_timings(context) + +private func tokenize(text: String, add_bos: Bool) -> [llama_token] { + let n_tokens = text.count + (add_bos ? 1 : 0) + let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) + let tokenCount = llama_tokenize(model, text, Int32(text.count), tokens, Int32(n_tokens), add_bos) + var swiftTokens: [llama_token] = [] + for i in 0 ..< tokenCount { + swiftTokens.append(tokens[Int(i)]) + } + tokens.deallocate() + return swiftTokens +} + +private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? { + var result = [CChar](repeating: 0, count: 8) + let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count)) + if nTokens < 0 { + if result.count >= -Int(nTokens) { + result.removeLast(-Int(nTokens)) + } else { + result.removeAll() + } + let check = llama_token_to_piece( + model, + token, + &result, + Int32(result.count) + ) + assert(check == nTokens) + } else { + result.removeLast(result.count - Int(nTokens)) + } + if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) { + return utfString + } else { + buffer.append(contentsOf: result) + let data = Data(buffer.map { UInt8(bitPattern: $0) }) + if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer + buffer = [] + } + guard let bufferString = String(data: data, encoding: .utf8) else { + return nil + } + buffer = [] + return bufferString + } + return nil +} From 8c70a5ff25964f0a81e20d142a2f5ac5baff22fc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Oct 2023 21:25:33 +0300 Subject: [PATCH 23/26] batched : add bench tool (#3545) * batched : add bench tool * batched : minor fix table * batched-bench : add readme + n_kv_max is now configurable * batched-bench : init warm-up batch * batched-bench : pass custom set of PP, TG and PL * batched-bench : add mmq CLI arg --- .gitignore | 1 + Makefile | 13 +- examples/CMakeLists.txt | 1 + examples/batched-bench/CMakeLists.txt | 5 + examples/batched-bench/README.md | 51 +++++ examples/batched-bench/batched-bench.cpp | 251 +++++++++++++++++++++++ examples/batched/batched.cpp | 2 +- 7 files changed, 321 insertions(+), 3 deletions(-) create mode 100644 examples/batched-bench/CMakeLists.txt create mode 100644 examples/batched-bench/README.md create mode 100644 examples/batched-bench/batched-bench.cpp diff --git a/.gitignore b/.gitignore index 420e0d6d016a2..d288e66fcc3d4 100644 --- a/.gitignore +++ b/.gitignore @@ -55,6 +55,7 @@ models-mnt /server /simple /batched +/batched-bench /export-lora /finetune /speculative diff --git a/Makefile b/Makefile index 87e7bb604c0c8..571ad3bbec1fe 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,14 @@ # Define the default target now so that it is always the first target -BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o +BUILD_TARGETS = \ + main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ + simple batched batched-bench save-load-state server embd-input-test gguf llama-bench baby-llama beam-search \ + speculative infill benchmark-matmult parallel finetune export-lora tests/test-c.o # Binaries only useful for tests -TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe +TEST_TARGETS = \ + tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ + tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ + tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -557,6 +563,9 @@ simple: examples/simple/simple.cpp build-info.h ggml. batched: examples/batched/batched.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + quantize: examples/quantize/quantize.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index de4cf7a691768..ab84593703caa 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -25,6 +25,7 @@ else() add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(simple) add_subdirectory(batched) + add_subdirectory(batched-bench) add_subdirectory(speculative) add_subdirectory(parallel) add_subdirectory(embd-input) diff --git a/examples/batched-bench/CMakeLists.txt b/examples/batched-bench/CMakeLists.txt new file mode 100644 index 0000000000000..40a032c514d11 --- /dev/null +++ b/examples/batched-bench/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET batched-bench) +add_executable(${TARGET} batched-bench.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md new file mode 100644 index 0000000000000..34b343f66d6b9 --- /dev/null +++ b/examples/batched-bench/README.md @@ -0,0 +1,51 @@ +# llama.cpp/example/batched-bench + +Benchmark the batched decoding performance of `llama.cpp` + +## Usage + +There are 2 modes of operation: + +- `prompt not shared` - each batch has a separate prompt of size `PP` (i.e. `N_KV = B*(PP + TG)`) +- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`) + +```bash +./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] + +# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared +./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99 + +# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared +./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99 + +# custom set of batches +./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32 +``` + +## Sample results + +- `PP` - prompt tokens per batch +- `TG` - generated tokens per batch +- `B` - number of batches +- `N_KV` - required KV cache size +- `T_PP` - prompt processing time (i.e. time to first token) +- `S_PP` - prompt processing speed (`(B*PP)/T_PP` or `PP/T_PP`) +- `T_TG` - time to generate all batches +- `S_TG` - text generation speed (`(B*TG)/T_TG`) +- `T` - total time +- `S` - total speed (i.e. all tokens / total time) + +| PP | TG | B | N_KV | T_PP s | S_PP t/s | T_TG s | S_TG t/s | T s | S t/s | +|-------|--------|------|--------|----------|----------|----------|----------|----------|----------| +| 128 | 128 | 1 | 256 | 0.108 | 1186.64 | 3.079 | 41.57 | 3.187 | 80.32 | +| 128 | 128 | 2 | 512 | 0.198 | 1295.19 | 5.029 | 50.90 | 5.227 | 97.95 | +| 128 | 128 | 4 | 1024 | 0.373 | 1373.96 | 6.878 | 74.44 | 7.251 | 141.23 | +| 128 | 128 | 8 | 2048 | 0.751 | 1363.27 | 7.344 | 139.43 | 8.095 | 252.99 | +| 128 | 128 | 16 | 4096 | 1.570 | 1304.68 | 8.455 | 242.23 | 10.024 | 408.60 | +| 128 | 128 | 32 | 8192 | 3.408 | 1201.73 | 8.801 | 465.40 | 12.209 | 670.96 | +| 128 | 256 | 1 | 384 | 0.107 | 1196.70 | 6.329 | 40.45 | 6.436 | 59.67 | +| 128 | 256 | 2 | 768 | 0.194 | 1317.45 | 10.239 | 50.00 | 10.433 | 73.61 | +| 128 | 256 | 4 | 1536 | 0.366 | 1399.03 | 13.960 | 73.35 | 14.326 | 107.22 | +| 128 | 256 | 8 | 3072 | 0.751 | 1363.92 | 15.110 | 135.54 | 15.861 | 193.69 | +| 128 | 256 | 16 | 6144 | 1.569 | 1304.93 | 18.073 | 226.64 | 19.642 | 312.80 | +| 128 | 256 | 32 | 12288 | 3.409 | 1201.35 | 19.223 | 426.15 | 22.633 | 542.93 | diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp new file mode 100644 index 0000000000000..3e1e0716d8312 --- /dev/null +++ b/examples/batched-bench/batched-bench.cpp @@ -0,0 +1,251 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include + +// mutates the input string +static std::vector parse_list(char * p) { + std::vector ret; + + char * q = p; + + while (*p) { + if (*p == ',') { + *p = '\0'; + ret.push_back(std::atoi(q)); + q = p + 1; + } + + ++p; + } + + ret.push_back(std::atoi(q)); + + return ret; +} + +int main(int argc, char ** argv) { + gpt_params params; + + if (argc == 1 || argv[1][0] == '-') { + printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] \n" , argv[0]); + printf(" , and PL are comma-separated lists of numbers without spaces\n\n"); + printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); + return 1 ; + } + + int n_kv_max = 2048; + int is_pp_shared = 0; + int n_gpu_layers = 0; + int mmq = 0; + + std::vector n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, }; + std::vector n_tg = { 128, 256, }; + std::vector n_pl = { 1, 2, 4, 8, 16, 32, }; + //std::vector n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, }; + + if (argc >= 2) { + params.model = argv[1]; + } + + if (argc >= 3) { + n_kv_max = std::atoi(argv[2]); + } + + if (argc >= 4) { + is_pp_shared = std::atoi(argv[3]); + } + + if (argc >= 5) { + n_gpu_layers = std::atoi(argv[4]); + } + + if (argc >= 6) { + mmq = std::atoi(argv[5]); + } + + if (argc >= 7) { + n_pp = parse_list(argv[6]); + } + + if (argc >= 8) { + n_tg = parse_list(argv[7]); + } + + if (argc >= 9) { + n_pl = parse_list(argv[8]); + } + + // init LLM + + llama_backend_init(params.numa); + + // initialize the model + + llama_model_params model_params = llama_model_default_params(); + + model_params.n_gpu_layers = n_gpu_layers; + + llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); + + if (model == NULL) { + fprintf(stderr , "%s: error: unable to load model\n" , __func__); + return 1; + } + + llama_context_params ctx_params = llama_context_default_params(); + + ctx_params.seed = 1234; + ctx_params.n_ctx = n_kv_max; + ctx_params.n_batch = 512; + ctx_params.mul_mat_q = mmq; + + ctx_params.n_threads = params.n_threads; + ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + + llama_context * ctx = llama_new_context_with_model(model, ctx_params); + + if (ctx == NULL) { + fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + return 1; + } + + llama_batch batch = llama_batch_init(n_kv_max, 0); + + // decode in batches of ctx_params.n_batch tokens + auto decode_helper = [](llama_context * ctx, llama_batch & batch, int32_t n_batch) { + for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { + const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); + + llama_batch batch_view = { + n_tokens, + batch.token + i, + nullptr, + batch.pos + i, + batch.seq_id + i, + batch.logits + i, + 0, 0, 0, // unused + }; + + const int ret = llama_decode(ctx, batch_view); + if (ret != 0) { + LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret); + return false; + } + } + + return true; + }; + + // warm up + { + batch.n_tokens = 16; + + for (int i = 0; i < batch.n_tokens; ++i) { + batch.token[i] = 0; + batch.pos[i] = i; + batch.seq_id[i] = 0; + batch.logits[i] = false; + } + + if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return 1; + } + } + + LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); + LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); + + for ( int i_pp = 0; i_pp < (int) n_pp.size(); ++i_pp) { + for ( int i_tg = 0; i_tg < (int) n_tg.size(); ++i_tg) { + for (int i_pl = 0; i_pl < (int) n_pl.size(); ++i_pl) { + const int pp = n_pp[i_pp]; + const int tg = n_tg[i_tg]; + const int pl = n_pl[i_pl]; + + const int n_ctx_req = is_pp_shared ? pp + pl*tg : pl*(pp + tg); + + if (n_ctx_req > n_kv_max) { + continue; + } + + batch.n_tokens = is_pp_shared ? pp : pl*pp; + + for (int i = 0; i < batch.n_tokens; ++i) { + batch.token[i] = 0; + batch.pos[i] = i; + batch.seq_id[i] = 0; + batch.logits[i] = false; + } + batch.logits[batch.n_tokens - 1] = true; + + const auto t_pp_start = ggml_time_us(); + + llama_kv_cache_tokens_rm(ctx, -1, -1); + + if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return 1; + } + + if (is_pp_shared) { + for (int32_t i = 1; i < pl; ++i) { + llama_kv_cache_seq_cp(ctx, 0, i, 0, pp); + } + } + + const auto t_pp_end = ggml_time_us(); + + const auto t_tg_start = ggml_time_us(); + + for (int i = 0; i < tg; ++i) { + batch.n_tokens = pl; + + for (int j = 0; j < pl; ++j) { + batch.token[j] = 0; + batch.pos[j] = pp + i; + batch.seq_id[j] = j; + batch.logits[j] = true; + } + + if (!decode_helper(ctx, batch, ctx_params.n_batch)) { + LOG_TEE("%s: llama_decode() failed\n", __func__); + return 1; + } + } + + const auto t_tg_end = ggml_time_us(); + + const int32_t n_kv = n_ctx_req; + + const float t_pp = (t_pp_end - t_pp_start) / 1000000.0f; + const float t_tg = (t_tg_end - t_tg_start) / 1000000.0f; + const float t = t_pp + t_tg; + + const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp; + const float speed_tg = pl*tg / t_tg; + const float speed = n_kv / t; + + LOG_TEE("|%6d | %6d | %4d | %6d | %8.3f | %8.2f | %8.3f | %8.2f | %8.3f | %8.2f |\n", pp, tg, pl, n_kv, t_pp, speed_pp, t_tg, speed_tg, t, speed); + } + } + } + + llama_print_timings(ctx); + + llama_batch_free(batch); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + fprintf(stderr, "\n\n"); + + return 0; +} diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 688ef221335a9..a88e022d69aec 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -66,7 +66,7 @@ int main(int argc, char ** argv) { ctx_params.seed = 1234; ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_len, n_parallel); - ctx_params.n_threads = params.n_threads; + ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; llama_context * ctx = llama_new_context_with_model(model, ctx_params); From 70c29da118cdb02bfcbd0376c32b5b2236e48e48 Mon Sep 17 00:00:00 2001 From: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> Date: Wed, 11 Oct 2023 13:35:46 -0600 Subject: [PATCH 24/26] common : fix mirostat state when using multiple sequences (#3543) * Fix mirostat state when using multiple sequences * Fix mirostat by completely refactoring sampling! * Try to fix zig build. * Export function to fetch/create default sampler states Code formatting cleanups and add some comments Silence a warning about id not being used when logging is disabled * Apply some renaming suggestions. Fix comments that were out of sync with the pull. * Use more consistant naming convention for sampling contexts --- Makefile | 86 ++++--- build.zig | 5 +- common/CMakeLists.txt | 2 + common/common.cpp | 228 +++++-------------- common/common.h | 56 +---- common/sampling.cpp | 166 ++++++++++++++ common/sampling.h | 108 +++++++++ examples/embd-input/embd-input-lib.cpp | 19 +- examples/infill/infill.cpp | 18 +- examples/main/main.cpp | 18 +- examples/parallel/parallel.cpp | 6 +- examples/save-load-state/save-load-state.cpp | 5 +- examples/server/server.cpp | 100 ++++---- examples/speculative/speculative.cpp | 12 +- 14 files changed, 495 insertions(+), 334 deletions(-) create mode 100644 common/sampling.cpp create mode 100644 common/sampling.h diff --git a/Makefile b/Makefile index 571ad3bbec1fe..705fa1effc2fa 100644 --- a/Makefile +++ b/Makefile @@ -178,6 +178,24 @@ else MK_CPPFLAGS += -DNDEBUG endif +ifdef LLAMA_SANITIZE_THREAD + MK_CFLAGS += -fsanitize=thread -g + MK_CXXFLAGS += -fsanitize=thread -g + MK_LDFLAGS += -fsanitize=thread -g +endif + +ifdef LLAMA_SANITIZE_ADDRESS + MK_CFLAGS += -fsanitize=address -fno-omit-frame-pointer -g + MK_CXXFLAGS += -fsanitize=address -fno-omit-frame-pointer -g + MK_LDFLAGS += -fsanitize=address -fno-omit-frame-pointer -g +endif + +ifdef LLAMA_SANITIZE_UNDEFINED + MK_CFLAGS += -fsanitize=undefined -g + MK_CXXFLAGS += -fsanitize=undefined -g + MK_LDFLAGS += -fsanitize=undefined -g +endif + ifdef LLAMA_SERVER_VERBOSE MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE) endif @@ -526,7 +544,13 @@ OBJS += ggml-alloc.o ggml-backend.o llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ -common.o: common/common.cpp common/common.h build-info.h common/log.h +COMMON_H_DEPS = common/common.h common/sampling.h build-info.h common/log.h +COMMON_DEPS = $(COMMON_H_DEPS) common.o sampling.o + +common.o: common/common.cpp $(COMMON_H_DEPS) + $(CXX) $(CXXFLAGS) -c $< -o $@ + +sampling.o: common/sampling.cpp $(COMMON_H_DEPS) $(CXX) $(CXXFLAGS) -c $< -o $@ console.o: common/console.cpp common/console.h @@ -548,19 +572,19 @@ clean: # Examples # -main: examples/main/main.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS) +main: examples/main/main.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) @echo @echo '==== Run ./main -h for help. ====' @echo -infill: examples/infill/infill.cpp build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS) +infill: examples/infill/infill.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) console.o grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -simple: examples/simple/simple.cpp build-info.h ggml.o llama.o common.o $(OBJS) +simple: examples/simple/simple.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -batched: examples/batched/batched.cpp build-info.h ggml.o llama.o common.o $(OBJS) +batched: examples/batched/batched.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) batched-bench: examples/batched-bench/batched-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) @@ -572,53 +596,53 @@ quantize: examples/quantize/quantize.cpp build-info.h ggml. quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.h ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o common.o $(OBJS) +perplexity: examples/perplexity/perplexity.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o common.o $(OBJS) +embedding: examples/embedding/embedding.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o common.o $(OBJS) +save-load-state: examples/save-load-state/save-load-state.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o common.o $(OBJS) +$(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-input/embd-input-lib.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) --shared $(CXXFLAGS) $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) -embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS) +embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput gguf: examples/gguf/gguf.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o common.o train.o $(OBJS) +train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o common.o $(OBJS) +llama-bench: examples/llama-bench/llama-bench.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o common.o train.o $(OBJS) +baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o common.o $(OBJS) +beam-search: examples/beam-search/beam-search.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o common.o train.o $(OBJS) +finetune: examples/finetune/finetune.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o common.o $(OBJS) +export-lora: examples/export-lora/export-lora.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) +speculative: examples/speculative/speculative.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o common.o $(OBJS) +parallel: examples/parallel/parallel.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) ifdef LLAMA_METAL @@ -659,40 +683,40 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS) q8dot: pocs/vdot/q8dot.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS) -tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o common.o grammar-parser.o $(OBJS) +tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o grammar-parser.o $(OBJS) +tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS) +tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o common.o $(OBJS) +tests/test-grad0: tests/test-grad0.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o common.o $(OBJS) +tests/test-opt: tests/test-opt.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o common.o $(OBJS) +tests/test-quantize-fns: tests/test-quantize-fns.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o common.o $(OBJS) +tests/test-quantize-perf: tests/test-quantize-perf.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o common.o $(OBJS) +tests/test-sampling: tests/test-sampling.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS) +tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS) +tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS) +tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS) +tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-c.o: tests/test-c.c llama.h diff --git a/build.zig b/build.zig index fdc5bc084eb46..0b74cee485320 100644 --- a/build.zig +++ b/build.zig @@ -128,17 +128,18 @@ pub fn build(b: *std.build.Builder) !void { const llama = make.obj("llama", "llama.cpp"); const common = make.obj("common", "common/common.cpp"); const console = make.obj("console", "common/console.cpp"); + const sampling = make.obj("sampling", "common/sampling.cpp"); const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp"); const train = make.obj("train", "common/train.cpp"); - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, console, grammar_parser }); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, console, grammar_parser }); _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common }); _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, grammar_parser }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, llama, common, sampling, grammar_parser }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 951aa8340c7e4..fbb0ff0952ac7 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -5,6 +5,8 @@ set(TARGET common) add_library(${TARGET} OBJECT common.h common.cpp + sampling.h + sampling.cpp console.h console.cpp grammar-parser.h diff --git a/common/common.cpp b/common/common.cpp index 0f55c33a713a7..4214e63afd87a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -107,6 +107,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::string arg; gpt_params default_params; const std::string arg_prefix = "--"; + llama_sampling_params & sparams = params.sampling_params; for (int i = 1; i < argc; i++) { arg = argv[i]; @@ -184,7 +185,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.top_k = std::stoi(argv[i]); + sparams.top_k = std::stoi(argv[i]); } else if (arg == "-c" || arg == "--ctx-size") { if (++i >= argc) { invalid_param = true; @@ -216,73 +217,73 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - params.top_p = std::stof(argv[i]); + sparams.top_p = std::stof(argv[i]); } else if (arg == "--temp") { if (++i >= argc) { invalid_param = true; break; } - params.temp = std::stof(argv[i]); + sparams.temp = std::stof(argv[i]); } else if (arg == "--tfs") { if (++i >= argc) { invalid_param = true; break; } - params.tfs_z = std::stof(argv[i]); + sparams.tfs_z = std::stof(argv[i]); } else if (arg == "--typical") { if (++i >= argc) { invalid_param = true; break; } - params.typical_p = std::stof(argv[i]); + sparams.typical_p = std::stof(argv[i]); } else if (arg == "--repeat-last-n") { if (++i >= argc) { invalid_param = true; break; } - params.repeat_last_n = std::stoi(argv[i]); + sparams.repeat_last_n = std::stoi(argv[i]); } else if (arg == "--repeat-penalty") { if (++i >= argc) { invalid_param = true; break; } - params.repeat_penalty = std::stof(argv[i]); + sparams.repeat_penalty = std::stof(argv[i]); } else if (arg == "--frequency-penalty") { if (++i >= argc) { invalid_param = true; break; } - params.frequency_penalty = std::stof(argv[i]); + sparams.frequency_penalty = std::stof(argv[i]); } else if (arg == "--presence-penalty") { if (++i >= argc) { invalid_param = true; break; } - params.presence_penalty = std::stof(argv[i]); + sparams.presence_penalty = std::stof(argv[i]); } else if (arg == "--mirostat") { if (++i >= argc) { invalid_param = true; break; } - params.mirostat = std::stoi(argv[i]); + sparams.mirostat = std::stoi(argv[i]); } else if (arg == "--mirostat-lr") { if (++i >= argc) { invalid_param = true; break; } - params.mirostat_eta = std::stof(argv[i]); + sparams.mirostat_eta = std::stof(argv[i]); } else if (arg == "--mirostat-ent") { if (++i >= argc) { invalid_param = true; break; } - params.mirostat_tau = std::stof(argv[i]); + sparams.mirostat_tau = std::stof(argv[i]); } else if (arg == "--cfg-negative-prompt") { if (++i >= argc) { invalid_param = true; break; } - params.cfg_negative_prompt = argv[i]; + sparams.cfg_negative_prompt = argv[i]; } else if (arg == "--cfg-negative-prompt-file") { if (++i >= argc) { invalid_param = true; @@ -294,16 +295,16 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } - std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(params.cfg_negative_prompt)); - if (!params.cfg_negative_prompt.empty() && params.cfg_negative_prompt.back() == '\n') { - params.cfg_negative_prompt.pop_back(); + std::copy(std::istreambuf_iterator(file), std::istreambuf_iterator(), back_inserter(sparams.cfg_negative_prompt)); + if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { + sparams.cfg_negative_prompt.pop_back(); } } else if (arg == "--cfg-scale") { if (++i >= argc) { invalid_param = true; break; } - params.cfg_scale = std::stof(argv[i]); + sparams.cfg_scale = std::stof(argv[i]); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -512,7 +513,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } else if (arg == "--ignore-eos") { params.ignore_eos = true; } else if (arg == "--no-penalize-nl") { - params.penalize_nl = false; + sparams.penalize_nl = false; } else if (arg == "-l" || arg == "--logit-bias") { if (++i >= argc) { invalid_param = true; @@ -524,7 +525,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { std::string value_str; try { if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - params.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); } else { throw std::exception(); } @@ -627,6 +628,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { + const llama_sampling_params & sparams = params.sampling_params; + printf("usage: %s [options]\n", argv[0]); printf("\n"); printf("options:\n"); @@ -659,19 +662,19 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict); printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); - printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); - printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); - printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); - printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); - printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); - printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); - printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); + printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k); + printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p); + printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z); + printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p); + printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.repeat_last_n); + printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.repeat_penalty); + printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.presence_penalty); + printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.frequency_penalty); printf(" --mirostat N use Mirostat sampling.\n"); printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); - printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); - printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); - printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); + printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat); + printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta); + printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau); printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); printf(" modifies the likelihood of token appearing in the completion,\n"); printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); @@ -682,7 +685,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" negative prompt to use for guidance. (default: empty)\n"); printf(" --cfg-negative-prompt-file FNAME\n"); printf(" negative prompt file to use for guidance. (default: empty)\n"); - printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); + printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale); printf(" --rope-scale N RoPE context linear scaling factor, inverse of --rope-freq-scale\n"); printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n"); printf(" --rope-freq-scale N RoPE frequency linear scaling factor (default: loaded from model)\n"); @@ -690,7 +693,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); - printf(" --temp N temperature (default: %.1f)\n", (double)params.temp); + printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); @@ -840,7 +843,7 @@ std::tuple llama_init_from_gpt_par } if (params.ignore_eos) { - params.logit_bias[llama_token_eos(lctx)] = -INFINITY; + params.sampling_params.logit_bias[llama_token_eos(lctx)] = -INFINITY; } { @@ -932,127 +935,6 @@ std::string llama_detokenize_bpe(llama_context * ctx, const std::vector & last_tokens, - std::vector & candidates, - int idx) { - const int n_ctx = llama_n_ctx(ctx); - const int n_vocab = llama_n_vocab(llama_get_model(ctx)); - - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - - llama_token id = 0; - - float * logits = llama_get_logits_ith(ctx, idx); - - // Apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - candidates.clear(); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array cur_p = { candidates.data(), candidates.size(), false }; - - if (ctx_guidance) { - llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale); - } - - // apply penalties - if (!last_tokens.empty()) { - const float nl_logit = logits[llama_token_nl(ctx)]; - const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx); - - llama_sample_repetition_penalty(ctx, &cur_p, - last_tokens.data() + last_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &cur_p, - last_tokens.data() + last_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - - if (!penalize_nl) { - for (size_t idx = 0; idx < cur_p.size; idx++) { - if (cur_p.data[idx].id == llama_token_nl(ctx)) { - cur_p.data[idx].logit = nl_logit; - break; - } - } - } - } - - if (grammar != NULL) { - llama_sample_grammar(ctx, &cur_p, grammar); - } - - if (temp <= 0) { - // Greedy sampling - id = llama_sample_token_greedy(ctx, &cur_p); - } else { - if (mirostat == 1) { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temp(ctx, &cur_p, temp); - id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } else if (mirostat == 2) { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temp(ctx, &cur_p, temp); - id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } else { - // Temperature sampling - size_t min_keep = std::max(1, params.n_probs); - llama_sample_top_k (ctx, &cur_p, top_k, min_keep); - llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep); - llama_sample_typical (ctx, &cur_p, typical_p, min_keep); - llama_sample_top_p (ctx, &cur_p, top_p, min_keep); - llama_sample_temp(ctx, &cur_p, temp); - - { - const int n_top = 10; - LOG("top %d candidates:\n", n_top); - - for (int i = 0; i < n_top; i++) { - const llama_token id = cur_p.data[i].id; - LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p); - } - } - - id = llama_sample_token(ctx, &cur_p); - - LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str()); - } - } - // printf("`%d`", candidates_p.size); - - if (grammar != NULL) { - llama_grammar_accept_token(ctx, grammar, id); - } - - return id; -} - // // YAML utils // @@ -1204,6 +1086,8 @@ std::string get_sortable_timestamp() { void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx, const std::string & timestamp, const std::vector & prompt_tokens, const char * model_desc) { + const llama_sampling_params & sparams = params.sampling_params; + fprintf(stream, "build_commit: %s\n", BUILD_COMMIT); fprintf(stream, "build_number: %d\n", BUILD_NUMBER); fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false"); @@ -1250,21 +1134,21 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str()); fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch); - dump_string_yaml_multiline(stream, "cfg_negative_prompt", params.cfg_negative_prompt.c_str()); - fprintf(stream, "cfg_scale: %f # default: 1.0\n", params.cfg_scale); + dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str()); + fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale); fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks); fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false"); fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx); fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false"); fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n"); - fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", params.frequency_penalty); + fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.frequency_penalty); dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str()); fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n"); fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx)); - const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY; + const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx)); + const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str()); @@ -1277,7 +1161,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); fprintf(stream, "logit_bias:\n"); - for (std::pair lb : params.logit_bias) { + for (std::pair lb : sparams.logit_bias) { if (ignore_eos && lb.first == logit_bias_eos->first) { continue; } @@ -1301,30 +1185,30 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false"); - fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat); - fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau); - fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta); + fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat); + fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); + fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str()); fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); - fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", params.n_probs); + fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false"); - fprintf(stream, "no_penalize_nl: %s # default: false\n", !params.penalize_nl ? "true" : "false"); + fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false"); fprintf(stream, "numa: %s # default: false\n", params.numa ? "true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); - fprintf(stream, "presence_penalty: %f # default: 0.0\n", params.presence_penalty); + fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.presence_penalty); dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str()); fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str()); fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false"); fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false"); dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens); fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false"); - fprintf(stream, "repeat_penalty: %f # default: 1.1\n", params.repeat_penalty); + fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.repeat_penalty); fprintf(stream, "reverse_prompt:\n"); for (std::string ap : params.antiprompt) { @@ -1342,15 +1226,15 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "seed: %d # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); - fprintf(stream, "temp: %f # default: 0.8\n", params.temp); + fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES); dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector); - fprintf(stream, "tfs: %f # default: 1.0\n", params.tfs_z); + fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z); fprintf(stream, "threads: %d # default: %d\n", params.n_threads, std::thread::hardware_concurrency()); - fprintf(stream, "top_k: %d # default: 40\n", params.top_k); - fprintf(stream, "top_p: %f # default: 0.95\n", params.top_p); - fprintf(stream, "typical_p: %f # default: 1.0\n", params.typical_p); + fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k); + fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p); + fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p); fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false"); } diff --git a/common/common.h b/common/common.h index c802152791797..fa115536b64a0 100644 --- a/common/common.h +++ b/common/common.h @@ -4,6 +4,8 @@ #include "llama.h" +#include "sampling.h" + #define LOG_NO_FILE_LINE_FUNCTION #include "log.h" @@ -49,31 +51,12 @@ struct gpt_params { int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. int32_t n_beams = 0; // if non-zero then use beam search of given width. float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_scale = 0.0f; // RoPE frequency scaling factor - // sampling parameters - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // 1.0 = disabled - float repeat_penalty = 1.10f; // 1.0 = disabled - int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float frequency_penalty = 0.00f; // 0.0 = disabled - float presence_penalty = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - - std::unordered_map logit_bias; // logit bias for specific tokens - - // Classifier-Free Guidance - // https://arxiv.org/abs/2306.17806 - std::string cfg_negative_prompt; // string to help guidance - float cfg_scale = 1.f; // How strong is guidance + // // sampling parameters + struct llama_sampling_params sampling_params; std::string model = "models/7B/ggml-model-f16.gguf"; // model path std::string model_draft = ""; // draft model for speculative decoding @@ -115,7 +98,6 @@ struct gpt_params { bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool ignore_eos = false; // ignore generated EOS tokens bool instruct = false; // instruction mode (used for Alpaca models) - bool penalize_nl = true; // consider newlines as a repeatable token bool logits_all = false; // return logits for all tokens in the batch bool use_mmap = true; // use mmap for faster loads bool use_mlock = false; // use mlock to keep model in memory @@ -180,36 +162,6 @@ std::string llama_detokenize_bpe( llama_context * ctx, const std::vector & tokens); -// -// Sampling utils -// - -// this is a common sampling function used across the examples for convenience -// it can serve as a starting point for implementing your own sampling function -// -// required: -// - ctx: context to use for sampling -// - params: sampling parameters -// -// optional: -// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL -// - grammar: grammar to use for sampling, ignore if NULL -// - last_tokens: needed for repetition penalty, ignore if empty -// - idx: sample from llama_get_logits_ith(ctx, idx) -// -// returns: -// - token: sampled token -// - candidates: vector of candidate tokens -// -llama_token llama_sample_token( - struct llama_context * ctx, - struct llama_context * ctx_guidance, - struct llama_grammar * grammar, - const struct gpt_params & params, - const std::vector & last_tokens, - std::vector & candidates, - int idx = 0); - // // YAML utils // diff --git a/common/sampling.cpp b/common/sampling.cpp new file mode 100644 index 0000000000000..8ce4194593ca7 --- /dev/null +++ b/common/sampling.cpp @@ -0,0 +1,166 @@ +#include "sampling.h" + +llama_sampling_context::~llama_sampling_context() { + for (auto & it : sequence_contexts) { + if (it.second.grammar != NULL) { + llama_grammar_free(it.second.grammar); + it.second.grammar = NULL; + } + } +} + +llama_sampling_context llama_sampling_context_init( + const struct gpt_params & params, + llama_grammar * grammar) { + llama_sampling_context result; + + result.params = params.sampling_params; + result.grammar = grammar; + return result; +} + +// Note: Creates the context if it doesn't exist, so this always return something. +llama_sampler_sequence_context & llama_sampling_get_sequence_context( + llama_sampling_context & ctx_sampling, + const llama_seq_id seq) { + const auto it = ctx_sampling.sequence_contexts.find(seq); + if (it != ctx_sampling.sequence_contexts.end()) { + return it->second; + } + llama_sampler_sequence_context new_ctx = { + 2.0f * ctx_sampling.params.mirostat_tau, + ctx_sampling.grammar != NULL ? llama_grammar_copy(ctx_sampling.grammar) : NULL, + }; + return ctx_sampling.sequence_contexts.insert({seq, new_ctx}).first->second; +} + +bool llama_sampling_context_reset( + llama_sampling_context & ctx_sampling, + const llama_seq_id seq) { + const auto it = ctx_sampling.sequence_contexts.find(seq); + if (it == ctx_sampling.sequence_contexts.end()) return false; + if (it->second.grammar != NULL) { + llama_grammar_free(it->second.grammar); + it->second.grammar = NULL; + } + ctx_sampling.sequence_contexts.erase(it); + return true; +} + +llama_token llama_sampling_sample( + struct llama_context * ctx, + struct llama_context * ctx_guidance, + struct llama_sampling_context & ctx_sampling, + const std::vector & last_tokens, + std::vector & candidates, + const int idx, + llama_seq_id seq) { + const int n_ctx = llama_n_ctx(ctx); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + + const llama_sampling_params & params = ctx_sampling.params; + const float temp = params.temp; + const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k; + const float top_p = params.top_p; + const float tfs_z = params.tfs_z; + const float typical_p = params.typical_p; + const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; + const float repeat_penalty = params.repeat_penalty; + const float alpha_presence = params.presence_penalty; + const float alpha_frequency = params.frequency_penalty; + const int mirostat = params.mirostat; + const float mirostat_tau = params.mirostat_tau; + const float mirostat_eta = params.mirostat_eta; + const bool penalize_nl = params.penalize_nl; + + llama_token id = 0; + + float * logits = llama_get_logits_ith(ctx, idx); + + // Apply params.logit_bias map + for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + logits[it->first] += it->second; + } + + candidates.clear(); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array cur_p = { candidates.data(), candidates.size(), false }; + + if (ctx_guidance) { + llama_sample_classifier_free_guidance(ctx, &cur_p, ctx_guidance, params.cfg_scale); + } + + // apply penalties + if (!last_tokens.empty()) { + const float nl_logit = logits[llama_token_nl(ctx)]; + const int last_n_repeat = std::min(std::min((int)last_tokens.size(), repeat_last_n), n_ctx); + + llama_sample_repetition_penalty(ctx, &cur_p, + last_tokens.data() + last_tokens.size() - last_n_repeat, + last_n_repeat, repeat_penalty); + llama_sample_frequency_and_presence_penalties(ctx, &cur_p, + last_tokens.data() + last_tokens.size() - last_n_repeat, + last_n_repeat, alpha_frequency, alpha_presence); + + if (!penalize_nl) { + for (size_t idx = 0; idx < cur_p.size; idx++) { + if (cur_p.data[idx].id == llama_token_nl(ctx)) { + cur_p.data[idx].logit = nl_logit; + break; + } + } + } + } + + llama_sampler_sequence_context & ctx_seq = llama_sampling_get_sequence_context(ctx_sampling, seq); + + if (ctx_seq.grammar != NULL) { + llama_sample_grammar(ctx, &cur_p, ctx_seq.grammar); + } + + if (temp <= 0) { + // Greedy sampling + id = llama_sample_token_greedy(ctx, &cur_p); + } else { + if (mirostat == 1) { + const int mirostat_m = 100; + llama_sample_temp(ctx, &cur_p, temp); + id = llama_sample_token_mirostat(ctx, &cur_p, mirostat_tau, mirostat_eta, mirostat_m, &ctx_seq.mirostat_mu); + } else if (mirostat == 2) { + llama_sample_temp(ctx, &cur_p, temp); + id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &ctx_seq.mirostat_mu); + } else { + // Temperature sampling + size_t min_keep = std::max(1, params.n_probs); + llama_sample_top_k (ctx, &cur_p, top_k, min_keep); + llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep); + llama_sample_typical (ctx, &cur_p, typical_p, min_keep); + llama_sample_top_p (ctx, &cur_p, top_p, min_keep); + llama_sample_temp(ctx, &cur_p, temp); + + { + const int n_top = 10; + LOG("top %d candidates:\n", n_top); + + for (int i = 0; i < n_top; i++) { + const llama_token id = cur_p.data[i].id; + (void)id; // To avoid a warning that id is unused when logging is disabled. + LOG(" - %5d: '%12s' (%.3f)\n", id, llama_token_to_piece(ctx, id).c_str(), cur_p.data[i].p); + } + } + + id = llama_sample_token(ctx, &cur_p); + + LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx, id).c_str()); + } + } + + if (ctx_seq.grammar != NULL) { + llama_grammar_accept_token(ctx, ctx_seq.grammar, id); + } + + return id; +} diff --git a/common/sampling.h b/common/sampling.h new file mode 100644 index 0000000000000..0aab5d03c2f61 --- /dev/null +++ b/common/sampling.h @@ -0,0 +1,108 @@ +#pragma once + +#include "llama.h" + +#include +#include +#include + +// sampling parameters +typedef struct llama_sampling_params { + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // 1.0 = disabled + float repeat_penalty = 1.10f; // 1.0 = disabled + int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float frequency_penalty = 0.00f; // 0.0 = disabled + float presence_penalty = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + + bool penalize_nl = true; // consider newlines as a repeatable token + + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + + // Classifier-Free Guidance + // https://arxiv.org/abs/2306.17806 + std::string cfg_negative_prompt; // string to help guidance + float cfg_scale = 1.f; // How strong is guidance + + std::unordered_map logit_bias; // logit bias for specific tokens + +} llama_sampling_params; + +// per-sequence sampler context +typedef struct llama_sampler_sequence_context { + float mirostat_mu; // mirostat sampler state + llama_grammar * grammar; +} llama_sampler_sequence_context; + +// general sampler context +typedef struct llama_sampling_context { + ~llama_sampling_context(); + + // parameters that will be used for sampling and when creating + // new llama_sampler_sequence_context instances + llama_sampling_params params; + + // map of sequence ids to sampler contexts + std::unordered_map sequence_contexts; + + // when non-NULL, new instances of llama_sampler_sequence_context + // will get a copy of the grammar here + // note: only the pointer is stored here, it is not a copy of + // the grammar and shouldn't be freed + llama_grammar * grammar; +} llama_sampling_context; + +#include "common.h" + +// Create a new sampling context instance. +llama_sampling_context llama_sampling_context_init( + const struct gpt_params & params, + llama_grammar * grammar = NULL); + +// Fetches the sampler context for the specified sequence id (defaults to 0). +// If the context for that sequence id doesn't already exist, it will be created with +// default values based on the parameters in the ctx_sampling argument. +llama_sampler_sequence_context & llama_sampling_get_sequence_context( + llama_sampling_context & ctx_sampling, + const llama_seq_id seq = 0); + +// Reset the sampler context for the supplied sequence id (defaults to 0). +// This is necessary to reuse a sequence id or free memory used by sequences +// that are no longer required. +bool llama_sampling_context_reset( + llama_sampling_context & ctx_sampling, + const llama_seq_id seq = 0); + +// this is a common sampling function used across the examples for convenience +// it can serve as a starting point for implementing your own sampling function +// Note: When using multiple sequences, it is the caller's responsibility to call +// llama_sampling_context_reset when a sequence ends +// +// required: +// - ctx: context to use for sampling +// - ctx_sampling: sampling-specific context +// +// optional: +// - ctx_guidance: context to use for classifier-free guidance, ignore if NULL +// - last_tokens: needed for repetition penalty, ignore if empty +// - idx: sample from llama_get_logits_ith(ctx, idx) +// - seq: sequence id to associate sampler state with +// +// returns: +// - token: sampled token +// - candidates: vector of candidate tokens +// +llama_token llama_sampling_sample( + struct llama_context * ctx, + struct llama_context * ctx_guidance, + struct llama_sampling_context & ctx_sampling, + const std::vector & last_tokens, + std::vector & candidates, + const int idx = 0, + llama_seq_id seq = 0); diff --git a/examples/embd-input/embd-input-lib.cpp b/examples/embd-input/embd-input-lib.cpp index 99e6bdad5ac45..87a5a1c26f88b 100644 --- a/examples/embd-input/embd-input-lib.cpp +++ b/examples/embd-input/embd-input-lib.cpp @@ -128,21 +128,22 @@ bool eval_string(struct MyModel * mymodel,const char* str){ llama_token sampling_id(struct MyModel* mymodel) { llama_context* ctx = mymodel->ctx; gpt_params params = mymodel->params; + llama_sampling_params & sparams = params.sampling_params; // int n_ctx = llama_n_ctx(ctx); // out of user input, sample next token - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; + const float temp = sparams.temp; + const int32_t top_k = sparams.top_k <= 0 ? llama_n_vocab(llama_get_model(ctx)) : sparams.top_k; + const float top_p = sparams.top_p; + const float tfs_z = sparams.tfs_z; + const float typical_p = sparams.typical_p; // const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; // const float repeat_penalty = params.repeat_penalty; // const float alpha_presence = params.presence_penalty; // const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; + const int mirostat = sparams.mirostat; + const float mirostat_tau = sparams.mirostat_tau; + const float mirostat_eta = sparams.mirostat_eta; // const bool penalize_nl = params.penalize_nl; llama_token id = 0; @@ -151,7 +152,7 @@ llama_token sampling_id(struct MyModel* mymodel) { auto n_vocab = llama_n_vocab(llama_get_model(ctx)); // Apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { + for (auto it = sparams.logit_bias.begin(); it != sparams.logit_bias.end(); it++) { logits[it->first] += it->second; } diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index d994de5e850c3..187623f5d8424 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -104,6 +104,7 @@ static void sigint_handler(int signo) { int main(int argc, char ** argv) { gpt_params params; + llama_sampling_params & sparams = params.sampling_params; g_params = ¶ms; if (!gpt_params_parse(argc, argv, params)) { @@ -206,7 +207,7 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (params.cfg_scale > 1.f) { + if (sparams.cfg_scale > 1.f) { struct llama_context_params lparams = llama_context_params_from_gpt_params(params); ctx_guidance = llama_new_context_with_model(model, lparams); } @@ -269,9 +270,9 @@ int main(int argc, char ** argv) { int guidance_offset = 0; int original_prompt_len = 0; if (ctx_guidance) { - LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt)); + LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos); LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); @@ -312,7 +313,7 @@ int main(int argc, char ** argv) { if (ctx_guidance) { LOG_TEE("\n"); - LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); @@ -358,7 +359,7 @@ int main(int argc, char ** argv) { } } LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", - params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); + sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); @@ -376,8 +377,8 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); { - auto it = params.logit_bias.find(llama_token_eos(ctx)); - if (it != params.logit_bias.end() && it->second == -INFINITY) { + auto it = sparams.logit_bias.find(llama_token_eos(ctx)); + if (it != sparams.logit_bias.end() && it->second == -INFINITY) { LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); } } @@ -434,6 +435,7 @@ int main(int argc, char ** argv) { const int n_vocab = llama_n_vocab(model); + llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar); std::vector candidates; candidates.reserve(n_vocab); @@ -552,7 +554,7 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) { - const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates); + const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates); last_tokens.erase(last_tokens.begin()); last_tokens.push_back(id); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 775a5a201e5b8..b39a67d979c88 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -109,6 +109,7 @@ int main(int argc, char ** argv) { if (!gpt_params_parse(argc, argv, params)) { return 1; } + llama_sampling_params & sparams = params.sampling_params; #ifndef LOG_DISABLE_LOGS log_set_target(log_filename_generator("main", "log")); @@ -179,7 +180,7 @@ int main(int argc, char ** argv) { // load the model and apply lora adapter, if any LOG("%s: load the model and apply lora adapter, if any\n", __func__); std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (params.cfg_scale > 1.f) { + if (sparams.cfg_scale > 1.f) { struct llama_context_params lparams = llama_context_params_from_gpt_params(params); ctx_guidance = llama_new_context_with_model(model, lparams); } @@ -257,9 +258,9 @@ int main(int argc, char ** argv) { int guidance_offset = 0; int original_prompt_len = 0; if (ctx_guidance) { - LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(params.cfg_negative_prompt)); + LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - guidance_inp = ::llama_tokenize(ctx_guidance, params.cfg_negative_prompt, add_bos); + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos); LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp)); std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); @@ -343,7 +344,7 @@ int main(int argc, char ** argv) { if (ctx_guidance) { LOG_TEE("\n"); - LOG_TEE("%s: negative prompt: '%s'\n", __func__, params.cfg_negative_prompt.c_str()); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); for (int i = 0; i < (int) guidance_inp.size(); i++) { LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); @@ -395,7 +396,7 @@ int main(int argc, char ** argv) { } } LOG_TEE("sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n", - params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau); + sparams.repeat_last_n, sparams.repeat_penalty, sparams.presence_penalty, sparams.frequency_penalty, sparams.top_k, sparams.tfs_z, sparams.top_p, sparams.typical_p, sparams.temp, sparams.mirostat, sparams.mirostat_eta, sparams.mirostat_tau); LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); LOG_TEE("\n\n"); @@ -413,8 +414,8 @@ int main(int argc, char ** argv) { LOG_TEE("\n"); { - auto it = params.logit_bias.find(llama_token_eos(ctx)); - if (it != params.logit_bias.end() && it->second == -INFINITY) { + auto it = sparams.logit_bias.find(llama_token_eos(ctx)); + if (it != sparams.logit_bias.end() && it->second == -INFINITY) { LOG_TEE("%s: warning: EOS token is disabled, which will cause most grammars to fail\n", __func__); } } @@ -469,6 +470,7 @@ int main(int argc, char ** argv) { const int n_vocab = llama_n_vocab(model); + llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar); std::vector candidates; candidates.reserve(n_vocab); @@ -625,7 +627,7 @@ int main(int argc, char ** argv) { LOG("saved session to %s\n", path_session.c_str()); } - const llama_token id = llama_sample_token(ctx, ctx_guidance, grammar, params, last_tokens, candidates); + const llama_token id = llama_sampling_sample(ctx, ctx_guidance, ctx_sampling, last_tokens, candidates); last_tokens.erase(last_tokens.begin()); last_tokens.push_back(id); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 04f1e45b955dd..63ddcd8ed8857 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -125,6 +125,8 @@ int main(int argc, char ** argv) { params.logits_all = true; std::tie(model, ctx) = llama_init_from_gpt_params(params); + llama_sampling_context ctx_sampling = llama_sampling_context_init(params, NULL); + // load the prompts from an external file if there are any if (params.prompt.empty()) { printf("\n\033[32mNo new questions so proceed with build-in defaults.\033[0m\n"); @@ -339,7 +341,7 @@ int main(int argc, char ** argv) { //printf("client %d, seq %d, token %d, pos %d, batch %d\n", // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch); - const llama_token id = llama_sample_token(ctx, NULL, NULL, params, client.tokens_prev, candidates, client.i_batch - i); + const llama_token id = llama_sampling_sample(ctx, NULL, ctx_sampling, client.tokens_prev, candidates, client.i_batch - i, client.seq_id); if (client.n_decoded == 1) { // start measuring generation time after the first token to make sure all concurrent clients @@ -384,7 +386,7 @@ int main(int argc, char ** argv) { n_total_prompt += client.n_prompt; n_total_gen += client.n_decoded; - + llama_sampling_context_reset(ctx_sampling, client.seq_id); client.seq_id = -1; } diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index acc6dbdfd07d0..f9e3c98a38a40 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -8,9 +8,10 @@ int main(int argc, char ** argv) { gpt_params params; + llama_sampling_params & sparams = params.sampling_params; params.seed = 42; params.n_threads = 4; - params.repeat_last_n = 64; + sparams.repeat_last_n = 64; params.prompt = "The quick brown fox"; if (!gpt_params_parse(argc, argv, params)) { @@ -24,7 +25,7 @@ int main(int argc, char ** argv) { } auto n_past = 0; - auto last_n_tokens_data = std::vector(params.repeat_last_n, 0); + auto last_n_tokens_data = std::vector(sparams.repeat_last_n, 0); // init llama_model * model; diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 8c5318c650ae8..58af78de96cb5 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -200,6 +200,7 @@ struct llama_server_context llama_model *model = nullptr; llama_context *ctx = nullptr; gpt_params params; + llama_sampling_context ctx_sampling; int n_ctx; grammar_parser::parse_state parsed_grammar; @@ -254,6 +255,7 @@ struct llama_server_context if (grammar != nullptr) { llama_grammar_free(grammar); grammar = nullptr; + ctx_sampling = llama_sampling_context_init(params, NULL); } } @@ -329,8 +331,8 @@ struct llama_server_context grammar_parser::print_grammar(stderr, parsed_grammar); { - auto it = params.logit_bias.find(llama_token_eos(ctx)); - if (it != params.logit_bias.end() && it->second == -INFINITY) { + auto it = params.sampling_params.logit_bias.find(llama_token_eos(ctx)); + if (it != params.sampling_params.logit_bias.end() && it->second == -INFINITY) { LOG_WARNING("EOS token is disabled, which will cause most grammars to fail", {}); } } @@ -339,6 +341,7 @@ struct llama_server_context grammar = llama_grammar_init( grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); } + ctx_sampling = llama_sampling_context_init(params, grammar); return true; } @@ -550,12 +553,12 @@ struct llama_server_context std::vector candidates; candidates.reserve(llama_n_vocab(model)); - result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates); + result.tok = llama_sampling_sample(ctx, NULL, ctx_sampling, last_n_tokens, candidates); llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; - const int32_t n_probs = params.n_probs; - if (params.temp <= 0 && n_probs > 0) + const int32_t n_probs = params.sampling_params.n_probs; + if (params.sampling_params.temp <= 0 && n_probs > 0) { // For llama_sample_token_greedy we need to sort candidates llama_sample_softmax(ctx, &candidates_p); @@ -630,7 +633,7 @@ struct llama_server_context const std::string token_text = token_with_probs.tok == -1 ? "" : llama_token_to_piece(ctx, token_with_probs.tok); generated_text += token_text; - if (params.n_probs > 0) + if (params.sampling_params.n_probs > 0) { generated_token_probs.push_back(token_with_probs); } @@ -1018,34 +1021,35 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, static json format_generation_settings(llama_server_context &llama) { - const auto eos_bias = llama.params.logit_bias.find(llama_token_eos(llama.ctx)); - const bool ignore_eos = eos_bias != llama.params.logit_bias.end() && + const auto & sparams = llama.params.sampling_params; + const auto eos_bias = sparams.logit_bias.find(llama_token_eos(llama.ctx)); + const bool ignore_eos = eos_bias != sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); return json{ {"n_ctx", llama.n_ctx}, {"model", llama.params.model_alias}, {"seed", llama.params.seed}, - {"temp", llama.params.temp}, - {"top_k", llama.params.top_k}, - {"top_p", llama.params.top_p}, - {"tfs_z", llama.params.tfs_z}, - {"typical_p", llama.params.typical_p}, - {"repeat_last_n", llama.params.repeat_last_n}, - {"repeat_penalty", llama.params.repeat_penalty}, - {"presence_penalty", llama.params.presence_penalty}, - {"frequency_penalty", llama.params.frequency_penalty}, - {"mirostat", llama.params.mirostat}, - {"mirostat_tau", llama.params.mirostat_tau}, - {"mirostat_eta", llama.params.mirostat_eta}, - {"penalize_nl", llama.params.penalize_nl}, + {"temp", sparams.temp}, + {"top_k", sparams.top_k}, + {"top_p", sparams.top_p}, + {"tfs_z", sparams.tfs_z}, + {"typical_p", sparams.typical_p}, + {"repeat_last_n", sparams.repeat_last_n}, + {"repeat_penalty", sparams.repeat_penalty}, + {"presence_penalty", sparams.presence_penalty}, + {"frequency_penalty", sparams.frequency_penalty}, + {"mirostat", sparams.mirostat}, + {"mirostat_tau", sparams.mirostat_tau}, + {"mirostat_eta", sparams.mirostat_eta}, + {"penalize_nl", sparams.penalize_nl}, {"stop", llama.params.antiprompt}, {"n_predict", llama.params.n_predict}, {"n_keep", llama.params.n_keep}, {"ignore_eos", ignore_eos}, {"stream", llama.stream}, - {"logit_bias", llama.params.logit_bias}, - {"n_probs", llama.params.n_probs}, + {"logit_bias", sparams.logit_bias}, + {"n_probs", sparams.n_probs}, {"grammar", llama.params.grammar}, }; } @@ -1094,7 +1098,7 @@ static json format_final_response(llama_server_context &llama, const std::string {"timings", format_timings(llama)}, }; - if (llama.params.n_probs > 0) + if (llama.params.sampling_params.n_probs > 0) { res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); } @@ -1110,7 +1114,7 @@ static json format_partial_response( {"stop", false}, }; - if (llama.params.n_probs > 0) + if (llama.params.sampling_params.n_probs > 0) { res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); } @@ -1142,26 +1146,28 @@ static T json_value(const json &body, const std::string &key, const T &default_v static void parse_options_completion(const json &body, llama_server_context &llama) { gpt_params default_params; + const auto & default_sparams = default_params.sampling_params; + auto & sparams = llama.params.sampling_params; llama.stream = json_value(body, "stream", false); llama.params.n_predict = json_value(body, "n_predict", default_params.n_predict); - llama.params.top_k = json_value(body, "top_k", default_params.top_k); - llama.params.top_p = json_value(body, "top_p", default_params.top_p); - llama.params.tfs_z = json_value(body, "tfs_z", default_params.tfs_z); - llama.params.typical_p = json_value(body, "typical_p", default_params.typical_p); - llama.params.repeat_last_n = json_value(body, "repeat_last_n", default_params.repeat_last_n); - llama.params.temp = json_value(body, "temperature", default_params.temp); - llama.params.repeat_penalty = json_value(body, "repeat_penalty", default_params.repeat_penalty); - llama.params.presence_penalty = json_value(body, "presence_penalty", default_params.presence_penalty); - llama.params.frequency_penalty = json_value(body, "frequency_penalty", default_params.frequency_penalty); - llama.params.mirostat = json_value(body, "mirostat", default_params.mirostat); - llama.params.mirostat_tau = json_value(body, "mirostat_tau", default_params.mirostat_tau); - llama.params.mirostat_eta = json_value(body, "mirostat_eta", default_params.mirostat_eta); - llama.params.penalize_nl = json_value(body, "penalize_nl", default_params.penalize_nl); + sparams.top_k = json_value(body, "top_k", default_sparams.top_k); + sparams.top_p = json_value(body, "top_p", default_sparams.top_p); + sparams.tfs_z = json_value(body, "tfs_z", default_sparams.tfs_z); + sparams.typical_p = json_value(body, "typical_p", default_sparams.typical_p); + sparams.repeat_last_n = json_value(body, "repeat_last_n", default_sparams.repeat_last_n); + sparams.temp = json_value(body, "temperature", default_sparams.temp); + sparams.repeat_penalty = json_value(body, "repeat_penalty", default_sparams.repeat_penalty); + sparams.presence_penalty = json_value(body, "presence_penalty", default_sparams.presence_penalty); + sparams.frequency_penalty = json_value(body, "frequency_penalty", default_sparams.frequency_penalty); + sparams.mirostat = json_value(body, "mirostat", default_sparams.mirostat); + sparams.mirostat_tau = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); + sparams.mirostat_eta = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); + sparams.penalize_nl = json_value(body, "penalize_nl", default_sparams.penalize_nl); llama.params.n_keep = json_value(body, "n_keep", default_params.n_keep); llama.params.seed = json_value(body, "seed", default_params.seed); llama.params.grammar = json_value(body, "grammar", default_params.grammar); - llama.params.n_probs = json_value(body, "n_probs", default_params.n_probs); + sparams.n_probs = json_value(body, "n_probs", default_sparams.n_probs); if (body.count("prompt") != 0) { @@ -1172,10 +1178,10 @@ static void parse_options_completion(const json &body, llama_server_context &lla llama.prompt = ""; } - llama.params.logit_bias.clear(); + sparams.logit_bias.clear(); if (json_value(body, "ignore_eos", false)) { - llama.params.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY; + sparams.logit_bias[llama_token_eos(llama.ctx)] = -INFINITY; } const auto &logit_bias = body.find("logit_bias"); @@ -1191,11 +1197,11 @@ static void parse_options_completion(const json &body, llama_server_context &lla { if (el[1].is_number()) { - llama.params.logit_bias[tok] = el[1].get(); + sparams.logit_bias[tok] = el[1].get(); } else if (el[1].is_boolean() && !el[1].get()) { - llama.params.logit_bias[tok] = -INFINITY; + sparams.logit_bias[tok] = -INFINITY; } } } @@ -1215,6 +1221,8 @@ static void parse_options_completion(const json &body, llama_server_context &lla } } + llama.ctx_sampling = llama_sampling_context_init(llama.params, llama.grammar); + LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama)); } @@ -1423,7 +1431,7 @@ int main(int argc, char **argv) } auto probs = llama.generated_token_probs; - if (llama.params.n_probs > 0 && llama.stopped_word) { + if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) { const std::vector stop_word_toks = llama_tokenize(llama.ctx, llama.stopping_word, false); probs = std::vector(llama.generated_token_probs.begin(), llama.generated_token_probs.end() - stop_word_toks.size()); } @@ -1475,7 +1483,7 @@ int main(int argc, char **argv) std::vector probs_output = {}; - if (llama.params.n_probs > 0) { + if (llama.params.sampling_params.n_probs > 0) { const std::vector to_send_toks = llama_tokenize(llama.ctx, to_send, false); size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); @@ -1596,7 +1604,7 @@ int main(int argc, char **argv) std::vector probs_output = {}; - if (llama.params.n_probs > 0) { + if (llama.params.sampling_params.n_probs > 0) { const std::vector to_send_toks = llama_tokenize(llama.ctx, to_send, false); size_t probs_pos = std::min(sent_token_probs_index, llama.generated_token_probs.size()); size_t probs_stop_pos = std::min(sent_token_probs_index + to_send_toks.size(), llama.generated_token_probs.size()); diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 75a2e5e22d046..018dbf9a205b9 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -125,6 +125,8 @@ int main(int argc, char ** argv) { grammar_tgt = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); } + llama_sampling_context ctx_sampling = llama_sampling_context_init(params, grammar_tgt); + const auto t_dec_start = ggml_time_us(); while (true) { @@ -134,7 +136,7 @@ int main(int argc, char ** argv) { while (true) { // sample from the target model - llama_token id = llama_sample_token(ctx_tgt, NULL, grammar_tgt, params, last_tokens, candidates, i_dft); + llama_token id = llama_sampling_sample(ctx_tgt, NULL, ctx_sampling, last_tokens, candidates, i_dft); // remember which tokens were sampled - used for repetition penalties during sampling last_tokens.erase(last_tokens.begin()); @@ -211,7 +213,13 @@ int main(int argc, char ** argv) { if (grammar_dft) { llama_grammar_free(grammar_dft); } - grammar_dft = llama_grammar_copy(grammar_tgt); + // Note: Hardcoded to sequence id 0, if this ever supports parallel generation + // that will need to change. + auto it = ctx_sampling.sequence_contexts.find(0); + GGML_ASSERT(it != ctx_sampling.sequence_contexts.end()); + // This is necessary because each sequence id in sequence_contexts + // uses a copy of the original grammar. + grammar_dft = llama_grammar_copy(it->second.grammar); LOG("copied target grammar to draft grammar\n"); } From a8bdd65525ae86dea905e9866ad369b53e30ac14 Mon Sep 17 00:00:00 2001 From: Michael Coppola Date: Wed, 11 Oct 2023 15:42:22 -0400 Subject: [PATCH 25/26] server : add parameter -tb N, --threads-batch N (#3584) Co-authored-by: Michael Coppola --- examples/server/server.cpp | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 58af78de96cb5..d992feeef7026 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -714,15 +714,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf("usage: %s [options]\n", argv0); printf("\n"); printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); - printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); - printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); if (llama_mlock_supported()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); @@ -867,6 +868,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_threads = std::stoi(argv[i]); } + else if (arg == "--threads-batch" || arg == "-tb") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_threads_batch = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) From b8fe4b5cc9cb237ca98e5bc51b5d189e3c446d13 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 11 Oct 2023 23:55:08 +0300 Subject: [PATCH 26/26] main : fix session loading bug (#3400) --- examples/main/main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/main/main.cpp b/examples/main/main.cpp index b39a67d979c88..55f73356fb89a 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -297,6 +297,9 @@ int main(int argc, char ** argv) { LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", __func__, n_matching_session_tokens, embd_inp.size()); } + + // remove any "future" tokens that we might have inherited from the previous session + llama_kv_cache_tokens_rm(ctx, n_matching_session_tokens, -1); } LOGLN( @@ -545,9 +548,6 @@ int main(int argc, char ** argv) { if (i > 0) { embd.erase(embd.begin(), embd.begin() + i); } - - // remove any "future" tokens that we might have inherited from the session from the KV cache - llama_kv_cache_tokens_rm(ctx, n_past, -1); } // evaluate tokens in batches