diff --git a/Makefile b/Makefile
index 346b120f9217ac..8478302fb31ace 100644
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,7 @@ BUILD_TARGETS = \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
+	llama-vision \
 	llama-cvector-generator \
 	llama-gen-docs \
 	tests/test-c.o
@@ -1469,6 +1470,11 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-vision: examples/vision/vision.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 ifdef GGML_RPC
 rpc-server: examples/rpc/rpc-server.cpp \
 	$(OBJ_GGML)
diff --git a/common/common.cpp b/common/common.cpp
index cf075d45dc701e..42cb8e697143fe 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1526,7 +1526,7 @@ std::vector<llama_token> llama_tokenize_with_img(
     std::vector<llama_token> output;
     for (const auto & part : parts) {
         bool add_bos = &parts.front() == &part;
-        auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
+        auto tokens = common_tokenize(ctx, part, add_special && add_bos, parse_special);
         output.insert(output.end(), tokens.begin(), tokens.end());
         if (&parts.back() != &part) {
             // add image token to middle of 2 parts
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d63a96c1c25475..41f3d398f82358 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -51,4 +51,5 @@ else()
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)
     add_subdirectory(tokenize)
+    add_subdirectory(vision)
 endif()
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 15e2923b3ad6ae..59760fe95db220 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -2,7 +2,6 @@
 #include <cstdio>
 #include <cstring>
 #include <string>
-#include "vision.h"
 #include <vector>
 
 static void print_usage(int, char ** argv) {
@@ -21,10 +20,7 @@ int main(int argc, char ** argv) {
     // number of tokens to predict
     int n_predict = 32;
 
-    //params.prompt = "Hello my name is";
-    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
-        "USER:\nwhat did you see?\nASSISTANT:";
-    params.n_predict = 32;
+
     // parse command line arguments
     {
         int i = 1;
@@ -127,107 +123,25 @@ int main(int argc, char ** argv) {
 
     llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
 
-    // tokenize the prompt
-
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);
-
-    const int n_ctx    = llama_n_ctx(ctx);
-    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
-
-    LOG("\n");
-    LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
-
-    // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
-        return 1;
-    }
-
-    // print the prompt token-by-token
-
-    LOG("\n");
-
-    for (auto id : tokens_list) {
-        if (id == TOKEN_IMG_PLACEMENT) {
-            LOG("");
-        } else {
-            LOG("%s", llama_token_to_piece(ctx, id).c_str());
-        }
-    }
-
-    LOG("\n\n");
-
-    // load image
-    llama_batch_img img_batch = llama_batch_img_init(1);
-    img_batch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
-
-    // create a llama_batch with size 512
-    // we use this object to submit token data for decoding
-
-    llama_batch batch = llama_batch_init(512, 0, 1);
-
-    // evaluate the initial prompt
-    int n_cur = 0;
-    int i_img = 0;
-    for (auto id : tokens_list) {
-        if (id == TOKEN_IMG_PLACEMENT) {
-            img_batch.pos[i_img] = n_cur;
-            n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
-            i_img++;
-        } else {
-            llama_batch_add(batch, id, n_cur, { 0 }, false);
-            printf("pos %d tok %d --> %s\n", n_cur, id, llama_token_to_piece(ctx, id).c_str());
-            n_cur++;
+    for (auto id : prompt_tokens) {
+        char buf[128];
+        int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
+        if (n < 0) {
+            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+            return 1;
         }
+        std::string s(buf, n);
+        printf("%s", s.c_str());
     }
 
-    // llama_decode will output logits only for the last token of the prompt
-    batch.logits[batch.n_tokens - 1] = true;
-
-    if (llama_encode_vision(ctx, img_batch) != 0) {
-        LOG("%s: llama_encode_vision() failed\n", __func__);
-        return 1;
-    }
-
-    n_cur = 0;
-    {
-        auto t1 = ::llama_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
-        auto t2 = ::llama_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
-        t1.insert(t1.begin(), 1);
-
-        n_cur = 0;
-        llama_batch_clear(batch);
-        llama_batch_add(batch, 1, 0, { 0 }, false);
-        llama_decode(ctx, batch);
-
-        n_cur = t1.size();
-        llama_batch_clear(batch);
-        llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
-        llama_decode(ctx, batch0);
-
-        n_cur = 0;
-        llama_batch_clear(batch);
-        for (auto t : t1) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
-        llama_decode(ctx, batch);
-
-        n_cur = t1.size() + 576;
-        llama_batch_clear(batch);
-        printf("pos %d\n", n_cur);
-        for (auto t : t2) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
-        batch.logits[batch.n_tokens - 1] = true;
-    }
+    // prepare a batch for the prompt
 
-    if (llama_decode(ctx, batch) != 0) {
-        LOG("%s: llama_decode() failed\n", __func__);
-        return 1;
-    }
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
 
     // main loop
 
-    int n_decode = 0;
-
     const auto t_main_start = ggml_time_us();
     int n_decode = 0;
     llama_token new_token_id;
@@ -241,15 +155,12 @@ int main(int argc, char ** argv) {
 
         n_pos += batch.n_tokens;
 
-        for (int i = 0; i < n_predict; i++) {
         // sample the next token
         {
            new_token_id = llama_sampler_sample(smpl, ctx, -1);
 
            // is it an end of generation?
            if (llama_token_is_eog(model, new_token_id)) {
-                LOG("\n");
-
                break;
            }
diff --git a/examples/vision/CMakeLists.txt b/examples/vision/CMakeLists.txt
new file mode 100644
index 00000000000000..572dae88f170e8
--- /dev/null
+++ b/examples/vision/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-vision)
+add_executable(${TARGET} vision.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp
new file mode 100644
index 00000000000000..262943b9f80c88
--- /dev/null
+++ b/examples/vision/vision.cpp
@@ -0,0 +1,224 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "vision.h"
+
+#include <vector>
+
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
+    LOG("\n");
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    //params.prompt = "Hello my name is";
+    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
+        "USER:\nwhat did you see?\nASSISTANT:";
+    params.n_predict = 32;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+        return 1;
+    }
+
+    common_init();
+
+    // total length of the sequence including the prompt
+    const int n_predict = params.n_predict;
+
+    // init LLM
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // initialize the model
+
+    llama_model_params model_params = common_model_params_to_llama(params);
+
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    // initialize the context
+
+    llama_context_params ctx_params = common_context_params_to_llama(params);
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    auto sparams = llama_sampler_chain_default_params();
+
+    sparams.no_perf = false;
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+    // tokenize the prompt
+
+    std::vector<llama_token> tokens_list;
+    tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);
+
+    const int n_ctx    = llama_n_ctx(ctx);
+    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
+
+    LOG("\n");
+    LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
+
+    // make sure the KV cache is big enough to hold all the prompt and generated tokens
+    if (n_kv_req > n_ctx) {
+        LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
+        LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
+        return 1;
+    }
+
+    // print the prompt token-by-token
+
+    LOG("\n");
+
+    for (auto id : tokens_list) {
+        if (id == TOKEN_IMG_PLACEMENT) {
+            LOG("");
+        } else {
+            LOG("%s", common_token_to_piece(ctx, id).c_str());
+        }
+    }
+
+    LOG("\n\n");
+
+    // load image
+    llama_batch_img img_batch = llama_batch_img_init(1);
+    img_batch.imgs[0] = load_image_from_file("models/eiffel-tower-3349075_1280.jpg");
+
+    // create a llama_batch with size 512
+    // we use this object to submit token data for decoding
+
+    llama_batch batch = llama_batch_init(512, 0, 1);
+
+    // evaluate the initial prompt
+    int n_cur = 0;
+    int i_img = 0;
+    for (auto id : tokens_list) {
+        if (id == TOKEN_IMG_PLACEMENT) {
+            img_batch.pos[i_img] = n_cur;
+            n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
+            i_img++;
+        } else {
+            common_batch_add(batch, id, n_cur, { 0 }, false);
+            printf("pos %d tok %d --> %s\n", n_cur, id, common_token_to_piece(ctx, id).c_str());
+            n_cur++;
+        }
+    }
+
+    // llama_decode will output logits only for the last token of the prompt
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_encode_vision(ctx, img_batch) != 0) {
+        LOG("%s: llama_encode_vision() failed\n", __func__);
+        return 1;
+    }
+
+    n_cur = 0;
+    {
+        auto t1 = ::common_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
+        auto t2 = ::common_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
+        t1.insert(t1.begin(), 1);
+
+        n_cur = 0;
+        common_batch_clear(batch);
+        common_batch_add(batch, 1, 0, { 0 }, false);
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size();
+        common_batch_clear(batch);
+        llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr };
+        llama_decode(ctx, batch0);
+
+        n_cur = 0;
+        common_batch_clear(batch);
+        for (auto t : t1) { common_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size() + 576;
+        common_batch_clear(batch);
+        printf("pos %d\n", n_cur);
+        for (auto t : t2) { common_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        batch.logits[batch.n_tokens - 1] = true;
+    }
+
+    if (llama_decode(ctx, batch) != 0) {
+        LOG("%s: llama_decode() failed\n", __func__);
+        return 1;
+    }
+
+    // main loop
+
+    int n_decode = 0;
+
+    const auto t_main_start = ggml_time_us();
+
+    for (int i = 0; i < n_predict; i++) {
+        // sample the next token
+        {
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id)) {
+                LOG("\n");
+
+                break;
+            }
+
+            LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
+            fflush(stdout);
+
+            // prepare the next batch
+            common_batch_clear(batch);
+
+            // push this new token for next evaluation
+            common_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+
+            n_decode += 1;
+        }
+
+        n_cur += 1;
+
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ctx, batch)) {
+            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+            return 1;
+        }
+    }
+
+    LOG("\n");
+
+    const auto t_main_end = ggml_time_us();
+
+    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
+
+    LOG("\n");
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
+
+    LOG("\n");
+
+    llama_batch_free(batch);
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return 0;
+}
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 807dd040f4e495..3344b30a27a612 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -690,6 +690,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.CLS_OUT: (
             "classifier.out_proj", # roberta
+        ),
 
         MODEL_TENSOR.V_MMPROJ: (
            "multi_modal_projector.linear_{bid}",
diff --git a/src/llama.cpp b/src/llama.cpp
index e93edcbeea7c68..339ecae4fe7904 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7224,10 +7224,9 @@ static const std::map llm_tensor_info_mapping = {
 };
 
 static const std::map vision_tensor_info_mapping = {
-    {VISION_TENSOR_MMPROJ_A,       {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
-    {VISION_TENSOR_MMPROJ_B,       {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {VISION_TENSOR_MMPROJ,         {LLM_TENSOR_LAYER_REPEATING }},
     {VISION_TENSOR_ENC_EMBD_CLS,   {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
-    {VISION_TENSOR_ENC_EMBD_PATCH, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {VISION_TENSOR_ENC_EMBD_PATCH, {LLM_TENSOR_LAYER_INPUT, }},
     {VISION_TENSOR_ENC_EMBD_POS,   {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {VISION_TENSOR_ENC_ATTN_Q,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {VISION_TENSOR_ENC_ATTN_K,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -9347,19 +9346,19 @@ static bool llm_load_tensors(
         switch (vparams.arch) {
             case VISION_ARCH_LLAVA:
                 {
-                    model.clip.mm_a_w = create_tensor(tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff}, 0);
-                    model.clip.mm_a_b = create_tensor(tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff}, 0);
-                    model.clip.mm_b_w = create_tensor(tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff}, 0);
-                    model.clip.mm_b_b = create_tensor(tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff}, 0);
+                    model.clip.mm_1_w = create_tensor(tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff}, 0);
+                    model.clip.mm_1_b = create_tensor(tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff}, 0);
+                    model.clip.mm_2_w = create_tensor(tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff}, 0);
+                    model.clip.mm_2_b = create_tensor(tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff}, 0);
 
                     model.clip.class_embedding     = create_tensor(tn(VISION_TENSOR_ENC_EMBD_CLS  ), {n_embd}, 0);
                     model.clip.patch_embeddings    = create_tensor(tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_embd}, 0);
                     model.clip.position_embeddings = create_tensor(tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_embd, max_pos_embd}, 0);
 
-                    model.clip.pre_norm_w  = create_tensor(tn(VISION_TENSOR_PRE_NORM, "weight"), {n_embd}, 0);
-                    model.clip.pre_norm_w  = create_tensor(tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_embd}, 0);
-                    model.clip.post_norm_w = create_tensor(tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd}, 0);
-                    model.clip.post_norm_w = create_tensor(tn(VISION_TENSOR_POST_NORM, "bias" ), {n_embd}, 0);
+                    model.clip.pre_norm_w  = create_tensor(tn(VISION_TENSOR_PRE_NORM, "weight"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.clip.pre_norm_b  = create_tensor(tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.clip.post_norm_w = create_tensor(tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.clip.post_norm_b = create_tensor(tn(VISION_TENSOR_POST_NORM, "bias" ), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     for (int i = 0; i < n_layer; ++i) {
                         //ggml_context * ctx_layer = ctx_for_layer(i);
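
Build and run note (illustrative, not part of the patch): given the `llama-vision` targets added to the Makefile and examples/vision/CMakeLists.txt above, the example would presumably be built and invoked roughly as follows. The model path is a placeholder, the `-m`/`-n` flags are the standard common args shown in `print_usage()`, and the input image path is currently hard-coded in vision.cpp (models/eiffel-tower-3349075_1280.jpg).

    # Makefile build
    make llama-vision

    # or CMake build
    cmake -B build && cmake --build build --target llama-vision

    # run against a vision-capable GGUF model (placeholder path)
    ./llama-vision -m model.gguf -n 32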