From c39064a2449172d3f4bc840c7bc26d400c9fecc5 Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Mon, 11 Nov 2024 16:06:12 +0100
Subject: [PATCH] vision: add separate vision example

This commit also makes a few updates to fix merge issues that I did not
discover while rebasing. The rebasing was a little challenging because
there had been quite a few changes in the upstream branch.

The motivation for a separate example is that the simple example
upstream no longer uses common.h, so I thought it would be better to
have a separate example for vision.

The current status is that I am able to run the example from the
original PR with the Eiffel tower image. With this in place, the
changes I have made here can hopefully be iterated upon and cleaned up.
---
 Makefile                       |   6 +
 common/common.cpp              |   2 +-
 examples/CMakeLists.txt        |   1 +
 examples/simple/simple.cpp     | 111 ++--------------
 examples/vision/CMakeLists.txt |   5 +
 examples/vision/vision.cpp     | 224 +++++++++++++++++++++++++++++++++
 gguf-py/gguf/tensor_mapping.py |   1 +
 src/llama.cpp                  |  21 ++--
 8 files changed, 259 insertions(+), 112 deletions(-)
 create mode 100644 examples/vision/CMakeLists.txt
 create mode 100644 examples/vision/vision.cpp

diff --git a/Makefile b/Makefile
index 346b120f9217ac..8478302fb31ace 100644
--- a/Makefile
+++ b/Makefile
@@ -37,6 +37,7 @@ BUILD_TARGETS = \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
+	llama-vision \
 	llama-cvector-generator \
 	llama-gen-docs \
 	tests/test-c.o
@@ -1469,6 +1470,11 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-vision: examples/vision/vision.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 ifdef GGML_RPC
 rpc-server: examples/rpc/rpc-server.cpp \
 	$(OBJ_GGML)
diff --git a/common/common.cpp b/common/common.cpp
index cf075d45dc701e..42cb8e697143fe 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1526,7 +1526,7 @@ std::vector<llama_token> llama_tokenize_with_img(
     std::vector<llama_token> output;
     for (const auto & part : parts) {
         bool add_bos = &parts.front() == &part;
-        auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
+        auto tokens = common_tokenize(ctx, part, add_special && add_bos, parse_special);
         output.insert(output.end(), tokens.begin(), tokens.end());
         if (&parts.back() != &part) {
             // add image token to middle of 2 parts
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d63a96c1c25475..41f3d398f82358 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -51,4 +51,5 @@ else()
     add_subdirectory(simple-chat)
     add_subdirectory(speculative)
     add_subdirectory(tokenize)
+    add_subdirectory(vision)
 endif()
diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp
index 15e2923b3ad6ae..59760fe95db220 100644
--- a/examples/simple/simple.cpp
+++ b/examples/simple/simple.cpp
@@ -2,7 +2,6 @@
 #include 
 #include 
 #include 
-#include "vision.h"
 #include 
 
 static void print_usage(int, char ** argv) {
@@ -21,10 +20,7 @@ int main(int argc, char ** argv) {
     // number of tokens to predict
     int n_predict = 32;
 
-    //params.prompt = "Hello my name is";
-    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
-        "USER:\nwhat did you see?\nASSISTANT:";
-    params.n_predict = 32;
+
     // parse command line arguments
     {
         int i = 1;
@@ -127,107 +123,25 @@ int main(int argc, char ** argv) {
 
     llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
 
-    // tokenize the prompt
-
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);
-
-    const int n_ctx = llama_n_ctx(ctx);
-    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
-
-    LOG("\n");
-    LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
-
-    // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
-        return 1;
-    }
-
     // print the prompt token-by-token
-
-    LOG("\n");
-
-    for (auto id : tokens_list) {
-        if (id == TOKEN_IMG_PLACEMENT) {
-            LOG("");
-        } else {
-            LOG("%s", llama_token_to_piece(ctx, id).c_str());
-        }
-    }
-
-    LOG("\n\n");
-
-    // load image
-    llama_batch_img img_batch = llama_batch_img_init(1);
-    img_batch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");
-
-    // create a llama_batch with size 512
-    // we use this object to submit token data for decoding
-
-    llama_batch batch = llama_batch_init(512, 0, 1);
-
-    // evaluate the initial prompt
-    int n_cur = 0;
-    int i_img = 0;
-    for (auto id : tokens_list) {
-        if (id == TOKEN_IMG_PLACEMENT) {
-            img_batch.pos[i_img] = n_cur;
-            n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
-            i_img++;
-        } else {
-            llama_batch_add(batch, id, n_cur, { 0 }, false);
-            printf("pos %d tok %d --> %s\n", n_cur, id, llama_token_to_piece(ctx, id).c_str());
-            n_cur++;
+    for (auto id : prompt_tokens) {
+        char buf[128];
+        int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
+        if (n < 0) {
+            fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+            return 1;
         }
+        std::string s(buf, n);
+        printf("%s", s.c_str());
     }
 
-    // llama_decode will output logits only for the last token of the prompt
-    batch.logits[batch.n_tokens - 1] = true;
-
-    if (llama_encode_vision(ctx, img_batch) != 0) {
-        LOG("%s: llama_encode_vision() failed\n", __func__);
-        return 1;
-    }
-
-    n_cur = 0;
-    {
-        auto t1 = ::llama_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. "
-            "The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
-        auto t2 = ::llama_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
-        t1.insert(t1.begin(), 1);
-
-        n_cur = 0;
-        llama_batch_clear(batch);
-        llama_batch_add(batch, 1, 0, { 0 }, false);
-        llama_decode(ctx, batch);
-
-        n_cur = t1.size();
-        llama_batch_clear(batch);
-        llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
-        llama_decode(ctx, batch0);
-
-        n_cur = 0;
-        llama_batch_clear(batch);
-        for (auto t : t1) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
-        llama_decode(ctx, batch);
-
-        n_cur = t1.size() + 576;
-        llama_batch_clear(batch);
-        printf("pos %d\n", n_cur);
-        for (auto t : t2) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
-        batch.logits[batch.n_tokens - 1] = true;
-    }
+    // prepare a batch for the prompt
 
-    if (llama_decode(ctx, batch) != 0) {
-        LOG("%s: llama_decode() failed\n", __func__);
-        return 1;
-    }
+    llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());
 
     // main loop
 
-    int n_decode = 0;
-
     const auto t_main_start = ggml_time_us();
     int n_decode = 0;
     llama_token new_token_id;
 
@@ -241,15 +155,12 @@ int main(int argc, char ** argv) {
 
         n_pos += batch.n_tokens;
 
-    for (int i = 0; i < n_predict; i++) {
         // sample the next token
         {
             new_token_id = llama_sampler_sample(smpl, ctx, -1);
 
             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id)) {
-                LOG("\n");
-
                 break;
             }
 
diff --git a/examples/vision/CMakeLists.txt b/examples/vision/CMakeLists.txt
new file mode 100644
index 00000000000000..572dae88f170e8
--- /dev/null
+++ b/examples/vision/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET llama-vision)
+add_executable(${TARGET} vision.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/vision/vision.cpp b/examples/vision/vision.cpp
new file mode 100644
index 00000000000000..262943b9f80c88
--- /dev/null
+++ b/examples/vision/vision.cpp
@@ -0,0 +1,224 @@
+#include "arg.h"
+#include "common.h"
+#include "log.h"
+#include "llama.h"
+#include "vision.h"
+
+#include 
+
+static void print_usage(int, char ** argv) {
+    LOG("\nexample usage:\n");
+    LOG("\n    %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
+    LOG("\n");
+}
+
+int main(int argc, char ** argv) {
+    common_params params;
+
+    //params.prompt = "Hello my name is";
+    params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
+        "USER:\nwhat did you see?\nASSISTANT:";
+    params.n_predict = 32;
+
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+        return 1;
+    }
+
+    common_init();
+
+    // total length of the sequence including the prompt
+    const int n_predict = params.n_predict;
+
+    // init LLM
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // initialize the model
+
+    llama_model_params model_params = common_model_params_to_llama(params);
+
+    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+    if (model == NULL) {
+        fprintf(stderr , "%s: error: unable to load model\n" , __func__);
+        return 1;
+    }
+
+    // initialize the context
+
+    llama_context_params ctx_params = common_context_params_to_llama(params);
+
+    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
+
+    if (ctx == NULL) {
+        fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
+        return 1;
+    }
+
+    auto sparams = llama_sampler_chain_default_params();
+
+    sparams.no_perf = false;
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+
+    // tokenize the prompt
+
+    std::vector<llama_token> tokens_list;
+    tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);
+
+    const int n_ctx = llama_n_ctx(ctx);
+    const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
+
+    LOG("\n");
+    LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
+
+    // make sure the KV cache is big enough to hold all the prompt and generated tokens
+    if (n_kv_req > n_ctx) {
+        LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
+        LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
+        return 1;
+    }
+
+    // print the prompt token-by-token
+
+    LOG("\n");
+
+    for (auto id : tokens_list) {
+        if (id == TOKEN_IMG_PLACEMENT) {
+            LOG("");
+        } else {
+            LOG("%s", common_token_to_piece(ctx, id).c_str());
+        }
+    }
+
+    LOG("\n\n");
+
+    // load image
+    llama_batch_img img_batch = llama_batch_img_init(1);
+    img_batch.imgs[0] = load_image_from_file("models/eiffel-tower-3349075_1280.jpg");
+
+    // create a llama_batch with size 512
+    // we use this object to submit token data for decoding
+
+    llama_batch batch = llama_batch_init(512, 0, 1);
+
+    // evaluate the initial prompt
+    int n_cur = 0;
+    int i_img = 0;
+    for (auto id : tokens_list) {
+        if (id == TOKEN_IMG_PLACEMENT) {
+            img_batch.pos[i_img] = n_cur;
+            n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
+            i_img++;
+        } else {
+            common_batch_add(batch, id, n_cur, { 0 }, false);
+            printf("pos %d tok %d --> %s\n", n_cur, id, common_token_to_piece(ctx, id).c_str());
+            n_cur++;
+        }
+    }
+
+    // llama_decode will output logits only for the last token of the prompt
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_encode_vision(ctx, img_batch) != 0) {
+        LOG("%s: llama_encode_vision() failed\n", __func__);
+        return 1;
+    }
+
+    n_cur = 0;
+    {
+        auto t1 = ::common_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. "
+            "The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
+        auto t2 = ::common_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
+        t1.insert(t1.begin(), 1);
+
+        n_cur = 0;
+        common_batch_clear(batch);
+        common_batch_add(batch, 1, 0, { 0 }, false);
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size();
+        common_batch_clear(batch);
+        llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr };
+        llama_decode(ctx, batch0);
+
+        n_cur = 0;
+        common_batch_clear(batch);
+        for (auto t : t1) { common_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        llama_decode(ctx, batch);
+
+        n_cur = t1.size() + 576;
+        common_batch_clear(batch);
+        printf("pos %d\n", n_cur);
+        for (auto t : t2) { common_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
+        batch.logits[batch.n_tokens - 1] = true;
+    }
+
+    if (llama_decode(ctx, batch) != 0) {
+        LOG("%s: llama_decode() failed\n", __func__);
+        return 1;
+    }
+
+    // main loop
+
+    int n_decode = 0;
+
+    const auto t_main_start = ggml_time_us();
+
+    for (int i = 0; i < n_predict; i++) {
+        // sample the next token
+        {
+            const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
+
+            // is it an end of generation?
+            if (llama_token_is_eog(model, new_token_id)) {
+                LOG("\n");
+
+                break;
+            }
+
+            LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
+            fflush(stdout);
+
+            // prepare the next batch
+            common_batch_clear(batch);
+
+            // push this new token for next evaluation
+            common_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+
+            n_decode += 1;
+        }
+
+        n_cur += 1;
+
+        // evaluate the current batch with the transformer model
+        if (llama_decode(ctx, batch)) {
+            LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
+            return 1;
+        }
+    }
+
+    LOG("\n");
+
+    const auto t_main_end = ggml_time_us();
+
+    LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+            __func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));
+
+    LOG("\n");
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx);
+
+    LOG("\n");
+
+    llama_batch_free(batch);
+    llama_sampler_free(smpl);
+    llama_free(ctx);
+    llama_free_model(model);
+
+    llama_backend_free();
+
+    return 0;
+}
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 807dd040f4e495..3344b30a27a612 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -690,6 +690,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.CLS_OUT: (
             "classifier.out_proj", # roberta
+        ),
 
         MODEL_TENSOR.V_MMPROJ: (
             "multi_modal_projector.linear_{bid}",
diff --git a/src/llama.cpp b/src/llama.cpp
index e93edcbeea7c68..339ecae4fe7904 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -7224,10 +7224,9 @@ static const std::map llm_tensor_info_mapping = {
 };
 
 static const std::map vision_tensor_info_mapping = {
-    {VISION_TENSOR_MMPROJ_A,       {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
-    {VISION_TENSOR_MMPROJ_B,       {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {VISION_TENSOR_MMPROJ,         {LLM_TENSOR_LAYER_REPEATING }},
     {VISION_TENSOR_ENC_EMBD_CLS,   {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
-    {VISION_TENSOR_ENC_EMBD_PATCH, {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
+    {VISION_TENSOR_ENC_EMBD_PATCH, {LLM_TENSOR_LAYER_INPUT, }},
     {VISION_TENSOR_ENC_EMBD_POS,   {LLM_TENSOR_LAYER_INPUT, GGML_OP_GET_ROWS}},
     {VISION_TENSOR_ENC_ATTN_Q,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {VISION_TENSOR_ENC_ATTN_K,     {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
@@ -9347,19 +9346,19 @@ static bool llm_load_tensors(
         switch (vparams.arch) {
             case VISION_ARCH_LLAVA:
                 {
-                    model.clip.mm_a_w = create_tensor(tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff}, 0);
-                    model.clip.mm_a_b = create_tensor(tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff}, 0);
-                    model.clip.mm_b_w = create_tensor(tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff}, 0);
-                    model.clip.mm_b_b = create_tensor(tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff}, 0);
+                    model.clip.mm_1_w = create_tensor(tn(VISION_TENSOR_MMPROJ, "weight", 1), {n_embd, n_ff}, 0);
+                    model.clip.mm_1_b = create_tensor(tn(VISION_TENSOR_MMPROJ, "bias" , 1), {n_ff}, 0);
+                    model.clip.mm_2_w = create_tensor(tn(VISION_TENSOR_MMPROJ, "weight", 2), {n_ff, n_ff}, 0);
+                    model.clip.mm_2_b = create_tensor(tn(VISION_TENSOR_MMPROJ, "bias" , 2), {n_ff}, 0);
 
                     model.clip.class_embedding     = create_tensor(tn(VISION_TENSOR_ENC_EMBD_CLS  ), {n_embd}, 0);
                     model.clip.patch_embeddings    = create_tensor(tn(VISION_TENSOR_ENC_EMBD_PATCH, "weight"), {patch_size, patch_size, n_channel, n_embd}, 0);
                     model.clip.position_embeddings = create_tensor(tn(VISION_TENSOR_ENC_EMBD_POS, "weight"), {n_embd, max_pos_embd}, 0);
 
-                    model.clip.pre_norm_w  = create_tensor(tn(VISION_TENSOR_PRE_NORM, "weight"), {n_embd}, 0);
-                    model.clip.pre_norm_w  = create_tensor(tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_embd}, 0);
-                    model.clip.post_norm_w = create_tensor(tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd}, 0);
-                    model.clip.post_norm_w = create_tensor(tn(VISION_TENSOR_POST_NORM, "bias" ), {n_embd}, 0);
+                    model.clip.pre_norm_w  = create_tensor(tn(VISION_TENSOR_PRE_NORM, "weight"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.clip.pre_norm_b  = create_tensor(tn(VISION_TENSOR_PRE_NORM, "bias" ), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.clip.post_norm_w = create_tensor(tn(VISION_TENSOR_POST_NORM, "weight"), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
+                    model.clip.post_norm_b = create_tensor(tn(VISION_TENSOR_POST_NORM, "bias" ), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED);
 
                     for (int i = 0; i < n_layer; ++i) {
                         //ggml_context * ctx_layer = ctx_for_layer(i);