vision: add separate vision example

This commit also makes a few updates to fix merge issues that I did not
discover while rebasing. The rebasing was a little challenging because
there had been quite a few changes in the upstream branch.

The motivation for a separate example is that the simple example
upstream no longer uses common.h, so I thought it would be better to
have a separate example for vision.

The current status is that I'm able to run the example from the original
PR with the Eiffel tower image. With this in place, the changes I've
made here can hopefully be iterated upon and cleaned up.
danbev committed Nov 11, 2024
1 parent 5669ab7 commit c39064a
Showing 8 changed files with 259 additions and 112 deletions.
6 changes: 6 additions & 0 deletions Makefile
@@ -37,6 +37,7 @@ BUILD_TARGETS = \
llama-speculative \
llama-tokenize \
llama-vdot \
llama-vision \
llama-cvector-generator \
llama-gen-docs \
tests/test-c.o
@@ -1469,6 +1470,11 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-vision: examples/vision/vision.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

ifdef GGML_RPC
rpc-server: examples/rpc/rpc-server.cpp \
$(OBJ_GGML)
2 changes: 1 addition & 1 deletion common/common.cpp
@@ -1526,7 +1526,7 @@ std::vector<llama_token> llama_tokenize_with_img(
std::vector<llama_token> output;
for (const auto & part : parts) {
bool add_bos = &parts.front() == &part;
- auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
+ auto tokens = common_tokenize(ctx, part, add_special && add_bos, parse_special);
output.insert(output.end(), tokens.begin(), tokens.end());
if (&parts.back() != &part) {
// add image token to middle of 2 parts
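
The hunk above is from llama_tokenize_with_img, which splits the prompt on an image placeholder, tokenizes each text part (adding BOS only to the first), and inserts a sentinel image token between consecutive parts. The following is a minimal sketch of that shape, for orientation only: common_tokenize and TOKEN_IMG_PLACEMENT come from common.h in this branch, while the splitting code and the "<img_placement>" marker string are assumptions based on the prompt used by the old simple.cpp further down.

    // Sketch only: approximates llama_tokenize_with_img based on the hunk above.
    // common_tokenize and TOKEN_IMG_PLACEMENT come from common.h in this branch;
    // the split logic and the "<img_placement>" marker are illustrative assumptions.
    #include "common.h"

    #include <string>
    #include <vector>

    static std::vector<llama_token> tokenize_with_img_sketch(llama_context * ctx,
                                                             const std::string & prompt,
                                                             bool add_special,
                                                             bool parse_special = true) {
        // split the prompt on the image placeholder marker
        static const std::string marker = "<img_placement>";
        std::vector<std::string> parts;
        size_t start = 0, pos;
        while ((pos = prompt.find(marker, start)) != std::string::npos) {
            parts.push_back(prompt.substr(start, pos - start));
            start = pos + marker.size();
        }
        parts.push_back(prompt.substr(start));

        std::vector<llama_token> output;
        for (const auto & part : parts) {
            const bool add_bos = &parts.front() == &part; // BOS only for the first part
            auto tokens = common_tokenize(ctx, part, add_special && add_bos, parse_special);
            output.insert(output.end(), tokens.begin(), tokens.end());
            if (&parts.back() != &part) {
                output.push_back(TOKEN_IMG_PLACEMENT); // sentinel between two text parts
            }
        }
        return output;
    }
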
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -51,4 +51,5 @@ else()
add_subdirectory(simple-chat)
add_subdirectory(speculative)
add_subdirectory(tokenize)
add_subdirectory(vision)
endif()
111 changes: 11 additions & 100 deletions examples/simple/simple.cpp
@@ -2,7 +2,6 @@
#include <cstdio>
#include <cstring>
#include <string>
#include "vision.h"
#include <vector>

static void print_usage(int, char ** argv) {
@@ -21,10 +20,7 @@ int main(int argc, char ** argv) {
// number of tokens to predict
int n_predict = 32;

//params.prompt = "Hello my name is";
params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
"USER:<img_placement>\nwhat did you see?\nASSISTANT:";
params.n_predict = 32;
// parse command line arguments

{
int i = 1;
@@ -127,107 +123,25 @@ int main(int argc, char ** argv) {

llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

// tokenize the prompt

std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);

const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());

LOG("\n");
LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);

// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
return 1;
}

// print the prompt token-by-token

LOG("\n");

for (auto id : tokens_list) {
if (id == TOKEN_IMG_PLACEMENT) {
LOG("<img_placement>");
} else {
LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
}

LOG("\n\n");

// load image
llama_batch_img img_batch = llama_batch_img_init(1);
img_batch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");

// create a llama_batch with size 512
// we use this object to submit token data for decoding

llama_batch batch = llama_batch_init(512, 0, 1);

// evaluate the initial prompt
int n_cur = 0;
int i_img = 0;
for (auto id : tokens_list) {
if (id == TOKEN_IMG_PLACEMENT) {
img_batch.pos[i_img] = n_cur;
n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
i_img++;
} else {
llama_batch_add(batch, id, n_cur, { 0 }, false);
printf("pos %d tok %d --> %s\n", n_cur, id, llama_token_to_piece(ctx, id).c_str());
n_cur++;
for (auto id : prompt_tokens) {
char buf[128];
int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1;
}
std::string s(buf, n);
printf("%s", s.c_str());
}

// llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true;

if (llama_encode_vision(ctx, img_batch) != 0) {
LOG("%s: llama_encode_vision() failed\n", __func__);
return 1;
}

n_cur = 0;
{
auto t1 = ::llama_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
auto t2 = ::llama_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
t1.insert(t1.begin(), 1);

n_cur = 0;
llama_batch_clear(batch);
llama_batch_add(batch, 1, 0, { 0 }, false);
llama_decode(ctx, batch);

n_cur = t1.size();
llama_batch_clear(batch);
llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
llama_decode(ctx, batch0);

n_cur = 0;
llama_batch_clear(batch);
for (auto t : t1) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
llama_decode(ctx, batch);

n_cur = t1.size() + 576;
llama_batch_clear(batch);
printf("pos %d\n", n_cur);
for (auto t : t2) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
batch.logits[batch.n_tokens - 1] = true;
}
// prepare a batch for the prompt

if (llama_decode(ctx, batch) != 0) {
LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

// main loop

int n_decode = 0;

const auto t_main_start = ggml_time_us();
int n_decode = 0;
llama_token new_token_id;
@@ -241,15 +155,12 @@ int main(int argc, char ** argv) {

n_pos += batch.n_tokens;

for (int i = 0; i < n_predict; i++) {
// sample the next token
{
new_token_id = llama_sampler_sample(smpl, ctx, -1);

// is it an end of generation?
if (llama_token_is_eog(model, new_token_id)) {
LOG("\n");

break;
}

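
For orientation, the image path that was removed from simple.cpp boils down to: tokenize the prompt with an image placeholder, record the positions where the image's embedding tokens should go, run the vision encoder, then decode the text batch and sample as usual. A condensed sketch of that flow follows. The wrapper function itself is hypothetical, and the vision calls (llama_batch_img_init, load_image_from_file, llama_img_n_tokens, llama_encode_vision) are taken from the removed code, so they belong to this in-progress branch and their signatures may change.

    // Sketch only: condenses the removed simple.cpp logic; error handling trimmed.
    // All vision-related llama_* calls are from this in-progress branch and may change.
    #include "common.h"
    #include "llama.h"

    #include <cstdio>
    #include <string>
    #include <vector>

    static int generate_with_image(llama_context * ctx, llama_model * model, llama_sampler * smpl,
                                   const std::string & prompt, const char * img_path, int n_predict) {
        // tokenize, leaving TOKEN_IMG_PLACEMENT where the image goes
        std::vector<llama_token> tokens = llama_tokenize_with_img(ctx, prompt, true);

        // load the image and remember where its embedding tokens should be positioned
        llama_batch_img img_batch = llama_batch_img_init(1);
        img_batch.imgs[0] = load_image_from_file(img_path);

        llama_batch batch = llama_batch_init(512, 0, 1);
        int n_cur = 0;
        int i_img = 0;
        for (auto id : tokens) {
            if (id == TOKEN_IMG_PLACEMENT) {
                img_batch.pos[i_img] = n_cur; // image occupies the next n_img positions
                n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
                i_img++;
            } else {
                llama_batch_add(batch, id, n_cur, { 0 }, false);
                n_cur++;
            }
        }
        batch.logits[batch.n_tokens - 1] = true; // logits only for the last prompt token

        // run the vision encoder for the image, then decode the text batch
        if (llama_encode_vision(ctx, img_batch) != 0) { return 1; }
        if (llama_decode(ctx, batch) != 0)            { return 1; }

        // greedy generation loop, one token per decode
        for (int i = 0; i < n_predict; i++) {
            llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
            if (llama_token_is_eog(model, new_token_id)) { break; } // end of generation
            printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
            llama_batch_clear(batch);
            llama_batch_add(batch, new_token_id, n_cur++, { 0 }, true);
            if (llama_decode(ctx, batch) != 0) { return 1; }
        }
        printf("\n");
        return 0;
    }
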
5 changes: 5 additions & 0 deletions examples/vision/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET llama-vision)
add_executable(${TARGET} vision.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
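
With the CMake target above and the new Makefile rule, the example should be buildable as llama-vision through either build system (for example, make llama-vision), assuming the usual llama.cpp build setup.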