llama : add llama-simple-vision-mllama example (wip)

This commit adds a vision example that uses Llama 3.2 Vision Instruct to experiment with how a multi-modal cross-attention model works. The implementation is based on Ollama's multi-modal implementation, with some modifications to make it work with the new Vision API. The motivation for this example is only to gain some experience with multi-modal cross-attention and to understand how it works. This is a bare-minimum approach to get something working and to see whether it is worth exploring further, but parts of it might be useful on their own, such as the model conversion. This is a work in progress, and there is currently an issue with the scheduler/graph computation that causes the model to act weirdly.
danbev · Dec 17, 2024 · 361d983 · 361d983
1 parent bb0ec7e
commit 361d983
Show file tree

Hide file tree

Showing 39 changed files with 15,999 additions and 212 deletions.
diff --git a/Makefile b/Makefile
@@ -43,6 +43,7 @@ BUILD_TARGETS = \
 	llama-speculative \
 	llama-tokenize \
 	llama-vdot \
+	llama-simple-vision \
 	llama-cvector-generator \
 	llama-gen-docs \
 	tests/test-c.o
@@ -1351,6 +1352,11 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+llama-simple-vision: examples/simple-vision/simple-vision.cpp \
+	$(OBJ_ALL)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 ifdef GGML_RPC
 rpc-server: examples/rpc/rpc-server.cpp \
 	$(OBJ_GGML)

diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
@@ -73,6 +73,7 @@ add_library(${TARGET} STATIC
     vision.h
     vision.cpp
     stb_image.h
+    stb_image_resize2.h
     )
 
 if (BUILD_SHARED_LIBS)

diff --git a/common/arg.cpp b/common/arg.cpp
@@ -1351,7 +1351,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             params.image.emplace_back(value);
         }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_VISION}));
+    add_opt(common_arg(
+        {"--n-tiles"}, "N",
+        "number of tiles to split the image into. use with multimodal models.",
+        [](common_params & params, int value) {
+            params.n_tiles = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_VISION}));
+    add_opt(common_arg(
+        {"--cross-attention"},
+        "enables cross attention for multimodal models.",
+        [](common_params & params) {
+            params.cross_attention = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_VISION}));
     if (llama_supports_rpc()) {
         add_opt(common_arg(
             {"--rpc"}, "SERVERS",

diff --git a/common/common.cpp b/common/common.cpp
@@ -1586,7 +1586,7 @@ std::vector<llama_token> llama_tokenize_with_img(
     std::vector<llama_token> output;
     for (const auto & part : parts) {
         bool add_bos = &parts.front() == &part;
-        auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
+        auto tokens = common_tokenize(ctx, part, add_special && add_bos, parse_special);
         output.insert(output.end(), tokens.begin(), tokens.end());
         if (&parts.back() != &part) {
             // add image token to middle of 2 parts

diff --git a/common/common.h b/common/common.h
@@ -82,6 +82,7 @@ enum llama_example {
     LLAMA_EXAMPLE_PARALLEL,
 
     LLAMA_EXAMPLE_COUNT,
+    LLAMA_EXAMPLE_VISION,
 };
 
 enum common_sampler_type {
@@ -295,6 +296,8 @@ struct common_params {
     // multimodal models (see examples/llava)
     std::string mmproj = "";        // path to multimodal projector                                         // NOLINT
     std::vector<std::string> image; // path to image file(s)
+    bool cross_attention = false;
+    int  n_tiles = 1;
 
     // embedding
     bool embedding         = false; // get only sentence embedding
@@ -374,6 +377,7 @@ struct common_params {
 
     // batched-bench params
     bool batched_bench_output_jsonl = false;
+
 };
 
 // call once at the start of a program if it uses libcommon

diff --git a/common/stb_image_resize2.h b/common/stb_image_resize2.h