Skip to content

Commit

Permalink
llama : add llama-simple-vision-mllama example (wip)
Browse files Browse the repository at this point in the history
This commit adds a vision example that uses Llama 3.2 Vision
Instruct to experiment with how a multi-modal cross-attention model
works. The implementation is based on Ollama's multi-modal
implementation with some modifications to make it work with the new
Vision API.

The motivation for this example is only to get some experience with
multi-modal cross-attention and understand how it works. This is a bare
minimum approach to get something working and see if it is something
that is worth exploring further, but parts of this might be useful on
their own, like the model conversion for example.

This is a work in progress and there is currently an issue with the
scheduler/graph computation causing the model to act weirdly.
  • Loading branch information
danbev committed Dec 17, 2024
1 parent bb0ec7e commit 361d983
Show file tree
Hide file tree
Showing 39 changed files with 15,999 additions and 212 deletions.
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ BUILD_TARGETS = \
llama-speculative \
llama-tokenize \
llama-vdot \
llama-simple-vision \
llama-cvector-generator \
llama-gen-docs \
tests/test-c.o
Expand Down Expand Up @@ -1351,6 +1352,11 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-simple-vision: examples/simple-vision/simple-vision.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

ifdef GGML_RPC
rpc-server: examples/rpc/rpc-server.cpp \
$(OBJ_GGML)
Expand Down
1 change: 1 addition & 0 deletions common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ add_library(${TARGET} STATIC
vision.h
vision.cpp
stb_image.h
stb_image_resize2.h
)

if (BUILD_SHARED_LIBS)
Expand Down
16 changes: 15 additions & 1 deletion common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1351,7 +1351,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
params.image.emplace_back(value);
}
).set_examples({LLAMA_EXAMPLE_LLAVA}));
).set_examples({LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_VISION}));
add_opt(common_arg(
{"--n-tiles"}, "N",
"number of tiles to split the image into. use with multimodal models.",
[](common_params & params, int value) {
params.n_tiles = value;
}
).set_examples({LLAMA_EXAMPLE_VISION}));
add_opt(common_arg(
{"--cross-attention"},
"enables cross attention for multimodal models.",
[](common_params & params) {
params.cross_attention = true;
}
).set_examples({LLAMA_EXAMPLE_VISION}));
if (llama_supports_rpc()) {
add_opt(common_arg(
{"--rpc"}, "SERVERS",
Expand Down
2 changes: 1 addition & 1 deletion common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1586,7 +1586,7 @@ std::vector<llama_token> llama_tokenize_with_img(
std::vector<llama_token> output;
for (const auto & part : parts) {
bool add_bos = &parts.front() == &part;
auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
auto tokens = common_tokenize(ctx, part, add_special && add_bos, parse_special);
output.insert(output.end(), tokens.begin(), tokens.end());
if (&parts.back() != &part) {
// add image token to middle of 2 parts
Expand Down
4 changes: 4 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ enum llama_example {
LLAMA_EXAMPLE_PARALLEL,

LLAMA_EXAMPLE_COUNT,
LLAMA_EXAMPLE_VISION,
};

enum common_sampler_type {
Expand Down Expand Up @@ -295,6 +296,8 @@ struct common_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector // NOLINT
std::vector<std::string> image; // path to image file(s)
bool cross_attention = false;
int n_tiles = 1;

// embedding
bool embedding = false; // get only sentence embedding
Expand Down Expand Up @@ -374,6 +377,7 @@ struct common_params {

// batched-bench params
bool batched_bench_output_jsonl = false;

};

// call once at the start of a program if it uses libcommon
Expand Down
10,601 changes: 10,601 additions & 0 deletions common/stb_image_resize2.h

Large diffs are not rendered by default.

Loading

0 comments on commit 361d983

Please sign in to comment.