vision: add separate vision example

This commit also makes a few updates to fix merge issues that I did not
discover while rebasing. The rebasing was a little challenging because
there had been quite a few changes in the upstream branch.

The motivation for a separate example is that the simple example
upstream no longer uses common.h, so I thought it would be better to
have a separate example for vision.

The current status is that I'm able to run the example from the original
PR with the Eiffel tower image. With this in place, the changes I've
made here can hopefully be iterated upon and cleaned up.
danbev committed Nov 11, 2024
1 parent 5669ab7 commit c39064a
Showing 8 changed files with 259 additions and 112 deletions.
6 changes: 6 additions & 0 deletions Makefile
@@ -37,6 +37,7 @@ BUILD_TARGETS = \
llama-speculative \
llama-tokenize \
llama-vdot \
llama-vision \
llama-cvector-generator \
llama-gen-docs \
tests/test-c.o
@@ -1469,6 +1470,11 @@ llama-gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp \
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

llama-vision: examples/vision/vision.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

ifdef GGML_RPC
rpc-server: examples/rpc/rpc-server.cpp \
$(OBJ_GGML)
2 changes: 1 addition & 1 deletion common/common.cpp
@@ -1526,7 +1526,7 @@ std::vector<llama_token> llama_tokenize_with_img(
std::vector<llama_token> output;
for (const auto & part : parts) {
bool add_bos = &parts.front() == &part;
- auto tokens = llama_tokenize(ctx, part, add_special && add_bos, parse_special);
+ auto tokens = common_tokenize(ctx, part, add_special && add_bos, parse_special);
output.insert(output.end(), tokens.begin(), tokens.end());
if (&parts.back() != &part) {
// add image token to middle of 2 parts
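
The hunk above is from llama_tokenize_with_img, which splits the prompt on an image placeholder, tokenizes each text part (adding BOS only to the first), and inserts a sentinel image token between consecutive parts. The following is a minimal sketch of that shape, for orientation only: common_tokenize and TOKEN_IMG_PLACEMENT come from common.h in this branch, while the splitting code and the "<img_placement>" marker string are assumptions based on the prompt used by the old simple.cpp further down.

    // Sketch only: approximates llama_tokenize_with_img based on the hunk above.
    // common_tokenize and TOKEN_IMG_PLACEMENT come from common.h in this branch;
    // the split logic and the "<img_placement>" marker are illustrative assumptions.
    #include "common.h"

    #include <string>
    #include <vector>

    static std::vector<llama_token> tokenize_with_img_sketch(llama_context * ctx,
                                                             const std::string & prompt,
                                                             bool add_special,
                                                             bool parse_special = true) {
        // split the prompt on the image placeholder marker
        static const std::string marker = "<img_placement>";
        std::vector<std::string> parts;
        size_t start = 0, pos;
        while ((pos = prompt.find(marker, start)) != std::string::npos) {
            parts.push_back(prompt.substr(start, pos - start));
            start = pos + marker.size();
        }
        parts.push_back(prompt.substr(start));

        std::vector<llama_token> output;
        for (const auto & part : parts) {
            const bool add_bos = &parts.front() == &part; // BOS only for the first part
            auto tokens = common_tokenize(ctx, part, add_special && add_bos, parse_special);
            output.insert(output.end(), tokens.begin(), tokens.end());
            if (&parts.back() != &part) {
                output.push_back(TOKEN_IMG_PLACEMENT); // sentinel between two text parts
            }
        }
        return output;
    }
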
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -51,4 +51,5 @@ else()
add_subdirectory(simple-chat)
add_subdirectory(speculative)
add_subdirectory(tokenize)
add_subdirectory(vision)
endif()
111 changes: 11 additions & 100 deletions examples/simple/simple.cpp
@@ -2,7 +2,6 @@
#include <cstdio>
#include <cstring>
#include <string>
#include "vision.h"
#include <vector>

static void print_usage(int, char ** argv) {
@@ -21,10 +20,7 @@ int main(int argc, char ** argv) {
// number of tokens to predict
int n_predict = 32;

//params.prompt = "Hello my name is";
params.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n"
"USER:<img_placement>\nwhat did you see?\nASSISTANT:";
params.n_predict = 32;
// parse command line arguments

{
int i = 1;
@@ -127,107 +123,25 @@ int main(int argc, char ** argv) {

llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

// tokenize the prompt

std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize_with_img(ctx, params.prompt, true);

const int n_ctx = llama_n_ctx(ctx);
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());

LOG("\n");
LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);

// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
return 1;
}

// print the prompt token-by-token

LOG("\n");

for (auto id : tokens_list) {
if (id == TOKEN_IMG_PLACEMENT) {
LOG("<img_placement>");
} else {
LOG("%s", llama_token_to_piece(ctx, id).c_str());
}
}

LOG("\n\n");

// load image
llama_batch_img img_batch = llama_batch_img_init(1);
img_batch.imgs[0] = load_image_from_file("../models/eiffel-tower-3349075_1280.jpg");

// create a llama_batch with size 512
// we use this object to submit token data for decoding

llama_batch batch = llama_batch_init(512, 0, 1);

// evaluate the initial prompt
int n_cur = 0;
int i_img = 0;
for (auto id : tokens_list) {
if (id == TOKEN_IMG_PLACEMENT) {
img_batch.pos[i_img] = n_cur;
n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
i_img++;
} else {
llama_batch_add(batch, id, n_cur, { 0 }, false);
printf("pos %d tok %d --> %s\n", n_cur, id, llama_token_to_piece(ctx, id).c_str());
n_cur++;
for (auto id : prompt_tokens) {
char buf[128];
int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
if (n < 0) {
fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
return 1;
}
std::string s(buf, n);
printf("%s", s.c_str());
}

// llama_decode will output logits only for the last token of the prompt
batch.logits[batch.n_tokens - 1] = true;

if (llama_encode_vision(ctx, img_batch) != 0) {
LOG("%s: llama_encode_vision() failed\n", __func__);
return 1;
}

n_cur = 0;
{
auto t1 = ::llama_tokenize(ctx, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", false);
auto t2 = ::llama_tokenize(ctx, "\nwhat did you see?\nASSISTANT:", false);
t1.insert(t1.begin(), 1);

n_cur = 0;
llama_batch_clear(batch);
llama_batch_add(batch, 1, 0, { 0 }, false);
llama_decode(ctx, batch);

n_cur = t1.size();
llama_batch_clear(batch);
llama_batch batch0 = {int32_t(576), nullptr, _test_get_img_embd(ctx), nullptr, nullptr, nullptr, nullptr, n_cur, 1, 0, };
llama_decode(ctx, batch0);

n_cur = 0;
llama_batch_clear(batch);
for (auto t : t1) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
llama_decode(ctx, batch);

n_cur = t1.size() + 576;
llama_batch_clear(batch);
printf("pos %d\n", n_cur);
for (auto t : t2) { llama_batch_add(batch, t, n_cur, { 0 }, false); n_cur++; }
batch.logits[batch.n_tokens - 1] = true;
}
// prepare a batch for the prompt

if (llama_decode(ctx, batch) != 0) {
LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size());

// main loop

int n_decode = 0;

const auto t_main_start = ggml_time_us();
int n_decode = 0;
llama_token new_token_id;
@@ -241,15 +155,12 @@ int main(int argc, char ** argv) {

n_pos += batch.n_tokens;

for (int i = 0; i < n_predict; i++) {
// sample the next token
{
new_token_id = llama_sampler_sample(smpl, ctx, -1);

// is it an end of generation?
if (llama_token_is_eog(model, new_token_id)) {
LOG("\n");

break;
}

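
For orientation, the image path that was removed from simple.cpp boils down to: tokenize the prompt with an image placeholder, record the positions where the image's embedding tokens should go, run the vision encoder, then decode the text batch and sample as usual. A condensed sketch of that flow follows. The wrapper function itself is hypothetical, and the vision calls (llama_batch_img_init, load_image_from_file, llama_img_n_tokens, llama_encode_vision) are taken from the removed code, so they belong to this in-progress branch and their signatures may change.

    // Sketch only: condenses the removed simple.cpp logic; error handling trimmed.
    // All vision-related llama_* calls are from this in-progress branch and may change.
    #include "common.h"
    #include "llama.h"

    #include <cstdio>
    #include <string>
    #include <vector>

    static int generate_with_image(llama_context * ctx, llama_model * model, llama_sampler * smpl,
                                   const std::string & prompt, const char * img_path, int n_predict) {
        // tokenize, leaving TOKEN_IMG_PLACEMENT where the image goes
        std::vector<llama_token> tokens = llama_tokenize_with_img(ctx, prompt, true);

        // load the image and remember where its embedding tokens should be positioned
        llama_batch_img img_batch = llama_batch_img_init(1);
        img_batch.imgs[0] = load_image_from_file(img_path);

        llama_batch batch = llama_batch_init(512, 0, 1);
        int n_cur = 0;
        int i_img = 0;
        for (auto id : tokens) {
            if (id == TOKEN_IMG_PLACEMENT) {
                img_batch.pos[i_img] = n_cur; // image occupies the next n_img positions
                n_cur += llama_img_n_tokens(ctx, img_batch.imgs[i_img]);
                i_img++;
            } else {
                llama_batch_add(batch, id, n_cur, { 0 }, false);
                n_cur++;
            }
        }
        batch.logits[batch.n_tokens - 1] = true; // logits only for the last prompt token

        // run the vision encoder for the image, then decode the text batch
        if (llama_encode_vision(ctx, img_batch) != 0) { return 1; }
        if (llama_decode(ctx, batch) != 0)            { return 1; }

        // greedy generation loop, one token per decode
        for (int i = 0; i < n_predict; i++) {
            llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
            if (llama_token_is_eog(model, new_token_id)) { break; } // end of generation
            printf("%s", llama_token_to_piece(ctx, new_token_id).c_str());
            llama_batch_clear(batch);
            llama_batch_add(batch, new_token_id, n_cur++, { 0 }, true);
            if (llama_decode(ctx, batch) != 0) { return 1; }
        }
        printf("\n");
        return 0;
    }
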
5 changes: 5 additions & 0 deletions examples/vision/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET llama-vision)
add_executable(${TARGET} vision.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
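
With the CMake target above and the new Makefile rule, the example should be buildable as llama-vision through either build system (for example, make llama-vision), assuming the usual llama.cpp build setup.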