Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support MiniCPM-V-2.6 #8967

Merged
merged 74 commits into from
Aug 16, 2024
Merged
Show file tree
Hide file tree
Changes from 73 commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
7a49a6f
init
tc-mb May 23, 2024
c536fa6
rename
tc-mb May 23, 2024
2b91903
add run android for termux in readme
tc-mb May 23, 2024
0480d5f
add android readme
tc-mb May 23, 2024
ec1cea7
add instructions in readme
tc-mb May 23, 2024
a491f45
change name in readme
tc-mb May 23, 2024
7573b63
Update README.md
iceflame89 May 23, 2024
94dcaba
fixed line
harvestingmoon May 23, 2024
b31f51f
Merge pull request #1 from harvestingmoon/minicpm-v2.5
tc-mb May 24, 2024
629420e
add result in readme
tc-mb May 24, 2024
b48708a
random pos_embed
tc-mb May 26, 2024
d9fbc1d
add positions index
tc-mb May 26, 2024
18fe620
change for ollama
tc-mb May 26, 2024
2997a68
change for ollama
tc-mb May 26, 2024
8541e99
better pos_embed in clip
tc-mb May 26, 2024
d8974b8
support ollama
tc-mb May 27, 2024
e73a0c7
updata cmakelist
tc-mb May 28, 2024
6366d62
updata cmakelist
tc-mb May 28, 2024
056d178
rename wrapper
tc-mb May 28, 2024
3c306f1
clear code
tc-mb May 28, 2024
9495504
replace and organize code
tc-mb May 28, 2024
b37ab0b
add link
tc-mb May 28, 2024
8767ce2
Merge branch 'prepare-PR-of-minicpm-v2.5' into prepare-PR
tc-mb May 28, 2024
8bd47ce
Merge pull request #7 from OpenBMB/prepare-PR
tc-mb May 28, 2024
28d4a7f
Merge pull request #8 from OpenBMB/master
tc-mb May 28, 2024
02eb445
sync master
tc-mb May 28, 2024
07f48f9
fix warnings
tc-mb May 28, 2024
c38d152
fix warnings
tc-mb May 28, 2024
88f5e6a
fix bug in bicubic resize when need resize iamge smaller
tc-mb May 30, 2024
a913ca4
receive review comments and modify
tc-mb May 31, 2024
a95a6d9
receive review comments and modify
tc-mb Jun 2, 2024
c390dd4
Merge branch 'ggerganov:master' into prepare-PR-of-minicpm-v2.5
tc-mb Jun 4, 2024
efe4c61
put all code into llava dir
tc-mb Jun 4, 2024
ee5b850
Merge pull request #11 from OpenBMB/pr_add_all_in_llava
tc-mb Jun 4, 2024
77beb4d
Merge branch 'prepare-PR-of-minicpm-v2.5' into master
tc-mb Jun 24, 2024
cb8cfb9
Merge pull request #15 from OpenBMB/master
tc-mb Jun 24, 2024
8f03505
fix quality problem in pr code
tc-mb Jun 25, 2024
e68c8bc
change n_layer
tc-mb Jun 25, 2024
4c67d7c
add space in "-1"
tc-mb Jun 25, 2024
977941d
imitate reshape bug of python code
tc-mb Jul 4, 2024
3e6348b
fix bug in clip
tc-mb Jul 7, 2024
c5b6851
fix issues for merging
tc-mb Jul 17, 2024
5959b14
fix llama-minicpmv-cli in cmake file
tc-mb Jul 19, 2024
292a469
change pr readme
tc-mb Jul 20, 2024
be8b5b2
fix code review
tc-mb Jul 22, 2024
4c75583
remove in line 33 directory in the /cmakelists.txt (not in example, i…
tc-mb Jul 22, 2024
62fa15b
fix cmakefile
tc-mb Jul 23, 2024
dad4abe
add warn
tc-mb Jul 23, 2024
3642be9
fix KEY_HAS_MINICPMV_PROJ
tc-mb Jul 23, 2024
fcde997
remove load_image_size into clip_ctx
tc-mb Jul 23, 2024
6fd0937
remove the extern "C", MINICPMV_API
tc-mb Jul 23, 2024
107e1ed
fix uhd code for review comment
tc-mb Jul 25, 2024
72b9629
delete minicpmv-wrapper in pr
tc-mb Jul 25, 2024
f3d400d
remove uhd_image_embed
tc-mb Jul 26, 2024
65f7455
Modify 2 notes
tc-mb Jul 26, 2024
6da5130
support minicpmv2.6
tc-mb Aug 2, 2024
77c580d
modify convert script of minicpmv
tc-mb Aug 2, 2024
ea0c828
modify convert
tc-mb Aug 10, 2024
fc1c860
Merge branch 'prepare-PR-of-minicpm-v2.6' into master
tc-mb Aug 10, 2024
ce0d1a6
Merge pull request #24 from OpenBMB/master
tc-mb Aug 10, 2024
6cad864
modify convert
tc-mb Aug 10, 2024
fe39ecc
add readme
tc-mb Aug 10, 2024
bffbe1c
add resampler of v2.6
tc-mb Aug 10, 2024
28d6a0f
modify clip
tc-mb Aug 10, 2024
4a87d1d
modify readme
tc-mb Aug 10, 2024
32b47f6
fix type-check
tc-mb Aug 10, 2024
662d4c1
fix type-check
tc-mb Aug 12, 2024
a945b3c
fix type-check
tc-mb Aug 12, 2024
89d378c
fix type-check
tc-mb Aug 12, 2024
1ec79f0
modify convert script and readme
tc-mb Aug 12, 2024
1123376
fix convert script and readme
tc-mb Aug 12, 2024
f30c5e1
fix convert
tc-mb Aug 12, 2024
47eb0a5
fix num in convert
tc-mb Aug 12, 2024
1ca3f06
fix type-check
tc-mb Aug 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions examples/llava/README-minicpmv2.5.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ Convert PyTorch model to gguf files (You can also download the converted [gguf](

```bash
python ./examples/minicpmv/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5
python ./convert-hf-to-gguf.py ../MiniCPM-Llama3-V-2_5/model
python ./examples/minicpmv/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model

# quantize int4 version
./llama-quantize ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf Q4_K_M
Expand Down
107 changes: 107 additions & 0 deletions examples/llava/README-minicpmv2.6.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
## MiniCPM-V 2.6

### Prepare models and code

Download [MiniCPM-V-2_6](https://huggingface.co/openbmb/MiniCPM-V-2_6) PyTorch model from huggingface to "MiniCPM-V-2_6" folder.

Clone llama.cpp:
```bash
git clone [email protected]:OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpmv-main
```

### Usage of MiniCPM-V 2.6

Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
tc-mb marked this conversation as resolved.
Show resolved Hide resolved

```bash
python ./examples/llava/minicpmv-surgery.py -m ../MiniCPM-V-2_6
python ./examples/llava/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model

# quantize int4 version
./llama-quantize ../MiniCPM-V-2_6/model/ggml-model-f16.gguf ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf Q4_K_M
```

Build for Linux or Mac

```bash
make
make llama-minicpmv-cli
```

Inference on Linux or Mac
```
# run f16 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# run quantized int4 version
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
# or run in interactive mode
./llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -i
```

### Video
Install FFmpeg
```
brew install ffmpeg
brew install pkg-config
```

### Android

#### Build on Android device using Termux
We found that build on Android device would bring better runtime performance, so we recommend to build on device.

[Termux](https://github.com/termux/termux-app#installation) is a terminal app on Android device (no root required).

Install tools in Termux:
```
apt update && apt upgrade -y
apt install git make cmake
```

It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```

#### Building the Project using Android NDK
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.

Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:

```bash
mkdir build-android
cd build-android
export NDK=/your_ndk_path
cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
make
```

Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).

Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:

(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
```
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
$cd /data/data/com.termux/files/home/bin
$chmod +x ./*
```

Download models and push them to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`

```
$mv /sdcard/llama.cpp/ggml-model-Q4_K_M.gguf /data/data/com.termux/files/home/model/
$mv /sdcard/llama.cpp/mmproj-model-f16.gguf /data/data/com.termux/files/home/model/
```

Now, you can start chatting:
```
$cd /data/data/com.termux/files/home/bin
$./llama-minicpmv-cli -m ../model/ggml-model-Q4_K_M.gguf --mmproj ../model/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
```
96 changes: 75 additions & 21 deletions examples/llava/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ static std::string format(const char * fmt, ...) {
#define KEY_HAS_VIS_ENC "clip.has_vision_encoder"
#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector"
#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector"
#define KEY_MINICPMV_VERSION "clip.minicpmv_version"
#define KEY_USE_GELU "clip.use_gelu"
#define KEY_N_EMBD "clip.%s.embedding_length"
#define KEY_N_FF "clip.%s.feed_forward_length"
Expand Down Expand Up @@ -526,6 +527,7 @@ struct clip_ctx {
bool has_vision_encoder = false;
bool has_llava_projector = false;
bool has_minicpmv_projector = false;
int minicpmv_version = 2;

struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
Expand Down Expand Up @@ -641,7 +643,12 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
if (ctx->has_minicpmv_projector) {
int pos_w = image_size_width/patch_size;
int pos_h = image_size_height/patch_size;
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
if (ctx->minicpmv_version == 2) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
}
else if (ctx->minicpmv_version == 3) {
pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1);
}
ggml_set_name(pos_embed, "pos_embed");
ggml_set_input(pos_embed);
}
Expand Down Expand Up @@ -768,8 +775,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = ggml_gelu(ctx0, embeddings);
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);

} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
}
else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
Expand Down Expand Up @@ -949,10 +956,20 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
}

{ // attention
const int hidden_size = 4096;
int hidden_size = 4096;
const int d_head = 128;
const int n_head = hidden_size/d_head;
const int num_query = 96;
int n_head = hidden_size/d_head;
int num_query = 96;
if (ctx->minicpmv_version == 2) {
hidden_size = 4096;
n_head = hidden_size/d_head;
num_query = 96;
}
else if (ctx->minicpmv_version == 3) {
hidden_size = 3584;
n_head = hidden_size/d_head;
num_query = 64;
}

struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b);
Q = ggml_scale_inplace(ctx0, Q, 1.0f / sqrt((float)d_head));
Expand Down Expand Up @@ -1149,6 +1166,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
new_clip->has_minicpmv_projector = gguf_get_val_bool(ctx, idx);
}

idx = gguf_find_key(ctx, KEY_MINICPMV_VERSION);
if (idx != -1) {
new_clip->minicpmv_version = gguf_get_val_i32(ctx, idx);
}

// GGML_ASSERT(new_clip->has_llava_projector); // see monatis/clip.cpp for image and/or text encoding for semantic search

GGML_ASSERT(new_clip->has_vision_encoder);
Expand Down Expand Up @@ -1910,10 +1932,12 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
if (clip_is_minicpmv(ctx)) {
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img);

if(clip_is_minicpmv(ctx)){
int max_slice_nums = 9;
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
res_imgs->size = 0;
for (size_t i = 0; i < imgs.size(); ++i) {
for (size_t i = 0; i < imgs.size(); ++i){
res_imgs->size += imgs[i].size();
}
res_imgs->data = new clip_image_f32[res_imgs->size];
Expand Down Expand Up @@ -2146,7 +2170,12 @@ int clip_n_patches(const struct clip_ctx * ctx) {
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
n_patches /= 4;
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
n_patches = 96;
if (ctx->minicpmv_version == 2) {
n_patches = 96;
}
else if (ctx->minicpmv_version == 3) {
n_patches = 64;
}
}

return n_patches;
Expand Down Expand Up @@ -2282,6 +2311,11 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
if(ctx->load_image_size==nullptr){
ctx->load_image_size= clip_image_size_init();
}
const int pos_w = ctx->load_image_size->width/patch_size;
const int pos_h = ctx->load_image_size->height/patch_size;

{
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
Expand Down Expand Up @@ -2316,8 +2350,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
int* positions_data = (int*)malloc(ggml_nbytes(positions));
for (int i = 0; i < num_positions; i++) {
positions_data[i] = std::floor(70.0*i/num_positions);
int bucket_coords_h[70];
int bucket_coords_w[70];
for (int i = 0; i < pos_h; i++){
bucket_coords_h[i] = std::floor(70.0*i/pos_h);
}
for (int i = 0; i < pos_w; i++){
bucket_coords_w[i] = std::floor(70.0*i/pos_w);
}
for (int i = 0, id = 0; i < pos_h; i++){
for (int j = 0; j < pos_w; j++){
positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j];
}
}
ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
free(positions_data);
Expand All @@ -2328,12 +2372,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
if(ctx->load_image_size==nullptr){
ctx->load_image_size= clip_image_size_init();
}
int pos_w = ctx->load_image_size->width/patch_size;
int pos_h = ctx->load_image_size->height/patch_size;
int embed_dim = 4096;
if (ctx->minicpmv_version == 2) {
embed_dim = 4096;
}
else if (ctx->minicpmv_version == 3) {
embed_dim = 3584;
}
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h));

float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed));
Expand All @@ -2346,7 +2391,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed));
free(pos_embed_data);
}
} else {
}
else{
{
if (ctx->has_class_embedding) {
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
Expand Down Expand Up @@ -2548,13 +2594,21 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
return ctx->vision_model.mm_3_b->ne[0];
}
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
return 4096;
if (ctx->minicpmv_version == 2) {
return 4096;
}
else if (ctx->minicpmv_version == 3) {
return 3584;
}
}

std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
}

bool clip_is_minicpmv(const struct clip_ctx * ctx) {
return ctx->has_minicpmv_projector;
int clip_is_minicpmv(const struct clip_ctx * ctx) {
if (ctx->has_minicpmv_projector) {
return ctx->minicpmv_version;
}
return 0;
}
2 changes: 1 addition & 1 deletion examples/llava/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons

CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

CLIP_API bool clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);

#ifdef __cplusplus
}
Expand Down
9 changes: 8 additions & 1 deletion examples/llava/llava.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,14 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
const bool encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) {
LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
Expand Down
26 changes: 23 additions & 3 deletions examples/llava/minicpmv-cli.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,13 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
std::string system_prompt;
int idx = 0;
int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (has_minicpmv_projector == 2) {
system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
}
else if (has_minicpmv_projector == 3) {
system_prompt = "<|im_start|>user\n";
}
LOG_TEE("%s: image token past: %d\n", __func__, n_past);
eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
Expand Down Expand Up @@ -210,10 +216,24 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri

static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
std::string user_prompt = prompt;
if (!is_first) user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
if (!is_first) {
if (has_minicpmv_projector == 2) {
user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
}
else if (has_minicpmv_projector == 3) {
user_prompt = "<|im_start|>user\n" + prompt;
}
}

eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
if (has_minicpmv_projector == 2) {
eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
}
else if (has_minicpmv_projector == 3) {
eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
}

// generate the response

LOG_TEE("\n");
Expand Down
Loading
Loading