diff --git a/CMakeLists.txt b/CMakeLists.txt
index a313206351677..ee1783325d63b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,6 +69,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
 
 # 3rd party libs
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
+option(LLAMA_FFMPEG "llama: use ffmpeg to load video files" OFF)
 
 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
diff --git a/Makefile b/Makefile
index 332496cfc39c1..b7f8298b2fb02 100644
--- a/Makefile
+++ b/Makefile
@@ -968,6 +968,11 @@ override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
 override LDFLAGS := $(LDFLAGS) -lcurl
 endif
 
+ifdef LLAMA_FFMPEG
+override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_FFMPEG $(shell pkg-config --cflags libavformat libavcodec libavutil libswscale)
+override LDFLAGS := $(LDFLAGS) $(shell pkg-config --libs libavformat libavcodec libavutil libswscale)
+endif
+
 #
 # Print build information
 #
diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt
index 761971d6881f3..a46f76ec7e76f 100644
--- a/common/CMakeLists.txt
+++ b/common/CMakeLists.txt
@@ -83,6 +83,20 @@ if (LLAMA_CURL)
     set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
 endif ()
 
+# Use ffmpeg to load video files
+if (LLAMA_FFMPEG)
+    find_package(PkgConfig REQUIRED)
+    pkg_check_modules(FFMPEG REQUIRED
+        libavformat
+        libavcodec
+        libavutil
+        libswscale
+    )
+    add_definitions(-DLLAMA_USE_FFMPEG)
+    include_directories(${FFMPEG_INCLUDE_DIRS})
+    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${FFMPEG_LIBRARIES})
+endif ()
+
 target_include_directories(${TARGET} PUBLIC .)
 target_compile_features   (${TARGET} PUBLIC cxx_std_11)
 target_link_libraries     (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
diff --git a/common/common.cpp b/common/common.cpp
index 715adf94658f0..fabdbbb593b1c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -816,6 +816,16 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
         params.image.emplace_back(argv[i]);
         return true;
     }
+    if (arg == "--video") {
+        CHECK_ARG
+        params.video = argv[i];
+        return true;
+    }
+    if (arg == "--frame-num") {
+        CHECK_ARG
+        params.frame_num = std::stoi(argv[i]);
+        return true;
+    }
     if (arg == "-i" || arg == "--interactive") {
         params.interactive = true;
         return true;
@@ -1639,6 +1649,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     options.push_back({ "multi-modality" });
     options.push_back({ "*", "       --mmproj FILE",   "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
     options.push_back({ "*", "       --image FILE",    "path to an image file. use with multimodal models. Specify multiple times for batching" });
+    options.push_back({ "*", "       --video FILE",    "path to a video file. use with multimodal models" });
+    options.push_back({ "*", "       --frame-num N",   "max number of frames to sample from the video (default: 16)" });
 
     options.push_back({ "backend" });
     options.push_back({ "*", "       --rpc SERVERS",   "comma separated list of RPC servers" });
diff --git a/common/common.h b/common/common.h
index f603ba2be1d35..345edcb94335d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -193,6 +193,8 @@ struct gpt_params {
     // multimodal models (see examples/llava)
     std::string mmproj = "";        // path to multimodal projector
     std::vector<std::string> image; // path to image file(s)
+    std::string video = "";         // path to a video file
+    int frame_num = 16;             // max number of frames sampled from the video
 
     // embedding
     bool embedding = false; // get only sentence embedding
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 10e8765b4cd19..2e4d32b646102 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -3,6 +3,7 @@
 // I'll gradually clean and extend it
 // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
 #include "clip.h"
+#include "common.h"
 #include "log.h"
 #include "ggml.h"
 #include "ggml-alloc.h"
@@ -538,6 +539,7 @@ struct clip_ctx {
     bool has_llava_projector = false;
     bool has_minicpmv_projector = false;
     int minicpmv_version = 2;
+    int max_slice_nums = 9;
 
     struct clip_vision_model vision_model;
     projector_type proj_type = PROJECTOR_TYPE_MLP;
@@ -1623,7 +1625,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32* dst, const float mean[3], const float std[3]) {
     }
 }
 
-inline float clip(float x, float lower, float upper) {
+inline int clip(int x, int lower, int upper) {
     return std::max(lower, std::min(x, upper));
 }
 
@@ -1827,10 +1829,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size, std::pair<int, int> grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
     return refine_size;
 }
 
-inline int clip(int x, int lower, int upper) {
-    return std::max(lower, std::min(x, upper));
-}
-
 static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
     std::vector<int> candidate_split_grids_nums;
     for (int i : {multiple - 1, multiple, multiple + 1}) {
@@ -1932,7 +1930,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_image_u8 * img, const int max_slice_nums) {
 }
 
 int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
-    const int max_slice_nums=9;
+    const int max_slice_nums=ctx_clip->max_slice_nums;
     const int scale_resolution=448;
     const int original_width = ctx_clip->load_image_size->width;
     const int original_height = ctx_clip->load_image_size->height;
@@ -1948,7 +1946,7 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
 
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
     if(clip_is_minicpmv(ctx)){
-        int max_slice_nums = 9;
+        int max_slice_nums = ctx->max_slice_nums;
         std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
         res_imgs->size = 0;
         for (size_t i = 0; i < imgs.size(); ++i){
@@ -2626,3 +2624,7 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
     }
     return 0;
 }
+
+void clip_uhd_max_slice_nums(struct clip_ctx * ctx, int max_slice_nums) {
+    ctx->max_slice_nums = max_slice_nums;
+}
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 78588bdf1745c..7a0f479261e32 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -86,6 +86,7 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
 
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
+CLIP_API void clip_uhd_max_slice_nums(struct clip_ctx * ctx, int max_slice_nums);
 
 #ifdef __cplusplus
 }
diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp
index 379fc295f1101..9b82f165e1a58 100644
--- a/examples/llava/minicpmv-cli.cpp
+++ b/examples/llava/minicpmv-cli.cpp
@@ -9,14 +9,168 @@
 #include <cstdlib>
 #include <vector>
 
+#include <algorithm>
+
+#if defined(LLAMA_USE_FFMPEG)
+extern "C" {
+    #include <libavformat/avformat.h>
+    #include <libavcodec/avcodec.h>
+    #include <libswscale/swscale.h>
+    #include <libavutil/imgutils.h>
+}
+#endif // LLAMA_USE_FFMPEG
+
 struct llava_context {
     struct clip_ctx * ctx_clip = NULL;
     struct llama_context * ctx_llama = NULL;
     struct llama_model * model = NULL;
 };
 
+// mirrors the definition that is private to clip.cpp, so decoded video
+// frames can be written straight into the pixel buffer
+struct clip_image_u8 {
+    int nx;
+    int ny;
+    std::vector<uint8_t> buf;
+};
+
+#if defined(LLAMA_USE_FFMPEG)
+
+// decode the video and return up to frame_num frames, sampled evenly, as RGB24 images
+static std::vector<clip_image_u8 *> extract_frames(const std::string & video_path, const int frame_num) {
+    AVFormatContext * format_ctx = nullptr;
+    if (avformat_open_input(&format_ctx, video_path.c_str(), nullptr, nullptr) < 0) {
+        LOG_TEE("Could not open video file.\n");
+        return {};
+    }
+
+    if (avformat_find_stream_info(format_ctx, nullptr) < 0) {
+        LOG_TEE("Could not find stream information.\n");
+        avformat_close_input(&format_ctx);
+        return {};
+    }
+
+    const AVCodec * codec = nullptr;
+    AVCodecContext * codec_ctx = nullptr;
+    int video_stream_index = -1;
+
+    for (size_t i = 0; i < format_ctx->nb_streams; ++i) {
+        if (format_ctx->streams[i]->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
+            codec = avcodec_find_decoder(format_ctx->streams[i]->codecpar->codec_id);
+            if (codec) {
+                video_stream_index = (int) i;
+                break;
+            }
+        }
+    }
+
+    if (video_stream_index == -1) {
+        LOG_TEE("Could not find video stream.\n");
+        avformat_close_input(&format_ctx);
+        return {};
+    }
+
+    codec_ctx = avcodec_alloc_context3(codec);
+    if (!codec_ctx) {
+        LOG_TEE("Could not allocate video codec context.\n");
+        avformat_close_input(&format_ctx);
+        return {};
+    }
+
+    if (avcodec_parameters_to_context(codec_ctx, format_ctx->streams[video_stream_index]->codecpar) < 0) {
+        LOG_TEE("Could not copy codec parameters to codec context.\n");
+        avcodec_free_context(&codec_ctx);
+        avformat_close_input(&format_ctx);
+        return {};
+    }
+
+    if (avcodec_open2(codec_ctx, codec, nullptr) < 0) {
+        LOG_TEE("Could not open codec.\n");
+        avcodec_free_context(&codec_ctx);
+        avformat_close_input(&format_ctx);
+        return {};
+    }
+
+    AVFrame * frame     = av_frame_alloc();
+    AVFrame * frame_rgb = av_frame_alloc();
+    if (!frame || !frame_rgb) {
+        LOG_TEE("Could not allocate frames.\n");
+        av_frame_free(&frame);
+        av_frame_free(&frame_rgb);
+        avcodec_free_context(&codec_ctx);
+        avformat_close_input(&format_ctx);
+        return {};
+    }
+
+    int num_bytes = av_image_get_buffer_size(AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1);
+    uint8_t * buffer = (uint8_t *) av_malloc(num_bytes * sizeof(uint8_t));
+    av_image_fill_arrays(frame_rgb->data, frame_rgb->linesize, buffer, AV_PIX_FMT_RGB24, codec_ctx->width, codec_ctx->height, 1);
+
+    struct SwsContext * sws_ctx = sws_getContext(codec_ctx->width, codec_ctx->height, codec_ctx->pix_fmt,
+                                                 codec_ctx->width, codec_ctx->height, AV_PIX_FMT_RGB24,
+                                                 SWS_BILINEAR, nullptr, nullptr, nullptr);
+
+    std::vector<clip_image_u8 *> frames;
+
+    AVPacket packet;
+    int64_t last_pts = AV_NOPTS_VALUE;
+    int64_t total_frames = format_ctx->streams[video_stream_index]->nb_frames;
+
+    // keep every n-th frame, starting from one frame per second of video;
+    // fall back to every frame when the stream does not report a frame rate
+    int64_t frame_interval = 1;
+    if (codec_ctx->framerate.num > 0 && codec_ctx->framerate.den > 0) {
+        frame_interval = codec_ctx->framerate.num / codec_ctx->framerate.den;
+    }
+    if (frame_interval <= 0) {
+        frame_interval = 1;
+    }
+
+    // if that still yields more than frame_num frames, widen the sampling interval
+    float frame_len = 1.0 * total_frames / frame_interval;
+    LOG_TEE("frame_len: %f\n", frame_len);
+    if (frame_len > frame_num - 1) {
+        frame_len = fmax(0, frame_num - 1);
+        frame_interval = (int64_t)(1.0 * total_frames / frame_len);
+    }
+    int frame_idx = 0;
+    while (av_read_frame(format_ctx, &packet) >= 0) {
+        if (packet.stream_index == video_stream_index) {
+            if (avcodec_send_packet(codec_ctx, &packet) == 0) {
+                // one packet can yield several frames; frame_idx counts every decoded frame
+                for (; avcodec_receive_frame(codec_ctx, frame) == 0; frame_idx++) {
+                    if (frame->pts != last_pts && frame_idx % frame_interval == 0) {
+                        sws_scale(sws_ctx, frame->data, frame->linesize, 0, codec_ctx->height,
+                                  frame_rgb->data, frame_rgb->linesize);
+
+                        clip_image_u8 * img = clip_image_u8_init();
+                        img->nx = codec_ctx->width;
+                        img->ny = codec_ctx->height;
+                        img->buf.resize(num_bytes);
+                        std::copy(buffer, buffer + num_bytes, img->buf.begin());
+
+                        frames.push_back(img);
+                        last_pts = frame->pts;
+                    }
+                }
+            }
+        }
+        av_packet_unref(&packet);
+    }
+
+    av_free(buffer);
+    av_frame_free(&frame_rgb);
+    av_frame_free(&frame);
+    avcodec_free_context(&codec_ctx);
+    avformat_close_input(&format_ctx);
+    sws_freeContext(sws_ctx);
+
+    return frames;
+}
+
+#else
+
+static std::vector<clip_image_u8 *> extract_frames(const std::string & video_path, const int frame_num) {
+    (void) video_path;
+    (void) frame_num;
+    LOG_TEE("%s: llama.cpp built without ffmpeg, processing video files is not supported. Please recompile with LLAMA_FFMPEG=1 to add video support.\n", __func__);
+    return {};
+}
+
+#endif // LLAMA_USE_FFMPEG
+
 static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> [--video <path/to/video.mp4>] [--image <path/to/an/image.jpg>] [--image <path/to/another/image.jpg>] [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
     LOG_TEE("  note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }
 
@@ -26,6 +180,17 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
     LOG_TEE("%s", text);
 }
 
+static struct clip_ctx * clip_init_context(gpt_params * params) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+    return ctx_clip;
+}
+
 static struct llama_model * llava_init(gpt_params * params) {
     llama_backend_init();
     llama_numa_init(params->numa);
@@ -40,11 +205,19 @@ static struct llama_model * llava_init(gpt_params * params) {
     return model;
 }
 
-static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+static struct llava_context * llava_init_context(gpt_params * params) {
+    auto model = llava_init(params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
+        return NULL;
+    }
+
+    const char * clip_path = params->mmproj.c_str();
     auto prompt = params->prompt;
     if (prompt.empty()) {
         prompt = "describe the image in detail.";
     }
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
 
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     if (params->n_ctx < 2048) {
@@ -65,6 +238,7 @@ static struct llava_context * llava_init_context(gpt_params * params) {
     auto ctx_llava = (struct llava_context *)malloc(sizeof(llava_context));
 
     ctx_llava->ctx_llama = ctx_llama;
+    ctx_llava->ctx_clip = ctx_clip;
     ctx_llava->model = model;
     return ctx_llava;
 }
@@ -80,17 +254,6 @@ static void llava_free(struct llava_context * ctx_llava) {
     llama_backend_free();
 }
 
-static struct clip_ctx * clip_init_context(gpt_params * params) {
-    const char * clip_path = params->mmproj.c_str();
-
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
-    return ctx_clip;
-}
-
 static bool eval_tokens(struct llama_context * ctx_llama, std::vector<llama_token> tokens, int n_batch, int * n_past) {
     int N = (int) tokens.size();
     for (int i = 0; i < N; i += n_batch) {
@@ -122,7 +285,7 @@ static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
 static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
     float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip));
     std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip));
-    
+
     auto slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed));
     slice_embed->embed = image_embed;
     slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip);
@@ -130,21 +293,15 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) {
     llava_image_embed_free(slice_embed);
 }
 
-static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
+static int process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
     std::string system_prompt;
+    bool res = false;
     int idx = 0;
     int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
-    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
-    if (has_minicpmv_projector == 2) {
-        system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
-    }
-    else if (has_minicpmv_projector == 3) {
-        system_prompt = "<|im_start|>user\n";
-    }
     LOG_TEE("%s: image token past: %d\n", __func__, n_past);
     eval_string(ctx_llava->ctx_llama, (system_prompt+"<image>").c_str(), params->n_batch, &n_past, false);
     process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++);
-    eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
+    res = eval_string(ctx_llava->ctx_llama, std::string("</image>").c_str(), params->n_batch, &n_past, false);
     if (num_image_embeds > 1) {
         size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip);
         eval_string(ctx_llava->ctx_llama, std::string("<slice>").c_str(), params->n_batch, &n_past, false);
@@ -158,9 +315,70 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
             }
         }
-        eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
+        res = eval_string(ctx_llava->ctx_llama, std::string("</slice>").c_str(), params->n_batch, &n_past, false);
     }
     LOG_TEE("%s: image token past: %d\n", __func__, n_past);
+    if (!res) return 0;
+    return n_past;
 }
+
+// type 0: system/user header, type 1: raw user prompt text, type 2: assistant header
+static bool process_prompt(int type, struct llava_context * ctx_llava, gpt_params * params, int &n_past, std::string prompt = ""){
+    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
+    if (type == 0) {
+        std::string system_prompt;
+        if (has_minicpmv_projector == 1) {
+            system_prompt = "<用户>";
+        }
+        else if (has_minicpmv_projector == 2) {
+            system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n";
+        }
+        else if (has_minicpmv_projector == 3) {
+            system_prompt = "<|im_start|>user\n";
+        }
+        return eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, false);
+    }
+    else if (type == 1) {
+        std::string user_prompt = prompt;
+        return eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
+    }
+    else if (type == 2) {
+        if (has_minicpmv_projector == 1) {
+            return eval_string(ctx_llava->ctx_llama, "<AI>\n", params->n_batch, &n_past, false);
+        }
+        else if (has_minicpmv_projector == 2) {
+            return eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
+        }
+        else if (has_minicpmv_projector == 3) {
+            return eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
+        }
+    }
+    return false;
+}
+
+static struct llava_image_embed * video_image_embed(struct clip_ctx * ctx_clip, gpt_params * params, const clip_image_u8 * img){
+    float * image_embed = NULL;
+    int n_image_pos = 0;
+    // cap uhd slicing at 2 slices per video frame to keep the token budget manageable
+    clip_uhd_max_slice_nums(ctx_clip, 2);
+    bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, params->n_threads, img, &image_embed, &n_image_pos);
+    if (!image_embed_result) {
+        LOG_TEE("%s: couldn't embed the image\n", __func__);
+        return NULL;
+    }
+
+    auto result = (llava_image_embed*)malloc(sizeof(llava_image_embed));
+    result->embed = image_embed;
+    result->n_image_pos = n_image_pos;
+    return result;
+}
+
+static struct llava_image_embed * interleaved_image_embed(struct clip_ctx * ctx_clip, gpt_params * params, const std::string & fname){
+    // still images keep the full 9-slice uhd budget
+    clip_uhd_max_slice_nums(ctx_clip, 9);
+    llava_image_embed * embed = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
+    if (!embed) {
+        LOG_TEE("error: failed to embed image. Terminating\n\n");
+        return NULL;
+    }
+    return embed;
+}
 
 static const char * sample(struct llama_sampling_context * ctx_sampling,
@@ -182,7 +400,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past) {
     auto ctx_clip = clip_init_context(params);
     auto embeds = llava_image_embed_make_with_filename(ctx_clip, params->n_threads, fname.c_str());
     if (!embeds) {
-        std::cerr << "error: failed to load image " << fname << ". Terminating\n\n";
+        LOG_TEE("error: failed to load image %s. Terminating\n\n", fname.c_str());
         return NULL;
     }
 
@@ -192,19 +410,15 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past) {
         return NULL;
     }
 
-    auto model = llava_init(params);
-    if (model == NULL) {
-        fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__);
-        return NULL;
-    }
     const int64_t t_llava_init_start_us = ggml_time_us();
-    auto ctx_llava = llava_init_context(params, model);
+    auto ctx_llava = llava_init_context(params);
     ctx_llava->ctx_clip = ctx_clip;
     const int64_t t_llava_init_end_us = ggml_time_us();
     float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0;
     LOG_TEE("\n%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms);
 
     const int64_t t_process_image_start_us = ggml_time_us();
+    process_prompt(0, ctx_llava, params, n_past);
     process_image(ctx_llava, embeds, params, n_past);
     const int64_t t_process_image_end_us = ggml_time_us();
     float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0;
@@ -214,36 +428,32 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past) {
     return ctx_llava;
 }
 
-static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
-    std::string user_prompt = prompt;
-    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
-    if (!is_first) {
-        if (has_minicpmv_projector == 2) {
-            user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt;
-        }
-        else if (has_minicpmv_projector == 3) {
-            user_prompt = "<|im_start|>user\n" + prompt;
+static int process_input(struct llava_context * ctx_llava, gpt_params * params, int type, std::string prompt, int &n_past, struct llava_image_embed * embeds = nullptr){
+    if (type == 0) {
+        if (process_prompt(1, ctx_llava, params, n_past, prompt)) return 1;
+    }
+    else if (type == 1) {
+        if (embeds != NULL) {
+            return process_image(ctx_llava, embeds, params, n_past);
         }
     }
+    return 0;
+}
 
-    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
-    if (has_minicpmv_projector == 2) {
-        eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false);
-    }
-    else if (has_minicpmv_projector == 3) {
-        eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false);
-    }
+static struct llama_sampling_context * llama_init(struct llava_context * ctx_llava, gpt_params * params, std::string prompt, int &n_past, bool is_first = false){
+    if (is_first) process_prompt(0, ctx_llava, params, n_past);
+    process_prompt(1, ctx_llava, params, n_past, prompt);
+    process_prompt(2, ctx_llava, params, n_past);
 
     // generate the response
 
-    LOG_TEE("\n");
-
     struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
     return ctx_sampling;
 }
 
 static const char * llama_loop(struct llava_context * ctx_llava,struct llama_sampling_context * ctx_sampling, int &n_past){
-    
+
     const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
     return tmp;
 }
@@ -265,21 +475,67 @@ int main(int argc, char ** argv) {
     llama_log_set(llama_log_callback_logTee, nullptr);
 #endif // LOG_DISABLE_LOGS
 
-    if (params.mmproj.empty() || (params.image.empty())) {
+    if (params.mmproj.empty() || (params.image.empty() && params.video.empty())) {
        gpt_params_print_usage(argc, argv, params);
        show_additional_info(argc, argv);
        return 1;
     }
 
-    for (auto & image : params.image) {
-        int n_past = 0;
-        auto ctx_llava = minicpmv_init(&params, image, n_past);
-
+
+    int n_past = 0;
+    struct llava_context * ctx_llava = nullptr;
+
+    if (params.video.size() > 0) {
+        ctx_llava = llava_init_context(&params);
+        std::vector<clip_image_u8 *> frames = extract_frames(params.video, params.frame_num);
+        process_prompt(0, ctx_llava, &params, n_past);
+        for (size_t i = 0; i < frames.size(); ++i) {
+            auto embeds = video_image_embed(ctx_llava->ctx_clip, &params, frames[i]);
+            process_input(ctx_llava, &params, 1, "", n_past, embeds);
+        }
+        process_input(ctx_llava, &params, 0, params.prompt.c_str(), n_past);
+        process_prompt(2, ctx_llava, &params, n_past);
         if (!params.prompt.empty()) {
+            LOG_TEE("minicpmv_version: %d\n", clip_is_minicpmv(ctx_llava->ctx_clip));
             LOG_TEE("<user>%s\n", params.prompt.c_str());
             LOG_TEE("<assistant>");
-            auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, true);
-            const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+        }
+        struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+        const int max_tgt_len = params.n_predict < 0 ? 8192 : params.n_predict;
+        std::string response = "";
+        bool have_tmp = false;
+        for (int i = 0; i < max_tgt_len; i++) {
+            auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
+            response += tmp;
+            if (strcmp(tmp, "</s>") == 0) {
+                if (!have_tmp) continue;
+                else break;
+            }
+            have_tmp = true;
+            printf("%s", tmp);
+            if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+
+            fflush(stdout);
+        }
+        llama_sampling_free(ctx_sampling);
+    }
+    else {
+        if (params.image.size() > 1) {
+            ctx_llava = llava_init_context(&params);
+            process_prompt(0, ctx_llava, &params, n_past);
+            for (auto & image : params.image) {
+                auto embeds = interleaved_image_embed(ctx_llava->ctx_clip, &params, image);
+                process_input(ctx_llava, &params, 1, "", n_past, embeds);
+            }
+            process_input(ctx_llava, &params, 0, params.prompt.c_str(), n_past);
+            process_prompt(2, ctx_llava, &params, n_past);
+            if (!params.prompt.empty()) {
+                LOG_TEE("minicpmv_version: %d\n", clip_is_minicpmv(ctx_llava->ctx_clip));
+                LOG_TEE("<user>%s\n", params.prompt.c_str());
+                LOG_TEE("<assistant>");
+            }
+            struct llama_sampling_context * ctx_sampling = llama_sampling_init(params.sparams);
+            const int max_tgt_len = params.n_predict < 0 ? 8192 : params.n_predict;
             std::string response = "";
             bool have_tmp = false;
             for (int i = 0; i < max_tgt_len; i++) {
@@ -289,7 +545,6 @@ int main(int argc, char ** argv) {
                     if (!have_tmp) continue;
                     else break;
                 }
-                if (strstr(tmp, "###")) break; // Yi-VL behavior
                 have_tmp = true;
                 printf("%s", tmp);
                 if (strstr(response.c_str(), "<user>")) break; // minicpm-v
@@ -297,26 +552,56 @@ int main(int argc, char ** argv) {
                 fflush(stdout);
             }
             llama_sampling_free(ctx_sampling);
-        }else {
-            while (true) {
-                LOG_TEE("<user>");
-                std::string prompt;
-                std::getline(std::cin, prompt);
+        }
+        else {
+            auto image = params.image[0];
+            ctx_llava = minicpmv_init(&params, image, n_past);
+
+            if (!params.prompt.empty()) {
+                LOG_TEE("minicpmv_version: %d\n", clip_is_minicpmv(ctx_llava->ctx_clip));
+                LOG_TEE("<user>%s\n", params.prompt.c_str());
                 LOG_TEE("<assistant>");
-                auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, true);
+                auto ctx_sampling = llama_init(ctx_llava, &params, params.prompt.c_str(), n_past, false);
                 const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
                 std::string response = "";
+                bool have_tmp = false;
                 for (int i = 0; i < max_tgt_len; i++) {
                     auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
                     response += tmp;
-                    if (strcmp(tmp, "</s>") == 0) break;
-                    if (strstr(tmp, "###")) break; // Yi-VL behavior
-                    printf("%s", tmp);// mistral llava-1.6
+                    if (strcmp(tmp, "</s>") == 0) {
+                        if (!have_tmp) continue;
+                        else break;
+                    }
+                    have_tmp = true;
+                    printf("%s", tmp);
                     if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+
                     fflush(stdout);
                 }
                 llama_sampling_free(ctx_sampling);
             }
+            else {
+                while (true) {
+                    LOG_TEE("minicpmv_version: %d\n", clip_is_minicpmv(ctx_llava->ctx_clip));
+                    LOG_TEE("<user>");
+                    std::string prompt;
+                    std::getline(std::cin, prompt);
+                    LOG_TEE("<assistant>");
+                    auto ctx_sampling = llama_init(ctx_llava, &params, prompt, n_past, false);
+                    const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict;
+                    std::string response = "";
+                    for (int i = 0; i < max_tgt_len; i++) {
+                        auto tmp = llama_loop(ctx_llava, ctx_sampling, n_past);
+                        response += tmp;
+                        if (strcmp(tmp, "</s>") == 0) break;
+                        if (strstr(tmp, "###")) break; // Yi-VL behavior
+                        printf("%s", tmp); // mistral llava-1.6
+                        if (strstr(response.c_str(), "<user>")) break; // minicpm-v
+                        fflush(stdout);
+                    }
+                    llama_sampling_free(ctx_sampling);
+                }
+            }
+        }
+    }
     printf("\n");
     llama_print_timings(ctx_llava->ctx_llama);
 
@@ -326,4 +611,4 @@ int main(int argc, char ** argv) {
     }
 
     return 0;
-}
+}
\ No newline at end of file
diff --git a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
index ea773742a832b..b44d10820462b 100644
--- a/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
+++ b/examples/llava/minicpmv-convert-image-encoder-to-gguf.py
@@ -587,7 +587,6 @@ def bytes_to_unicode():
     fname_middle = "mmproj-"
     has_text_encoder = False
     has_minicpmv_projector = True
-    minicpmv_version = 3
 elif args.vision_only:
     fname_middle = "vision-"
     has_text_encoder = False
diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt
index cbcbf26c9b4e9..4caa5a4fa07b0 100644
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@@ -3,3 +3,4 @@
 pillow~=10.2.0
 torch~=2.2.1
 torchvision~=0.17.1
+# note: video support uses the system FFmpeg libraries (libavformat/libavcodec/libavutil/libswscale) via LLAMA_FFMPEG, not a pip package
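
Example build and run for the video path (a sketch: the model, projector, and video file names below are placeholders, and the FFmpeg development libraries plus pkg-config must be installed for the LLAMA_FFMPEG option to resolve; the binary name follows the examples/llava build target):

    # CMake build with video support
    cmake -B build -DLLAMA_FFMPEG=ON
    cmake --build build --config Release

    # or the Makefile build
    make LLAMA_FFMPEG=1

    # sample up to 16 frames from the clip and describe them
    ./llama-minicpmv-cli -m ggml-model-q4_k.gguf --mmproj mmproj-model-f16.gguf \
        --video demo.mp4 --frame-num 16 --temp 0.1 -p "describe the video in detail."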