Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support video understanding #9165

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})

# 3rd party libs
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_FFMPEG "llama: use ffmpeg to load video files" OFF)

# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
Expand Down
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,11 @@ override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
override LDFLAGS := $(LDFLAGS) -lcurl
endif

ifdef LLAMA_FFMPEG
# Optional FFmpeg support for loading video files.
# Every FFmpeg component, libswscale included, is resolved through pkg-config so
# that its include paths and non-default library locations are honoured instead
# of relying on a bare -lswscale being found on the default linker search path.
override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_FFMPEG $(shell pkg-config --cflags libavformat libavcodec libavutil libswscale)
override LDFLAGS := $(LDFLAGS) $(shell pkg-config --libs libavformat libavcodec libavutil libswscale)
endif

#
# Print build information
#
Expand Down
13 changes: 13 additions & 0 deletions common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,19 @@ if (LLAMA_CURL)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
endif ()

# Use ffmpeg to load video files
if (LLAMA_FFMPEG)
    find_package(PkgConfig REQUIRED)
    # libswscale is requested together with the other components because the
    # Makefile build also links it; without it the two build systems disagree
    # on which FFmpeg libraries the video loader is linked against.
    pkg_check_modules(FFMPEG REQUIRED
        libavformat
        libavcodec
        libavutil
        libswscale
    )
    add_definitions(-DLLAMA_USE_FFMPEG)
    include_directories(${FFMPEG_INCLUDE_DIRS})
    set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${FFMPEG_LIBRARIES})
endif ()

target_include_directories(${TARGET} PUBLIC .)
target_compile_features (${TARGET} PUBLIC cxx_std_11)
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
15 changes: 15 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,19 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.image.emplace_back(argv[i]);
return true;
}
if (arg == "--video") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.video = argv[i];
return true;
}
if (arg == "--frame-num") {
CHECK_ARG
params.frame_num = std::stoi(argv[i]);
return true;
}
if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
return true;
Expand Down Expand Up @@ -1639,6 +1652,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
// --video is stored in a single string, so only the last occurrence is kept;
// the help text must not promise batching over repeated flags.
options.push_back({ "*", " --video FILE", "path to a video file. use with multimodal models" });
options.push_back({ "*", " --frame-num N", "maximum number of video frames (default: 16)" });

options.push_back({ "backend" });
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
Expand Down
2 changes: 2 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ struct gpt_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
std::string video = "";
int frame_num = 16;

// embedding
bool embedding = false; // get only sentence embedding
Expand Down
16 changes: 9 additions & 7 deletions examples/llava/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "common.h"
#include "log.h"
#include "ggml.h"
#include "ggml-alloc.h"
Expand Down Expand Up @@ -538,6 +539,7 @@ struct clip_ctx {
bool has_llava_projector = false;
bool has_minicpmv_projector = false;
int minicpmv_version = 2;
int max_slice_nums = 9;

struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
Expand Down Expand Up @@ -1623,7 +1625,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
}
}

inline float clip(float x, float lower, float upper) {
// Limit x to the range [lower, upper].
// The upper bound is applied first and the lower bound last, so when the
// bounds are inverted (lower > upper) the result is always `lower`.
inline int clip(int x, int lower, int upper) {
    const int capped = (upper < x) ? upper : x;
    return (capped < lower) ? lower : capped;
}

Expand Down Expand Up @@ -1827,10 +1829,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
return refine_size;
}

// Clamp x into [lower, upper]; the lower bound is enforced last, so it wins
// whenever the two bounds are given in inverted order.
inline int clip(int x, int lower, int upper) {
    if (x > upper) {
        x = upper;
    }
    if (x < lower) {
        x = lower;
    }
    return x;
}

static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
std::vector<int> candidate_split_grids_nums;
for (int i : {multiple - 1, multiple, multiple + 1}) {
Expand Down Expand Up @@ -1932,7 +1930,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
}

int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
const int max_slice_nums=9;
const int max_slice_nums=ctx_clip->max_slice_nums;
const int scale_resolution=448;
const int original_width = ctx_clip->load_image_size->width;
const int original_height = ctx_clip->load_image_size->height;
Expand All @@ -1948,7 +1946,7 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {

if(clip_is_minicpmv(ctx)){
int max_slice_nums = 9;
int max_slice_nums = ctx->max_slice_nums;
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
res_imgs->size = 0;
for (size_t i = 0; i < imgs.size(); ++i){
Expand Down Expand Up @@ -2626,3 +2624,7 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
}
return 0;
}

// Store a new maximum slice count in the given context.
// The value is written as-is (no range validation) and ctx must be non-null.
void clip_uhd_max_slice_nums(struct clip_ctx * ctx, int max_slice_nums) {
    struct clip_ctx & target = *ctx;
    target.max_slice_nums = max_slice_nums;
}
1 change: 1 addition & 0 deletions examples/llava/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
// Set the maximum slice count kept in the context (the context default is 9).
// clip_uhd_num_image_embeds_col() and the minicpmv path of clip_image_preprocess() read this value.
CLIP_API void clip_uhd_max_slice_nums(struct clip_ctx * ctx, int max_slice_nums);

#ifdef __cplusplus
}
Expand Down
Loading