Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support video understanding #9165

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1465,13 +1465,16 @@ llama-llava-cli: examples/llava/llava-cli.cpp \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual

# ffmpeg libraries needed for video decoding (llama-minicpmv-cli video support).
# Query libswscale through pkg-config like the other components, instead of
# hard-coding -lswscale: that way its include paths and -L search paths are
# picked up as well, and a missing package fails visibly at pkg-config time.
FFMPEG_CFLAGS := $(shell pkg-config --cflags libavformat libavcodec libavutil libswscale)
FFMPEG_LIBS := $(shell pkg-config --libs libavformat libavcodec libavutil libswscale)

llama-minicpmv-cli: examples/llava/minicpmv-cli.cpp \
examples/llava/llava.cpp \
examples/llava/llava.h \
examples/llava/clip.cpp \
examples/llava/clip.h \
$(OBJ_ALL)
$(CXX) $(CXXFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) -Wno-cast-qual
$(CXX) $(CXXFLAGS) $(FFMPEG_CFLAGS) $< $(filter-out %.h $<,$^) -o $@ $(LDFLAGS) $(FFMPEG_LIBS) -Wno-cast-qual
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to enable video support only behind a special flag, for example LLAMA_FFMPEG (the same way as LLAMA_CURL)

Also, don't forget to add support for linking ffmpeg in the cmake build as well

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I will try it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have taken a stab at implementing this compiler flag in an amending PR -- it may or may not be useful to you:
OpenBMB#32

@tc-mb If you like it, feel free to merge that one -- if you do, it should smoothly merge my changes into your PR here. If you don't want it, then no hard feelings -- I won't be offended. :) I'm simply a fan of your work, and generally wanted to make an attempt at helping this PR along.


ifeq ($(UNAME_S),Darwin)
swift: examples/batched.swift
Expand Down
15 changes: 15 additions & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,19 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.image.emplace_back(argv[i]);
return true;
}
if (arg == "--video") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.video = argv[i];
return true;
}
if (arg == "--frame-num") {
CHECK_ARG
params.frame_num = std::stoi(argv[i]);
return true;
}
if (arg == "-i" || arg == "--interactive") {
params.interactive = true;
return true;
Expand Down Expand Up @@ -1639,6 +1652,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
options.push_back({ "multi-modality" });
options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
options.push_back({ "*", " --video FILE", "path to an video file. use with multimodal models. Specify multiple times for batching" });
tc-mb marked this conversation as resolved.
Show resolved Hide resolved
options.push_back({ "*", " --frame-num N", "number of max video frame (default: 16)" });

options.push_back({ "backend" });
options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
Expand Down
2 changes: 2 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ struct gpt_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
std::string video = "";
int frame_num = 16;

// embedding
bool embedding = false; // get only sentence embedding
Expand Down
16 changes: 9 additions & 7 deletions examples/llava/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// I'll gradually clean and extend it
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "common.h"
#include "log.h"
#include "ggml.h"
#include "ggml-alloc.h"
Expand Down Expand Up @@ -538,6 +539,7 @@ struct clip_ctx {
bool has_llava_projector = false;
bool has_minicpmv_projector = false;
int minicpmv_version = 2;
int max_slice_nums = 9;

struct clip_vision_model vision_model;
projector_type proj_type = PROJECTOR_TYPE_MLP;
Expand Down Expand Up @@ -1623,7 +1625,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8* src, clip_image_f32*
}
}

inline float clip(float x, float lower, float upper) {
// Clamp x to the range [lower, upper].
// Equivalent to std::max(lower, std::min(x, upper)): the upper bound is
// applied first, then the lower bound — so for a degenerate range
// (lower > upper) the lower bound wins, exactly as in the original form.
inline int clip(int x, int lower, int upper) {
    const int capped = (x > upper) ? upper : x;
    return (capped < lower) ? lower : capped;
}

Expand Down Expand Up @@ -1827,10 +1829,6 @@ static std::pair<int, int> uhd_get_refine_size(std::pair<int, int> original_size
return refine_size;
}

inline int clip(int x, int lower, int upper) {
return std::max(lower, std::min(x, upper));
}

static std::pair<int, int> uhd_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
std::vector<int> candidate_split_grids_nums;
for (int i : {multiple - 1, multiple, multiple + 1}) {
Expand Down Expand Up @@ -1932,7 +1930,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
}

int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
const int max_slice_nums=9;
const int max_slice_nums=ctx_clip->max_slice_nums;
const int scale_resolution=448;
const int original_width = ctx_clip->load_image_size->width;
const int original_height = ctx_clip->load_image_size->height;
Expand All @@ -1948,7 +1946,7 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {

if(clip_is_minicpmv(ctx)){
int max_slice_nums = 9;
int max_slice_nums = ctx->max_slice_nums;
std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, max_slice_nums);
res_imgs->size = 0;
for (size_t i = 0; i < imgs.size(); ++i){
Expand Down Expand Up @@ -2626,3 +2624,7 @@ int clip_is_minicpmv(const struct clip_ctx * ctx) {
}
return 0;
}

// Override the maximum number of slices the UHD (any-resolution) image
// preprocessing path may split an image into for this context.
// The context default is 9; lowering it reduces per-image token cost,
// which is what the video path uses to fit many frames into context.
void clip_uhd_max_slice_nums(struct clip_ctx * ctx, int max_slice_nums) {
    ctx->max_slice_nums = max_slice_nums;
}
1 change: 1 addition & 0 deletions examples/llava/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, cons
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);

CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
CLIP_API void clip_uhd_max_slice_nums(struct clip_ctx * ctx, int max_slice_nums);

#ifdef __cplusplus
}
Expand Down
Loading
Loading