ggerganov · cpumaxx · Mar 25, 2024 · Mar 26, 2024 · Apr 5, 2024 · Apr 5, 2024
diff --git a/common/common.cpp b/common/common.cpp
@@ -768,7 +768,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
             invalid_param = true;
             return true;
         }
-        params.image = argv[i];
+        // repeated --image flags accumulate into one comma-separated list
+        if (!params.image.empty()) {
+            params.image += ",";
+        }
+        params.image += argv[i];
         return true;
     }
     if (arg == "-i" || arg == "--interactive") {
@@ -1391,7 +1395,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -ps N, --p-split N    speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
     printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
-    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models\n");
+    printf("  --image IMAGE_FILE    path to an image file. use with multimodal models. Specify multiple times for batching\n");
     if (llama_supports_mlock()) {
         printf("  --mlock               force system to keep model in RAM rather than swapping or compressing\n");
     }

diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
@@ -208,26 +208,28 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
 }
 
 
-static struct llava_context * llava_init(gpt_params * params) {
-    const char * clip_path = params->mmproj.c_str();
-
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
-
+static struct llama_model * llava_init(gpt_params * params) {
     llama_backend_init();
     llama_numa_init(params->numa);
 
     llama_model_params model_params = llama_model_params_from_gpt_params(*params);
-
     llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
     if (model == NULL) {
         fprintf(stderr , "%s: error: unable to load model\n" , __func__);
         return NULL;
     }
+    return model;
+}
+
+static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
 
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     ctx_params.n_ctx           = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -273,23 +275,55 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    auto ctx_llava = llava_init(&params);
-    if (ctx_llava == NULL) {
-        fprintf(stderr, "%s: error: failed to init llava\n", __func__);
+    auto model = llava_init(&params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
         return 1;
     }
 
-    auto image_embed = load_image(ctx_llava, &params);
-    if (!image_embed) {
-        return 1;
+    // params.image holds every --image value joined by ','. Split it back into
+    // individual entries; an empty value still yields one (empty) entry, so the
+    // loop below always runs at least once.
+    std::stringstream ss(params.image);
+    std::vector<std::string> imagestack;
+
+    while (ss.good()) {
+        std::string substr;
+        std::getline(ss, substr, ',');
+        imagestack.push_back(substr);
     }
 
-    // process the prompt
-    process_prompt(ctx_llava, image_embed, &params, params.prompt);
+    for (const auto & image : imagestack) {
 
-    llama_print_timings(ctx_llava->ctx_llama);
+        // the model is loaded once above; a fresh llava context is built per image
+        auto ctx_llava = llava_init_context(&params, model);
+        if (ctx_llava == NULL) {
+            fprintf(stderr, "%s: error: failed to init llava context\n", __func__);
+            llama_free_model(model);
+            return 1;
+        }
+        params.image = image;
+
+        auto image_embed = load_image(ctx_llava, &params);
+        if (!image_embed) {
+            fprintf(stderr, "error: failed to load image %s. Terminating\n\n", params.image.c_str());
+            // release the per-image context and the shared model before bailing out
+            ctx_llava->model = NULL;
+            llava_free(ctx_llava);
+            llama_free_model(model);
+            return 1;
+        }
 
-    llava_image_embed_free(image_embed);
-    llava_free(ctx_llava);
+        // process the prompt
+        process_prompt(ctx_llava, image_embed, &params, params.prompt);
+
+        llama_print_timings(ctx_llava->ctx_llama);
+
+        llava_image_embed_free(image_embed);
+        // detach the shared model first (presumably so llava_free() leaves it alone); it is freed once after the loop
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    }
+    llama_free_model(model);
     return 0;
 }