From 07a38d096caf1805ecc4d4c05c376843267b44c8 Mon Sep 17 00:00:00 2001
From: Eric Curtin <ecurtin@redhat.com>
Date: Fri, 13 Dec 2024 22:46:13 +0000
Subject: [PATCH] Improve progress bar

Set the default progress bar width to the width of the terminal. Also fix a
small bug around the default n_gpu_layers value.

Signed-off-by: Eric Curtin <ecurtin@redhat.com>
---
 README.md            |   4 +-
 examples/run/run.cpp | 275 ++++++++++++++++++++++++++++---------------
 2 files changed, 183 insertions(+), 96 deletions(-)
diff --git a/README.md b/README.md
index 54466c2501c081..42979f73c42c53 100644
--- a/README.md
+++ b/README.md
@@ -409,7 +409,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 
     </details>
 
-[^1]: [examples/perplexity/README.md](examples/perplexity/README.md)
+[^1]: [examples/perplexity/README.md](https://github.com/ggerganov/llama.cpp/blob/master/examples/perplexity/README.md)
 [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity)
 
 ## [`llama-bench`](example/bench)
@@ -446,7 +446,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 
     </details>
 
-[^3]: [https://github.com/containers/ramalama](RamaLama)
+[^3]: [RamaLama](https://github.com/containers/ramalama)
 
 ## [`llama-simple`](examples/simple)
 
diff --git a/examples/run/run.cpp b/examples/run/run.cpp
index 834ea8f7b4aeb4..2c8c535341fda1 100644
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@@ -1,6 +1,7 @@
 #if defined(_WIN32)
 #    include <windows.h>
 #else
+#    include <sys/ioctl.h>
 #    include <unistd.h>
 #endif
 
@@ -29,7 +30,6 @@
 class Opt {
   public:
     int init(int argc, const char ** argv) {
-        construct_help_str_();
         // Parse arguments
         if (parse(argc, argv)) {
             printe("Error: Failed to parse arguments.\n");
@@ -48,14 +48,53 @@ class Opt {
 
     std::string model_;
     std::string user_;
-    int         context_size_ = 2048, ngl_ = -1;
+    int         context_size_ = -1, ngl_ = -1;
+    bool        verbose_ = false;
 
   private:
-    std::string help_str_;
     bool        help_ = false;
 
-    void construct_help_str_() {
-        help_str_ =
+    int parse(int argc, const char ** argv) {  // Fills the option members from argv; returns 0 on success, non-zero on error
+        int positional_args_i = 0;  // positional arguments consumed so far (only ever 0, 1 or 2)
+        for (int i = 1; i < argc; ++i) {
+            if (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0) {
+                if (i + 1 >= argc) {  // the option needs a value after it
+                    return 1;
+                }
+
+                context_size_ = std::atoi(argv[++i]);  // NOTE(review): atoi does no validation; non-numeric text becomes 0
+            } else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0) {
+                if (i + 1 >= argc) {  // the option needs a value after it
+                    return 1;
+                }
+
+                ngl_ = std::atoi(argv[++i]);  // NOTE(review): same unvalidated atoi conversion as above
+            } else if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0 ||
+                       strcmp(argv[i], "--log-verbose") == 0) {
+                verbose_ = true;
+            } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
+                help_ = true;
+                return 0;  // stop parsing at once; success is reported even if no model was given
+            } else if (!positional_args_i) {  // first positional argument is the model
+                if (!argv[i][0] || argv[i][0] == '-') {  // empty string or unrecognised option before the model
+                    return 1;
+                }
+
+                ++positional_args_i;
+                model_ = argv[i];
+            } else if (positional_args_i == 1) {  // second positional argument starts the user prompt
+                ++positional_args_i;
+                user_ = argv[i];
+            } else {  // every further argument is appended to the prompt, separated by a space
+                user_ += " " + std::string(argv[i]);
+            }
+        }
+
+        return model_.empty();  // model_ is the only required value
+    }
+
+    void help() const {
+        printf(
             "Description:\n"
             "  Runs a llm\n"
             "\n"
@@ -64,15 +103,11 @@ class Opt {
             "\n"
             "Options:\n"
             "  -c, --context-size <value>\n"
-            "      Context size (default: " +
-            std::to_string(context_size_);
-        help_str_ +=
-            ")\n"
+            "      Context size (default: %d)\n"
             "  -n, --ngl <value>\n"
-            "      Number of GPU layers (default: " +
-            std::to_string(ngl_);
-        help_str_ +=
-            ")\n"
+            "      Number of GPU layers (default: %d)\n"
+            "  -v, --verbose, --log-verbose\n"
+            "      Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
             "  -h, --help\n"
             "      Show help message\n"
             "\n"
@@ -96,43 +131,10 @@ class Opt {
             "  llama-run https://example.com/some-file1.gguf\n"
             "  llama-run some-file2.gguf\n"
             "  llama-run file://some-file3.gguf\n"
-            "  llama-run --ngl 99 some-file4.gguf\n"
-            "  llama-run --ngl 99 some-file5.gguf Hello World\n";
-    }
-
-    int parse(int argc, const char ** argv) {
-        int positional_args_i = 0;
-        for (int i = 1; i < argc; ++i) {
-            if (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0) {
-                if (i + 1 >= argc) {
-                    return 1;
-                }
-
-                context_size_ = std::atoi(argv[++i]);
-            } else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0) {
-                if (i + 1 >= argc) {
-                    return 1;
-                }
-
-                ngl_ = std::atoi(argv[++i]);
-            } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
-                help_ = true;
-                return 0;
-            } else if (!positional_args_i) {
-                ++positional_args_i;
-                model_ = argv[i];
-            } else if (positional_args_i == 1) {
-                ++positional_args_i;
-                user_ = argv[i];
-            } else {
-                user_ += " " + std::string(argv[i]);
-            }
-        }
-
-        return model_.empty();  // model_ is the only required value
+            "  llama-run --ngl 999 some-file4.gguf\n"
+            "  llama-run --ngl 999 some-file5.gguf Hello World\n",
+            llama_context_default_params().n_batch, llama_model_default_params().n_gpu_layers);
     }
-
-    void help() const { printf("%s", help_str_.c_str()); }
 };
 
 struct progress_data {
@@ -151,8 +153,20 @@ struct FileDeleter {
 
 typedef std::unique_ptr<FILE, FileDeleter> FILE_ptr;
 
+static int get_terminal_width() {  // Width of the stdout terminal in columns; 80 when it cannot be determined
+#if defined(_WIN32)
+    CONSOLE_SCREEN_BUFFER_INFO csbi;
+    const bool ok = GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi) != 0;  // fails when stdout is not a console
+    return ok ? csbi.srWindow.Right - csbi.srWindow.Left + 1 : 80;
+#else
+    struct winsize w = {};  // zero-initialised so a failed ioctl never exposes indeterminate values
+    const bool ok = ioctl(STDOUT_FILENO, TIOCGWINSZ, &w) == 0 && w.ws_col > 0;  // fails when stdout is not a terminal
+    return ok ? w.ws_col : 80;
+#endif
+}
+
 #ifdef LLAMA_USE_CURL
-class CurlWrapper {
+class HttpClient {
   public:
     int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
              const bool progress, std::string * response_str = nullptr) {
@@ -181,7 +195,7 @@ class CurlWrapper {
         return 0;
     }
 
-    ~CurlWrapper() {
+    ~HttpClient() {
         if (chunk) {
             curl_slist_free_all(chunk);
         }
@@ -219,7 +233,7 @@ class CurlWrapper {
         if (progress) {
             curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
             curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data);
-            curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback);
+            curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, update_progress);
         }
     }
 
@@ -270,9 +284,9 @@ class CurlWrapper {
 
     static std::string human_readable_size(curl_off_t size) {
         static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" };
-        char         length   = sizeof(suffix) / sizeof(suffix[0]);
-        int          i        = 0;
-        double       dbl_size = size;
+        char                length   = sizeof(suffix) / sizeof(suffix[0]);
+        int                 i        = 0;
+        double              dbl_size = size;
         if (size > 1024) {
             for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) {
                 dbl_size = size / 1024.0;
@@ -284,8 +298,8 @@ class CurlWrapper {
         return out.str();
     }
 
-    static int progress_callback(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
-                                 curl_off_t) {
+    static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
+                               curl_off_t) {  // Transfer-info callback: redraws the progress line; always returns 0
         progress_data * data = static_cast<progress_data *>(ptr);
         if (total_to_download <= 0) {
             return 0;
@@ -293,27 +307,80 @@ class CurlWrapper {
 
         total_to_download += data->file_size;
         const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size;
-        const curl_off_t percentage                    = (now_downloaded_plus_file_size * 100) / total_to_download;
-        const curl_off_t pos                           = (percentage / 5);
+        const curl_off_t percentage      = calculate_percentage(now_downloaded_plus_file_size, total_to_download);
+        std::string      progress_prefix = generate_progress_prefix(percentage);
+
+        const double speed = calculate_speed(now_downloaded, data->start_time);  // rate of this transfer only, file_size excluded
+        const double tim   = (total_to_download - now_downloaded_plus_file_size) / speed;  // ETA from the bytes still missing; total_to_download already includes file_size
+        std::string  progress_suffix =
+            generate_progress_suffix(now_downloaded_plus_file_size, total_to_download, speed, tim);
+
+        int         progress_bar_width = calculate_progress_bar_width(progress_prefix, progress_suffix);  // bar gets the columns the text leaves free
         std::string progress_bar;
-        for (int i = 0; i < 20; ++i) {
-            progress_bar.append((i < pos) ? "█" : " ");
-        }
+        generate_progress_bar(progress_bar_width, percentage, progress_bar);  // fills progress_bar in place; returned copy is unused
 
-        // Calculate download speed and estimated time to completion
-        const auto                          now             = std::chrono::steady_clock::now();
-        const std::chrono::duration<double> elapsed_seconds = now - data->start_time;
-        const double                        speed           = now_downloaded / elapsed_seconds.count();
-        const double                        estimated_time  = (total_to_download - now_downloaded) / speed;
-        printe("\r%ld%% |%s| %s/%s  %.2f MB/s  %s      ", percentage, progress_bar.c_str(),
-               human_readable_size(now_downloaded).c_str(), human_readable_size(total_to_download).c_str(),
-               speed / (1024 * 1024), human_readable_time(estimated_time).c_str());
-        fflush(stderr);
+        print_progress(progress_prefix, progress_bar, progress_suffix);
         data->printed = true;
 
         return 0;
     }
 
+    static curl_off_t calculate_percentage(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download) {  // Whole-number percentage of the total obtained so far
+        return (now_downloaded_plus_file_size * 100) / total_to_download;  // integer division truncates; total_to_download must be non-zero
+    }
+
+    static std::string generate_progress_prefix(curl_off_t percentage) {  // Builds the text shown left of the bar, e.g. " 42% |"
+        std::ostringstream progress_output;
+        progress_output << std::setw(3) << percentage << "% |";  // right-align in 3 columns so the bar does not shift as digits are added
+        return progress_output.str();
+    }
+
+    static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {  // now_downloaded per second elapsed since start_time
+        const auto                          now             = std::chrono::steady_clock::now();
+        const std::chrono::duration<double> elapsed_seconds = now - start_time;
+        return now_downloaded / elapsed_seconds.count();  // floating-point division: 0 when nothing was downloaded yet, inf/NaN if no time has elapsed
+    }
+
+    static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
+                                                double speed, double estimated_time) {  // Builds the text right of the bar: done/total, speed per second, remaining time
+        const int          width = 10;  // column width applied to each of the four fields via setw
+        std::ostringstream progress_output;
+        progress_output << std::setw(width) << human_readable_size(now_downloaded_plus_file_size) << "/"
+                        << std::setw(width) << human_readable_size(total_to_download) << std::setw(width)
+                        << human_readable_size(speed) << "/s" << std::setw(width)  // speed (double) is implicitly converted to curl_off_t here
+                        << human_readable_time(estimated_time);
+        return progress_output.str();
+    }
+
+    static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {  // Columns left over for the bar itself; never less than 1
+        const int text_width = static_cast<int>(progress_prefix.size() + progress_suffix.size()) + 5;  // same 5 reserved columns as the original formula
+        int       progress_bar_width = get_terminal_width() - text_width;  // pure int arithmetic: no size_t wrap-around when the terminal is narrow
+        if (progress_bar_width < 1) {
+            progress_bar_width = 1;
+        }
+        return progress_bar_width;
+    }
+
+    static std::string generate_progress_bar(int progress_bar_width, curl_off_t percentage,
+                                             std::string & progress_bar) {  // Appends progress_bar_width cells to progress_bar and also returns a copy of it
+        const curl_off_t pos = (percentage * progress_bar_width) / 100;  // number of filled cells
+        for (int i = 0; i < progress_bar_width; ++i) {
+            progress_bar.append((i < pos) ? "█" : " ");  // appended to whatever progress_bar already holds; it is not cleared first
+        }
+
+        return progress_bar;  // NOTE(review): returns by value, so this copies the string the caller already received via the reference
+    }
+
+    static void print_progress(const std::string & progress_prefix, const std::string & progress_bar,
+                               const std::string & progress_suffix) {  // Redraws the progress line in place via printe
+        std::ostringstream progress_output;
+        progress_output << progress_prefix << progress_bar << "| " << progress_suffix;
+        printe(
+            "\r%*s"  // return to column 0 and pad with spaces to the terminal width to wipe the previous text
+            "\r%s",  // return to column 0 again and print the new line
+            get_terminal_width(), " ", progress_output.str().c_str());
+    }
+
     // Function to write data to a file
     static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
         FILE * out = static_cast<FILE *>(stream);
@@ -357,8 +424,8 @@ class LlamaData {
 #ifdef LLAMA_USE_CURL
     int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
                  const bool progress, std::string * response_str = nullptr) {
-        CurlWrapper curl;
-        if (curl.init(url, headers, output_file, progress, response_str)) {
+        HttpClient http;
+        if (http.init(url, headers, output_file, progress, response_str)) {
             return 1;
         }
 
@@ -467,6 +534,10 @@ class LlamaData {
         llama_model_params model_params = llama_model_default_params();
         model_params.n_gpu_layers       = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
         resolve_model(opt.model_);
+        printe(
+            "\r%*s"
+            "\rLoading model\r",
+            get_terminal_width(), " ");
         llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
         if (!model) {
             printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
@@ -478,8 +549,7 @@ class LlamaData {
     // Initializes the context with the specified parameters
     llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
         llama_context_params ctx_params = llama_context_default_params();
-        ctx_params.n_ctx                = n_ctx;
-        ctx_params.n_batch              = n_ctx;
+        ctx_params.n_ctx = ctx_params.n_batch = n_ctx >= 0 ? n_ctx : ctx_params.n_batch;
         llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
         if (!context) {
             printe("%s: error: failed to create the llama_context\n", __func__);
@@ -609,16 +679,20 @@ static int read_user_input(std::string & user) {
 }
 
 // Function to generate a response based on the prompt
-static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) {
+static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response,
+                             const bool stdout_a_terminal) {  // Runs generate(); ANSI colour codes are printed only when stdout_a_terminal is true
     // Set response color
-    printf("\033[33m");
+    if (stdout_a_terminal) {
+        printf("\033[33m");  // NOTE(review): the early return on failure below leaves this colour un-reset
+    }
+
     if (generate(llama_data, prompt, response)) {
         printe("failed to generate response\n");
         return 1;
     }
 
     // End response with color reset and newline
-    printf("\n\033[0m");
+    printf("\n%s", stdout_a_terminal ? "\033[0m" : "");  // newline is always printed; the reset code only for a terminal
     return 0;
 }
 
@@ -642,15 +716,37 @@ static int handle_user_input(std::string & user_input, const std::string & user_
     }
 
     printf(
-        "\r                                                                       "
-        "\r\033[32m> \033[0m");
+        "\r%*s"
+        "\r\033[32m> \033[0m",
+        get_terminal_width(), " ");
     return read_user_input(user_input);  // Returns true if input ends the loop
 }
 
+static bool is_stdin_a_terminal() {  // True when standard input is an interactive console/tty
+#if defined(_WIN32)
+    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
+    DWORD  mode;  // written by GetConsoleMode but not used; only the call's success matters
+    return GetConsoleMode(hStdin, &mode);  // non-zero result is converted to true
+#else
+    return isatty(STDIN_FILENO);
+#endif
+}
+
+static bool is_stdout_a_terminal() {  // True when standard output is an interactive console/tty
+#if defined(_WIN32)
+    HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE);
+    DWORD  mode;  // written by GetConsoleMode but not used; only the call's success matters
+    return GetConsoleMode(hStdout, &mode);  // non-zero result is converted to true
+#else
+    return isatty(STDOUT_FILENO);
+#endif
+}
+
 // Function to tokenize the prompt
 static int chat_loop(LlamaData & llama_data, const std::string & user_) {
     int prev_len = 0;
     llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
+    static const bool stdout_a_terminal = is_stdout_a_terminal();
     while (true) {
         // Get user input
         std::string user_input;
@@ -665,7 +761,7 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
 
         std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
         std::string response;
-        if (generate_response(llama_data, prompt, response)) {
+        if (generate_response(llama_data, prompt, response, stdout_a_terminal)) {
             return 1;
         }
 
@@ -682,22 +778,13 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
     return 0;
 }
 
-static void log_callback(const enum ggml_log_level level, const char * text, void *) {
-    if (level == GGML_LOG_LEVEL_ERROR) {
+static void log_callback(const enum ggml_log_level level, const char * text, void * p) {  // Log sink: prints errors always, everything else only when verbose
+    const Opt * opt = static_cast<Opt *>(p);  // p is dereferenced without a null check, so it must point to a live Opt
+    if (opt->verbose_ || level == GGML_LOG_LEVEL_ERROR) {  // verbose_ lets every level through
         printe("%s", text);
     }
 }
 
-static bool is_stdin_a_terminal() {
-#if defined(_WIN32)
-    HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
-    DWORD  mode;
-    return GetConsoleMode(hStdin, &mode);
-#else
-    return isatty(STDIN_FILENO);
-#endif
-}
-
 static std::string read_pipe_data() {
     std::ostringstream result;
     result << std::cin.rdbuf();  // Read all data from std::cin
@@ -721,7 +808,7 @@ int main(int argc, const char ** argv) {
         opt.user_ += read_pipe_data();
     }
 
-    llama_log_set(log_callback, nullptr);
+    llama_log_set(log_callback, &opt);
     LlamaData llama_data;
     if (llama_data.init(opt)) {
         return 1;