common: llama_load_model_from_url using --model-url #6098

Merged · 53 commits · Mar 17, 2024

Changes from 33 commits

Commits (53)
3221ab0
common: introduce llama_load_model_from_url to download model from hf…
phymbert Mar 16, 2024
a0ebdfc
common: llama_load_model_from_url switch to libcurl dependency
phymbert Mar 16, 2024
42b25da
common: PR feedback, rename the definition to LLAMA_USE_CURL
phymbert Mar 16, 2024
7e78285
common: LLAMA_USE_CURL in make toolchain
phymbert Mar 16, 2024
df0d822
ci: compile the server with curl, add make option curl example in def…
phymbert Mar 16, 2024
80bec98
llama_load_model_from_url: try to make the windows build passing
phymbert Mar 16, 2024
2c3a00e
Update Makefile
phymbert Mar 16, 2024
4135d4a
llama_load_model_from_url: typo
phymbert Mar 16, 2024
5d99f32
llama_load_model_from_url: download the file only if modified based o…
phymbert Mar 16, 2024
921e4af
ci: build, fix the default build to use LLAMA_CURL
phymbert Mar 16, 2024
6633689
llama_load_model_from_url: cleanup code
phymbert Mar 16, 2024
1430e89
Merge branch 'master' into hp/download-model-from-hf
phymbert Mar 16, 2024
e84206d
Update examples/server/README.md
phymbert Mar 16, 2024
4bc47b7
Update common/common.cpp
phymbert Mar 16, 2024
8751bd0
Update common/common.cpp
phymbert Mar 16, 2024
f53bfd5
Update common/common.cpp
phymbert Mar 16, 2024
b088122
Update common/common.cpp
phymbert Mar 16, 2024
f22456d
Update common/common.cpp
phymbert Mar 16, 2024
9565ae3
Update common/common.cpp
phymbert Mar 16, 2024
330e28d
Update common/common.cpp
phymbert Mar 16, 2024
89ab37a
Update common/common.cpp
phymbert Mar 16, 2024
be561a7
Update common/common.cpp
phymbert Mar 16, 2024
eb9e52a
Update common/common.cpp
phymbert Mar 16, 2024
b0b49e0
Update examples/main/README.md
phymbert Mar 16, 2024
545fef6
llama_load_model_from_url: fix compilation warning, clearer logging
phymbert Mar 16, 2024
4fadb07
server: tests: add `--model-url` tests
phymbert Mar 16, 2024
124c474
llama_load_model_from_url: coherent clearer logging
phymbert Mar 16, 2024
064dc07
common: CMakeLists.txt fix typo in logging when lib curl is not found
phymbert Mar 16, 2024
838178a
ci: tests: windows tests add libcurl
phymbert Mar 16, 2024
176f039
ci: tests: windows tests add libcurl
phymbert Mar 16, 2024
5df5605
ci: build: add libcurl in default make toolchain step
phymbert Mar 16, 2024
78812c6
llama_load_model_from_url: PR feedback, use snprintf instead of strnc…
phymbert Mar 16, 2024
1ad5a45
ci: build: add libcurl in default make toolchain step for tests
phymbert Mar 16, 2024
22b3bb3
common: fix windows build caused by double windows.h import
phymbert Mar 16, 2024
e6848ab
build: move the make build with env LLAMA_CURL to a dedicated place
phymbert Mar 16, 2024
d81acb6
build: introduce cmake option LLAMA_CURL to trigger libcurl linking t…
phymbert Mar 16, 2024
dbd9691
build: move the make build with env LLAMA_CURL to a dedicated place
phymbert Mar 16, 2024
9da4eec
llama_load_model_from_url: minor spacing and log message changes
phymbert Mar 16, 2024
89d3483
ci: build: fix ubuntu-focal-make-curl
phymbert Mar 16, 2024
13d8817
ci: build: try to fix the windows build
phymbert Mar 16, 2024
1ddaf71
common: remove old dependency to openssl
phymbert Mar 16, 2024
73b4b44
common: fix build
phymbert Mar 16, 2024
a3ed3d4
common: fix windows build
phymbert Mar 17, 2024
5e66ec8
common: fix windows tests
phymbert Mar 17, 2024
9ca4acc
common: fix windows tests
phymbert Mar 17, 2024
c1b002e
common: llama_load_model_from_url windows set CURLOPT_SSL_OPTIONS, CU…
phymbert Mar 17, 2024
cff7faa
ci: tests: print server logs in case of scenario failure
phymbert Mar 17, 2024
4fe431d
common: llama_load_model_from_url: make it working on windows: disabl…
phymbert Mar 17, 2024
47a9e5d
ci: tests: increase timeout for windows
phymbert Mar 17, 2024
31272c6
common: fix typo
phymbert Mar 17, 2024
f902ab6
common: llama_load_model_from_url use a temporary file for downloading
phymbert Mar 17, 2024
b24f30f
common: llama_load_model_from_url delete previous file before downloa…
phymbert Mar 17, 2024
fcf327f
ci: tests: fix behavior on windows
phymbert Mar 17, 2024
5 changes: 4 additions & 1 deletion .github/workflows/build.yml
@@ -33,17 +33,20 @@ jobs:
id: depends
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8
sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev

- name: Build
id: make_build
env:
LLAMA_FATAL_WARNINGS: 1
LLAMA_CURL: 1
run: |
CC=gcc-8 make -j $(nproc)

- name: Test
id: make_test
env:
LLAMA_CURL: 1
run: |
CC=gcc-8 make tests -j $(nproc)
make test -j $(nproc)
24 changes: 22 additions & 2 deletions .github/workflows/server.yml
@@ -57,7 +57,8 @@ jobs:
cmake \
python3-pip \
wget \
language-pack-en
language-pack-en \
libcurl4-openssl-dev

- name: Build
id: cmake_build
@@ -101,12 +102,31 @@ jobs:
with:
fetch-depth: 0

- name: Download libCURL
id: get_libcurl
env:
CURL_TAG: 8_6_0
CURL_VERSION: 8.6.0
run: |
curl.exe -o $env:RUNNER_TEMP/libcurl.tar.gz -L "https://github.com/curl/curl/releases/download/curl-${env:CURL_TAG}/curl-${env:CURL_VERSION}.tar.gz"
mkdir $env:RUNNER_TEMP/libcurl
tar.exe -xvf $env:RUNNER_TEMP/libcurl.tar.gz --strip-components=1 -C $env:RUNNER_TEMP/libcurl

- name: Build libcurl
id: build_libcurl
run: |
cd $env:RUNNER_TEMP/libcurl
mkdir build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release

- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
cmake .. -DCURL_LIBRARY="${env:RUNNER_TEMP}/libcurl/lib/Release/libcurl_imp.lib" -DCURL_INCLUDE_DIR="${env:RUNNER_TEMP}/libcurl/include" -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server

- name: Python setup
5 changes: 5 additions & 0 deletions Makefile
@@ -595,6 +595,11 @@ include scripts/get-flags.mk
CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic
endif

ifdef LLAMA_CURL
override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL
override LDFLAGS := $(LDFLAGS) -lcurl
endif

#
# Print build information
#
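With this Makefile change, curl support is opt-in through the `LLAMA_CURL` variable. A minimal local build sketch, assuming a Debian/Ubuntu system with the same libcurl package used by the CI workflow above:

```sh
# install libcurl development headers (package name taken from the CI workflow)
sudo apt-get install libcurl4-openssl-dev

# build with --model-url support enabled
LLAMA_CURL=1 make -j $(nproc)
```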
10 changes: 10 additions & 0 deletions common/CMakeLists.txt
@@ -47,6 +47,16 @@ if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
endif()

# Check for curl
find_package(CURL QUIET)
if (CURL_FOUND)
add_definitions(-DLLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
link_libraries(${CURL_LIBRARIES})
else()
message(INFO " libcurl not found. Building without model download support.")
endif ()


set(TARGET common)

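For CMake builds, libcurl is picked up automatically via `find_package(CURL QUIET)` at this point in the PR (a dedicated `LLAMA_CURL` CMake option only arrives in a later commit of this branch). A rough configure sketch, assuming libcurl is installed where CMake can find it:

```sh
mkdir build && cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release
cmake --build . --config Release
```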
209 changes: 208 additions & 1 deletion common/common.cpp
@@ -16,6 +16,9 @@
#include <unordered_set>
#include <vector>
#include <cinttypes>
#ifdef LLAMA_USE_CURL
#include <curl/curl.h>
#endif

#if defined(__APPLE__) && defined(__MACH__)
#include <sys/types.h>
@@ -50,6 +53,19 @@
#define GGML_USE_CUBLAS_SYCL_VULKAN
#endif

#ifdef LLAMA_USE_CURL
#ifdef __linux__
#include <linux/limits.h>
#elif defined(_WIN32)
#include <windows.h>
#define PATH_MAX MAX_PATH
#else
#include <sys/syslimits.h>
#endif
#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX
#define LLAMA_CURL_MAX_HEADER_LENGTH 256
#endif // LLAMA_USE_CURL

int32_t get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
@@ -644,6 +660,13 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
}
params.model = argv[i];
}
if (arg == "-mu" || arg == "--model-url") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_url = argv[i];
}
if (arg == "-md" || arg == "--model-draft") {
arg_found = true;
if (++i >= argc) {
@@ -1368,6 +1391,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
printf(" -m FNAME, --model FNAME\n");
printf(" model path (default: %s)\n", params.model.c_str());
printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
printf(" model download url (default: %s)\n", params.model_url.c_str());
printf(" -md FNAME, --model-draft FNAME\n");
printf(" draft model for speculative decoding\n");
printf(" -ld LOGDIR, --logdir LOGDIR\n");
@@ -1613,10 +1638,192 @@ void llama_batch_add(
batch.n_tokens++;
}

#ifdef LLAMA_USE_CURL

struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
struct llama_model_params params) {
// Basic validation of the model_url
if (!model_url || strlen(model_url) == 0) {
fprintf(stderr, "%s: invalid model_url\n", __func__);
return NULL;
}

// Initialize libcurl globally
curl_global_init(CURL_GLOBAL_DEFAULT);
auto curl = curl_easy_init();

if (!curl) {
curl_global_cleanup();
fprintf(stderr, "%s: error initializing lib curl\n", __func__);
return NULL;
}

// Set the URL, allow to follow http redirection
curl_easy_setopt(curl, CURLOPT_URL, model_url);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L);

// Check if the file already exists locally
struct stat buffer;
auto file_exists = (stat(path_model, &buffer) == 0);

// If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files
char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model);

char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0};
snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model);

if (file_exists) {
auto * f_etag = fopen(etag_path, "r");
if (f_etag) {
if (!fgets(etag, sizeof(etag), f_etag)) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path);
} else {
fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag);
}
fclose(f_etag);
}

auto * f_last_modified = fopen(last_modified_path, "r");
if (f_last_modified) {
if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) {
fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path);
} else {
fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path,
last_modified);
}
fclose(f_last_modified);
}
}

// Send a HEAD request to retrieve the etag and last-modified headers
struct llama_load_model_from_url_headers {
char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0};
};
llama_load_model_from_url_headers headers;
{
typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;

const char * etag_prefix = "etag: ";
if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) {
strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF
}

const char * last_modified_prefix = "last-modified: ";
if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) {
strncpy(headers->last_modified, buffer + strlen(last_modified_prefix),
n_items - strlen(last_modified_prefix) - 2); // Remove CRLF
}
return n_items;
};

curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);
curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback));
curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);

CURLcode res = curl_easy_perform(curl);
if (res != CURLE_OK) {
curl_easy_cleanup(curl);
curl_global_cleanup();
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
return NULL;
}
}

// If only the ETag or the Last-Modified header are different, trigger a new download
if (strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
// Set the output file
auto * outfile = fopen(path_model, "wb");
if (!outfile) {
curl_easy_cleanup(curl);
curl_global_cleanup();
fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model);
return NULL;
}
curl_easy_setopt(curl, CURLOPT_NOBODY, 0L);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);

// display download progress
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);

// start the download
fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__,
model_url, path_model, headers.etag, headers.last_modified);
auto res = curl_easy_perform(curl);
if (res != CURLE_OK) {
fclose(outfile);
curl_easy_cleanup(curl);
curl_global_cleanup();
fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res));
return NULL;
}

long http_code = 0;
curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code);
if (http_code < 200 || http_code >= 400) {
fclose(outfile);
curl_easy_cleanup(curl);
curl_global_cleanup();
fprintf(stderr, "%s: invalid http status code failed: %ld\n", __func__, http_code);
return NULL;
}

// Clean up
fclose(outfile);

// Write the new ETag to the .etag file
if (strlen(headers.etag) > 0) {
auto * etag_file = fopen(etag_path, "w");
if (etag_file) {
fputs(headers.etag, etag_file);
fclose(etag_file);
fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag);
}
}

// Write the new Last-Modified value to the .lastModified file
if (strlen(headers.last_modified) > 0) {
auto * last_modified_file = fopen(last_modified_path, "w");
if (last_modified_file) {
fputs(headers.last_modified, last_modified_file);
fclose(last_modified_file);
fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path,
headers.last_modified);
}
}
}

curl_easy_cleanup(curl);
curl_global_cleanup();

return llama_load_model_from_file(path_model, params);
}

#else

struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/,
struct llama_model_params /*params*/) {
fprintf(stderr, "%s: llama.cpp built without curl support, downloading from an url not supported.\n", __func__);
return nullptr;
}

#endif // LLAMA_USE_CURL

std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
auto mparams = llama_model_params_from_gpt_params(params);

llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
llama_model * model = nullptr;
if (!params.model_url.empty()) {
model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
} else {
model = llama_load_model_from_file(params.model.c_str(), mparams);
}
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);
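Besides the `llama_init_from_gpt_params` path above, the helper can be called directly. A minimal sketch, not part of the PR: the local file name is an arbitrary placeholder, and the URL is the example one used in the README changes below.

```cpp
#include <cstdio>

#include "common.h"
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();

    // Downloads to the given path (skipped if the cached ETag/Last-Modified
    // headers still match), then loads it via llama_load_model_from_file.
    llama_model * model = llama_load_model_from_url(
        "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf",
        "phi-2-q4_0.gguf", // placeholder local path
        mparams);

    if (model == NULL) {
        fprintf(stderr, "failed to download or load the model\n");
        return 1;
    }

    llama_free_model(model);
    return 0;
}
```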
10 changes: 10 additions & 0 deletions common/common.h
@@ -17,6 +17,12 @@
#include <unordered_map>
#include <tuple>

#ifdef HAVE_OPENSSL
#include <openssl/ssl.h>
#include <openssl/bio.h>
#include <openssl/err.h>
#endif

#ifdef _WIN32
#define DIRECTORY_SEPARATOR '\\'
#else
@@ -89,6 +95,7 @@ struct gpt_params {
struct llama_sampling_params sparams;

std::string model = "models/7B/ggml-model-f16.gguf"; // model path
std::string model_url = ""; // model url to download
std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
std::string prompt = "";
@@ -191,6 +198,9 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
struct llama_model_params params);

// Batch utils

void llama_batch_clear(struct llama_batch & batch);
1 change: 1 addition & 0 deletions examples/main/README.md
@@ -67,6 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:

- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
- `-mu MODEL_URL, --model-url MODEL_URL`: Specify a remote HTTP URL from which to download the model file (e.g., https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf); see the example below.
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
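For example (the URL is the one from the option description above; the local path passed with `-m` is an arbitrary choice, and the downloaded file is reused on later runs when the server-side ETag/Last-Modified headers are unchanged):

```sh
./main -mu https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf \
       -m models/phi-2/ggml-model-q4_0.gguf \
       -n 64
```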
1 change: 1 addition & 0 deletions examples/server/README.md
@@ -20,6 +20,7 @@ The project is under active development, and we are [looking for feedback and co
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`)
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL`, `--model-url MODEL_URL`: Specify a remote HTTP URL from which to download the model file (e.g., https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf); see the example below.
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
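For example, to have the server download the model on first start (URL from the option description above; the local path and context size are arbitrary example values):

```sh
./server -mu https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf \
         -m models/phi-2/ggml-model-q4_0.gguf \
         -c 2048
```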