Skip to content

Commit

Permalink
Merge branch 'ggerganov:master' into server-update-JSON-response
Browse files Browse the repository at this point in the history
  • Loading branch information
MichelleTanPY authored Dec 13, 2024
2 parents 220cf7f + a76c56f commit 3c8a053
Show file tree
Hide file tree
Showing 24 changed files with 9,556 additions and 164 deletions.
26 changes: 25 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,8 @@ jobs:
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'llvm-arm64-opencl-adreno'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'

steps:
- name: Clone
Expand Down Expand Up @@ -703,6 +705,28 @@ jobs:
run: |
choco install ninja
- name: Install OpenCL Headers and Libs
id: install_opencl
if: ${{ matrix.build == 'llvm-arm64-opencl-adreno' }}
run: |
git clone https://github.com/KhronosGroup/OpenCL-Headers
cd OpenCL-Headers
mkdir build && cd build
cmake .. `
-DBUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_TESTING=OFF `
-DOPENCL_HEADERS_BUILD_CXX_TESTS=OFF `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install
git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
cd OpenCL-ICD-Loader
mkdir build-arm64-release && cd build-arm64-release
cmake .. `
-A arm64 `
-DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" `
-DCMAKE_INSTALL_PREFIX="$env:RUNNER_TEMP/opencl-arm64-release"
cmake --build . --target install --config release
- name: Build
id: cmake_build
run: |
Expand Down Expand Up @@ -732,7 +756,7 @@ jobs:
- name: Test
id: cmake_test
# not all machines have native AVX-512
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
Expand Down
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,20 @@ To learn more about model quantization, [read this documentation](examples/quant

</details>

## [`llama-run`](examples/run)

#### A comprehensive example for running `llama.cpp` models. Useful for inferencing. Used with RamaLama [^3].

- <details>
<summary>Run a model with a specific prompt (by default it's pulled from Ollama registry)</summary>
```bash
llama-run granite-code
```
</details>
[^3]: [https://github.com/containers/ramalama](RamaLama)
## [`llama-simple`](examples/simple)
Expand Down
2 changes: 1 addition & 1 deletion common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ set(LLAMA_COMMON_EXTRA_LIBS build_info)
# Use curl to download model url
if (LLAMA_CURL)
find_package(CURL REQUIRED)
add_definitions(-DLLAMA_USE_CURL)
target_compile_definitions(${TARGET} PUBLIC LLAMA_USE_CURL)
include_directories(${CURL_INCLUDE_DIRS})
find_library(CURL_LIBRARY curl REQUIRED)
set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY})
Expand Down
6 changes: 0 additions & 6 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1076,12 +1076,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
#define CURL_MAX_RETRY 3
#define CURL_RETRY_DELAY_SECONDS 2


static bool starts_with(const std::string & str, const std::string & prefix) {
// While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}

static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) {
int remaining_attempts = max_attempts;

Expand Down
11 changes: 8 additions & 3 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ using llama_tokens = std::vector<llama_token>;

// build info
extern int LLAMA_BUILD_NUMBER;
extern char const * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET;
extern const char * LLAMA_COMMIT;
extern const char * LLAMA_COMPILER;
extern const char * LLAMA_BUILD_TARGET;

struct common_control_vector_load_info;

Expand Down Expand Up @@ -437,6 +437,11 @@ std::vector<std::string> string_split<std::string>(const std::string & input, ch
return parts;
}

static bool string_starts_with(const std::string & str,
const std::string & prefix) { // While we wait for C++20's std::string::starts_with...
return str.rfind(prefix, 0) == 0;
}

bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);

Expand Down
2 changes: 1 addition & 1 deletion examples/run/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
set(TARGET llama-run)
add_executable(${TARGET} run.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
42 changes: 41 additions & 1 deletion examples/run/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,45 @@
The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.

```bash
./llama-run Meta-Llama-3.1-8B-Instruct.gguf
llama-run granite-code
...

```bash
llama-run -h
Description:
Runs a llm
Usage:
llama-run [options] model [prompt]
Options:
-c, --context-size <value>
Context size (default: 2048)
-n, --ngl <value>
Number of GPU layers (default: 0)
-h, --help
Show help message
Commands:
model
Model is a string with an optional prefix of
huggingface:// (hf://), ollama://, https:// or file://.
If no protocol is specified and a file exists in the specified
path, file:// is assumed, otherwise if a file does not exist in
the specified path, ollama:// is assumed. Models that are being
pulled are downloaded with .partial extension while being
downloaded and then renamed as the file without the .partial
extension when complete.
Examples:
llama-run llama3
llama-run ollama://granite-code
llama-run ollama://smollm:135m
llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf
llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf
llama-run https://example.com/some-file1.gguf
llama-run some-file2.gguf
llama-run file://some-file3.gguf
llama-run --ngl 99 some-file4.gguf
llama-run --ngl 99 some-file5.gguf Hello World
...
Loading

0 comments on commit 3c8a053

Please sign in to comment.