-
Notifications
You must be signed in to change notification settings - Fork 10.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* ggml : add RPC backend The RPC backend proxies all operations to a remote server which runs a regular backend (CPU, CUDA, Metal, etc). * set TCP_NODELAY * add CI workflows * Address review comments * fix warning * implement llama_max_devices() for RPC * Address review comments * Address review comments * wrap sockfd into a struct * implement get_alignment and get_max_size * add get_device_memory * fix warning * win32 support * add README * readme : trim trailing whitespace * Address review comments * win32 fix * Address review comments * fix compile warnings on macos
- Loading branch information
Showing
12 changed files
with
1,395 additions
and
98 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# Build the rpc-server executable from rpc-server.cpp and link it against the ggml and llama targets. | ||
add_executable(rpc-server rpc-server.cpp) | ||
target_link_libraries(rpc-server PRIVATE ggml llama) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
## Overview | ||
|
||
The `rpc-server` allows running a `ggml` backend on a remote host. | ||
The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them. | ||
This can be used for distributed LLM inference with `llama.cpp` in the following way: | ||
|
||
```mermaid | ||
flowchart TD | ||
rpcb---|TCP|srva | ||
rpcb---|TCP|srvb | ||
rpcb-.-|TCP|srvn | ||
subgraph hostn[Host N] | ||
srvn[rpc-server]-.-backend3["Backend (CUDA,Metal,etc.)"] | ||
end | ||
subgraph hostb[Host B] | ||
srvb[rpc-server]---backend2["Backend (CUDA,Metal,etc.)"] | ||
end | ||
subgraph hosta[Host A] | ||
srva[rpc-server]---backend["Backend (CUDA,Metal,etc.)"] | ||
end | ||
subgraph host[Main Host] | ||
ggml[llama.cpp]---rpcb[RPC backend] | ||
end | ||
style hostn stroke:#66,stroke-width:2px,stroke-dasharray: 5 5 | ||
``` | ||
|
||
Each host can run a different backend, e.g. one with CUDA and another with Metal. | ||
You can also run multiple `rpc-server` instances on the same host, each with a different backend. | ||
|
||
## Usage | ||
|
||
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options. | ||
For example, to build the CUDA backend with RPC support: | ||
|
||
```bash | ||
mkdir build-rpc-cuda | ||
cd build-rpc-cuda | ||
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON | ||
cmake --build . --config Release | ||
``` | ||
|
||
Then, start the `rpc-server` with the backend: | ||
|
||
```bash | ||
$ bin/rpc-server 0.0.0.0 50052 | ||
create_backend: using CUDA backend | ||
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no | ||
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes | ||
ggml_cuda_init: found 1 CUDA devices: | ||
Device 0: NVIDIA T1200 Laptop GPU, compute capability 7.5, VMM: yes | ||
Starting RPC server on 0.0.0.0:50052 | ||
``` | ||
|
||
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.: | ||
```bash | ||
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052 | ||
``` | ||
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. | ||
|
||
|
||
On the main host, build `llama.cpp` with only `-DLLAMA_RPC=ON`: | ||
|
||
```bash | ||
mkdir build-rpc | ||
cd build-rpc | ||
cmake .. -DLLAMA_RPC=ON | ||
cmake --build . --config Release | ||
``` | ||
|
||
Finally, use the `--rpc` option to specify the host and port of each `rpc-server`: | ||
|
||
```bash | ||
$ bin/main -m ../models/tinyllama-1b/ggml-model-f16.gguf -p "Hello, my name is" --repeat-penalty 1.0 -n 64 --rpc 192.168.88.10:50052,192.168.88.11:50052 -ngl 99 | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#ifdef GGML_USE_CUDA | ||
#include "ggml-cuda.h" | ||
#endif | ||
|
||
#ifdef GGML_USE_METAL | ||
#include "ggml-metal.h" | ||
#endif | ||
|
||
#include "ggml-rpc.h"
#include <stdexcept>
#include <string>
#include <stdio.h>
|
||
// Creates the ggml backend that this server will expose.
//
// The GPU backend is selected at compile time: CUDA (device 0) when
// GGML_USE_CUDA is defined, otherwise Metal when GGML_USE_METAL is defined.
// If no GPU backend is compiled in, or its initialization fails, the CPU
// backend is used instead.
//
// Returns the backend handle, or a null handle if ggml_backend_cpu_init()
// also failed.
static ggml_backend_t create_backend() {
    ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA
    fprintf(stderr, "%s: using CUDA backend\n", __func__);
    backend = ggml_backend_cuda_init(0); // init device 0
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
    }
#elif defined(GGML_USE_METAL)
    // `defined(...)` keeps this guard consistent with the `#ifdef` checks used
    // for the includes and also works when the macro is defined without a value
    fprintf(stderr, "%s: using Metal backend\n", __func__);
    backend = ggml_backend_metal_init();
    if (!backend) {
        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
    }
#endif

    // fall back to the CPU backend when no GPU backend is available
    if (!backend) {
        fprintf(stderr, "%s: using CPU backend\n", __func__);
        backend = ggml_backend_cpu_init();
    }
    return backend;
}
|
||
// Reports the free and total memory of the backend device.
//
// free_mem  - out: receives the amount of free memory
// total_mem - out: receives the total amount of memory
//
// With GGML_USE_CUDA the values are queried from CUDA device 0. For every
// other build both outputs are set to the placeholder value 1, because the
// query is not implemented for those backends yet.
// Both pointers must be non-null.
static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
#ifdef GGML_USE_CUDA
    ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
#else
    // TODO: implement for other backends
    *free_mem  = 1;
    *total_mem = 1;
#endif
}
|
||
// Entry point of the standalone RPC server.
//
// Usage: rpc-server <host> <port>
//
// Validates the command line, creates a backend, queries its memory and hands
// everything to start_rpc_server() using the endpoint string "<host>:<port>".
// Once start_rpc_server() returns, the backend is freed.
//
// Returns 0 after the server call returns, or 1 on a usage error, an invalid
// port, or a failure to create the backend.
int main(int argc, char * argv[]) {
    if (argc < 3) {
        fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]);
        return 1;
    }
    const char * host = argv[1];

    // std::stoi throws on non-numeric or out-of-range input; report that as an
    // invalid port instead of letting the exception terminate the process
    int port = 0;
    try {
        const std::string port_arg = argv[2];
        size_t n_parsed = 0;
        port = std::stoi(port_arg, &n_parsed);
        if (n_parsed != port_arg.size()) {
            // trailing characters after the number, e.g. "50052abc"
            fprintf(stderr, "Invalid port number: %s\n", argv[2]);
            return 1;
        }
    } catch (const std::logic_error &) {
        // std::invalid_argument and std::out_of_range both derive from std::logic_error
        fprintf(stderr, "Invalid port number: %s\n", argv[2]);
        return 1;
    }
    if (port <= 0 || port > 65535) {
        fprintf(stderr, "Invalid port number: %d\n", port);
        return 1;
    }

    ggml_backend_t backend = create_backend();
    if (!backend) {
        fprintf(stderr, "Failed to create backend\n");
        return 1;
    }

    printf("Starting RPC server on %s:%d\n", host, port);
    size_t free_mem  = 0;
    size_t total_mem = 0;
    get_backend_memory(&free_mem, &total_mem);

    std::string endpoint = std::string(host) + ":" + std::to_string(port);
    start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
    ggml_backend_free(backend);
    return 0;
}
Oops, something went wrong.