backend : offload large batches to GPU
slaren committed Mar 15, 2024
1 parent 46acb36 commit 19add48
Showing 8 changed files with 162 additions and 191 deletions.
10 changes: 7 additions & 3 deletions ggml-alloc.c
@@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];

-        if (ggml_is_view(node)) {
+        // TODO: better way to add external dependencies
+        // GGML_OP_NONE does not appear normally in the graph nodes, but is used by ggml-backend to add dependencies to
+        // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node
+        // itself is never used and should not be considered a dependency
+        if (ggml_is_view(node) && node->op != GGML_OP_NONE) {
             struct ggml_tensor * view_src = node->view_src;
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }
@@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr

             ggml_gallocr_hash_get(galloc, src)->n_children += 1;

-            // allocate explicit inputs and leafs
-            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+            // allocate explicit inputs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT) {
                 ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
             }
         }
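For context on the first hunk: the allocator now skips view nodes whose op is GGML_OP_NONE when counting views, because ggml-backend uses such nodes purely as carriers of dependencies. A hypothetical sketch of that pattern (the names ctx, controlled, and must_run_first are illustrative assumptions, not code from this commit):

    // a node that is a view (view_src != NULL) but whose op is GGML_OP_NONE:
    // it is never computed; its src[] entries only express allocation/ordering
    // constraints, so it must not count as a real use of its view_src
    struct ggml_tensor * dep = ggml_view_tensor(ctx, controlled); // dep->view_src == controlled
    dep->op     = GGML_OP_NONE;    // never executed as an operation
    dep->src[0] = must_run_first;  // dependency only, as the new comment describes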
5 changes: 5 additions & 0 deletions ggml-backend-impl.h
@@ -103,6 +103,11 @@ extern "C" {
         // check if the backend supports an operation
         bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

+        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
+        // these should be expensive operations with large batch sizes that may benefit from running on this backend
+        // even if the weight has to be copied from the CPU temporarily
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+
         // (optional) event synchronization
         ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
         void (*GGML_CALL event_free) (ggml_backend_event_t event);
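The offload_op hook declared above is optional; each backend decides for itself which operations are worth pulling onto it even when the weights sit in a CPU buffer. A minimal sketch of what a GPU backend's implementation might look like, assuming a hypothetical batch-size threshold (the function name and the value 32 are illustrative, not taken from this commit):

    // accept only ops whose batch dimension is large enough that the compute
    // saved on the GPU outweighs copying the CPU-resident weight to the device
    static bool example_gpu_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
        (void) backend;                      // the decision here depends only on the op
        const int64_t min_batch_size = 32;   // assumed threshold
        return op->op == GGML_OP_MUL_MAT && op->ne[1] >= min_batch_size;
    }

A backend that implements this would point its /* .offload_op = */ entry at such a function; the CPU backend below leaves the slot NULL, and ggml_backend_offload_op (added in ggml-backend.c below) returns false when the hook is NULL.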
43 changes: 39 additions & 4 deletions ggml-backend.c
@@ -278,14 +278,21 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_
     return err;
 }

-bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     return backend->iface.graph_compute(backend, cgraph);
 }

 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
     return backend->iface.supports_op(backend, op);
 }

+bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+    if (backend->iface.offload_op != NULL) {
+        return backend->iface.offload_op(backend, op);
+    }
+    return false;
+}
+
 // backend copy

 static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
@@ -834,6 +841,7 @@ static struct ggml_backend_i cpu_backend_i = {
     /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
     /* .graph_compute = */ ggml_backend_cpu_graph_compute,
     /* .supports_op = */ ggml_backend_cpu_supports_op,
+    /* .offload_op = */ NULL,
     /* .event_new = */ NULL,
     /* .event_free = */ NULL,
     /* .event_record = */ NULL,
@@ -999,7 +1007,7 @@ static bool ggml_is_view_op(enum ggml_op op) {
 #endif

 #ifndef GGML_SCHED_MAX_SPLITS
-#define GGML_SCHED_MAX_SPLITS 256
+#define GGML_SCHED_MAX_SPLITS 1024
 #endif

 #ifndef GGML_SCHED_MAX_SPLIT_INPUTS
@@ -1138,14 +1146,23 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
     }

     // assign nodes that use weights to the backend of the weights
+    // operations with weights are preferably run on the same backend as the weights
     for (int i = 0; i < GGML_MAX_SRC; i++) {
         const struct ggml_tensor * src = tensor->src[i];
         if (src == NULL) {
             continue;
         }
         if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend = ggml_backend_sched_backend_from_buffer(sched, src);
-            // operations with weights are always run on the same backend as the weights
+            // check if a backend with higher prio wants to run the op
+            if (src_backend == sched->n_backends - 1) {
+                for (int b = 0; b < src_backend; b++) {
+                    if (ggml_backend_offload_op(sched->backends[b], tensor)) {
+                        SET_CAUSE(tensor, "1.off");
+                        return b;
+                    }
+                }
+            }
             SET_CAUSE(tensor, "1.wgt%d", i);
             return src_backend;
         }
@@ -1404,7 +1421,25 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg

         GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now

-        if (tensor_backend_id != cur_backend_id) {
+        // check if a weight is on a different backend and start a new split if so
+        bool offload = false;
+        if (tensor_backend_id == cur_backend_id && sched->splits[cur_split].n_inputs > 0) {
+            for (int j = 0; j < GGML_MAX_SRC; j++) {
+                struct ggml_tensor * src = node->src[j];
+                if (src == NULL) {
+                    continue;
+                }
+                if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+                    int src_backend_id = tensor_backend_id(src);
+                    if (src_backend_id != -1 && src_backend_id != cur_backend_id) {
+                        offload = true;
+                        break;
+                    }
+                }
+            }
+        }
+
+        if (tensor_backend_id != cur_backend_id || offload) {
             sched->splits[cur_split].i_end = i;
             cur_split++;
             GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS);
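Taken together, the scheduler changes above work as follows: when the backend that owns a node's weight is the last one in priority order (typically the CPU), ggml_backend_sched_backend_id_from_cur now asks each higher-priority backend via ggml_backend_offload_op and assigns the node to the first backend that accepts it; the split pass additionally starts a new split when a node that stays on the current backend uses a weight stored on a different backend, which is presumably why GGML_SCHED_MAX_SPLITS grows from 256 to 1024. The same decision can be expressed with the public API alone; a minimal sketch, assuming gpu, cpu, and node already exist:

    // route a node to the GPU backend only if it both supports the op and
    // asks to run it despite the weight living in a CPU buffer
    ggml_backend_t target = cpu;
    if (ggml_backend_supports_op(gpu, node) && ggml_backend_offload_op(gpu, node)) {
        target = gpu;   // the weight will be copied to the device for this batch
    }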
8 changes: 4 additions & 4 deletions ggml-backend.h
@@ -70,11 +70,11 @@ extern "C" {
     GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-    GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-    GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
     GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

     // tensor copy between different backends
     GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
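Since ggml_backend_graph_compute_async now returns enum ggml_status rather than bool, callers should check the status value. A minimal usage sketch (backend and graph are assumed to be set up elsewhere; requires <stdio.h> for the error print):

    enum ggml_status status = ggml_backend_graph_compute_async(backend, graph);
    if (status != GGML_STATUS_SUCCESS) {
        fprintf(stderr, "async graph compute failed with status %d\n", (int) status);
    }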