ggml : move headers one up [no ci]
ggerganov committed Jun 24, 2024
1 parent 362a975 commit 46b9bee
Showing 45 changed files with 3,557 additions and 765 deletions.
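For code that consumes ggml, the change amounts to dropping the `ggml/` directory prefix from include paths. A representative before/after, taken from the diffs below:

```c
// before this commit: public headers lived in include/ggml/
#include "ggml/ggml.h"
#include "ggml/ggml-alloc.h"
#include "ggml/ggml-backend.h"

// after this commit: public headers live directly in include/
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
```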
10 changes: 1 addition & 9 deletions .gitignore
@@ -1,13 +1,5 @@
 build/
-build-blas/
-build-debug/
-build-release/
-build-sanitize-addr/
-build-sanitize-thread/
-build-cov/
-build-ci-debug/
-build-ci-release/
-build-cublas/
+build-*/
 out/
 tmp/
 models/
6 changes: 3 additions & 3 deletions examples/gpt-2/main-alloc.cpp
@@ -1,6 +1,6 @@
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
 
 #include "common.h"
 #include "common-ggml.h"
6 changes: 3 additions & 3 deletions examples/gpt-2/main-backend.cpp
@@ -1,6 +1,6 @@
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
 
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
6 changes: 3 additions & 3 deletions examples/gpt-2/main-batched.cpp
@@ -1,6 +1,6 @@
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
 
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
2 changes: 1 addition & 1 deletion examples/gpt-2/main-ctx.cpp
@@ -1,4 +1,4 @@
-#include "ggml/ggml.h"
+#include "ggml.h"
 
 #include "common.h"
 #include "common-ggml.h"
6 changes: 3 additions & 3 deletions examples/gpt-2/main-sched.cpp
@@ -1,6 +1,6 @@
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
 
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
2 changes: 1 addition & 1 deletion examples/gpt-2/quantize.cpp
@@ -1,4 +1,4 @@
-#include "ggml/ggml.h"
+#include "ggml.h"
 
 #include "common.h"
 #include "common-ggml.h"
2 changes: 1 addition & 1 deletion examples/gpt-j/README.md
@@ -147,7 +147,7 @@ sys. 0m7.103s
 ## Implementation details
 
 The high level implementation of the model is contained in the [main.cpp](main.cpp) file. The core computations are
-performed by the [ggml](https://github.com/ggerganov/ggml/blob/master/include/ggml/ggml.h) library.
+performed by the [ggml](https://github.com/ggerganov/ggml/blob/master/include/ggml.h) library.
 
 
 #### Matrix multiplication
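The README's matrix-multiplication section is collapsed in this view. As an illustration only (not part of this commit), here is a minimal sketch of a ggml matrix multiplication using the relocated header; the sizes and fill values are arbitrary and the code assumes the public ggml context API:

```c
#include "ggml.h"

#include <stdio.h>

int main(void) {
    // scratch context; ggml allocates tensors and the graph from this pool
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // a is 4x2 and b is 4x3; ggml_mul_mat contracts over ne[0], so c is 2x3
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    ggml_set_f32(a, 1.0f); // constant fills, just for the sketch
    ggml_set_f32(b, 2.0f);

    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 1);

    printf("c[0] = %f\n", ggml_get_f32_1d(c, 0)); // dot of 4 ones with 4 twos = 8
    ggml_free(ctx);
    return 0;
}
```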
2 changes: 1 addition & 1 deletion examples/gpt-j/main.cpp
@@ -1,4 +1,4 @@
-#include "ggml/ggml.h"
+#include "ggml.h"
 
 #include "common.h"
 #include "common-ggml.h"
2 changes: 1 addition & 1 deletion examples/gpt-j/quantize.cpp
@@ -1,4 +1,4 @@
-#include "ggml/ggml.h"
+#include "ggml.h"
 
 #include "common.h"
 #include "common-ggml.h"
7 changes: 4 additions & 3 deletions examples/magika/main.cpp
@@ -1,6 +1,7 @@
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
 #include <algorithm>
 #include <cmath>
 #include <numeric>
2 changes: 1 addition & 1 deletion examples/mnist/main-cnn.cpp
@@ -1,4 +1,4 @@
-#include "ggml/ggml.h"
+#include "ggml.h"
 
 #include "common.h"
 
2 changes: 1 addition & 1 deletion examples/mnist/main-cpu.cpp
@@ -10,7 +10,7 @@
 // $ ./bin/mnist-cpu ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
 //
 
-#include "ggml/ggml.h"
+#include "ggml.h"
 
 #include <algorithm>
 #include <cmath>
2 changes: 1 addition & 1 deletion examples/mnist/main-mtl.cpp
@@ -10,7 +10,7 @@
 // $ ./bin/mnist-mtl ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
 //
 
-#include "ggml/ggml.h"
+#include "ggml.h"
 
 #include "main-mtl.h"
 
2 changes: 1 addition & 1 deletion examples/mnist/main-mtl.m
@@ -1,6 +1,6 @@
 #import "main-mtl.h"
 
-#import "ggml/ggml.h"
+#import "ggml.h"
 
 #import <Foundation/Foundation.h>
 #import <Metal/Metal.h>
2 changes: 1 addition & 1 deletion examples/mnist/main.cpp
@@ -1,4 +1,4 @@
-#include "ggml/ggml.h"
+#include "ggml.h"
 
 #include "common.h"
 
4 changes: 2 additions & 2 deletions examples/python/README.md
@@ -93,7 +93,7 @@ You can also edit [api.h](./api.h) to control which files should be included in
 In fact, if you wanted to only generate bindings for the current version of the `ggml` repo itself (instead of `llama.cpp`; you'd lose support for k-quants), you could run:
 
 ```bash
-API=../../include/ggml/ggml.h python regenerate.py
+API=../../include/ggml.h python regenerate.py
 ```
 
 ## Develop
@@ -109,7 +109,7 @@ pytest
 This example's goal is to showcase [cffi](https://cffi.readthedocs.io/)-generated bindings that are trivial to use and update, but there are already alternatives in the wild:
 
 - https://github.com/abetlen/ggml-python: these bindings seem to be hand-written and use [ctypes](https://docs.python.org/3/library/ctypes.html). It has [high-quality API reference docs](https://ggml-python.readthedocs.io/en/latest/api-reference/#ggml.ggml) that can be used with these bindings too, but it doesn't expose Metal, CUDA, MPI or OpenCL calls, doesn't support transparent (de/re)quantization like this example does (see the [ggml.utils](./ggml/utils.py) module), and won't pick up your local changes.
 
 - https://github.com/abetlen/llama-cpp-python: these expose the C++ `llama.cpp` interface, which this example cannot easily be extended to support (`cffi` only generates bindings for C libraries)
 
 - [pybind11](https://github.com/pybind/pybind11) and [nanobind](https://github.com/wjakob/nanobind) are two alternatives to cffi that support binding C++ libraries, but neither of them seems to have an automatic generator (writing bindings is rather time-consuming).
4 changes: 2 additions & 2 deletions examples/simple/simple-backend.cpp
@@ -1,6 +1,6 @@
 #include "ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
 
 #ifdef GGML_USE_CUDA
 #include "ggml-cuda.h"
2 changes: 1 addition & 1 deletion examples/yolo/yolov3-tiny.cpp
@@ -1,4 +1,4 @@
-#include "ggml/ggml.h"
+#include "ggml.h"
 #include "yolo-image.h"
 
 #include <cmath>
76 changes: 76 additions & 0 deletions include/ggml-alloc.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;
+
+// Tensor allocator
+struct ggml_tallocr {
+    ggml_backend_buffer_t buffer;
+    void * base;
+    size_t alignment;
+    size_t offset;
+};
+
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
+
+// Graph allocator
+/*
+  Example usage:
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
+
+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+    ggml_gallocr_reserve(galloc, build_graph(max_batch));
+
+    // allocate the graph
+    struct ggml_cgraph * graph = build_graph(batch);
+    ggml_gallocr_alloc_graph(galloc, graph);
+
+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
+
+    // evaluate the graph
+    ggml_backend_graph_compute(backend, graph);
+*/
+
+// special tensor flags for use with the graph allocator:
+//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+//   ggml_set_output(): output tensors are never freed and never overwritten
+
+typedef struct ggml_gallocr * ggml_gallocr_t;
+
+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
+
+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+// call with a worst-case graph to avoid buffer reallocations
+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+// returns false if the buffer allocation failed
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
+
+// automatic reallocation if the topology changes when using a single buffer
+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+
+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
+
+#ifdef __cplusplus
+}
+#endif
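A sketch of the graph-allocator workflow declared above, fleshing out the example from the header comment. `build_graph()` is a hypothetical user function that constructs a `ggml_cgraph` for a given batch size, and `backend` is assumed to be an already-initialized `ggml_backend_t`; `ggml_backend_get_default_buffer_type()` comes from `ggml-backend.h`:

```c
#include <stdio.h>

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

// hypothetical: builds the compute graph for a given batch size
extern struct ggml_cgraph * build_graph(int n_batch);

void run(ggml_backend_t backend, int max_batch, int batch) {
    // one allocator, backed by the backend's default buffer type
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

    // optional: reserve once with a worst-case graph to avoid later reallocations
    ggml_gallocr_reserve(galloc, build_graph(max_batch));

    // allocate the actual graph; with a single buffer this reallocates
    // automatically if the topology changed
    struct ggml_cgraph * graph = build_graph(batch);
    ggml_gallocr_alloc_graph(galloc, graph);

    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

    // evaluate the graph on the backend
    ggml_backend_graph_compute(backend, graph);

    ggml_gallocr_free(galloc);
}
```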