From dca7ad86273c1316a65bb5f2286f3c3964bada95 Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Wed, 7 Aug 2024 15:42:11 -0400
Subject: [PATCH 1/2] llama : avoid useless copies in dummy session writer

---
 src/llama.cpp | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index a7b1c9ebd9e37..82e541c3c3a27 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17346,6 +17346,14 @@ struct llama_data_write {
     virtual size_t get_size_written() = 0;
     virtual ~llama_data_write() = default;
 
+    std::vector<uint8_t> temp_buffer;
+
+    virtual void * get_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) {
+        temp_buffer.resize(size);
+        ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+        return temp_buffer.data();
+    }
+
     void write_string(const std::string & str) {
         uint32_t str_size = str.size();
 
@@ -17465,9 +17473,9 @@ struct llama_data_write {
             // Read each range of cells of k_size length each into tmp_buf and write out
             for (const auto & range : cell_ranges) {
                 const size_t range_size = range.second - range.first;
-                tmp_buf.resize(range_size * k_size_row);
-                ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
-                write(tmp_buf.data(), tmp_buf.size());
+                const size_t buf_size = range_size * k_size_row;
+                const void * data = get_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
+                write(data, buf_size);
             }
         }
 
@@ -17486,9 +17494,9 @@ struct llama_data_write {
                 // Read each range of cells of v_size length each into tmp_buf and write out
                 for (const auto & range : cell_ranges) {
                     const size_t range_size = range.second - range.first;
-                    tmp_buf.resize(range_size * v_size_row);
-                    ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
-                    write(tmp_buf.data(), tmp_buf.size());
+                    const size_t buf_size = range_size * v_size_row;
+                    const void * data = get_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
+                    write(data, buf_size);
                 }
             }
         } else {
@@ -17514,9 +17522,9 @@ struct llama_data_write {
                     for (const auto & range : cell_ranges) {
                         const size_t range_size = range.second - range.first;
                         const size_t src_offset = (range.first + j * kv_size) * v_size_el;
-                        tmp_buf.resize(range_size * v_size_el);
-                        ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
-                        write(tmp_buf.data(), tmp_buf.size());
+                        const size_t buf_size = range_size * v_size_el;
+                        const void * data = get_tensor_data(kv_self.v_l[il], src_offset, buf_size);
+                        write(data, buf_size);
                     }
                 }
             }
@@ -17881,6 +17889,10 @@ struct llama_data_write_dummy : llama_data_write {
         size_written += size;
     }
 
+    void * get_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t /* size */) override {
+        return nullptr;
+    }
+
     size_t get_size_written() override {
         return size_written;
     }

From 9329953a61215de3b0cc458c50e8e3b955ec10ad Mon Sep 17 00:00:00 2001
From: Francis Couture-Harpin
Date: Wed, 7 Aug 2024 16:03:17 -0400
Subject: [PATCH 2/2] llama : avoid double tensor copy when saving session to
 buffer

---
 src/llama.cpp | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index 82e541c3c3a27..68512d2ef70bb 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -17343,17 +17343,10 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
 // TODO: replace all non-fatal assertions with returned errors or exceptions
 struct llama_data_write {
     virtual void write(const void * src, size_t size) = 0;
+    virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
     virtual size_t get_size_written() = 0;
     virtual ~llama_data_write() = default;
 
-    std::vector<uint8_t> temp_buffer;
-
-    virtual void * get_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) {
-        temp_buffer.resize(size);
-        ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
-        return temp_buffer.data();
-    }
-
     void write_string(const std::string & str) {
         uint32_t str_size = str.size();
 
@@ -17474,8 +17467,7 @@ struct llama_data_write {
             for (const auto & range : cell_ranges) {
                 const size_t range_size = range.second - range.first;
                 const size_t buf_size = range_size * k_size_row;
-                const void * data = get_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
-                write(data, buf_size);
+                write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
             }
         }
 
@@ -17495,8 +17487,7 @@ struct llama_data_write {
                 for (const auto & range : cell_ranges) {
                     const size_t range_size = range.second - range.first;
                     const size_t buf_size = range_size * v_size_row;
-                    const void * data = get_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
-                    write(data, buf_size);
+                    write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
                 }
             }
         } else {
@@ -17523,8 +17514,7 @@ struct llama_data_write {
                         const size_t range_size = range.second - range.first;
                         const size_t src_offset = (range.first + j * kv_size) * v_size_el;
                         const size_t buf_size = range_size * v_size_el;
-                        const void * data = get_tensor_data(kv_self.v_l[il], src_offset, buf_size);
-                        write(data, buf_size);
+                        write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
                     }
                 }
             }
@@ -17883,14 +17873,12 @@ struct llama_data_write_dummy : llama_data_write {
     llama_data_write_dummy() {}
 
-    // TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context
-
     void write(const void * /* src */, size_t size) override {
         size_written += size;
     }
 
-    void * get_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t /* size */) override {
-        return nullptr;
+    void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
+        size_written += size;
     }
 
     size_t get_size_written() override {
         return size_written;
     }
@@ -17915,6 +17903,16 @@ struct llama_data_write_buffer : llama_data_write {
         buf_size -= size;
     }
 
+    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+        if (size > buf_size) {
+            throw std::runtime_error("unexpectedly reached end of buffer");
+        }
+        ggml_backend_tensor_get(tensor, ptr, offset, size);
+        ptr += size;
+        size_written += size;
+        buf_size -= size;
+    }
+
     size_t get_size_written() override {
         return size_written;
     }
@@ -17950,6 +17948,7 @@ struct llama_data_read_buffer : llama_data_read {
 struct llama_data_write_file : llama_data_write {
     llama_file * file;
     size_t size_written = 0;
+    std::vector<uint8_t> temp_buffer;
 
     llama_data_write_file(llama_file * f) : file(f) {}
 
@@ -17958,6 +17957,12 @@ struct llama_data_write_file : llama_data_write {
         size_written += size;
     }
 
+    void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
+        temp_buffer.resize(size);
+        ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
+        write(temp_buffer.data(), temp_buffer.size());
+    }
+
    size_t get_size_written() override {
        return size_written;
    }
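
Taken together, the two patches replace the pull-style get_tensor_data() helper with a push-style write_tensor_data() virtual, so each writer decides how (and whether) tensor data is fetched from the backend: the dummy writer only counts bytes, the buffer writer reads straight into the destination buffer, and only the file writer still needs a temporary buffer. Below is a minimal standalone sketch of that pattern, not llama.cpp code: the tensor and the backend read are simulated (fake_tensor, fake_tensor_get and the writer names are illustrative only), but the split between a size-measuring pass and a copying pass mirrors the patched code.

// Standalone sketch: a simulated "tensor" is a byte array, and reading it
// back from the backend is modelled by a counted memcpy.
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

static size_t g_backend_reads = 0; // counts simulated backend -> host copies

struct fake_tensor { std::vector<uint8_t> bytes; };

// stand-in for a backend read: copy [offset, offset + size) into host memory
static void fake_tensor_get(const fake_tensor & t, void * dst, size_t offset, size_t size) {
    memcpy(dst, t.bytes.data() + offset, size);
    g_backend_reads++;
}

struct data_write {
    virtual void write(const void * src, size_t size) = 0;
    // push-style: the writer fetches tensor data only if it actually needs it
    virtual void write_tensor_data(const fake_tensor & t, size_t offset, size_t size) = 0;
    virtual size_t get_size_written() = 0;
    virtual ~data_write() = default;
};

// dummy writer: only measures the state size, never touches tensor data
struct data_write_dummy : data_write {
    size_t size_written = 0;
    void write(const void * /* src */, size_t size) override { size_written += size; }
    void write_tensor_data(const fake_tensor & /* t */, size_t /* offset */, size_t size) override {
        size_written += size; // no backend read, no temporary buffer
    }
    size_t get_size_written() override { return size_written; }
};

// buffer writer: reads tensor data directly into the destination buffer (single copy)
struct data_write_buffer : data_write {
    uint8_t * ptr;
    size_t    buf_size;
    size_t    size_written = 0;
    data_write_buffer(uint8_t * p, size_t len) : ptr(p), buf_size(len) {}
    void write(const void * src, size_t size) override {
        memcpy(ptr, src, size);
        ptr += size; buf_size -= size; size_written += size;
    }
    void write_tensor_data(const fake_tensor & t, size_t offset, size_t size) override {
        fake_tensor_get(t, ptr, offset, size); // straight into the output buffer
        ptr += size; buf_size -= size; size_written += size;
    }
    size_t get_size_written() override { return size_written; }
};

static void save_state(data_write & out, const fake_tensor & t) {
    const uint32_t n = 1;
    out.write(&n, sizeof(n));                    // some plain field
    out.write_tensor_data(t, 0, t.bytes.size()); // tensor-backed field
}

int main() {
    fake_tensor t{std::vector<uint8_t>(1024, 0xAB)};

    data_write_dummy dummy;
    save_state(dummy, t);                         // pass 1: size only
    printf("state size: %zu, backend reads: %zu\n", dummy.get_size_written(), g_backend_reads);

    std::vector<uint8_t> buf(dummy.get_size_written());
    data_write_buffer writer(buf.data(), buf.size());
    save_state(writer, t);                        // pass 2: actual copy
    printf("written: %zu, backend reads: %zu\n", writer.get_size_written(), g_backend_reads);
    return 0;
}

With the pre-patch interface, the size-measuring pass would already have read the tensor back from the backend and the buffer pass would have copied it twice; in this sketch the dummy pass reports zero backend reads and the buffer pass exactly one.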