Skip to content

Commit

Permalink
llama : reduce useless copies when saving session (ggerganov#8916)
Browse files: browse the repository at this point in the history
* llama : avoid useless copies in dummy session writer

* llama : avoid double tensor copy when saving session to buffer
  • Loading branch information
compilade authored and arthw committed Nov 18, 2024
1 parent b270371 commit 6ac252e
Showing 1 changed file with 28 additions and 11 deletions.
39 changes: 28 additions & 11 deletions src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17348,6 +17348,7 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
// TODO: replace all non-fatal assertions with returned errors or exceptions
struct llama_data_write {
// Emit `size` bytes starting at `src` to the output the concrete writer targets.
virtual void write(const void * src, size_t size) = 0;
// Emit `size` bytes of `tensor`'s data, starting at byte `offset` within the tensor.
// Kept separate from write() so each writer decides whether (and where) the tensor
// bytes actually get fetched from the backend.
virtual void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) = 0;
// Running total of bytes accounted for by this writer so far.
virtual size_t get_size_written() = 0;
// Virtual destructor: this struct is used as a polymorphic base.
virtual ~llama_data_write() = default;

Expand Down Expand Up @@ -17470,9 +17471,8 @@ struct llama_data_write {
// Read each range of cells of k_size length each into tmp_buf and write out
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
tmp_buf.resize(range_size * k_size_row);
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row);
write(tmp_buf.data(), tmp_buf.size());
const size_t buf_size = range_size * k_size_row;
write_tensor_data(kv_self.k_l[il], range.first * k_size_row, buf_size);
}
}

Expand All @@ -17491,9 +17491,8 @@ struct llama_data_write {
// Read each range of cells of v_size length each into tmp_buf and write out
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
tmp_buf.resize(range_size * v_size_row);
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row);
write(tmp_buf.data(), tmp_buf.size());
const size_t buf_size = range_size * v_size_row;
write_tensor_data(kv_self.v_l[il], range.first * v_size_row, buf_size);
}
}
} else {
Expand All @@ -17519,9 +17518,8 @@ struct llama_data_write {
for (const auto & range : cell_ranges) {
const size_t range_size = range.second - range.first;
const size_t src_offset = (range.first + j * kv_size) * v_size_el;
tmp_buf.resize(range_size * v_size_el);
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size());
write(tmp_buf.data(), tmp_buf.size());
const size_t buf_size = range_size * v_size_el;
write_tensor_data(kv_self.v_l[il], src_offset, buf_size);
}
}
}
Expand Down Expand Up @@ -17880,12 +17878,14 @@ struct llama_data_write_dummy : llama_data_write {

// Default constructor; the body is empty, no setup is performed here.
llama_data_write_dummy() {}

// TODO: avoid unnecessary calls to ggml_backend_tensor_get in a dummy context

void write(const void * /* src */, size_t size) override {
size_written += size;
}

void write_tensor_data(const struct ggml_tensor * /* tensor */, size_t /* offset */, size_t size) override {
size_written += size;
}

size_t get_size_written() override {
return size_written;
}
Expand All @@ -17908,6 +17908,16 @@ struct llama_data_write_buffer : llama_data_write {
buf_size -= size;
}

void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
if (size > buf_size) {
throw std::runtime_error("unexpectedly reached end of buffer");
}
ggml_backend_tensor_get(tensor, ptr, offset, size);
ptr += size;
size_written += size;
buf_size -= size;
}

size_t get_size_written() override {
return size_written;
}
Expand Down Expand Up @@ -17943,6 +17953,7 @@ struct llama_data_read_buffer : llama_data_read {
struct llama_data_write_file : llama_data_write {
llama_file * file;
size_t size_written = 0;
std::vector<uint8_t> temp_buffer;

// Stores the provided llama_file pointer in `file`; nothing else happens at construction.
llama_data_write_file(llama_file * f) : file(f) {}

Expand All @@ -17951,6 +17962,12 @@ struct llama_data_write_file : llama_data_write {
size_written += size;
}

void write_tensor_data(const struct ggml_tensor * tensor, size_t offset, size_t size) override {
temp_buffer.resize(size);
ggml_backend_tensor_get(tensor, temp_buffer.data(), offset, size);
write(temp_buffer.data(), temp_buffer.size());
}

size_t get_size_written() override {
return size_written;
}
Expand Down

0 comments on commit 6ac252e

Please sign in to comment.