Skip to content

Commit

Permalink
Merge branch 'ggerganov:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
sealad886 authored Jun 9, 2024
2 parents eeda2de + 3e2ee44 commit 516d7b7
Show file tree
Hide file tree
Showing 15 changed files with 668 additions and 397 deletions.
30 changes: 22 additions & 8 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,19 +200,13 @@ void gpt_params_handle_model_default(gpt_params & params) {
}
params.hf_file = params.model;
} else if (params.model.empty()) {
std::string cache_directory = fs_get_cache_directory();
const bool success = fs_create_directory_with_parents(cache_directory);
if (!success) {
throw std::runtime_error("failed to create cache directory: " + cache_directory);
}
params.model = cache_directory + string_split(params.hf_file, '/').back();
params.model = fs_get_cache_file(string_split(params.hf_file, '/').back());
}
} else if (!params.model_url.empty()) {
if (params.model.empty()) {
auto f = string_split(params.model_url, '#').front();
f = string_split(f, '?').front();
f = string_split(f, '/').back();
params.model = "models/" + f;
params.model = fs_get_cache_file(string_split(f, '/').back());
}
} else if (params.model.empty()) {
params.model = DEFAULT_MODEL_PATH;
Expand Down Expand Up @@ -1491,6 +1485,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.chat_template = argv[i];
return true;
}
if (arg == "--slot-prompt-similarity" || arg == "-sps") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.slot_prompt_similarity = std::stof(argv[i]);
return true;
}
if (arg == "-pps") {
params.is_pp_shared = true;
return true;
Expand Down Expand Up @@ -1913,6 +1915,8 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"set custom jinja chat template (default: template taken from model's metadata)\n"
"only commonly used templates are accepted:\n"
"https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
"how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });

#ifndef LOG_DISABLE_LOGS
options.push_back({ "logging" });
Expand Down Expand Up @@ -2269,6 +2273,16 @@ std::string fs_get_cache_directory() {
return ensure_trailing_slash(cache_directory);
}

std::string fs_get_cache_file(const std::string & filename) {
GGML_ASSERT(filename.find(DIRECTORY_SEPARATOR) == std::string::npos);
std::string cache_directory = fs_get_cache_directory();
const bool success = fs_create_directory_with_parents(cache_directory);
if (!success) {
throw std::runtime_error("failed to create cache directory: " + cache_directory);
}
return cache_directory + filename;
}


//
// Model utils
Expand Down
3 changes: 3 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,8 @@ struct gpt_params {

std::string slot_save_path;

float slot_prompt_similarity = 0.5f;

// batched-bench params
bool is_pp_shared = false;

Expand Down Expand Up @@ -275,6 +277,7 @@ bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);

std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);

//
// Model utils
Expand Down
42 changes: 22 additions & 20 deletions convert-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,12 @@ class Model:
_model_classes: dict[str, type[Model]] = {}

dir_model: Path
ftype: int
ftype: gguf.LlamaFileType
is_big_endian: bool
endianess: gguf.GGUFEndian
use_temp_file: bool
lazy: bool
model_name: str | None
part_names: list[str]
is_safetensors: bool
hparams: dict[str, Any]
Expand All @@ -64,7 +65,7 @@ class Model:
# subclasses should define this!
model_arch: gguf.MODEL_ARCH

def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model
Expand All @@ -73,10 +74,11 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file
self.lazy = not eager
self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
self.model_name = model_name
self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = Model.load_hparams(self.dir_model)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
Expand All @@ -94,7 +96,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path,
ftype_lw: str = ftype_up.lower()
# allow templating the file name with the output ftype, useful with the "auto" ftype
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)

@classmethod
def __init_subclass__(cls):
Expand Down Expand Up @@ -182,7 +184,7 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", "
return new_name

def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.block_count)

if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
Expand Down Expand Up @@ -324,21 +326,21 @@ def write_tensors(self):

def write(self):
self.write_tensors()
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file(progress=True)
self.gguf_writer.close()

def write_vocab(self):
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.close()

@staticmethod
def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
part_names: list[str] = []
for filename in os.listdir(dir_model):
if filename.endswith(suffix):
if filename.startswith(prefix) and filename.endswith(suffix):
part_names.append(filename)

part_names.sort()
Expand Down Expand Up @@ -665,7 +667,7 @@ class GPTNeoXModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]

self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
Expand Down Expand Up @@ -798,7 +800,7 @@ def set_vocab(self):

def set_gguf_parameters(self):
block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_block_count(block_count)
Expand Down Expand Up @@ -850,7 +852,7 @@ def set_gguf_parameters(self):
raise ValueError("gguf: can not find ctx length parameter.")

self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
Expand Down Expand Up @@ -887,7 +889,7 @@ def set_gguf_parameters(self):
else:
raise ValueError("gguf: can not find ctx length parameter.")

self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
Expand Down Expand Up @@ -1010,7 +1012,7 @@ def set_gguf_parameters(self):
else:
raise ValueError("gguf: can not find ctx length parameter.")

self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
Expand Down Expand Up @@ -1206,7 +1208,7 @@ def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]

self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
Expand Down Expand Up @@ -1681,7 +1683,7 @@ class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2

def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_context_length(self.hparams["n_ctx"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
Expand Down Expand Up @@ -2248,7 +2250,7 @@ def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]

self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
Expand Down Expand Up @@ -2348,7 +2350,7 @@ def set_gguf_parameters(self):
# Fail early for models which don't have a block expansion factor of 2
assert d_inner == 2 * d_model

self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
Expand Down Expand Up @@ -2852,7 +2854,7 @@ def main() -> None:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)

model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)

logger.info("Set model parameters")
model_instance.set_gguf_parameters()
Expand Down
10 changes: 0 additions & 10 deletions examples/rpc/rpc-server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@
#include "ggml-metal.h"
#endif

#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif

#include "ggml-rpc.h"
#ifdef _WIN32
# include <windows.h>
Expand Down Expand Up @@ -83,12 +79,6 @@ static ggml_backend_t create_backend() {
if (!backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
}
#elif GGML_USE_SYCL
fprintf(stderr, "%s: using SYCL backend\n", __func__);
backend = ggml_backend_sycl_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
}
#endif

// if there aren't GPU Backends fallback to CPU backend
Expand Down
2 changes: 1 addition & 1 deletion examples/server/public/index-new.html
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@
message = html`<${Probabilities} data=${data} />`
} else {
const text = isArrayMessage ?
data.map(msg => msg.content).join('').replace(/^\s+/, '') :
data.map(msg => msg.content).join('') :
data;
message = isCompletionMode ?
text :
Expand Down
Loading

0 comments on commit 516d7b7

Please sign in to comment.