Commit

Merge branch 'ggerganov:master' into master
sealad886 authored Jun 21, 2024
2 parents 59e694d + 7d5e877 commit a42a18e
Showing 13 changed files with 867 additions and 95 deletions.
common/common.cpp: 2 additions & 0 deletions
@@ -541,6 +541,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; }
         else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; }
         else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; }
+        else if (value == "last") { params.pooling_type = LLAMA_POOLING_TYPE_LAST; }
         else { invalid_param = true; }
         return true;
     }
@@ -1869,6 +1870,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "backend" });
+    options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });

     if (llama_supports_mlock()) {
         options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
     }
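Note: the new "last" branch above means --pooling last now maps to LLAMA_POOLING_TYPE_LAST. As a rough illustration only (the helper name make_embedding_ctx and the assumption of an already-loaded model are mine, not part of this diff), the same choice can be made directly through the C API:

#include "llama.h"

// Illustrative sketch, not part of this commit: create a context that uses
// last-token pooling, i.e. the value the new "--pooling last" option maps to.
static llama_context * make_embedding_ctx(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings   = true;                     // ask decode for embeddings
    cparams.pooling_type = LLAMA_POOLING_TYPE_LAST;  // pool a sequence to its last token's state
    return llama_new_context_with_model(model, cparams);
}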
convert-hf-to-gguf-update.py: 2 additions & 2 deletions
@@ -214,15 +214,15 @@ def get_vocab_base_pre(self, tokenizer) -> str:
 """

 convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
-convert_py = convert_py_pth.read_text()
+convert_py = convert_py_pth.read_text(encoding="utf-8")
 convert_py = re.sub(
     r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
     lambda m: m.group(1) + src_func + m.group(3),
     convert_py,
     flags=re.DOTALL | re.MULTILINE,
 )

-convert_py_pth.write_text(convert_py)
+convert_py_pth.write_text(convert_py, encoding="utf-8")

 logger.info("+++ convert-hf-to-gguf.py was updated")
examples/embedding/embedding.cpp: 11 additions & 10 deletions
@@ -17,9 +17,10 @@ static std::vector<std::string> split_lines(const std::string & s) {
     return lines;
 }

-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }

@@ -40,13 +41,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu

         // try to get sequence embeddings - supported only when pooling_type is not NONE
         const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-        if (embd == NULL) {
-            embd = llama_get_embeddings_ith(ctx, i);
-            if (embd == NULL) {
-                fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
-                continue;
-            }
-        }
+        GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");

         float * out = output + batch.seq_id[i][0] * n_embd;
         //TODO: I would also add a parameter here to enable normalization or not.
@@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);

+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
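The two embedding.cpp changes go together: batch_add_seq now enables output for every token, and main() rejects pooling type NONE, so a pooled embedding can always be read back per sequence. A simplified sketch of the resulting calling pattern, written as if it lived in embedding.cpp next to batch_add_seq (the wrapper name embed_prompts is hypothetical, and it assumes all tokenized prompts fit in a single batch):

#include <algorithm>
#include <vector>
#include "llama.h"

// Simplified sketch: one sequence per prompt, every position requests output,
// then one pooled embedding per sequence is copied out after decoding.
static void embed_prompts(llama_context * ctx, llama_model * model,
                          const std::vector<std::vector<int32_t>> & prompts_tok,
                          std::vector<float> & out) {
    const int n_prompts = (int) prompts_tok.size();
    const int n_embd    = llama_n_embd(model);

    llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, n_prompts);
    for (llama_seq_id s = 0; s < n_prompts; s++) {
        batch_add_seq(batch, prompts_tok[s], s);              // all tokens, output enabled
    }
    llama_decode(ctx, batch);

    out.resize((size_t) n_prompts * n_embd);
    for (llama_seq_id s = 0; s < n_prompts; s++) {
        const float * emb = llama_get_embeddings_seq(ctx, s); // pooled vector of length n_embd
        std::copy(emb, emb + n_embd, out.begin() + (size_t) s * n_embd);
    }
    llama_batch_free(batch);
}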
examples/gritlm/gritlm.cpp: 4 additions & 2 deletions
@@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, true);
     llama_set_causal_attn(ctx, false);

     // run model
@@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
     llama_token eos_token = llama_token_eos(mdl);

     llama_kv_cache_clear(ctx);
+    llama_set_embeddings(ctx, false);
     llama_set_causal_attn(ctx, true);
+
     llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

     std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
@@ -166,8 +169,7 @@ int main(int argc, char * argv[]) {

     llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);

-    // create new context - set to embedding mode
-    cparams.embeddings = true;
+    // create generation context
     llama_context * ctx = llama_new_context_with_model(mdl, cparams);

     // ### Embedding/Representation ###
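Previously gritlm.cpp created the context with cparams.embeddings = true; this change toggles the mode at runtime instead, so one context serves both the embedding pass and the generation pass. Reduced to its core, the pattern is the sketch below (the wrapper name set_embedding_mode is mine; the three calls are the ones that appear together in encode() and generate() above):

#include "llama.h"

// Sketch of the runtime toggle used here: one llama_context is switched between
// embedding mode and generation mode instead of being created for just one.
static void set_embedding_mode(llama_context * ctx, bool embd) {
    llama_kv_cache_clear(ctx);          // prior cache contents are irrelevant across modes
    llama_set_embeddings(ctx, embd);    // true: pooled embeddings; false: logits
    llama_set_causal_attn(ctx, !embd);  // non-causal for embedding, causal for generation
}

encode() effectively performs the embd == true half before decoding, and generate() the embd == false half.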
examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift: 21 additions & 14 deletions
@@ -131,22 +131,29 @@ class LlamaState: ObservableObject {

         messageLog += "\(text)"

-        while await llamaContext.n_cur < llamaContext.n_len {
-            let result = await llamaContext.completion_loop()
-            messageLog += "\(result)"
-        }
+        Task.detached {
+            while await llamaContext.n_cur < llamaContext.n_len {
+                let result = await llamaContext.completion_loop()
+                await MainActor.run {
+                    self.messageLog += "\(result)"
+                }
+            }

-        let t_end = DispatchTime.now().uptimeNanoseconds
-        let t_generation = Double(t_end - t_heat_end) / NS_PER_S
-        let tokens_per_second = Double(await llamaContext.n_len) / t_generation
+            let t_end = DispatchTime.now().uptimeNanoseconds
+            let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
+            let tokens_per_second = Double(await llamaContext.n_len) / t_generation

-        await llamaContext.clear()
-        messageLog += """
-            \n
-            Done
-            Heat up took \(t_heat)s
-            Generated \(tokens_per_second) t/s\n
-            """
+            await llamaContext.clear()
+
+            await MainActor.run {
+                self.messageLog += """
+                    \n
+                    Done
+                    Heat up took \(t_heat)s
+                    Generated \(tokens_per_second) t/s\n
+                    """
+            }
+        }
     }

     func bench() async {
examples/retrieval/retrieval.cpp: 10 additions & 3 deletions
@@ -73,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
     return chunks;
 }

-static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
-    for (size_t i = 0; i < tokens.size(); i++) {
-        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
+static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
+    size_t n_tokens = tokens.size();
+    for (size_t i = 0; i < n_tokens; i++) {
+        llama_batch_add(batch, tokens[i], i, { seq_id }, true);
     }
 }

@@ -160,6 +161,12 @@ int main(int argc, char ** argv) {
     const int n_ctx_train = llama_n_ctx_train(model);
     const int n_ctx = llama_n_ctx(ctx);

+    const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
+    if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
+        fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
+        return 1;
+    }
+
     if (n_ctx > n_ctx_train) {
         fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
                 __func__, n_ctx_train, n_ctx);
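retrieval.cpp picks up the same batch_add_seq signature and the same pooling guard as embedding.cpp. With pooling enabled, each file chunk and the query reduce to a single vector, so the retrieval step becomes a cosine-similarity ranking over those vectors; an illustrative, self-contained helper for that step (not part of the diff) is:

#include <cmath>

// Illustrative helper, not from this commit: cosine similarity between two
// pooled embedding vectors of length n.
static float cosine_sim(const float * a, const float * b, int n) {
    double dot = 0.0, na = 0.0, nb = 0.0;
    for (int i = 0; i < n; i++) {
        dot += (double) a[i] * b[i];
        na  += (double) a[i] * a[i];
        nb  += (double) b[i] * b[i];
    }
    return (float) (dot / (std::sqrt(na) * std::sqrt(nb) + 1e-9));
}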
(The diffs for the remaining 7 of the 13 changed files are not shown here.)

0 comments on commit a42a18e
