
Commit

Merge branch 'master' into pr/7359
ggerganov committed Jul 4, 2024
2 parents 9971c38 + f8c4c07 commit 305fb45
Showing 88 changed files with 1,894 additions and 1,444 deletions.
17 changes: 7 additions & 10 deletions .devops/nix/package.nix
@@ -17,19 +17,18 @@
rocmPackages,
vulkan-headers,
vulkan-loader,
clblast,
curl,
useBlas ? builtins.all (x: !x) [
useCuda
useMetalKit
useOpenCL
useRocm
useVulkan
] && blas.meta.available,
useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
useMpi ? false, # Increases the runtime closure size by ~700M
useOpenCL ? false,
useRocm ? config.rocmSupport,
enableCurl ? true,
useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

@@ -56,7 +55,6 @@ let
++ lib.optionals useCuda [ "CUDA" ]
++ lib.optionals useMetalKit [ "MetalKit" ]
++ lib.optionals useMpi [ "MPI" ]
++ lib.optionals useOpenCL [ "OpenCL" ]
++ lib.optionals useRocm [ "ROCm" ]
++ lib.optionals useVulkan [ "Vulkan" ];

@@ -198,19 +196,19 @@ effectiveStdenv.mkDerivation (
optionals effectiveStdenv.isDarwin darwinBuildInputs
++ optionals useCuda cudaBuildInputs
++ optionals useMpi [ mpi ]
++ optionals useOpenCL [ clblast ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs;
++ optionals useVulkan vulkanBuildInputs
++ optionals enableCurl [ curl ];

cmakeFlags =
[
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_CURL" enableCurl)
(cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CLBLAST" useOpenCL)
(cmakeBool "GGML_CUDA" useCuda)
(cmakeBool "GGML_HIPBLAS" useRocm)
(cmakeBool "GGML_METAL" useMetalKit)
@@ -254,7 +252,6 @@ effectiveStdenv.mkDerivation (
useCuda
useMetalKit
useMpi
useOpenCL
useRocm
useVulkan
;
@@ -281,7 +278,7 @@ effectiveStdenv.mkDerivation (
# Configurations we don't want even the CI to evaluate. Results in the
# "unsupported platform" messages. This is mostly a no-op, because
# cudaPackages would've refused to evaluate anyway.
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
badPlatforms = optionals useCuda lib.platforms.darwin;

# Configurations that are known to result in build failures. Can be
# overridden by importing Nixpkgs with `allowBroken = true`.
2 changes: 0 additions & 2 deletions .github/ISSUE_TEMPLATE/config.yml
@@ -9,5 +9,3 @@ contact_links:
- name: Want to contribute?
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
about: Head to the contribution guide page of the wiki for areas you can help with


4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -42,6 +42,10 @@ endif()

option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})

if (WIN32)
add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
endif()

#
# option list
#
1 change: 1 addition & 0 deletions CMakePresets.json
@@ -19,6 +19,7 @@
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_CXX_COMPILER": "icx",
"CMAKE_C_COMPILER": "cl",
"GGML_SYCL": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
6 changes: 6 additions & 0 deletions Makefile
@@ -62,6 +62,11 @@ TEST_TARGETS = \
tests/test-tokenizer-1-bpe \
tests/test-tokenizer-1-spm

# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm

# Deprecation aliases
ifdef LLAMA_CUBLAS
$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
@@ -1086,6 +1091,7 @@ clean:
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS)
rm -rvf $(LEGACY_TARGETS)
find examples pocs -type f -name "*.o" -delete

#
6 changes: 6 additions & 0 deletions README.md
@@ -108,6 +108,7 @@ Typically finetunes of the base models below are supported as well.
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
@@ -217,6 +218,11 @@ Unless otherwise noted these projects are open-source with permissive licensing:
**Tools:**

- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption

**Infrastructure:**

- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp

---

2 changes: 1 addition & 1 deletion common/common.cpp
@@ -757,7 +757,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.cache_type_v = argv[++i];
return true;
}
if (arg == "--multiline-input") {
if (arg == "-mli" || arg == "--multiline-input") {
params.multiline_input = true;
return true;
}
1 change: 0 additions & 1 deletion common/common.h
@@ -459,4 +459,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

6 changes: 5 additions & 1 deletion convert-hf-to-gguf-update.py
@@ -86,6 +86,9 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
{"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b", },
{"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b", },
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
]


@@ -272,7 +275,8 @@ def get_vocab_base_pre(self, tokenizer) -> str:
"3333333",
"33333333",
"333333333",
# "Cửa Việt", # llama-bpe fails on this
"Cửa Việt", # llama-bpe fails on this
" discards",
chktxt,
]

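
The tokenizer entries added above feed convert-hf-to-gguf-update.py, which identifies each pre-tokenizer by hashing the token IDs it produces for a fixed test string; that hash is what the new `chkhsh` branch for jais in convert-hf-to-gguf.py (below) compares against. A minimal sketch of the scheme, assuming the sha256-over-encoded-IDs convention used by the update script (`compute_chkhsh` is an illustrative helper, not part of the repo):

```python
# Illustrative sketch (assumption): derive a pre-tokenizer hash the way the update script does.
from hashlib import sha256
from transformers import AutoTokenizer

def compute_chkhsh(model_dir: str, chktxt: str) -> str:
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    chktok = tokenizer.encode(chktxt)  # token IDs for the fixed test string
    return sha256(str(chktok).encode()).hexdigest()

# For example, the hash computed for core42/jais-13b would be matched by the
# `if chkhsh == "b53802fb28e2..."` branch added to get_vocab_base_pre() below.
```
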
149 changes: 135 additions & 14 deletions convert-hf-to-gguf.py
@@ -490,6 +490,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
if chkhsh == "7fc505bd3104ca1083b150b17d088b59534ede9bde81f0dd2090967d7fe52cee":
# ref: https://huggingface.co/LumiOpen/Viking-7B
res = "viking"
if chkhsh == "b53802fb28e26d645c3a310b34bfe07da813026ec7c7716883404d5e0f8b1901":
# ref: https://huggingface.co/core42/jais-13b
res = "jais"

if res is None:
logger.warning("\n")
@@ -576,7 +579,19 @@ def _set_vocab_qwen(self):
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
special_vocab.add_to_gguf(self.gguf_writer)

def _set_vocab_sentencepiece(self):
def _set_vocab_sentencepiece(self, add_to_gguf=True):
tokens, scores, toktypes = self._create_vocab_sentencepiece()

self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)

def _create_vocab_sentencepiece(self):
from sentencepiece import SentencePieceProcessor

tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -638,14 +653,7 @@ def _set_vocab_sentencepiece(self):
scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED)

self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
return tokens, scores, toktypes

def _set_vocab_llama_hf(self):
vocab = gguf.LlamaHfVocab(self.dir_model)
@@ -1979,7 +1987,7 @@ def set_gguf_parameters(self):
if len(rope_scaling_type) == 0:
raise KeyError('Missing the required key rope_scaling.type')

if rope_scaling_type == 'su':
if rope_scaling_type == 'su' or rope_scaling_type == 'longrope':
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
elif rope_scaling_type == 'yarn':
attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
@@ -2353,6 +2361,8 @@ def set_vocab(self):
special_vocab._set_special_token("eot", 107)
special_vocab.add_to_gguf(self.gguf_writer)

self.gguf_writer.add_add_space_prefix(False)

def set_gguf_parameters(self):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
@@ -2390,7 +2400,20 @@ class Gemma2Model(Model):
model_arch = gguf.MODEL_ARCH.GEMMA2

def set_vocab(self):
self._set_vocab_llama_hf()
tokens, scores, toktypes = self._create_vocab_sentencepiece()
# hack: This is required so that we can properly use start/end-of-turn for chat template
for i in range(108):
# including <unusedX>, <start_of_turn>, <end_of_turn>
toktypes[i] = SentencePieceTokenTypes.CONTROL
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)

self.gguf_writer.add_add_space_prefix(False)

def set_gguf_parameters(self):
@@ -2414,6 +2437,12 @@ def set_gguf_parameters(self):
self.gguf_writer.add_final_logit_softcapping(
self.hparams["final_logit_softcapping"]
)
self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])

# sanity check
attn_scalar = self.hparams["query_pre_attn_scalar"]
if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
@@ -3031,6 +3060,96 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
return [(self.map_tensor_name(name), data_torch)]


@Model.register("JAISLMHeadModel")
class JaisModel(Model):
model_arch = gguf.MODEL_ARCH.JAIS

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

# SwiGLU activation
assert self.hparams["activation_function"] == "swiglu"
# ALiBi position embedding
assert self.hparams["position_embedding_type"] == "alibi"

# Embeddings scale
self.embeddings_scale = 1.0
# note: For some JAIS flavors, output is tied to (same as) wte in original model
self.output_is_wte = False
if 'mup_embeddings_scale' in self.hparams:
self.output_is_wte = True # Hack (?)
self.embeddings_scale = self.hparams['mup_embeddings_scale']
elif 'embeddings_scale' in self.hparams:
self.embeddings_scale = self.hparams['embeddings_scale']
else:
assert False

self.width_scale = 1.0
if 'mup_output_alpha' in self.hparams:
assert 'mup_width_scale' in self.hparams
self.width_scale = self.hparams['mup_output_alpha'] * self.hparams['mup_width_scale']
elif 'width_scale' in self.hparams:
self.width_scale = self.hparams['width_scale']
else:
assert False

self.max_alibi_bias = 8.0

def set_vocab(self):
self._set_vocab_gpt2()

def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_context_length(self.hparams["n_positions"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
self.gguf_writer.add_feed_forward_length(self.hparams["n_inner"])
self.gguf_writer.add_head_count(self.hparams["n_head"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
self.gguf_writer.add_file_type(self.ftype)

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused

tensors: list[tuple[str, Tensor]] = []

# we don't need these
if name.endswith((".attn.bias")):
return tensors

if name.endswith(("relative_pe.slopes")):
# Calculate max ALiBi bias (this is the inverse of the ALiBi calculation)
# Some other models have max_alibi_bias spelled out explicitly in the hyperparams,
# but Jais's PyTorch model simply precalculates the slope values and places them
# in relative_pes.slopes
n_head_closest_log2 = 2 ** math.floor(math.log2(self.hparams["n_head"]))
first_val = float(data_torch._data[0])
self.max_alibi_bias = -round(math.log2(first_val) * n_head_closest_log2)

return tensors

if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_fc2.weight")):
data_torch = data_torch.transpose(1, 0)

new_name = self.map_tensor_name(name)

if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
tensors.append((new_name, data_torch * self.embeddings_scale))
if self.output_is_wte:
tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch * self.width_scale))
elif new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
assert not self.output_is_wte
tensors.append((new_name, data_torch * self.width_scale))
else:
tensors.append((new_name, data_torch))

return tensors

def write_tensors(self):
super().write_tensors()
self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias)


###### CONVERSION LOGIC ######


@@ -3186,7 +3305,8 @@ def main() -> None:
"auto": gguf.LlamaFileType.GUESSED,
}

if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
if args.use_temp_file and is_split:
logger.error("Error: Cannot use temp file when splitting")
sys.exit(1)

@@ -3223,11 +3343,12 @@ def main() -> None:
if args.vocab_only:
logger.info("Exporting model vocab...")
model_instance.write_vocab()
logger.info("Model vocab successfully exported.")
logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
else:
logger.info("Exporting model...")
model_instance.write()
logger.info("Model successfully exported.")
out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
logger.info(f"Model successfully exported to {out_path}")


if __name__ == '__main__':
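
One detail worth spelling out from the JaisModel changes above: JAIS does not store max_alibi_bias in its hyperparameters, so modify_tensors() recovers it from the first precomputed slope in relative_pe.slopes. A small round-trip sketch of why that inversion works, assuming the standard ALiBi slope definition for a power-of-two head count (the helper names are illustrative, not from the repo):

```python
import math

def alibi_slopes(n_head: int, max_bias: float = 8.0) -> list[float]:
    # Standard ALiBi slopes for a power-of-two head count:
    # slopes[i] = 2 ** (-(i + 1) * max_bias / n)
    n = 2 ** math.floor(math.log2(n_head))
    return [2.0 ** (-(i + 1) * max_bias / n) for i in range(n_head)]

def recover_max_bias(first_slope: float, n_head: int) -> float:
    # Inverse of the formula above, mirroring JaisModel.modify_tensors():
    # slopes[0] = 2 ** (-max_bias / n)  =>  max_bias = -log2(slopes[0]) * n
    n = 2 ** math.floor(math.log2(n_head))
    return float(-round(math.log2(first_slope) * n))

slopes = alibi_slopes(n_head=32)
assert recover_max_bias(slopes[0], n_head=32) == 8.0
```
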
1 change: 0 additions & 1 deletion examples/embedding/README.md
@@ -58,4 +58,3 @@ The above command will output space-separated float values.
```powershell
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
```
