Skip to content

Commit

Permalink
Merge pull request #204 from ggerganov/master
Browse files Browse the repository at this point in the history
b3284
  • Loading branch information
Nexesenex authored Jul 2, 2024
2 parents db7341f + 3e2618b commit 8385690
Show file tree
Hide file tree
Showing 67 changed files with 2,013 additions and 1,522 deletions.
17 changes: 7 additions & 10 deletions .devops/nix/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,18 @@
rocmPackages,
vulkan-headers,
vulkan-loader,
clblast,
curl,
useBlas ? builtins.all (x: !x) [
useCuda
useMetalKit
useOpenCL
useRocm
useVulkan
] && blas.meta.available,
useCuda ? config.cudaSupport,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
useMpi ? false, # Increases the runtime closure size by ~700M
useOpenCL ? false,
useRocm ? config.rocmSupport,
enableCurl ? true,
useVulkan ? false,
llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake

Expand All @@ -56,7 +55,6 @@ let
++ lib.optionals useCuda [ "CUDA" ]
++ lib.optionals useMetalKit [ "MetalKit" ]
++ lib.optionals useMpi [ "MPI" ]
++ lib.optionals useOpenCL [ "OpenCL" ]
++ lib.optionals useRocm [ "ROCm" ]
++ lib.optionals useVulkan [ "Vulkan" ];

Expand Down Expand Up @@ -198,19 +196,19 @@ effectiveStdenv.mkDerivation (
optionals effectiveStdenv.isDarwin darwinBuildInputs
++ optionals useCuda cudaBuildInputs
++ optionals useMpi [ mpi ]
++ optionals useOpenCL [ clblast ]
++ optionals useRocm rocmBuildInputs
++ optionals useBlas [ blas ]
++ optionals useVulkan vulkanBuildInputs;
++ optionals useVulkan vulkanBuildInputs
++ optionals enableCurl [ curl ];

cmakeFlags =
[
(cmakeBool "LLAMA_BUILD_SERVER" true)
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
(cmakeBool "LLAMA_CURL" enableCurl)
(cmakeBool "GGML_NATIVE" false)
(cmakeBool "GGML_BLAS" useBlas)
(cmakeBool "GGML_CLBLAST" useOpenCL)
(cmakeBool "GGML_CUDA" useCuda)
(cmakeBool "GGML_HIPBLAS" useRocm)
(cmakeBool "GGML_METAL" useMetalKit)
Expand Down Expand Up @@ -254,7 +252,6 @@ effectiveStdenv.mkDerivation (
useCuda
useMetalKit
useMpi
useOpenCL
useRocm
useVulkan
;
Expand All @@ -281,7 +278,7 @@ effectiveStdenv.mkDerivation (
# Configurations we don't want even the CI to evaluate. Results in the
# "unsupported platform" messages. This is mostly a no-op, because
# cudaPackages would've refused to evaluate anyway.
badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
badPlatforms = optionals useCuda lib.platforms.darwin;

# Configurations that are known to result in build failures. Can be
# overridden by importing Nixpkgs with `allowBroken = true`.
Expand Down
2 changes: 0 additions & 2 deletions .github/ISSUE_TEMPLATE/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,3 @@ contact_links:
- name: Want to contribute?
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
about: Head to the contribution guide page of the wiki for areas you can help with


1 change: 1 addition & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,7 @@ jobs:
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
cd build
$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
- name: Determine tag name
Expand Down
11 changes: 9 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,15 @@ set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
set(GGML_ALL_WARNINGS ${LLAMA_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${LLAMA_FATAL_WARNINGS})
set(GGML_LLAMAFILE ON)
set(GGML_CUDA_USE_GRAPHS ON)

# change the default for these ggml options
if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE ON)
endif()

if (NOT DEFINED GGML_CUDA_USE_GRAPHS)
set(GGML_CUDA_USE_GRAPHS ON)
endif()

# transition helpers
function (llama_option_depr TYPE OLD NEW)
Expand Down
1 change: 1 addition & 0 deletions CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_CXX_COMPILER": "icx",
"CMAKE_C_COMPILER": "cl",
"GGML_SYCL": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
Expand Down
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,11 @@ TEST_TARGETS = \
tests/test-tokenizer-1-bpe \
tests/test-tokenizer-1-spm

# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm

# Deprecation aliases
ifdef LLAMA_CUBLAS
$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
Expand Down Expand Up @@ -1086,6 +1091,7 @@ clean:
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS)
rm -rvf $(LEGACY_TARGETS)
find examples pocs -type f -name "*.o" -delete

#
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ Typically finetunes of the base models below are supported as well.
- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
- [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
- [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
- [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
- [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
Expand Down Expand Up @@ -217,6 +218,11 @@ Unless otherwise noted these projects are open-source with permissive licensing:
**Tools:**

- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
- [crashr/gppm](https://github.com/crashr/gppm) – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption

**Infrastructure:**

- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp

---

Expand Down
22 changes: 19 additions & 3 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1014,16 +1014,23 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
params.enable_chat_template = false;
return true;
}
if (arg == "--in-prefix") {
CHECK_ARG
params.input_prefix = argv[i];
params.enable_chat_template = false;
return true;
}
if (arg == "--in-suffix") {
CHECK_ARG
params.input_suffix = argv[i];
params.enable_chat_template = false;
return true;
}
if (arg == "--spm-infill") {
params.spm_infill = true;
return true;
}
if (arg == "--grammar") {
Expand Down Expand Up @@ -1402,13 +1409,15 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"halt generation at PROMPT, return control in interactive mode\n"
"can be specified more than once for multiple prompts" });
options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: %s)", params.conversation ? "true" : "false" });
options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
options.push_back({ "server infill",
" --spm-infill", "use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: %s)", params.spm_infill ? "enabled" : "disabled" });

options.push_back({ "sampling" });
options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
Expand Down Expand Up @@ -2662,12 +2671,19 @@ std::string llama_chat_format_single(const struct llama_model * model,
const std::vector<llama_chat_msg> & past_msg,
const llama_chat_msg & new_msg,
bool add_ass) {
std::ostringstream ss;
auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
std::vector<llama_chat_msg> chat_new(past_msg);
// if the past_msg ends with a newline, we must preserve it in the formatted version
if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
ss << "\n";
};
// format chat with new_msg
chat_new.push_back(new_msg);
auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return formatted;
// get the diff part
ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
return ss.str();
}

std::string llama_chat_format_example(const struct llama_model * model,
Expand Down
4 changes: 3 additions & 1 deletion common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ struct gpt_params {
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";
bool enable_chat_template = true;

std::vector<std::string> api_keys;

Expand Down Expand Up @@ -250,6 +251,8 @@ struct gpt_params {
std::string cvector_outfile = "control_vector.gguf";
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";

bool spm_infill = false; // suffix/prefix/middle pattern for infill
};

void gpt_params_handle_model_default(gpt_params & params);
Expand Down Expand Up @@ -456,4 +459,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

4 changes: 2 additions & 2 deletions common/json-schema-to-grammar.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ std::unordered_map<char, std::string> GRAMMAR_LITERAL_ESCAPES = {
};

std::unordered_set<char> NON_LITERAL_SET = {'|', '.', '(', ')', '[', ']', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};
std::unordered_set<char> ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = {'^', '$', '.', '[', ']', '(', ')', '|', '{', '}', '*', '+', '?'};

template <typename Iterator>
std::string join(Iterator begin, Iterator end, const std::string & separator) {
Expand Down Expand Up @@ -720,7 +720,7 @@ class SchemaConverter {
}
prop_names.push_back(prop_name);
}
if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
if ((additional_properties.is_boolean() && additional_properties.get<bool>()) || additional_properties.is_object()) {
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
std::string value_rule =
additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
Expand Down
1 change: 1 addition & 0 deletions convert-hf-to-gguf-update.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ class TOKENIZER_TYPE(IntEnum):
{"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
{"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B", }, # Also used for Viking 13B and 33B
{"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b", },
]


Expand Down
Loading

0 comments on commit 8385690

Please sign in to comment.