From f10cb8cdc6a55caf4a1f613531015988c77a395e Mon Sep 17 00:00:00 2001 From: thxCode Date: Thu, 4 Jul 2024 23:56:34 +0800 Subject: [PATCH] refactor: detail uma Signed-off-by: thxCode --- cmd/gguf-parser/README.md | 285 ++++++++++++++++++++------------------ cmd/gguf-parser/main.go | 42 ++++-- file_estimate.go | 44 ++++-- file_estimate_option.go | 10 ++ 4 files changed, 226 insertions(+), 155 deletions(-) diff --git a/cmd/gguf-parser/README.md b/cmd/gguf-parser/README.md index fe461c1..f3d8174 100644 --- a/cmd/gguf-parser/README.md +++ b/cmd/gguf-parser/README.md @@ -31,6 +31,8 @@ Usage of gguf-parser ...: Output as pretty JSON. (default true) -kv-type string Specify the type of Key-Value cache, which is used to estimate the usage, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1], default is f16. Use quantization type means enabling --flash-attention as well. (default "f16") + -no-kv-offload + Specify disabling Key-Value offloading, which is used to estimate the usage. Key-Value offloading can reduce the usage of VRAM. -no-mmap Specify disabling Memory-Mapped using, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM. 
-offload-layers int @@ -78,29 +80,29 @@ Usage of gguf-parser ...: ```shell $ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf" -+-------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ -| MODEL | NAME | ARCH | QUANTIZATION VERSION | FILE TYPE | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+ +-------+-------+----------------------+----------------+---------------+----------+------------+----------+ -| | jeffq | llama | 2 | IQ3_XXS/Q5_K_M | true | 4.78 GiB | 7.24 B | 5.67 bpw | -+-------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ ++--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ +| \ | Name | Arch | Quantization Version | File Type | Little Endian | Size | Parameters | BPW | ++--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ +| MODEL | jeffq | llama | 2 | IQ3_XXS/Q5_K_M | true | 4.78 GiB | 7.24 B | 5.67 bpw | ++--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -| ARCHITECTURE | MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+ +-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -| | 32768 | 4096 | 1024 | 32 | 32 | 14336 | 0 | 32032 | +| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len | 
++--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ +| ARCHITECTURE | 32768 | 4096 | 1024 | 32 | 32 | 14336 | 0 | 32032 | +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -| TOKENIZER | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | -+ +-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -| | llama | 450.50 KiB | 32032 | 0 | 1 | 32000 | N/A | N/A | N/A | -+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token | ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| TOKENIZER | llama | 450.50 KiB | 32032 | 0 | 1 | 32000 | N/A | N/A | N/A | ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -+----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ 
+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ -| | llama | 32768 | false | true | 33 (32 + 1) | Yes | 4.09 GiB | 238.39 MiB | 10.80 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ +| ESTIMATE | llama | 32768 | false | true | 33 (32 + 1) | Yes | 88.39 MiB + 11.06 GiB = 8.68 GiB | 238.39 MiB | 11.06 GiB | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ ``` @@ -108,29 +110,29 @@ $ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistra ```shell $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF/resolve/main/Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" -+-------+----------+-------+----------------------+-------------+---------------+--------+------------+----------+ -| MODEL | NAME | ARCH | QUANTIZATION VERSION | FILE TYPE | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+ +----------+-------+----------------------+-------------+---------------+--------+------------+----------+ -| | emozilla | llama | 2 | Q4_K/Q3_K_M | true | 21 GiB | 46.70 B | 3.86 bpw | -+-------+----------+-------+----------------------+-------------+---------------+--------+------------+----------+ 
++--------------+----------+-------+----------------------+-------------+---------------+--------+------------+----------+ +| \ | Name | Arch | Quantization Version | File Type | Little Endian | Size | Parameters | BPW | ++--------------+----------+-------+----------------------+-------------+---------------+--------+------------+----------+ +| MODEL | emozilla | llama | 2 | Q4_K/Q3_K_M | true | 21 GiB | 46.70 B | 3.86 bpw | ++--------------+----------+-------+----------------------+-------------+---------------+--------+------------+----------+ +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -| ARCHITECTURE | MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+ +-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -| | 32768 | 4096 | 1024 | 32 | 32 | 14336 | 8 | 32002 | +| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len | ++--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ +| ARCHITECTURE | 32768 | 4096 | 1024 | 32 | 32 | 14336 | 8 | 32002 | +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -| TOKENIZER | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | -+ +-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -| | llama | 449.91 KiB | 32002 | 0 | 1 | 
32000 | 0 | N/A | 2 | -+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token | ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| TOKENIZER | llama | 449.91 KiB | 32002 | 0 | 1 | 32000 | 0 | N/A | 2 | ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| | llama | 32768 | false | false | 33 (32 + 1) | Yes | 25.08 GiB | 292.68 MiB | 27.04 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| ESTIMATE | llama | 32768 | false | false | 
33 (32 + 1) | Yes | 245.24 MiB + 27.31 GiB = 25.08 GiB | 395.24 MiB | 27.31 GiB | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ ``` @@ -138,29 +140,29 @@ $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8 ```shell $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-model-Q5_K_M.gguf" -+-------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ -| MODEL | NAME | ARCH | QUANTIZATION VERSION | FILE TYPE | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+ +-------+-------+----------------------+----------------+---------------+----------+------------+----------+ -| | model | llama | 2 | IQ3_XXS/Q5_K_M | true | 5.33 GiB | 8.03 B | 5.70 bpw | -+-------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ ++--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ +| \ | Name | Arch | Quantization Version | File Type | Little Endian | Size | Parameters | BPW | ++--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ +| MODEL | model | llama | 2 | IQ3_XXS/Q5_K_M | true | 5.33 GiB | 8.03 B | 5.70 bpw | ++--------------+-------+-------+----------------------+----------------+---------------+----------+------------+----------+ +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -| ARCHITECTURE | MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+ +-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -| | 8192 | 4096 | 
1024 | 32 | 32 | 14336 | 0 | 128256 | +| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len | ++--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ +| ARCHITECTURE | 8192 | 4096 | 1024 | 32 | 32 | 14336 | 0 | 128256 | +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -| TOKENIZER | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | -+ +-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -| | gpt2 | 2 MiB | 128256 | 0 | 128000 | 128001 | 128002 | N/A | 0 | -+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token | ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| TOKENIZER | gpt2 | 2 MiB | 128256 | 0 | 128000 | 128001 | 128002 | N/A | 0 | ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -+----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | 
FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ -| | llama | 8192 | false | true | 33 (32 + 1) | Yes | 1.08 GiB | 234.61 MiB | 6.49 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ +| ESTIMATE | llama | 8192 | false | true | 33 (32 + 1) | Yes | 84.61 MiB + 6.49 GiB = 5.68 GiB | 234.61 MiB | 6.49 GiB | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ ``` @@ -168,69 +170,80 @@ $ gguf-parser --hf-repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --hf-file="ggml-mode ```shell $ gguf-parser --ol-model="gemma2" -+-------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ -| MODEL | NAME | ARCH | QUANTIZATION VERSION | FILE TYPE | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+ +--------+--------+----------------------+-----------+---------------+----------+------------+----------+ -| | gemma2 | gemma2 | 2 | Q4_0 | true | 5.06 GiB | 9.24 B | 4.71 bpw | -+-------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ 
++--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ +| \ | Name | Arch | Quantization Version | File Type | Little Endian | Size | Parameters | BPW | ++--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ +| MODEL | gemma2 | gemma2 | 2 | Q4_0 | true | 5.06 GiB | 9.24 B | 4.71 bpw | ++--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -| ARCHITECTURE | MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+ +-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -| | 8192 | 3584 | 2048 | 16 | 42 | 14336 | 0 | 256000 | +| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len | ++--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ +| ARCHITECTURE | 8192 | 3584 | 2048 | 16 | 42 | 14336 | 0 | 256000 | +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -| TOKENIZER | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | -+ +-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -| | llama | 3.80 MiB | 256000 | 0 | 2 | 1 | 3 | N/A | 
0 | -+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ - -+----------+--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ -| | gemma2 | 8192 | false | true | 43 (42 + 1) | Yes | 2.69 GiB | 215.97 MiB | 8.43 GiB | -+----------+--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token | ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| TOKENIZER | llama | 3.80 MiB | 256000 | 0 | 2 | 1 | 3 | N/A | 0 | ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ ++--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ +| ESTIMATE | gemma2 | 8192 | false | true | 43 (42 + 1) | Yes | 65.97 MiB 
+ 8.43 GiB = 7.05 GiB | 215.97 MiB | 8.43 GiB | ++--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ $ gguf-parser --ol-model="gemma2" --ol-crawl -+-------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ -| MODEL | NAME | ARCH | QUANTIZATION VERSION | FILE TYPE | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | -+ +--------+--------+----------------------+-----------+---------------+----------+------------+----------+ -| | gemma2 | gemma2 | 2 | Q4_0 | true | 5.06 GiB | 9.24 B | 4.71 bpw | -+-------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ ++--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ +| \ | Name | Arch | Quantization Version | File Type | Little Endian | Size | Parameters | BPW | ++--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ +| MODEL | gemma2 | gemma2 | 2 | Q4_0 | true | 5.06 GiB | 9.24 B | 4.71 bpw | ++--------------+--------+--------+----------------------+-----------+---------------+----------+------------+----------+ +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -| ARCHITECTURE | MAX CONTEXT LEN | EMBEDDING LEN | EMBEDDING GQA | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | -+ +-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -| | 8192 | 3584 | 2048 | 16 | 42 | 14336 | 0 | 256000 | +| \ | Max Context Len | Embedding Len | Embedding GQA | Attention Head Cnt | Layers | Feed Forward Len | Expert Cnt | Vocabulary Len | 
++--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ +| ARCHITECTURE | 8192 | 3584 | 2048 | 16 | 42 | 14336 | 0 | 256000 | +--------------+-----------------+---------------+---------------+--------------------+--------+------------------+------------+----------------+ -+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -| TOKENIZER | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | -+ +-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -| | llama | 0 B | 256000 | 0 | 2 | 1 | 3 | N/A | 0 | -+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| \ | Model | Tokens Size | Tokens Len | Added Tokens Len | BOS Token | EOS Token | Unknown Token | Separator Token | Padding Token | ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ +| TOKENIZER | llama | 0 B | 256000 | 0 | 2 | 1 | 3 | N/A | 0 | ++--------------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -+----------+--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ 
+--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ -| | gemma2 | 8192 | false | true | 43 (42 + 1) | Yes | 2.69 GiB | 215.99 MiB | 8.12 GiB | -+----------+--------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ ++--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ +| ESTIMATE | gemma2 | 8192 | false | true | 43 (42 + 1) | Yes | 65.99 MiB + 8.43 GiB = 7.05 GiB | 215.99 MiB | 8.43 GiB | ++--------------+--------+--------------+-----------------+--------------+----------------+----------------+---------------------------------+------------+-------------+ ``` ### Estimate +#### Estimate with full layers offload (default) + +```shell +$ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| ESTIMATE | llama | 32768 | false | false | 33 (32 + 1) | Yes | 245.24 MiB + 27.31 GiB = 25.08 GiB | 395.24 MiB 
| 27.31 GiB | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ + +``` + #### Estimate with zero layers offload ```shell -$ gguf-parser --hf-repo="mradermacher/Falcon2-8B-Dutch-GGUF" --hf-file="Falcon2-8B-Dutch.Q5_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --gpu-layers=0 -+----------+--------+--------------+-----------------+--------------+----------------+----------------+------------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +--------+--------------+-----------------+--------------+----------------+----------------+------------+------------+-------------+ -| | falcon | 2048 | false | true | 0 | No | 391.46 MiB | 541.46 MiB | 654.91 MiB | -+----------+--------+--------------+-----------------+--------------+----------------+----------------+------------+------------+-------------+ +$ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --gpu-layers=0 ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ +| ESTIMATE | llama | 32768 | false | false | 0 | No | 25.09 GiB + 2.46 GiB = 25.09 GiB | 25.24 GiB | 2.46 GiB | 
++--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ ``` @@ -238,11 +251,11 @@ $ gguf-parser --hf-repo="mradermacher/Falcon2-8B-Dutch-GGUF" --hf-file="Falcon2- ```shell $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --gpu-layers=10 -+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| | llama | 32768 | false | false | 10 | No | 25.09 GiB | 17.51 GiB | 10.19 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------------+------------+-------------+ +| ESTIMATE | llama | 32768 | false | false | 10 | No | 17.36 GiB + 10.19 GiB = 25.09 GiB | 17.51 GiB | 10.19 GiB | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+-----------------------------------+------------+-------------+ ``` @@ -250,11 +263,11 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" 
--hf- ```shell $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --ctx-size=4096 -+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| | llama | 4096 | false | false | 33 (32 + 1) | Yes | 21.53 GiB | 339.24 MiB | 21.89 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| ESTIMATE | llama | 4096 | false | false | 33 (32 + 1) | Yes | 189.24 MiB + 21.89 GiB = 21.53 GiB | 339.24 MiB | 21.89 GiB | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ ``` @@ -262,11 +275,11 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf- ```shell $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --flash-attention 
-+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| | llama | 32768 | true | false | 33 (32 + 1) | Yes | 25.08 GiB | 395.24 MiB | 25.33 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| ESTIMATE | llama | 32768 | true | false | 33 (32 + 1) | Yes | 245.24 MiB + 25.33 GiB = 25.08 GiB | 395.24 MiB | 25.33 GiB | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ ``` @@ -274,11 +287,11 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf- ```shell $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=10 --no-mmap -+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL 
OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| | llama | 32768 | false | false | 10 | No | 25.09 GiB | 17.51 GiB | 10.19 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ +| ESTIMATE | llama | 32768 | false | false | 0 | No | 25.09 GiB + 2.46 GiB = 25.09 GiB | 25.24 GiB | 2.46 GiB | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+----------------------------------+------------+-------------+ ``` @@ -286,25 +299,25 @@ $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf- ```shell $ gguf-parser --hf-repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --hf-file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --gpu-layers-step=5 -+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ -| | llama | 32768 | false | false | 0 | No | 25.09 GiB | 25.24 GiB | 2.46 GiB | -+ + + + + 
+----------------+ + +------------+-------------+ -| | | | | | 5 | | | 21.37 GiB | 6.33 GiB | -+ + + + + +----------------+ + +------------+-------------+ -| | | | | | 10 | | | 17.51 GiB | 10.19 GiB | -+ + + + + +----------------+ + +------------+-------------+ -| | | | | | 15 | | | 13.64 GiB | 14.06 GiB | -+ + + + + +----------------+ + +------------+-------------+ -| | | | | | 20 | | | 9.78 GiB | 17.92 GiB | -+ + + + + +----------------+ + +------------+-------------+ -| | | | | | 25 | | | 5.91 GiB | 21.79 GiB | -+ + + + + +----------------+ + +------------+-------------+ -| | | | | | 30 | | | 2.05 GiB | 25.65 GiB | -+ + + + + +----------------+----------------+-----------+------------+-------------+ -| | | | | | 33 (32 + 1) | Yes | 25.08 GiB | 395.24 MiB | 27.31 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| \ | Arch | Context Size | Flash Attention | MMap Support | Offload Layers | Full Offloaded | UMA (RAM + VRAM) | NonUMA RAM | NonUMA VRAM | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ +| ESTIMATE | llama | 32768 | false | false | 0 | No | 25.09 GiB + 2.46 GiB = 25.09 GiB | 25.24 GiB | 2.46 GiB | ++ + + + + +----------------+ +------------------------------------+------------+-------------+ +| | | | | | 5 | | 21.23 GiB + 6.33 GiB = 25.09 GiB | 21.37 GiB | 6.33 GiB | ++ + + + + +----------------+ +------------------------------------+------------+-------------+ +| | | | | | 10 | | 17.36 GiB + 10.19 GiB = 25.09 GiB | 17.51 GiB | 10.19 GiB | ++ + + + + +----------------+ +------------------------------------+------------+-------------+ +| | | | | | 15 | | 13.50 GiB 
+ 14.06 GiB = 25.09 GiB | 13.64 GiB | 14.06 GiB | ++ + + + + +----------------+ +------------------------------------+------------+-------------+ +| | | | | | 20 | | 9.63 GiB + 17.92 GiB = 25.09 GiB | 9.78 GiB | 17.92 GiB | ++ + + + + +----------------+ +------------------------------------+------------+-------------+ +| | | | | | 25 | | 5.77 GiB + 21.79 GiB = 25.09 GiB | 5.91 GiB | 21.79 GiB | ++ + + + + +----------------+ +------------------------------------+------------+-------------+ +| | | | | | 30 | | 1.90 GiB + 25.65 GiB = 25.09 GiB | 2.05 GiB | 25.65 GiB | ++ + + + + +----------------+----------------+------------------------------------+------------+-------------+ +| | | | | | 33 (32 + 1) | Yes | 245.24 MiB + 27.31 GiB = 25.08 GiB | 395.24 MiB | 27.31 GiB | ++--------------+-------+--------------+-----------------+--------------+----------------+----------------+------------------------------------+------------+-------------+ ``` diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index 457d798..34dba80 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -41,6 +41,7 @@ func main() { physicalBatchSize = 512 parallelSize = 1 kvType = "f16" + noKVOffload bool flashAttention bool platformFootprint = "150,250" noMMap bool @@ -103,6 +104,9 @@ func main() { "which is used to estimate the usage, select from [f32, f16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1], "+ "default is f16. "+ "Use quantization type means enabling --flash-attention as well.") + fs.BoolVar(&noKVOffload, "no-kv-offload", noKVOffload, "Specify disabling Key-Value offloading, "+ + "which is used to estimate the usage. "+ + "Key-Value offloading can reduce the usage of VRAM.") fs.BoolVar(&flashAttention, "flash-attention", flashAttention, "Specify enabling Flash Attention, "+ "which is used to estimate the usage. 
"+ "Flash Attention can reduce the usage of RAM/VRAM.") @@ -197,6 +201,9 @@ func main() { } eopts = append(eopts, WithCacheKeyType(kv), WithCacheValueType(kv)) } + if noKVOffload { + eopts = append(eopts, WithoutOffloadKVCache()) + } if flashAttention { eopts = append(eopts, WithFlashAttention()) } @@ -330,6 +337,7 @@ func main() { tprint( "MODEL", []string{"Name", "Arch", "Quantization Version", "File Type", "Little Endian", "Size", "Parameters", "BPW"}, + nil, []string{ m.Name, m.Architecture, @@ -346,6 +354,7 @@ func main() { tprint( "ARCHITECTURE", []string{"Max Context Len", "Embedding Len", "Embedding GQA", "Attention Head Cnt", "Layers", "Feed Forward Len", "Expert Cnt", "Vocabulary Len"}, + nil, []string{ sprintf(a.MaximumContextLength), sprintf(a.EmbeddingLength), @@ -362,6 +371,7 @@ func main() { tprint( "TOKENIZER", []string{"Model", "Tokens Size", "Tokens Len", "Added Tokens Len", "BOS Token", "EOS Token", "Unknown Token", "Separator Token", "Padding Token"}, + nil, []string{ t.Model, sprintf(GGUFBytesScalar(t.TokensSize)), @@ -412,14 +422,15 @@ func main() { sprintf(!es.NoMMap), sprintf(tenary(es.Memory[i].FullOffloaded, sprintf("%d (%d + 1)", es.Memory[i].OffloadLayers, es.Memory[i].OffloadLayers-1), es.Memory[i].OffloadLayers)), sprintf(tenary(es.Memory[i].FullOffloaded, "Yes", "No")), - sprintf(es.Memory[i].UMA), + sprintf("%s + %s = %s", es.Memory[i].UMA.RAM, es.Memory[i].NonUMA.VRAM, es.Memory[i].UMA.RAM+es.Memory[i].UMA.VRAM), sprintf(es.Memory[i].NonUMA.RAM), sprintf(es.Memory[i].NonUMA.VRAM), } } tprint( "ESTIMATE", - []string{"Arch", "Context Size", "Flash Attention", "MMap Support", "Offload Layers", "Full Offloaded", "UMA RAM", "NonUMA RAM", "NonUMA VRAM"}, + []string{"Arch", "Context Size", "Flash Attention", "MMap Support", "Offload Layers", "Full Offloaded", "UMA (RAM + VRAM)", "NonUMA RAM", "NonUMA VRAM"}, + []int{0, 1, 2, 3, 5}, bd...) 
} } @@ -456,22 +467,37 @@ func sprintf(f any, a ...any) string { } } -func tprint(title string, header []string, body ...[]string) { +func tprint(title string, header []string, merges []int, body ...[]string) { title = strings.ToUpper(title) - for i := range header { - header[i] = strings.ToUpper(header[i]) - } tb := tablewriter.NewWriter(os.Stdout) + tb.SetTablePadding("\t") tb.SetAlignment(tablewriter.ALIGN_CENTER) tb.SetHeaderLine(true) tb.SetRowLine(true) - tb.SetAutoMergeCells(true) - tb.Append(append([]string{title}, header...)) + + tb.SetHeaderAlignment(tablewriter.ALIGN_CENTER) + tb.SetAutoFormatHeaders(false) + tb.SetHeader(append([]string{"\\"}, header...)) + + tb.SetAutoWrapText(false) + tb.SetColMinWidth(0, 12) + tb.SetAutoMergeCellsByColumnIndex(func() (r []int) { + if len(merges) == 0 { + return []int{0} + } + r = make([]int, 0, len(merges)+1) + for i := range merges { + r = append(r, merges[i]+1) + } + r = append(r, 0) + return r + }()) for i := range body { tb.Append(append([]string{title}, body[i]...)) } + tb.Render() fmt.Println() } diff --git a/file_estimate.go b/file_estimate.go index 0177e91..c211986 100644 --- a/file_estimate.go +++ b/file_estimate.go @@ -87,6 +87,9 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( if o.CacheValueType == nil { o.CacheValueType = ptr.To(GGMLTypeF16) } + if o.OffloadKVCache == nil { + o.OffloadKVCache = ptr.To(true) + } if o.PhysicalBatchSize == nil { o.PhysicalBatchSize = ptr.To(int32(512)) } @@ -247,6 +250,13 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( e.Load.KVCache.Value = GGUFBytesScalar(vrs * nLoadLayers) e.Offload.KVCache.Key = GGUFBytesScalar(krs * nOffloadLayers) e.Offload.KVCache.Value = GGUFBytesScalar(vrs * nOffloadLayers) + + if !*o.OffloadKVCache { + e.Load.KVCache.Key += e.Offload.KVCache.Key + e.Load.KVCache.Value += e.Offload.KVCache.Value + e.Offload.KVCache.Key = GGUFBytesScalar(0) + e.Offload.KVCache.Value = GGUFBytesScalar(0) + 
} } // Computation. @@ -429,7 +439,12 @@ type ( // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // UMA represents the usage of Unified Memory Architecture. - UMA GGUFBytesScalar `json:"uma"` + UMA struct { + // Load is the memory usage for loading the GGUF file in Load. + RAM GGUFBytesScalar `json:"ram"` + // VRAM is the memory usage for loading the GGUF file in VRAM. + VRAM GGUFBytesScalar `json:"vram"` + } `json:"uma"` // NonUMA represents the usage of Non-Unified Memory Architecture. NonUMA struct { // Load is the memory usage for loading the GGUF file in Load. @@ -442,7 +457,7 @@ type ( // SummarizeMemory returns the summary of the estimated memory usage of loading the GGUF file in llama.cpp, // the input options are used to adjust the summary. -func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, platformRAM, platformVRAM uint64) (ems LLaMACppUsageEstimateMemorySummary) { +func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, ramFootprint, vramFootprint uint64) (ems LLaMACppUsageEstimateMemorySummary) { ems.OffloadLayers, ems.FullOffloaded = e.OffloadLayers, e.FullOffloaded if ems.FullOffloaded { ems.OffloadLayers++ // The output layer is offloaded. @@ -450,20 +465,27 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, platformRAM, platformV // UMA. { - fp := e.Load.Footprint + e.Offload.Footprint - wg := e.Load.Weight.Sum() + e.Offload.Weight.Sum() - kv := e.Load.KVCache.Sum() + e.Offload.KVCache.Sum() + // RAM + fp := e.Load.Footprint + wg := e.Load.Weight.Sum() + kv := e.Load.KVCache.Sum() cp := e.Load.Computation.Sum() - ems.UMA = fp + wg + kv + cp + ems.UMA.RAM = fp + wg + kv + cp if !e.NoMMap && mmap { - ems.UMA -= wg + ems.UMA.RAM -= wg } + // VRAM. + fp = e.Offload.Footprint + wg = e.Offload.Weight.Sum() + kv = e.Offload.KVCache.Sum() + cp = 0 + ems.UMA.VRAM = fp + wg + kv + cp } // NonUMA. { // RAM. 
- fp := GGUFBytesScalar(platformRAM) + e.Load.Footprint + fp := GGUFBytesScalar(ramFootprint) + e.Load.Footprint wg := e.Load.Weight.Sum() kv := e.Load.KVCache.Sum() cp := e.Load.Computation.Sum() @@ -475,7 +497,7 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, platformRAM, platformV } } // VRAM. - fp = GGUFBytesScalar(platformVRAM) + e.Offload.Footprint + fp = GGUFBytesScalar(vramFootprint) + e.Offload.Footprint wg = e.Offload.Weight.Sum() kv = e.Offload.KVCache.Sum() cp = e.Offload.Computation.Sum() @@ -487,10 +509,10 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, platformRAM, platformV // Summarize returns the summary of the estimated result of loading the GGUF file in llama.cpp, // the input options are used to adjust the summary. -func (e LLaMACppUsageEstimate) Summarize(mmap bool, platformRAM, platformVRAM uint64) (es LLaMACppUsageEstimateSummary) { +func (e LLaMACppUsageEstimate) Summarize(mmap bool, ramFootprint, vramFootprint uint64) (es LLaMACppUsageEstimateSummary) { // Summarize memory. es.Memory = []LLaMACppUsageEstimateMemorySummary{ - e.SummarizeMemory(mmap, platformRAM, platformVRAM), + e.SummarizeMemory(mmap, ramFootprint, vramFootprint), } // Just copy from the original estimate. diff --git a/file_estimate_option.go b/file_estimate_option.go index a0cf10f..f82d09c 100644 --- a/file_estimate_option.go +++ b/file_estimate_option.go @@ -2,6 +2,8 @@ package gguf_parser import ( "slices" + + "github.com/thxcode/gguf-parser-go/util/ptr" ) type ( @@ -13,6 +15,7 @@ type ( ParallelSize *int32 CacheKeyType *GGMLType CacheValueType *GGMLType + OffloadKVCache *bool OffloadLayers *uint64 FlashAttention bool } @@ -95,6 +98,13 @@ func WithCacheValueType(t GGMLType) LLaMACppUsageEstimateOption { } } +// WithoutOffloadKVCache disables offloading the KV cache. 
+func WithoutOffloadKVCache() LLaMACppUsageEstimateOption { + return func(o *_LLaMACppUsageEstimateOptions) { + o.OffloadKVCache = ptr.To(false) + } +} + // WithOffloadLayers sets the number of layers to offload. func WithOffloadLayers(layers uint64) LLaMACppUsageEstimateOption { return func(o *_LLaMACppUsageEstimateOptions) {