From 9218d990355cfd75db5564b31dbd50f9475a8803 Mon Sep 17 00:00:00 2001 From: thxCode Date: Mon, 17 Jun 2024 15:31:08 +0800 Subject: [PATCH] fix: full offload Signed-off-by: thxCode --- cmd/gguf-parser/README.md | 120 +++++++++++++++++++------------------- cmd/gguf-parser/main.go | 7 ++- file_estimate.go | 25 ++++---- 3 files changed, 79 insertions(+), 73 deletions(-) diff --git a/cmd/gguf-parser/README.md b/cmd/gguf-parser/README.md index 11173e2..e7d2af1 100644 --- a/cmd/gguf-parser/README.md +++ b/cmd/gguf-parser/README.md @@ -75,11 +75,11 @@ $ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistra | | llama | 450.50 KiB | 32032 | 0 | 1 | 32000 | N/A | N/A | N/A | +-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -+----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+ -| | llama | 32768 | false | true | 32 | 4.09 GiB | 238.39 MiB | 10.70 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+ ++----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ +| | llama | 32768 | false | true | 33 (32 + 1) | Yes | 4.09 GiB | 238.39 MiB | 10.80 GiB | 
++----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ ``` @@ -105,11 +105,11 @@ $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8 | | llama | 449.91 KiB | 32002 | 0 | 1 | 32000 | 0 | N/A | 2 | +-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| | llama | 32768 | false | false | 32 | 25.08 GiB | 395.24 MiB | 26.94 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ ++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| | llama | 32768 | false | false | 33 (32 + 1) | Yes | 25.08 GiB | 292.68 MiB | 27.04 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ``` @@ -135,11 +135,11 @@ $ gguf-parser --repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --file="ggml-model-Q5_K | | gpt2 | 2 MiB | 128256 | 0 | 128000 | 128001 | 128002 | N/A | 0 | +-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+ 
-+----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+ -| | llama | 8192 | false | true | 32 | 1.08 GiB | 234.61 MiB | 6.25 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+ ++----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ +| | llama | 8192 | false | true | 33 (32 + 1) | Yes | 1.08 GiB | 234.61 MiB | 6.55 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+ ``` @@ -149,11 +149,11 @@ $ gguf-parser --repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --file="ggml-model-Q5_K ```shell $ gguf-parser --repo="mradermacher/Falcon2-8B-Dutch-GGUF" --file="Falcon2-8B-Dutch.Q5_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=0 -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| | llama | 32768 | false | false | 0 | 25.08 GiB | 25.23 GiB | 2.10 GiB | 
-+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ ++----------+--------+--------------+-----------------+--------------+----------------+----------------+------------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +--------+--------------+-----------------+--------------+----------------+----------------+------------+------------+-------------+ +| | falcon | 2048 | false | true | 0 | No | 383.46 MiB | 533.46 MiB | 404.91 MiB | ++----------+--------+--------------+-----------------+--------------+----------------+----------------+------------+------------+-------------+ ``` @@ -161,11 +161,11 @@ $ gguf-parser --repo="mradermacher/Falcon2-8B-Dutch-GGUF" --file="Falcon2-8B-Dut ```shell $ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=10 -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| | llama | 32768 | false | false | 10 | 25.08 GiB | 17.50 GiB | 9.83 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ ++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ 
+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| | llama | 32768 | false | false | 10 | No | 25.08 GiB | 17.50 GiB | 9.83 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ``` @@ -173,11 +173,11 @@ $ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file=" ```shell $ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --ctx-size=4096 -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| | llama | 4096 | false | false | 32 | 21.53 GiB | 339.24 MiB | 21.64 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ ++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| | llama | 4096 | false | false | 33 (32 + 1) | Yes | 21.53 GiB | 236.68 MiB | 21.74 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ``` @@ -185,11 +185,11 @@ $ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file=" 
```shell $ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --flash-attention -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| | llama | 32768 | true | false | 32 | 25.08 GiB | 395.24 MiB | 25.08 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ ++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| | llama | 32768 | true | false | 33 (32 + 1) | Yes | 25.08 GiB | 292.68 MiB | 25.18 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ``` @@ -197,37 +197,37 @@ $ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file=" ```shell $ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=10 --no-mmap -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ 
+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| | llama | 32768 | false | false | 10 | 25.08 GiB | 17.50 GiB | 9.83 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ ++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| | llama | 32768 | false | false | 10 | No | 25.08 GiB | 17.50 GiB | 9.83 GiB | ++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ``` #### Estimate step-by-step offload layers ```shell -$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=10 --offload-layers-step=5 -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM | -+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ -| | llama | 32768 | false | false | 0 | 25.08 GiB | 25.23 GiB | 2.10 GiB | -+ + + + + +----------------+ +------------+-------------+ -| | | | | | 5 | | 21.36 GiB | 5.97 GiB | -+ + + + + +----------------+ +------------+-------------+ -| | | | | | 10 | | 17.50 GiB | 9.83 GiB | -+ + + + + +----------------+ +------------+-------------+ -| | | | | | 15 | | 13.63 GiB | 13.70 GiB | -+ + + + + 
+----------------+ +------------+-------------+ -| | | | | | 20 | | 9.77 GiB | 17.56 GiB | -+ + + + + +----------------+ +------------+-------------+ -| | | | | | 25 | | 5.91 GiB | 21.42 GiB | -+ + + + + +----------------+ +------------+-------------+ -| | | | | | 30 | | 2.04 GiB | 25.29 GiB | -+ + + + + +----------------+ +------------+-------------+ -| | | | | | 32 | | 395.24 MiB | 26.94 GiB | -+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+ +$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers-step=5 ++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM | ++ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ +| | llama | 32768 | false | false | 0 | No | 25.08 GiB | 25.23 GiB | 2.10 GiB | ++ + + + + +----------------+ + +------------+-------------+ +| | | | | | 5 | | | 21.36 GiB | 5.97 GiB | ++ + + + + +----------------+ + +------------+-------------+ +| | | | | | 10 | | | 17.50 GiB | 9.83 GiB | ++ + + + + +----------------+ + +------------+-------------+ +| | | | | | 15 | | | 13.63 GiB | 13.70 GiB | ++ + + + + +----------------+ + +------------+-------------+ +| | | | | | 20 | | | 9.77 GiB | 17.56 GiB | ++ + + + + +----------------+ + +------------+-------------+ +| | | | | | 25 | | | 5.91 GiB | 21.42 GiB | ++ + + + + +----------------+ + +------------+-------------+ +| | | | | | 30 | | | 2.04 GiB | 25.29 GiB | ++ + + + + +----------------+----------------+ +------------+-------------+ +| | | | | | 33 (32 + 1) | Yes | | 292.68 MiB | 27.04 GiB | 
++----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+ ``` diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index 56eec52..a30cca3 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -322,7 +322,7 @@ func main() { } if offloadLayersStep < e.OffloadLayers { cnt := e.OffloadLayers/offloadLayersStep + 1 - if e.OffloadLayers%offloadLayersStep != 0 { + if e.OffloadLayers%offloadLayersStep != 0 || e.FullOffloaded { cnt++ } ess := make([]LLaMACppUsageEstimateMemorySummary, cnt) @@ -347,7 +347,8 @@ func main() { sprintf(es.ContextSize), sprintf(es.FlashAttention), sprintf(!es.NoMMap), - sprintf(es.Memory[i].OffloadLayers), + sprintf(tenary(es.Memory[i].FullOffloaded, sprintf("%d (%d + 1)", es.Memory[i].OffloadLayers, es.Memory[i].OffloadLayers-1), es.Memory[i].OffloadLayers)), + sprintf(tenary(es.Memory[i].FullOffloaded, "Yes", "No")), sprintf(es.Memory[i].UMA), sprintf(es.Memory[i].NonUMA.RAM), sprintf(es.Memory[i].NonUMA.VRAM), @@ -355,7 +356,7 @@ func main() { } tprint( "ESTIMATE", - []string{"Arch", "Context Size", "Flash Attention", "MMap Support", "Offload Layers", "UMA RAM", "NonUMA RAM", "NonUMA VRAM"}, + []string{"Arch", "Context Size", "Flash Attention", "MMap Support", "Offload Layers", "Full Offloaded", "UMA RAM", "NonUMA RAM", "NonUMA VRAM"}, bd...) } } diff --git a/file_estimate.go b/file_estimate.go index 8a398d6..339bae5 100644 --- a/file_estimate.go +++ b/file_estimate.go @@ -18,11 +18,11 @@ type ( FlashAttention bool `json:"flashAttention"` // ContextSize is the size of the context. ContextSize uint64 `json:"contextSize"` - // FullOffload is the flag to indicate whether the layers are fully offloaded, - // false for partial offloaded or zero offloaded. - FullOffload bool `json:"fullOffload"` // OffloadLayers is the number of offloaded layers. 
OffloadLayers uint64 `json:"offloadLayers"` + // FullOffloaded is the flag to indicate whether the layers are fully offloaded, + // false for partially offloaded or not offloaded. + FullOffloaded bool `json:"fullOffloaded"` // NoMMap is the flag to indicate whether the file must be loaded without mmap, // true for total loaded. NoMMap bool `json:"noMMap"` @@ -167,18 +167,17 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( if v := o.OffloadLayers; v == nil { o.OffloadLayers = ptr.To(a.BlockCount) nOffloadLayers = a.BlockCount - } else if *v > 0 { + isOffloadOutputLayer = true + } else if *v != 0 { nOffloadLayers = *v - if nOffloadLayers >= a.BlockCount+1 { - isOffloadOutputLayer = true - } if nOffloadLayers > a.BlockCount { + isOffloadOutputLayer = true nOffloadLayers = a.BlockCount } } nLoadLayers -= nOffloadLayers - e.FullOffload = isOffloadOutputLayer && nLoadLayers == 0 + e.FullOffloaded = isOffloadOutputLayer && nLoadLayers == 0 e.OffloadLayers = nOffloadLayers } @@ -411,6 +410,9 @@ type ( LLaMACppUsageEstimateMemorySummary struct { // OffloadLayers is the number of offloaded layers. OffloadLayers uint64 `json:"offloadLayers"` + // FullOffloaded is the flag to indicate whether the layers are fully offloaded, + // false for partially offloaded or not offloaded. + FullOffloaded bool `json:"fullOffloaded"` // UMA represents the usage of Unified Memory Architecture. UMA GGUFBytesScalar `json:"uma"` // NonUMA represents the usage of Non-Unified Memory Architecture. @@ -424,7 +426,10 @@ type ( ) func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool) (ems LLaMACppUsageEstimateMemorySummary) { - ems.OffloadLayers = e.OffloadLayers + ems.OffloadLayers, ems.FullOffloaded = e.OffloadLayers, e.FullOffloaded + if ems.FullOffloaded { + ems.OffloadLayers++ // The output layer is offloaded. + } // UMA. 
{ @@ -458,7 +463,7 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool) (ems LLaMACppUsageEsti kv := e.Load.KVCache.Sum() cp := e.Load.Computation.Sum() ems.NonUMA.RAM = fp + wg + kv + cp - if !e.NoMMap && (mmap || e.FullOffload) { + if !e.NoMMap && (mmap || e.FullOffloaded) { ems.NonUMA.RAM -= wg } // VRAM.