Skip to content

Commit

Permalink
fix: full offload
Browse files Browse the repository at this point in the history
Signed-off-by: thxCode <[email protected]>
  • Loading branch information
thxCode committed Jun 17, 2024
1 parent 115e016 commit 9218d99
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 73 deletions.
120 changes: 60 additions & 60 deletions cmd/gguf-parser/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,11 @@ $ gguf-parser --path="~/.cache/lm-studio/models/NousResearch/Hermes-2-Pro-Mistra
| | llama | 450.50 KiB | 32032 | 0 | 1 | 32000 | N/A | N/A | N/A |
+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+
| | llama | 32768 | false | true | 32 | 4.09 GiB | 238.39 MiB | 10.70 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+
+----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
| | llama | 32768 | false | true | 33 (32 + 1) | Yes | 4.09 GiB | 238.39 MiB | 10.80 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
```
Expand All @@ -105,11 +105,11 @@ $ gguf-parser --url="https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8
| | llama | 449.91 KiB | 32002 | 0 | 1 | 32000 | 0 | N/A | 2 |
+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| | llama | 32768 | false | false | 32 | 25.08 GiB | 395.24 MiB | 26.94 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| | llama | 32768 | false | false | 33 (32 + 1) | Yes | 25.08 GiB | 292.68 MiB | 27.04 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
```
Expand All @@ -135,11 +135,11 @@ $ gguf-parser --repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --file="ggml-model-Q5_K
| | gpt2 | 2 MiB | 128256 | 0 | 128000 | 128001 | 128002 | N/A | 0 |
+-----------+-------+-------------+------------+------------------+-----------+-----------+---------------+-----------------+---------------+
+----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+
| | llama | 8192 | false | true | 32 | 1.08 GiB | 234.61 MiB | 6.25 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+----------+------------+-------------+
+----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
| | llama | 8192 | false | true | 33 (32 + 1) | Yes | 1.08 GiB | 234.61 MiB | 6.55 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+----------------+----------+------------+-------------+
```
Expand All @@ -149,85 +149,85 @@ $ gguf-parser --repo="openbmb/MiniCPM-Llama3-V-2_5-gguf" --file="ggml-model-Q5_K
```shell
$ gguf-parser --repo="mradermacher/Falcon2-8B-Dutch-GGUF" --file="Falcon2-8B-Dutch.Q5_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=0
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| | llama | 32768 | false | false | 0 | 25.08 GiB | 25.23 GiB | 2.10 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
+----------+--------+--------------+-----------------+--------------+----------------+----------------+------------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +--------+--------------+-----------------+--------------+----------------+----------------+------------+------------+-------------+
| | falcon | 2048 | false | true | 0 | No | 383.46 MiB | 533.46 MiB | 404.91 MiB |
+----------+--------+--------------+-----------------+--------------+----------------+----------------+------------+------------+-------------+
```
#### Estimate with specific offload layers
```shell
$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=10
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| | llama | 32768 | false | false | 10 | 25.08 GiB | 17.50 GiB | 9.83 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| | llama | 32768 | false | false | 10 | No | 25.08 GiB | 17.50 GiB | 9.83 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
```
#### Estimate with specific context size
```shell
$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --ctx-size=4096
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| | llama | 4096 | false | false | 32 | 21.53 GiB | 339.24 MiB | 21.64 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| | llama | 4096 | false | false | 33 (32 + 1) | Yes | 21.53 GiB | 236.68 MiB | 21.74 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
```
#### Estimate with Flash Attention
```shell
$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --flash-attention
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| | llama | 32768 | true | false | 32 | 25.08 GiB | 395.24 MiB | 25.08 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| | llama | 32768 | true | false | 33 (32 + 1) | Yes | 25.08 GiB | 292.68 MiB | 25.18 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
```
#### Estimate with No MMap
```shell
$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=10 --no-mmap
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| | llama | 32768 | false | false | 10 | 25.08 GiB | 17.50 GiB | 9.83 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| | llama | 32768 | false | false | 10 | No | 25.08 GiB | 17.50 GiB | 9.83 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
```
#### Estimate offload layers step by step
```shell
$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers=10 --offload-layers-step=5
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
| | llama | 32768 | false | false | 0 | 25.08 GiB | 25.23 GiB | 2.10 GiB |
+ + + + + +----------------+ +------------+-------------+
| | | | | | 5 | | 21.36 GiB | 5.97 GiB |
+ + + + + +----------------+ +------------+-------------+
| | | | | | 10 | | 17.50 GiB | 9.83 GiB |
+ + + + + +----------------+ +------------+-------------+
| | | | | | 15 | | 13.63 GiB | 13.70 GiB |
+ + + + + +----------------+ +------------+-------------+
| | | | | | 20 | | 9.77 GiB | 17.56 GiB |
+ + + + + +----------------+ +------------+-------------+
| | | | | | 25 | | 5.91 GiB | 21.42 GiB |
+ + + + + +----------------+ +------------+-------------+
| | | | | | 30 | | 2.04 GiB | 25.29 GiB |
+ + + + + +----------------+ +------------+-------------+
| | | | | | 32 | | 395.24 MiB | 26.94 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+-----------+------------+-------------+
$ gguf-parser --repo="NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF" --file="Nous-Hermes-2-Mixtral-8x7B-DPO.Q3_K_M.gguf" --skip-model --skip-architecture --skip-tokenizer --offload-layers-step=5
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| ESTIMATE | ARCH | CONTEXT SIZE | FLASH ATTENTION | MMAP SUPPORT | OFFLOAD LAYERS | FULL OFFLOADED | UMA RAM | NONUMA RAM | NONUMA VRAM |
+ +-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
| | llama | 32768 | false | false | 0 | No | 25.08 GiB | 25.23 GiB | 2.10 GiB |
+ + + + + +----------------+ + +------------+-------------+
| | | | | | 5 | | | 21.36 GiB | 5.97 GiB |
+ + + + + +----------------+ + +------------+-------------+
| | | | | | 10 | | | 17.50 GiB | 9.83 GiB |
+ + + + + +----------------+ + +------------+-------------+
| | | | | | 15 | | | 13.63 GiB | 13.70 GiB |
+ + + + + +----------------+ + +------------+-------------+
| | | | | | 20 | | | 9.77 GiB | 17.56 GiB |
+ + + + + +----------------+ + +------------+-------------+
| | | | | | 25 | | | 5.91 GiB | 21.42 GiB |
+ + + + + +----------------+ + +------------+-------------+
| | | | | | 30 | | | 2.04 GiB | 25.29 GiB |
+ + + + + +----------------+----------------+ +------------+-------------+
| | | | | | 33 (32 + 1) | Yes | | 292.68 MiB | 27.04 GiB |
+----------+-------+--------------+-----------------+--------------+----------------+----------------+-----------+------------+-------------+
```
Expand Down
7 changes: 4 additions & 3 deletions cmd/gguf-parser/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ func main() {
}
if offloadLayersStep < e.OffloadLayers {
cnt := e.OffloadLayers/offloadLayersStep + 1
if e.OffloadLayers%offloadLayersStep != 0 {
if e.OffloadLayers%offloadLayersStep != 0 || e.FullOffloaded {
cnt++
}
ess := make([]LLaMACppUsageEstimateMemorySummary, cnt)
Expand All @@ -347,15 +347,16 @@ func main() {
sprintf(es.ContextSize),
sprintf(es.FlashAttention),
sprintf(!es.NoMMap),
sprintf(es.Memory[i].OffloadLayers),
sprintf(tenary(es.Memory[i].FullOffloaded, sprintf("%d (%d + 1)", es.Memory[i].OffloadLayers, es.Memory[i].OffloadLayers-1), es.Memory[i].OffloadLayers)),
sprintf(tenary(es.Memory[i].FullOffloaded, "Yes", "No")),
sprintf(es.Memory[i].UMA),
sprintf(es.Memory[i].NonUMA.RAM),
sprintf(es.Memory[i].NonUMA.VRAM),
}
}
tprint(
"ESTIMATE",
[]string{"Arch", "Context Size", "Flash Attention", "MMap Support", "Offload Layers", "UMA RAM", "NonUMA RAM", "NonUMA VRAM"},
[]string{"Arch", "Context Size", "Flash Attention", "MMap Support", "Offload Layers", "Full Offloaded", "UMA RAM", "NonUMA RAM", "NonUMA VRAM"},
bd...)
}
}
Expand Down
Loading

0 comments on commit 9218d99

Please sign in to comment.