diff --git a/README.md b/README.md
index 9df749d..6e55030 100644
--- a/README.md
+++ b/README.md
@@ -517,7 +517,7 @@ $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llam
 +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+
 | LAYERS (I + T + O) | UMA        | NONUMA     | LAYERS (T + O) | UMA     | NONUMA    | LAYERS (T + O) | UMA       | NONUMA    |
 +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+
-| 1 + 0 + 0          | 238.08 MiB | 388.08 MiB | 36 + 0         | 144 MiB | 17.87 GiB | 44 + 1         | 22.01 GiB | 22.44 GiB |
+| 1 + 0 + 0          | 238.08 MiB | 388.08 MiB | 36 + 0         | 144 MiB | 17.79 GiB | 44 + 1         | 22.01 GiB | 22.51 GiB |
 +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+
 ```
 
@@ -528,8 +528,8 @@ resource consumption:
 | Host                  | Available RAM | Request RAM | Available VRAM | Request VRAM | Result     |
 |-----------------------|---------------|-------------|----------------|--------------|------------|
 | host1                 | ENOUGH        | 388.08 MiB  |                |              | :thumbsup: |
-| host1 (NVIDIA 4080 0) |               |             | 8 GiB          | 17.87 GiB    |            |
-| host1 (NVIDIA 4080 1) |               |             | 10 GiB         | 22.44 GiB    |            |
+| host1 (NVIDIA 4080 0) |               |             | 8 GiB          | 17.79 GiB    |            |
+| host1 (NVIDIA 4080 1) |               |             | 10 GiB         | 22.51 GiB    |            |
 
 It appears that running the model on `host1` alone is not feasible.
 
@@ -570,7 +570,7 @@ $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llam
 +--------------------+------------+------------+----------------+----------+----------+----------------+-----------+-----------+----------------+-----------+-----------+----------------+----------+----------+
 | LAYERS (I + T + O) | UMA        | NONUMA     | LAYERS (T + O) | UMA      | NONUMA   | LAYERS (T + O) | UMA       | NONUMA    | LAYERS (T + O) | UMA       | NONUMA    | LAYERS (T + O) | UMA      | NONUMA   |
 +--------------------+------------+------------+----------------+----------+----------+----------------+-----------+-----------+----------------+-----------+-----------+----------------+----------+----------+
-| 1 + 0 + 0          | 238.08 MiB | 388.08 MiB | 18 + 0         | 8.85 GiB | 9.37 GiB | 23 + 0         | 10.88 GiB | 11.32 GiB | 27 + 0         | 12.75 GiB | 13.19 GiB | 12 + 1         | 6.87 GiB | 7.31 GiB |
+| 1 + 0 + 0          | 238.08 MiB | 388.08 MiB | 18 + 0         | 8.85 GiB | 9.28 GiB | 23 + 0         | 10.88 GiB | 11.32 GiB | 27 + 0         | 12.75 GiB | 13.19 GiB | 12 + 1         | 6.87 GiB | 7.38 GiB |
 +--------------------+------------+------------+----------------+----------+----------+----------------+-----------+-----------+----------------+-----------+-----------+----------------+----------+----------+
 ```
 
@@ -581,7 +581,7 @@ following resource consumption:
 | Host                  | Available RAM | Request RAM | Available VRAM | Request VRAM | Result     |
 |-----------------------|---------------|-------------|----------------|--------------|------------|
 | host4                 | 11 GiB        | 388.08 MiB  |                |              | :thumbsup: |
-| host1 (NVIDIA 4080 0) |               |             | 8 GiB          | 9.37 GiB     |            |
+| host1 (NVIDIA 4080 0) |               |             | 8 GiB          | 9.28 GiB     |            |
 | host1 (NVIDIA 4080 1) |               |             | 10 GiB         | 11.32 GiB    |            |
 | host2 (NVIDIA 4090)   |               |             | 12 GiB         | 13.19 GiB    |            |
 | host3 (Apple M1 Max)  | ENOUGH        |             | 6 GiB          | 6.87 GiB     |            |
@@ -615,15 +615,15 @@ flowchart TD
 
 ```shell
 $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --skip-metadata --skip-architecture --skip-tokenizer --ctx-size=1024 --tensor-split="6,11,12,8,10" --rpc="host4:50052,host2:50052,host1:50052,host1:50053" --in-short
-+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-|                                                                                                                   ESTIMATE                                                                                                                    |
-+----------------------------------------------+------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+
-| RAM                                          | VRAM 0                             | VRAM 1                               | VRAM 2                               | VRAM 3                               | VRAM 4                               |
-+--------------------+------------+------------+----------------+--------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+
-| LAYERS (I + T + O) | UMA        | NONUMA     | LAYERS (T + O) | UMA    | NONUMA   | LAYERS (T + O) | UMA      | NONUMA   | LAYERS (T + O) | UMA      | NONUMA   | LAYERS (T + O) | UMA      | NONUMA   | LAYERS (T + O) | UMA      | NONUMA   |
-+--------------------+------------+------------+----------------+--------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+
-| 1 + 0 + 0          | 238.08 MiB | 388.08 MiB | 11 + 0         | 44 MiB | 6.08 GiB | 19 + 0         | 8.96 GiB | 9.39 GiB | 20 + 0         | 9.47 GiB | 9.90 GiB | 14 + 0         | 6.63 GiB | 7.07 GiB | 16 + 1         | 8.74 GiB | 9.18 GiB |
-+--------------------+------------+------------+----------------+--------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+
++---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+|                                                                                                                  ESTIMATE                                                                                                                   |
++----------------------------------------------+----------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+--------------------------------------+
+| RAM                                          | VRAM 0                           | VRAM 1                               | VRAM 2                               | VRAM 3                               | VRAM 4                               |
++--------------------+------------+------------+----------------+--------+--------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+
+| LAYERS (I + T + O) | UMA        | NONUMA     | LAYERS (T + O) | UMA    | NONUMA | LAYERS (T + O) | UMA      | NONUMA   | LAYERS (T + O) | UMA      | NONUMA   | LAYERS (T + O) | UMA      | NONUMA   | LAYERS (T + O) | UMA      | NONUMA   |
++--------------------+------------+------------+----------------+--------+--------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+
+| 1 + 0 + 0          | 238.08 MiB | 388.08 MiB | 11 + 0         | 44 MiB | 6 GiB  | 19 + 0         | 8.96 GiB | 9.39 GiB | 20 + 0         | 9.47 GiB | 9.90 GiB | 14 + 0         | 6.63 GiB | 7.07 GiB | 16 + 1         | 8.74 GiB | 9.25 GiB |
++--------------------+------------+------------+----------------+--------+--------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+----------------+----------+----------+
 ```
 
@@ -637,7 +637,7 @@ following resource consumption:
 | host4                 | 11 GiB        | 9.39 GiB    |                |              | :thumbsup: |
 | host1 (NVIDIA 4080 1) |               |             | 12 GiB         | 9.90 GiB     | :thumbsup: |
 | host2 (NVIDIA 4080 0) |               |             | 8 GiB          | 7.07 GiB     | :thumbsup: |
-| host3 (NVIDIA 4080 1) |               |             | 10 GiB         | 9.18 GiB     | :thumbsup: |
+| host3 (NVIDIA 4080 1) |               |             | 10 GiB         | 9.25 GiB     | :thumbsup: |
 
 Now, the model can be successfully served on `host3`, with all layers offloaded to `host1`, `host2`, and `host4`.
 
diff --git a/file_estimate.go b/file_estimate.go
index 43f09f1..c179419 100644
--- a/file_estimate.go
+++ b/file_estimate.go
@@ -546,7 +546,7 @@ func (gf *GGUFFile) EstimateLLaMACppRun(opts ...LLaMACppRunEstimateOption) (e LL
 			}
 			// Finally, get the usage of output layer.
 			if a.Type == "model" {
-				outInc := inpEmbd
+				var outInc uint64
 				if a.Architecture == "mamba" {
 					outInc += inpSMask + inpSSeq
 				}
@@ -557,10 +557,16 @@ func (gf *GGUFFile) EstimateLLaMACppRun(opts ...LLaMACppRunEstimateOption) (e LL
 					rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})
 					outInc += rs
 				}
+				idx := 0 // Default to the main host's RAM.
 				if !fullOffload {
-					outInc += uint64(e.Devices[0].Weight.Output)
+					if len(e.Devices) != len(o.RPCServers)+1 { // If the main host has a GPU.
+						outInc += uint64(e.Devices[0].Weight.Output)
+						idx = o.MainGPUIndex + 1
+					}
+				} else {
+					idx = len(e.Devices) - 1 // The last device is the output device.
 				}
-				e.Devices[o.MainGPUIndex+1].Computation.Output += GGUFBytesScalar(outInc)
+				e.Devices[idx].Computation.Output += GGUFBytesScalar(outInc)
 			}
 		}
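The `file_estimate.go` hunk above changes which device the output layer's computation buffer is attributed to. The following is a minimal, self-contained sketch of that selection rule; the `device` type, `pickOutputDevice` function, and plain parameters are hypothetical stand-ins for illustration (the real code operates on `e.Devices`, `o.RPCServers`, and `o.MainGPUIndex`, and also folds the output weight into the increment):

```go
package main

import "fmt"

// device is a hypothetical stand-in for an entry of e.Devices: index 0 is the
// main host's RAM, followed by one entry per RPC server and local GPU.
type device struct {
	name string
}

// pickOutputDevice sketches the patched selection logic: attribute the output
// computation to the main host's RAM by default; to the main GPU when layers
// are only partially offloaded and the main host itself has a GPU (i.e. there
// are more devices than RPC servers plus the RAM entry); and to the last
// device under full offload, since that device holds the output layer.
func pickOutputDevice(devices []device, rpcServers, mainGPUIndex int, fullOffload bool) int {
	idx := 0 // Default to the main host's RAM.
	if !fullOffload {
		if len(devices) != rpcServers+1 { // The main host has a GPU.
			idx = mainGPUIndex + 1 // Skip the RAM entry at index 0.
		}
	} else {
		idx = len(devices) - 1 // The last device is the output device.
	}
	return idx
}

func main() {
	devices := []device{{"RAM"}, {"rpc:host4"}, {"rpc:host2"}, {"cuda:0"}}

	// Partial offload with a local GPU: output goes to the main GPU's entry.
	fmt.Println(devices[pickOutputDevice(devices, 2, 2, false)].name) // cuda:0

	// Full offload: output goes to the last device.
	fmt.Println(devices[pickOutputDevice(devices, 2, 0, true)].name) // cuda:0

	// Partial offload, no local GPU (devices == RPC servers + RAM entry):
	// output stays attributed to the main host's RAM.
	remote := devices[:3]
	fmt.Println(remote[pickOutputDevice(remote, 2, 0, false)].name) // RAM
}
```

Under these assumptions, the output cost lands on a GPU entry only when one actually exists on the main host or when everything is offloaded, which is consistent with the revised NONUMA figures in the README tables above.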