refactor: estimate for moe
Signed-off-by: thxCode <[email protected]>
thxCode committed Jun 12, 2024
1 parent 98d6dee commit 6ef5c4d
Showing 2 changed files with 28 additions and 22 deletions.
cmd/gguf-parser/main.go (11 changes: 4 additions & 7 deletions)
@@ -216,10 +216,9 @@ func main() {
 	if !skipModel {
 		tprintf(
 			"MODEL",
-			[]string{"Name", "File Size", "Arch", "Quantization Version", "File Type", "Little Endian", "Size", "Parameters", "BPW"},
+			[]string{"Name", "Arch", "Quantization Version", "File Type", "Little Endian", "Size", "Parameters", "BPW"},
 			[]string{
 				m.Name,
-				sprintf(m.FileSize),
 				m.Architecture,
 				sprintf(m.QuantizationVersion),
 				sprintf(m.FileType),
@@ -277,17 +276,15 @@ func main() {
 		}
 		tprintf(
 			"ESTIMATE",
-			[]string{"Mem. Arch", "MMap", "Context Size", "Usage"},
+			[]string{"Context Size", "Mem. Arch", "Usage"},
 			[]string{
-				"UMA",
-				sprintf(!noMMap),
 				sprintf(ctxSize),
+				"UMA",
 				sprintf(es.UMA),
 			},
 			[]string{
-				"NonUMA",
-				sprintf(!noMMap),
 				sprintf(ctxSize),
+				"NonUMA",
 				fmt.Sprintf("%s (RAM) + %s (VRAM)", es.NonUMA.RAM, es.NonUMA.VRAM),
 			})
 	}
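With this change the ESTIMATE table leads with the context size, drops the per-row MMap column, and keeps one row per memory architecture. A rough sketch of the new output with placeholder values (the exact rendering depends on tprintf, so take the layout as illustrative only):

	+--------------+-----------+------------------------------------+
	| CONTEXT SIZE | MEM. ARCH | USAGE                              |
	+--------------+-----------+------------------------------------+
	| 512          | UMA       | 2.33 GiB                           |
	| 512          | NonUMA    | 377.66 MiB (RAM) + 2.33 GiB (VRAM) |
	+--------------+-----------+------------------------------------+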
file_estimate.go (39 changes: 24 additions & 15 deletions)
@@ -11,10 +11,12 @@ import (
 type (
 	// LLaMACppUsageEstimate represents the estimated result of loading the GGUF file in llama.cpp.
 	LLaMACppUsageEstimate struct {
-		// Layers is the number of layers for loading the GGUF file.
-		Layers uint64 `json:"layers"`
-		// OffloadLayers is the number of layers to offload.
-		OffloadLayers uint64 `json:"offloadLayers"`
+		// FullOffload is the flag to indicate whether the layers are fully offloaded,
+		// false for partial offloaded or zero offloaded.
+		FullOffload bool `json:"fullOffload"`
+		// NoMMap is the flag to indicate whether the file must be loaded without mmap,
+		// true for total loaded.
+		NoMMap bool `json:"noMMap"`
 		// RAM is the memory usage for loading the GGUF file in RAM.
 		RAM LLaMACppMemoryUsage `json:"ram"`
 		// VRAM is the memory usage for loading the GGUF file in VRAM.
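These two booleans replace the raw Layers/OffloadLayers counts, so callers no longer compare layer numbers themselves. A minimal consumer sketch, assuming EstimateLLaMACppUsage returns this struct and that wantMMap is a hypothetical caller preference:

	e := gf.EstimateLLaMACppUsage()
	// mmap can only be honored when the file layout permits it.
	useMMap := wantMMap && !e.NoMMap
	es := e.Summarize(useMMap)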
@@ -96,8 +98,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 		}
 		nLoadLayers -= nOffloadLayers
 	}
-	e.Layers = a.BlockCount
-	e.OffloadLayers = nOffloadLayers
+	e.FullOffload = a.BlockCount == nOffloadLayers
 
 	// Footprint.
 	{
@@ -133,8 +134,10 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 		{
 			// Compute.
 			for i, offloadStart := uint64(0), uint64(len(tfLs))-nOffloadLayers; i < uint64(len(tfLs)); i++ {
-				e.RAM.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes())
-				if i >= offloadStart {
+				switch {
+				case i < nLoadLayers:
+					e.RAM.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes())
+				case i >= offloadStart:
 					e.VRAM.Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes())
 				}
 			}
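The old loop charged every layer's weights to RAM and then charged the offloaded tail to VRAM on top; the switch makes the two destinations mutually exclusive. A self-contained sketch of the partition with made-up layer counts (the real code sums tfLs[i].Bytes() per layer):

	package main

	import "fmt"

	func main() {
		// Hypothetical: 32 transformer layers, 20 of them offloaded.
		total, offload := uint64(32), uint64(20)
		nLoadLayers := total - offload  // 12, mirroring nLoadLayers -= nOffloadLayers above
		offloadStart := total - offload // 12, the first layer charged to VRAM
		var ramLayers, vramLayers uint64
		for i := uint64(0); i < total; i++ {
			switch {
			case i < nLoadLayers: // layers 0..11 stay in RAM
				ramLayers++
			case i >= offloadStart: // layers 12..31 go to VRAM
				vramLayers++
			}
		}
		fmt.Println(ramLayers, vramLayers) // 12 20
	}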
@@ -237,6 +240,10 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) (
 			}
 			e.RAM.Computation.Compute = GGUFBytesScalar(max(kvcInc, ffnInc))
 		}
+		// Special case: we cannot use mmap for splitting expert weights in MoE.
+		if a.ExpertCount > 0 {
+			e.NoMMap = len(tfLs[0].Search(regexp.MustCompile(`.*\.\d+\.ffn_gate_exps\.weight`))) == 0
+		}
 	}
 	// Finally, get the usage of output layer.
 	{
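The pattern looks for the fused per-layer expert tensors; when a MoE file carries only split per-expert weights, the search in the first layer finds nothing and NoMMap flips to true. A quick check against representative tensor names (the names follow common GGUF conventions and are illustrative, not taken from this diff):

	package main

	import (
		"fmt"
		"regexp"
	)

	func main() {
		re := regexp.MustCompile(`.*\.\d+\.ffn_gate_exps\.weight`)
		// Fused expert tensor: a match, so mmap remains usable.
		fmt.Println(re.MatchString("blk.0.ffn_gate_exps.weight")) // true
		// Split per-expert tensor: no match, so NoMMap is set.
		fmt.Println(re.MatchString("blk.0.ffn_gate.3.weight")) // false
	}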
@@ -268,18 +275,20 @@ type LLaMACppUsageEstimateSummery struct {
 func (e LLaMACppUsageEstimate) Summarize(mmap bool) (es LLaMACppUsageEstimateSummery) {
 	// UMA.
 	{
-		es.UMA = e.RAM.Footprint
-		es.UMA += max(e.RAM.KVCache.Sum()+e.VRAM.KVCache.Sum(), e.RAM.Computation.Sum())
-		if !mmap {
-			es.UMA += e.RAM.Weight.Sum()
+		kv := e.RAM.KVCache.Sum() + e.VRAM.KVCache.Sum()
+		wg := e.RAM.Weight.Sum() + e.VRAM.Weight.Sum()
+		es.UMA = e.RAM.Footprint + max(kv, e.RAM.Computation.Sum()) + wg
+		if !e.NoMMap && mmap {
+			es.UMA -= wg
 		}
 	}
 
 	// NonUMA.
 	{
-		es.NonUMA.RAM = e.RAM.Footprint + e.RAM.KVCache.Sum() + e.RAM.Computation.Sum()
-		if !mmap && e.Layers != e.OffloadLayers {
-			es.NonUMA.RAM += e.RAM.Weight.Sum()
+		wg := e.RAM.Weight.Sum()
+		es.NonUMA.RAM = e.RAM.Footprint + e.RAM.KVCache.Sum() + e.RAM.Computation.Sum() + wg
+		if !e.NoMMap && (mmap || e.FullOffload) {
+			es.NonUMA.RAM -= wg
 		}
 		es.NonUMA.VRAM = e.VRAM.Footprint + e.VRAM.Weight.Sum() + e.VRAM.KVCache.Sum() + e.VRAM.Computation.Sum()
 	}
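The refactor also flips the bookkeeping: weights are now always added up front and subtracted back when mmap (and, for NonUMA, full offload) lets the OS page them from disk, instead of being conditionally added. A worked trace with made-up byte counts (max is the Go 1.21 builtin; the variable names stand in for the Sum() calls):

	package main

	import "fmt"

	func main() {
		// Hypothetical sums, in bytes.
		var (
			ramFootprint          uint64 = 100
			ramKV, vramKV         uint64 = 40, 60
			ramCompute            uint64 = 80
			ramWeight, vramWeight uint64 = 300, 700
			noMMap                       = false // e.NoMMap from the estimate
			mmap                         = true  // caller preference passed to Summarize
			fullOffload                  = false // e.FullOffload
		)

		// UMA: one shared pool; weights added, then dropped because mmap applies.
		kv := ramKV + vramKV                           // 100
		wg := ramWeight + vramWeight                   // 1000
		uma := ramFootprint + max(kv, ramCompute) + wg // 100 + 100 + 1000 = 1200
		if !noMMap && mmap {
			uma -= wg
		}
		fmt.Println(uma) // 200

		// NonUMA RAM: weights counted unless mmap or full offload removes them.
		nonUMARAM := ramFootprint + ramKV + ramCompute + ramWeight // 520
		if !noMMap && (mmap || fullOffload) {
			nonUMARAM -= ramWeight
		}
		fmt.Println(nonUMARAM) // 220
	}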