From 3f552cafb3806a01acf1d295ff6ef03db2c9ed21 Mon Sep 17 00:00:00 2001 From: thxCode Date: Tue, 2 Jul 2024 15:09:21 +0800 Subject: [PATCH] refactor: estimate Signed-off-by: thxCode --- cmd/gguf-parser/README.md | 2 +- cmd/gguf-parser/main.go | 59 ++++++++++++++++++++++----- file.go | 35 +++++++++------- file_architecture.go | 5 +++ file_estimate.go | 84 ++++++++++++++++++++++----------------- file_estimate_option.go | 24 +++++------ 6 files changed, 134 insertions(+), 75 deletions(-) diff --git a/cmd/gguf-parser/README.md b/cmd/gguf-parser/README.md index e7d2af1..1a83e61 100644 --- a/cmd/gguf-parser/README.md +++ b/cmd/gguf-parser/README.md @@ -7,7 +7,7 @@ Review/Check/Estimate [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/ ```shell $ gguf-parser --help Usage of gguf-parser ...: - -batch-size int + -ubatch-size int Specify the physical maximum batch size, which is used to estimate the usage, default is 512. (default 512) -ctx-size int Specify the size of prompt context, which is used to estimate the usage, default is equal to the model's maximum context size. (default -1) diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index a30cca3..829cbf4 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -32,10 +32,11 @@ func main() { skipTLSVerify bool // estimate options ctxSize = -1 - batchSize = 512 + physicalBatchSize = 512 parallelSize = 1 kvType = "f16" flashAttention bool + platformFootprint = "150,250" noMMap bool offloadLayers = -1 offloadLayersStep uint64 @@ -45,6 +46,7 @@ func main() { skipArchitecture bool skipTokenizer bool skipEstimate bool + inMib bool json bool jsonPretty = true ) @@ -61,15 +63,19 @@ func main() { "/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf. "+ "Note that gguf-parser does not need to download the entire GGUF file.") fs.StringVar(&repo, "repo", repo, "Repository of HuggingFace which the GGUF file store, e.g. "+ - "NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --file.") + "NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --file. [Deprecated, use --hf-repo instead]") fs.StringVar(&file, "file", file, "Model file below the --repo, e.g. "+ + "Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf. [Deprecated, use --hf-file instead]") // Deprecated. + fs.StringVar(&repo, "hf-repo", repo, "Repository of HuggingFace which the GGUF file store, e.g. "+ + "NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF, works with --hf-file.") // Deprecated. + fs.StringVar(&file, "hf-file", file, "Model file below the --repo, e.g. "+ "Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf.") fs.BoolVar(&debug, "debug", debug, "Enable debugging, verbosity.") fs.BoolVar(&skipTLSVerify, "skip-tls-verify", skipTLSVerify, "Skip TLS verification, works with --url.") fs.IntVar(&ctxSize, "ctx-size", ctxSize, "Specify the size of prompt context, "+ "which is used to estimate the usage, "+ "default is equal to the model's maximum context size.") - fs.IntVar(&batchSize, "batch-size", batchSize, "Specify the physical maximum batch size, "+ + fs.IntVar(&physicalBatchSize, "ubatch-size", physicalBatchSize, "Specify the physical maximum batch size, "+ "which is used to estimate the usage, "+ "default is 512.") fs.IntVar(¶llelSize, "parallel-size", parallelSize, "Specify the number of parallel sequences to decode, "+ @@ -82,20 +88,32 @@ func main() { fs.BoolVar(&flashAttention, "flash-attention", flashAttention, "Specify enabling Flash Attention, "+ "which is used to estimate the usage. "+ "Flash Attention can reduce the usage of RAM/VRAM.") + fs.StringVar(&platformFootprint, "platform-footprint", platformFootprint, "Specify the platform footprint(RAM,VRAM) in MiB, "+ + "which is used to estimate the NonUMA usage, "+ + "default is 150,250. "+ + "Different platform always gets different RAM and VRAM footprints, "+ + "for example, within CUDA, `cudaMemGetInfo` would occupy some RAM and VRAM, "+ + "see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo.") fs.BoolVar(&noMMap, "no-mmap", noMMap, "Specify disabling Memory-Mapped using, "+ "which is used to estimate the usage. "+ "Memory-Mapped can avoid loading the entire model weights into RAM.") fs.IntVar(&offloadLayers, "offload-layers", offloadLayers, "Specify how many layers to offload, "+ + "which is used to estimate the usage, "+ + "default is full offloaded. [Deprecated, use --gpu-layers instead]") // Deprecated. + fs.IntVar(&offloadLayers, "gpu-layers", offloadLayers, "Specify how many layers to offload, "+ "which is used to estimate the usage, "+ "default is full offloaded.") fs.Uint64Var(&offloadLayersStep, "offload-layers-step", offloadLayersStep, "Specify the step of layers to offload, "+ - "works with --offload-layers.") + "works with --offload-layers. [Deprecated, use --gpu-layers-step instead]") // Deprecated. + fs.Uint64Var(&offloadLayersStep, "gpu-layers-step", offloadLayersStep, "Specify the step of layers to offload, "+ + "works with --gpu-layers.") fs.BoolVar(&version, "version", version, "Show gguf-parser version.") fs.BoolVar(&skipModel, "skip-model", skipModel, "Skip to display model metadata.") fs.BoolVar(&skipArchitecture, "skip-architecture", skipArchitecture, "Skip to display architecture metadata.") fs.BoolVar(&skipTokenizer, "skip-tokenizer", skipTokenizer, "Skip to display tokenizer metadata") fs.BoolVar(&skipEstimate, "skip-estimate", skipEstimate, "Skip to estimate.") - fs.BoolVar(&json, "json", json, "Output as JSON,") + fs.BoolVar(&inMib, "in-mib", inMib, "Display the estimated result in table with MiB.") + fs.BoolVar(&json, "json", json, "Output as JSON.") fs.BoolVar(&jsonPretty, "json-pretty", jsonPretty, "Output as pretty JSON.") if err := fs.Parse(os.Args[1:]); err != nil { fmt.Println(err.Error()) @@ -127,8 +145,8 @@ func main() { if ctxSize > 0 { eopts = append(eopts, WithContextSize(int32(ctxSize))) } - if batchSize > 0 { - eopts = append(eopts, WithBatchSize(int32(batchSize))) + if physicalBatchSize > 0 { + eopts = append(eopts, WithPhysicalBatchSize(int32(physicalBatchSize))) } if parallelSize > 0 { eopts = append(eopts, WithParallelSize(int32(parallelSize))) @@ -208,6 +226,23 @@ func main() { } // Output + var ( + mmap = !noMMap + platformRAM, platformVRAM uint64 + ) + { + if platformFootprint != "" { + parts := strings.Split(platformFootprint, ",") + if len(parts) == 2 { + if v, err := strconv.ParseUint(parts[0], 10, 64); err == nil { + platformRAM = v * 1024 * 1024 + } + if v, err := strconv.ParseUint(parts[1], 10, 64); err == nil { + platformVRAM = v * 1024 * 1024 + } + } + } + } if json { o := map[string]any{} @@ -221,7 +256,7 @@ func main() { o["tokenizer"] = t } if !skipEstimate { - es := e.Summarize(!noMMap) + es := e.Summarize(mmap, platformRAM, platformVRAM) switch { case offloadLayersStep > e.OffloadLayers: offloadLayersStep = e.OffloadLayers @@ -241,7 +276,7 @@ func main() { defer wg.Done() eopts := eopts[:len(eopts):len(eopts)] eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep)) - ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(!noMMap) + ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM) }(i) } wg.Wait() @@ -263,6 +298,8 @@ func main() { return } + InMiBytes = inMib + if !skipModel { tprint( "MODEL", @@ -313,7 +350,7 @@ func main() { } if !skipEstimate { - es := e.Summarize(!noMMap) + es := e.Summarize(mmap, platformRAM, platformVRAM) switch { case offloadLayersStep > e.OffloadLayers: offloadLayersStep = e.OffloadLayers @@ -333,7 +370,7 @@ func main() { defer wg.Done() eopts := eopts[:len(eopts):len(eopts)] eopts = append(eopts, WithOffloadLayers(uint64(i)*offloadLayersStep)) - ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(!noMMap) + ess[i] = gf.EstimateLLaMACppUsage(eopts...).SummarizeMemory(mmap, platformRAM, platformVRAM) }(i) } wg.Wait() diff --git a/file.go b/file.go index 569f8e4..7980b5e 100644 --- a/file.go +++ b/file.go @@ -514,27 +514,34 @@ const ( _PiBytes ) +var InMiBytes bool + func (s GGUFBytesScalar) String() string { if s == 0 { return "0 B" } b, u := float64(1), "B" - switch { - case s >= _PiBytes: - b = _PiBytes - u = "PiB" - case s >= _TiBytes: - b = _TiBytes - u = "TiB" - case s >= _GiBytes: - b = _GiBytes - u = "GiB" - case s >= _MiBytes: + if InMiBytes { b = _MiBytes u = "MiB" - case s >= _KiBytes: - b = _KiBytes - u = "KiB" + } else { + switch { + case s >= _PiBytes: + b = _PiBytes + u = "PiB" + case s >= _TiBytes: + b = _TiBytes + u = "TiB" + case s >= _GiBytes: + b = _GiBytes + u = "GiB" + case s >= _MiBytes: + b = _MiBytes + u = "MiB" + case s >= _KiBytes: + b = _KiBytes + u = "KiB" + } } f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64) return strings.TrimSuffix(f, ".00") + " " + u diff --git a/file_architecture.go b/file_architecture.go index c242972..513787a 100644 --- a/file_architecture.go +++ b/file_architecture.go @@ -83,6 +83,8 @@ type GGUFArchitectureMetadata struct { /* Appendix */ + // EmbeddingGroup is the number of groups in the embedding layer. + EmbeddingGroup uint64 `json:"embeddingGroup,omitempty"` // EmbeddingKeyGQA is the number of key GQA in the embedding layer. EmbeddingKeyGQA uint64 `json:"embeddingKeyGQA,omitempty"` // EmbeddingValueGQA is the number of value GQA in the embedding layer. @@ -274,6 +276,9 @@ func (gf *GGUFFile) Architecture() (ga GGUFArchitectureMetadata) { } { + if ga.AttentionHeadCountKV > 0 { + ga.EmbeddingGroup = ga.AttentionHeadCount / ga.AttentionHeadCountKV + } if ga.AttentionHeadCount > 0 { ga.EmbeddingKeyGQA = uint64(ga.AttentionKeyLength) * ga.AttentionHeadCountKV ga.EmbeddingValueGQA = uint64(ga.AttentionValueLength) * ga.AttentionHeadCountKV diff --git a/file_estimate.go b/file_estimate.go index 339bae5..0177e91 100644 --- a/file_estimate.go +++ b/file_estimate.go @@ -87,6 +87,9 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( if o.CacheValueType == nil { o.CacheValueType = ptr.To(GGMLTypeF16) } + if o.PhysicalBatchSize == nil { + o.PhysicalBatchSize = ptr.To(int32(512)) + } // Architecture and tokenizer metadata. var ( @@ -138,7 +141,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( } // Correct token size, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L12221-L12224. - nTokens = min(nContext, uint64(ptr.Deref(o.BatchSize, 512))) + nTokens = min(nContext, uint64(*o.PhysicalBatchSize)) nBatch = nTokens nOutputs = nTokens nParallel = uint64(ptr.Deref(o.ParallelSize, 1)) @@ -230,12 +233,7 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( if _, ok := opLs.Get("output.weight"); ok { e.Load.Weight.Output = GGUFBytesScalar(opLs.Bytes()) } else { - e.Load.Weight.Output = GGUFBytesScalar(opLs.Bytes() + ioLs.Bytes() /* duplicate the input layer */) - } - if isOffloadOutputLayer && nLoadLayers == 0 { // Full offloaded. - // Transfer the output weight to VRAM when all layers are offloaded. - e.Offload.Weight.Output = e.Load.Weight.Output - e.Load.Weight.Output = 0 + e.Load.Weight.Output = GGUFBytesScalar(opLs.Bytes()) + e.Load.Weight.Input /* duplicate the input layer */ } } @@ -318,38 +316,54 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( } e.Offload.Computation.Compute = GGUFBytesScalar(convInc + ssmInc) } else { - attnInc := uint64(0) + loadAttnInc, offloadAttnInc := uint64(0), uint64(0) if o.FlashAttention { // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7387. - attnInc = GGMLTypeF16.RowSizeOf([]uint64{nKV, nTokens}) + offloadAttnInc = GGMLTypeF16.RowSizeOf([]uint64{nKV, nTokens}) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv)\.weight`)) { if strings.HasSuffix(l.Name, ".attn_norm.weight") { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) - attnInc += rs + offloadAttnInc += rs continue } rs := l.Bytes() - attnInc += rs + offloadAttnInc += rs } // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L6986-L6992. rs := o.CacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV}) - attnInc += rs + offloadAttnInc += rs // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7000-L7007. rs = o.CacheValueType.RowSizeOf([]uint64{uint64(a.AttentionValueLength), nKV, a.AttentionHeadCountKV}) - attnInc += rs + offloadAttnInc += rs } else { - attnInc = uint64(e.Load.KVCache.Key + e.Offload.KVCache.Key) + offloadAttnInc = uint64(0) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv)\.weight`)) { - rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) - attnInc += rs + var rs uint64 switch { - default: - continue + default: // norm. + rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) + offloadAttnInc += rs case strings.HasSuffix(l.Name, ".attn_q.weight"): + rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nTokens}) + offloadAttnInc += rs * 2 // Qcur, Qcur + RoPE. + if !isOffloadOutputLayer { + loadAttnInc = rs // Vcur. + } + rs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount}) + offloadAttnInc += rs // kq. + rs = o.CacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV}) + offloadAttnInc += rs * 2 // k-?, v-?. case strings.HasSuffix(l.Name, ".attn_qkv.weight"): rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nTokens}) + offloadAttnInc += rs * 2 // Qcur, Qcur + RoPE. + if !isOffloadOutputLayer { + loadAttnInc = rs // Vcur. + } + rs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount}) + offloadAttnInc += rs // kq. + rs = o.CacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV}) + offloadAttnInc += rs * 2 // k-?, v-?. } - attnInc += rs * 2 // for RoPE } } ffnInc := uint64(0) @@ -357,7 +371,8 @@ func (gf *GGUFFile) EstimateLLaMACppUsage(opts ...LLaMACppUsageEstimateOption) ( rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) ffnInc += rs } - e.Offload.Computation.Compute = GGUFBytesScalar(max(attnInc, ffnInc)) + e.Load.Computation.Compute = GGUFBytesScalar(loadAttnInc) + e.Offload.Computation.Compute = GGUFBytesScalar(max(offloadAttnInc, ffnInc)) // Special case: we cannot use mmap for splitting expert weights in MoE. if a.ExpertCount > 0 { e.NoMMap = len(tfLs[0].Search(regexp.MustCompile(`.*\.\d+\.ffn_gate_exps\.weight`))) == 0 @@ -425,7 +440,9 @@ type ( } ) -func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool) (ems LLaMACppUsageEstimateMemorySummary) { +// SummarizeMemory returns the summary of the estimated memory usage of loading the GGUF file in llama.cpp, +// the input options are used to adjust the summary. +func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool, platformRAM, platformVRAM uint64) (ems LLaMACppUsageEstimateMemorySummary) { ems.OffloadLayers, ems.FullOffloaded = e.OffloadLayers, e.FullOffloaded if ems.FullOffloaded { ems.OffloadLayers++ // The output layer is offloaded. @@ -443,31 +460,22 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool) (ems LLaMACppUsageEsti } } - // TODO(thxCode): complete more cases, - // and support optional parameters for the following constants. - // - // Footprint, - // see https://github.com/ggerganov/llama.cpp/blob/f578b86b2123d0f92afbaa98a031df4d4464e582/llama.cpp#L2454-L2486. - const ( - // The function `cudaMemGetInfo` occupies some memory, - // see https://github.com/ggerganov/llama.cpp/blob/f578b86b2123d0f92afbaa98a031df4d4464e582/ggml-cuda.cu#L3009-L3013, - // and https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. - cudaFootprint = GGUFBytesScalar(150 * 1024 * 1024) - ) - // NonUMA. { // RAM. - fp := cudaFootprint + e.Load.Footprint + fp := GGUFBytesScalar(platformRAM) + e.Load.Footprint wg := e.Load.Weight.Sum() kv := e.Load.KVCache.Sum() cp := e.Load.Computation.Sum() ems.NonUMA.RAM = fp + wg + kv + cp if !e.NoMMap && (mmap || e.FullOffloaded) { ems.NonUMA.RAM -= wg + if !mmap { + ems.NonUMA.RAM += e.Load.Weight.Output + } } // VRAM. - fp = e.Offload.Footprint + fp = GGUFBytesScalar(platformVRAM) + e.Offload.Footprint wg = e.Offload.Weight.Sum() kv = e.Offload.KVCache.Sum() cp = e.Offload.Computation.Sum() @@ -477,10 +485,12 @@ func (e LLaMACppUsageEstimate) SummarizeMemory(mmap bool) (ems LLaMACppUsageEsti return ems } -func (e LLaMACppUsageEstimate) Summarize(mmap bool) (es LLaMACppUsageEstimateSummary) { +// Summarize returns the summary of the estimated result of loading the GGUF file in llama.cpp, +// the input options are used to adjust the summary. +func (e LLaMACppUsageEstimate) Summarize(mmap bool, platformRAM, platformVRAM uint64) (es LLaMACppUsageEstimateSummary) { // Summarize memory. es.Memory = []LLaMACppUsageEstimateMemorySummary{ - e.SummarizeMemory(mmap), + e.SummarizeMemory(mmap, platformRAM, platformVRAM), } // Just copy from the original estimate. diff --git a/file_estimate_option.go b/file_estimate_option.go index 813b54b..a0cf10f 100644 --- a/file_estimate_option.go +++ b/file_estimate_option.go @@ -6,15 +6,15 @@ import ( type ( _LLaMACppUsageEstimateOptions struct { - Architecture *GGUFArchitectureMetadata - Tokenizer *GGUFTokenizerMetadata - ContextSize *int32 - BatchSize *int32 - ParallelSize *int32 - CacheKeyType *GGMLType - CacheValueType *GGMLType - OffloadLayers *uint64 - FlashAttention bool + Architecture *GGUFArchitectureMetadata + Tokenizer *GGUFTokenizerMetadata + ContextSize *int32 + PhysicalBatchSize *int32 + ParallelSize *int32 + CacheKeyType *GGMLType + CacheValueType *GGMLType + OffloadLayers *uint64 + FlashAttention bool } LLaMACppUsageEstimateOption func(*_LLaMACppUsageEstimateOptions) ) @@ -47,13 +47,13 @@ func WithContextSize(size int32) LLaMACppUsageEstimateOption { } } -// WithBatchSize sets the physical batch size for the estimate. -func WithBatchSize(size int32) LLaMACppUsageEstimateOption { +// WithPhysicalBatchSize sets the physical batch size for the estimate. +func WithPhysicalBatchSize(size int32) LLaMACppUsageEstimateOption { return func(o *_LLaMACppUsageEstimateOptions) { if size <= 0 { return } - o.BatchSize = &size + o.PhysicalBatchSize = &size } }